Commit 4d7c8162 authored by Konrad Sztyber's avatar Konrad Sztyber Committed by Ben Walker
Browse files

lib/ftl: non-volatile cache data recovery



Use the data placed on the non-volatile cache to perform recovery in
case the device wasn't shut down cleanly. The write phase ranges are
read and their data is copied onto the OC device.

The code added in this patch will correctly copy the data from
overlapping ranges, however it won't do anything about these overlapping
areas, so subsequent power loss happening quickly after recovery might
result in data loss.

Change-Id: Ib4c66092cee858496ec66f789fcfb1e7e32f5c20
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/458105


Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Mateusz Kozlowski <mateusz.kozlowski@intel.com>
Reviewed-by: Wojciech Malikowski <wojciech.malikowski@intel.com>
Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
parent 81e37974
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -512,6 +512,14 @@ ftl_nv_cache_next_phase(unsigned int current)
	return phases[current];
}

/* Return the phase preceding the given one in the 1 -> 2 -> 3 -> 1 write-phase
 * cycle (i.e. the inverse of ftl_nv_cache_next_phase()).
 */
static inline unsigned int
ftl_nv_cache_prev_phase(unsigned int current)
{
	assert(ftl_nv_cache_phase_is_valid(current));

	switch (current) {
	case 1:
		return 3;
	case 2:
		return 1;
	case 3:
		return 2;
	default:
		return 0;
	}
}

static inline uint64_t
ftl_nv_cache_pack_lba(uint64_t lba, unsigned int phase)
{
+220 −18
Original line number Diff line number Diff line
@@ -52,8 +52,11 @@ struct ftl_restore_band {
	STAILQ_ENTRY(ftl_restore_band)	stailq;
};

struct ftl_nv_cache_restore;

/* Describes single phase to be restored from non-volatile cache */
struct ftl_nv_cache_range {
	struct ftl_nv_cache_restore	*parent;
	/* Start offset */
	uint64_t			start_addr;
	/* Last block's address */
@@ -63,10 +66,16 @@ struct ftl_nv_cache_range {
	 * and the starting block due to range overlap)
	 */
	uint64_t			num_blocks;
	/* Number of blocks already recovered */
	uint64_t			num_recovered;
	/* Current address during recovery */
	uint64_t			current_addr;
	/* Phase of the range */
	unsigned int			phase;
	/* Indicates whether the data from this range needs to be recovered */
	bool				recovery;
};

struct ftl_nv_cache_restore;

struct ftl_nv_cache_block {
	struct ftl_nv_cache_restore	*parent;
	/* Data buffer */
@@ -97,6 +106,10 @@ struct ftl_nv_cache_restore {
	size_t				num_outstanding;
	/* Recovery/scan status */
	int				status;
	/* Recover the data from non-volatile cache */
	bool				recovery;
	/* Current phase of the recovery */
	unsigned int			phase;
};

struct ftl_restore {
@@ -438,36 +451,27 @@ ftl_nv_cache_restore_complete(struct ftl_nv_cache_restore *restore, int status)
	}
}

static void ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

static void
ftl_nv_cache_scan_done(struct ftl_nv_cache_restore *restore)
ftl_nv_cache_recovery_done(struct ftl_nv_cache_restore *restore)
{
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
	struct ftl_nv_cache_range *range;
	struct spdk_bdev *bdev;
	unsigned int phase = nv_cache->phase;
	uint64_t current_addr;
	unsigned int phase;

	bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
	phase = nv_cache->phase;

#if defined(DEBUG)
	uint64_t i, num_blocks = 0;
	for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) {
		range = &restore->range[i];
		SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Range %"PRIu64": %"PRIu64"-%"PRIu64" (%" PRIu64
			      ")\n", i, range->start_addr, range->last_addr, range->num_blocks);
		num_blocks += range->num_blocks;
	}
	assert(num_blocks == nv_cache->num_data_blocks);
#endif
	/* The latest phase is the one written in the header (set in nvc_cache->phase) */
	range = &restore->range[phase];
	current_addr = range->last_addr + 1;

	/*
	 * The first range might be empty (only the header was written) or the range might end at
	 * the last available address, in which case set current address to the beginning of the
	 * device.
	 * The first range might be empty (only the header was written) or the range might
	 * end at the last available address, in which case set current address to the
	 * beginning of the device.
	 */
	if (range->num_blocks == 0 || current_addr >= spdk_bdev_get_num_blocks(bdev)) {
		current_addr = FTL_NV_CACHE_DATA_OFFSET;
@@ -484,6 +488,199 @@ ftl_nv_cache_scan_done(struct ftl_nv_cache_restore *restore)
	ftl_nv_cache_restore_complete(restore, 0);
}

/* Submit a read of the next block of the currently recovered range.  The data
 * is copied onto the OC device by the read completion callback.
 */
static void
ftl_nv_cache_recover_block(struct ftl_nv_cache_block *block)
{
	struct ftl_nv_cache_restore *restore = block->parent;
	struct ftl_nv_cache_range *range = &restore->range[restore->phase];
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
	int rc;

	assert(range->current_addr <= range->last_addr);

	block->offset = range->current_addr;
	range->current_addr++;
	restore->num_outstanding++;

	rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch,
					   block->buf, block->md_buf, block->offset,
					   1, ftl_nv_cache_block_read_cb, block);
	if (spdk_unlikely(rc != 0)) {
		restore->num_outstanding--;
		SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n",
			    block->offset, spdk_strerror(-rc));
		ftl_nv_cache_restore_complete(restore, rc);
	}
}

/*
 * Pick the next phase range that still has blocks marked for recovery and start
 * reading it from its start address.  If every recoverable range is already
 * fully recovered, finish the recovery.  Cycles through phases in write order
 * starting from restore->phase.
 */
static void
ftl_nv_cache_recover_range(struct ftl_nv_cache_restore *restore)
{
	struct ftl_nv_cache_range *range;
	unsigned int phase = restore->phase;

	do {
		/* Find first range with non-zero number of blocks that is marked for recovery */
		range = &restore->range[phase];
		if (range->recovery && range->num_recovered < range->num_blocks) {
			break;
		}

		phase = ftl_nv_cache_next_phase(phase);
	} while (phase != restore->phase);

	/* There are no ranges to be recovered, we're done */
	/* (If the loop above fell through without breaking, range points at the last
	 * phase examined, which necessarily failed the recovery condition, so this
	 * check is true and recovery completes.)
	 */
	if (range->num_recovered == range->num_blocks || !range->recovery) {
		SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache recovery done\n");
		ftl_nv_cache_recovery_done(restore);
		return;
	}

	/* Restart reading from the beginning of the selected range */
	range->current_addr = range->start_addr;
	restore->phase = phase;

	SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Recovering range %u %"PRIu64"-%"PRIu64" (%"PRIu64")\n",
		      phase, range->start_addr, range->last_addr, range->num_blocks);

	ftl_nv_cache_recover_block(&restore->block[0]);
}

/*
 * Completion callback for a single-block write to the OC device.  On success,
 * continues reading the current range or, once the whole range is recovered and
 * no reads are in flight, moves on to the next range.
 *
 * \param io ftl_io of the completed write (unused here)
 * \param cb_arg the ftl_nv_cache_block that was written
 * \param status 0 on success, negative errno on failure
 */
static void
ftl_nv_cache_write_cb(struct ftl_io *io, void *cb_arg, int status)
{
	struct ftl_nv_cache_block *block = cb_arg;
	struct ftl_nv_cache_restore *restore = block->parent;
	struct ftl_nv_cache_range *range = &restore->range[restore->phase];

	restore->num_outstanding--;
	if (status != 0) {
		SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n",
			    block->offset, spdk_strerror(-status));
		/* Propagate the actual write error; the previous hard-coded -ENOMEM
		 * masked the real failure reason reported in the log above.
		 */
		ftl_nv_cache_restore_complete(restore, status);
		return;
	}

	range->num_recovered++;
	if (range->current_addr <= range->last_addr) {
		/* More blocks left in the range - keep reading */
		ftl_nv_cache_recover_block(block);
	} else if (restore->num_outstanding == 0) {
		/* Whole range read and written out - advance to the next one */
		assert(range->num_recovered == range->num_blocks);
		ftl_nv_cache_recover_range(restore);
	}
}

/*
 * Allocate an ftl_io for writing a single recovered block (bypassing the cache)
 * at the given LBA.  Returns NULL if the allocation fails.
 */
static struct ftl_io *
ftl_nv_cache_alloc_io(struct ftl_nv_cache_block *block, uint64_t lba)
{
	struct ftl_restore *restore = SPDK_CONTAINEROF(block->parent, struct ftl_restore, nv_cache);
	struct ftl_io *io;
	struct ftl_io_init_opts opts = {
		.dev		= restore->dev,
		.io		= NULL,
		.flags		= FTL_IO_BYPASS_CACHE,
		.type		= FTL_IO_WRITE,
		.lbk_cnt	= 1,
		.cb_fn		= ftl_nv_cache_write_cb,
		.cb_ctx		= block,
		.data		= block->buf,
	};

	io = ftl_io_init_internal(&opts);
	if (io != NULL) {
		io->lba.single = lba;
	}

	return io;
}

/*
 * Completion callback for a single-block read from the non-volatile cache.
 * If the block belongs to the phase currently being recovered, its data is
 * written onto the OC device; blocks from other (overlapping) phases are
 * skipped and the next block is read instead.
 */
static void
ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ftl_nv_cache_block *block = cb_arg;
	struct ftl_nv_cache_restore *restore = block->parent;
	struct ftl_nv_cache_range *range = &restore->range[restore->phase];
	struct ftl_io *io;
	unsigned int phase;
	uint64_t lba;

	spdk_bdev_free_io(bdev_io);
	restore->num_outstanding--;

	if (!success) {
		SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64"\n",
			    block->offset);
		ftl_nv_cache_restore_complete(restore, -EIO);
		return;
	}

	ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase);
	if (spdk_unlikely(phase != restore->phase)) {
		/*
		 * The block belongs to an overlapping phase - skip it and keep reading.
		 * Use <= to match ftl_nv_cache_recover_block(), which reads blocks up to
		 * and including last_addr; the previous < left the block at last_addr
		 * unread, so a range whose final block belonged to the current phase
		 * could never reach num_recovered == num_blocks.
		 */
		if (range->current_addr <= range->last_addr) {
			ftl_nv_cache_recover_block(block);
		} else if (restore->num_outstanding == 0) {
			ftl_nv_cache_recover_range(restore);
		}

		return;
	}

	io = ftl_nv_cache_alloc_io(block, lba);
	if (spdk_unlikely(!io)) {
		SPDK_ERRLOG("Failed to allocate ftl_io during non-volatile cache recovery\n");
		ftl_nv_cache_restore_complete(restore, -ENOMEM);
		return;
	}

	restore->num_outstanding++;
	ftl_io_write(io);
}

/*
 * Since we have no control over the order in which the requests complete in regards to their
 * submission, the cache can be in either of the following states:
 *  - [1 1 1 1 1 1 1 1 1 1]: simplest case, whole cache contains single phase (although it should be
 *			     very rare),
 *  - [1 1 1 1 3 3 3 3 3 3]: two phases, changing somewhere in the middle with no overlap. This is
 *			     the state left by clean shutdown,
 *  - [1 1 1 1 3 1 3 3 3 3]: similar to the above, but this time the two ranges overlap. This
 *			     happens when completions are reordered during unsafe shutdown,
 *  - [2 1 2 1 1 1 1 3 1 3]: three different phases, each one of which can overlap with
 *			     previous/next one. The data from the oldest phase doesn't need to be
 *			     recovered, as it was already being written to, which means it's
 *			     already on the main storage.
 */
static void
ftl_nv_cache_scan_done(struct ftl_nv_cache_restore *restore)
{
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
#if defined(DEBUG)
	struct ftl_nv_cache_range *dbg_range;
	uint64_t idx, total = 0;

	/* Sanity check: the per-phase ranges must account for every data block */
	for (idx = 0; idx < FTL_NV_CACHE_PHASE_COUNT; ++idx) {
		dbg_range = &restore->range[idx];
		SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Range %"PRIu64": %"PRIu64"-%"PRIu64" (%" PRIu64
			      ")\n", idx, dbg_range->start_addr, dbg_range->last_addr,
			      dbg_range->num_blocks);
		total += dbg_range->num_blocks;
	}
	assert(total == nv_cache->num_data_blocks);
#endif
	if (restore->recovery) {
		restore->phase = ftl_nv_cache_prev_phase(nv_cache->phase);
		/*
		 * Only the latest two phases need to be recovered. The third one, even if present,
		 * already has to be stored on the main storage, as it's already started to be
		 * overwritten (only present here because of reordering of requests' completions).
		 */
		restore->range[nv_cache->phase].recovery = true;
		restore->range[restore->phase].recovery = true;

		ftl_nv_cache_recover_range(restore);
	} else {
		/* Nothing to recover - just pick up where the cache left off */
		ftl_nv_cache_recovery_done(restore);
	}
}

static int ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block);

static void
@@ -669,9 +866,12 @@ ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb)
	}

	for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) {
		nvc_restore->range[i].parent = nvc_restore;
		nvc_restore->range[i].start_addr = FTL_LBA_INVALID;
		nvc_restore->range[i].last_addr = FTL_LBA_INVALID;
		nvc_restore->range[i].num_blocks = 0;
		nvc_restore->range[i].recovery = false;
		nvc_restore->range[i].phase = i;
	}

	rc = spdk_bdev_read_blocks(nv_cache->bdev_desc, ioch->cache_ioch, nv_cache->dma_buf,
@@ -714,6 +914,7 @@ ftl_pad_chunk_pad_finish(struct ftl_restore_band *rband, bool direct_access)
			rband->band->state = FTL_BAND_STATE_CLOSED;
			ftl_band_set_direct_access(rband->band, false);
		}

		next_band = STAILQ_NEXT(rband, stailq);
		if (!next_band) {
			ftl_restore_complete(restore, restore->pad_status);
@@ -896,6 +1097,7 @@ ftl_restore_tail_md_cb(struct ftl_io *io, void *ctx, int status)
			SPDK_ERRLOG("%s while restoring tail md. Will attempt to pad band %u.\n",
				    spdk_strerror(-status), rband->band->id);
			STAILQ_INSERT_TAIL(&restore->pad_bands, rband, stailq);
			restore->nv_cache.recovery = true;
		}
	}