Commit e7e5bc07 authored by Kozlowski Mateusz, committed by Ben Walker
Browse files

FTL: Add initial L2P cache logic



L2P cache allows for partial storing of L2P in memory, paging in and out
as necessary, lowering the total memory consumption.

Signed-off-by: Kozlowski Mateusz <mateusz.kozlowski@intel.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Change-Id: I727fec9d2f0ade4ca73e872d62a2ec10cfdb0a88
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13353


Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
parent 01cec249
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -37,6 +37,9 @@ struct spdk_ftl_conf {
	/* Percentage of base device blocks not exposed to the user */
	uint64_t				overprovisioning;

	/* l2p cache size that could reside in DRAM (in MiB) */
	size_t					l2p_dram_limit;

	/* Core mask - core thread plus additional relocation threads */
	char					*core_mask;

+1 −1
Original line number Diff line number Diff line
@@ -22,7 +22,7 @@ CFLAGS += -I.
FTL_SUBDIRS := mngt utils

C_SRCS = ftl_core.c ftl_init.c ftl_layout.c ftl_debug.c ftl_io.c ftl_sb.c ftl_l2p.c ftl_l2p_flat.c
C_SRCS += ftl_nv_cache.c ftl_band.c ftl_band_ops.c ftl_writer.c ftl_rq.c ftl_reloc.c
C_SRCS += ftl_nv_cache.c ftl_band.c ftl_band_ops.c ftl_writer.c ftl_rq.c ftl_reloc.c ftl_l2p_cache.c
C_SRCS += mngt/ftl_mngt.c mngt/ftl_mngt_bdev.c mngt/ftl_mngt_shutdown.c mngt/ftl_mngt_startup.c
C_SRCS += mngt/ftl_mngt_md.c mngt/ftl_mngt_misc.c mngt/ftl_mngt_ioch.c mngt/ftl_mngt_l2p.c
C_SRCS += mngt/ftl_mngt_band.c
+3 −0
Original line number Diff line number Diff line
@@ -113,6 +113,9 @@ struct spdk_ftl_dev {
	/* Logical -> physical table */
	void				*l2p;

	/* l2p deferred pins list */
	TAILQ_HEAD(, ftl_l2p_pin_ctx)	l2p_deferred_pins;

	/* Size of the l2p table */
	uint64_t			num_lbas;

+14 −2
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
/*
 * Initialize the L2P layer of the device: start with an empty deferred pins
 * list, then run the init routine selected through FTL_L2P_OP.
 *
 * Returns the value returned by the selected init routine.
 */
int
ftl_l2p_init(struct spdk_ftl_dev *dev)
{
	int rc;

	TAILQ_INIT(&dev->l2p_deferred_pins);

	rc = FTL_L2P_OP(init)(dev);
	return rc;
}

@@ -78,12 +79,24 @@ ftl_l2p_clear(struct spdk_ftl_dev *dev, ftl_l2p_cb cb, void *cb_ctx)
/*
 * Run one round of L2P processing.
 *
 * At most one deferred pin request (the one at the head of the deferred pins
 * list) is re-issued per call; afterwards the process routine selected
 * through FTL_L2P_OP is invoked.
 */
void
ftl_l2p_process(struct spdk_ftl_dev *dev)
{
	if (!TAILQ_EMPTY(&dev->l2p_deferred_pins)) {
		struct ftl_l2p_pin_ctx *deferred = TAILQ_FIRST(&dev->l2p_deferred_pins);

		/* Unlink the request first, then retry the pin */
		TAILQ_REMOVE(&dev->l2p_deferred_pins, deferred, link);
		FTL_L2P_OP(pin)(dev, deferred);
	}

	FTL_L2P_OP(process)(dev);
}

/*
 * Tell whether the L2P layer is halted.
 *
 * While any deferred pin is still queued the answer is always false (the
 * backend is not consulted); otherwise the is_halted routine selected through
 * FTL_L2P_OP decides.
 */
bool
ftl_l2p_is_halted(struct spdk_ftl_dev *dev)
{
	return TAILQ_EMPTY(&dev->l2p_deferred_pins) && FTL_L2P_OP(is_halted)(dev);
}

@@ -183,8 +196,7 @@ void
ftl_l2p_pin_complete(struct spdk_ftl_dev *dev, int status, struct ftl_l2p_pin_ctx *pin_ctx)
{
	if (spdk_unlikely(status == -EAGAIN)) {
		/* Path updated in later patch */
		assert(false);
		TAILQ_INSERT_TAIL(&dev->l2p_deferred_pins, pin_ctx, link);
	} else {
		pin_ctx->cb(dev, status, pin_ctx);
	}
+361 −0
Original line number Diff line number Diff line
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 */

#include "spdk/stdinc.h"
#include "spdk/cpuset.h"
#include "spdk/queue.h"
#include "spdk/thread.h"
#include "spdk/event.h"
#include "spdk/ftl.h"
#include "spdk/conf.h"
#include "spdk/env.h"
#include "spdk/util.h"

#include "ftl_core.h"
#include "ftl_l2p_cache.h"
#include "ftl_layout.h"
#include "ftl_nv_cache_io.h"
#include "mngt/ftl_mngt_steps.h"
#include "utils/ftl_defs.h"
#include "utils/ftl_addr_utils.h"

/* IO context embedded in every struct ftl_l2p_page (as its 'ctx' member) */
struct ftl_l2p_cache_page_io_ctx {
	/* L2P cache instance; NOTE(review): presumably the cache owning the page - confirm */
	struct ftl_l2p_cache *cache;
	/* NOTE(review): presumably a snapshot of the page's 'updates' counter taken for the IO - confirm */
	uint64_t updates;
	/* bdev IO wait entry; NOTE(review): presumably used to retry an IO submission later - confirm */
	struct spdk_bdev_io_wait_entry bdev_io_wait;
};

/* State of an L2P cache page; stored in the 'state' member of struct ftl_l2p_page */
enum ftl_l2p_page_state {
	L2P_CACHE_PAGE_INIT,		/* Page in memory not initialized from disk page */
	L2P_CACHE_PAGE_READY,		/* Page initialized from disk */
	L2P_CACHE_PAGE_FLUSHING,	/* Page is being flushed to disk and removed from memory */
	L2P_CACHE_PAGE_PERSISTING,	/* Page is being flushed to disk and not removed from memory */
	L2P_CACHE_PAGE_CLEARING,	/* Page is being initialized with INVALID addresses */
	L2P_CACHE_PAGE_CORRUPTED	/* Page corrupted */
};

/*
 * Context of a single L2P page held by the cache. Objects of this type are
 * allocated from the cache's l2_ctx_pool (see ftl_l2p_cache_init()).
 */
struct ftl_l2p_page {
	uint64_t updates; /* Number of times an L2P entry was updated in the page since it was last persisted */
	TAILQ_HEAD(, ftl_l2p_page_wait_ctx) ppe_list; /* for deferred pins */
	TAILQ_ENTRY(ftl_l2p_page) list_entry; /* List linkage; NOTE(review): presumably for the cache's lru_list - confirm */
	uint64_t page_no; /* NOTE(review): presumably the index of this page within the L2P - confirm */
	enum ftl_l2p_page_state state; /* Current page state */
	uint64_t pin_ref_cnt; /* NOTE(review): presumably the number of outstanding pins on the page - confirm */
	struct ftl_l2p_cache_page_io_ctx ctx; /* IO context of the page */
	bool on_rank_list; /* NOTE(review): presumably tells whether the page is linked on the LRU list - confirm */
	void *page_buffer; /* NOTE(review): presumably the buffer with the page's L2P entries - confirm */
	ftl_df_obj_id obj_id; /* NOTE(review): presumably this object's id within its pool - confirm */
};

/* Forward declaration: ftl_l2p_page_wait_ctx points back at its containing set */
struct ftl_l2p_page_set;

/*
 * Per-page entry of a pin request; struct ftl_l2p_page_set embeds an array of
 * these and struct ftl_l2p_page keeps a list of them (ppe_list).
 */
struct ftl_l2p_page_wait_ctx {
	uint16_t	pg_pin_issued;		/* NOTE(review): presumably set once the pin for this page was issued - confirm */
	uint16_t	pg_pin_completed;	/* NOTE(review): presumably set once the pin for this page completed - confirm */
	struct ftl_l2p_page_set *parent;	/* Page set this entry belongs to */
	uint64_t	pg_no;			/* NOTE(review): presumably the number of the page to pin - confirm */
	TAILQ_ENTRY(ftl_l2p_page_wait_ctx) list_entry;	/* Linkage for lists of wait contexts */
};

/* An L2P page contains 1024 4B entries (or 512 8B ones for big drives).
 * Currently internal IO will only pin 1 LBA at a time, so only one entry should be needed.
 * User IO is split on internal xfer_size boundaries, which is currently set to 1MiB (256 blocks),
 * so one entry should also be enough.
 * TODO: We should probably revisit this though, when/if the xfer_size is based on io requirements of the
 * bottom device (e.g. RAID5F), since then big IOs (especially unaligned ones) could potentially break this.
 *
 * NOTE(review): the limit below is 4 even though the text above argues that a single entry
 * should suffice - presumably headroom; confirm.
 */
#define L2P_MAX_PAGES_TO_PIN 4
/*
 * Set of pages tracked for one pin request. Objects of this type are allocated
 * from the cache's page_pinners_pool (see ftl_l2p_cache_init()).
 */
struct ftl_l2p_page_set {
	uint16_t to_pin_cnt;	/* NOTE(review): presumably the number of pages to pin for the request - confirm */
	uint16_t pinned_cnt;	/* NOTE(review): presumably the number of pages pinned so far - confirm */
	uint16_t pin_fault_cnt;	/* NOTE(review): presumably the number of pages that failed to pin - confirm */
	uint8_t locked;		/* NOTE(review): semantics not visible in this file section - confirm */
	uint8_t deferred;	/* NOTE(review): presumably set while the set waits on deferred_pinner_list - confirm */
	struct ftl_l2p_pin_ctx *pin_ctx;	/* Pin request context associated with this set */
	TAILQ_ENTRY(ftl_l2p_page_set) list_entry;	/* Linkage for lists of page sets (e.g. deferred_pinner_list) */
	struct ftl_l2p_page_wait_ctx entry[L2P_MAX_PAGES_TO_PIN];	/* Per-page entries, at most L2P_MAX_PAGES_TO_PIN */
};

/*
 * Element of the cache's l2_mapping array; the array is sized with one entry
 * per L2P page (see _ftl_l2p_cache_init()).
 */
struct ftl_l2p_l1_map_entry {
	/* Object id of a page; NOTE(review): presumably FTL_DF_OBJ_ID_INVALID when the page is not resident - confirm */
	ftl_df_obj_id page_obj_id;
};

/* Life-cycle state of the L2P cache; stored in the 'state' member of struct ftl_l2p_cache */
enum ftl_l2p_cache_state {
	L2P_CACHE_INIT,			/* Value 0: state of a freshly calloc'ed cache, before init completes */
	L2P_CACHE_RUNNING,		/* Set at the end of a successful ftl_l2p_cache_init() */
	L2P_CACHE_IN_SHUTDOWN,		/* Halt requested while IOs or evictions are still outstanding */
	L2P_CACHE_SHUTDOWN_DONE,	/* Halt finished; reported by ftl_l2p_cache_is_halted() */
};

/* Context of a management process run on the cache (embedded in struct ftl_l2p_cache as 'mctx') */
struct ftl_l2p_cache_process_ctx {
	int status;	/* NOTE(review): presumably the accumulated status of the process - confirm */
	ftl_l2p_cb cb;	/* NOTE(review): presumably the callback to invoke once the process finishes - confirm */
	void *cb_ctx;	/* NOTE(review): presumably the argument passed to 'cb' - confirm */
	uint64_t idx;	/* NOTE(review): presumably the index of the next item to process - confirm */
	uint64_t qd;	/* NOTE(review): presumably the current queue depth of the process - confirm */
};

/* L2P cache instance; stored in dev->l2p once created by ftl_l2p_cache_init() */
struct ftl_l2p_cache {
	/* Device this cache belongs to */
	struct spdk_ftl_dev *dev;
	/* Buffer of l2_md, sized to hold one ftl_l2p_l1_map_entry per L2P page */
	struct ftl_l2p_l1_map_entry *l2_mapping;
	/* Metadata object backing l2_mapping */
	struct ftl_md *l2_md;
	/* Metadata object whose buffer backs l2_ctx_pool */
	struct ftl_md *l2_ctx_md;
	/* Pool of struct ftl_l2p_page objects, one per resident page */
	struct ftl_mempool *l2_ctx_pool;
	/* Metadata object created with one block per resident page */
	struct ftl_md *l1_md;

	TAILQ_HEAD(l2p_lru_list, ftl_l2p_page) lru_list;
	/* TODO: A lot of / and % operations are done on this value, consider adding a shift based field and calculations instead */
	uint64_t lbas_in_page;
	uint64_t num_pages;		/* num pages to hold the entire L2P */

	uint64_t ios_in_flight;		/* Currently in flight IOs, to determine l2p shutdown readiness */
	enum ftl_l2p_cache_state state;
	/* Initialized to the maximum number of resident pages */
	uint32_t l2_pgs_avail;
	/* Initialized to 0; a non-zero value keeps ftl_l2p_cache_halt() from completing the shutdown */
	uint32_t l2_pgs_evicting;
	/* Maximum number of pages allowed to reside in memory, derived from conf.l2p_dram_limit */
	uint32_t l2_pgs_resident_max;
	/* 5% of num_pages (rounded up), capped at FTL_L2P_CACHE_PAGE_AVAIL_MAX */
	uint32_t evict_keep;
	/* Pool of struct ftl_l2p_page_set objects */
	struct ftl_mempool *page_pinners_pool;
	TAILQ_HEAD(, ftl_l2p_page_set) deferred_pinner_list; /* for deferred pinners */

	/* This is a context for a management process */
	struct ftl_l2p_cache_process_ctx mctx;

	/* MD layout cache: Offset on a device in FTL_BLOCK_SIZE unit */
	uint64_t cache_layout_offset;

	/* MD layout cache: Device of region */
	struct spdk_bdev_desc *cache_layout_bdev_desc;

	/* MD layout cache: IO channel of region */
	struct spdk_io_channel *cache_layout_ioch;
};

/*
 * Callback signatures used by the L2P cache. Each receives a status code and an
 * opaque page pointer; NOTE(review): the callers are not visible in this part
 * of the file - confirm the exact meaning of 'ctx_page'/'page'/'user_ctx' there.
 */
typedef void (*ftl_l2p_cache_clear_cb)(struct ftl_l2p_cache *cache, int status, void *ctx_page);
typedef void (*ftl_l2p_cache_persist_cb)(struct ftl_l2p_cache *cache, int status, void *ctx_page);
typedef void (*ftl_l2p_cache_sync_cb)(struct spdk_ftl_dev *dev, int status, void *page,
				      void *user_ctx);

/* Size of a single L1 page in bytes: 1 << 12, i.e. 4096 */
static inline uint64_t
ftl_l2p_cache_get_l1_page_size(void)
{
	const unsigned int page_shift = 12;

	return 1UL << page_shift;
}

/*
 * Total number of bytes accounted for one resident page: the page context
 * (struct ftl_l2p_page) plus one L1 page.
 */
static inline size_t
ftl_l2p_cache_get_page_all_size(void)
{
	size_t total = sizeof(struct ftl_l2p_page);

	total += ftl_l2p_cache_get_l1_page_size();
	return total;
}

/*
 * Allocate the cache object and the metadata object holding the page mapping
 * array (one struct ftl_l2p_l1_map_entry per L2P page).
 *
 * dev       - device the cache is created for
 * addr_size - not referenced by this function
 * l2p_size  - size of the whole L2P in bytes; divided by the L1 page size
 *             (rounding up) to get the number of pages
 *
 * Returns the new cache, or NULL when either allocation fails (nothing is
 * left allocated in that case).
 */
static void *
_ftl_l2p_cache_init(struct spdk_ftl_dev *dev, size_t addr_size, uint64_t l2p_size)
{
	const uint64_t page_cnt = spdk_divide_round_up(l2p_size, ftl_l2p_cache_get_l1_page_size());
	const size_t map_bytes = page_cnt * sizeof(struct ftl_l2p_l1_map_entry);
	const uint64_t map_blocks = spdk_divide_round_up(map_bytes, FTL_BLOCK_SIZE);
	struct ftl_l2p_cache *l2p_cache = calloc(1, sizeof(*l2p_cache));

	if (!l2p_cache) {
		return NULL;
	}

	l2p_cache->dev = dev;
	l2p_cache->l2_md = ftl_md_create(dev, map_blocks, 0, FTL_L2P_CACHE_MD_NAME_L2,
					 ftl_md_create_shm_flags(dev), NULL);
	if (!l2p_cache->l2_md) {
		/* Nothing but the cache object itself to release at this point */
		free(l2p_cache);
		return NULL;
	}

	l2p_cache->l2_mapping = ftl_md_get_buffer(l2p_cache->l2_md);
	l2p_cache->num_pages = page_cnt;
	l2p_cache->lbas_in_page = dev->layout.l2p.lbas_in_page;

	return l2p_cache;
}

/*
 * Create the L2P cache for the device and store it in dev->l2p.
 *
 * Steps performed:
 *  - allocate the cache object and the page mapping array,
 *  - create the pool of page sets used for pin requests,
 *  - derive the maximum number of resident pages from conf.l2p_dram_limit
 *    (given in MiB), capped at the number of pages of the whole L2P,
 *  - create the metadata objects and the pool for the resident page contexts,
 *  - compute evict_keep (5% of all pages, rounded up, at most 16Ki),
 *  - unless this is a fast startup or fast recovery, mark every mapping entry
 *    invalid and initialize the page context pool,
 *  - cache the location of the L2P layout region,
 *  - switch the cache to L2P_CACHE_RUNNING.
 *
 * Returns 0 on success and -1 on failure. On failure after the first
 * allocation, the already created resources stay attached to dev->l2p and the
 * cache state remains L2P_CACHE_INIT, which is a state accepted by
 * ftl_l2p_cache_deinit().
 */
int
ftl_l2p_cache_init(struct spdk_ftl_dev *dev)
{
	uint64_t l2p_size = dev->num_lbas * dev->layout.l2p.addr_size;
	struct ftl_l2p_cache *cache;
	const struct ftl_layout_region *reg;
	void *l2p = _ftl_l2p_cache_init(dev, dev->layout.l2p.addr_size, l2p_size);
	size_t page_pinners_pool_size = 1 << 15;
	size_t max_resident_size, max_resident_pgs;

	if (!l2p) {
		return -1;
	}
	dev->l2p = l2p;

	cache = (struct ftl_l2p_cache *)dev->l2p;
	cache->page_pinners_pool = ftl_mempool_create(page_pinners_pool_size,
				   sizeof(struct ftl_l2p_page_set),
				   64, SPDK_ENV_SOCKET_ID_ANY);
	if (!cache->page_pinners_pool) {
		return -1;
	}

	/* The configured limit is expressed in MiB */
	max_resident_size = dev->conf.l2p_dram_limit << 20;
	max_resident_pgs = max_resident_size / ftl_l2p_cache_get_page_all_size();

	if (max_resident_pgs > cache->num_pages) {
		SPDK_NOTICELOG("l2p memory limit higher than entire L2P size\n");
		max_resident_pgs = cache->num_pages;
	}

	/* Round down max res pgs to the nearest # of l2/l1 pgs */
	max_resident_size = max_resident_pgs * ftl_l2p_cache_get_page_all_size();
	/* Both values are size_t, so they are printed with %zu rather than PRIu64 */
	SPDK_NOTICELOG("l2p maximum resident size is: %zu (of %zu) MiB\n",
		       max_resident_size >> 20, dev->conf.l2p_dram_limit);

	TAILQ_INIT(&cache->deferred_pinner_list);
	TAILQ_INIT(&cache->lru_list);

	cache->l2_ctx_md = ftl_md_create(dev,
					 spdk_divide_round_up(max_resident_pgs * SPDK_ALIGN_CEIL(sizeof(struct ftl_l2p_page), 64),
							 FTL_BLOCK_SIZE), 0, FTL_L2P_CACHE_MD_NAME_L2_CTX, ftl_md_create_shm_flags(dev), NULL);

	if (cache->l2_ctx_md == NULL) {
		return -1;
	}

	cache->l2_pgs_resident_max = max_resident_pgs;
	cache->l2_pgs_avail = max_resident_pgs;
	cache->l2_pgs_evicting = 0;
	/* The page context pool lives inside the buffer of l2_ctx_md */
	cache->l2_ctx_pool = ftl_mempool_create_ext(ftl_md_get_buffer(cache->l2_ctx_md),
			     max_resident_pgs, sizeof(struct ftl_l2p_page), 64);

	if (cache->l2_ctx_pool == NULL) {
		return -1;
	}

	/* Replacement lists are parenthesized so the macros expand safely in any expression */
#define FTL_L2P_CACHE_PAGE_AVAIL_MAX            (16UL << 10)
#define FTL_L2P_CACHE_PAGE_AVAIL_RATIO          5UL
	cache->evict_keep = spdk_divide_round_up(cache->num_pages * FTL_L2P_CACHE_PAGE_AVAIL_RATIO, 100);
	cache->evict_keep = spdk_min(FTL_L2P_CACHE_PAGE_AVAIL_MAX, cache->evict_keep);

	if (!ftl_fast_startup(dev) && !ftl_fast_recovery(dev)) {
		/* Byte-wise fill of the mapping array with the low byte of FTL_DF_OBJ_ID_INVALID */
		memset(cache->l2_mapping, (int)FTL_DF_OBJ_ID_INVALID, ftl_md_get_buffer_size(cache->l2_md));
		ftl_mempool_initialize_ext(cache->l2_ctx_pool);
	}

	cache->l1_md = ftl_md_create(dev,
				     max_resident_pgs, 0,
				     FTL_L2P_CACHE_MD_NAME_L1,
				     ftl_md_create_shm_flags(dev), NULL);

	if (cache->l1_md == NULL) {
		return -1;
	}

	/* Cache MD layout */
	reg = &dev->layout.region[FTL_LAYOUT_REGION_TYPE_L2P];
	cache->cache_layout_offset = reg->current.offset;
	cache->cache_layout_bdev_desc = reg->bdev_desc;
	cache->cache_layout_ioch = reg->ioch;

	cache->state = L2P_CACHE_RUNNING;
	return 0;
}

/*
 * Release everything ftl_l2p_cache_init() attached to the cache on top of the
 * object created by _ftl_l2p_cache_init(): the page context metadata object
 * and pool, the L1 metadata object and the page set pool. Every pointer is
 * reset to NULL after its resource is destroyed. cache->l2_md and the cache
 * object itself are not touched here.
 *
 * NOTE(review): after a failed init some of these pointers are still NULL, so
 * the destroy helpers are presumably NULL-tolerant - confirm.
 */
static void
ftl_l2p_cache_deinit_l2(struct spdk_ftl_dev *dev, struct ftl_l2p_cache *cache)
{
	ftl_md_destroy(cache->l2_ctx_md, ftl_md_destroy_shm_flags(dev));
	cache->l2_ctx_md = NULL;

	/* This pool was created on top of the l2_ctx_md buffer (see ftl_l2p_cache_init()) */
	ftl_mempool_destroy_ext(cache->l2_ctx_pool);
	cache->l2_ctx_pool = NULL;

	ftl_md_destroy(cache->l1_md, ftl_md_destroy_shm_flags(dev));
	cache->l1_md = NULL;

	ftl_mempool_destroy(cache->page_pinners_pool);
	cache->page_pinners_pool = NULL;
}

/*
 * Tear down the cache stored in dev->l2p: release the resources added by
 * ftl_l2p_cache_init(), then the mapping metadata object, then the cache
 * object itself. dev->l2p is left untouched (the caller resets it).
 */
static void
_ftl_l2p_cache_deinit(struct spdk_ftl_dev *dev)
{
	struct ftl_l2p_cache *l2p_cache = dev->l2p;

	ftl_l2p_cache_deinit_l2(dev, l2p_cache);

	ftl_md_destroy(l2p_cache->l2_md, ftl_md_destroy_shm_flags(dev));

	free(l2p_cache);
}

/*
 * Destroy the L2P cache of the device, if there is one.
 *
 * Does nothing when dev->l2p is NULL. Otherwise the cache is expected to be
 * either fully shut down (L2P_CACHE_SHUTDOWN_DONE) or never started
 * (L2P_CACHE_INIT); this is checked with an assert. Afterwards dev->l2p is
 * reset to NULL.
 */
void
ftl_l2p_cache_deinit(struct spdk_ftl_dev *dev)
{
	struct ftl_l2p_cache *cache = (struct ftl_l2p_cache *)dev->l2p;

	if (!cache) {
		return;
	}
	assert(cache->state == L2P_CACHE_SHUTDOWN_DONE || cache->state == L2P_CACHE_INIT);

	_ftl_l2p_cache_deinit(dev);
	/* dev->l2p is a pointer: reset it with NULL rather than the integer 0 */
	dev->l2p = NULL;
}

static void
clear_cb(struct spdk_ftl_dev *dev, struct ftl_md *md, int status)
{
	ftl_l2p_cb cb = md->owner.private;
	void *cb_cntx = md->owner.cb_ctx;

	cb(dev, status, cb_cntx);
}

/*
 * Start clearing the L2P metadata region with the FTL_ADDR_INVALID pattern.
 *
 * cb and cb_ctx are stored in the metadata object's owner fields and clear_cb
 * is installed as its callback; clear_cb later calls cb(dev, status, cb_ctx).
 */
void
ftl_l2p_cache_clear(struct spdk_ftl_dev *dev, ftl_l2p_cb cb, void *cb_ctx)
{
	struct ftl_md *l2p_md = dev->layout.md[FTL_LAYOUT_REGION_TYPE_L2P];
	const ftl_addr fill_pattern = FTL_ADDR_INVALID;

	/* Stash the user callback where clear_cb will look for it */
	l2p_md->owner.private = cb;
	l2p_md->owner.cb_ctx = cb_ctx;
	l2p_md->cb = clear_cb;

	ftl_md_clear(l2p_md, fill_pattern, NULL);
}

bool
ftl_l2p_cache_is_halted(struct spdk_ftl_dev *dev)
{
	struct ftl_l2p_cache *cache = (struct ftl_l2p_cache *)dev->l2p;

	return cache->state == L2P_CACHE_SHUTDOWN_DONE;
}

/*
 * Request the cache to halt.
 *
 * A cache that is already in L2P_CACHE_SHUTDOWN_DONE is left alone. Otherwise
 * the state becomes L2P_CACHE_SHUTDOWN_DONE when there are neither IOs in
 * flight nor pages being evicted, and L2P_CACHE_IN_SHUTDOWN when there are.
 */
void
ftl_l2p_cache_halt(struct spdk_ftl_dev *dev)
{
	struct ftl_l2p_cache *l2p_cache = dev->l2p;
	bool idle;

	if (l2p_cache->state == L2P_CACHE_SHUTDOWN_DONE) {
		return;
	}

	idle = (l2p_cache->ios_in_flight == 0) && (l2p_cache->l2_pgs_evicting == 0);
	l2p_cache->state = idle ? L2P_CACHE_SHUTDOWN_DONE : L2P_CACHE_IN_SHUTDOWN;
}

/*
 * Periodic processing hook of the L2P cache. Returns right away unless the
 * cache is in the L2P_CACHE_RUNNING state; no further work is performed in
 * this version of the function.
 */
void
ftl_l2p_cache_process(struct spdk_ftl_dev *dev)
{
	struct ftl_l2p_cache *l2p_cache = dev->l2p;

	if (spdk_unlikely(l2p_cache->state != L2P_CACHE_RUNNING)) {
		/* Not running: nothing to process */
		return;
	}
}
Loading