Commit 41fd1f6b authored by Mateusz Kozlowski, committed by Tomasz Zawadzki

lib/blob: Release cluster on unmap

A cluster will now be released when an unmap request covers its entire
LBA range.
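
For illustration, a minimal sketch of how a caller can exercise the new
path. The helper below is hypothetical and not part of this change, but
spdk_blob_io_unmap(), spdk_bs_get_cluster_size() and
spdk_bs_get_io_unit_size() are the existing public API:

    /* Unmap exactly one cluster of a thin-provisioned blob so that the
     * blobstore can release it. Assumes `bs` is a loaded blobstore, `blob`
     * an open blob, and `ch` a channel from spdk_bs_alloc_io_channel(). */
    static void
    unmap_one_cluster(struct spdk_blob_store *bs, struct spdk_blob *blob,
                      struct spdk_io_channel *ch, uint64_t cluster_idx,
                      spdk_blob_op_complete cb_fn, void *cb_arg)
    {
            /* Length of one cluster expressed in io_units. */
            uint64_t io_units = spdk_bs_get_cluster_size(bs) /
                                spdk_bs_get_io_unit_size(bs);

            /* Offset and length are cluster aligned, so the backing cluster
             * is released once the unmap and the metadata sync complete. */
            spdk_blob_io_unmap(blob, ch, cluster_idx * io_units, io_units,
                               cb_fn, cb_arg);
    }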

Change-Id: I2e204a34b6ebea384be0d614801f3e9a20f2fed7
Signed-off-by: Mateusz Kozlowski <mateusz.kozlowski@solidigm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/21907
Reviewed-by: Ben Walker <ben@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <jim.harris@samsung.com>
Community-CI: Mellanox Build Bot
parent 5846a390
+197 −0
@@ -30,6 +30,8 @@ static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
		uint64_t cluster, uint32_t extent, struct spdk_blob_md_page *page,
		spdk_blob_op_complete cb_fn, void *cb_arg);
static void blob_free_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
		uint32_t extent_page, struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg);

static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
			  uint16_t value_len, bool internal);
@@ -2545,6 +2547,15 @@ struct spdk_blob_copy_cluster_ctx {
	struct spdk_blob_md_page *new_cluster_page;
};

struct spdk_blob_free_cluster_ctx {
	struct spdk_blob *blob;
	uint64_t page;
	struct spdk_blob_md_page *md_page;
	uint64_t cluster_num;
	uint32_t extent_page;
	spdk_bs_sequence_t *seq;
};

static void
blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno)
{
@@ -2570,6 +2581,17 @@ blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno)
	free(ctx);
}

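/* Completion of the whole cluster-free flow: finish the sequence that
 * carried the user's unmap request and release the context. */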
static void
blob_free_cluster_cpl(void *cb_arg, int bserrno)
{
	struct spdk_blob_free_cluster_ctx *ctx = cb_arg;
	spdk_bs_sequence_t *seq = ctx->seq;

	bs_sequence_finish(seq, bserrno);

	free(ctx);
}

static void
blob_insert_cluster_cpl(void *cb_arg, int bserrno)
{
@@ -2932,6 +2954,21 @@ blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob,
	blob_request_submit_op_split_next(ctx, 0);
}

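/* Called when the batched unmap of the cluster's data completes; on success,
 * continue on the md thread to free the cluster from the blob's metadata. */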
static void
spdk_free_cluster_unmap_complete(void *cb_arg, int bserrno)
{
	struct spdk_blob_free_cluster_ctx *ctx = cb_arg;

	if (bserrno) {
		bs_sequence_finish(ctx->seq, bserrno);
		free(ctx);
		return;
	}

	blob_free_cluster_on_md_thread(ctx->blob, ctx->cluster_num,
				       ctx->extent_page, ctx->md_page, blob_free_cluster_cpl, ctx);
}

static void
blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob,
			      void *payload, uint64_t offset, uint64_t length,
@@ -3026,10 +3063,59 @@ blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blo
		break;
	}
	case SPDK_BLOB_UNMAP: {
		struct spdk_blob_free_cluster_ctx *ctx = NULL;
		spdk_bs_batch_t *batch;

		/* If the unmap is aligned to a whole allocated cluster, release the cluster itself */
		if (spdk_blob_is_thin_provisioned(blob) && is_allocated &&
		    bs_io_units_per_cluster(blob) == length) {
			struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch);
			uint32_t cluster_start_page;
			uint32_t cluster_number;

			assert(offset % bs_io_units_per_cluster(blob) == 0);

			/* Round the io_unit offset down to the first page in the cluster */
			cluster_start_page = bs_io_unit_to_cluster_start(blob, offset);

			/* Calculate which index in the metadata cluster array the corresponding
			 * cluster is supposed to be at. */
			cluster_number = bs_io_unit_to_cluster_number(blob, offset);

			ctx = calloc(1, sizeof(*ctx));
			if (!ctx) {
				cb_fn(cb_arg, -ENOMEM);
				return;
			}
			/* When freeing a cluster the flow should be (in order):
			 * 1. Unmap the underlying area (so if the cluster is reclaimed in the future, it won't leak
			 * old data)
			 * 2. Once the unmap completes (to avoid any races with incoming writes that may claim the
			 * cluster), update and sync metadata freeing the cluster
			 * 3. Once metadata update is done, complete the user unmap request
			 */
			ctx->blob = blob;
			ctx->page = cluster_start_page;
			ctx->cluster_num = cluster_number;
			ctx->md_page = bs_channel->new_cluster_page;
			ctx->seq = bs_sequence_start_bs(_ch, &cpl);
			if (!ctx->seq) {
				free(ctx);
				cb_fn(cb_arg, -ENOMEM);
				return;
			}

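			/* Remember which extent page currently maps this cluster so the
			 * md thread can update it or free it. */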
			if (blob->use_extent_table) {
				ctx->extent_page = *bs_cluster_to_extent_page(blob, cluster_number);
			}

			cpl.u.blob_basic.cb_fn = spdk_free_cluster_unmap_complete;
			cpl.u.blob_basic.cb_arg = ctx;
		}

		batch = bs_batch_open(_ch, &cpl, blob);
		if (!batch) {
			free(ctx);
			cb_fn(cb_arg, -ENOMEM);
			return;
		}
@@ -7883,6 +7969,46 @@ struct spdk_blob_write_extent_page_ctx {
	struct spdk_blob_md_page	*page;
};

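/* Final callback on the md thread: return the cluster to the blobstore's
 * free pool and message the result back to the originating thread. */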
static void
blob_free_cluster_msg_cb(void *arg, int bserrno)
{
	struct spdk_blob_cluster_op_ctx *ctx = arg;

	spdk_spin_lock(&ctx->blob->bs->used_lock);
	bs_release_cluster(ctx->blob->bs, bs_lba_to_cluster(ctx->blob->bs, ctx->cluster));
	spdk_spin_unlock(&ctx->blob->bs->used_lock);

	ctx->rc = bserrno;
	spdk_thread_send_msg(ctx->thread, blob_op_cluster_msg_cpl, ctx);
}

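/* The rewritten extent page still maps other clusters; sync the blob's
 * metadata, unless an error occurred or the blobstore is not marked clean. */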
static void
blob_free_cluster_update_ep_cb(void *arg, int bserrno)
{
	struct spdk_blob_cluster_op_ctx *ctx = arg;

	if (bserrno != 0 || ctx->blob->bs->clean == 0) {
		blob_free_cluster_msg_cb(ctx, bserrno);
		return;
	}

	ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
	blob_sync_md(ctx->blob, blob_free_cluster_msg_cb, ctx);
}

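/* The extent page no longer maps any clusters: release its metadata page,
 * then sync the blob's metadata. */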
static void
blob_free_cluster_free_ep_cb(void *arg, int bserrno)
{
	struct spdk_blob_cluster_op_ctx *ctx = arg;

	spdk_spin_lock(&ctx->blob->bs->used_lock);
	assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
	bs_release_md_page(ctx->blob->bs, ctx->extent_page);
	spdk_spin_unlock(&ctx->blob->bs->used_lock);
	ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
	blob_sync_md(ctx->blob, blob_free_cluster_msg_cb, ctx);
}

static void
blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
@@ -8018,6 +8144,77 @@ blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
	spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx);
}

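/* Runs on the md thread: detach the cluster from the blob's cluster map,
 * then either sync metadata directly (extents_rle) or rewrite the owning
 * extent page first. */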
static void
blob_free_cluster_msg(void *arg)
{
	struct spdk_blob_cluster_op_ctx *ctx = arg;
	uint32_t *extent_page;
	uint32_t start_cluster_idx;
	bool free_extent_page = true;
	size_t i;

	ctx->cluster = ctx->blob->active.clusters[ctx->cluster_num];
	ctx->blob->active.clusters[ctx->cluster_num] = 0;

	if (ctx->blob->use_extent_table == false) {
		/* Extent table is not used, proceed with sync of md that will only use extents_rle. */
		spdk_spin_lock(&ctx->blob->bs->used_lock);
		bs_release_cluster(ctx->blob->bs, bs_lba_to_cluster(ctx->blob->bs, ctx->cluster));
		spdk_spin_unlock(&ctx->blob->bs->used_lock);
		ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
		blob_sync_md(ctx->blob, blob_op_cluster_msg_cb, ctx);
		return;
	}

	extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);

	/* There shouldn't be parallel release operations on the same cluster */
	assert(*extent_page == ctx->extent_page);

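	/* Check whether any other cluster mapped by this extent page is still
	 * allocated; if none is, the extent page itself can be freed. */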
	start_cluster_idx = (ctx->cluster_num / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;
	for (i = 0; i < SPDK_EXTENTS_PER_EP; ++i) {
		if (ctx->blob->active.clusters[start_cluster_idx + i] != 0) {
			free_extent_page = false;
			break;
		}
	}

	if (free_extent_page) {
		assert(ctx->extent_page != 0);
		assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
		ctx->blob->active.extent_pages[bs_cluster_to_extent_table_id(ctx->cluster_num)] = 0;
		blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, ctx->page,
				       blob_free_cluster_free_ep_cb, ctx);
	} else {
		blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, ctx->page,
				       blob_free_cluster_update_ep_cb, ctx);
	}
}

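/* Entry point from the I/O path: capture the request and hop to the md
 * thread, where cluster maps and allocation bitmaps may be modified safely. */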
static void
blob_free_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, uint32_t extent_page,
			       struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	struct spdk_blob_cluster_op_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->thread = spdk_get_thread();
	ctx->blob = blob;
	ctx->cluster_num = cluster_num;
	ctx->extent_page = extent_page;
	ctx->page = page;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_thread_send_msg(blob->bs->md_thread, blob_free_cluster_msg, ctx);
}

/* START spdk_blob_close */

static void
+14 −0
@@ -4509,6 +4509,20 @@ blob_thin_prov_write_count_io(void)
		 * For extent table metadata, we should have written the I/O and the extent metadata page.
		 */
		CU_ASSERT((g_dev_write_bytes - write_bytes) / page_size == 2);

		/* Send unmap aligned to the whole cluster - should free it up */
		g_bserrno = -1;
		spdk_blob_io_unmap(blob, ch, pages_per_extent_page * i, pages_per_cluster, blob_op_complete, NULL);
		poll_threads();
		CU_ASSERT(g_bserrno == 0);
		CU_ASSERT(free_clusters - (2 * i + 1) == spdk_bs_free_cluster_count(bs));

		/* Write back to the freed cluster */
		g_bserrno = -1;
		spdk_blob_io_write(blob, ch, payload_write, pages_per_extent_page * i, 1, blob_op_complete, NULL);
		poll_threads();
		CU_ASSERT(g_bserrno == 0);
		CU_ASSERT(free_clusters - (2 * i + 2) == spdk_bs_free_cluster_count(bs));
	}

	ut_blob_close_and_delete(bs, blob);