Commit 2e7a7fe5 authored by Evgeniy Kochetov's avatar Evgeniy Kochetov Committed by Tomasz Zawadzki
Browse files

blob: Optimize copy-on-write flow for clusters backed by zeroes device



Writing to unallocated cluster triggers copy-on-write sequence. If
this cluster is backed by zeroes device we can skip the copy part. For
a simple thin provisioned volume copy this shortcut is already
implemented because `blob->parent_id == SPDK_BLOBID_INVALID`. But this
will not work for thin provisioned volumes created from snapshot. In
this case we need to traverse the whole stack of underlying
`spdk_bs_dev` devices for specific cluster to check if it is zeroes
backed.

This patch adds an `is_zeroes` operation to `spdk_bs_dev`. The zeroes
device always returns true; a real bdev-backed device always returns
false; and a blob-backed layer (`blob_bs_dev`) performs the LBA
conversion and forwards the query to its own backing device.

In blobstore's cluster copy flow we check if cluster is backed by
zeroes device and skip copy part if it is.

Signed-off-by: default avatarEvgeniy Kochetov <evgeniik@nvidia.com>
Change-Id: I640773ac78f8f466b96e96a34c3a6c3c91f87dab
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13446


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Reviewed-by: default avatarTomasz Zawadzki <tomasz.zawadzki@intel.com>
parent ab58ddf1
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -27,6 +27,9 @@ For now we are using hard-coded PSK and only support TLS 1.3
Reserve space for used_cluster bitmap. The reserved space could be used for blobstore growing
in the future.

Added an `is_zeroes` operation to `spdk_bs_dev`. It allows detecting whether logical blocks are
backed by the zeroes device, enabling a copy-on-write shortcut that skips the copy step entirely
when the backing data is all zeroes.

### lvol

Add num_md_pages_per_cluster_ratio parameter to the bdev_lvol_create_lvstore RPC.
+3 −1
Original line number Diff line number Diff line
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *   Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

/** \file
@@ -195,6 +195,8 @@ struct spdk_bs_dev {

	struct spdk_bdev *(*get_base_bdev)(struct spdk_bs_dev *dev);

	bool (*is_zeroes)(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count);

	uint64_t	blockcnt;
	uint32_t	blocklen; /* In bytes */
};
+19 −0
Original line number Diff line number Diff line
@@ -117,6 +117,24 @@ blob_bs_dev_destroy(struct spdk_bs_dev *bs_dev)
	spdk_blob_close(b->blob, blob_bs_dev_destroy_cpl, b);
}

static bool
blob_bs_is_zeroes(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
{
	struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev;
	struct spdk_blob *blob = b->blob;

	assert(lba == bs_cluster_to_lba(blob->bs, bs_lba_to_cluster(blob->bs, lba)));
	assert(lba_count == bs_dev_byte_to_lba(dev, blob->bs->cluster_sz));

	if (bs_io_unit_is_allocated(blob, lba)) {
		return false;
	}

	assert(blob->back_bs_dev != NULL);
	return blob->back_bs_dev->is_zeroes(blob->back_bs_dev,
					    bs_io_unit_to_back_dev_lba(blob, lba),
					    bs_io_unit_to_back_dev_lba(blob, lba_count));
}

struct spdk_bs_dev *
bs_create_blob_bs_dev(struct spdk_blob *blob)
@@ -142,6 +160,7 @@ bs_create_blob_bs_dev(struct spdk_blob *blob)
	b->bs_dev.readv_ext = blob_bs_dev_readv_ext;
	b->bs_dev.write_zeroes = blob_bs_dev_write_zeroes;
	b->bs_dev.unmap = blob_bs_dev_unmap;
	b->bs_dev.is_zeroes = blob_bs_is_zeroes;
	b->blob = blob;

	return &b->bs_dev;
+6 −2
Original line number Diff line number Diff line
@@ -2399,6 +2399,7 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
	struct spdk_blob_copy_cluster_ctx *ctx;
	uint32_t cluster_start_page;
	uint32_t cluster_number;
	bool is_zeroes;
	int rc;

	ch = spdk_io_channel_get_ctx(_ch);
@@ -2431,7 +2432,10 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
	ctx->new_cluster_page = ch->new_cluster_page;
	memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE);

	if (blob->parent_id != SPDK_BLOBID_INVALID) {
	is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev,
			bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
			bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz));
	if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) {
		ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen,
				       NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
		if (!ctx->buf) {
@@ -2472,7 +2476,7 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
	/* Queue the user op to block other incoming operations */
	TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);

	if (blob->parent_id != SPDK_BLOBID_INVALID) {
	if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) {
		/* Read cluster from backing device */
		bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf,
					bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
+7 −0
Original line number Diff line number Diff line
@@ -118,6 +118,12 @@ zeroes_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	assert(false);
}

/*
 * is_zeroes callback for the zeroes bs_dev.
 *
 * Every block this device exposes reads as zeroes, so any LBA range is
 * trivially all-zero and the answer is unconditionally true.
 */
static bool
zeroes_is_zeroes(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
{
	/* Explicitly discard the parameters: the result does not depend on
	 * them, and this silences -Wunused-parameter in -Wextra builds.
	 */
	(void)dev;
	(void)lba;
	(void)lba_count;

	return true;
}

static struct spdk_bs_dev g_zeroes_bs_dev = {
	.blockcnt = UINT64_MAX,
	.blocklen = 512,
@@ -132,6 +138,7 @@ static struct spdk_bs_dev g_zeroes_bs_dev = {
	.writev_ext = zeroes_writev_ext,
	.write_zeroes = zeroes_write_zeroes,
	.unmap = zeroes_unmap,
	.is_zeroes = zeroes_is_zeroes,
};

struct spdk_bs_dev *
Loading