Commit b7bfa504 authored by Evgeniy Kochetov, committed by Tomasz Zawadzki
Browse files

blob: Use bdev copy command in CoW flow if supported



Copy-on-write happens when a cluster is written for the first time on a
thin-provisioned volume. Currently it is implemented as two separate
requests to the underlying bdev: a read of the whole cluster into a bounce
buffer, followed by a write of that buffer to the new location on the same
underlying bdev.

This patch improves the copy-on-write flow by using the copy command of
the underlying bdev if it is supported. In that case there is just one
request to the bdev and the bounce buffer is not needed.

Signed-off-by: Evgeniy Kochetov <evgeniik@nvidia.com>
Change-Id: I92552e0f18f7a41820d589e7bb1e86160c69183f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14351


Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
parent 9e843fdb
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -206,6 +206,10 @@ struct spdk_bs_dev {
	 */
	bool (*translate_lba)(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba);

	void (*copy)(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
		     uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count,
		     struct spdk_bs_dev_cb_args *cb_args);

	uint64_t	blockcnt;
	uint32_t	blocklen; /* In bytes */
};
+36 −6
Original line number Diff line number Diff line
@@ -2389,6 +2389,28 @@ blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
			      blob_write_copy_cpl, ctx);
}

/*
 * Report whether the cluster starting at cluster_start_page can be populated
 * with a device-side copy instead of a read into a bounce buffer plus a write.
 *
 * The answer is true only when the blobstore device exposes a copy callback
 * and the backing device's translate_lba callback returns true for the
 * cluster's first LBA. base_lba is handed straight to translate_lba;
 * presumably it receives the translated LBA on success (confirm against the
 * translate_lba implementations). It is not touched when the blobstore device
 * has no copy callback.
 */
static bool
blob_can_copy(struct spdk_blob *blob, uint32_t cluster_start_page, uint64_t *base_lba)
{
	uint64_t back_lba = bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page);

	/* Without a copy callback on the blobstore device there is nothing to offload to. */
	if (blob->bs->dev->copy == NULL) {
		return false;
	}

	return blob->back_bs_dev->translate_lba(blob->back_bs_dev, back_lba, base_lba);
}

/*
 * Queue a copy of one whole cluster on the sequence held in ctx.
 *
 * The destination is the first LBA of ctx->new_cluster, the source is
 * src_lba, and the length is the blobstore cluster size expressed in blocks
 * of the blob's backing device. blob_write_copy_cpl is registered as the
 * sequence callback with ctx as its argument.
 *
 * The op parameter is not used in this function.
 */
static void
blob_copy(struct spdk_blob_copy_cluster_ctx *ctx, spdk_bs_user_op_t *op, uint64_t src_lba)
{
	struct spdk_blob *blob = ctx->blob;
	uint64_t cluster_blocks;
	uint64_t dst_lba;

	cluster_blocks = bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz);
	dst_lba = bs_cluster_to_lba(blob->bs, ctx->new_cluster);

	bs_sequence_copy_dev(ctx->seq, dst_lba, src_lba, cluster_blocks,
			     blob_write_copy_cpl, ctx);
}

static void
bs_allocate_and_copy_cluster(struct spdk_blob *blob,
			     struct spdk_io_channel *_ch,
@@ -2400,6 +2422,8 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
	uint32_t cluster_start_page;
	uint32_t cluster_number;
	bool is_zeroes;
	bool can_copy;
	uint64_t copy_src_lba;
	int rc;

	ch = spdk_io_channel_get_ctx(_ch);
@@ -2431,11 +2455,12 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
	ctx->page = cluster_start_page;
	ctx->new_cluster_page = ch->new_cluster_page;
	memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE);
	can_copy = blob_can_copy(blob, cluster_start_page, &copy_src_lba);

	is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev,
			bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
			bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz));
	if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) {
	if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes && !can_copy) {
		ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen,
				       NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
		if (!ctx->buf) {
@@ -2477,11 +2502,16 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
	TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);

	if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) {
		if (can_copy) {
			blob_copy(ctx, op, copy_src_lba);
		} else {
			/* Read cluster from backing device */
			bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf,
						bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
						bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz),
						blob_write_copy, ctx);
		}

	} else {
		blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
						 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx);
+16 −0
Original line number Diff line number Diff line
@@ -238,6 +238,22 @@ bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq,
				   &set->cb_args);
}

/*
 * Submit a copy of lba_count blocks from src_lba to dst_lba through the copy
 * callback of the device attached to the sequence's channel.
 *
 * cb_fn and cb_arg are stored as the sequence callback before the request is
 * issued. The callback pointer is invoked without a NULL check, so the caller
 * is responsible for making sure the device provides one.
 */
void
bs_sequence_copy_dev(spdk_bs_sequence_t *seq, uint64_t dst_lba, uint64_t src_lba,
		     uint64_t lba_count, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
{
	struct spdk_bs_request_set *req_set = (struct spdk_bs_request_set *)seq;
	struct spdk_bs_channel     *bs_ch = req_set->channel;

	SPDK_DEBUGLOG(blob_rw, "Copying %" PRIu64 " blocks from LBA %" PRIu64 " to LBA %" PRIu64 "\n",
		      lba_count, src_lba, dst_lba);

	/* Record the completion callback for this step of the sequence. */
	req_set->u.sequence.cb_arg = cb_arg;
	req_set->u.sequence.cb_fn = cb_fn;

	bs_ch->dev->copy(bs_ch->dev, bs_ch->dev_channel,
			 dst_lba, src_lba, lba_count,
			 &req_set->cb_args);
}

void
bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno)
{
+4 −0
Original line number Diff line number Diff line
@@ -150,6 +150,10 @@ void bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq,
				  uint64_t lba, uint64_t lba_count,
				  spdk_bs_sequence_cpl cb_fn, void *cb_arg);

/*
 * Issue a copy of lba_count blocks from src_lba to dst_lba as a step of the
 * sequence seq. cb_fn and cb_arg are recorded as the sequence callback.
 */
void bs_sequence_copy_dev(spdk_bs_sequence_t *seq,
			  uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count,
			  spdk_bs_sequence_cpl cb_fn, void *cb_arg);

void bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno);

void bs_user_op_sequence_finish(void *cb_arg, int bserrno);
+38 −10
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@ struct blob_resubmit {
	void *payload;
	int iovcnt;
	uint64_t lba;
	uint64_t src_lba;
	uint32_t lba_count;
	struct spdk_bs_dev_cb_args *cb_args;
	struct spdk_blob_ext_io_opts *ext_io_opts;
@@ -64,8 +65,9 @@ bdev_blob_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *arg)

static void
bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
		   int iovcnt, uint64_t lba, uint32_t lba_count, enum spdk_bdev_io_type io_type,
		   struct spdk_bs_dev_cb_args *cb_args, struct spdk_blob_ext_io_opts *ext_io_opts)
		   int iovcnt, uint64_t lba, uint64_t src_lba, uint32_t lba_count,
		   enum spdk_bdev_io_type io_type, struct spdk_bs_dev_cb_args *cb_args,
		   struct spdk_blob_ext_io_opts *ext_io_opts)
{
	int rc;
	struct spdk_bdev *bdev = __get_bdev(dev);
@@ -85,6 +87,7 @@ bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, voi
	ctx->payload = payload;
	ctx->iovcnt = iovcnt;
	ctx->lba = lba;
	ctx->src_lba = src_lba;
	ctx->lba_count = lba_count;
	ctx->cb_args = cb_args;
	ctx->bdev_io_wait.bdev = bdev;
@@ -110,7 +113,7 @@ bdev_blob_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *p
	rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba,
				   lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, payload, 0, lba,
		bdev_blob_queue_io(dev, channel, payload, 0, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@@ -126,7 +129,7 @@ bdev_blob_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *
	rc = spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba,
				    lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, payload, 0, lba,
		bdev_blob_queue_io(dev, channel, payload, 0, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@@ -143,7 +146,7 @@ bdev_blob_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba,
				    lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba,
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@@ -160,7 +163,7 @@ bdev_blob_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba,
				     lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba,
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@@ -187,7 +190,7 @@ bdev_blob_readv_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	rc = spdk_bdev_readv_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count,
					bdev_blob_io_complete, cb_args, bdev_io_opts);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args,
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args,
				   io_opts);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@@ -214,7 +217,7 @@ bdev_blob_writev_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	rc = spdk_bdev_writev_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count,
					 bdev_blob_io_complete, cb_args, bdev_io_opts);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args,
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args,
				   io_opts);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@@ -230,7 +233,7 @@ bdev_blob_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba,
					   lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, NULL, 0, lba,
		bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@@ -248,7 +251,7 @@ bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64
		rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count,
					    bdev_blob_io_complete, cb_args);
		if (rc == -ENOMEM) {
			bdev_blob_queue_io(dev, channel, NULL, 0, lba,
			bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0,
					   lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args, NULL);
		} else if (rc != 0) {
			cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@@ -263,6 +266,24 @@ bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64
	}
}

/*
 * Copy lba_count blocks from src_lba to dst_lba on the bdev behind dev.
 *
 * On successful submission, bdev_blob_io_complete is the completion callback
 * and receives cb_args. If submission returns -ENOMEM the request is handed
 * to bdev_blob_queue_io as an SPDK_BDEV_IO_TYPE_COPY request. Any other
 * non-zero return code is reported immediately through cb_args->cb_fn.
 *
 * NOTE(review): bdev_blob_queue_io takes lba_count as uint32_t, so the 64-bit
 * count is narrowed on the -ENOMEM path. Confirm callers never pass a count
 * above UINT32_MAX.
 */
static void
bdev_blob_copy(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	       uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count,
	       struct spdk_bs_dev_cb_args *cb_args)
{
	int status = spdk_bdev_copy_blocks(__get_desc(dev), channel,
					   dst_lba, src_lba, lba_count,
					   bdev_blob_io_complete, cb_args);

	if (status == 0) {
		/* Submitted; completion arrives via bdev_blob_io_complete. */
		return;
	}

	if (status == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, NULL, 0, dst_lba, src_lba,
				   lba_count, SPDK_BDEV_IO_TYPE_COPY, cb_args, NULL);
		return;
	}

	/* Any other submission error is passed straight to the caller's callback. */
	cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, status);
}

static void
bdev_blob_resubmit(void *arg)
{
@@ -295,6 +316,10 @@ bdev_blob_resubmit(void *arg)
		bdev_blob_write_zeroes(ctx->dev, ctx->channel,
				       ctx->lba, ctx->lba_count, ctx->cb_args);
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		bdev_blob_copy(ctx->dev, ctx->channel,
			       ctx->lba, ctx->src_lba, ctx->lba_count, ctx->cb_args);
		break;
	default:
		SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type);
		assert(false);
@@ -390,6 +415,9 @@ blob_bdev_init(struct blob_bdev *b, struct spdk_bdev_desc *desc)
	b->bs_dev.writev_ext = bdev_blob_writev_ext;
	b->bs_dev.write_zeroes = bdev_blob_write_zeroes;
	b->bs_dev.unmap = bdev_blob_unmap;
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
		b->bs_dev.copy = bdev_blob_copy;
	}
	b->bs_dev.get_base_bdev = bdev_blob_get_base_bdev;
	b->bs_dev.is_zeroes = bdev_blob_is_zeroes;
	b->bs_dev.translate_lba = bdev_blob_translate_lba;