Commit 00311abc authored by Diwakar Sharma's avatar Diwakar Sharma Committed by Jim Harris
Browse files

blob: handle IO to a resized blob



When a blob is resized and is backed by another blob, such as a
snapshot, the resized blob has more clusters than its backing
device. Since the blob is thin — either originally or as a result
of creating a snapshot — IOs to the new cluster ranges fail because
there is no cluster to look up in the backing device. This patch
introduces a mechanism to handle reads by zeroing out the part of
the payload that trails beyond the cluster range, and handles writes
by validating the backing dev cluster range during copy-on-write.

Change-Id: Ib204b922819775396e53ffb718e8230ebf8fa46b
Signed-off-by: default avatarDiwakar Sharma <diwakar.sharma@datacore.com>
Co-authored-by: default avatarDiwakar Sharma <diwakar.sharma@datacore.com>
Co-authored-by: default avatarJim Harris <james.r.harris@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/22377


Community-CI: Mellanox Build Bot
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarJim Harris <jim.harris@samsung.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
parent ba31ad82
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -230,6 +230,10 @@ struct spdk_bs_dev {
	struct spdk_bdev *(*get_base_bdev)(struct spdk_bs_dev *dev);

	bool (*is_zeroes)(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count);
	/* Is the lba range we are looking for valid or not for this bs_dev. Used to
	 * check if we can safely reference the bs_dev during CoW or perhaps even
	 * during read. */
	bool (*is_range_valid)(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count);

	/* Translate blob lba to lba on the underlying bdev.
	 * This operation recurses down the whole chain of bs_dev's.
+80 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
#include "spdk/stdinc.h"
#include "spdk/blob.h"
#include "spdk/log.h"
#include "spdk/likely.h"
#include "blobstore.h"

static void
@@ -65,11 +66,56 @@ blob_bs_dev_read_cpl(void *cb_arg, int bserrno)
	cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno);
}

/* If the request [lba, lba + *lba_count) extends past the end of the backing
 * dev (the backing blob may be smaller than the blob reading from it), zero
 * the trailing portion of the payload and shrink *lba_count so that the
 * actual read only covers the range the backing dev can serve.
 *
 * The zeroed tail may start partway through one iovec and continue across
 * several following iovecs. */
static inline void
zero_trailing_bytes(struct spdk_blob_bs_dev *b, struct iovec *iov, int iovcnt,
		    uint64_t lba, uint32_t *lba_count)
{
	uint32_t zero_lba_count;
	uint64_t zero_bytes, zero_len;
	uint64_t payload_bytes;
	uint64_t valid_bytes;
	void *zero_start;
	struct iovec *i;

	/* Fast path: the whole request fits within the backing dev. */
	if (spdk_likely(lba + *lba_count <= b->bs_dev.blockcnt)) {
		return;
	}

	/* Figure out how many bytes in the payload will need to be zeroed. */
	zero_lba_count = spdk_min(*lba_count, lba + *lba_count - b->bs_dev.blockcnt);
	zero_bytes = zero_lba_count * b->bs_dev.blocklen;

	payload_bytes = *lba_count * b->bs_dev.blocklen;
	valid_bytes = payload_bytes - zero_bytes;

	i = iov;
	while (zero_bytes > 0) {
		if (i->iov_len > valid_bytes) {
			zero_start = i->iov_base + valid_bytes;
			/* Zero only the portion of THIS iov that lies beyond
			 * the valid region, clamped to the remaining payload
			 * in case the iov is larger than the payload. Using
			 * the total zero_bytes here would overrun the iov. */
			zero_len = spdk_min(payload_bytes, i->iov_len) - valid_bytes;
			memset(zero_start, 0, zero_len);
			valid_bytes = 0;
			zero_bytes -= zero_len;
		}
		valid_bytes -= spdk_min(valid_bytes, i->iov_len);
		payload_bytes -= spdk_min(payload_bytes, i->iov_len);
		i++;
	}

	/* Shrink the request so the backing-dev read stays in range. */
	*lba_count -= zero_lba_count;
}

static inline void
blob_bs_dev_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
		 uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
{
	struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev;
	struct iovec iov;

	iov.iov_base = payload;
	iov.iov_len = lba_count * b->bs_dev.blocklen;
	/* The backing blob may be smaller than this blob, so zero any trailing bytes. */
	zero_trailing_bytes(b, &iov, 1, lba, &lba_count);

	spdk_blob_io_read(b->blob, channel, payload, lba, lba_count,
			  blob_bs_dev_read_cpl, cb_args);
@@ -82,6 +128,9 @@ blob_bs_dev_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
{
	struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev;

	/* The backing blob may be smaller than this blob, so zero any trailing bytes. */
	zero_trailing_bytes(b, iov, iovcnt, lba, &lba_count);

	spdk_blob_io_readv(b->blob, channel, iov, iovcnt, lba, lba_count,
			   blob_bs_dev_read_cpl, cb_args);
}
@@ -94,6 +143,9 @@ blob_bs_dev_readv_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
{
	struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev;

	/* The backing blob may be smaller than this blob, so zero any trailing bytes. */
	zero_trailing_bytes(b, iov, iovcnt, lba, &lba_count);

	spdk_blob_io_readv_ext(b->blob, channel, iov, iovcnt, lba, lba_count,
			       blob_bs_dev_read_cpl, cb_args, ext_opts);
}
@@ -136,6 +188,33 @@ blob_bs_is_zeroes(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
					    bs_io_unit_to_back_dev_lba(blob, lba_count));
}

static bool
blob_bs_is_range_valid(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
{
	struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev;
	struct spdk_blob *blob = b->blob;
	uint64_t	first_page;
	uint64_t	num_backing_pages;

	/* Callers hand us the first lba of a cluster, and lba_count is exactly
	 * one cluster worth of lbas (e.g. 8192 for a 4MiB cluster). */
	assert(lba_count == blob->bs->cluster_sz / dev->blocklen);
	assert(lba % lba_count == 0);

	first_page = bs_io_unit_to_page(blob->bs, lba);
	num_backing_pages = blob->active.num_clusters * blob->bs->pages_per_cluster;

	/* A blob will either have:
	* - no backing bs_bdev (normal thick blob), or
	* - zeroes backing bs_bdev (thin provisioned blob), or
	* - blob backing bs_bdev (e.g snapshot)
	* It may be possible that backing bs_bdev has lesser number of clusters
	* than the child lvol blob because lvol blob has been expanded after
	* taking snapshot. In such a case, page will be outside the cluster page
	* range of the backing dev. Always return true for zeroes backing bdev. */
	return first_page < num_backing_pages;
}

static bool
blob_bs_translate_lba(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba)
{
@@ -187,6 +266,7 @@ bs_create_blob_bs_dev(struct spdk_blob *blob)
	b->bs_dev.write_zeroes = blob_bs_dev_write_zeroes;
	b->bs_dev.unmap = blob_bs_dev_unmap;
	b->bs_dev.is_zeroes = blob_bs_is_zeroes;
	b->bs_dev.is_range_valid = blob_bs_is_range_valid;
	b->bs_dev.translate_lba = blob_bs_translate_lba;
	b->bs_dev.is_degraded = blob_bs_is_degraded;
	b->blob = blob;
+12 −2
Original line number Diff line number Diff line
@@ -2736,6 +2736,7 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
	uint32_t cluster_number;
	bool is_zeroes;
	bool can_copy;
	bool is_valid_range;
	uint64_t copy_src_lba;
	int rc;

@@ -2768,9 +2769,18 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
	ctx->page = cluster_start_page;
	ctx->new_cluster_page = ch->new_cluster_page;
	memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE);
	can_copy = blob_can_copy(blob, cluster_start_page, &copy_src_lba);

	is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev,
	/* Check if the cluster that we intend to do CoW for is valid for
	 * the backing dev. For zeroes backing dev, it'll be always valid.
	 * For other backing dev e.g. a snapshot, it could be invalid if
	 * the blob has been resized after snapshot was taken. */
	is_valid_range = blob->back_bs_dev->is_range_valid(blob->back_bs_dev,
			 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
			 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz));

	can_copy = is_valid_range && blob_can_copy(blob, cluster_start_page, &copy_src_lba);

	is_zeroes = is_valid_range && blob->back_bs_dev->is_zeroes(blob->back_bs_dev,
			bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
			bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz));
	if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes && !can_copy) {
+7 −0
Original line number Diff line number Diff line
@@ -124,6 +124,12 @@ zeroes_is_zeroes(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
	return true;
}

/* The zeroes backing dev serves any lba as zeroes, so every range is
 * considered valid regardless of lba or lba_count. */
static bool
zeroes_is_range_valid(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
{
	return true;
}

static bool
zeroes_translate_lba(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba)
{
@@ -145,6 +151,7 @@ static struct spdk_bs_dev g_zeroes_bs_dev = {
	.write_zeroes = zeroes_write_zeroes,
	.unmap = zeroes_unmap,
	.is_zeroes = zeroes_is_zeroes,
	.is_range_valid = zeroes_is_range_valid,
	.translate_lba = zeroes_translate_lba,
};

+8 −0
Original line number Diff line number Diff line
@@ -1801,6 +1801,13 @@ bs_dev_degraded_is_zeroes(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_co
	return false;
}

static bool
bs_dev_degraded_is_range_valid(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
{
	/* This callback is not expected to be invoked on the degraded bs_dev;
	 * reaching it indicates a logic error, so trip the assert in debug
	 * builds and fail the range check defensively in NDEBUG builds. */
	assert(false);
	return false;
}

static struct spdk_io_channel *
bs_dev_degraded_create_channel(struct spdk_bs_dev *bs_dev)
{
@@ -1833,6 +1840,7 @@ static struct spdk_bs_dev bs_dev_degraded = {
	.readv = bs_dev_degraded_readv,
	.readv_ext = bs_dev_degraded_readv_ext,
	.is_zeroes = bs_dev_degraded_is_zeroes,
	.is_range_valid = bs_dev_degraded_is_range_valid,
	.is_degraded = bs_dev_degraded_is_degraded,
	/* Make the device as large as possible without risk of uint64 overflow. */
	.blockcnt = UINT64_MAX / 512,
Loading