Commit 3fd7f28d authored by Seth Howell's avatar Seth Howell Committed by Daniel Verkamp
Browse files

bdev: add fallback from write_zeroes to writev



If write_zeroes is not supported by the block device, we can get the
same behavior by simply writing a buffer full of zeroes to the blocks
we want to erase. I also incorporate splitting into the bdev layer to
accommodate large I/O.

Change-Id: I8fa1bfaaf22d7bfc6e3afb6e89d22fa9f7767e55
Signed-off-by: Seth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/373829


Reviewed-by: Daniel Verkamp <daniel.verkamp@intel.com>
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
parent 5f5edbcb
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -271,6 +271,12 @@ struct spdk_bdev_io {
	/** Status for the IO */
	int16_t status;

	/** number of blocks remaining in a split i/o */
	uint64_t split_remaining_num_blocks;

	/** current offset of the split I/O in the bdev */
	uint64_t split_current_offset_blocks;

	/**
	 * Set to true while the bdev module submit_request function is in progress.
	 *
@@ -337,6 +343,9 @@ struct spdk_bdev_io {
	/** User function that will be called when this completes */
	spdk_bdev_io_completion_cb cb;

	/** stored user callback in case we split the I/O and use a temporary callback */
	spdk_bdev_io_completion_cb stored_user_cb;

	/** Context that will be passed to the completion callback */
	void *caller_ctx;

+82 −5
Original line number Diff line number Diff line
@@ -58,6 +58,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
#define BUF_SMALL_POOL_SIZE	8192
#define BUF_LARGE_POOL_SIZE	1024
#define NOMEM_THRESHOLD_COUNT	8
/*
 * Size in bytes (0x100000 = 1 MiB) of the zero-filled DMA buffer that backs the
 * write_zeroes fallback. It is also the largest payload a single fallback write
 * carries; longer requests are split into chunks of at most this size.
 */
#define ZERO_BUFFER_SIZE	0x100000

typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t;

@@ -67,6 +68,8 @@ struct spdk_bdev_mgr {
	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(, spdk_bdev_module_if) bdev_modules;

	TAILQ_HEAD(, spdk_bdev) bdevs;
@@ -150,6 +153,8 @@ struct spdk_bdev_channel {

};

/*
 * Forward declaration: completion callback that submits the next chunk of a
 * split write_zeroes fallback. Declared here because it is referenced before
 * its definition later in this file.
 */
static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

struct spdk_bdev *
spdk_bdev_first(void)
{
@@ -527,6 +532,14 @@ spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg,
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
				 NULL);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		spdk_bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif
@@ -579,6 +592,7 @@ spdk_bdev_finish(void)
	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	spdk_dma_free(g_bdev_mgr.zero_buffer);

	spdk_io_device_unregister(&g_bdev_mgr, NULL);
}
@@ -1088,26 +1102,60 @@ spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channe
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	uint64_t len;
	bool split_request = false;

	if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) {
		SPDK_ERRLOG("length argument out of range in write_zeroes\n");
		return -ERANGE;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = spdk_bdev_get_io();

	if (!bdev_io) {
		SPDK_ERRLOG("bdev_io memory allocation failed duing write_zeroes\n");
		return -ENOMEM;
	}

	bdev_io->ch = channel;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;

	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
		bdev_io->u.bdev.num_blocks = num_blocks;
		bdev_io->u.bdev.iovs = NULL;
		bdev_io->u.bdev.iovcnt = 0;

	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
	} else {
		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);

		len = spdk_bdev_get_block_size(bdev) * num_blocks;

		if (len > ZERO_BUFFER_SIZE) {
			split_request = true;
			len = ZERO_BUFFER_SIZE;
		}

		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
		bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer;
		bdev_io->u.bdev.iov.iov_len = len;
		bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
		bdev_io->u.bdev.iovcnt = 1;
		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
		bdev_io->split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
		bdev_io->split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
	}

	if (split_request) {
		bdev_io->stored_user_cb = cb;
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
	} else {
		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
	}
	spdk_bdev_io_submit(bdev_io);
	return 0;
}
@@ -1948,6 +1996,35 @@ spdk_bdev_part_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_
	spdk_bdev_free_io(bdev_io);
}

/*
 * Completion callback that drives a split write_zeroes fallback.
 *
 * Each successful completion re-uses the same bdev_io to write the next chunk
 * (at most ZERO_BUFFER_SIZE bytes) from the shared zero buffer, advancing the
 * split_current_offset_blocks / split_remaining_num_blocks bookkeeping stored
 * in the bdev_io. When the chunk being submitted is the last one, the user's
 * original callback (stored_user_cb) is installed so that the caller is
 * notified when it completes.
 *
 * \param bdev_io the I/O whose previous chunk just completed.
 * \param success whether the previous chunk completed successfully.
 * \param cb_arg the caller's context, passed through unchanged.
 */
static void
spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	uint64_t block_size;
	uint64_t len;

	if (!success) {
		/*
		 * This function is still installed as the I/O's completion callback.
		 * Completing the I/O as-is would hand the failure back to this
		 * function rather than to the caller, so restore the user's callback
		 * first and let the failure be reported to them.
		 */
		bdev_io->cb = bdev_io->stored_user_cb;
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	/*
	 * No range validation is repeated here: the full request already passed
	 * the checks in the write_zeroes entry point before it was split.
	 */
	block_size = spdk_bdev_get_block_size(bdev_io->bdev);
	len = spdk_min(block_size * bdev_io->split_remaining_num_blocks, ZERO_BUFFER_SIZE);

	/*
	 * NOTE(review): if block_size does not evenly divide ZERO_BUFFER_SIZE,
	 * iov_len ends up larger than num_blocks * block_size - confirm whether
	 * such bdevs can reach this path.
	 */
	bdev_io->u.bdev.offset_blocks = bdev_io->split_current_offset_blocks;
	bdev_io->u.bdev.iov.iov_len = len;
	bdev_io->u.bdev.num_blocks = len / block_size;
	bdev_io->split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks;
	bdev_io->split_current_offset_blocks += bdev_io->u.bdev.num_blocks;

	/* If this round completes the I/O, switch back to the original user callback. */
	if (bdev_io->split_remaining_num_blocks == 0) {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->stored_user_cb);
	} else {
		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split);
	}
	spdk_bdev_io_submit(bdev_io);
}

void
spdk_bdev_part_submit_request(struct spdk_bdev_part_channel *ch, struct spdk_bdev_io *bdev_io)
{
+150 −18
Original line number Diff line number Diff line
@@ -209,6 +209,21 @@ __blockdev_write(void *arg1, void *arg2)
	}
}

/*
 * Submit a write_zeroes request described by the bdevio_request in arg1.
 * arg2 is unused. quick_test_complete is passed as the completion callback;
 * if submission itself fails, the test is marked failed and the unit-test
 * thread is woken here instead.
 */
static void
__blockdev_write_zeroes(void *arg1, void *arg2)
{
	struct bdevio_request *request = arg1;
	struct io_target *tgt = request->target;

	if (spdk_bdev_write_zeroes(tgt->bdev_desc, tgt->ch, request->offset,
				   request->data_len, quick_test_complete, NULL) == 0) {
		/* Submitted; the completion callback takes over from here. */
		return;
	}

	g_completion_success = false;
	wake_ut_thread();
}

static void
sgl_chop_buffer(struct bdevio_request *req, int iov_len)
{
@@ -250,6 +265,22 @@ blockdev_write(struct io_target *target, char *tx_buf,
	execute_spdk_function(__blockdev_write, &req, NULL);
}

static void
blockdev_write_zeroes(struct io_target *target, char *tx_buf,
		      uint64_t offset, int data_len)
{
	struct bdevio_request req;

	req.target = target;
	req.buf = tx_buf;
	req.data_len = data_len;
	req.offset = offset;

	g_completion_success = false;

	execute_spdk_function(__blockdev_write_zeroes, &req, NULL);
}

static void
__blockdev_read(void *arg1, void *arg2)
{
@@ -303,7 +334,7 @@ blockdev_write_read_data_match(char *rx_buf, char *tx_buf, int data_length)

static void
blockdev_write_read(uint32_t data_length, uint32_t iov_len, int pattern, uint64_t offset,
		    int expected_rc)
		    int expected_rc, bool write_zeroes)
{
	struct io_target *target;
	char	*tx_buf = NULL;
@@ -312,22 +343,30 @@ blockdev_write_read(uint32_t data_length, uint32_t iov_len, int pattern, uint64_

	target = g_io_targets;
	while (target != NULL) {
		if (data_length < spdk_bdev_get_block_size(target->bdev)) {
		if (data_length < spdk_bdev_get_block_size(target->bdev) ||
		    data_length / spdk_bdev_get_block_size(target->bdev) > spdk_bdev_get_num_blocks(target->bdev)) {
			target = target->next;
			continue;
		}

		if (!write_zeroes) {
			initialize_buffer(&tx_buf, pattern, data_length);
			initialize_buffer(&rx_buf, 0, data_length);

			blockdev_write(target, tx_buf, offset, data_length, iov_len);
		} else {
			initialize_buffer(&tx_buf, 0, data_length);
			initialize_buffer(&rx_buf, pattern, data_length);

			blockdev_write_zeroes(target, tx_buf, offset, data_length);
		}


		if (expected_rc == 0) {
			CU_ASSERT_EQUAL(g_completion_success, true);
		} else {
			CU_ASSERT_EQUAL(g_completion_success, false);
		}

		blockdev_read(target, rx_buf, offset, data_length, iov_len);

		if (expected_rc == 0) {
@@ -364,7 +403,96 @@ blockdev_write_read_4k(void)
	 * of write and read for all blockdevs is 0. */
	expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}

/*
 * Write zeroes over 4K at offset 0 and read the range back.
 */
static void
blockdev_write_zeroes_read_4k(void)
{
	/* Data size = 4K */
	const uint32_t data_length = 4 * 1024;
	const uint64_t offset = 0;
	/* On the write_zeroes path this byte pre-fills the read buffer. */
	const int pattern = 0xA3;
	/* Valid parameters: write_zeroes and read should both succeed. */
	const int expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, true);
}

/*
 * Write zeroes over 1M at offset 0 and read the range back.
 * A request of this size does not need to be split at the bdev layer.
 */
static void
blockdev_write_zeroes_read_1m(void)
{
	/* Data size = 1M */
	const uint32_t data_length = 1024 * 1024;
	const uint64_t offset = 0;
	/* On the write_zeroes path this byte pre-fills the read buffer. */
	const int pattern = 0xA3;
	/* Valid parameters: write_zeroes and read should both succeed. */
	const int expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, true);
}

/*
 * Write zeroes over 3M at offset 0 and read the range back.
 * A request of this size has to be split at the bdev layer when the bdev
 * does not support write-zeroes itself.
 */
static void
blockdev_write_zeroes_read_3m(void)
{
	/* Data size = 3M */
	const uint32_t data_length = 3 * 1024 * 1024;
	const uint64_t offset = 0;
	/* On the write_zeroes path this byte pre-fills the read buffer. */
	const int pattern = 0xA3;
	/* Valid parameters: write_zeroes and read should both succeed. */
	const int expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, true);
}

/*
 * Write zeroes over 3.5M at offset 0 and read the range back.
 * A request of this size has to be split at the bdev layer when the bdev
 * does not support write-zeroes itself, and its length is not an even
 * multiple of the bdev layer zero buffer size, so the last chunk is shorter.
 */
static void
blockdev_write_zeroes_read_3m_500k(void)
{
	/* Data size = 3.5M (3M + 512K) */
	const uint32_t data_length = 3 * 1024 * 1024 + 512 * 1024;
	const uint64_t offset = 0;
	/* On the write_zeroes path this byte pre-fills the read buffer. */
	const int pattern = 0xA3;
	/* Valid parameters: write_zeroes and read should both succeed. */
	const int expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, true);
}

static void
@@ -385,7 +513,7 @@ blockdev_writev_readv_4k(void)
	 * of write and read for all blockdevs is 0. */
	expected_rc = 0;

	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}

static void
@@ -406,7 +534,7 @@ blockdev_writev_readv_30x4k(void)
	 * of write and read for all blockdevs is 0. */
	expected_rc = 0;

	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}

static void
@@ -426,7 +554,7 @@ blockdev_write_read_512Bytes(void)
	 * of write and read for all blockdevs is 0. */
	expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}

static void
@@ -447,7 +575,7 @@ blockdev_writev_readv_512Bytes(void)
	 * of write and read for all blockdevs is 0. */
	expected_rc = 0;

	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}

static void
@@ -467,7 +595,7 @@ blockdev_write_read_size_gt_128k(void)
	 * of write and read for all blockdevs is 0. */
	expected_rc = 0;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}

static void
@@ -488,7 +616,7 @@ blockdev_writev_readv_size_gt_128k(void)
	 * of write and read for all blockdevs is 0. */
	expected_rc = 0;

	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}

static void
@@ -509,7 +637,7 @@ blockdev_writev_readv_size_gt_128k_two_iov(void)
	 * of write and read for all blockdevs is 0. */
	expected_rc = 0;

	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc);
	blockdev_write_read(data_length, iov_len, pattern, offset, expected_rc, 0);
}

static void
@@ -529,7 +657,7 @@ blockdev_write_read_invalid_size(void)
	 * of write and read for all blockdevs is < 0 */
	expected_rc = -1;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}

static void
@@ -628,7 +756,7 @@ blockdev_write_read_max_offset(void)
	 * of write and read for all blockdevs is < 0 */
	expected_rc = -1;

	blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}

static void
@@ -649,7 +777,7 @@ blockdev_overlapped_write_read_8k(void)
	expected_rc = 0;
	/* Assert the write by comparing it with values read
	 * from the same offset for each blockdev */
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);

	/* Overwrite the pattern 0xbb of size 8K on an address offset overlapping
	 * with the address written above and assert the new value in
@@ -660,7 +788,7 @@ blockdev_overlapped_write_read_8k(void)
	offset = 4096;
	/* Assert the write by comparing it with values read
	 * from the overlapped offset for each blockdev */
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc);
	blockdev_write_read(data_length, 0, pattern, offset, expected_rc, 0);
}

static void
@@ -741,6 +869,10 @@ __run_ut_thread(void *arg1, void *arg2)

	if (
		CU_add_test(suite, "blockdev write read 4k", blockdev_write_read_4k) == NULL
		|| CU_add_test(suite, "blockdev write zeroes read 4k", blockdev_write_zeroes_read_4k) == NULL
		|| CU_add_test(suite, "blockdev write zeroes read 1m", blockdev_write_zeroes_read_1m) == NULL
		|| CU_add_test(suite, "blockdev write zeroes read 3m", blockdev_write_zeroes_read_3m) == NULL
		|| CU_add_test(suite, "blockdev write zeroes read 3.5m", blockdev_write_zeroes_read_3m_500k) == NULL
		|| CU_add_test(suite, "blockdev write read 512 bytes",
			       blockdev_write_read_512Bytes) == NULL
		|| CU_add_test(suite, "blockdev write read size > 128k",