Commit d6e9827e authored by Artur Paszkiewicz, committed by Tomasz Zawadzki
Browse files

bdev: split writes based on write_unit_size



Add a new bdev property, split_on_write_unit, which, if set to true, causes
writes to be split to match write_unit_size, and causes writes that are not
aligned to, or are not a multiple of, write_unit_size to fail.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Change-Id: Id49f58a3288ddf5cfe4921ce4020ae4bcdd67298
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/11390


Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
parent c89e2008
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
@@ -309,6 +309,20 @@ struct spdk_bdev {
	/** Number of blocks */
	uint64_t blockcnt;

	/**
	 * Specifies whether the write_unit_size is mandatory or
	 * only advisory. If set to true, the bdev layer will split
	 * WRITE I/O that span the write_unit_size before
	 * submitting them to the bdev module.
	 *
	 * This field takes precedence over split_on_optimal_io_boundary
	 * for WRITE I/O if both are set to true.
	 *
	 * Note that this field cannot be used to force splitting of
	 * UNMAP, WRITE_ZEROES or FLUSH I/O.
	 */
	bool split_on_write_unit;

	/** Number of blocks required for write */
	uint32_t write_unit_size;

+28 −6
Original line number Diff line number Diff line
@@ -2166,6 +2166,14 @@ bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_i
		}
	}

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
			  bdev_io->bdev->split_on_write_unit &&
			  bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
		SPDK_ERRLOG("IO does not match the write_unit_size\n");
		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch->io_outstanding++;
		shared_resource->io_outstanding++;
@@ -2242,11 +2250,18 @@ bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb
static bool
bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
{
	uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary;
	uint32_t max_size = bdev_io->bdev->max_segment_size;
	int max_segs = bdev_io->bdev->max_num_segments;
	uint32_t io_boundary;
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint32_t max_size = bdev->max_segment_size;
	int max_segs = bdev->max_num_segments;

	io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0;
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
		io_boundary = bdev->write_unit_size;
	} else if (bdev->split_on_optimal_io_boundary) {
		io_boundary = bdev->optimal_io_boundary;
	} else {
		io_boundary = 0;
	}

	if (spdk_likely(!io_boundary && !max_segs && !max_size)) {
		return false;
@@ -2449,7 +2464,7 @@ _bdev_rw_split(void *_bdev_io)
	uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
	uint32_t iovcnt, iov_len, child_iovsize;
	uint32_t blocklen = bdev->blocklen;
	uint32_t io_boundary = bdev->optimal_io_boundary;
	uint32_t io_boundary;
	uint32_t max_segment_size = bdev->max_segment_size;
	uint32_t max_child_iovcnt = bdev->max_num_segments;
	void *md_buf = NULL;
@@ -2458,7 +2473,14 @@ _bdev_rw_split(void *_bdev_io)
	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) :
			   BDEV_IO_NUM_CHILD_IOV;
	io_boundary = bdev->split_on_optimal_io_boundary ? io_boundary : UINT32_MAX;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
		io_boundary = bdev->write_unit_size;
	} else if (bdev->split_on_optimal_io_boundary) {
		io_boundary = bdev->optimal_io_boundary;
	} else {
		io_boundary = UINT32_MAX;
	}

	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	current_offset = bdev_io->u.bdev.split_current_offset_blocks;
+141 −0
Original line number Diff line number Diff line
@@ -1249,6 +1249,25 @@ bdev_io_spans_split_test(void)

	/* Exceed max_sizes */
	CU_ASSERT(bdev_io_should_split(&bdev_io) == true);

	bdev.max_segment_size = 0;
	bdev.write_unit_size = 32;
	bdev.split_on_write_unit = true;
	bdev_io.type = SPDK_BDEV_IO_TYPE_WRITE;

	/* This I/O is one write unit */
	CU_ASSERT(bdev_io_should_split(&bdev_io) == false);

	bdev_io.u.bdev.num_blocks = 32 * 2;

	/* This I/O is more than one write unit */
	CU_ASSERT(bdev_io_should_split(&bdev_io) == true);

	bdev_io.u.bdev.offset_blocks = 1;
	bdev_io.u.bdev.num_blocks = 32;

	/* This I/O is not aligned to write unit size */
	CU_ASSERT(bdev_io_should_split(&bdev_io) == true);
}

static void
@@ -2846,6 +2865,127 @@ bdev_io_split_with_io_wait(void)
	poll_threads();
}

/*
 * Unit test: WRITE I/O splitting driven by bdev->split_on_write_unit.
 *
 * Cases covered:
 *  - a write spanning two write units is split into two child I/Os
 *  - write_unit_size takes precedence over a smaller optimal_io_boundary
 *  - a write smaller than write_unit_size fails
 *  - a write not aligned to write_unit_size fails
 *  - a split fails when one write unit does not fit in the child iov limit
 */
static void
bdev_io_write_unit_split_test(void)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_desc *desc = NULL;
	struct spdk_io_channel *io_ch;
	struct spdk_bdev_opts bdev_opts = {};
	/* 4x the per-child iov limit, so a write unit of COUNTOF(iov)/2 blocks
	 * (set in the last case below) cannot fit into a single child I/O. */
	struct iovec iov[BDEV_IO_NUM_CHILD_IOV * 4];
	struct ut_expected_io *expected_io;
	uint64_t i;
	int rc;

	spdk_bdev_get_opts(&bdev_opts, sizeof(bdev_opts));
	bdev_opts.bdev_io_pool_size = 512;
	bdev_opts.bdev_io_cache_size = 64;

	rc = spdk_bdev_set_opts(&bdev_opts);
	CU_ASSERT(rc == 0);
	spdk_bdev_initialize(bdev_init_cb, NULL);

	bdev = allocate_bdev("bdev0");

	rc = spdk_bdev_open_ext(bdev->name, true, bdev_ut_event_cb, NULL, &desc);
	CU_ASSERT(rc == 0);
	SPDK_CU_ASSERT_FATAL(desc != NULL);
	io_ch = spdk_bdev_get_io_channel(desc);
	CU_ASSERT(io_ch != NULL);

	/* Write I/O 2x larger than write_unit_size should get split into 2 I/Os */
	bdev->write_unit_size = 32;
	bdev->split_on_write_unit = true;
	g_io_done = false;

	/* Expect two children of exactly one write unit (32 blocks) each,
	 * with buffers advancing by 32 * 512 bytes (block size assumed 512
	 * by this test setup). */
	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 0, 32, 1);
	ut_expected_io_set_iov(expected_io, 0, (void *)0xF000, 32 * 512);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);

	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 32, 32, 1);
	ut_expected_io_set_iov(expected_io, 0, (void *)(0xF000 + 32 * 512), 32 * 512);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);

	rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 64, io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2);
	stub_complete_io(2);
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
	CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_SUCCESS);

	/* Same as above but with optimal_io_boundary < write_unit_size - the I/O should be split
	 * based on write_unit_size, not optimal_io_boundary */
	bdev->split_on_optimal_io_boundary = true;
	bdev->optimal_io_boundary = 16;
	g_io_done = false;

	rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 64, io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	/* Still 2 children (not 4): write_unit_size wins over the boundary. */
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2);
	stub_complete_io(2);
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
	CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_SUCCESS);

	/* Write I/O should fail if it is smaller than write_unit_size */
	g_io_done = false;

	rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 31, io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	/* Nothing reaches the module; the failure completes asynchronously,
	 * hence the poll_threads() before checking g_io_done/g_io_status. */
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
	poll_threads();
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
	CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED);

	/* Same for I/O not aligned to write_unit_size */
	g_io_done = false;

	rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 1, 32, io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
	poll_threads();
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
	CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED);

	/* Write should fail if it needs to be split but there are not enough iovs to submit
	 * an entire write unit */
	bdev->write_unit_size = SPDK_COUNTOF(iov) / 2;
	g_io_done = false;

	/* One 512-byte iov per block: the whole write uses SPDK_COUNTOF(iov)
	 * iovs, so one write unit needs COUNTOF(iov)/2 iovs, which exceeds
	 * the BDEV_IO_NUM_CHILD_IOV child limit. */
	for (i = 0; i < SPDK_COUNTOF(iov); i++) {
		iov[i].iov_base = (void *)(0x1000 + 512 * i);
		iov[i].iov_len = 512;
	}

	rc = spdk_bdev_writev_blocks(desc, io_ch, iov, SPDK_COUNTOF(iov), 0, SPDK_COUNTOF(iov),
				     io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
	poll_threads();
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
	CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED);

	spdk_put_io_channel(io_ch);
	spdk_bdev_close(desc);
	free_bdev(bdev);
	spdk_bdev_finish(bdev_fini_cb, NULL);
	poll_threads();
}

static void
bdev_io_alignment(void)
{
@@ -5874,6 +6014,7 @@ main(int argc, char **argv)
	CU_ADD_TEST(suite, bdev_io_max_size_and_segment_split_test);
	CU_ADD_TEST(suite, bdev_io_mix_split_test);
	CU_ADD_TEST(suite, bdev_io_split_with_io_wait);
	CU_ADD_TEST(suite, bdev_io_write_unit_split_test);
	CU_ADD_TEST(suite, bdev_io_alignment_with_boundary);
	CU_ADD_TEST(suite, bdev_io_alignment);
	CU_ADD_TEST(suite, bdev_histograms);