Commit 01ccef6e authored by Konrad Sztyber
Browse files

bdev: allow bdevs to specify max size of IOs



There are cases when a bdev module needs to limit the maximum size of an
IO to a certain value (e.g. to the size of a large iobuf buffer, if it
needs to allocate a bounce buffer).

Previously, this was covered by specifying optimal_io_boundary and
setting the split_on_optimal_io_boundary flag.  However, this can be
inefficient, as it'll also split smaller requests that do not exceed the
maximum size, but cross the boundary.  For instance, with
optimal_io_boundary=4, an IO with offset_blocks=3, num_blocks=2 will be
split.

To avoid that, a new field, max_rw_size, is introduced to allow for
specifying the maximum size of an IO without taking into account
offset_blocks.  It only affects READ and WRITE requests, as other IO
types are already controlled by other values (e.g. max_unmap, max_copy,
max_write_zeroes).

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Idcafdeaf22a34f25aa599f03726caa9307949e1f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/20584


Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <jim.harris@samsung.com>
parent 9ad5ba22
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -512,6 +512,12 @@ struct spdk_bdev {
	 */
	uint32_t max_copy;

	/**
	 * Maximum number of blocks in a single read/write I/O.  Requests exceeding this value will
	 * be split by the bdev layer.  A value of 0 means that the bdev module does not impose
	 * a limit.
	 */
	uint32_t max_rw_size;

	/**
	 * UUID for this bdev.
	 *
+1 −1
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@
SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk

SO_VER := 13
SO_VER := 14
SO_MINOR := 0

ifeq ($(CONFIG_VTUNE),y)
+17 −12
Original line number Diff line number Diff line
@@ -2804,7 +2804,8 @@ bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
{
	uint32_t io_boundary;
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint32_t max_size = bdev->max_segment_size;
	uint32_t max_segment_size = bdev->max_segment_size;
	uint32_t max_size = bdev->max_rw_size;
	int max_segs = bdev->max_num_segments;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
@@ -2815,7 +2816,7 @@ bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
		io_boundary = 0;
	}

	if (spdk_likely(!io_boundary && !max_segs && !max_size)) {
	if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) {
		return false;
	}

@@ -2844,14 +2845,20 @@ bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
		}
	}

	if (max_size) {
	if (max_segment_size) {
		for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
			if (bdev_io->u.bdev.iovs[i].iov_len > max_size) {
			if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) {
				return true;
			}
		}
	}

	if (max_size) {
		if (bdev_io->u.bdev.num_blocks > max_size) {
			return true;
		}
	}

	return false;
}

@@ -3051,9 +3058,11 @@ _bdev_rw_split(void *_bdev_io)
	uint32_t io_boundary;
	uint32_t max_segment_size = bdev->max_segment_size;
	uint32_t max_child_iovcnt = bdev->max_num_segments;
	uint32_t max_size = bdev->max_rw_size;
	void *md_buf = NULL;
	int rc;

	max_size = max_size ? max_size : UINT32_MAX;
	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
			   SPDK_BDEV_IO_NUM_CHILD_IOV;
@@ -3085,6 +3094,7 @@ _bdev_rw_split(void *_bdev_io)
	       child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
		to_next_boundary = _to_next_boundary(current_offset, io_boundary);
		to_next_boundary = spdk_min(remaining, to_next_boundary);
		to_next_boundary = spdk_min(max_size, to_next_boundary);
		to_next_boundary_bytes = to_next_boundary * blocklen;

		iov = &bdev_io->child_iov[child_iovcnt];
@@ -7499,14 +7509,10 @@ bdev_register(struct spdk_bdev *bdev)
		}
	}

	spdk_iobuf_get_opts(&iobuf_opts);
	if (spdk_bdev_get_buf_align(bdev) > 1) {
		if (bdev->split_on_optimal_io_boundary) {
			bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary,
							     SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen);
		} else {
			bdev->split_on_optimal_io_boundary = true;
			bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen;
		}
		bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX,
					     iobuf_opts.large_bufsize / bdev->blocklen);
	}

	/* If the user didn't specify a write unit size, set it to one. */
@@ -7524,7 +7530,6 @@ bdev_register(struct spdk_bdev *bdev)
	}

	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
		spdk_iobuf_get_opts(&iobuf_opts);
		bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize);
	}

+133 −0
Original line number Diff line number Diff line
@@ -2571,6 +2571,139 @@ bdev_io_max_size_and_segment_split_test(void)
	stub_complete_io(1);
	CU_ASSERT(g_io_done == true);

	/* Test that IOs are split on max_rw_size */
	bdev->max_rw_size = 2;
	bdev->max_segment_size = 0;
	bdev->max_num_segments = 0;
	g_io_done = false;

	/* 5 blocks in a contiguous buffer */
	iov[0].iov_base = (void *)0x10000;
	iov[0].iov_len = 5 * 512;

	/* First: offset=0, num_blocks=2 */
	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 0, 2, 1);
	ut_expected_io_set_iov(expected_io, 0, (void *)0x10000, 2 * 512);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
	/* Second: offset=2, num_blocks=2 */
	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 2, 2, 1);
	ut_expected_io_set_iov(expected_io, 0, (void *)0x10000 + 2 * 512, 2 * 512);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
	/* Third: offset=4, num_blocks=1 */
	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 4, 1, 1);
	ut_expected_io_set_iov(expected_io, 0, (void *)0x10000 + 4 * 512, 512);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);

	rc = spdk_bdev_readv_blocks(desc, io_ch, iov, 1, 0, 5, io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 3);
	stub_complete_io(3);
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);

	/* Check splitting on both max_rw_size + max_num_segments */
	bdev->max_rw_size = 2;
	bdev->max_num_segments = 2;
	bdev->max_segment_size = 0;
	g_io_done = false;

	/* 5 blocks split across 4 iovs */
	iov[0].iov_base = (void *)0x10000;
	iov[0].iov_len = 3 * 512;
	iov[1].iov_base = (void *)0x20000;
	iov[1].iov_len = 256;
	iov[2].iov_base = (void *)0x30000;
	iov[2].iov_len = 256;
	iov[3].iov_base = (void *)0x40000;
	iov[3].iov_len = 512;

	/* First: offset=0, num_blocks=2, iovcnt=1 */
	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 0, 2, 1);
	ut_expected_io_set_iov(expected_io, 0, (void *)0x10000, 2 * 512);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
	/* Second: offset=2, num_blocks=1, iovcnt=1 (max_num_segments prevents submitting
	 * the rest of iov[0] together with iov[1]+iov[2])
	 */
	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 2, 1, 1);
	ut_expected_io_set_iov(expected_io, 0, (void *)0x10000 + 2 * 512, 512);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
	/* Third: offset=3, num_blocks=1, iovcnt=2 (iov[1]+iov[2]) */
	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 3, 1, 2);
	ut_expected_io_set_iov(expected_io, 0, (void *)0x20000, 256);
	ut_expected_io_set_iov(expected_io, 1, (void *)0x30000, 256);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
	/* Fourth: offset=4, num_blocks=1, iovcnt=1 (iov[3]) */
	expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 4, 1, 1);
	ut_expected_io_set_iov(expected_io, 0, (void *)0x40000, 512);
	TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);

	rc = spdk_bdev_readv_blocks(desc, io_ch, iov, 4, 0, 5, io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 4);
	stub_complete_io(4);
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);

	/* Check splitting on both max_rw_size + max_segment_size */
	bdev->max_rw_size = 2;
	bdev->max_segment_size = 512;
	bdev->max_num_segments = 0;
	g_io_done = false;

	/* 6 blocks in a contiguous buffer */
	iov[0].iov_base = (void *)0x10000;
	iov[0].iov_len = 6 * 512;

	/* We expect 3 IOs each with 2 blocks and 2 iovs */
	for (i = 0; i < 3; ++i) {
		expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, i * 2, 2, 2);
		ut_expected_io_set_iov(expected_io, 0, (void *)0x10000 + i * 2 * 512, 512);
		ut_expected_io_set_iov(expected_io, 1, (void *)0x10000 + i * 2 * 512 + 512, 512);
		TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
	}

	rc = spdk_bdev_readv_blocks(desc, io_ch, iov, 1, 0, 6, io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 3);
	stub_complete_io(3);
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);

	/* Check splitting on max_rw_size limited by SPDK_BDEV_IO_NUM_CHILD_IOV */
	bdev->max_rw_size = 1;
	bdev->max_segment_size = 0;
	bdev->max_num_segments = 0;
	g_io_done = false;

	/* SPDK_BDEV_IO_NUM_CHILD_IOV + 1 blocks */
	iov[0].iov_base = (void *)0x10000;
	iov[0].iov_len = (SPDK_BDEV_IO_NUM_CHILD_IOV + 1) * 512;

	/* We expect SPDK_BDEV_IO_NUM_CHILD_IOV + 1 IOs each with a single iov */
	for (i = 0; i < 3; ++i) {
		expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, i, 1, 1);
		ut_expected_io_set_iov(expected_io, 0, (void *)0x10000 + i * 512, 512);
		TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
	}

	rc = spdk_bdev_readv_blocks(desc, io_ch, iov, 1, 0, SPDK_BDEV_IO_NUM_CHILD_IOV + 1, io_done, NULL);
	CU_ASSERT(rc == 0);
	CU_ASSERT(g_io_done == false);

	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == SPDK_BDEV_IO_NUM_CHILD_IOV);
	stub_complete_io(SPDK_BDEV_IO_NUM_CHILD_IOV);
	CU_ASSERT(g_io_done == false);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
	stub_complete_io(1);
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);

	spdk_put_io_channel(io_ch);
	spdk_bdev_close(desc);
	free_bdev(bdev);