Commit dfc98943 authored by Krzysztof Karas's avatar Krzysztof Karas Committed by Tomasz Zawadzki
Browse files

bdev: send bdev reset based on outstanding IO and a new timeout parameter



A new parameter reset_io_drain_timeout has been added to the spdk_bdev
structure. If this value is left at zero, the bdev reset behavior
does not change.
The reset_io_drain_timeout controls how long a bdev reset must wait for IO
to complete prior to issuing a reset to the underlying device.
If no outstanding IO remains at the end of that period, the reset
is skipped.

Change-Id: I585af427064ce234a4f60afc3d69bc9fc3252432
Signed-off-by: default avatarKrzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14501


Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Reviewed-by: default avatarTomasz Zawadzki <tomasz.zawadzki@intel.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
parent eafb489c
Loading
Loading
Loading
Loading
+30 −0
Original line number Diff line number Diff line
@@ -28,6 +28,11 @@
extern "C" {
#endif

/* This parameter is best defined for bdevs that share an underlying bdev,
 * such as multiple lvol bdevs sharing an nvme device, to avoid unnecessarily
 * resetting the underlying bdev and affecting other bdevs that are sharing it. */
#define BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE 5

/** Block device module */
struct spdk_bdev_module {
	/**
@@ -431,6 +436,25 @@ struct spdk_bdev {
	 */
	bool media_events;

	/* Upon receiving a reset request, this is the amount of time in seconds
	 * to wait for all I/O to complete before moving forward with the reset.
	 * If all I/O completes prior to this time out, the reset will be skipped.
	 * A value of 0 is special and will always send resets immediately, even
	 * if there is no I/O outstanding.
	 *
	 * Use case example:
	 * A shared bdev (e.g. multiple lvol bdevs sharing an underlying nvme bdev)
	 * needs to be reset. For a non-zero value bdev reset code will wait
	 * `reset_io_drain_timeout` seconds for outstanding IO that are present
	 * on any bdev channel, before sending a reset down to the underlying device.
	 * That way we can avoid sending "empty" resets and interrupting work of
	 * other lvols that use the same bdev. BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE
	 * is a good choice for the value of this parameter.
	 *
	 * If this parameter remains equal to zero, the bdev reset will be forcefully
	 * sent down to the device, without any delays and waiting for outstanding IO. */
	uint16_t reset_io_drain_timeout;

	/**
	 * Pointer to the bdev module that registered this bdev.
	 */
@@ -629,6 +653,12 @@ struct spdk_bdev_io {
		struct {
			/** Channel reference held while messages for this reset are in progress. */
			struct spdk_io_channel *ch_ref;
			struct {
				/* Handle to timed poller that checks each channel for outstanding IO. */
				struct spdk_poller *poller;
				/* Store calculated time value, when a poller should stop its work. */
				uint64_t  stop_time_tsc;
			} wait_poller;
		} reset;
		struct {
			/** The outstanding request matching bio_cb_arg which this abort attempts to cancel. */
+81 −4
Original line number Diff line number Diff line
@@ -54,6 +54,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
@@ -5276,15 +5277,91 @@ spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	return 0;
}

static int bdev_reset_poll_for_outstanding_io(void *ctx);

/* Done callback for the per-channel outstanding-IO sweep
 * (bdev_reset_check_outstanding_io). status is -EBUSY when at least one
 * channel still had IO in flight, 0 when every channel was drained.
 * 'ch' is the bdev channel on which the pending reset is queued. */
static void
bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev_io *bdev_io;

	/* The reset currently being serviced sits at the head of the queue. */
	bdev_io = TAILQ_FIRST(&ch->queued_resets);

	if (status == -EBUSY) {
		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
			/* IO is still outstanding and the drain deadline has not passed:
			 * schedule another check one period from now. */
			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
		} else {
			/* If outstanding IOs are still present and reset_io_drain_timeout seconds passed,
			 * start the reset. */
			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
			bdev_io_submit_reset(bdev_io);
		}
	} else {
		/* All channels are drained - the reset never needs to reach the
		 * underlying module. */
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
		SPDK_DEBUGLOG(bdev,
			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
			      ch->bdev->name);
		/* Mark the completion status as a SUCCESS and complete the reset. */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	}
}

/* Per-channel step of the reset IO-drain sweep. Reports whether this
 * channel still has IO in flight via the iteration status. */
static void
bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);

	/* A non-zero (-EBUSY) status stops further iteration over the remaining
	 * channels and is passed through to the done callback; zero means this
	 * channel is fully drained. */
	spdk_for_each_channel_continue(i, bdev_ch->io_outstanding > 0 ? -EBUSY : 0);
}

/* Timed poller body, fired once per BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD
 * while a reset is waiting for IO to drain: re-runs the channel sweep that
 * looks for outstanding IO. */
static int
bdev_reset_poll_for_outstanding_io(void *ctx)
{
	struct spdk_bdev_channel *bdev_ch = ctx;
	struct spdk_bdev_io *reset_io = TAILQ_FIRST(&bdev_ch->queued_resets);

	/* One-shot: drop the poller now. The sweep's done callback re-registers
	 * it if IO is still outstanding and the deadline has not been reached. */
	spdk_poller_unregister(&reset_io->u.reset.wait_poller.poller);
	spdk_for_each_channel(__bdev_to_io_dev(bdev_ch->bdev), bdev_reset_check_outstanding_io,
			      bdev_ch, bdev_reset_check_outstanding_io_done);

	return SPDK_POLLER_BUSY;
}

static void
bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = ch->bdev;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);

	if (bdev->reset_io_drain_timeout == 0) {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);

		bdev_io_submit_reset(bdev_io);
		return;
	}

	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());

	/* In case bdev->reset_io_drain_timeout is not equal to zero,
	 * submit the reset to the underlying module only if outstanding I/O
	 * remain after reset_io_drain_timeout seconds have passed. */
	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
			      ch, bdev_reset_check_outstanding_io_done);
}

static void
@@ -5331,7 +5408,7 @@ bdev_start_reset(void *ctx)
	struct spdk_bdev_channel *ch = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel,
			      ch, bdev_reset_dev);
			      ch, bdev_reset_freeze_channel_done);
}

static void
+7 −0
Original line number Diff line number Diff line
@@ -1041,6 +1041,13 @@ _create_lvol_disk(struct spdk_lvol *lvol, bool destroy)
	bdev->fn_table = &vbdev_lvol_fn_table;
	bdev->module = &g_lvol_if;

	/* Set default bdev reset waiting time. This value indicates how much
	 * time a reset should wait before forcing a reset down to the underlying
	 * bdev module.
	 * Setting this parameter is mainly to avoid "empty" resets to a shared
	 * bdev that may be used by multiple lvols. */
	bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;

	rc = spdk_bdev_register(bdev);
	if (rc) {
		free(lvol_bdev);
+212 −0
Original line number Diff line number Diff line
@@ -486,6 +486,8 @@ aborted_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	spdk_bdev_free_io(bdev_io);
}

static void io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

static void
aborted_reset(void)
{
@@ -542,6 +544,58 @@ aborted_reset(void)
	teardown_test();
}

/* Companion to aborted_reset(): issue resets on two threads with NO
 * outstanding IO on any channel; with a non-zero reset_io_drain_timeout
 * each reset should be skipped and complete successfully right away. */
static void
aborted_reset_no_outstanding_io(void)
{
	struct spdk_io_channel *io_ch[2];
	struct spdk_bdev_channel *bdev_ch[2];
	struct spdk_bdev *bdev[2];
	enum spdk_bdev_io_status status1 = SPDK_BDEV_IO_STATUS_PENDING,
				 status2 = SPDK_BDEV_IO_STATUS_PENDING;

	setup_test();

	/*
	 * This time we test the reset without any outstanding IO
	 * present on the bdev channel, so both resets should finish
	 * immediately.
	 */

	set_thread(0);
	/* Non-zero reset_io_drain_timeout routes the reset through the
	 * IO-drain check; since no IO is outstanding, the reset is skipped
	 * and completes with SUCCESS after one poll. */
	io_ch[0] = spdk_bdev_get_io_channel(g_desc);
	bdev_ch[0] = spdk_io_channel_get_ctx(io_ch[0]);
	bdev[0] = bdev_ch[0]->bdev;
	bdev[0]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
	CU_ASSERT(io_ch[0] != NULL);
	spdk_bdev_reset(g_desc, io_ch[0], aborted_reset_done, &status1);
	poll_threads();
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
	CU_ASSERT(status1 == SPDK_BDEV_IO_STATUS_SUCCESS);
	spdk_put_io_channel(io_ch[0]);

	set_thread(1);
	/* Same scenario from the second thread: the reset should again be
	 * skipped immediately since no IO is outstanding. */
	io_ch[1] = spdk_bdev_get_io_channel(g_desc);
	bdev_ch[1] = spdk_io_channel_get_ctx(io_ch[1]);
	bdev[1] = bdev_ch[1]->bdev;
	bdev[1]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
	CU_ASSERT(io_ch[1] != NULL);
	spdk_bdev_reset(g_desc, io_ch[1], aborted_reset_done, &status2);
	poll_threads();
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
	CU_ASSERT(status2 == SPDK_BDEV_IO_STATUS_SUCCESS);
	spdk_put_io_channel(io_ch[1]);

	stub_complete_io(g_bdev.io_target, 0);
	poll_threads();

	teardown_test();
}


static void
io_during_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
@@ -655,6 +709,162 @@ io_during_reset(void)
	teardown_test();
}

/* Count RESET-type IOs currently outstanding inside the stub bdev module
 * for the given io_target, i.e. resets that were actually submitted down
 * to the module and have not yet been completed. */
static uint32_t
count_queued_resets(void *io_target)
{
	struct spdk_io_channel *io_ch = spdk_get_io_channel(io_target);
	struct ut_bdev_channel *ut_ch = spdk_io_channel_get_ctx(io_ch);
	struct spdk_bdev_io *bdev_io;
	uint32_t num_resets = 0;

	TAILQ_FOREACH(bdev_io, &ut_ch->outstanding_io, module_link) {
		num_resets += (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) ? 1 : 0;
	}

	spdk_put_io_channel(io_ch);

	return num_resets;
}

/* Exercise how a bdev reset completes depending on reset_io_drain_timeout
 * and on whether IO is outstanding while the reset waits to be serviced. */
static void
reset_completions(void)
{
	struct spdk_io_channel *io_ch;
	struct spdk_bdev_channel *bdev_ch;
	struct spdk_bdev *bdev;
	enum spdk_bdev_io_status status0, status_reset;
	int rc, iter;

	setup_test();

	/* This test covers four test cases:
	 * 1) reset_io_drain_timeout of a bdev is greater than 0
	 * 2) No outstanding IO are present on any bdev channel
	 * 3) Outstanding IO finish during bdev reset
	 * 4) Outstanding IO do not finish before reset is done waiting
	 *    for them.
	 *
	 * Above conditions mainly affect the timing of bdev reset completion
	 * and whether a reset should be skipped via spdk_bdev_io_complete()
	 * or sent down to the underlying bdev module via bdev_io_submit_reset(). */

	/* Test preparation */
	set_thread(0);
	io_ch = spdk_bdev_get_io_channel(g_desc);
	bdev_ch = spdk_io_channel_get_ctx(io_ch);
	CU_ASSERT(bdev_ch->flags == 0);


	/* Test case 1) reset_io_drain_timeout set to 0. Reset should be sent down immediately. */
	bdev = &g_bdev.bdev;
	bdev->reset_io_drain_timeout = 0;

	status_reset = SPDK_BDEV_IO_STATUS_PENDING;
	rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
	CU_ASSERT(rc == 0);
	poll_threads();
	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);

	/* Call reset completion inside bdev module. */
	stub_complete_io(g_bdev.io_target, 0);
	poll_threads();
	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);


	/* Test case 2) no outstanding IO are present. Reset should perform one iteration over
	 * channels and then be skipped. */
	bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
	status_reset = SPDK_BDEV_IO_STATUS_PENDING;

	rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
	CU_ASSERT(rc == 0);
	poll_threads();
	/* Reset was never submitted to the bdev module. */
	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);


	/* Test case 3) outstanding IO finish during bdev reset procedure. Reset should initiate
	 * wait poller to check for IO completions every second, until reset_io_drain_timeout is
	 * reached, but finish earlier than this threshold. */
	status0 = SPDK_BDEV_IO_STATUS_PENDING;
	status_reset = SPDK_BDEV_IO_STATUS_PENDING;
	rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
	CU_ASSERT(rc == 0);

	rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
	CU_ASSERT(rc == 0);
	poll_threads();
	/* The reset just started and should not have been submitted yet. */
	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);

	poll_threads();
	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
	/* Let the poller wait for about half the time then complete outstanding IO. */
	for (iter = 0; iter < 2; iter++) {
		/* Reset is still processing and not submitted at this point. */
		CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
		spdk_delay_us(1000 * 1000);
		poll_threads();
		poll_threads();
	}
	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
	/* Completing the read lets the next poller sweep find zero outstanding IO,
	 * so the reset is skipped and completes with SUCCESS. */
	stub_complete_io(g_bdev.io_target, 0);
	poll_threads();
	spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
	poll_threads();
	poll_threads();
	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* Sending reset to the bdev module has been skipped. */
	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);


	/* Test case 4) outstanding IO are still present after reset_io_drain_timeout
	 * seconds have passed. */
	status0 = SPDK_BDEV_IO_STATUS_PENDING;
	status_reset = SPDK_BDEV_IO_STATUS_PENDING;
	rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
	CU_ASSERT(rc == 0);

	rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
	CU_ASSERT(rc == 0);
	poll_threads();
	/* The reset just started and should not have been submitted yet. */
	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);

	poll_threads();
	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
	/* Let the poller wait for reset_io_drain_timeout seconds. */
	for (iter = 0; iter < bdev->reset_io_drain_timeout; iter++) {
		CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
		spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
		poll_threads();
		poll_threads();
	}

	/* After timing out, the reset should have been sent to the module. */
	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
	/* Complete reset submitted to the module and the read IO. */
	stub_complete_io(g_bdev.io_target, 0);
	poll_threads();
	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);


	/* Destroy the channel and end the test. */
	spdk_put_io_channel(io_ch);
	poll_threads();

	teardown_test();
}


static void
basic_qos(void)
{
@@ -2092,7 +2302,9 @@ main(int argc, char **argv)
	CU_ADD_TEST(suite, basic_qos);
	CU_ADD_TEST(suite, put_channel_during_reset);
	CU_ADD_TEST(suite, aborted_reset);
	CU_ADD_TEST(suite, aborted_reset_no_outstanding_io);
	CU_ADD_TEST(suite, io_during_reset);
	CU_ADD_TEST(suite, reset_completions);
	CU_ADD_TEST(suite, io_during_qos_queue);
	CU_ADD_TEST(suite, io_during_qos_reset);
	CU_ADD_TEST(suite, enomem);