Commit ae620784 authored by Shuhei Matsumoto's avatar Shuhei Matsumoto Committed by Jim Harris
Browse files

bdev/nvme: Retry I/O to the same path if error is I/O error



When an I/O gets an I/O error, the I/O path to which the I/O was
submitted may be still available. In this case, the I/O should be
retried to the same I/O path. However, a new I/O path was always
selected for an I/O retry.

For the active/passive policy, the same I/O path was selected naturally.
However, for the active/active policy, it was very likely that a
different I/O path was selected.

To use the same I/O path for an I/O retry, add a helper function
bdev_nvme_retry_io() into bdev_nvme_retry_ios() and replace
bdev_nvme_submit_request() by bdev_nvme_retry_io(). bdev_nvme_retry_io()
checks if nbdev_io->io_path is not NULL and is available. Then, call
_bdev_nvme_submit_request() if true, or call bdev_nvme_submit_request()
otherwise. For I/O path error, clear nbdev_io->io_path for
clarification. Add unit test to verify this change.

Linux kernel native NVMe multipath already takes this approach, so this
change follows established practice.

Signed-off-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I7022aafd8b1cdd5830c4f743d64b080aa970cf8d
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16015


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: default avatarRichael <richael.zhuang@arm.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
parent 6fa7cf97
Loading
Loading
Loading
Loading
+24 −6
Original line number Diff line number Diff line
@@ -145,6 +145,8 @@ static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
@@ -943,11 +945,24 @@ any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;

	/* If the cached I/O path is gone or no longer usable, fall back to the
	 * normal submission path, which selects a fresh I/O path.
	 */
	if (nbdev_io->io_path == NULL || !nvme_io_path_is_available(nbdev_io->io_path)) {
		bdev_nvme_submit_request(spdk_io_channel_from_ctx(nbdev_ch), bdev_io);
		return;
	}

	/* The path the I/O was originally submitted to is still available;
	 * retry on that same path to preserve path affinity.
	 */
	_bdev_nvme_submit_request(nbdev_ch, bdev_io);
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(nbdev_ch);
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;
@@ -962,7 +977,7 @@ bdev_nvme_retry_ios(void *arg)

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_submit_request(ch, bdev_io);
		bdev_nvme_retry_io(nbdev_ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
@@ -1112,11 +1127,15 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		nbdev_ch->current_io_path = NULL;
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			goto complete;
		}
		delay_ms = 0;
	} else {
		bio->retry_count++;
@@ -1130,10 +1149,8 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
		}
	}

	if (any_io_path_may_become_available(nbdev_ch)) {
	bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
	return;
	}

complete:
	bio->retry_count = 0;
@@ -1158,6 +1175,7 @@ bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

		nbdev_ch->current_io_path = NULL;
		bio->io_path = NULL;

		if (any_io_path_may_become_available(nbdev_ch)) {
			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
+176 −0
Original line number Diff line number Diff line
@@ -6179,6 +6179,181 @@ test_uuid_generation(void)
	CU_ASSERT((spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &uuid1)) == 0);
}

static void
test_retry_io_to_same_path(void)
{
	struct nvme_path_id path1 = {}, path2 = {};
	struct spdk_nvme_ctrlr *ctrlr1, *ctrlr2;
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr1, *nvme_ctrlr2;
	const int STRING_SIZE = 32;
	const char *attached_names[STRING_SIZE];
	struct nvme_bdev *bdev;
	struct spdk_bdev_io *bdev_io;
	struct nvme_bdev_io *bio;
	struct spdk_io_channel *ch;
	struct nvme_bdev_channel *nbdev_ch;
	struct nvme_io_path *io_path1, *io_path2;
	struct ut_nvme_req *req;
	struct spdk_uuid uuid1 = { .u.raw = { 0x1 } };
	int done;
	int rc;

	g_opts.nvme_ioq_poll_period_us = 1;

	memset(attached_names, 0, sizeof(char *) * STRING_SIZE);
	ut_init_trid(&path1.trid);
	ut_init_trid2(&path2.trid);
	g_ut_attach_ctrlr_status = 0;
	g_ut_attach_bdev_count = 1;

	set_thread(0);

	ctrlr1 = ut_attach_ctrlr(&path1.trid, 1, true, true);
	SPDK_CU_ASSERT_FATAL(ctrlr1 != NULL);

	ctrlr1->ns[0].uuid = &uuid1;

	rc = bdev_nvme_create(&path1.trid, "nvme0", attached_names, STRING_SIZE,
			      attach_ctrlr_done, NULL, NULL, NULL, true);
	CU_ASSERT(rc == 0);

	spdk_delay_us(1000);
	poll_threads();
	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_threads();

	ctrlr2 = ut_attach_ctrlr(&path2.trid, 1, true, true);
	SPDK_CU_ASSERT_FATAL(ctrlr2 != NULL);

	ctrlr2->ns[0].uuid = &uuid1;

	rc = bdev_nvme_create(&path2.trid, "nvme0", attached_names, STRING_SIZE,
			      attach_ctrlr_done, NULL, NULL, NULL, true);
	CU_ASSERT(rc == 0);

	spdk_delay_us(1000);
	poll_threads();
	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_threads();

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name("nvme0");
	SPDK_CU_ASSERT_FATAL(nbdev_ctrlr != NULL);

	nvme_ctrlr1 = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, &path1.trid);
	SPDK_CU_ASSERT_FATAL(nvme_ctrlr1 != NULL);

	nvme_ctrlr2 = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, &path2.trid);
	SPDK_CU_ASSERT_FATAL(nvme_ctrlr2 != NULL);

	bdev = nvme_bdev_ctrlr_get_bdev(nbdev_ctrlr, 1);
	SPDK_CU_ASSERT_FATAL(bdev != NULL);

	done = -1;
	bdev_nvme_set_multipath_policy(bdev->disk.name, BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
				       ut_set_multipath_policy_done, &done);
	poll_threads();
	CU_ASSERT(done == 0);

	CU_ASSERT(bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE);

	ch = spdk_get_io_channel(bdev);
	SPDK_CU_ASSERT_FATAL(ch != NULL);
	nbdev_ch = spdk_io_channel_get_ctx(ch);

	CU_ASSERT(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE);

	bdev_io = ut_alloc_bdev_io(SPDK_BDEV_IO_TYPE_WRITE, bdev, ch);
	ut_bdev_io_set_buf(bdev_io);

	bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

	io_path1 = ut_get_io_path_by_ctrlr(nbdev_ch, nvme_ctrlr1);
	SPDK_CU_ASSERT_FATAL(io_path1 != NULL);

	io_path2 = ut_get_io_path_by_ctrlr(nbdev_ch, nvme_ctrlr2);
	SPDK_CU_ASSERT_FATAL(io_path2 != NULL);

	/* The 1st I/O should be submitted to io_path1. */
	bdev_io->internal.in_submit_request = true;

	bdev_nvme_submit_request(ch, bdev_io);
	CU_ASSERT(bdev_io->internal.in_submit_request == true);
	CU_ASSERT(bio->io_path == io_path1);
	CU_ASSERT(io_path1->qpair->qpair->num_outstanding_reqs == 1);

	spdk_delay_us(1);

	poll_threads();
	CU_ASSERT(bdev_io->internal.in_submit_request == false);
	CU_ASSERT(bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS);

	/* The 2nd I/O should be submitted to io_path2 because the path selection
	 * policy is round-robin.
	 */
	bdev_io->internal.in_submit_request = true;

	bdev_nvme_submit_request(ch, bdev_io);
	CU_ASSERT(bdev_io->internal.in_submit_request == true);
	CU_ASSERT(bio->io_path == io_path2);
	CU_ASSERT(io_path2->qpair->qpair->num_outstanding_reqs == 1);

	req = ut_get_outstanding_nvme_request(io_path2->qpair->qpair, bio);
	SPDK_CU_ASSERT_FATAL(req != NULL);

	/* Set retry count to non-zero. */
	g_opts.bdev_retry_count = 1;

	/* Inject an I/O error. */
	req->cpl.status.sc = SPDK_NVME_SC_NAMESPACE_NOT_READY;
	req->cpl.status.sct = SPDK_NVME_SCT_GENERIC;

	/* The 2nd I/O should be queued to nbdev_ch. */
	spdk_delay_us(1);
	poll_thread_times(0, 1);

	CU_ASSERT(io_path2->qpair->qpair->num_outstanding_reqs == 0);
	CU_ASSERT(bdev_io->internal.in_submit_request == true);
	CU_ASSERT(bdev_io == TAILQ_FIRST(&nbdev_ch->retry_io_list));

	/* The 2nd I/O should keep caching io_path2. */
	CU_ASSERT(bio->io_path == io_path2);

	/* The 2nd I/O should be submitted to io_path2 again. */
	poll_thread_times(0, 1);

	CU_ASSERT(bdev_io->internal.in_submit_request == true);
	CU_ASSERT(bio->io_path == io_path2);
	CU_ASSERT(io_path2->qpair->qpair->num_outstanding_reqs == 1);

	spdk_delay_us(1);
	poll_threads();

	CU_ASSERT(io_path2->qpair->qpair->num_outstanding_reqs == 0);
	CU_ASSERT(bdev_io->internal.in_submit_request == false);
	CU_ASSERT(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);

	free(bdev_io);

	spdk_put_io_channel(ch);

	poll_threads();
	spdk_delay_us(1);
	poll_threads();

	rc = bdev_nvme_delete("nvme0", &g_any_path);
	CU_ASSERT(rc == 0);

	poll_threads();
	spdk_delay_us(1000);
	poll_threads();

	CU_ASSERT(nvme_bdev_ctrlr_get_by_name("nvme0") == NULL);

	g_opts.nvme_ioq_poll_period_us = 0;
	g_opts.bdev_retry_count = 0;
}

int
main(int argc, const char **argv)
{
@@ -6230,6 +6405,7 @@ main(int argc, const char **argv)
	CU_ADD_TEST(suite, test_disable_auto_failback);
	CU_ADD_TEST(suite, test_set_multipath_policy);
	CU_ADD_TEST(suite, test_uuid_generation);
	CU_ADD_TEST(suite, test_retry_io_to_same_path);

	CU_basic_set_mode(CU_BRM_VERBOSE);