Commit 6cdc0f25 authored by Shuhei Matsumoto's avatar Shuhei Matsumoto Committed by Tomasz Zawadzki
Browse files

bdev/nvme: Defer failover() until reset() completes



There was a complex issue that failover was lost and I/O qpair was never
created again if fabric connect command got timeout for I/O qpair while
controller was being reset.

To create I/O qpair for such case, add a boolean pending_failover variable
to nvme_ctrlr structure, When bdev_nvme_failover() is called, if
nvme_ctrlr->resetting is true, set pending_failover to true and return.
Then, at _bdev_nvme_reset_complete() if pending_failover is true, call
set failover_pending to false and call bdev_nvme_failover().

However, we have to be more careful. most SPDK threads call
bdev_nvme_failover() almost simultaneously for a network error.
For this case, we have to call bdev_nvme_failover() only once per
network error. To do this, add and use another boolean variable
in_failover.

After this change, bdev_nvme_failover() call is not lost but deferred.
Hence, use -EINPROGRESS instead of -EBUSY for clarification.

Verify this change by adding a unit test case.

NOTE: Better practical workaround will be to extend timeout for fabric
connect command. While fabric connect command is in progress, I/Os are
queued even if the upper layer does not enable I/O error resiliency.
But, this fix will be necessary. Otherwise, connection establishment is
not retried.

Signed-off-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Ibe346b8ae35cab5bd2bcbda1aaa12d2d9364e283
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/18209


Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
parent 7348b89c
Loading
Loading
Loading
Loading
+20 −2
Original line number Diff line number Diff line
@@ -1880,6 +1880,7 @@ enum bdev_nvme_op_after_reset {
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
	OP_FAILOVER,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;
@@ -1890,6 +1891,10 @@ bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		/* Complete pending destruct after reset completes. */
		return OP_COMPLETE_PENDING_DESTRUCT;
	} else if (nvme_ctrlr->pending_failover) {
		nvme_ctrlr->pending_failover = false;
		nvme_ctrlr->reset_start_tsc = 0;
		return OP_FAILOVER;
	} else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		nvme_ctrlr->reset_start_tsc = 0;
		return OP_NONE;
@@ -1978,6 +1983,7 @@ _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->resetting = false;
	nvme_ctrlr->dont_retry = false;
	nvme_ctrlr->in_failover = false;

	op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);
	pthread_mutex_unlock(&nvme_ctrlr->mutex);
@@ -1997,6 +2003,9 @@ _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
	case OP_DELAYED_RECONNECT:
		nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer);
		break;
	case OP_FAILOVER:
		bdev_nvme_failover(nvme_ctrlr, false);
		break;
	default:
		break;
	}
@@ -2385,9 +2394,17 @@ bdev_nvme_failover_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove)
	}

	if (nvme_ctrlr->resetting) {
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		if (!nvme_ctrlr->in_failover) {
			SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n");

			/* Defer failover until reset completes. */
			nvme_ctrlr->pending_failover = true;
			return -EINPROGRESS;
		} else {
			SPDK_NOTICELOG("Unable to perform failover, already in progress.\n");
			return -EBUSY;
		}
	}

	bdev_nvme_failover_trid(nvme_ctrlr, remove, true);

@@ -2399,6 +2416,7 @@ bdev_nvme_failover_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove)
	}

	nvme_ctrlr->resetting = true;
	nvme_ctrlr->in_failover = true;

	assert(nvme_ctrlr->reset_start_tsc == 0);
	nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
+2 −0
Original line number Diff line number Diff line
@@ -111,6 +111,8 @@ struct nvme_ctrlr {

	uint32_t				resetting : 1;
	uint32_t				reconnect_is_delayed : 1;
	uint32_t				in_failover : 1;
	uint32_t				pending_failover : 1;
	uint32_t				fast_io_fail_timedout : 1;
	uint32_t				destruct : 1;
	uint32_t				ana_log_page_updating : 1;
+146 −3
Original line number Diff line number Diff line
@@ -1474,11 +1474,11 @@ test_reset_ctrlr(void)

	poll_thread_times(0, 2);
	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(curr_trid->last_failed_tsc == 0);
	poll_thread_times(1, 1);
	CU_ASSERT(nvme_ctrlr->resetting == true);
	poll_thread_times(0, 1);
	CU_ASSERT(nvme_ctrlr->resetting == false);
	CU_ASSERT(curr_trid->last_failed_tsc == 0);

	/* Case 4: ctrlr is already removed. */
	ctrlr.is_removed = true;
@@ -1644,7 +1644,7 @@ test_failover_ctrlr(void)
	nvme_ctrlr->resetting = true;

	rc = bdev_nvme_failover(nvme_ctrlr, false);
	CU_ASSERT(rc == -EBUSY);
	CU_ASSERT(rc == -EINPROGRESS);

	/* Case 3: reset completes successfully. */
	nvme_ctrlr->resetting = false;
@@ -1683,7 +1683,7 @@ test_failover_ctrlr(void)
	nvme_ctrlr->resetting = true;

	rc = bdev_nvme_failover(nvme_ctrlr, false);
	CU_ASSERT(rc == -EBUSY);
	CU_ASSERT(rc == -EINPROGRESS);

	/* Case 5: failover completes successfully. */
	nvme_ctrlr->resetting = false;
@@ -6561,6 +6561,148 @@ test_retry_io_to_same_path(void)
	g_opts.bdev_retry_count = 0;
}

/* This case is to verify a fix for a complex race condition that
 * failover is lost if fabric connect command gets timeout while
 * controller is being reset.
 */
static void
test_race_between_reset_and_disconnected(void)
{
	struct spdk_nvme_transport_id trid = {};
	struct spdk_nvme_ctrlr ctrlr = {};
	struct nvme_ctrlr *nvme_ctrlr = NULL;
	struct nvme_path_id *curr_trid;
	struct spdk_io_channel *ch1, *ch2;
	struct nvme_ctrlr_channel *ctrlr_ch1, *ctrlr_ch2;
	int rc;

	ut_init_trid(&trid);
	TAILQ_INIT(&ctrlr.active_io_qpairs);

	set_thread(0);

	rc = nvme_ctrlr_create(&ctrlr, "nvme0", &trid, NULL);
	CU_ASSERT(rc == 0);

	nvme_ctrlr = nvme_ctrlr_get_by_name("nvme0");
	SPDK_CU_ASSERT_FATAL(nvme_ctrlr != NULL);

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	SPDK_CU_ASSERT_FATAL(curr_trid != NULL);

	ch1 = spdk_get_io_channel(nvme_ctrlr);
	SPDK_CU_ASSERT_FATAL(ch1 != NULL);

	ctrlr_ch1 = spdk_io_channel_get_ctx(ch1);
	CU_ASSERT(ctrlr_ch1->qpair != NULL);

	set_thread(1);

	ch2 = spdk_get_io_channel(nvme_ctrlr);
	SPDK_CU_ASSERT_FATAL(ch2 != NULL);

	ctrlr_ch2 = spdk_io_channel_get_ctx(ch2);
	CU_ASSERT(ctrlr_ch2->qpair != NULL);

	/* Reset starts from thread 1. */
	set_thread(1);

	nvme_ctrlr->resetting = false;
	curr_trid->last_failed_tsc = spdk_get_ticks();
	ctrlr.is_failed = true;

	rc = bdev_nvme_reset(nvme_ctrlr);
	CU_ASSERT(rc == 0);
	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(ctrlr_ch1->qpair != NULL);
	CU_ASSERT(ctrlr_ch2->qpair != NULL);

	poll_thread_times(0, 3);
	CU_ASSERT(ctrlr_ch1->qpair->qpair == NULL);
	CU_ASSERT(ctrlr_ch2->qpair->qpair != NULL);

	poll_thread_times(0, 1);
	poll_thread_times(1, 1);
	CU_ASSERT(ctrlr_ch1->qpair->qpair == NULL);
	CU_ASSERT(ctrlr_ch2->qpair->qpair == NULL);
	CU_ASSERT(ctrlr.is_failed == true);

	poll_thread_times(1, 1);
	poll_thread_times(0, 1);
	CU_ASSERT(ctrlr.is_failed == false);
	CU_ASSERT(ctrlr.adminq.is_connected == false);

	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_thread_times(0, 2);
	CU_ASSERT(ctrlr.adminq.is_connected == true);

	poll_thread_times(0, 1);
	CU_ASSERT(ctrlr_ch1->qpair->qpair != NULL);
	CU_ASSERT(ctrlr_ch2->qpair->qpair == NULL);

	poll_thread_times(1, 1);
	CU_ASSERT(ctrlr_ch1->qpair->qpair != NULL);
	CU_ASSERT(ctrlr_ch2->qpair->qpair != NULL);
	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(curr_trid->last_failed_tsc != 0);

	poll_thread_times(0, 2);
	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(curr_trid->last_failed_tsc == 0);
	poll_thread_times(1, 1);
	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(nvme_ctrlr->pending_failover == false);

	/* Here is just one poll before _bdev_nvme_reset_complete() is executed.
	 *
	 * spdk_nvme_ctrlr_reconnect_poll_async() returns success before fabric
	 * connect command is executed. If fabric connect command gets timeout,
	 * bdev_nvme_failover() is executed. This should be deferred until
	 * _bdev_nvme_reset_complete() sets ctrlr->resetting to false.
	 *
	 * Simulate fabric connect command timeout by calling bdev_nvme_failover().
	 */
	rc = bdev_nvme_failover(nvme_ctrlr, false);
	CU_ASSERT(rc == -EINPROGRESS);
	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(nvme_ctrlr->pending_failover == true);
	CU_ASSERT(curr_trid->last_failed_tsc == 0);

	poll_thread_times(0, 1);

	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(nvme_ctrlr->pending_failover == false);
	CU_ASSERT(curr_trid->last_failed_tsc != 0);

	poll_threads();
	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_threads();

	CU_ASSERT(nvme_ctrlr->resetting == false);
	CU_ASSERT(nvme_ctrlr->pending_failover == false);
	CU_ASSERT(curr_trid->last_failed_tsc == 0);
	CU_ASSERT(ctrlr_ch1->qpair->qpair != NULL);
	CU_ASSERT(ctrlr_ch2->qpair->qpair != NULL);


	spdk_put_io_channel(ch2);

	set_thread(0);

	spdk_put_io_channel(ch1);

	poll_threads();

	rc = bdev_nvme_delete("nvme0", &g_any_path);
	CU_ASSERT(rc == 0);

	poll_threads();
	spdk_delay_us(1000);
	poll_threads();

	CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
}

int
main(int argc, const char **argv)
{
@@ -6614,6 +6756,7 @@ main(int argc, const char **argv)
	CU_ADD_TEST(suite, test_set_multipath_policy);
	CU_ADD_TEST(suite, test_uuid_generation);
	CU_ADD_TEST(suite, test_retry_io_to_same_path);
	CU_ADD_TEST(suite, test_race_between_reset_and_disconnected);

	CU_basic_set_mode(CU_BRM_VERBOSE);