Commit 7348b89c authored by Shuhei Matsumoto's avatar Shuhei Matsumoto Committed by Tomasz Zawadzki
Browse files

bdev/nvme: Reset tries alternate paths immediately if allowed



Previously, one bdev_nvme_failover() call tried only one alternate path.
However, when the controller has more than two alternate paths and its
reconnect_delay_sec is non-zero, if the first failover failed, the following
retries will be delayed with reconnect_delay_sec seconds.

We want all alternate paths to be tried immediately but want to set
backoff if we try the same alternate path again in a single
bdev_nvme_failover() call.

Hence, add last_failed_tsc to nvme_path_id structure. Set the current
timestamp to the last_failed_tsc when bdev_nvme_failover() is called or
reconnect is failed, or clear the last_failed_tsc if reconnect succeeds.

Then, control trid switch and reconnect timing based on the
last_failed_tsc.

Add comments in the source code to explain this complex logic and modify
unit tests to test this behavior.

Signed-off-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I464ea12a32efe0de3889a6705fa0a6c92aeadbd6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16864


Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
parent d1efcf8c
Loading
Loading
Loading
Loading
+80 −18
Original line number Diff line number Diff line
@@ -1744,8 +1744,14 @@ bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
/* This function marks the current trid as failed by storing the current ticks
 * and then sets the next trid to the active trid within a controller if exists.
 *
 * The purpose of the boolean return value is to request the caller to disconnect
 * the current trid now to try connecting the next trid.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));
@@ -1755,10 +1761,21 @@ bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	path_id->is_failed = true;
	/* Update the last failed time. It means the trid is failed if its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		return;
		/* There is no alternate trid within a controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
@@ -1779,6 +1796,22 @@ bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove)
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover() is just called or the next trid is not failed
		 * or used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds. */
	return false;
}

static bool
@@ -1866,7 +1899,6 @@ bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
		if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
			nvme_ctrlr->fast_io_fail_timedout = true;
		}
		bdev_nvme_failover_trid(nvme_ctrlr, false);
		return OP_DELAYED_RECONNECT;
	}
}
@@ -1928,7 +1960,6 @@ _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	bool success = spdk_io_channel_iter_get_ctx(i) == NULL;
	struct nvme_path_id *path_id;
	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;
	enum bdev_nvme_op_after_reset op_after_reset;
@@ -1948,14 +1979,7 @@ _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
	nvme_ctrlr->resetting = false;
	nvme_ctrlr->dont_retry = false;

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id != NULL);
	assert(path_id == nvme_ctrlr->active_path_id);

	path_id->is_failed = !success;

	op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (reset_cb_fn) {
@@ -1981,6 +2005,31 @@ _bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status)
static void
bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!success) {
		/* Connecting the active trid failed. Set the next alternate trid to the
		 * active trid if it exists.
		 */
		if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) {
			/* The next alternate trid exists and is ready to try. Try it now. */
			pthread_mutex_unlock(&nvme_ctrlr->mutex);

			nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
			return;
		}

		/* We came here if there is no alternate trid or if the next trid exists but
		 * is not ready to try. We will try the active trid after reconnect_delay_sec
		 * seconds if it is non-zero or at the next reset call otherwise.
		 */
	} else {
		/* Connecting the active trid succeeded. Clear the last failed time because it
		 * means the trid is failed if its last failed time is non-zero.
		 */
		nvme_ctrlr->active_path_id->last_failed_tsc = 0;
	}
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_complete_pending_resets,
@@ -2340,7 +2389,7 @@ bdev_nvme_failover_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove)
		return -EBUSY;
	}

	bdev_nvme_failover_trid(nvme_ctrlr, remove);
	bdev_nvme_failover_trid(nvme_ctrlr, remove, true);

	if (nvme_ctrlr->reconnect_is_delayed) {
		SPDK_NOTICELOG("Reconnect is already scheduled.\n");
@@ -5126,22 +5175,35 @@ static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *new_trid, *tmp_trid;
	struct nvme_path_id *active_id, *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;
	new_trid->is_failed = false;

	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) {
	active_id = nvme_ctrlr->active_path_id;
	assert(active_id != NULL);
	assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));

	/* Skip the active trid not to replace it until it is failed. */
	tmp_trid = TAILQ_NEXT(active_id, link);
	if (tmp_trid == NULL) {
		goto add_tail;
	}

	/* It means the trid is faled if its last failed time is non-zero.
	 * Insert the new alternate trid before any failed trid.
	 */
	TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->last_failed_tsc != 0) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

add_tail:
	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}
+1 −1
Original line number Diff line number Diff line
@@ -93,7 +93,7 @@ struct nvme_path_id {
	struct spdk_nvme_transport_id		trid;
	struct spdk_nvme_host_id		hostid;
	TAILQ_ENTRY(nvme_path_id)		link;
	bool					is_failed;
	uint64_t				last_failed_tsc;
};

typedef void (*bdev_nvme_reset_cb)(void *cb_arg, bool success);
+71 −34
Original line number Diff line number Diff line
@@ -1434,7 +1434,7 @@ test_reset_ctrlr(void)

	/* Case 3: reset completes successfully. */
	nvme_ctrlr->resetting = false;
	curr_trid->is_failed = true;
	curr_trid->last_failed_tsc = spdk_get_ticks();
	ctrlr.is_failed = true;

	rc = bdev_nvme_reset(nvme_ctrlr);
@@ -1470,7 +1470,7 @@ test_reset_ctrlr(void)
	CU_ASSERT(ctrlr_ch1->qpair->qpair != NULL);
	CU_ASSERT(ctrlr_ch2->qpair->qpair != NULL);
	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(curr_trid->is_failed == true);
	CU_ASSERT(curr_trid->last_failed_tsc != 0);

	poll_thread_times(0, 2);
	CU_ASSERT(nvme_ctrlr->resetting == true);
@@ -1478,7 +1478,7 @@ test_reset_ctrlr(void)
	CU_ASSERT(nvme_ctrlr->resetting == true);
	poll_thread_times(0, 1);
	CU_ASSERT(nvme_ctrlr->resetting == false);
	CU_ASSERT(curr_trid->is_failed == false);
	CU_ASSERT(curr_trid->last_failed_tsc == 0);

	/* Case 4: ctrlr is already removed. */
	ctrlr.is_removed = true;
@@ -1637,7 +1637,7 @@ test_failover_ctrlr(void)

	rc = bdev_nvme_failover(nvme_ctrlr, false);
	CU_ASSERT(rc == -ENXIO);
	CU_ASSERT(curr_trid->is_failed == false);
	CU_ASSERT(curr_trid->last_failed_tsc == 0);

	/* Case 2: reset is in progress. */
	nvme_ctrlr->destruct = false;
@@ -1653,7 +1653,7 @@ test_failover_ctrlr(void)
	CU_ASSERT(rc == 0);

	CU_ASSERT(nvme_ctrlr->resetting == true);
	CU_ASSERT(curr_trid->is_failed == true);
	CU_ASSERT(curr_trid->last_failed_tsc != 0);

	poll_threads();
	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
@@ -1663,7 +1663,7 @@ test_failover_ctrlr(void)
	SPDK_CU_ASSERT_FATAL(curr_trid != NULL);

	CU_ASSERT(nvme_ctrlr->resetting == false);
	CU_ASSERT(curr_trid->is_failed == false);
	CU_ASSERT(curr_trid->last_failed_tsc == 0);

	set_thread(0);

@@ -1789,7 +1789,7 @@ test_race_between_failover_and_add_secondary_trid(void)

	poll_threads();

	CU_ASSERT(path_id1->is_failed == true);
	CU_ASSERT(path_id1->last_failed_tsc != 0);
	CU_ASSERT(path_id1 == nvme_ctrlr->active_path_id);

	rc = bdev_nvme_reset(nvme_ctrlr);
@@ -2485,6 +2485,39 @@ test_add_remove_trid(void)
	}
	CU_ASSERT(ctrid != NULL);

	/* Mark path3 as failed by setting its last_failed_tsc to non-zero forcefully.
	 * If we add path2 again, path2 should be inserted between path1 and path3.
	 * Then, we remove path2. It is not used, and simply removed.
	 */
	ctrid->last_failed_tsc = spdk_get_ticks() + 1;

	ctrlr2 = ut_attach_ctrlr(&path2.trid, 0, false, false);
	SPDK_CU_ASSERT_FATAL(ctrlr2 != NULL);

	rc = bdev_nvme_create(&path2.trid, "nvme0", attached_names, STRING_SIZE,
			      attach_ctrlr_done, NULL, NULL, NULL, false);
	CU_ASSERT(rc == 0);

	spdk_delay_us(1000);
	poll_threads();

	CU_ASSERT(spdk_nvme_transport_id_compare(&nvme_ctrlr->active_path_id->trid, &path1.trid) == 0);

	ctrid = TAILQ_NEXT(nvme_ctrlr->active_path_id, link);
	SPDK_CU_ASSERT_FATAL(ctrid != NULL);
	CU_ASSERT(spdk_nvme_transport_id_compare(&ctrid->trid, &path2.trid) == 0);

	ctrid = TAILQ_NEXT(ctrid, link);
	SPDK_CU_ASSERT_FATAL(ctrid != NULL);
	CU_ASSERT(spdk_nvme_transport_id_compare(&ctrid->trid, &path3.trid) == 0);

	rc = bdev_nvme_delete("nvme0", &path2);
	CU_ASSERT(rc == 0);
	CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == nvme_ctrlr);
	TAILQ_FOREACH(ctrid, &nvme_ctrlr->trids, link) {
		CU_ASSERT(spdk_nvme_transport_id_compare(&ctrid->trid, &path2.trid) != 0);
	}

	/* path1 is currently used and path3 is an alternative path.
	 * If we remove path1, path is changed to path3.
	 */
@@ -4025,9 +4058,9 @@ test_reset_bdev_ctrlr(void)
	 * pending reset requests for ctrlr1.
	 */
	ctrlr1->is_failed = true;
	curr_path1->is_failed = true;
	curr_path1->last_failed_tsc = spdk_get_ticks();
	ctrlr2->is_failed = true;
	curr_path2->is_failed = true;
	curr_path2->last_failed_tsc = spdk_get_ticks();

	set_thread(0);

@@ -4049,7 +4082,7 @@ test_reset_bdev_ctrlr(void)
	CU_ASSERT(nvme_ctrlr1->resetting == true);
	CU_ASSERT(ctrlr1->is_failed == false);
	CU_ASSERT(ctrlr1->adminq.is_connected == false);
	CU_ASSERT(curr_path1->is_failed == true);
	CU_ASSERT(curr_path1->last_failed_tsc != 0);

	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_thread_times(0, 2);
@@ -4069,7 +4102,7 @@ test_reset_bdev_ctrlr(void)
	CU_ASSERT(nvme_ctrlr1->resetting == true);
	poll_thread_times(0, 2);
	CU_ASSERT(nvme_ctrlr1->resetting == false);
	CU_ASSERT(curr_path1->is_failed == false);
	CU_ASSERT(curr_path1->last_failed_tsc == 0);
	CU_ASSERT(first_bio->io_path == io_path12);
	CU_ASSERT(nvme_ctrlr2->resetting == true);

@@ -4086,7 +4119,7 @@ test_reset_bdev_ctrlr(void)
	CU_ASSERT(nvme_ctrlr2->resetting == true);
	CU_ASSERT(ctrlr2->is_failed == false);
	CU_ASSERT(ctrlr2->adminq.is_connected == false);
	CU_ASSERT(curr_path2->is_failed == true);
	CU_ASSERT(curr_path2->last_failed_tsc != 0);

	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_thread_times(0, 2);
@@ -4107,7 +4140,7 @@ test_reset_bdev_ctrlr(void)
	poll_thread_times(0, 2);
	CU_ASSERT(first_bio->io_path == NULL);
	CU_ASSERT(nvme_ctrlr2->resetting == false);
	CU_ASSERT(curr_path2->is_failed == false);
	CU_ASSERT(curr_path2->last_failed_tsc == 0);

	poll_threads();

@@ -4121,9 +4154,9 @@ test_reset_bdev_ctrlr(void)
	 * ctrl2, both complete successfully.
	 */
	ctrlr1->is_failed = true;
	curr_path1->is_failed = true;
	curr_path1->last_failed_tsc = spdk_get_ticks();
	ctrlr2->is_failed = true;
	curr_path2->is_failed = true;
	curr_path2->last_failed_tsc = spdk_get_ticks();
	first_bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	second_bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;

@@ -4146,9 +4179,9 @@ test_reset_bdev_ctrlr(void)
	poll_threads();

	CU_ASSERT(ctrlr1->is_failed == false);
	CU_ASSERT(curr_path1->is_failed == false);
	CU_ASSERT(curr_path1->last_failed_tsc == 0);
	CU_ASSERT(ctrlr2->is_failed == false);
	CU_ASSERT(curr_path2->is_failed == false);
	CU_ASSERT(curr_path2->last_failed_tsc == 0);
	CU_ASSERT(first_bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	CU_ASSERT(second_bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);

@@ -5408,10 +5441,18 @@ test_retry_failover_ctrlr(void)

	path_id1 = ut_get_path_id_by_trid(nvme_ctrlr, &trid1);
	SPDK_CU_ASSERT_FATAL(path_id1 != NULL);
	CU_ASSERT(path_id1->is_failed == false);
	CU_ASSERT(path_id1->last_failed_tsc == 0);
	CU_ASSERT(path_id1 == nvme_ctrlr->active_path_id);

	/* If reset failed and reconnect is scheduled, path_id is switched from trid1 to trid2. */
	path_id2 = ut_get_path_id_by_trid(nvme_ctrlr, &trid2);
	SPDK_CU_ASSERT_FATAL(path_id2 != NULL);

	path_id3 = ut_get_path_id_by_trid(nvme_ctrlr, &trid3);
	SPDK_CU_ASSERT_FATAL(path_id3 != NULL);

	/* It is expected that connecting both of trid1, trid2, and trid3 fail,
	 * and a reconnect timer is started. */
	ctrlr.fail_reset = true;
	ctrlr.is_failed = true;

@@ -5427,29 +5468,24 @@ test_retry_failover_ctrlr(void)
	CU_ASSERT(ctrlr_ch->qpair->qpair == NULL);
	CU_ASSERT(nvme_ctrlr->reconnect_delay_timer != NULL);
	CU_ASSERT(nvme_ctrlr->reconnect_is_delayed == true);
	CU_ASSERT(path_id1->is_failed == true);
	CU_ASSERT(path_id1->last_failed_tsc != 0);

	path_id2 = ut_get_path_id_by_trid(nvme_ctrlr, &trid2);
	SPDK_CU_ASSERT_FATAL(path_id2 != NULL);
	CU_ASSERT(path_id2->is_failed == false);
	CU_ASSERT(path_id2 == nvme_ctrlr->active_path_id);
	CU_ASSERT(path_id2->last_failed_tsc != 0);
	CU_ASSERT(path_id3->last_failed_tsc != 0);
	CU_ASSERT(path_id1 == nvme_ctrlr->active_path_id);

	/* If we remove trid2 while reconnect is scheduled, trid2 is removed and path_id is
	 * switched to trid3 but reset is not started.
	/* If we remove trid1 while reconnect is scheduled, trid1 is removed and path_id is
	 * switched to trid2 but reset is not started.
	 */
	rc = bdev_nvme_failover(nvme_ctrlr, true);
	CU_ASSERT(rc == 0);

	CU_ASSERT(ut_get_path_id_by_trid(nvme_ctrlr, &trid2) == NULL);

	path_id3 = ut_get_path_id_by_trid(nvme_ctrlr, &trid3);
	SPDK_CU_ASSERT_FATAL(path_id3 != NULL);
	CU_ASSERT(path_id3->is_failed == false);
	CU_ASSERT(path_id3 == nvme_ctrlr->active_path_id);
	CU_ASSERT(ut_get_path_id_by_trid(nvme_ctrlr, &trid1) == NULL);
	CU_ASSERT(path_id2 == nvme_ctrlr->active_path_id);

	CU_ASSERT(nvme_ctrlr->resetting == false);

	/* If reconnect succeeds, trid3 should be the active path_id */
	/* If reconnect succeeds, trid2 should be the active path_id */
	ctrlr.fail_reset = false;

	spdk_delay_us(SPDK_SEC_TO_USEC);
@@ -5462,8 +5498,9 @@ test_retry_failover_ctrlr(void)
	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_threads();

	CU_ASSERT(path_id3->is_failed == false);
	CU_ASSERT(path_id3 == nvme_ctrlr->active_path_id);
	CU_ASSERT(ut_get_path_id_by_trid(nvme_ctrlr, &trid2) != NULL);
	CU_ASSERT(path_id2->last_failed_tsc == 0);
	CU_ASSERT(path_id2 == nvme_ctrlr->active_path_id);
	CU_ASSERT(nvme_ctrlr->resetting == false);
	CU_ASSERT(ctrlr_ch->qpair->qpair != NULL);
	CU_ASSERT(nvme_ctrlr->reconnect_is_delayed == false);