Commit 2826ef13 authored by Shuhei Matsumoto's avatar Shuhei Matsumoto Committed by Tomasz Zawadzki
Browse files

bdev/nvme: Fail reset sequence immediately if ctrlr is already removed.



After a controller was hot-removed, if a reset sequence started to
the controller, spdk_nvme_ctrlr_disconnect() failed and caused core
dump in debug mode.

When implemented, how to cause the failure and how to process the
failure were not clear. Hence assert was added to detect the
failure.

We know how we cause the failure now. Let's handle the failure
appropriately.

If spdk_nvme_ctrlr_disconnect() fails, we are on the nvme_ctrlr->thread.
Hence call bdev_nvme_reset_complete() with failure immediately.

Even if spdk_nvme_ctrlr_disconnect() completes synchronously, the
completion callback is executed asynchronously when polling an adminq.

Hence set the completion callback only if spdk_nvme_ctrlr_disconnect()
succeeds.

Fixes issue #2632

Signed-off-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I11f61853aba9eca2515592f964a291e59def7247
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13892


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: default avatarKrzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
parent 6439abc0
Loading
Loading
Loading
Loading
+12 −8
Original line number Diff line number Diff line
@@ -1533,10 +1533,21 @@ bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
	}
}

static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc __attribute__((unused));
	int rc;

	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	if (rc != 0) {
		/* Disconnect fails if ctrlr is already resetting or removed. In this case,
		 * fail the reset sequence immediately.
		 */
		bdev_nvme_reset_complete(nvme_ctrlr, false);
		return;
	}

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
@@ -1544,13 +1555,6 @@ nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* Disconnect fails if ctrlr is already resetting or removed. Both cases are
	 * not possible. Reset is controlled and the callback to hot remove is called
	 * when ctrlr is hot removed.
	 */
	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	assert(rc == 0);

	/* During disconnection, reduce the period to poll adminq more often. */
	bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}
+35 −0
Original line number Diff line number Diff line
@@ -233,6 +233,7 @@ struct spdk_nvme_ctrlr {
	bool				attached;
	bool				is_failed;
	bool				fail_reset;
	bool				is_removed;
	struct spdk_nvme_transport_id	trid;
	TAILQ_HEAD(, spdk_nvme_qpair)	active_io_qpairs;
	TAILQ_ENTRY(spdk_nvme_ctrlr)	tailq;
@@ -764,6 +765,10 @@ spdk_nvme_ctrlr_reconnect_async(struct spdk_nvme_ctrlr *ctrlr)
int
spdk_nvme_ctrlr_disconnect(struct spdk_nvme_ctrlr *ctrlr)
{
	if (ctrlr->is_removed) {
		return -ENXIO;
	}

	ctrlr->adminq.is_connected = false;
	ctrlr->is_failed = false;

@@ -1303,6 +1308,17 @@ test_create_ctrlr(void)
	CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
}

static void
ut_check_hotplug_on_reset(void *cb_arg, bool success)
{
	bool *detect_remove = cb_arg;

	CU_ASSERT(success == false);
	SPDK_CU_ASSERT_FATAL(detect_remove != NULL);

	*detect_remove = true;
}

static void
test_reset_ctrlr(void)
{
@@ -1312,6 +1328,7 @@ test_reset_ctrlr(void)
	struct nvme_path_id *curr_trid;
	struct spdk_io_channel *ch1, *ch2;
	struct nvme_ctrlr_channel *ctrlr_ch1, *ctrlr_ch2;
	bool detect_remove;
	int rc;

	ut_init_trid(&trid);
@@ -1406,6 +1423,24 @@ test_reset_ctrlr(void)
	CU_ASSERT(nvme_ctrlr->resetting == false);
	CU_ASSERT(curr_trid->is_failed == false);

	/* Case 4: ctrlr is already removed. */
	ctrlr.is_removed = true;

	rc = bdev_nvme_reset(nvme_ctrlr);
	CU_ASSERT(rc == 0);

	detect_remove = false;
	nvme_ctrlr->reset_cb_fn = ut_check_hotplug_on_reset;
	nvme_ctrlr->reset_cb_arg = &detect_remove;

	poll_threads();

	CU_ASSERT(nvme_ctrlr->reset_cb_fn == NULL);
	CU_ASSERT(nvme_ctrlr->reset_cb_arg == NULL);
	CU_ASSERT(detect_remove == true);

	ctrlr.is_removed = false;

	spdk_put_io_channel(ch2);

	set_thread(0);