Commit 3c3dbb47 authored by Vladislav Fedyaev's avatar Vladislav Fedyaev Committed by Jim Harris
Browse files

bdev: fixed data race with qos poller start on bdev open/close



There is a rare race condition (test case with comments provided), where
there can be a QoS poller still standing after the bdev is closed and
unregistered, preventing the destruction of the QoS poller and its
corresponding thread.

Change-Id: I4d5ea2545516f9a4b3f65d34967028f2a8d42009
Signed-off-by: default avatarVladislav Fedyaev <a37206@gmail.com>
Reviewed-on: https://review.spdk.io/c/spdk/spdk/+/25672


Reviewed-by: default avatarJim Harris <jim.harris@nvidia.com>
Community-CI: Mellanox Build Bot
Tested-by: default avatarSPDK Automated Test System <spdkbot@gmail.com>
Reviewed-by: default avatarBen Walker <ben@nvidia.com>
Reviewed-by: default avatarVasilii Ivanov <iwanovvvasilij@gmail.com>
parent 559f03a9
Loading
Loading
Loading
Loading
+24 −2
Original line number Diff line number Diff line
@@ -8422,6 +8422,15 @@ spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void
	bdev->internal.unregister_cb = cb_fn;
	bdev->internal.unregister_ctx = cb_arg;
	bdev->internal.unregister_td = thread;

	/* kill QoS, if it's still running */
	if (bdev->internal.qos && bdev->internal.qos->poller && TAILQ_EMPTY(&bdev->internal.open_descs)) {
		SPDK_DEBUGLOG(bdev, "Data race detected - QoS poller still present on closed bdev name: %s",
			      bdev->name);
		if (bdev_qos_destroy(bdev)) {
			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

@@ -9827,11 +9836,24 @@ bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
		    struct spdk_io_channel *ch, void *_ctx)
{
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
	int rc = 0;

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev->internal.status == SPDK_BDEV_STATUS_READY) {
		bdev_enable_qos(bdev, bdev_ch);
	} else {
		SPDK_DEBUGLOG(bdev,
			      "Data race detected - requested to enable QoS on wrong bdev state bdev name: %s, bdev state: %d",
			      bdev->name, bdev->internal.status);
		if (bdev->internal.qos &&
		    bdev->internal.qos->ch == NULL) { /* QoS has not been fully created yet, shall clean up */
			free(bdev->internal.qos);
			bdev->internal.qos = NULL;
			rc = -EAGAIN;
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
	spdk_bdev_for_each_channel_continue(i, 0);
	spdk_bdev_for_each_channel_continue(i, rc);
}

static void
+59 −0
Original line number Diff line number Diff line
@@ -2863,6 +2863,64 @@ reset_start_complete_race(void)
	teardown_test();
}

/*
 * Regression test: reproduce the open/close data race in which a QoS poller
 * can be (re)started on a bdev whose descriptors have all been closed,
 * which previously left a stray poller alive after unregister.
 * The exact set_thread()/poll_threads() ordering below is what triggers the
 * race — do not reorder these calls.
 */
static void
qos_bdev_open_close_data_race(void)
{
	bool done;
	struct spdk_bdev_desc *desc = NULL, *desc2 = NULL;
	struct spdk_io_channel *io_ch;

	int rc;
	uint64_t limits[4] = { 1000, 10000, 1000, 1000 };

	setup_test();
	set_thread(0);

	/* setup_test() automatically opens the bdev, but this test needs to
	 * control when/where the descriptors are opened, so close it first. */
	spdk_bdev_close(g_desc);
	poll_threads();

	set_thread(0);
	spdk_bdev_open_ext("ut_bdev", true, _bdev_event_cb, NULL, &desc);
	SPDK_CU_ASSERT_FATAL(desc != NULL);
	done = false;

	/* Enable QoS rate limits so a QoS poller will be created for this bdev. */
	spdk_bdev_set_qos_rate_limits(&g_bdev.bdev, limits, qos_dynamic_enable_done, &rc);
	poll_threads();

	set_thread(1);
	/* bdev_create_channel - start qos on channel 1, channel ref = 2 */
	io_ch = spdk_bdev_get_io_channel(desc);
	poll_threads();

	set_thread(0);
	/* close bdev - kill qos - send put channel 1, channel ref = 1 */
	spdk_bdev_close(desc);
	set_thread(1);
	/* send put channel 1, channel ref = 0, destroy ref = 1, send destroy channel */
	spdk_put_io_channel(io_ch);
	set_thread(1);
	/* start qos - get channel 1, channel ref = 1, destruction is cancelled */
	spdk_bdev_open_ext("ut_bdev", true, _bdev_event_cb, NULL, &desc2);
	/* no qos poller present, no put_channel call */
	spdk_bdev_close(desc2);
	poll_threads();
	set_thread(0);
	/* remove bdev rpc, qos poller is created and prevents bdev destruction */
	spdk_bdev_unregister(&g_bdev.bdev, _bdev_unregistered, &done);

	/* With the fix, unregister must have torn down the stray QoS poller. */
	SPDK_CU_ASSERT_FATAL(g_bdev.bdev.internal.qos->poller == NULL);

	/* Poll the threads to allow all events to be processed */
	poll_threads();

	/* Restore the original g_bdev so that we can use teardown_test(). */
	set_thread(0);
	register_bdev(&g_bdev, "ut_bdev", &g_io_device);
	spdk_bdev_open_ext("ut_bdev", true, _bdev_event_cb, NULL, &g_desc);
	teardown_test();
}

int
main(int argc, char **argv)
{
@@ -2901,6 +2959,7 @@ main(int argc, char **argv)
	CU_ADD_TEST(suite, event_notify_and_close);
	CU_ADD_TEST(suite, unregister_and_qos_poller);
	CU_ADD_TEST(suite, reset_start_complete_race);
	CU_ADD_TEST(suite, qos_bdev_open_close_data_race);

	num_failures = spdk_ut_run_tests(argc, argv, NULL);
	CU_cleanup_registry();