Commit 2bd876f8 authored by Vladislav Fedyaev's avatar Vladislav Fedyaev Committed by Jim Harris
Browse files

bdev: fix data race with bdev unregister/open with QoS enabled



There is a rare data race condition, where bdev_enable_qos_done()
can be called after the bdev is unregistered, resulting in
"unrecoverable spinlock error 8: Lock has been destroyed (!sspin->destroyed)"
and abort call.

Change-Id: I7749062b3cb55a871c4c11d69da29173bd0b81f7
Signed-off-by: default avatarVladislav Fedyaev <a37206@gmail.com>
Reviewed-on: https://review.spdk.io/c/spdk/spdk/+/26124


Reviewed-by: default avatarVasilii Ivanov <iwanovvvasilij@gmail.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarChangpeng Liu <changpeliu@tencent.com>
Reviewed-by: default avatarAnkit Kumar <ankit.kumar@samsung.com>
Reviewed-by: default avatarJim Harris <jim.harris@nvidia.com>
Tested-by: default avatarSPDK Automated Test System <spdkbot@gmail.com>
parent 5ee159bc
Loading
Loading
Loading
Loading
+26 −0
Original line number Diff line number Diff line
@@ -8332,6 +8332,11 @@ bdev_unregister_unsafe(struct spdk_bdev *bdev)
		event_notify(desc, _remove_notify);
	}

	if (bdev->internal.qos_mod_in_progress) {
		/* QoS setup is in progress, can't unregister for now. */
		rc = -EBUSY;
	}

	/* If there are no descriptors, proceed removing the bdev */
	if (rc == 0) {
		bdev_examine_allowlist_remove(bdev->name);
@@ -8484,6 +8489,7 @@ bdev_start_qos(struct spdk_bdev *bdev)
			return -ENOMEM;
		}
		ctx->bdev = bdev;
		ctx->bdev->internal.qos_mod_in_progress = true;
		spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
	}

@@ -9764,6 +9770,9 @@ bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb
static void
bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
{
	struct spdk_bdev *bdev;
	int rc;

	spdk_spin_lock(&ctx->bdev->internal.spinlock);
	ctx->bdev->internal.qos_mod_in_progress = false;
	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
@@ -9771,7 +9780,24 @@ bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
	if (ctx->cb_fn) {
		ctx->cb_fn(ctx->cb_arg, status);
	}
	bdev = ctx->bdev;
	free(ctx);

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
		SPDK_DEBUGLOG(bdev, "Data race detected - trying to enable QoS on unregistered bdev %s",
			      bdev->name);
		rc = bdev_unregister_unsafe(bdev);
		spdk_spin_unlock(&bdev->internal.spinlock);

		if (rc == 0) {
			spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
		}
	} else {
		spdk_spin_unlock(&bdev->internal.spinlock);
	}
	spdk_spin_unlock(&g_bdev_mgr.spinlock);
}

static void
+61 −0
Original line number Diff line number Diff line
@@ -2921,6 +2921,66 @@ qos_bdev_open_close_data_race(void)
	teardown_test();
}

static void
bdev_unregister_open_qos_data_race(void)
{
	bool done;
	struct spdk_bdev_desc *desc = NULL;
	struct spdk_io_channel *io_ch;
	int rc;
	uint64_t limits[4] = { 1000, 10000, 1000, 1000 };

	setup_test();
	set_thread(0);

	/* setup_test() automatically opens the bdev,
	 * but this test needs to do that in a different
	 * way. */

	spdk_bdev_close(g_desc);
	poll_threads();

	set_thread(1);
	spdk_bdev_open_ext("ut_bdev", true, _bdev_event_cb, NULL, &desc);
	SPDK_CU_ASSERT_FATAL(desc != NULL);
	done = false;

	set_thread(2);
	io_ch = spdk_bdev_get_io_channel(desc);

	set_thread(0);
	/* Enable qos, so bdev_open() will call it later. */
	spdk_bdev_set_qos_rate_limits(&g_bdev.bdev, limits, qos_dynamic_enable_done, &rc);
	poll_threads();
	set_thread(2);
	spdk_put_io_channel(io_ch);
	set_thread(1);
	/* All channels are closed, close the last descriptor. */
	spdk_bdev_close(desc);
	/* bdev_open() will call start_qos(). */
	spdk_bdev_open_ext("ut_bdev", true, _bdev_event_cb, NULL, &desc);
	set_thread(2);
	/* unregister() will proceed to spdk_io_device_unregister(),
	 * bdev_unregister_unsafe() returns 0, * as there are no channels open.  */
	spdk_bdev_unregister(&g_bdev.bdev, _bdev_unregistered, &done);
	set_thread(1);
	/* Close the last descriptor, unregister() will destroy the bdev. */
	spdk_bdev_close(desc);
	/* bdev_enable_qos_done() is called, trying to lock the destroyed spinlock. */
	poll_thread(1);
	poll_thread(2);

	/* Poll the threads to allow all events to be processed. */
	poll_threads();

	/* Restore the original g_bdev so that we can use teardown_test(). */
	set_thread(0);
	register_bdev(&g_bdev, "ut_bdev", &g_io_device);
	spdk_bdev_open_ext("ut_bdev", true, _bdev_event_cb, NULL, &g_desc);
	teardown_test();
}


int
main(int argc, char **argv)
{
@@ -2960,6 +3020,7 @@ main(int argc, char **argv)
	CU_ADD_TEST(suite, unregister_and_qos_poller);
	CU_ADD_TEST(suite, reset_start_complete_race);
	CU_ADD_TEST(suite, qos_bdev_open_close_data_race);
	CU_ADD_TEST(suite, bdev_unregister_open_qos_data_race);

	num_failures = spdk_ut_run_tests(argc, argv, NULL);
	CU_cleanup_registry();