Commit cc4335f4 authored by Daniel Nowak's avatar Daniel Nowak Committed by Jim Harris
Browse files

lib/bdev: fix the issue where locking a range could cause a hang



This situation was observed in the bdev_lock_lba_range_check_io()
function and occurred in the following scenario:

1. An incoming I/O request is sent to the raid5f bdev.
2. The I/O is split into two child I/Os.
3. The first child I/O is submitted.
4. A range lock is applied to the entire raid5f bdev due to a failure in
   one of its member devices.
5. The second child I/O is submitted. As a result, it is placed on the
   io_locked list in the bdev_io_submit() function.
6. Consequently, the locking procedure gets stuck in the
   bdev_lock_lba_range_check_io() function.

The expectation is that no I/Os on the io_submitted list overlap with
the locked range. However, the parent I/O is still waiting for the
second child I/O to complete. Since the second child I/O is on the
io_locked list, it is not processed, preventing the parent I/O from
completing. This results in a form of deadlock.

Note that this issue can only occur when not all child I/Os are
submitted immediately by bdev_io_split(). The unit test simulates a
scenario where the pool does not have enough bdev_ios available, so the
second child I/O can only be submitted after the first one completes.
This could also happen if the split ran out of child iovecs, for
example. This second child I/O won't be allowed to proceed because the
range is locked and no new I/Os are allowed. But this is not a new I/O -
it is a part of an I/O that was submitted before the range was locked
and it needs to go through. Otherwise, the parent I/O and the lock won't
complete.

This commit fixes the issue by preventing a lock on an I/O if it is part
of a parent-split I/O.

Change-Id: I0445815f0bf8af70f59bea0a4f7649552d4eeed2
Signed-off-by: default avatarDaniel Nowak <daniel.nowak@solidigm.com>
Signed-off-by: default avatarArtur Paszkiewicz <artur.paszkiewicz@solidigm.com>
Reviewed-on: https://review.spdk.io/c/spdk/spdk/+/26149


Tested-by: default avatarSPDK Automated Test System <spdkbot@gmail.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarJim Harris <jim.harris@nvidia.com>
Reviewed-by: default avatarBen Walker <ben@nvidia.com>
parent 3904cdae
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -986,7 +986,10 @@ struct spdk_bdev_io_internal_fields {
			/** Whether we are currently inside the submit request call */
			uint8_t in_submit_request		: 1;

			uint8_t reserved			: 2;
			/** Whether the I/O is a sub-I/O of a split parent I/O */
			uint8_t child_io		: 1;

			uint8_t reserved			: 1;
		};
		uint8_t raw;
	} f;
+10 −2
Original line number Diff line number Diff line
@@ -3784,7 +3784,9 @@ bdev_io_submit(struct spdk_bdev_io *bdev_io)

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);

	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
	/* Child I/Os are not checked against locked ranges because their parent I/O was already
	 * checked before splitting, so they must be allowed to proceed. */
	if (!bdev_io->internal.f.child_io && !TAILQ_EMPTY(&ch->locked_ranges)) {
		struct lba_range *range;

		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
@@ -4011,8 +4013,14 @@ bdev_io_init(struct spdk_bdev_io *bdev_io,
	bdev_io->internal.get_aux_buf_cb = NULL;
	bdev_io->internal.data_transfer_cpl = NULL;
	bdev_io->internal.waitq_entry.dep_unblock = false;
	if (cb == bdev_io_split_done) {
		bdev_io->internal.f.child_io = true;
		bdev_io->internal.f.split = false;
	} else {
		bdev_io->internal.f.child_io = false;
		bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
	}
}

static bool
bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
+83 −0
Original line number Diff line number Diff line
@@ -5175,6 +5175,88 @@ lock_lba_range_overlapped(void)
	ut_fini_bdev();
}

static void
lock_lba_range_with_split_io(void)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_desc *desc = NULL;
	struct spdk_io_channel *io_ch;
	struct spdk_bdev_channel *channel;
	struct spdk_bdev_opts bdev_opts = {};
	struct lba_range *range;
	char buf[4096];
	int ctx1;
	int rc;

	spdk_bdev_get_opts(&bdev_opts, sizeof(bdev_opts));
	bdev_opts.bdev_io_pool_size = 2;
	bdev_opts.bdev_io_cache_size = 0;

	ut_init_bdev(&bdev_opts);
	bdev = allocate_bdev("bdev0");

	/* Force the write I/O to be split based on write_unit_size. There are only two bdev_io in
	 * the pool to make the second part of the split I/O submitted after the first is completed.
	 */
	bdev->write_unit_size = 2;
	bdev->split_on_write_unit = true;

	rc = spdk_bdev_open_ext("bdev0", true, bdev_ut_event_cb, NULL, &desc);
	CU_ASSERT(rc == 0);
	CU_ASSERT(desc != NULL);
	CU_ASSERT(bdev == spdk_bdev_desc_get_bdev(desc));
	io_ch = spdk_bdev_get_io_channel(desc);
	CU_ASSERT(io_ch != NULL);
	channel = spdk_io_channel_get_ctx(io_ch);

	g_io_done = false;
	rc = spdk_bdev_write_blocks(desc, io_ch, buf, 20, 4, io_done, &ctx1);
	CU_ASSERT(rc == 0);

	g_lock_lba_range_done = false;
	rc = bdev_lock_lba_range(desc, io_ch, 20, 10, lock_lba_range_done, &ctx1);
	CU_ASSERT(rc == 0);
	poll_threads();

	/* The lock should not be fully valid yet, since a write I/O is outstanding.
	 * But note that the range should be on the channel's locked_list, to make sure no
	 * new write I/O are started.
	 */
	CU_ASSERT(g_io_done == false);
	CU_ASSERT(g_lock_lba_range_done == false);
	range = TAILQ_FIRST(&channel->locked_ranges);
	SPDK_CU_ASSERT_FATAL(range != NULL);
	CU_ASSERT(range->offset == 20);
	CU_ASSERT(range->length == 10);

	/* Complete the first part of the split I/O */
	stub_complete_io(1);
	spdk_delay_us(100);
	poll_threads();
	CU_ASSERT(g_io_done == false);
	CU_ASSERT(g_lock_lba_range_done == false);

	/* Complete the write I/O.  This should make the lock valid (checked by confirming
	 * our callback was invoked).
	 */
	stub_complete_io(1);
	spdk_delay_us(100);
	poll_threads();
	CU_ASSERT(g_io_done == true);
	CU_ASSERT(g_lock_lba_range_done == true);

	rc = bdev_unlock_lba_range(desc, io_ch, 20, 10, unlock_lba_range_done, &ctx1);
	CU_ASSERT(rc == 0);
	poll_threads();

	CU_ASSERT(TAILQ_EMPTY(&channel->locked_ranges));

	spdk_put_io_channel(io_ch);
	spdk_bdev_close(desc);
	free_bdev(bdev);
	ut_fini_bdev();
}

static void
bdev_quiesce_done(void *ctx, int status)
{
@@ -7800,6 +7882,7 @@ main(int argc, char **argv)
	CU_ADD_TEST(suite, lock_lba_range_check_ranges);
	CU_ADD_TEST(suite, lock_lba_range_with_io_outstanding);
	CU_ADD_TEST(suite, lock_lba_range_overlapped);
	CU_ADD_TEST(suite, lock_lba_range_with_split_io);
	CU_ADD_TEST(suite, bdev_quiesce);
	CU_ADD_TEST(suite, bdev_io_abort);
	CU_ADD_TEST(suite, bdev_unmap);