Commit d47eb51c authored by Jinlong Chen's avatar Jinlong Chen Committed by Konrad Sztyber
Browse files

bdev: fix a race between reset start and complete



There is a race between reset start and complete:

1. reset_1 is completing. It clears bdev->internal.reset_in_progress and
   sends unfreeze_channel messages to remove queued resets of all
   channels.
2. reset_2 is starting. As bdev->internal.reset_in_progress has been
   cleared, it is inserted to queued_resets list and starts to freeze
   channels.
3. reset_1's unfreeze_channel message removes reset_2 from queued_resets
   list.
4. reset_2 finishes freezing channels, but the corresponding bdev_io has
   gone, hence resulting in segmentation fault.

To fix this, we use per-bdev queued_resets list instead of per-channel
ones, and nullify bdev->internal.reset_in_progress after unfreezing bdev
channels. In this way, we can assure that all resets submitted during an
in-progress reset can be queued and completed correctly.

Besides, we do not insert the reset that is submitted to the underlying
device into the queued_resets list, so that the list can be processsed
cleanly.

Change-Id: I7cb14d790c1e20cea86e4829555d04acc408ee28
Signed-off-by: default avatarJinlong Chen <chenjinlong.cjl@alibaba-inc.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/25371


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarKonrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Community CI Samsung <spdk.community.ci.samsung@gmail.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: default avatarGangCao <gang.cao@intel.com>
parent 83e8405e
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -731,6 +731,9 @@ struct spdk_bdev {
		/** points to a reset bdev_io if one is in progress. */
		struct spdk_bdev_io *reset_in_progress;

		/** List of reset bdev_ios that are not submitted to the underlying device. */
		bdev_io_tailq_t		queued_resets;

		/** poller for tracking the queue_depth of a device, NULL if not tracking */
		struct spdk_poller *qd_poller;

+66 −84
Original line number Diff line number Diff line
@@ -318,8 +318,6 @@ struct spdk_bdev_channel {
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t		queued_resets;

	lba_range_tailq_t	locked_ranges;

	/** List of I/Os queued by QoS. */
@@ -4230,7 +4228,6 @@ bdev_channel_create(void *io_device, void *ctx_buf)
	}

	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->queued_resets);
	TAILQ_INIT(&ch->locked_ranges);
	TAILQ_INIT(&ch->qos_queued_io);
	ch->flags = 0;
@@ -4655,8 +4652,6 @@ bdev_channel_destroy(void *io_device, void *ctx_buf)
	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
	spdk_spin_unlock(&ch->bdev->internal.spinlock);

	bdev_abort_all_queued_io(&ch->queued_resets, ch);

	bdev_channel_abort_queued_ios(ch);

	if (ch->histogram) {
@@ -6507,18 +6502,14 @@ static int bdev_reset_poll_for_outstanding_io(void *ctx);
static void
bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_channel *ch = _ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	struct spdk_bdev_io *bdev_io = _ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (status == -EBUSY) {
		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
							      bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
		} else {
			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);

			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
				/* If outstanding IOs are still present and reset_io_drain_timeout
				 * seconds passed, start the reset. */
@@ -6531,7 +6522,6 @@ bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int sta
			}
		}
	} else {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
		SPDK_DEBUGLOG(bdev,
			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
			      ch->bdev->name);
@@ -6561,13 +6551,10 @@ bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bd
static int
bdev_reset_poll_for_outstanding_io(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	struct spdk_bdev_io *bdev_io = ctx;

	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
	spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io,
				   bdev_reset_check_outstanding_io_done);

	return SPDK_POLLER_BUSY;
@@ -6576,25 +6563,20 @@ bdev_reset_poll_for_outstanding_io(void *ctx)
static void
bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_channel *ch = _ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);
	struct spdk_bdev_io *bdev_io = _ctx;

	if (bdev->reset_io_drain_timeout == 0) {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);

		bdev_io_submit_reset(bdev_io);
		return;
	}

	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
			(bdev->reset_io_drain_timeout * spdk_get_ticks_hz());

	/* In case bdev->reset_io_drain_timeout is not equal to zero,
	 * submit the reset to the underlying module only if outstanding I/O
	 * remain after reset_io_drain_timeout seconds have passed. */
	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
	spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io,
				   bdev_reset_check_outstanding_io_done);
}

@@ -6627,35 +6609,34 @@ bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bd
}

static void
bdev_start_reset(void *ctx)
bdev_start_reset(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = ctx;

	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
				   bdev_reset_freeze_channel_done);
}
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool freeze_channel = false;

static void
bdev_channel_start_reset(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev *bdev = ch->bdev;
	bdev_ch_add_to_io_submitted(bdev_io);

	assert(!TAILQ_EMPTY(&ch->queued_resets));
	/**
	 * Take a channel reference for the target bdev for the life of this
	 *  reset.  This guards against the channel getting destroyed before
	 *  the reset is completed.  We will release the reference when this
	 *  reset is completed.
	 */
	bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev->internal.reset_in_progress == NULL) {
		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
		/*
		 * Take a channel reference for the target bdev for the life of this
		 *  reset.  This guards against the channel getting destroyed while
		 *  spdk_bdev_for_each_channel() calls related to this reset IO are in
		 *  progress.  We will release the reference when this reset is
		 *  completed.
		 */
		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
		bdev_start_reset(ch);
		bdev->internal.reset_in_progress = bdev_io;
		freeze_channel = true;
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link);
	}
	spdk_spin_unlock(&bdev->internal.spinlock);

	if (freeze_channel) {
		spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io,
					   bdev_reset_freeze_channel_done);
	}
}

int
@@ -6675,17 +6656,9 @@ spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	bdev_io->internal.desc = desc;
	bdev_io->internal.submit_tsc = spdk_get_ticks();
	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
	bdev_io->u.reset.ch_ref = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_spin_lock(&bdev->internal.spinlock);
	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
	spdk_spin_unlock(&bdev->internal.spinlock);

	bdev_ch_add_to_io_submitted(bdev_io);

	bdev_channel_start_reset(channel);

	bdev_start_reset(bdev_io);
	return 0;
}

@@ -7435,17 +7408,45 @@ bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io)

static void bdev_destroy_cb(void *io_device);

static inline void
_bdev_reset_complete(void *ctx)
{
	struct spdk_bdev_io *bdev_io = ctx;

	/* Put the channel reference we got in submission. */
	assert(bdev_io->u.reset.ch_ref != NULL);
	spdk_put_io_channel(bdev_io->u.reset.ch_ref);
	bdev_io->u.reset.ch_ref = NULL;

	bdev_io_complete(bdev_io);
}

static void
bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_io *bdev_io = _ctx;
	bdev_io_tailq_t queued_resets;
	struct spdk_bdev_io *queued_reset;

	if (bdev_io->u.reset.ch_ref != NULL) {
		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
		bdev_io->u.reset.ch_ref = NULL;
	assert(bdev_io == bdev->internal.reset_in_progress);

	TAILQ_INIT(&queued_resets);

	spdk_spin_lock(&bdev->internal.spinlock);
	TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets,
		   spdk_bdev_io, internal.link);
	bdev->internal.reset_in_progress = NULL;
	spdk_spin_unlock(&bdev->internal.spinlock);

	while (!TAILQ_EMPTY(&queued_resets)) {
		queued_reset = TAILQ_FIRST(&queued_resets);
		TAILQ_REMOVE(&queued_resets, queued_reset, internal.link);
		queued_reset->internal.status = bdev_io->internal.status;
		spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset),
				     _bdev_reset_complete, queued_reset);
	}

	bdev_io_complete(bdev_io);
	_bdev_reset_complete(bdev_io);

	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
	    TAILQ_EMPTY(&bdev->internal.open_descs)) {
@@ -7457,16 +7458,9 @@ static void
bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
		      struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_io *bdev_io = _ctx;
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct spdk_bdev_io *queued_reset;

	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
	while (!TAILQ_EMPTY(&ch->queued_resets)) {
		queued_reset = TAILQ_FIRST(&ch->queued_resets);
		TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link);
		spdk_bdev_io_complete(queued_reset, bdev_io->internal.status);
	}

	spdk_bdev_for_each_channel_continue(i, 0);
}
@@ -7505,23 +7499,10 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
	bdev_io->internal.status = status;

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
		bool unlock_channels = false;

		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
			SPDK_ERRLOG("NOMEM returned for reset\n");
		}
		spdk_spin_lock(&bdev->internal.spinlock);
		if (bdev_io == bdev->internal.reset_in_progress) {
			bdev->internal.reset_in_progress = NULL;
			unlock_channels = true;
		}
		spdk_spin_unlock(&bdev->internal.spinlock);

		if (unlock_channels) {
		assert(bdev_io == bdev->internal.reset_in_progress);
		spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
					   bdev_reset_complete);
		return;
		}
	} else {
		bdev_io_decrement_outstanding(bdev_ch, shared_resource);
		if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
@@ -7826,6 +7807,7 @@ bdev_register(struct spdk_bdev *bdev)
	TAILQ_INIT(&bdev->internal.open_descs);
	TAILQ_INIT(&bdev->internal.locked_ranges);
	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
	TAILQ_INIT(&bdev->internal.queued_resets);
	TAILQ_INIT(&bdev->aliases);

	/* UUID may be specified by the user or defined by bdev itself.
+85 −9
Original line number Diff line number Diff line
@@ -592,6 +592,7 @@ put_channel_during_reset(void)
{
	struct spdk_io_channel *io_ch;
	bool done = false;
	uint32_t num_completed;

	setup_test();

@@ -607,7 +608,12 @@ put_channel_during_reset(void)
	spdk_bdev_reset(g_desc, io_ch, reset_done, &done);
	spdk_put_io_channel(io_ch);
	poll_threads();
	stub_complete_io(g_bdev.io_target, 0);

	/* Complete the reset. */
	num_completed = stub_complete_io(g_bdev.io_target, 0);
	CU_ASSERT(num_completed == 1);
	poll_threads();
	CU_ASSERT(done == true);

	teardown_test();
}
@@ -652,21 +658,17 @@ aborted_reset(void)
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress != NULL);

	/*
	 * Now destroy ch1.  This will abort the queued reset.  Check that
	 *  the second reset was completed with failed status.  Also check
	 *  that bdev->internal.reset_in_progress != NULL, since the
	 *  original reset has not been completed yet.  This ensures that
	 *  the bdev code is correctly noticing that the failed reset is
	 *  *not* the one that had been submitted to the bdev module.
	 * Now destroy ch1.  Nothing would really happen because the pending second reset
	 *  is still holding a reference of ch1.
	 */
	set_thread(1);
	spdk_put_io_channel(io_ch[1]);
	poll_threads();
	CU_ASSERT(status2 == SPDK_BDEV_IO_STATUS_FAILED);
	CU_ASSERT(status2 == SPDK_BDEV_IO_STATUS_PENDING);
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress != NULL);

	/*
	 * Now complete the first reset, verify that it completed with SUCCESS
	 * Now complete the first reset, verify that both resets completed with SUCCESS
	 *  status and that bdev->internal.reset_in_progress is also set back to NULL.
	 */
	set_thread(0);
@@ -674,8 +676,12 @@ aborted_reset(void)
	stub_complete_io(g_bdev.io_target, 0);
	poll_threads();
	CU_ASSERT(status1 == SPDK_BDEV_IO_STATUS_SUCCESS);
	CU_ASSERT(status2 == SPDK_BDEV_IO_STATUS_SUCCESS);
	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);

	/*
	 * Teardown should succeed.
	 */
	teardown_test();
}

@@ -2745,6 +2751,75 @@ unregister_and_qos_poller(void)
	teardown_test();
}

/**
 * There was a race between reset start and complete:
 *
 * 1. reset_1 is completing. It clears bdev->internal.reset_in_progress and sends
 *    unfreeze_channel messages to remove queued resets of all channels.
 * 2. reset_2 is starting. As bdev->internal.reset_in_progress has been cleared, it
 *    is inserted to queued_resets list and starts to freeze channels.
 * 3. reset_1's unfreeze_channel message removes reset_2 from queued_resets list.
 * 4. reset_2 finishes freezing channels, but the corresponding bdev_io has gone,
 *    hence resulting in segmentation fault.
 *
 * To fix this,
 * 1. Do not queue the reset that is submitted to the underlying device.
 * 2. Queue all other resets in a per-bdev list, so all of them can be completed
 *    at once.
 */
static void
reset_start_complete_race(void)
{
	struct spdk_io_channel *io_ch;
	bool done_reset_1 = false, done_reset_2 = false;
	uint32_t num_completed;
	int rc;

	setup_test();
	set_thread(0);

	io_ch = spdk_bdev_get_io_channel(g_desc);
	SPDK_CU_ASSERT_FATAL(io_ch != NULL);

	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);

	/**
	 * Submit reset_1.
	 */
	rc = spdk_bdev_reset(g_desc, io_ch, reset_done, &done_reset_1);
	CU_ASSERT(rc == 0);

	/**
	 * Poll threads so that reset_1 completes freezing channels and gets submitted to
	 *  the undelying device.
	 */
	poll_threads();

	/**
	 * Complete reset_1. This will start the unfreezing channel stage of reset_1, but
	 *  not complete it until next poll_threads.
	 */
	num_completed = stub_complete_io(g_bdev.io_target, 0);
	CU_ASSERT(num_completed == 1);

	/**
	 * Submit reset_2. It should be queued because reset_1 has not been completed yet.
	 */
	rc = spdk_bdev_reset(g_desc, io_ch, reset_done, &done_reset_2);
	CU_ASSERT(rc == 0);

	/**
	 * Poll threads. reset_1 completes unfreezing channels, then completes queued reset_2,
	 *  and finally itself gets completed.
	 */
	poll_threads();
	CU_ASSERT(done_reset_1 == true);
	CU_ASSERT(done_reset_2 == true);

	spdk_put_io_channel(io_ch);
	teardown_test();
}

int
main(int argc, char **argv)
{
@@ -2781,6 +2856,7 @@ main(int argc, char **argv)
	CU_ADD_TEST(suite_wt, spdk_bdev_examine_wt);
	CU_ADD_TEST(suite, event_notify_and_close);
	CU_ADD_TEST(suite, unregister_and_qos_poller);
	CU_ADD_TEST(suite, reset_start_complete_race);

	num_failures = spdk_ut_run_tests(argc, argv, NULL);
	CU_cleanup_registry();