Commit 583a24a4 authored by Dariusz Stojaczyk's avatar Dariusz Stojaczyk Committed by Jim Harris
Browse files

bdev: share nomem_io data between bdevs built on the same device



When there are two bdevs built on the same io_device,
it is possible that one bdev entirely saturates
underlying queue, not letting the second bdev issue
a single I/O. The second bdev will silently fail any
subsequent I/O and append it to the nomem_io list.
However, since we resend I/O only from I/O completion
callback and there's no outstanding I/O for that bdev
(io_outstanding==0), the I/O will never be resent.
It'll be stuck in nomem_io forever.

This patch makes nomem_io list to be shared between
bdevs built on the same device. It is now possible
that I/O completion callback from one bdev will retry
sending I/O from other bdev.

The shared bdev data is based on thread-local
bdev_mgmt_channel, so doesn't need any external
synchronization.

Change-Id: Ia5ac3a1627ce3de4087e43907c329aa7d07ed7c7
Signed-off-by: default avatarDariusz Stojaczyk <dariuszx.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/394658


Tested-by: default avatarSPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Reviewed-by: default avatarZiye Yang <optimistyzy@gmail.com>
parent 5e799ff4
Loading
Loading
Loading
Loading
+88 −31
Original line number Diff line number Diff line
@@ -113,6 +113,8 @@ struct spdk_bdev_mgmt_channel {
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;

	TAILQ_HEAD(, spdk_bdev_module_channel) module_channels;
};

struct spdk_bdev_desc {
@@ -136,14 +138,33 @@ struct spdk_bdev_channel {

	struct spdk_bdev_io_stat stat;

	bdev_io_tailq_t		queued_resets;

	uint32_t		flags;

	/* Per-device channel */
	struct spdk_bdev_module_channel *module_ch;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
#endif

};

/*
 * Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it posible to retry sending
 * IO to one bdev after IO from other bdev completes.
 */
struct spdk_bdev_module_channel {
	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	bdev_io_tailq_t		queued_resets;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 *  on this channel.
@@ -155,14 +176,12 @@ struct spdk_bdev_channel {
	 */
	uint64_t		nomem_threshold;

	uint32_t		flags;
	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*module_ch;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
#endif
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_module_channel) link;
};

static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
@@ -379,6 +398,8 @@ spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
	STAILQ_INIT(&ch->per_thread_cache);
	ch->per_thread_cache_count = 0;

	TAILQ_INIT(&ch->module_channels);

	return 0;
}

@@ -782,17 +803,18 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_module_channel	*shared_ch = bdev_ch->module_ch;

	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

	bdev_ch->io_outstanding++;
	shared_ch->io_outstanding++;
	bdev_io->in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
		if (spdk_likely(TAILQ_EMPTY(&shared_ch->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			bdev_ch->io_outstanding--;
			TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
			shared_ch->io_outstanding--;
			TAILQ_INSERT_TAIL(&shared_ch->nomem_io, bdev_io, link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
@@ -851,6 +873,8 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev		*bdev = io_device;
	struct spdk_bdev_channel	*ch = ctx_buf;
	struct spdk_bdev_mgmt_channel	*mgmt_ch;
	struct spdk_bdev_module_channel	*shared_ch;

	ch->bdev = io_device;
	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
@@ -864,12 +888,34 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
		return -1;
	}

	mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel);
	TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) {
		if (shared_ch->module_ch == ch->channel) {
			shared_ch->ref++;
			break;
		}
	}

	if (shared_ch == NULL) {
		shared_ch = calloc(1, sizeof(*shared_ch));
		if (!shared_ch) {
			spdk_put_io_channel(ch->channel);
			spdk_put_io_channel(ch->mgmt_channel);
			return -1;
		}

		shared_ch->io_outstanding = 0;
		TAILQ_INIT(&shared_ch->nomem_io);
		shared_ch->nomem_threshold = 0;
		shared_ch->module_ch = ch->channel;
		shared_ch->ref = 1;
		TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link);
	}

	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->queued_resets);
	TAILQ_INIT(&ch->nomem_io);
	ch->nomem_threshold = 0;
	ch->flags = 0;
	ch->module_ch = shared_ch;

#ifdef SPDK_CONFIG_VTUNE
	{
@@ -935,7 +981,7 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
			 *  that spdk_bdev_io_complete() will do.
			 */
			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
				ch->io_outstanding++;
				ch->module_ch->io_outstanding++;
			}
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
@@ -947,17 +993,24 @@ spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel	*ch = ctx_buf;
	struct spdk_bdev_mgmt_channel	*mgmt_channel;
	struct spdk_bdev_module_channel	*shared_ch = ch->module_ch;

	mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
	_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);

	assert(shared_ch->ref > 0);
	shared_ch->ref--;
	if (shared_ch->ref == 0) {
		assert(shared_ch->io_outstanding == 0);
		TAILQ_REMOVE(&mgmt_channel->module_channels, shared_ch, link);
		free(shared_ch);
	}
	spdk_put_io_channel(ch->channel);
	spdk_put_io_channel(ch->mgmt_channel);
	assert(ch->io_outstanding == 0);
}

int
@@ -1502,14 +1555,16 @@ _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
	struct spdk_io_channel 		*ch;
	struct spdk_bdev_channel	*channel;
	struct spdk_bdev_mgmt_channel	*mgmt_channel;
	struct spdk_bdev_module_channel	*shared_ch;

	ch = spdk_io_channel_iter_get_channel(i);
	channel = spdk_io_channel_get_ctx(ch);
	mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);
	shared_ch = channel->module_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	_spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
	_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);

@@ -1724,9 +1779,10 @@ static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_module_channel	*shared_ch = bdev_ch->module_ch;
	struct spdk_bdev_io *bdev_io;

	if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
	if (shared_ch->io_outstanding > shared_ch->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
@@ -1738,12 +1794,12 @@ _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
		return;
	}

	while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
		bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
		TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
		bdev_ch->io_outstanding++;
	while (!TAILQ_EMPTY(&shared_ch->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_ch->nomem_io);
		TAILQ_REMOVE(&shared_ch->nomem_io, bdev_io, link);
		shared_ch->io_outstanding++;
		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
		bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
@@ -1791,6 +1847,7 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
	struct spdk_bdev_module_channel	*shared_ch = bdev_ch->module_ch;

	bdev_io->status = status;

@@ -1813,22 +1870,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
			return;
		}
	} else {
		assert(bdev_ch->io_outstanding > 0);
		bdev_ch->io_outstanding--;
		assert(shared_ch->io_outstanding > 0);
		shared_ch->io_outstanding--;
		if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
			if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
			if (spdk_unlikely(!TAILQ_EMPTY(&shared_ch->nomem_io))) {
				_spdk_bdev_ch_retry_io(bdev_ch);
			}
		} else {
			TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
			TAILQ_INSERT_HEAD(&shared_ch->nomem_io, bdev_io, link);
			/*
			 * Wait for some of the outstanding I/O to complete before we
			 *  retry any of the nomem_io.  Normally we will wait for
			 *  NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
			 *  depth channels we will instead wait for half to complete.
			 */
			bdev_ch->nomem_threshold = spdk_max((int64_t)bdev_ch->io_outstanding / 2,
							    (int64_t)bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
			shared_ch->nomem_threshold = spdk_max((int64_t)shared_ch->io_outstanding / 2,
							      (int64_t)shared_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
			return;
		}
	}
+89 −14
Original line number Diff line number Diff line
@@ -486,6 +486,7 @@ enomem(void)
{
	struct spdk_io_channel *io_ch;
	struct spdk_bdev_channel *bdev_ch;
	struct spdk_bdev_module_channel *module_ch;
	struct ut_bdev_channel *ut_ch;
	const uint32_t IO_ARRAY_SIZE = 64;
	const uint32_t AVAIL = 20;
@@ -499,6 +500,7 @@ enomem(void)
	set_thread(0);
	io_ch = spdk_bdev_get_io_channel(g_desc);
	bdev_ch = spdk_io_channel_get_ctx(io_ch);
	module_ch = bdev_ch->module_ch;
	ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
	ut_ch->avail_cnt = AVAIL;

@@ -508,7 +510,7 @@ enomem(void)
		rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
		CU_ASSERT(rc == 0);
	}
	CU_ASSERT(TAILQ_EMPTY(&bdev_ch->nomem_io));
	CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));

	/*
	 * Next, submit one additional I/O.  This one should fail with ENOMEM and then go onto
@@ -517,8 +519,8 @@ enomem(void)
	status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
	rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
	CU_ASSERT(rc == 0);
	SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&bdev_ch->nomem_io));
	first_io = TAILQ_FIRST(&bdev_ch->nomem_io);
	SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));
	first_io = TAILQ_FIRST(&module_ch->nomem_io);

	/*
	 * Now submit a bunch more I/O.  These should all fail with ENOMEM and get queued behind
@@ -531,10 +533,10 @@ enomem(void)
	}

	/* Assert that first_io is still at the head of the list. */
	CU_ASSERT(TAILQ_FIRST(&bdev_ch->nomem_io) == first_io);
	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
	nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
	CU_ASSERT(bdev_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
	CU_ASSERT(TAILQ_FIRST(&module_ch->nomem_io) == first_io);
	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
	nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);
	CU_ASSERT(module_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));

	/*
	 * Complete 1 I/O only.  The key check here is bdev_io_tailq_cnt - this should not have
@@ -542,19 +544,19 @@ enomem(void)
	 *  list.
	 */
	stub_complete_io(g_bdev.io_target, 1);
	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);

	/*
	 * Complete enough I/O to hit the nomem_theshold.  This should trigger retrying nomem_io,
	 *  and we should see I/O get resubmitted to the test bdev module.
	 */
	stub_complete_io(g_bdev.io_target, NOMEM_THRESHOLD_COUNT - 1);
	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) < nomem_cnt);
	nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) < nomem_cnt);
	nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);

	/* Complete 1 I/O only.  This should not trigger retrying the queued nomem_io. */
	stub_complete_io(g_bdev.io_target, 1);
	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);

	/*
	 * Send a reset and confirm that all I/O are completed, including the ones that
@@ -567,14 +569,86 @@ enomem(void)
	/* This will complete the reset. */
	stub_complete_io(g_bdev.io_target, 0);

	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == 0);
	CU_ASSERT(bdev_ch->io_outstanding == 0);
	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == 0);
	CU_ASSERT(module_ch->io_outstanding == 0);

	spdk_put_io_channel(io_ch);
	poll_threads();
	teardown_test();
}

static void
enomem_multi_bdev(void)
{
	struct spdk_io_channel *io_ch;
	struct spdk_bdev_channel *bdev_ch;
	struct spdk_bdev_module_channel *module_ch;
	struct ut_bdev_channel *ut_ch;
	const uint32_t IO_ARRAY_SIZE = 64;
	const uint32_t AVAIL = 20;
	enum spdk_bdev_io_status status[IO_ARRAY_SIZE];
	uint32_t i;
	struct ut_bdev *second_bdev;
	struct spdk_bdev_desc *second_desc;
	struct spdk_bdev_channel *second_bdev_ch;
	struct spdk_io_channel *second_ch;
	int rc;

	setup_test();

	/* Register second bdev with the same io_target  */
	second_bdev = calloc(1, sizeof(*second_bdev));
	SPDK_CU_ASSERT_FATAL(second_bdev != NULL);
	register_bdev(second_bdev, "ut_bdev2", g_bdev.io_target);
	spdk_bdev_open(&second_bdev->bdev, true, NULL, NULL, &second_desc);

	set_thread(0);
	io_ch = spdk_bdev_get_io_channel(g_desc);
	bdev_ch = spdk_io_channel_get_ctx(io_ch);
	module_ch = bdev_ch->module_ch;
	ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
	ut_ch->avail_cnt = AVAIL;

	second_ch = spdk_bdev_get_io_channel(second_desc);
	second_bdev_ch = spdk_io_channel_get_ctx(second_ch);
	SPDK_CU_ASSERT_FATAL(module_ch == second_bdev_ch->module_ch);

	/* Saturate io_target through bdev A. */
	for (i = 0; i < AVAIL; i++) {
		status[i] = SPDK_BDEV_IO_STATUS_PENDING;
		rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
		CU_ASSERT(rc == 0);
	}
	CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));

	/*
	 * Now submit I/O through the second bdev. This should fail with ENOMEM
	 * and then go onto the nomem_io list.
	 */
	status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
	rc = spdk_bdev_read_blocks(second_desc, second_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
	CU_ASSERT(rc == 0);
	SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));

	/* Complete first bdev's I/O. This should retry sending second bdev's nomem_io */
	stub_complete_io(g_bdev.io_target, AVAIL);

	SPDK_CU_ASSERT_FATAL(TAILQ_EMPTY(&module_ch->nomem_io));
	CU_ASSERT(module_ch->io_outstanding == 1);

	/* Now complete our retried I/O  */
	stub_complete_io(g_bdev.io_target, 1);
	SPDK_CU_ASSERT_FATAL(module_ch->io_outstanding == 0);

	spdk_put_io_channel(io_ch);
	spdk_put_io_channel(second_ch);
	spdk_bdev_close(second_desc);
	unregister_bdev(second_bdev);
	free(second_bdev);
	poll_threads();
	teardown_test();
}

int
main(int argc, char **argv)
{
@@ -596,7 +670,8 @@ main(int argc, char **argv)
		CU_add_test(suite, "put_channel_during_reset", put_channel_during_reset) == NULL ||
		CU_add_test(suite, "aborted_reset", aborted_reset) == NULL ||
		CU_add_test(suite, "io_during_reset", io_during_reset) == NULL ||
		CU_add_test(suite, "enomem", enomem) == NULL
		CU_add_test(suite, "enomem", enomem) == NULL ||
		CU_add_test(suite, "enomem_multi_bdev", enomem_multi_bdev) == NULL
	) {
		CU_cleanup_registry();
		return CU_get_error();