bdev: share nomem_io data between bdevs built on the same device (583a24a4) · Commits · Public Repositories / spdk

lib/bdev/bdev.c

+88 −31

Original line number	Diff line number	Diff line
		@@ -113,6 +113,8 @@ struct spdk_bdev_mgmt_channel {
		*/
		bdev_io_stailq_t per_thread_cache;
		uint32_t per_thread_cache_count;

		TAILQ_HEAD(, spdk_bdev_module_channel) module_channels;
		};

		struct spdk_bdev_desc {
		@@ -136,14 +138,33 @@ struct spdk_bdev_channel {

		struct spdk_bdev_io_stat stat;

		bdev_io_tailq_t queued_resets;

		uint32_t flags;

		/* Per-device channel */
		struct spdk_bdev_module_channel *module_ch;

		#ifdef SPDK_CONFIG_VTUNE
		uint64_t start_tsc;
		uint64_t interval_tsc;
		__itt_string_handle *handle;
		#endif

		};

		/*
		* Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
		* will queue here their IO that awaits retry. It makes it posible to retry sending
		* IO to one bdev after IO from other bdev completes.
		*/
		struct spdk_bdev_module_channel {
		/*
		* Count of I/O submitted to bdev module and waiting for completion.
		* Incremented before submit_request() is called on an spdk_bdev_io.
		*/
		uint64_t io_outstanding;

		bdev_io_tailq_t queued_resets;

		/*
		* Queue of IO awaiting retry because of a previous NOMEM status returned
		* on this channel.
		@@ -155,14 +176,12 @@ struct spdk_bdev_channel {
		*/
		uint64_t nomem_threshold;

		uint32_t flags;
		/* I/O channel allocated by a bdev module */
		struct spdk_io_channel *module_ch;

		#ifdef SPDK_CONFIG_VTUNE
		uint64_t start_tsc;
		uint64_t interval_tsc;
		__itt_string_handle *handle;
		#endif
		uint32_t ref;

		TAILQ_ENTRY(spdk_bdev_module_channel) link;
		};

		static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io bdev_io, bool success, void cb_arg);
		@@ -379,6 +398,8 @@ spdk_bdev_mgmt_channel_create(void io_device, void ctx_buf)
		STAILQ_INIT(&ch->per_thread_cache);
		ch->per_thread_cache_count = 0;

		TAILQ_INIT(&ch->module_channels);

		return 0;
		}

		@@ -782,17 +803,18 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
		struct spdk_bdev *bdev = bdev_io->bdev;
		struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
		struct spdk_io_channel *ch = bdev_ch->channel;
		struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;

		assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);

		bdev_ch->io_outstanding++;
		shared_ch->io_outstanding++;
		bdev_io->in_submit_request = true;
		if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
		if (spdk_likely(TAILQ_EMPTY(&shared_ch->nomem_io))) {
		bdev->fn_table->submit_request(ch, bdev_io);
		} else {
		bdev_ch->io_outstanding--;
		TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
		shared_ch->io_outstanding--;
		TAILQ_INSERT_TAIL(&shared_ch->nomem_io, bdev_io, link);
		}
		} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		@@ -851,6 +873,8 @@ spdk_bdev_channel_create(void io_device, void ctx_buf)
		{
		struct spdk_bdev *bdev = io_device;
		struct spdk_bdev_channel *ch = ctx_buf;
		struct spdk_bdev_mgmt_channel *mgmt_ch;
		struct spdk_bdev_module_channel *shared_ch;

		ch->bdev = io_device;
		ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
		@@ -864,12 +888,34 @@ spdk_bdev_channel_create(void io_device, void ctx_buf)
		return -1;
		}

		mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel);
		TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) {
		if (shared_ch->module_ch == ch->channel) {
		shared_ch->ref++;
		break;
		}
		}

		if (shared_ch == NULL) {
		shared_ch = calloc(1, sizeof(*shared_ch));
		if (!shared_ch) {
		spdk_put_io_channel(ch->channel);
		spdk_put_io_channel(ch->mgmt_channel);
		return -1;
		}

		shared_ch->io_outstanding = 0;
		TAILQ_INIT(&shared_ch->nomem_io);
		shared_ch->nomem_threshold = 0;
		shared_ch->module_ch = ch->channel;
		shared_ch->ref = 1;
		TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link);
		}

		memset(&ch->stat, 0, sizeof(ch->stat));
		ch->io_outstanding = 0;
		TAILQ_INIT(&ch->queued_resets);
		TAILQ_INIT(&ch->nomem_io);
		ch->nomem_threshold = 0;
		ch->flags = 0;
		ch->module_ch = shared_ch;

		#ifdef SPDK_CONFIG_VTUNE
		{
		@@ -935,7 +981,7 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t queue, struct spdk_bdev_channel ch)
		* that spdk_bdev_io_complete() will do.
		*/
		if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
		ch->io_outstanding++;
		ch->module_ch->io_outstanding++;
		}
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
		@@ -947,17 +993,24 @@ spdk_bdev_channel_destroy(void io_device, void ctx_buf)
		{
		struct spdk_bdev_channel *ch = ctx_buf;
		struct spdk_bdev_mgmt_channel *mgmt_channel;
		struct spdk_bdev_module_channel *shared_ch = ch->module_ch;

		mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);

		_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
		_spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
		_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, ch);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);

		assert(shared_ch->ref > 0);
		shared_ch->ref--;
		if (shared_ch->ref == 0) {
		assert(shared_ch->io_outstanding == 0);
		TAILQ_REMOVE(&mgmt_channel->module_channels, shared_ch, link);
		free(shared_ch);
		}
		spdk_put_io_channel(ch->channel);
		spdk_put_io_channel(ch->mgmt_channel);
		assert(ch->io_outstanding == 0);
		}

		int
		@@ -1502,14 +1555,16 @@ _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
		struct spdk_io_channel *ch;
		struct spdk_bdev_channel *channel;
		struct spdk_bdev_mgmt_channel *mgmt_channel;
		struct spdk_bdev_module_channel *shared_ch;

		ch = spdk_io_channel_iter_get_channel(i);
		channel = spdk_io_channel_get_ctx(ch);
		mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);
		shared_ch = channel->module_ch;

		channel->flags \|= BDEV_CH_RESET_IN_PROGRESS;

		_spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
		_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, channel);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
		_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);

		@@ -1724,9 +1779,10 @@ static void
		_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
		{
		struct spdk_bdev *bdev = bdev_ch->bdev;
		struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
		struct spdk_bdev_io *bdev_io;

		if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
		if (shared_ch->io_outstanding > shared_ch->nomem_threshold) {
		/*
		* Allow some more I/O to complete before retrying the nomem_io queue.
		* Some drivers (such as nvme) cannot immediately take a new I/O in
		@@ -1738,12 +1794,12 @@ _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
		return;
		}

		while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
		bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
		TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
		bdev_ch->io_outstanding++;
		while (!TAILQ_EMPTY(&shared_ch->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_ch->nomem_io);
		TAILQ_REMOVE(&shared_ch->nomem_io, bdev_io, link);
		shared_ch->io_outstanding++;
		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
		bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
		break;
		}
		@@ -1791,6 +1847,7 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
		{
		struct spdk_bdev *bdev = bdev_io->bdev;
		struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
		struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;

		bdev_io->status = status;

		@@ -1813,22 +1870,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
		return;
		}
		} else {
		assert(bdev_ch->io_outstanding > 0);
		bdev_ch->io_outstanding--;
		assert(shared_ch->io_outstanding > 0);
		shared_ch->io_outstanding--;
		if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
		if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
		if (spdk_unlikely(!TAILQ_EMPTY(&shared_ch->nomem_io))) {
		_spdk_bdev_ch_retry_io(bdev_ch);
		}
		} else {
		TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
		TAILQ_INSERT_HEAD(&shared_ch->nomem_io, bdev_io, link);
		/*
		* Wait for some of the outstanding I/O to complete before we
		* retry any of the nomem_io. Normally we will wait for
		* NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
		* depth channels we will instead wait for half to complete.
		*/
		bdev_ch->nomem_threshold = spdk_max((int64_t)bdev_ch->io_outstanding / 2,
		(int64_t)bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
		shared_ch->nomem_threshold = spdk_max((int64_t)shared_ch->io_outstanding / 2,
		(int64_t)shared_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
		return;
		}
		}

test/unit/lib/bdev/mt/bdev.c/bdev_ut.c

+89 −14

Original line number	Diff line number	Diff line
		@@ -486,6 +486,7 @@ enomem(void)
		{
		struct spdk_io_channel *io_ch;
		struct spdk_bdev_channel *bdev_ch;
		struct spdk_bdev_module_channel *module_ch;
		struct ut_bdev_channel *ut_ch;
		const uint32_t IO_ARRAY_SIZE = 64;
		const uint32_t AVAIL = 20;
		@@ -499,6 +500,7 @@ enomem(void)
		set_thread(0);
		io_ch = spdk_bdev_get_io_channel(g_desc);
		bdev_ch = spdk_io_channel_get_ctx(io_ch);
		module_ch = bdev_ch->module_ch;
		ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
		ut_ch->avail_cnt = AVAIL;

		@@ -508,7 +510,7 @@ enomem(void)
		rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
		CU_ASSERT(rc == 0);
		}
		CU_ASSERT(TAILQ_EMPTY(&bdev_ch->nomem_io));
		CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));

		/*
		* Next, submit one additional I/O. This one should fail with ENOMEM and then go onto
		@@ -517,8 +519,8 @@ enomem(void)
		status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
		rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
		CU_ASSERT(rc == 0);
		SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&bdev_ch->nomem_io));
		first_io = TAILQ_FIRST(&bdev_ch->nomem_io);
		SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));
		first_io = TAILQ_FIRST(&module_ch->nomem_io);

		/*
		* Now submit a bunch more I/O. These should all fail with ENOMEM and get queued behind
		@@ -531,10 +533,10 @@ enomem(void)
		}

		/* Assert that first_io is still at the head of the list. */
		CU_ASSERT(TAILQ_FIRST(&bdev_ch->nomem_io) == first_io);
		CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
		nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
		CU_ASSERT(bdev_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
		CU_ASSERT(TAILQ_FIRST(&module_ch->nomem_io) == first_io);
		CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
		nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);
		CU_ASSERT(module_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));

		/*
		* Complete 1 I/O only. The key check here is bdev_io_tailq_cnt - this should not have
		@@ -542,19 +544,19 @@ enomem(void)
		* list.
		*/
		stub_complete_io(g_bdev.io_target, 1);
		CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
		CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);

		/*
		* Complete enough I/O to hit the nomem_theshold. This should trigger retrying nomem_io,
		* and we should see I/O get resubmitted to the test bdev module.
		*/
		stub_complete_io(g_bdev.io_target, NOMEM_THRESHOLD_COUNT - 1);
		CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) < nomem_cnt);
		nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
		CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) < nomem_cnt);
		nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);

		/* Complete 1 I/O only. This should not trigger retrying the queued nomem_io. */
		stub_complete_io(g_bdev.io_target, 1);
		CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
		CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);

		/*
		* Send a reset and confirm that all I/O are completed, including the ones that
		@@ -567,14 +569,86 @@ enomem(void)
		/* This will complete the reset. */
		stub_complete_io(g_bdev.io_target, 0);

		CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == 0);
		CU_ASSERT(bdev_ch->io_outstanding == 0);
		CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == 0);
		CU_ASSERT(module_ch->io_outstanding == 0);

		spdk_put_io_channel(io_ch);
		poll_threads();
		teardown_test();
		}

		static void
		enomem_multi_bdev(void)
		{
		struct spdk_io_channel *io_ch;
		struct spdk_bdev_channel *bdev_ch;
		struct spdk_bdev_module_channel *module_ch;
		struct ut_bdev_channel *ut_ch;
		const uint32_t IO_ARRAY_SIZE = 64;
		const uint32_t AVAIL = 20;
		enum spdk_bdev_io_status status[IO_ARRAY_SIZE];
		uint32_t i;
		struct ut_bdev *second_bdev;
		struct spdk_bdev_desc *second_desc;
		struct spdk_bdev_channel *second_bdev_ch;
		struct spdk_io_channel *second_ch;
		int rc;

		setup_test();

		/* Register second bdev with the same io_target */
		second_bdev = calloc(1, sizeof(*second_bdev));
		SPDK_CU_ASSERT_FATAL(second_bdev != NULL);
		register_bdev(second_bdev, "ut_bdev2", g_bdev.io_target);
		spdk_bdev_open(&second_bdev->bdev, true, NULL, NULL, &second_desc);

		set_thread(0);
		io_ch = spdk_bdev_get_io_channel(g_desc);
		bdev_ch = spdk_io_channel_get_ctx(io_ch);
		module_ch = bdev_ch->module_ch;
		ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
		ut_ch->avail_cnt = AVAIL;

		second_ch = spdk_bdev_get_io_channel(second_desc);
		second_bdev_ch = spdk_io_channel_get_ctx(second_ch);
		SPDK_CU_ASSERT_FATAL(module_ch == second_bdev_ch->module_ch);

		/* Saturate io_target through bdev A. */
		for (i = 0; i < AVAIL; i++) {
		status[i] = SPDK_BDEV_IO_STATUS_PENDING;
		rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
		CU_ASSERT(rc == 0);
		}
		CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));

		/*
		* Now submit I/O through the second bdev. This should fail with ENOMEM
		* and then go onto the nomem_io list.
		*/
		status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
		rc = spdk_bdev_read_blocks(second_desc, second_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
		CU_ASSERT(rc == 0);
		SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));

		/* Complete first bdev's I/O. This should retry sending second bdev's nomem_io */
		stub_complete_io(g_bdev.io_target, AVAIL);

		SPDK_CU_ASSERT_FATAL(TAILQ_EMPTY(&module_ch->nomem_io));
		CU_ASSERT(module_ch->io_outstanding == 1);

		/* Now complete our retried I/O */
		stub_complete_io(g_bdev.io_target, 1);
		SPDK_CU_ASSERT_FATAL(module_ch->io_outstanding == 0);

		spdk_put_io_channel(io_ch);
		spdk_put_io_channel(second_ch);
		spdk_bdev_close(second_desc);
		unregister_bdev(second_bdev);
		free(second_bdev);
		poll_threads();
		teardown_test();
		}

		int
		main(int argc, char **argv)
		{
		@@ -596,7 +670,8 @@ main(int argc, char **argv)
		CU_add_test(suite, "put_channel_during_reset", put_channel_during_reset) == NULL \|\|
		CU_add_test(suite, "aborted_reset", aborted_reset) == NULL \|\|
		CU_add_test(suite, "io_during_reset", io_during_reset) == NULL \|\|
		CU_add_test(suite, "enomem", enomem) == NULL
		CU_add_test(suite, "enomem", enomem) == NULL \|\|
		CU_add_test(suite, "enomem_multi_bdev", enomem_multi_bdev) == NULL
		) {
		CU_cleanup_registry();
		return CU_get_error();