Commit 94bc8cfd authored by Jim Harris's avatar Jim Harris
Browse files

bdev: add ENOMEM handling



At very high queue depths, bdev modules may not have enough
internal resources to track all of the incoming I/O.  For example,
we allocate a finite number of nvme_request objects per allocated
queue pair.  Currently if these resources are exhausted, the
bdev module will return failure (with no indication why) which
gets propagated all the way back to the application.

So instead, add SPDK_BDEV_IO_STATUS_NOMEM to allow bdev modules
to indicate this type of failure.  Also add handling for this
status type in the generic bdev layer, involving queuing these
I/O for later retry after other I/O on the failing channel have
completed.

This does place an expectation on the bdev module that these
internal resources are allocated per io_channel.  Otherwise we
cannot guarantee forward progress solely on reception of
completions.  For example, without this guarantee, a bdev
module could theoretically return ENOMEM even if there were
no I/O oustanding for that io_channel.  nvme, aio, rbd,
virtio and null drivers comply with this expectation already.
malloc only complies though when not using copy offload.

This patch will fix malloc w/ copy engine to at least
return ENOMEM when no copy descriptors are available.  If the
condition above occurs, I/O waiting for resources will get
failed as part of a subsequent reset which matches the
behavior it has today.

Signed-off-by: default avatarJim Harris <james.r.harris@intel.com>
Change-Id: Iea7cd51a611af8abe882794d0b2361fdbb74e84e

Reviewed-on: https://review.gerrithub.io/378853


Tested-by: default avatarSPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: default avatarDaniel Verkamp <daniel.verkamp@intel.com>
Reviewed-by: default avatarChangpeng Liu <changpeng.liu@intel.com>
parent c5d8b108
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -161,6 +161,13 @@ struct spdk_bdev_fn_table {

/** bdev I/O completion status */
enum spdk_bdev_io_status {
	/*
	 * NOMEM should be returned when a bdev module cannot start an I/O because of
	 *  some lack of resources.  It may not be returned for RESET I/O.  I/O completed
	 *  with NOMEM status will be retried after some I/O from the same channel have
	 *  completed.
	 */
	SPDK_BDEV_IO_STATUS_NOMEM = -4,
	SPDK_BDEV_IO_STATUS_SCSI_ERROR = -3,
	SPDK_BDEV_IO_STATUS_NVME_ERROR = -2,
	SPDK_BDEV_IO_STATUS_FAILED = -1,
+10 −2
Original line number Diff line number Diff line
@@ -120,7 +120,11 @@ bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (rc < 0) {
		if (rc == EAGAIN) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
		}
		SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		return -1;
	}
@@ -146,7 +150,11 @@ bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,

	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
	if (rc < 0) {
		if (rc == EAGAIN) {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
		}
		SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
		return -1;
	}
+80 −1
Original line number Diff line number Diff line
@@ -57,6 +57,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
#define SPDK_BDEV_IO_POOL_SIZE	(64 * 1024)
#define BUF_SMALL_POOL_SIZE	8192
#define BUF_LARGE_POOL_SIZE	1024
#define NOMEM_THRESHOLD_COUNT	8

typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t;

@@ -128,6 +129,17 @@ struct spdk_bdev_channel {

	bdev_io_tailq_t		queued_resets;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 *  on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	uint32_t		flags;

#ifdef SPDK_CONFIG_VTUNE
@@ -612,7 +624,12 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
	bdev_ch->io_outstanding++;
	bdev_io->in_submit_request = true;
	if (spdk_likely(bdev_ch->flags == 0)) {
		if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
			bdev->fn_table->submit_request(ch, bdev_io);
		} else {
			bdev_ch->io_outstanding--;
			TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
		}
	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else {
@@ -676,6 +693,8 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
	memset(&ch->stat, 0, sizeof(ch->stat));
	ch->io_outstanding = 0;
	TAILQ_INIT(&ch->queued_resets);
	TAILQ_INIT(&ch->nomem_io);
	ch->nomem_threshold = 0;
	ch->flags = 0;

#ifdef SPDK_CONFIG_VTUNE
@@ -725,6 +744,15 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
	TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
		if (bdev_io->ch == ch) {
			TAILQ_REMOVE(queue, bdev_io, link);
			/*
			 * spdk_bdev_io_complete() assumes that the completed I/O had
			 *  been submitted to the bdev module.  Since in this case it
			 *  hadn't, bump io_outstanding to account for the decrement
			 *  that spdk_bdev_io_complete() will do.
			 */
			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
				ch->io_outstanding++;
			}
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
@@ -739,6 +767,7 @@ spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
	mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);

	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
	_spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);

@@ -1201,6 +1230,7 @@ _spdk_bdev_reset_abort_channel(void *io_device, struct spdk_io_channel *ch,

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	_spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
}
@@ -1378,6 +1408,36 @@ spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
	return 0;
}

static void
_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_io *bdev_io;

	if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
		 *  the context of a completion, because the resources for the I/O are
		 *  not released until control returns to the bdev poller.  Also, we
		 *  may require several small I/O to complete before a larger I/O
		 *  (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
		bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
		TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
		bdev_ch->io_outstanding++;
		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

static void
_spdk_bdev_io_complete(void *ctx)
{
@@ -1396,6 +1456,9 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
	bdev_io->status = status;

	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
			SPDK_ERRLOG("NOMEM returned for reset\n");
		}
		pthread_mutex_lock(&bdev->mutex);
		if (bdev_io == bdev->reset_in_progress) {
			bdev->reset_in_progress = NULL;
@@ -1408,6 +1471,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
	} else {
		assert(bdev_ch->io_outstanding > 0);
		bdev_ch->io_outstanding--;
		if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
			if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
				_spdk_bdev_ch_retry_io(bdev_ch);
			}
		} else {
			TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
			/*
			 * Wait for some of the outstanding I/O to complete before we
			 *  retry any of the nomem_io.  Normally we will wait for
			 *  NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
			 *  depth channels we will instead wait for half to complete.
			 */
			bdev_ch->nomem_threshold = spdk_max(bdev_ch->io_outstanding / 2,
							    bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
			return;
		}
	}

	if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
+5 −1
Original line number Diff line number Diff line
@@ -75,8 +75,12 @@ malloc_done(void *ref, int status)
	struct malloc_task *task = __malloc_task_from_copy_task(ref);

	if (status != 0) {
		if (status == -ENOMEM) {
			task->status = SPDK_BDEV_IO_STATUS_NOMEM;
		} else {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	if (--task->num_outstanding == 0) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
+11 −3
Original line number Diff line number Diff line
@@ -347,7 +347,11 @@ bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
			      bdev_io->u.bdev.num_blocks,
			      bdev_io->u.bdev.offset_blocks);

	if (ret != 0) {
	if (spdk_likely(ret == 0)) {
		return;
	} else if (ret == -ENOMEM) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
	} else {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}
@@ -428,9 +432,13 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_i
	int rc = _bdev_nvme_submit_request(ch, bdev_io);

	if (spdk_unlikely(rc != 0)) {
		if (rc == -ENOMEM) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static bool
bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
@@ -1256,7 +1264,7 @@ bdev_nvme_queue_cmd(struct nvme_bdev *bdev, struct spdk_nvme_qpair *qpair,
					     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
	}

	if (rc != 0) {
	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("%s failed: rc = %d\n", direction == BDEV_DISK_READ ? "readv" : "writev", rc);
	}
	return rc;
Loading