Commit 84850dac authored by Ben Walker's avatar Ben Walker Committed by Jim Harris
Browse files

bdev: Add a zero copy I/O path



Add a ZCOPY operation to obtain buffers that represent
data regions on the backing block device.

Change-Id: Ie941c16ee051d0009e3888b52b8f41773bba47b3
Signed-off-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/386166


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: default avatarDarek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
parent b92c3d41
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -49,6 +49,9 @@ block except for metadata.
spdk_vbdev_register() has been deprecated.  spdk_bdev_register() should be used
instead.

A mechanism for acquiring and releasing data buffers from bdev modules, used
to perform zero copy operations, was added.

### NVMe-oF Target

Support for per-device shared receive queues in the RDMA transport has been added.
+42 −0
Original line number Diff line number Diff line
@@ -110,6 +110,7 @@ enum spdk_bdev_io_type {
	SPDK_BDEV_IO_TYPE_NVME_IO,
	SPDK_BDEV_IO_TYPE_NVME_IO_MD,
	SPDK_BDEV_IO_TYPE_WRITE_ZEROES,
	SPDK_BDEV_IO_TYPE_ZCOPY,
	SPDK_BDEV_NUM_IO_TYPES /* Keep last */
};

@@ -785,6 +786,47 @@ int spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel
			    uint64_t offset_blocks, uint64_t num_blocks,
			    spdk_bdev_io_completion_cb cb, void *cb_arg);

/**
 * Submit a request to acquire a data buffer that represents the given
 * range of blocks. The data buffer is placed in the spdk_bdev_io structure
 * and can be obtained by calling spdk_bdev_io_get_iovec().
 *
 * \param desc Block device descriptor
 * \param ch I/O channel. Obtained by calling spdk_bdev_get_io_channel().
 * \param offset_blocks The offset, in blocks, from the start of the block device.
 * \param num_blocks The number of blocks.
 * \param populate Whether the data buffer should be populated with the
 *                 data at the given blocks. Populating the data buffer can
 *                 be skipped if the user writes new data to the entire buffer.
 * \param cb Called when the request is complete.
 * \param cb_arg Argument passed to cb.
 *
 * \return 0 on success. On success, the callback will always
 * be called (even if the request ultimately failed). Return
 * negated errno on failure, in which case the callback will not be called.
 */
int spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  uint64_t offset_blocks, uint64_t num_blocks,
			  bool populate,
			  spdk_bdev_io_completion_cb cb, void *cb_arg);


/**
 * Submit a request to release a data buffer representing a range of blocks.
 *
 * \param bdev_io I/O request returned in the completion callback of spdk_bdev_zcopy_start().
 * \param commit Whether to commit the data in the buffers to the blocks before releasing.
 *               The data does not need to be committed if it was not modified.
 * \param cb Called when the request is complete.
 * \param cb_arg Argument passed to cb.
 *
 * \return 0 on success. On success, the callback will always
 * be called (even if the request ultimately failed). Return
 * negated errno on failure, in which case the callback will not be called.
 */
int spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
			spdk_bdev_io_completion_cb cb, void *cb_arg);

/**
 * Submit a write zeroes request to the bdev on the given channel. This command
 *  ensures that all bytes in the specified range are set to 00h
+11 −0
Original line number Diff line number Diff line
@@ -453,6 +453,17 @@ struct spdk_bdev_io {

			/** count of outstanding batched split I/Os */
			uint32_t split_outstanding;

			struct {
				/** Whether the buffer should be populated with the real data */
				uint8_t populate : 1;

				/** Whether the buffer should be committed back to disk */
				uint8_t commit : 1;

				/** True if this request is in the 'start' phase of zcopy. False if in 'end'. */
				uint8_t start : 1;
			} zcopy;
		} bdev;
		struct {
			/** Channel reference held while messages for this reset are in progress. */
+69 −4
Original line number Diff line number Diff line
@@ -424,6 +424,11 @@ spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
@@ -436,6 +441,10 @@ spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
static bool
_is_buf_allocated(struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

@@ -585,7 +594,6 @@ spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, u
	bool buf_allocated;

	assert(cb != NULL);
	assert(bdev_io->u.bdev.iovs != NULL);

	alignment = spdk_bdev_get_buf_align(bdev_io->bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
@@ -2744,6 +2752,65 @@ spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
	return 0;
}

int
spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t offset_blocks, uint64_t num_blocks,
		      bool populate,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
		return -ENOTSUP;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
	bdev_io->u.bdev.zcopy.commit = 0;
	bdev_io->u.bdev.zcopy.start = 1;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
		    spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
		return -EINVAL;
	}

	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
	bdev_io->u.bdev.zcopy.start = 0;
	bdev_io->internal.caller_ctx = cb_arg;
	bdev_io->internal.cb = cb;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;

	spdk_bdev_io_submit(bdev_io);
	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
@@ -4018,10 +4085,8 @@ spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *i

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;