Commit be802592 authored by Changpeng Liu's avatar Changpeng Liu Committed by Tomasz Zawadzki
Browse files

lib/ublk: add USER_COPY feature support

Currently, the copy between the io buffer and the SPDK ublk target buffer is
done inside the kernel, in `ublk_map_io()` or `ublk_unmap_io()`. This approach
performs very well when the ublk target io buffer is pre-allocated.

For dynamically allocated io buffers, UBLK_F_NEED_GET_DATA lets the ublk
target provide the buffer via one extra command round trip for each
WRITE request. For READ, the ublk target simply provides the buffer,
but cannot know when the kernel driver is done with it [1].

Commit dfae1900 works around this issue.

UBLK_F_USER_COPY was added to the kernel driver: it moves the io data copy
out of the kernel by providing read()/write() on /dev/ublkcN, letting the
ublk target perform the io data copy itself.

[1] Question about ublk and NEED_GET_DATA
https://lore.kernel.org/linux-block/116d8a56-0881-56d3-9bcc-78ff3e1dc4e5@linux.alibaba.com/T/#m23bd4b8634c0a054e6797063167b469949a247bb



Change-Id: I5ef584760ea42b28b3c46ecf02268904198b7269
Signed-off-by: default avatarChangpeng Liu <changpeng.liu@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/19042


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarJim Harris <jim.harris@gmail.com>
Reviewed-by: default avatarKonrad Sztyber <konrad.sztyber@intel.com>
parent 26cac6bf
Loading
Loading
Loading
Loading
+145 −26
Original line number Diff line number Diff line
@@ -73,6 +73,7 @@ struct ublk_io {
	void			*payload;
	void			*mpool_entry;
	bool			need_data;
	bool			user_copy;
	uint16_t		tag;
	uint32_t		payload_size;
	uint32_t		cmd_op;
@@ -156,6 +157,8 @@ struct ublk_tgt {
	uint64_t		features;
	/* `ublk_drv` supports UBLK_F_CMD_IOCTL_ENCODE */
	bool			ioctl_encode;
	/* `ublk_drv` supports UBLK_F_USER_COPY */
	bool			user_copy;
};

static TAILQ_HEAD(, spdk_ublk_dev) g_ublk_devs = TAILQ_HEAD_INITIALIZER(g_ublk_devs);
@@ -251,6 +254,13 @@ user_data_to_op(uint64_t user_data)
	return (user_data >> 16) & 0xff;
}

/*
 * Compute the pseudo-file offset on /dev/ublkcN that addresses the io
 * buffer for (q_id, tag), as defined by the kernel user-copy ABI:
 * UBLKSRV_IO_BUF_OFFSET plus the queue id and tag packed into their
 * respective bit fields.
 */
static inline uint64_t
ublk_user_copy_pos(uint16_t q_id, uint16_t tag)
{
	uint64_t qid_bits = ((uint64_t)q_id) << UBLK_QID_OFF;
	uint64_t tag_bits = ((uint64_t)tag) << UBLK_TAG_OFF;

	return (uint64_t)UBLKSRV_IO_BUF_OFFSET + (qid_bits | tag_bits);
}

void
spdk_ublk_init(void)
{
@@ -398,6 +408,7 @@ ublk_ctrl_cmd_get_features(void)

	if (cqe->res == 0) {
		g_ublk_tgt.ioctl_encode = !!(g_ublk_tgt.features & UBLK_F_CMD_IOCTL_ENCODE);
		g_ublk_tgt.user_copy = !!(g_ublk_tgt.features & UBLK_F_USER_COPY);
	}
	io_uring_cqe_seen(&g_ublk_tgt.ctrl_ring, cqe);

@@ -598,6 +609,7 @@ _ublk_fini_done(void *args)
	g_ublk_tgt.active = false;
	g_ublk_tgt.features = 0;
	g_ublk_tgt.ioctl_encode = false;
	g_ublk_tgt.user_copy = false;

	if (g_ublk_tgt.cb_fn) {
		g_ublk_tgt.cb_fn(g_ublk_tgt.cb_arg);
@@ -862,7 +874,10 @@ ublk_try_close_dev(void *arg)
	struct spdk_ublk_dev *ublk = arg;

	assert(spdk_thread_is_app_thread(NULL));

	ublk->queues_closed += 1;
	SPDK_DEBUGLOG(ublk_io, "ublkb%u closed queues %u\n", ublk->ublk_id, ublk->queues_closed);

	if (ublk->queues_closed < ublk->num_queues) {
		return;
	}
@@ -956,6 +971,61 @@ ublk_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
	}
}

/*
 * Submit the user-copy data transfer for one io over the queue's uring.
 * Direction follows the ublk op: for a WRITE op (is_write == true) we
 * READ the initiator's data from /dev/ublkcN into io->payload; for a
 * READ op we WRITE io->payload out to /dev/ublkcN.  Returns 0 on
 * success, or the negative error from io_uring_submit().
 */
static int
ublk_user_copy_io_submit(struct ublk_io *io, bool is_write)
{
	struct ublk_queue *q = io->q;
	const struct ublksrv_io_desc *iod = io->iod;
	struct io_uring_sqe *sqe;
	uint64_t pos;
	int rc;
	uint32_t nbytes;

	/* Transfer size in bytes, derived from the iod sector count. */
	nbytes = iod->nr_sectors * (1ULL << LINUX_SECTOR_SHIFT);
	/* Kernel-defined pseudo offset encoding this (q_id, tag) pair. */
	pos = ublk_user_copy_pos(q->q_id, io->tag);
	sqe = io_uring_get_sqe(&q->ring);
	assert(sqe);

	if (is_write) {
		/* ublk WRITE: pull the data to be written from the char dev. */
		io_uring_prep_read(sqe, 0, io->payload, nbytes, pos);
	} else {
		/* ublk READ: push the data we read from the bdev to the char dev. */
		io_uring_prep_write(sqe, 0, io->payload, nbytes, pos);
	}
	/* Index 0 is the registered char-dev fd — presumably /dev/ublkcN; verify
	 * against the ring's registered-files setup. */
	io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE);
	io_uring_sqe_set_data64(sqe, build_user_data(io->tag, 0));

	q->cmd_inflight += 1;
	rc = io_uring_submit(&q->ring);
	if (rc < 0) {
		/* NOTE(review): cmd_inflight was already incremented above;
		 * confirm it is reconciled somewhere on this error path. */
		SPDK_ERRLOG("uring submit rc %d\n", rc);
		return rc;
	}
	/* Flag the io so the cqe handler treats the next completion as a
	 * user-copy completion rather than a fetched command. */
	io->user_copy = true;

	return 0;
}

/*
 * bdev read completion used for ublk READs in user-copy mode.  On
 * success, start the second phase: the uring write that copies
 * io->payload out to the kernel char dev.  On bdev failure, or if that
 * second-phase submit fails, complete the ublk io with an error.
 */
static void
ublk_user_copy_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ublk_io	*io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		int rc;

		/* is_write == false: push payload to the char dev. */
		rc = ublk_user_copy_io_submit(io, false);
		if (rc) {
			goto out;
		}
		return;
	}

out:
	/* bdev_io already freed above, so pass NULL here. */
	ublk_io_done(NULL, false, cb_arg);
}

static void
ublk_resubmit_io(void *arg)
{
@@ -1028,6 +1098,7 @@ _ublk_submit_bdev_io(struct ublk_queue *q, struct ublk_io *io)
	struct spdk_bdev_desc *desc = io->bdev_desc;
	struct spdk_io_channel *ch = io->bdev_ch;
	uint64_t offset_blocks, num_blocks;
	spdk_bdev_io_completion_cb read_cb;
	uint8_t ublk_op;
	int rc = 0;
	const struct ublksrv_io_desc *iod = io->iod;
@@ -1036,13 +1107,16 @@ _ublk_submit_bdev_io(struct ublk_queue *q, struct ublk_io *io)
	offset_blocks = iod->start_sector >> ublk->sector_per_block_shift;
	num_blocks = iod->nr_sectors >> ublk->sector_per_block_shift;

	io->result = num_blocks * spdk_bdev_get_data_block_size(ublk->bdev);
	switch (ublk_op) {
	case UBLK_IO_OP_READ:
		rc = spdk_bdev_read_blocks(desc, ch, io->payload, offset_blocks, num_blocks, ublk_io_done, io);
		if (g_ublk_tgt.user_copy) {
			read_cb = ublk_user_copy_read_done;
		} else {
			read_cb = ublk_io_done;
		}
		rc = spdk_bdev_read_blocks(desc, ch, io->payload, offset_blocks, num_blocks, read_cb, io);
		break;
	case UBLK_IO_OP_WRITE:
		assert((void *)iod->addr == io->payload);
		rc = spdk_bdev_write_blocks(desc, ch, io->payload, offset_blocks, num_blocks, ublk_io_done, io);
		break;
	case UBLK_IO_OP_FLUSH:
@@ -1075,6 +1149,17 @@ read_get_buffer_done(struct ublk_io *io)
	_ublk_submit_bdev_io(io->q, io);
}

/*
 * Buffer-acquired callback for a ublk WRITE in user-copy mode: start the
 * uring read that pulls the initiator's data into the newly obtained
 * payload buffer, failing the io immediately if the submit fails.
 */
static void
user_copy_write_get_buffer_done(struct ublk_io *io)
{
	if (ublk_user_copy_io_submit(io, true) != 0) {
		ublk_io_done(NULL, false, io);
	}
}

static void
ublk_submit_bdev_io(struct ublk_queue *q, struct ublk_io *io)
{
@@ -1082,11 +1167,19 @@ ublk_submit_bdev_io(struct ublk_queue *q, struct ublk_io *io)
	const struct ublksrv_io_desc *iod = io->iod;
	uint8_t ublk_op;

	io->result = iod->nr_sectors * (1ULL << LINUX_SECTOR_SHIFT);
	ublk_op = ublksrv_get_op(iod);
	switch (ublk_op) {
	case UBLK_IO_OP_READ:
		ublk_io_get_buffer(io, iobuf_ch, read_get_buffer_done);
		break;
	case UBLK_IO_OP_WRITE:
		if (g_ublk_tgt.user_copy) {
			ublk_io_get_buffer(io, iobuf_ch, user_copy_write_get_buffer_done);
		} else {
			_ublk_submit_bdev_io(q, io);
		}
		break;
	default:
		_ublk_submit_bdev_io(q, io);
		break;
@@ -1123,14 +1216,13 @@ ublksrv_queue_io_cmd(struct ublk_queue *q,
	sqe->flags	= IOSQE_FIXED_FILE;
	sqe->rw_flags	= 0;
	cmd->tag	= tag;
	cmd->addr	= (__u64)(uintptr_t)(io->payload);
	cmd->addr	= g_ublk_tgt.user_copy ? 0 : (__u64)(uintptr_t)(io->payload);
	cmd->q_id	= q->q_id;

	user_data = build_user_data(tag, cmd_op);
	io_uring_sqe_set_data64(sqe, user_data);

	io->cmd_op = 0;
	q->cmd_inflight += 1;

	SPDK_DEBUGLOG(ublk_io, "(qid %d tag %u cmd_op %u) iof %x stopping %d\n",
		      q->q_id, tag, cmd_op,
@@ -1166,6 +1258,7 @@ ublk_io_xmit(struct ublk_queue *q)
		count++;
	}

	q->cmd_inflight += count;
	rc = io_uring_submit(&q->ring);
	if (rc != count) {
		SPDK_ERRLOG("could not submit all commands\n");
@@ -1217,14 +1310,15 @@ ublk_io_recv(struct ublk_queue *q)
	iobuf_ch = &q->poll_group->iobuf_ch;
	io_uring_for_each_cqe(&q->ring, head, cqe) {
		tag = user_data_to_tag(cqe->user_data);
		fetch = (cqe->res != UBLK_IO_RES_ABORT) && !q->is_stopping;
		io = &q->ios[tag];

		SPDK_DEBUGLOG(ublk_io, "res %d qid %d tag %u cmd_op %u\n",
			      cqe->res, q->q_id, tag, user_data_to_op(cqe->user_data));
		SPDK_DEBUGLOG(ublk_io, "res %d qid %d tag %u, user copy %u, cmd_op %u\n",
			      cqe->res, q->q_id, tag, io->user_copy, user_data_to_op(cqe->user_data));

		q->cmd_inflight--;
		io = &q->ios[tag];

		if (!io->user_copy) {
			fetch = (cqe->res != UBLK_IO_RES_ABORT) && !q->is_stopping;
			if (!fetch) {
				q->is_stopping = true;
				if (io->cmd_op == UBLK_IO_FETCH_REQ) {
@@ -1244,6 +1338,25 @@ ublk_io_recv(struct ublk_queue *q)
				}
				TAILQ_REMOVE(&q->inflight_io_list, io, tailq);
			}
		} else {

			/* clear `user_copy` for next use of this IO structure */
			io->user_copy = false;

			assert((ublksrv_get_op(io->iod) == UBLK_IO_OP_READ) ||
			       (ublksrv_get_op(io->iod) == UBLK_IO_OP_WRITE));
			if (cqe->res != io->result) {
				/* EIO */
				ublk_io_done(NULL, false, io);
			} else {
				if (ublksrv_get_op(io->iod) == UBLK_IO_OP_READ) {
					/* bdev_io is already freed in first READ cycle */
					ublk_io_done(NULL, true, io);
				} else {
					_ublk_submit_bdev_io(q, io);
				}
			}
		}
		count += 1;
		if (count == UBLK_QUEUE_REQUEST) {
			break;
@@ -1358,7 +1471,6 @@ ublk_dev_queue_init(struct ublk_queue *q)

	ublk_dev_init_io_cmds(&q->ring, q->q_depth);

	return 0;
err:
	return rc;
}
@@ -1404,6 +1516,7 @@ ublk_dev_queue_io_init(struct ublk_queue *q)
		ublksrv_queue_io_cmd(q, io, i);
	}

	q->cmd_inflight += q->q_depth;
	rc = io_uring_submit(&q->ring);
	assert(rc == (int)q->q_depth);
	for (i = 0; i < q->q_depth; i++) {
@@ -1449,7 +1562,7 @@ ublk_info_param_init(struct spdk_ublk_dev *ublk)
		.dev_id = ublk->ublk_id,
		.max_io_buf_bytes = UBLK_IO_MAX_BYTES,
		.ublksrv_pid = getpid(),
		.flags = UBLK_F_NEED_GET_DATA | UBLK_F_URING_CMD_COMP_IN_TASK,
		.flags = UBLK_F_URING_CMD_COMP_IN_TASK,
	};
	struct ublk_params uparams = {
		.types = UBLK_PARAM_TYPE_BASIC,
@@ -1474,6 +1587,12 @@ ublk_info_param_init(struct spdk_ublk_dev *ublk)
		}
	}

	if (g_ublk_tgt.user_copy) {
		uinfo.flags |= UBLK_F_USER_COPY;
	} else {
		uinfo.flags |= UBLK_F_NEED_GET_DATA;
	}

	ublk->dev_info = uinfo;
	ublk->dev_params = uparams;
}