Commit 5818b42f authored by matthewb's avatar matthewb Committed by Tomasz Zawadzki
Browse files

lib/nvmf : Added support for ZCOPY



Implemented nvmf code to allow transports to use ZCOPY. Note ZCOPY
has to be enabled within the individual transport layer

Signed-off-by: default avatarmatthewb <matthew.burbridge@hpe.com>
Change-Id: I273b3d4ab44d882c916ac39e821505e1f4211ded
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6817


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: default avatarXiaodong Liu <xiaodong.liu@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
parent 5a169179
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -63,6 +63,9 @@ the uint128 related data.

Removed deprecated `net` library.

Added support for ZCOPY.  ZCOPY is used in preference to READ and WRITE if the
bdev module supports ZCOPY and has it enabled.

### nvme

`spdk_nvme_map_prps` and `spdk_nvme_map_cmd` were moved to nvmf/vfio-user as internal APIs
+19 −0
Original line number Diff line number Diff line
@@ -78,6 +78,21 @@ struct spdk_nvmf_dif_info {
	uint32_t				orig_length;
};

enum spdk_nvmf_zcopy_phase {
	NVMF_ZCOPY_PHASE_NONE,        /* Request is not using ZCOPY */
	NVMF_ZCOPY_PHASE_INIT,        /* Buffers are being requested from the bdev */
	NVMF_ZCOPY_PHASE_EXECUTE,     /* Buffers obtained, command is being processed */
	NVMF_ZCOPY_PHASE_END_PENDING, /* Buffers are being released */
	NVMF_ZCOPY_PHASE_COMPLETE,    /* Buffers have been released */
	NVMF_ZCOPY_PHASE_INIT_FAILED  /* Failed to get the buffers */
};

/* True when the request participates in the ZCOPY flow in any phase. */
static inline bool
spdk_nvmf_using_zcopy(enum spdk_nvmf_zcopy_phase phase)
{
	return phase != NVMF_ZCOPY_PHASE_NONE;
}

struct spdk_nvmf_request {
	struct spdk_nvmf_qpair		*qpair;
	uint32_t			length;
@@ -101,6 +116,8 @@ struct spdk_nvmf_request {
	struct spdk_nvmf_request	*first_fused_req;
	struct spdk_nvmf_request	*req_to_abort;
	struct spdk_poller		*poller;
	struct spdk_bdev_io		*zcopy_bdev_io; /* Contains the bdev_io when using ZCOPY */
	enum spdk_nvmf_zcopy_phase	zcopy_phase;

	TAILQ_ENTRY(spdk_nvmf_request)	link;
};
@@ -426,6 +443,8 @@ void spdk_nvmf_request_exec(struct spdk_nvmf_request *req);
void spdk_nvmf_request_exec_fabrics(struct spdk_nvmf_request *req);
int spdk_nvmf_request_free(struct spdk_nvmf_request *req);
int spdk_nvmf_request_complete(struct spdk_nvmf_request *req);
int spdk_nvmf_request_zcopy_start(struct spdk_nvmf_request *req);
int spdk_nvmf_request_zcopy_end(struct spdk_nvmf_request *req);

/**
 * Remove the given qpair from the poll group.
+180 −23
Original line number Diff line number Diff line
@@ -775,11 +775,20 @@ nvmf_subsystem_pg_from_connect_cmd(struct spdk_nvmf_request *req)
	return &req->qpair->group->sgroups[subsystem->id];
}

/* Track req on its qpair's outstanding queue, unless the request is a ZCOPY
 * request — those were queued already when the start zcopy was actioned.
 */
static void
nvmf_add_to_outstanding_queue(struct spdk_nvmf_request *req)
{
	if (spdk_nvmf_using_zcopy(req->zcopy_phase)) {
		return;
	}

	TAILQ_INSERT_TAIL(&req->qpair->outstanding, req, link);
}

int
spdk_nvmf_ctrlr_connect(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp;
	struct spdk_nvmf_qpair *qpair = req->qpair;
	struct spdk_nvmf_subsystem_poll_group *sgroup;
	enum spdk_nvmf_request_exec_status status;

@@ -791,7 +800,7 @@ spdk_nvmf_ctrlr_connect(struct spdk_nvmf_request *req)
	}

	sgroup->mgmt_io_outstanding++;
	TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
	nvmf_add_to_outstanding_queue(req);

	status = _nvmf_ctrlr_connect(req);

@@ -3516,6 +3525,112 @@ nvmf_ctrlr_process_io_fused_cmd(struct spdk_nvmf_request *req, struct spdk_bdev
	return rc;
}

bool
nvmf_ctrlr_use_zcopy(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_ns *ns;

	req->zcopy_phase = NVMF_ZCOPY_PHASE_NONE;

	if (nvmf_qpair_is_admin_queue(req->qpair)) {
		/* Admin queue */
		return false;
	}

	if ((req->cmd->nvme_cmd.opc != SPDK_NVME_OPC_WRITE) &&
	    (req->cmd->nvme_cmd.opc != SPDK_NVME_OPC_READ)) {
		/* Not a READ or WRITE command */
		return false;
	}

	if (req->cmd->nvme_cmd.fuse != SPDK_NVME_CMD_FUSE_NONE) {
		/* Fused commands dont use zcopy buffers */
		return false;
	}

	ns = _nvmf_subsystem_get_ns(req->qpair->ctrlr->subsys, req->cmd->nvme_cmd.nsid);
	if (ns == NULL || ns->bdev == NULL || !ns->zcopy) {
		return false;
	}

	req->zcopy_phase = NVMF_ZCOPY_PHASE_INIT;
	return true;
}

/* Attempt to start a ZCOPY transfer for req.  If this function returns a
 * non-zero value the request reverts to using SPDK buffers.
 */
int
spdk_nvmf_request_zcopy_start(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_qpair *qpair = req->qpair;
	struct spdk_nvmf_subsystem_poll_group *sgroup;
	struct spdk_nvmf_subsystem_pg_ns_info *ns_info;
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	struct spdk_bdev *bdev;
	uint32_t nsid;
	int rc;

	if (qpair->ctrlr == NULL || qpair->group->sgroups == NULL) {
		goto revert;
	}

	if (spdk_nvmf_request_get_bdev(req->cmd->nvme_cmd.nsid, req,
				       &bdev, &desc, &ch) != 0 ||
	    ch == NULL) {
		goto revert;
	}

	nsid = req->cmd->nvme_cmd.nsid;
	sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id];
	ns_info = &sgroup->ns_info[nsid - 1];
	if (ns_info->state != SPDK_NVMF_SUBSYSTEM_ACTIVE ||
	    qpair->state != SPDK_NVMF_QPAIR_ACTIVE) {
		goto revert;
	}

	/* backward compatible */
	req->data = req->iov[0].iov_base;

	/* Set iovcnt to be the maximum number of iovs that the ZCOPY can use */
	req->iovcnt = NVMF_REQ_MAX_BUFFERS;

	/* Queue the request now; on failure it is removed again below */
	TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
	rc = nvmf_bdev_ctrlr_start_zcopy(bdev, desc, ch, req);
	if (rc == 0) {
		ns_info->io_outstanding++;
		return 0;
	}
	TAILQ_REMOVE(&qpair->outstanding, req, link);

revert:
	/* An error occurred, the subsystem is paused, or the qpair is not active.
	 * Revert to using SPDK buffers
	 */
	req->zcopy_phase = NVMF_ZCOPY_PHASE_NONE;
	return -1;
}

/* Begin releasing the ZCOPY buffers held by req: mark the request as
 * END_PENDING and hand it to the bdev layer, which commits writes and
 * frees the preserved zcopy bdev_io before completing the request.
 */
int
spdk_nvmf_request_zcopy_end(struct spdk_nvmf_request *req)
{
	req->zcopy_phase = NVMF_ZCOPY_PHASE_END_PENDING;
	return nvmf_bdev_ctrlr_end_zcopy(req);
}

int
nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req)
{
@@ -3691,7 +3806,30 @@ _nvmf_request_complete(void *ctx)
		spdk_nvme_print_completion(qpair->qid, rsp);
	}

	switch (req->zcopy_phase) {
	case NVMF_ZCOPY_PHASE_NONE:
		TAILQ_REMOVE(&qpair->outstanding, req, link);
		break;
	case NVMF_ZCOPY_PHASE_INIT:
		if (spdk_unlikely(spdk_nvme_cpl_is_error(rsp))) {
			/* The START failed or was aborted so revert to a normal IO */
			req->zcopy_phase = NVMF_ZCOPY_PHASE_INIT_FAILED;
			TAILQ_REMOVE(&qpair->outstanding, req, link);
		} else {
			req->zcopy_phase = NVMF_ZCOPY_PHASE_EXECUTE;
		}
		break;
	case NVMF_ZCOPY_PHASE_EXECUTE:
		break;
	case NVMF_ZCOPY_PHASE_END_PENDING:
		TAILQ_REMOVE(&qpair->outstanding, req, link);
		req->zcopy_phase = NVMF_ZCOPY_PHASE_COMPLETE;
		break;
	default:
		SPDK_ERRLOG("Invalid ZCOPY phase %u\n", req->zcopy_phase);
		break;
	}

	if (nvmf_transport_req_complete(req)) {
		SPDK_ERRLOG("Transport request completion error!\n");
	}
@@ -3703,11 +3841,16 @@ _nvmf_request_complete(void *ctx)
			assert(sgroup->mgmt_io_outstanding > 0);
			sgroup->mgmt_io_outstanding--;
		} else {
			if ((req->zcopy_phase == NVMF_ZCOPY_PHASE_NONE) ||
			    (req->zcopy_phase == NVMF_ZCOPY_PHASE_COMPLETE)) {
				/* End of request */

				/* NOTE: This implicitly also checks for 0, since 0 - 1 wraps around to UINT32_MAX. */
				if (spdk_likely(nsid - 1 < sgroup->num_ns)) {
					sgroup->ns_info[nsid - 1].io_outstanding--;
				}
			}
		}

		if (spdk_unlikely(sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSING &&
				  sgroup->mgmt_io_outstanding == 0)) {
@@ -3767,7 +3910,7 @@ spdk_nvmf_request_exec_fabrics(struct spdk_nvmf_request *req)
	sgroup->mgmt_io_outstanding++;

	/* Place the request on the outstanding list so we can keep track of it */
	TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
	nvmf_add_to_outstanding_queue(req);

	assert(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC);
	status = nvmf_ctrlr_process_fabrics_cmd(req);
@@ -3777,13 +3920,11 @@ spdk_nvmf_request_exec_fabrics(struct spdk_nvmf_request *req)
	}
}

void
spdk_nvmf_request_exec(struct spdk_nvmf_request *req)
static bool nvmf_check_subsystem_active(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_qpair *qpair = req->qpair;
	struct spdk_nvmf_subsystem_poll_group *sgroup = NULL;
	struct spdk_nvmf_subsystem_pg_ns_info *ns_info;
	enum spdk_nvmf_request_exec_status status;
	uint32_t nsid;

	if (qpair->ctrlr) {
@@ -3800,7 +3941,7 @@ spdk_nvmf_request_exec(struct spdk_nvmf_request *req)
			if (sgroup->state != SPDK_NVMF_SUBSYSTEM_ACTIVE) {
				/* The subsystem is not currently active. Queue this request. */
				TAILQ_INSERT_TAIL(&sgroup->queued, req, link);
				return;
				return false;
			}
			sgroup->mgmt_io_outstanding++;
		} else {
@@ -3811,9 +3952,9 @@ spdk_nvmf_request_exec(struct spdk_nvmf_request *req)
				req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
				req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
				req->rsp->nvme_cpl.status.dnr = 1;
				TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
				nvmf_add_to_outstanding_queue(req);
				_nvmf_request_complete(req);
				return;
				return false;
			}

			ns_info = &sgroup->ns_info[nsid - 1];
@@ -3825,35 +3966,51 @@ spdk_nvmf_request_exec(struct spdk_nvmf_request *req)
				req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
				req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
				req->rsp->nvme_cpl.status.dnr = 1;
				TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
				nvmf_add_to_outstanding_queue(req);
				ns_info->io_outstanding++;
				_nvmf_request_complete(req);
				return;
				return false;
			}

			if (ns_info->state != SPDK_NVMF_SUBSYSTEM_ACTIVE) {
				/* The namespace is not currently active. Queue this request. */
				TAILQ_INSERT_TAIL(&sgroup->queued, req, link);
				return;
				return false;
			}

			ns_info->io_outstanding++;
		}
	}

		if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) {
			req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
			req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
		TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
			nvmf_add_to_outstanding_queue(req);
			_nvmf_request_complete(req);
			return false;
		}
	}

	return true;
}

void
spdk_nvmf_request_exec(struct spdk_nvmf_request *req)
{
	struct spdk_nvmf_qpair *qpair = req->qpair;
	enum spdk_nvmf_request_exec_status status;

	if (!spdk_nvmf_using_zcopy(req->zcopy_phase)) {
		if (!nvmf_check_subsystem_active(req)) {
			return;
		}
	}

	if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) {
		spdk_nvme_print_command(qpair->qid, &req->cmd->nvme_cmd);
	}

	/* Place the request on the outstanding list so we can keep track of it */
	TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
	nvmf_add_to_outstanding_queue(req);

	if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) {
		status = nvmf_ctrlr_process_fabrics_cmd(req);
+117 −0
Original line number Diff line number Diff line
@@ -267,6 +267,12 @@ nvmf_bdev_ctrl_queue_io(struct spdk_nvmf_request *req, struct spdk_bdev *bdev,
	req->qpair->group->stat.pending_bdev_io++;
}

/* Return true if the given bdev supports the ZCOPY I/O type. */
bool
nvmf_bdev_zcopy_enabled(struct spdk_bdev *bdev)
{
	return spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY);
}

int
nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
			 struct spdk_io_channel *ch, struct spdk_nvmf_request *req)
@@ -296,6 +302,13 @@ nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
		return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
	}

	if (req->zcopy_phase == NVMF_ZCOPY_PHASE_EXECUTE) {
		/* Return here after checking the lba etc */
		return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
	}

	assert(!spdk_nvmf_using_zcopy(req->zcopy_phase));

	rc = spdk_bdev_readv_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
				    nvmf_bdev_ctrlr_complete_cmd, req);
	if (spdk_unlikely(rc)) {
@@ -340,6 +353,13 @@ nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
		return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
	}

	if (req->zcopy_phase == NVMF_ZCOPY_PHASE_EXECUTE) {
		/* Return here after checking the lba etc */
		return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
	}

	assert(!spdk_nvmf_using_zcopy(req->zcopy_phase));

	rc = spdk_bdev_writev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
				     nvmf_bdev_ctrlr_complete_cmd, req);
	if (spdk_unlikely(rc)) {
@@ -769,3 +789,100 @@ nvmf_bdev_ctrlr_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd,

	return (rc == 0) ? true : false;
}

/* Completion callback for spdk_bdev_zcopy_start().
 * On success, record the buffers the bdev handed out in the request and
 * preserve the bdev_io — it must stay alive until the END ZCOPY releases it.
 * On failure, copy the NVMe status into the response and free the bdev_io.
 */
static void
nvmf_bdev_ctrlr_start_zcopy_complete(struct spdk_bdev_io *bdev_io, bool success,
				     void *cb_arg)
{
	struct spdk_nvmf_request *req = cb_arg;
	struct iovec *iov;
	int iovcnt;

	if (spdk_unlikely(!success)) {
		struct spdk_nvme_cpl *cpl = &req->rsp->nvme_cpl;
		uint32_t cdw0 = 0;
		int sct = 0, sc = 0;

		spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
		cpl->cdw0 = cdw0;
		cpl->status.sc = sc;
		cpl->status.sct = sct;

		spdk_bdev_free_io(bdev_io);
		spdk_nvmf_request_complete(req);
		return;
	}

	spdk_bdev_io_get_iovec(bdev_io, &iov, &iovcnt);

	assert(iovcnt <= NVMF_REQ_MAX_BUFFERS);
	assert(req->iov == iov);

	req->iovcnt = iovcnt;
	/* Don't free the bdev_io here: preserve it for the END ZCOPY */
	req->zcopy_bdev_io = bdev_io;

	spdk_nvmf_request_complete(req);
}

/* Start a ZCOPY request: validate the command's LBA range and transfer
 * length, then ask the bdev layer for buffers that map directly onto the
 * backing storage.  Returns 0 on success; a negative errno if the request
 * must revert to the regular buffer path.
 */
int
nvmf_bdev_ctrlr_start_zcopy(struct spdk_bdev *bdev,
			    struct spdk_bdev_desc *desc,
			    struct spdk_io_channel *ch,
			    struct spdk_nvmf_request *req)
{
	uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev);
	uint32_t block_size = spdk_bdev_get_block_size(bdev);
	uint64_t start_lba;
	uint64_t num_blocks;
	bool populate;

	nvmf_bdev_ctrlr_get_rw_params(&req->cmd->nvme_cmd, &start_lba, &num_blocks);

	if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) {
		SPDK_ERRLOG("end of media\n");
		return -ENXIO;
	}

	/* The transfer described by the command must fit inside the SGL.
	 * This path handles both READs and WRITEs, so don't say "Read" here.
	 */
	if (spdk_unlikely(num_blocks * block_size > req->length)) {
		SPDK_ERRLOG("NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n",
			    num_blocks, block_size, req->length);
		return -ENXIO;
	}

	/* READ buffers are populated from media; WRITE buffers are handed out
	 * empty and committed when the zcopy is ended.
	 */
	populate = (req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_READ);

	return spdk_bdev_zcopy_start(desc, ch, req->iov, req->iovcnt, start_lba,
				     num_blocks, populate, nvmf_bdev_ctrlr_start_zcopy_complete, req);
}

/* Completion callback for spdk_bdev_zcopy_end().  Propagates any NVMe error
 * status from the bdev_io into the response, frees the bdev_io, clears the
 * request's reference to it, and completes the request.
 */
static void
nvmf_bdev_ctrlr_end_zcopy_complete(struct spdk_bdev_io *bdev_io, bool success,
				   void *cb_arg)
{
	struct spdk_nvmf_request *req = cb_arg;

	if (spdk_unlikely(!success)) {
		struct spdk_nvme_cpl *cpl = &req->rsp->nvme_cpl;
		uint32_t cdw0 = 0;
		int sct = 0, sc = 0;

		spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
		cpl->cdw0 = cdw0;
		cpl->status.sc = sc;
		cpl->status.sct = sct;
	}

	spdk_bdev_free_io(bdev_io);
	req->zcopy_bdev_io = NULL;
	spdk_nvmf_request_complete(req);
}

/* End a ZCOPY request: release the preserved zcopy bdev_io, committing the
 * buffers back to media for WRITE commands (READ buffers are simply
 * released).  Completion is reported via nvmf_bdev_ctrlr_end_zcopy_complete.
 */
int
nvmf_bdev_ctrlr_end_zcopy(struct spdk_nvmf_request *req)
{
	/* Redundant "? true : false" removed; the comparison is already a bool */
	bool commit = (req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_WRITE);

	return spdk_bdev_zcopy_end(req->zcopy_bdev_io, commit, nvmf_bdev_ctrlr_end_zcopy_complete, req);
}
+1 −0
Original line number Diff line number Diff line
@@ -1597,6 +1597,7 @@ nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group,
	/* Release all queued requests */
	TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) {
		TAILQ_REMOVE(&sgroup->queued, req, link);
		assert(req->zcopy_phase == NVMF_ZCOPY_PHASE_NONE);
		spdk_nvmf_request_exec(req);
	}
fini:
Loading