Commit 183c3485 authored by Jim Harris's avatar Jim Harris Committed by Tomasz Zawadzki
Browse files

nvmf/rdma: issue fused cmds consecutively to tgt layer



RDMA READs can cause cmds to be submitted to the target
layer in a different order than they were received
from the host.  Normally this is fine, but not for
fused commands.

So track fused commands as they reach
nvmf_rdma_request_process().  If we find a pair of
sequential commands that don't have valid FUSED settings
(i.e. NONE/SECOND, FIRST/NONE, FIRST/FIRST), we mark
the requests as "fused_failed" and will later fail them
just before they would be normally sent to the target
layer.

When we do find a pair of valid fused commands (FIRST
followed by SECOND), we will wait until both are
READY_TO_EXECUTE, and then submit them to the target
layer consecutively.

This fixes issue #2428 for RDMA transport.

Signed-off-by: default avatarJim Harris <james.r.harris@intel.com>
Change-Id: I01ebf90e17761499fb6601456811f442dc2a2950
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12018


Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarChangpeng Liu <changpeng.liu@intel.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
parent c8802892
Loading
Loading
Loading
Loading
+121 −0
Original line number Diff line number Diff line
@@ -264,6 +264,9 @@ struct spdk_nvmf_rdma_request {
	uint32_t				num_outstanding_data_wr;
	uint64_t				receive_tsc;

	bool					fused_failed;
	struct spdk_nvmf_rdma_request		*fused_pair;

	STAILQ_ENTRY(spdk_nvmf_rdma_request)	state_link;
};

@@ -380,6 +383,12 @@ struct spdk_nvmf_rdma_qpair {
	 */
	enum ibv_qp_state			ibv_state;

	/* Points to the a request that has fuse bits set to
	 * SPDK_NVME_CMD_FUSE_FIRST, when the qpair is waiting
	 * for the request that has SPDK_NVME_CMD_FUSE_SECOND.
	 */
	struct spdk_nvmf_rdma_request		*fused_first;

	/*
	 * io_channel which is used to destroy qpair when it is removed from poll group
	 */
@@ -1938,6 +1947,19 @@ _nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
	rdma_req->data.wr.next = NULL;
	rdma_req->offset = 0;
	rdma_req->req.dif_enabled = false;
	rdma_req->fused_failed = false;
	if (rdma_req->fused_pair) {
		/* This req was part of a valid fused pair, but failed before it got to
		 * READ_TO_EXECUTE state.  This means we need to fail the other request
		 * in the pair, because it is no longer part of a valid pair.  If the pair
		 * already reached READY_TO_EXECUTE state, we need to kick it.
		 */
		rdma_req->fused_pair->fused_failed = true;
		if (rdma_req->fused_pair->state == RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
			nvmf_rdma_request_process(rtransport, rdma_req->fused_pair);
		}
		rdma_req->fused_pair = NULL;
	}
	memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif));
	rqpair->qd--;

@@ -1945,6 +1967,57 @@ _nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
	rdma_req->state = RDMA_REQUEST_STATE_FREE;
}

static void
nvmf_rdma_check_fused_ordering(struct spdk_nvmf_rdma_transport *rtransport,
			       struct spdk_nvmf_rdma_qpair *rqpair,
			       struct spdk_nvmf_rdma_request *rdma_req)
{
	enum spdk_nvme_cmd_fuse last, next;

	last = rqpair->fused_first ? rqpair->fused_first->req.cmd->nvme_cmd.fuse : SPDK_NVME_CMD_FUSE_NONE;
	next = rdma_req->req.cmd->nvme_cmd.fuse;

	assert(last != SPDK_NVME_CMD_FUSE_SECOND);

	if (spdk_likely(last == SPDK_NVME_CMD_FUSE_NONE && next == SPDK_NVME_CMD_FUSE_NONE)) {
		return;
	}

	if (last == SPDK_NVME_CMD_FUSE_FIRST) {
		if (next == SPDK_NVME_CMD_FUSE_SECOND) {
			/* This is a valid pair of fused commands.  Point them at each other
			 * so they can be submitted consecutively once ready to be executed.
			 */
			rqpair->fused_first->fused_pair = rdma_req;
			rdma_req->fused_pair = rqpair->fused_first;
			rqpair->fused_first = NULL;
			return;
		} else {
			/* Mark the last req as failed since it wasn't followed by a SECOND. */
			rqpair->fused_first->fused_failed = true;

			/* If the last req is in READY_TO_EXECUTE state, then call
			 * nvmf_rdma_request_process(), otherwise nothing else will kick it.
			 */
			if (rqpair->fused_first->state == RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
				nvmf_rdma_request_process(rtransport, rqpair->fused_first);
			}

			rqpair->fused_first = NULL;
		}
	}

	if (next == SPDK_NVME_CMD_FUSE_FIRST) {
		/* Set rqpair->fused_first here so that we know to check that the next request
		 * is a SECOND (and to fail this one if it isn't).
		 */
		rqpair->fused_first = rdma_req;
	} else if (next == SPDK_NVME_CMD_FUSE_SECOND) {
		/* Mark this req failed since it ia SECOND and the last one was not a FIRST. */
		rdma_req->fused_failed = true;
	}
}

bool
nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
			  struct spdk_nvmf_rdma_request *rdma_req)
@@ -2008,6 +2081,8 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
				rdma_req->req.dif_enabled = true;
			}

			nvmf_rdma_check_fused_ordering(rtransport, rqpair, rdma_req);

#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
			rdma_req->rsp.wr.opcode = IBV_WR_SEND;
			rdma_req->rsp.wr.imm_data = 0;
@@ -2129,8 +2204,54 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
				rdma_req->req.length = rdma_req->req.dif.elba_length;
			}

			if (rdma_req->req.cmd->nvme_cmd.fuse != SPDK_NVME_CMD_FUSE_NONE) {
				if (rdma_req->fused_failed) {
					/* This request failed FUSED semantics.  Fail it immediately, without
					 * even sending it to the target layer.
					 */
					rsp->status.sct = SPDK_NVME_SCT_GENERIC;
					rsp->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED;
					rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
					break;
				}

				if (rdma_req->fused_pair == NULL ||
				    rdma_req->fused_pair->state != RDMA_REQUEST_STATE_READY_TO_EXECUTE) {
					/* This request is ready to execute, but either we don't know yet if it's
					 * valid - i.e. this is a FIRST but we haven't received the next
					 * request yet or the other request of this fused pair isn't ready to
					 * execute.  So break here and this request will get processed later either
					 * when the other request is ready or we find that this request isn't valid.
					 */
					break;
				}
			}

			/* If we get to this point, and this request is a fused command, we know that
			 * it is part of valid sequence (FIRST followed by a SECOND) and that both
			 * requests are READY_TO_EXECUTE. So call spdk_nvmf_request_exec() both on this
			 * request, and the other request of the fused pair, in the correct order.
			 * Also clear the ->fused_pair pointers on both requests, since after this point
			 * we no longer need to maintain the relationship between these two requests.
			 */
			if (rdma_req->req.cmd->nvme_cmd.fuse == SPDK_NVME_CMD_FUSE_SECOND) {
				assert(rdma_req->fused_pair != NULL);
				assert(rdma_req->fused_pair->fused_pair != NULL);
				rdma_req->fused_pair->state = RDMA_REQUEST_STATE_EXECUTING;
				spdk_nvmf_request_exec(&rdma_req->fused_pair->req);
				rdma_req->fused_pair->fused_pair = NULL;
				rdma_req->fused_pair = NULL;
			}
			rdma_req->state = RDMA_REQUEST_STATE_EXECUTING;
			spdk_nvmf_request_exec(&rdma_req->req);
			if (rdma_req->req.cmd->nvme_cmd.fuse == SPDK_NVME_CMD_FUSE_FIRST) {
				assert(rdma_req->fused_pair != NULL);
				assert(rdma_req->fused_pair->fused_pair != NULL);
				rdma_req->fused_pair->state = RDMA_REQUEST_STATE_EXECUTING;
				spdk_nvmf_request_exec(&rdma_req->fused_pair->req);
				rdma_req->fused_pair->fused_pair = NULL;
				rdma_req->fused_pair = NULL;
			}
			break;
		case RDMA_REQUEST_STATE_EXECUTING:
			spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0,