Commit 9d63933b authored by Seth Howell, committed by Jim Harris

rdma: batch rdma sends.

By batching ibv sends each time we poll, we can reduce the number of
MMIO writes that we do.
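
To illustrate the idea, here is a minimal standalone sketch (hypothetical batch_* names, not the code in this commit): send WRs are chained through their next pointers as they are produced, and the whole chain is posted with one ibv_post_send() per poll, so the doorbell (MMIO) write happens once per batch rather than once per request.

#include <infiniband/verbs.h>
#include <stddef.h>

/* Hypothetical helper: chain a send WR (or a pre-linked chain of WRs)
 * onto a per-qpair batch instead of posting it immediately. */
struct send_wr_batch {
	struct ibv_send_wr *first;
	struct ibv_send_wr *last;
};

static void
batch_queue(struct send_wr_batch *batch, struct ibv_send_wr *first)
{
	struct ibv_send_wr *last = first;

	/* Walk to the end of the incoming chain. */
	while (last->next != NULL) {
		last = last->next;
	}

	if (batch->first == NULL) {
		batch->first = first;
	} else {
		batch->last->next = first;
	}
	batch->last = last;
}

/* Hypothetical flush, called once per poll: a single ibv_post_send()
 * of the whole chain means a single doorbell (MMIO) write, instead of
 * one per request. */
static int
batch_flush(struct ibv_qp *qp, struct send_wr_batch *batch)
{
	struct ibv_send_wr *bad_wr = NULL;
	int rc = 0;

	if (batch->first != NULL) {
		rc = ibv_post_send(qp, batch->first, &bad_wr);
		batch->first = NULL;
		batch->last = NULL;
	}
	return rc;
}

On error, ibv_post_send() sets bad_wr to the first WR that failed to post; in the commit, the new _qp_reset_failed_sends() walks exactly that list to unwind per-request state.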

Change-Id: Ia5a07b0037365abfa8732629c34d34a9ed49ac70
Signed-off-by: Seth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/449253
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
parent 4c03cae6
+126 −23
@@ -286,6 +286,11 @@ struct spdk_nvmf_rdma_resource_opts {
	bool				shared;
};

+struct spdk_nvmf_send_wr_list {
+	struct ibv_send_wr	*first;
+	struct ibv_send_wr	*last;
+};
+
struct spdk_nvmf_rdma_resources {
	/* Array of size "max_queue_depth" containing RDMA requests. */
	struct spdk_nvmf_rdma_request		*reqs;
@@ -356,6 +361,9 @@ struct spdk_nvmf_rdma_qpair {
	/* The maximum number of SGEs per WR on the recv queue */
	uint32_t				max_recv_sge;

+	/* The list of pending send requests for a transfer */
+	struct spdk_nvmf_send_wr_list		sends_to_post;
+
	struct spdk_nvmf_rdma_resources		*resources;

	STAILQ_HEAD(, spdk_nvmf_rdma_request)	pending_rdma_read_queue;
@@ -619,7 +627,8 @@ nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
	while (next_data_wr) {
		current_data_wr = next_data_wr;
		send_wr = current_data_wr->wr.next;
-		if (send_wr != NULL && send_wr != &rdma_req->rsp.wr) {
+		if (send_wr != NULL && send_wr != &rdma_req->rsp.wr &&
+		    send_wr->wr_id == current_data_wr->wr.wr_id) {
			next_data_wr = SPDK_CONTAINEROF(send_wr, struct spdk_nvmf_rdma_request_data, wr);
		} else {
			next_data_wr = NULL;
@@ -979,6 +988,9 @@ spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
	spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0);
	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);

+	rqpair->sends_to_post.first = NULL;
+	rqpair->sends_to_post.last = NULL;
+
	if (rqpair->poller->srq == NULL) {
		rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
		transport = &rtransport->transport;
@@ -1013,14 +1025,34 @@ error:
	return -1;
}


+/* Append the given send wr structure to the qpair's outstanding sends list. */
+/* This function accepts either a single wr or the first wr in a linked list. */
+static void
+nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first)
+{
+	struct ibv_send_wr *last;
+
+	last = first;
+	while (last->next != NULL) {
+		last = last->next;
+	}
+
+	if (rqpair->sends_to_post.first == NULL) {
+		rqpair->sends_to_post.first = first;
+		rqpair->sends_to_post.last = last;
+	} else {
+		rqpair->sends_to_post.last->next = first;
+		rqpair->sends_to_post.last = last;
+	}
+}

static int
request_transfer_in(struct spdk_nvmf_request *req)
{
-	int				rc;
	struct spdk_nvmf_rdma_request	*rdma_req;
	struct spdk_nvmf_qpair		*qpair;
	struct spdk_nvmf_rdma_qpair	*rqpair;
-	struct ibv_send_wr		*bad_wr = NULL;

	qpair = req->qpair;
	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
@@ -1029,13 +1061,7 @@ request_transfer_in(struct spdk_nvmf_request *req)
	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
	assert(rdma_req != NULL);

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair);

	rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to transfer data from host to target\n");
		return -1;
	}
	nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr);
	rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
	rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
	return 0;
@@ -1051,7 +1077,7 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
	struct spdk_nvmf_rdma_qpair	*rqpair;
	struct spdk_nvme_cpl		*rsp;
	struct ibv_recv_wr		*bad_recv_wr = NULL;
-	struct ibv_send_wr		*send_wr, *bad_send_wr = NULL;
+	struct ibv_send_wr		*first = NULL;

	*data_posted = 0;
	qpair = req->qpair;
@@ -1089,24 +1115,15 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
	 * RDMA WRITEs to transfer data, plus an RDMA SEND
	 * containing the response.
	 */
-	send_wr = &rdma_req->rsp.wr;
+	first = &rdma_req->rsp.wr;

	if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
	    req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair);
-		send_wr = &rdma_req->data.wr;
+		first = &rdma_req->data.wr;
		*data_posted = 1;
		num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
	}

	SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, qpair);

	/* Send the completion */
	rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr);
	if (rc) {
		SPDK_ERRLOG("Unable to send response capsule\n");
		return rc;
	}
	nvmf_rdma_qpair_queue_send_wrs(rqpair, first);
	/* +1 for the rsp wr */
	rqpair->current_send_depth += num_outstanding_data_wr + 1;

@@ -1797,6 +1814,7 @@ nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
	rdma_req->req.length = 0;
	rdma_req->req.iovcnt = 0;
	rdma_req->req.data = NULL;
+	rdma_req->rsp.wr.next = NULL;
	rdma_req->data.wr.next = NULL;
	rqpair->qd--;

@@ -3190,6 +3208,88 @@ spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req)
}
#endif

+static void
+_qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport,
+		       struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc)
+{
+	struct spdk_nvmf_rdma_wr	*bad_rdma_wr;
+	struct spdk_nvmf_rdma_request	*prev_rdma_req = NULL, *cur_rdma_req = NULL;
+
+	SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc);
+	for (; bad_wr != NULL; bad_wr = bad_wr->next) {
+		bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id;
+		assert(rqpair->current_send_depth > 0);
+		rqpair->current_send_depth--;
+		switch (bad_rdma_wr->type) {
+		case RDMA_WR_TYPE_DATA:
+			cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr);
+			if (bad_wr->opcode == IBV_WR_RDMA_READ) {
+				assert(rqpair->current_read_depth > 0);
+				rqpair->current_read_depth--;
+			}
+			break;
+		case RDMA_WR_TYPE_SEND:
+			cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr);
+			break;
+		default:
+			SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair);
+			prev_rdma_req = cur_rdma_req;
+			continue;
+		}
+
+		if (prev_rdma_req == cur_rdma_req) {
+			/* this request was handled by an earlier wr. i.e. we were performing an nvme read. */
+			/* We only have to check against prev_wr since each requests wrs are contiguous in this list. */
+			continue;
+		}
+
+		switch (cur_rdma_req->state) {
+		case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
+			cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+			cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
+			break;
+		case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
+		case RDMA_REQUEST_STATE_COMPLETING:
+			cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+			break;
+		default:
+			SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n",
+				    cur_rdma_req->state, rqpair);
+			continue;
+		}
+
+		spdk_nvmf_rdma_request_process(rtransport, cur_rdma_req);
+		prev_rdma_req = cur_rdma_req;
+	}
+
+	if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) {
+		/* Disconnect the connection. */
+		spdk_nvmf_rdma_start_disconnect(rqpair);
+	}
+
+}
+
+static void
+_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
+		     struct spdk_nvmf_rdma_poller *rpoller)
+{
+	struct spdk_nvmf_rdma_qpair	*rqpair;
+	struct ibv_send_wr		*bad_wr = NULL;
+	int				rc;
+
+	TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) {
+		if (rqpair->sends_to_post.first != NULL) {
+			rc = ibv_post_send(rqpair->cm_id->qp, rqpair->sends_to_post.first, &bad_wr);
+			/* bad wr always points to the first wr that failed. */
+			if (rc) {
+				_qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc);
+			}
+			rqpair->sends_to_post.first = NULL;
+			rqpair->sends_to_post.last = NULL;
+		}
+	}
+}
+
static int
spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
			   struct spdk_nvmf_rdma_poller *rpoller)
@@ -3319,6 +3419,9 @@ spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
		return -1;
	}

+	/* submit outstanding work requests. */
+	_poller_submit_sends(rtransport, rpoller);
+
	return count;
}