Commit 53777de8 authored by Alexey Marchuk's avatar Alexey Marchuk Committed by Ben Walker
Browse files

rdma: Unset IBV_SEND_SIGNALED flag for RDMA_WRITE operations



Unsetting this flag will decrease the number of WRs retrieved during CQ polling and will decrease
the overall processing time. Since RDMA_WRITE operations are always paired with RDMA_SEND (response),
it is possible to track the number of outstanding WRs relying on the completed response WR.
Completed WRs of type RDMA_WR_TYPE_DATA are now always RDMA_READ operations.

The patch shows 2% better performance for read operations on an x86 machine. The performance was measured using perf with the following parameters:
-q 16 -o 4096 -w read -t 300 -c 2
with nvme null device, each measurement was done 4 times

avg IOPS (with patch): 865861.71
avg IOPS (master): 847958.77

avg latency (with patch): 18.46 [us]
avg latency (master): 18.85 [us]

Change-Id: Ifd3329fbd0e45dd5f27213b36b9444308660fc8b
Signed-off-by: default avatarAlexey Marchuk <alexeymar@mellanox.com>
Signed-off-by: default avatarSasha Kotchubievsky <sashakot@mellanox.com>
Signed-off-by: default avatarEvgenii Kochetov <evgeniik@mellanox.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/456469


Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-by: default avatarSeth Howell <seth.howell5141@gmail.com>
Reviewed-by: default avatarDarek Stojaczyk <dariusz.stojaczyk@intel.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
parent 1900c8c2
Loading
Loading
Loading
Loading
+16 −13
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2018 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
@@ -1487,12 +1487,13 @@ nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
	for (i = 0; i < num_sgl_descriptors; i++) {
		if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
			current_data_wr->wr.opcode = IBV_WR_RDMA_WRITE;
			current_data_wr->wr.send_flags = 0;
		} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
			current_data_wr->wr.opcode = IBV_WR_RDMA_READ;
			current_data_wr->wr.send_flags = IBV_SEND_SIGNALED;
		} else {
			assert(false);
		}
		work_requests[i]->wr.send_flags = IBV_SEND_SIGNALED;
		work_requests[i]->wr.sg_list = work_requests[i]->sgl;
		work_requests[i]->wr.wr_id = rdma_req->data.wr.wr_id;
		current_data_wr->wr.next = &work_requests[i]->wr;
@@ -1502,9 +1503,11 @@ nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
	if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
		current_data_wr->wr.opcode = IBV_WR_RDMA_WRITE;
		current_data_wr->wr.next = &rdma_req->rsp.wr;
		current_data_wr->wr.send_flags = 0;
	} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
		current_data_wr->wr.opcode = IBV_WR_RDMA_READ;
		current_data_wr->wr.next = NULL;
		current_data_wr->wr.send_flags = IBV_SEND_SIGNALED;
	}
	return 0;
}
@@ -1755,9 +1758,11 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
		if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
			rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE;
			rdma_req->data.wr.next = &rdma_req->rsp.wr;
			rdma_req->data.wr.send_flags &= ~IBV_SEND_SIGNALED;
		} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
			rdma_req->data.wr.opcode = IBV_WR_RDMA_READ;
			rdma_req->data.wr.next = NULL;
			rdma_req->data.wr.send_flags |= IBV_SEND_SIGNALED;
		}

		/* set the number of outstanding data WRs for this request. */
@@ -3427,10 +3432,11 @@ spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
			}

			rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
			rqpair->current_send_depth--;
			/* +1 for the response wr */
			rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1;
			rdma_req->num_outstanding_data_wr = 0;

			spdk_nvmf_rdma_request_process(rtransport, rdma_req);
			assert(rdma_req->num_outstanding_data_wr == 0);
			break;
		case RDMA_WR_TYPE_RECV:
			/* rdma_recv->qpair will be invalid if using an SRQ.  In that case we have to get the qpair from the wc. */
@@ -3462,16 +3468,13 @@ spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
			rqpair->current_send_depth--;
			rdma_req->num_outstanding_data_wr--;
			if (!wc[i].status) {
				if (wc[i].opcode == IBV_WC_RDMA_READ) {
				assert(wc[i].opcode == IBV_WC_RDMA_READ);
				rqpair->current_read_depth--;
				/* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */
				if (rdma_req->num_outstanding_data_wr == 0) {
					rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
					spdk_nvmf_rdma_request_process(rtransport, rdma_req);
				}
				} else {
					assert(wc[i].opcode == IBV_WC_RDMA_WRITE);
				}
			} else {
				/* If the data transfer fails still force the queue into the error state,
				 * if we were performing an RDMA_READ, we need to force the request into a