Commit ca59dd5d authored by Alexey Marchuk's avatar Alexey Marchuk Committed by Tomasz Zawadzki
Browse files

nvmf/rdma: Allow to split Write IO with multi SGL



Send as many RDMA_READ operations as allowed by
current queue depth.
Completed data WRs are put back to pool
spdk_nvmf_rdma_request is extended with 2 new members;
its size and layout are now the following:
/* size: 1368, cachelines: 22, members: 17 */
/* sum members: 1366, holes: 1, sum holes: 2 */
/* last cacheline: 24 bytes */

The types of `iovpos` and `num_outstanding_data_wr` were
reduced from 4 to 2 bytes, which is enough for
almost any use case.

Fixes issue #3069

Signed-off-by: default avatarAlexey Marchuk <alexeymar@nvidia.com>
Change-Id: Ibdababc5224521da6eccc9868d06693a6d534581
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/19125


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarMichael Haeuptle <michaelhaeuptle@gmail.com>
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarBen Walker <ben@nvidia.com>
parent 0fb65adc
Loading
Loading
Loading
Loading
+75 −11
Original line number Diff line number Diff line
@@ -238,12 +238,14 @@ struct spdk_nvmf_rdma_request {
		struct	ibv_sge			sgl[NVMF_DEFAULT_RSP_SGE];
	} rsp;

	uint32_t				iovpos;

	uint32_t				num_outstanding_data_wr;
	uint16_t				iovpos;
	uint16_t				num_outstanding_data_wr;
	/* Used to split Write IO with multi SGL payload */
	uint16_t				num_remaining_data_wr;
	uint64_t				receive_tsc;
	struct spdk_nvmf_rdma_request		*fused_pair;
	STAILQ_ENTRY(spdk_nvmf_rdma_request)	state_link;
	struct ibv_send_wr			*remaining_tranfer_in_wrs;
	struct ibv_send_wr			*transfer_wr;
	struct spdk_nvmf_rdma_request_data	data;
};
@@ -635,7 +637,7 @@ _nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
	struct spdk_nvmf_rdma_request_data	*work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
	struct spdk_nvmf_rdma_request_data	*nvmf_data;
	struct ibv_send_wr			*next_send_wr;
	uint64_t				req_wrid = data_wr->wr_id;
	uint64_t				req_wrid = (uint64_t)&rdma_req->data_wr;
	uint32_t				num_wrs = 0;

	while (data_wr && data_wr->wr_id == req_wrid) {
@@ -1146,6 +1148,48 @@ request_transfer_in(struct spdk_nvmf_request *req)
	return 0;
}

/* Recycle the completed portion of a split RDMA_READ transfer and arm the
 * remainder. Called once all outstanding data WRs of the first portion have
 * completed, so the request can be re-queued to send the rest.
 *
 * Order matters: the completed WRs reachable from transfer_wr are returned to
 * the pool BEFORE transfer_wr is repointed at the saved remainder chain.
 * Always returns 0 (kept as int so callers can treat it like other
 * transfer-setup helpers that may fail).
 */
static inline int
nvmf_rdma_request_reset_transfer_in(struct spdk_nvmf_rdma_request *rdma_req,
				    struct spdk_nvmf_rdma_transport *rtransport)
{
	/* Put completed WRs back to pool and move transfer_wr pointer */
	_nvmf_rdma_request_free_data(rdma_req, rdma_req->transfer_wr, rtransport->data_wr_pool);
	/* Continue the transfer from the WRs that were split off earlier
	 * (note: "tranfer" is a pre-existing typo in the member name). */
	rdma_req->transfer_wr = rdma_req->remaining_tranfer_in_wrs;
	rdma_req->remaining_tranfer_in_wrs = NULL;
	/* The deferred WRs become the new outstanding set; clear the
	 * remaining counter so a zero value again means "not split". */
	rdma_req->num_outstanding_data_wr = rdma_req->num_remaining_data_wr;
	rdma_req->num_remaining_data_wr = 0;

	return 0;
}

/* Split a host-to-controller (RDMA_READ) transfer so that only
 * num_reads_available WRs are posted now; the rest of the WR chain is
 * stashed on the request and submitted after this portion completes
 * (see nvmf_rdma_request_reset_transfer_in).
 *
 * Preconditions (asserted): the request is a H2C transfer,
 * num_reads_available > 0, and the request needs strictly more WRs than
 * are available — otherwise no split would be required.
 * Always returns 0.
 */
static inline int
request_prepare_transfer_in_part(struct spdk_nvmf_request *req, uint32_t num_reads_available)
{
	struct spdk_nvmf_rdma_request	*rdma_req;
	struct ibv_send_wr		*wr;
	uint32_t i;

	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);

	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
	assert(rdma_req != NULL);
	assert(num_reads_available > 0);
	assert(rdma_req->num_outstanding_data_wr > num_reads_available);
	wr = rdma_req->transfer_wr;

	/* Walk to the last WR of the portion we are allowed to post now
	 * (num_reads_available - 1 hops from the head). */
	for (i = 0; i < num_reads_available - 1; i++) {
		wr = wr->next;
	}

	/* Save the tail of the chain and split the WR counters accordingly. */
	rdma_req->remaining_tranfer_in_wrs = wr->next;
	rdma_req->num_remaining_data_wr = rdma_req->num_outstanding_data_wr - num_reads_available;
	rdma_req->num_outstanding_data_wr = num_reads_available;
	/* Break chain of WRs to send only part. Once this portion completes, we continue sending RDMA_READs */
	wr->next = NULL;

	return 0;
}

static int
request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
{
@@ -2050,7 +2094,7 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
	enum spdk_nvmf_rdma_request_state prev_state;
	bool				progress = false;
	int				data_posted;
	uint32_t			num_blocks;
	uint32_t			num_blocks, num_rdma_reads_available, qdepth;

	rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
	device = rqpair->device;
@@ -2175,12 +2219,19 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
				/* This request needs to wait in line to perform RDMA */
				break;
			}
			if (rqpair->current_send_depth + rdma_req->num_outstanding_data_wr > rqpair->max_send_depth
			    || rqpair->current_read_depth + rdma_req->num_outstanding_data_wr > rqpair->max_read_depth) {
			qdepth = rqpair->max_send_depth - rqpair->current_send_depth;
			num_rdma_reads_available = rqpair->max_read_depth - rqpair->current_read_depth;
			if (rdma_req->num_outstanding_data_wr > qdepth ||
			    rdma_req->num_outstanding_data_wr > num_rdma_reads_available) {
				if (num_rdma_reads_available && qdepth) {
					/* Send as much as we can */
					request_prepare_transfer_in_part(&rdma_req->req, spdk_min(num_rdma_reads_available, qdepth));
				} else {
					/* We can only have so many WRs outstanding. we have to wait until some finish. */
					rqpair->poller->stat.pending_rdma_read++;
					break;
				}
			}

			/* We have already verified that this request is the head of the queue. */
			STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link);
@@ -4534,6 +4585,7 @@ nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
	struct spdk_nvmf_rdma_qpair	*rqpair, *tmp_rqpair;
	int reaped, i;
	int count = 0;
	int rc;
	bool error = false;
	uint64_t poll_tsc = spdk_get_ticks();

@@ -4594,7 +4646,6 @@ nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
				 */
				if (spdk_unlikely(NULL == rdma_recv->qpair)) {
					struct ibv_recv_wr *bad_wr;
					int rc;

					rdma_recv->wr.next = NULL;
					spdk_rdma_srq_queue_recv_wrs(rpoller->srq, &rdma_recv->wr);
@@ -4635,6 +4686,19 @@ nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
				rqpair->current_read_depth--;
				/* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */
				if (rdma_req->num_outstanding_data_wr == 0) {
					if (spdk_unlikely(rdma_req->num_remaining_data_wr)) {
						/* Only part of RDMA_READ operations was submitted, process the rest */
						rc = nvmf_rdma_request_reset_transfer_in(rdma_req, rtransport);
						if (spdk_likely(!rc)) {
							STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link);
							rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING;
						} else {
							rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
							rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
						}
						nvmf_rdma_request_process(rtransport, rdma_req);
						break;
					}
					rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
					nvmf_rdma_request_process(rtransport, rdma_req);
				}
+4 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES.
#  All rights reserved.
#
testdir=$(readlink -f $(dirname $0))
@@ -55,6 +56,9 @@ fi
$SPDK_EXAMPLE_DIR/perf -q 1 -o 4096 -w randrw -M 50 -t 1 -r "trtype:$TEST_TRANSPORT adrfam:IPv4 traddr:$NVMF_FIRST_TARGET_IP trsvcid:$NVMF_PORT"
$SPDK_EXAMPLE_DIR/perf -q 32 -o 4096 -w randrw -M 50 -t 1 -HI -r "trtype:$TEST_TRANSPORT adrfam:IPv4 traddr:$NVMF_FIRST_TARGET_IP trsvcid:$NVMF_PORT"
$SPDK_EXAMPLE_DIR/perf -q 128 -o 262144 -O 16384 -w randrw -M 50 -t 2 -r "trtype:$TEST_TRANSPORT adrfam:IPv4 traddr:$NVMF_FIRST_TARGET_IP trsvcid:$NVMF_PORT"
# this perf run aims to test handling of multi SGL payload in RDMA transport. Here we send 9 sge elements while
# standard read_depth is 16. That triggers split of Write IO into several parts
$SPDK_EXAMPLE_DIR/perf -q 128 -o 36964 -O 4096 -w randrw -M 50 -t 5 -r "trtype:$TEST_TRANSPORT adrfam:IPv4 traddr:$NVMF_FIRST_TARGET_IP trsvcid:$NVMF_PORT" -c 0xf -P 4
$SPDK_EXAMPLE_DIR/perf -q 128 -o 262144 -w randrw -M 50 -t 2 -r "trtype:$TEST_TRANSPORT adrfam:IPv4 traddr:$NVMF_FIRST_TARGET_IP trsvcid:$NVMF_PORT" --transport-stat
sync
$rpc_py nvmf_delete_subsystem nqn.2016-06.io.spdk:cnode1
+2 −2
Original line number Diff line number Diff line
@@ -1193,11 +1193,11 @@ test_nvmf_rdma_request_free_data(void)
	next_request_data = spdk_mempool_get(rtransport.data_wr_pool);
	SPDK_CU_ASSERT_FATAL(((struct test_mempool *)rtransport.data_wr_pool)->count ==
			     SPDK_NVMF_MAX_SGL_ENTRIES - 1);
	next_request_data->wr.wr_id = 1;
	next_request_data->wr.wr_id = (uint64_t)&rdma_req.data_wr;
	next_request_data->wr.num_sge = 2;
	next_request_data->wr.next = NULL;
	rdma_req.data.wr.next = &next_request_data->wr;
	rdma_req.data.wr.wr_id = 1;
	rdma_req.data.wr.wr_id = (uint64_t)&rdma_req.data_wr;
	rdma_req.data.wr.num_sge = 2;
	rdma_req.transfer_wr = &rdma_req.data.wr;