Commit bcd987ea authored by Shuhei Matsumoto's avatar Shuhei Matsumoto Committed by Jim Harris
Browse files

nvme_rdma: Support SRQ for I/O qpairs



Support SRQ in RDMA transport of NVMe-oF initiator.

Add a new spdk_nvme_transport_opts structure and add rdma_srq_size
to the spdk_nvme_transport_opts structure.

For the user of the NVMe driver, provide two public APIs,
spdk_nvme_transport_get_opts() and spdk_nvme_transport_set_opts().

In the NVMe driver, the instance of spdk_nvme_transport_opts,
g_spdk_nvme_transport_opts, is accessible throughout.

From an issue that async event handling caused conflicts between
initiator and target, the NVMe-oF RDMA initiator does not handle
the LAST_WQE_REACHED event. Hence, it may get a WC for an already
destroyed QP. To clarify this, add a comment in the source code.

The following is a result of a small performance evaluation using
SPDK NVMe perf tool. Even for queue_depth=1, overhead was less than 1%.
Eventually, we may be able to enable SRQ by default for NVMe-oF
initiator.

1.1 randwrite, qd=1, srq=enabled
./build/examples/perf -q 1 -s 1024 -w randwrite -t 30 -c 0XF -o 4096 -r
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  162411.97     634.42       6.14       5.42     284.07
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  163095.87     637.09       6.12       5.41     423.95
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  164725.30     643.46       6.06       5.32     165.60
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  162548.57     634.96       6.14       5.39     227.24
========================================================
Total                                                                     :  652781.70    2549.93       6.12

1.2 randwrite, qd=1, srq=disabled
./build/examples/perf -q 1 -s 1024 -w randwrite -t 30 -c 0XF -o 4096 -r
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  163398.03     638.27       6.11       5.33     240.76
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  164632.47     643.10       6.06       5.29     125.22
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  164694.40     643.34       6.06       5.31     408.43
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  164007.13     640.65       6.08       5.33     170.10
========================================================
Total                                                                     :  656732.03    2565.36       6.08       5.29     408.43

2.1 randread, qd=1, srq=enabled
./build/examples/perf -q 1 -s 1024 -w randread -t 30 -c 0xF -o 4096 -r '
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  153514.40     599.67       6.50       5.97     277.22
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  153567.57     599.87       6.50       5.95     408.06
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  153590.33     599.96       6.50       5.88     134.74
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  153357.40     599.05       6.51       5.97     229.03
========================================================
Total                                                                     :  614029.70    2398.55       6.50       5.88     408.06

2.2 randread, qd=1, srq=disabled
./build/examples/perf -q 1 -s 1024 -w randread -t 30 -c 0XF -o 4096 -r '
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  154452.40     603.33       6.46       5.94     233.15
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  154711.67     604.34       6.45       5.91      25.55
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  154717.70     604.37       6.45       5.88     130.92
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  154713.77     604.35       6.45       5.91     128.19
========================================================
Total                                                                     :  618595.53    2416.39       6.45       5.88     233.15

3.1 randwrite, qd=32, srq=enabled
./build/examples/perf -q 32 -s 1024 -w randwrite -t 30 -c 0XF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  672608.17    2627.38      47.56      11.33     326.96
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  672386.20    2626.51      47.58      11.03     221.88
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  673343.70    2630.25      47.51       9.11     387.54
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  672799.10    2628.12      47.55      10.48     552.80
========================================================
Total                                                                     : 2691137.17   10512.25      47.55       9.11     552.80

3.2 randwrite, qd=32, srq=disabled
./build/examples/perf -q 32 -s 1024 -w randwrite -t 30 -c 0XF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  672647.53    2627.53      47.56      11.13     389.95
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  672756.50    2627.96      47.55       9.53     394.83
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  672464.63    2626.81      47.57       9.48     528.07
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  673250.73    2629.89      47.52       9.43     389.83
========================================================
Total                                                                     : 2691119.40   10512.19      47.55       9.43     528.07

4.1 randread, qd=32, srq=enabled
./build/examples/perf -q 32 -s 1024 -w randread -t 30 -c 0xF -o 4096 -r
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  677286.30    2645.65      47.23      12.29     335.90
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  677554.97    2646.70      47.22      20.39     196.21
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  677086.07    2644.87      47.25      19.17     386.26
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  677654.93    2647.09      47.21      18.92     181.05
========================================================
Total                                                                     : 2709582.27   10584.31      47.23      12.29     386.26

4.2 randread, qd=32, srq=disabled
./build/examples/perf -q 32 -s 1024 -w randread -t 30 -c 0XF -o 4096 -r
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  677432.60    2646.22      47.22      13.05     435.91
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  677450.43    2646.29      47.22      16.26     178.60
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  677647.10    2647.06      47.21      17.82     177.83
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  677047.33    2644.72      47.25      15.62     308.21
========================================================
Total                                                                     : 2709577.47   10584.29      47.23      13.05     435.91

Signed-off-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Signed-off-by: default avatarDenis Nagorny <denisn@nvidia.com>
Signed-off-by: default avatarEvgeniy Kochetov <evgeniik@nvidia.com>
Change-Id: I843a5eda14e872bf6e2010e9f63b8e46d5bba691
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14174


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
parent 4999a985
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -76,6 +76,15 @@ and disable CPU core locks in runtime.
Added --rpcs-allowed command line option. Users can specify a comma-separated list of RPC
names with this option to restrict allowed RPCs to only that list.

### nvme

NVMe transport options were newly introduced. The NVMe transport options are defined via
the `spdk_nvme_transport_opts` structure and configured via `spdk_nvme_transport_get_opts`
and `spdk_nvme_transport_set_opts` functions.

Shared receive queue was supported by the RDMA transport. It can be configured by
a new NVMe transport option `rdma_srq_size`.

### rpc

Added spdk_rpc_set_allowlist to restrict allowed RPCs to the specified list.
+38 −0
Original line number Diff line number Diff line
@@ -4098,6 +4098,44 @@ static void __attribute__((constructor)) _spdk_nvme_transport_register_##name(vo
	spdk_nvme_transport_register(transport_ops); \
}

/**
 * NVMe transport options.
 *
 * Read with spdk_nvme_transport_get_opts() and applied with
 * spdk_nvme_transport_set_opts().
 */
struct spdk_nvme_transport_opts {
	/**
	 * Queue depth of the shared RDMA receive queue (SRQ).
	 *
	 * Used only by the RDMA transport.  When 0 (the default), no SRQ is
	 * created and each qpair posts receives to its own receive queue.
	 */
	uint32_t rdma_srq_size;

	/**
	 * The size of spdk_nvme_transport_opts as understood by the caller of
	 * this library; used for ABI compatibility.  The library uses this field
	 * to know how many fields in this structure are valid, and it will
	 * populate any remaining fields with default values.
	 */
	size_t opts_size;
} __attribute__((packed));
SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_transport_opts) == 12, "Incorrect size");

/**
 * Get the current NVMe transport options.
 *
 * \param[out] opts Will be filled with the current options for spdk_nvme_transport_set_opts().
 * \param opts_size Must be set to sizeof(struct spdk_nvme_transport_opts).
 */
void spdk_nvme_transport_get_opts(struct spdk_nvme_transport_opts *opts, size_t opts_size);

/**
 * Set the NVMe transport options.
 *
 * \param opts Pointer to the allocated spdk_nvme_transport_opts structure with new values.
 * \param opts_size Must be set to sizeof(struct spdk_nvme_transport_opts).
 *
 * \return 0 on success, or negated errno on failure.
 */
int spdk_nvme_transport_set_opts(const struct spdk_nvme_transport_opts *opts, size_t opts_size);

#ifdef __cplusplus
}
#endif
+2 −0
Original line number Diff line number Diff line
@@ -34,6 +34,8 @@

extern pid_t g_spdk_nvme_pid;

extern struct spdk_nvme_transport_opts g_spdk_nvme_transport_opts;

/*
 * Some Intel devices support vendor-unique read latency log page even
 * though the log page directory says otherwise.
+186 −33
Original line number Diff line number Diff line
@@ -133,10 +133,15 @@ struct nvme_rdma_poller_stats {
};

struct nvme_rdma_poll_group;
struct nvme_rdma_rsps;

struct nvme_rdma_poller {
	struct ibv_context		*device;
	struct ibv_cq			*cq;
	struct spdk_rdma_srq		*srq;
	struct nvme_rdma_rsps		*rsps;
	struct ibv_pd			*pd;
	struct spdk_rdma_mem_map	*mr_map;
	uint32_t			refcnt;
	int				required_num_wc;
	int				current_num_wc;
@@ -170,6 +175,7 @@ typedef int (*nvme_rdma_cm_event_cb)(struct nvme_rdma_qpair *rqpair, int ret);
/* Parameters for nvme_rdma_create_rsps().  Exactly one of rqpair (per-qpair
 * receive queue) or srq (poller-owned shared receive queue) is non-NULL;
 * created responses are queued to whichever is set.
 */
struct nvme_rdma_rsp_opts {
	uint16_t				num_entries;	/* number of response entries to create */
	struct nvme_rdma_qpair			*rqpair;	/* qpair to queue recv WRs on, or NULL when using an SRQ */
	struct spdk_rdma_srq			*srq;		/* shared receive queue to queue recv WRs on, or NULL */
	struct spdk_rdma_mem_map		*mr_map;	/* memory map for the response buffers -- TODO confirm exact use */
};

@@ -193,6 +199,7 @@ struct nvme_rdma_qpair {
	struct spdk_rdma_qp			*rdma_qp;
	struct rdma_cm_id			*cm_id;
	struct ibv_cq				*cq;
	struct spdk_rdma_srq			*srq;

	struct	spdk_nvme_rdma_req		*rdma_reqs;

@@ -707,12 +714,16 @@ nvme_rdma_qpair_set_poller(struct spdk_nvme_qpair *qpair)
		return -EINVAL;
	}

	if (!poller->srq) {
		if (nvme_rdma_resize_cq(rqpair, poller)) {
			nvme_rdma_poll_group_put_poller(group, poller);
			return -EPROTO;
		}
	}

	rqpair->cq = poller->cq;
	rqpair->srq = poller->srq;
	rqpair->rsps = poller->rsps;
	rqpair->poller = poller;
	return 0;
}
@@ -758,7 +769,11 @@ nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
	attr.send_cq		= rqpair->cq;
	attr.recv_cq		= rqpair->cq;
	attr.cap.max_send_wr	= rqpair->num_entries; /* SEND operations */
	if (rqpair->srq) {
		attr.srq	= rqpair->srq->srq;
	} else {
		attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */
	}
	attr.cap.max_send_sge	= spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge);
	attr.cap.max_recv_sge	= spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge);

@@ -839,6 +854,20 @@ nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair)
	return rc;
}

/* Flush (post) all receive WRs queued on the poller's SRQ to the device.
 * On failure, bad_recv_wr points at the chain of WRs that were not posted;
 * that chain is passed to nvme_rdma_reset_failed_recvs() together with the
 * error code.  Returns 0 on success or the error from the flush.
 */
static inline int
nvme_rdma_poller_submit_recvs(struct nvme_rdma_poller *poller)
{
	struct ibv_recv_wr *bad_recv_wr;
	int rc;

	rc = spdk_rdma_srq_flush_recv_wrs(poller->srq, &bad_recv_wr);
	if (spdk_unlikely(rc)) {
		nvme_rdma_reset_failed_recvs(poller->rsps, bad_recv_wr, rc);
	}

	return rc;
}

#define nvme_rdma_trace_ibv_sge(sg_list) \
	if (sg_list) { \
		SPDK_DEBUGLOG(nvme, "local addr %p length 0x%x lkey 0x%x\n", \
@@ -916,7 +945,11 @@ nvme_rdma_create_rsps(struct nvme_rdma_rsp_opts *opts)

		nvme_rdma_trace_ibv_sge(recv_wr->sg_list);

		if (opts->rqpair) {
			spdk_rdma_qp_queue_recv_wrs(opts->rqpair->rdma_qp, recv_wr);
		} else {
			spdk_rdma_srq_queue_recv_wrs(opts->srq, recv_wr);
		}
	}

	rsps->num_entries = opts->num_entries;
@@ -1127,8 +1160,10 @@ nvme_rdma_connect_established(struct nvme_rdma_qpair *rqpair, int ret)
	}
	SPDK_DEBUGLOG(nvme, "RDMA requests created\n");

	if (!rqpair->srq) {
		opts.num_entries = rqpair->num_entries;
		opts.rqpair = rqpair;
		opts.srq = NULL;
		opts.mr_map = rqpair->mr_map;

		rqpair->rsps = nvme_rdma_create_rsps(&opts);
@@ -1145,6 +1180,7 @@ nvme_rdma_connect_established(struct nvme_rdma_qpair *rqpair, int ret)
			return -1;
		}
		SPDK_DEBUGLOG(nvme, "RDMA responses submitted\n");
	}

	rqpair->state = NVME_RDMA_QPAIR_STATE_FABRIC_CONNECT_SEND;

@@ -1863,6 +1899,8 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)

		rqpair->poller = NULL;
		rqpair->cq = NULL;
		rqpair->srq = NULL;
		rqpair->rsps = NULL;
	} else if (rqpair->cq) {
		ibv_destroy_cq(rqpair->cq);
		rqpair->cq = NULL;
@@ -1892,7 +1930,8 @@ nvme_rdma_qpair_disconnected(struct nvme_rdma_qpair *rqpair, int ret)
		goto quiet;
	}

	if (rqpair->current_num_sends != 0 || rqpair->rsps->current_num_recvs != 0) {
	if (rqpair->current_num_sends != 0 ||
	    (!rqpair->srq && rqpair->rsps->current_num_recvs != 0)) {
		rqpair->state = NVME_RDMA_QPAIR_STATE_LINGERING;
		rqpair->evt_timeout_ticks = (NVME_RDMA_DISCONNECTED_QPAIR_TIMEOUT_US * spdk_get_ticks_hz()) /
					    SPDK_SEC_TO_USEC + spdk_get_ticks();
@@ -1913,7 +1952,8 @@ static int
nvme_rdma_qpair_wait_until_quiet(struct nvme_rdma_qpair *rqpair)
{
	if (spdk_get_ticks() < rqpair->evt_timeout_ticks &&
	    (rqpair->current_num_sends != 0 || rqpair->rsps->current_num_recvs != 0)) {
	    (rqpair->current_num_sends != 0 ||
	     (!rqpair->srq && rqpair->rsps->current_num_recvs != 0))) {
		return -EAGAIN;
	}

@@ -2372,7 +2412,11 @@ nvme_rdma_request_ready(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_re
	recv_wr->next = NULL;
	nvme_rdma_trace_ibv_sge(recv_wr->sg_list);

	if (!rqpair->srq) {
		spdk_rdma_qp_queue_recv_wrs(rqpair->rdma_qp, recv_wr);
	} else {
		spdk_rdma_srq_queue_recv_wrs(rqpair->srq, recv_wr);
	}
}

#define MAX_COMPLETIONS_PER_POLL 128
@@ -2431,29 +2475,45 @@ nvme_rdma_log_wc_status(struct nvme_rdma_qpair *rqpair, struct ibv_wc *wc)
}

static inline int
nvme_rdma_process_recv_completion(struct ibv_wc *wc, struct nvme_rdma_wr *rdma_wr)
nvme_rdma_process_recv_completion(struct nvme_rdma_poller *poller, struct ibv_wc *wc,
				  struct nvme_rdma_wr *rdma_wr)
{
	struct nvme_rdma_qpair		*rqpair;
	struct spdk_nvme_rdma_req	*rdma_req;
	struct spdk_nvme_rdma_rsp	*rdma_rsp;

	rdma_rsp = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_rsp, rdma_wr);

	if (poller && poller->srq) {
		rqpair = get_rdma_qpair_from_wc(poller->group, wc);
		if (spdk_unlikely(!rqpair)) {
			/* Since we do not handle the LAST_WQE_REACHED event, we do not know when
			 * a Receive Queue in a QP, that is associated with an SRQ, is flushed.
			 * We may get a WC for an already destroyed QP.
			 *
			 * However, for the SRQ, this is not any error. Hence, just re-post the
			 * receive request to the SRQ to reuse for other QPs, and return 0.
			 */
			spdk_rdma_srq_queue_recv_wrs(poller->srq, rdma_rsp->recv_wr);
			return 0;
		}
	} else {
		rqpair = rdma_rsp->rqpair;
	}

	assert(rqpair->rsps->current_num_recvs > 0);
	rqpair->rsps->current_num_recvs--;

	if (wc->status) {
		nvme_rdma_log_wc_status(rqpair, wc);
		nvme_rdma_fail_qpair(&rqpair->qpair, 0);
		return -ENXIO;
		goto err_wc;
	}

	SPDK_DEBUGLOG(nvme, "CQ recv completion\n");

	if (wc->byte_len < sizeof(struct spdk_nvme_cpl)) {
		SPDK_ERRLOG("recv length %u less than expected response size\n", wc->byte_len);
		nvme_rdma_fail_qpair(&rqpair->qpair, 0);
		return -ENXIO;
		goto err_wc;
	}
	rdma_req = &rqpair->rdma_reqs[rdma_rsp->cpl.cid];
	rdma_req->completion_flags |= NVME_RDMA_RECV_COMPLETED;
@@ -2475,6 +2535,13 @@ nvme_rdma_process_recv_completion(struct ibv_wc *wc, struct nvme_rdma_wr *rdma_w

	rqpair->num_completions++;
	return 1;

err_wc:
	nvme_rdma_fail_qpair(&rqpair->qpair, 0);
	if (poller && poller->srq) {
		spdk_rdma_srq_queue_recv_wrs(poller->srq, rdma_rsp->recv_wr);
	}
	return -ENXIO;
}

static inline int
@@ -2505,6 +2572,9 @@ nvme_rdma_process_send_completion(struct nvme_rdma_poller *poller,
		rqpair->current_num_sends--;
		nvme_rdma_log_wc_status(rqpair, wc);
		nvme_rdma_fail_qpair(&rqpair->qpair, 0);
		if (rdma_req->rdma_rsp && poller && poller->srq) {
			spdk_rdma_srq_queue_recv_wrs(poller->srq, rdma_req->rdma_rsp->recv_wr);
		}
		return -ENXIO;
	}

@@ -2561,7 +2631,7 @@ nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size,
		rdma_wr = (struct nvme_rdma_wr *)wc[i].wr_id;
		switch (rdma_wr->type) {
		case RDMA_WR_TYPE_RECV:
			_rc = nvme_rdma_process_recv_completion(&wc[i], rdma_wr);
			_rc = nvme_rdma_process_recv_completion(poller, &wc[i], rdma_wr);
			break;

		case RDMA_WR_TYPE_SEND:
@@ -2767,6 +2837,18 @@ nvme_rdma_poller_destroy(struct nvme_rdma_poller *poller)
	if (poller->cq) {
		ibv_destroy_cq(poller->cq);
	}
	if (poller->rsps) {
		nvme_rdma_free_rsps(poller->rsps);
	}
	if (poller->srq) {
		spdk_rdma_srq_destroy(poller->srq);
	}
	if (poller->mr_map) {
		spdk_rdma_free_mem_map(&poller->mr_map);
	}
	if (poller->pd) {
		spdk_rdma_put_pd(poller->pd);
	}
	free(poller);
}

@@ -2774,6 +2856,11 @@ static struct nvme_rdma_poller *
nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx)
{
	struct nvme_rdma_poller *poller;
	struct ibv_device_attr dev_attr;
	struct spdk_rdma_srq_init_attr srq_init_attr = {};
	struct nvme_rdma_rsp_opts opts;
	int num_cqe;
	int rc;

	poller = calloc(1, sizeof(*poller));
	if (poller == NULL) {
@@ -2783,7 +2870,68 @@ nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *

	poller->group = group;
	poller->device = ctx;
	poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0);

	if (g_spdk_nvme_transport_opts.rdma_srq_size != 0) {
		rc = ibv_query_device(ctx, &dev_attr);
		if (rc) {
			SPDK_ERRLOG("Unable to query RDMA device.\n");
			goto fail;
		}

		poller->pd = spdk_rdma_get_pd(ctx);
		if (poller->pd == NULL) {
			SPDK_ERRLOG("Unable to get PD.\n");
			goto fail;
		}

		poller->mr_map = spdk_rdma_create_mem_map(poller->pd, &g_nvme_hooks,
				 SPDK_RDMA_MEMORY_MAP_ROLE_INITIATOR);
		if (poller->mr_map == NULL) {
			SPDK_ERRLOG("Unable to create memory map.\n");
			goto fail;
		}

		srq_init_attr.stats = &poller->stats.rdma_stats.recv;
		srq_init_attr.pd = poller->pd;
		srq_init_attr.srq_init_attr.attr.max_wr = spdk_min((uint32_t)dev_attr.max_srq_wr,
				g_spdk_nvme_transport_opts.rdma_srq_size);
		srq_init_attr.srq_init_attr.attr.max_sge = spdk_min(dev_attr.max_sge,
				NVME_RDMA_DEFAULT_RX_SGE);

		poller->srq = spdk_rdma_srq_create(&srq_init_attr);
		if (poller->srq == NULL) {
			SPDK_ERRLOG("Unable to create SRQ.\n");
			goto fail;
		}

		opts.num_entries = g_spdk_nvme_transport_opts.rdma_srq_size;
		opts.rqpair = NULL;
		opts.srq = poller->srq;
		opts.mr_map = poller->mr_map;

		poller->rsps = nvme_rdma_create_rsps(&opts);
		if (poller->rsps == NULL) {
			SPDK_ERRLOG("Unable to create poller RDMA responses.\n");
			goto fail;
		}

		rc = nvme_rdma_poller_submit_recvs(poller);
		if (rc) {
			SPDK_ERRLOG("Unable to submit poller RDMA responses.\n");
			goto fail;
		}

		/*
		 * When using an srq, fix the size of the completion queue at startup.
		 * The initiator sends only send and recv WRs. Hence, the multiplier is 2.
		 * (The target sends also data WRs. Hence, the multiplier is 3.)
		 */
		num_cqe = g_spdk_nvme_transport_opts.rdma_srq_size * 2;
	} else {
		num_cqe = DEFAULT_NVME_RDMA_CQ_SIZE;
	}

	poller->cq = ibv_create_cq(poller->device, num_cqe, group, NULL, 0);

	if (poller->cq == NULL) {
		SPDK_ERRLOG("Unable to create CQ, errno %d.\n", errno);
@@ -2792,7 +2940,7 @@ nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *

	STAILQ_INSERT_HEAD(&group->pollers, poller, link);
	group->num_pollers++;
	poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE;
	poller->current_num_wc = num_cqe;
	poller->required_num_wc = 0;
	return poller;

@@ -2983,6 +3131,9 @@ nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *
		} while (poller_completions < completions_per_poller);
		total_completions += poller_completions;
		poller->stats.completions += rdma_completions;
		if (poller->srq) {
			nvme_rdma_poller_submit_recvs(poller);
		}
	}

	STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
@@ -2997,7 +3148,9 @@ nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *
		}

		nvme_rdma_qpair_submit_sends(rqpair);
		if (!rqpair->srq) {
			nvme_rdma_qpair_submit_recvs(rqpair);
		}
		if (rqpair->num_completions > 0) {
			nvme_qpair_resubmit_requests(qpair, rqpair->num_completions);
		}
+60 −0
Original line number Diff line number Diff line
@@ -25,6 +25,10 @@ TAILQ_HEAD(nvme_transport_list, spdk_nvme_transport) g_spdk_nvme_transports =
struct spdk_nvme_transport g_spdk_transports[SPDK_MAX_NUM_OF_TRANSPORTS] = {};
int g_current_transport_index = 0;

/* Process-wide NVMe transport options, read via spdk_nvme_transport_get_opts()
 * and modified via spdk_nvme_transport_set_opts().  rdma_srq_size defaults to
 * 0, which means the RDMA transport does not use a shared receive queue.
 */
struct spdk_nvme_transport_opts g_spdk_nvme_transport_opts = {
	.rdma_srq_size = 0,
};

const struct spdk_nvme_transport *
nvme_get_first_transport(void)
{
@@ -792,3 +796,59 @@ nvme_transport_get_trtype(const struct spdk_nvme_transport *transport)
{
	return transport->ops.type;
}

/* Copy the current global transport options into *opts.
 *
 * Only the fields that fit within the caller-provided opts_size are written,
 * so a caller built against an older, smaller version of the structure still
 * receives valid data (ABI compatibility).  opts->opts_size is set to the
 * caller's opts_size.  Logs an error and returns without copying if opts is
 * NULL or opts_size is 0.
 */
void
spdk_nvme_transport_get_opts(struct spdk_nvme_transport_opts *opts, size_t opts_size)
{
	if (opts == NULL) {
		SPDK_ERRLOG("opts should not be NULL.\n");
		return;
	}

	if (opts_size == 0) {
		SPDK_ERRLOG("opts_size should not be zero.\n");
		return;
	}

	opts->opts_size = opts_size;

/* Copy a field only if it lies entirely within the caller's structure size. */
#define SET_FIELD(field) \
	if (offsetof(struct spdk_nvme_transport_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_spdk_nvme_transport_opts.field; \
	} \

	SET_FIELD(rdma_srq_size);

	/* Do not remove this statement: it must be updated whenever a new field is
	 * added, and do not forget to add a SET_FIELD statement for the new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_transport_opts) == 12, "Incorrect size");

#undef SET_FIELD
}

/* Apply new values from *opts to the global transport options.
 *
 * Only the fields that fit within opts->opts_size (the size declared by the
 * caller inside the structure) are copied, preserving ABI compatibility with
 * callers built against an older, smaller structure.
 *
 * Returns 0 on success, or -EINVAL if opts is NULL or opts_size is 0.
 */
int
spdk_nvme_transport_set_opts(const struct spdk_nvme_transport_opts *opts, size_t opts_size)
{
	if (opts == NULL) {
		SPDK_ERRLOG("opts should not be NULL.\n");
		return -EINVAL;
	}

	if (opts_size == 0) {
		SPDK_ERRLOG("opts_size should not be zero.\n");
		return -EINVAL;
	}

/* Copy a field only if it lies entirely within the caller's declared size. */
#define SET_FIELD(field) \
	if (offsetof(struct spdk_nvme_transport_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_spdk_nvme_transport_opts.field = opts->field; \
	} \

	SET_FIELD(rdma_srq_size);

	g_spdk_nvme_transport_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}
Loading