Commit 9bff828f authored by Richael Zhuang's avatar Richael Zhuang Committed by Tomasz Zawadzki
Browse files

sock: introduce dynamic zerocopy according to data size

MSG_ZEROCOPY is not always effective as mentioned in
https://www.kernel.org/doc/html/v4.15/networking/msg_zerocopy.html.

Currently in spdk, once we enable sendmsg zerocopy, then all data
transferred through _sock_flush are sent with zerocopy, and vice
versa. Here dynamic zerocopy is introduced to allow data sent with
MSG_ZEROCOPY or not according to its size, which can be enabled by
setting the "zerocopy_threshold" option to a positive value.

Test with 16 P4610 NVMe SSD, 2 initiators, target's and initiators'
configurations are the same as spdk report:
https://ci.spdk.io/download/performance-reports/SPDK_tcp_perf_report_2104.pdf



For posix socket, rw_percent=0(randwrite), it has 1.9%~8.3% performance boost
tested with target 1~40 cpu cores and qdepth=128,256,512. And it has no obvious
influence when read percentage is greater than 50%.

For uring socket, rw_percent=0(randwrite), it has 1.8%~7.9% performance boost
tested with target 1~40 cpu cores and qdepth=128,256,512. And it still has
1%~7% improvement when read percentage is greater than 50%.

The following is part of the detailed data.

posix:
qdepth=128
rw_percent      0             |           30
cpu  origin  thisPatch  opt   | origin  thisPatch opt
1	286.5	298.5	4.19%		 307	304.15	-0.93%
4	1042.5	1107	6.19%		1135.5	1136	0.04%
8	1952.5	2058	5.40%		2170.5	2170.5	0.00%
12	2658.5	2879	8.29%		3042	3046	0.13%
16	3247.5	3460.5	6.56%		3793.5	3775	-0.49%
24	4232.5	4459.5	5.36%		4614.5	4756.5	3.08%
32	4810	5095	5.93%		4488	4845	7.95%
40	5306.5	5435	2.42%		4427.5	4902	10.72%

qdepth=512
rw_percent      0             |           30
cpu  origin  thisPatch  opt   | origin  thisPatch opt
1    275	 287	4.36%		294.4	295.45	0.36%
4	 979	1041	6.33%		1073	1083.5	0.98%
8	1822.5	1914.5	5.05%		2030.5	2018.5	-0.59%
12	2441	2598.5	6.45%		2808.5	2779.5	-1.03%
16	2920.5	3109.5	6.47%		3455	3411.5	-1.26%
24	3709	3972.5	7.10%		4483.5	4502.5	0.42%
32	4225.5	4532.5	7.27%		4463.5	4733	6.04%
40	4790.5	4884.5	1.96%		4427	4904.5	10.79%

uring:
qdepth=128
rw_percent      0             |           30
cpu  origin  thisPatch  opt   | origin  thisPatch opt
1	270.5	287.5	6.28%		295.75	304.75	3.04%
4	1018.5	1089.5	6.97%		1119.5	1156.5	3.31%
8	1907	2055	7.76%		2127	2211.5	3.97%
12	2614	2801	7.15%		2982.5	3061.5	2.65%
16	3169.5	3420	7.90%		3654.5	3781.5	3.48%
24	4109.5	4414	7.41%		4691.5	4750.5	1.26%
32	4752.5	4908	3.27%		4494	4825.5	7.38%
40	5233.5	5327	1.79%		4374.5	4891	11.81%

qdepth=512
rw_percent      0             |           30
cpu  origin  thisPatch  opt   | origin  thisPatch opt
1	259.95	 276	6.17%		286.65	294.8	2.84%
4	955 	1021	6.91%		1070.5	1100	2.76%
8	1772	1903.5	7.42%		1992.5	2077.5	4.27%
12	2380.5	2543.5	6.85%		2752.5	2860	3.91%
16	2920.5	3099	6.11%		3391.5	3540	4.38%
24	3697	3912	5.82%		4401	4637	5.36%
32	4256.5	4454.5	4.65%		4516	4777	5.78%
40	4707	4968.5	5.56%		4400.5	4933	12.10%

Signed-off-by: default avatarRichael Zhuang <richael.zhuang@arm.com>
Change-Id: I730dcf89ed2bf3efe91586421a89045fc11c81f0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12210


Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
parent eef6af95
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -108,6 +108,17 @@ bdevs is one after another. The concat bdev is extendable. When the free space o
concat bdev is not enough, the user can deconstruct the concat bdev, then reconstruct it
with an additional underlying bdev.

### sock

Allow the MSG_ZEROCOPY flag to be set or cleared according to the size of the data to be
sent. This behavior is controlled by the new "zerocopy_threshold" option: zerocopy_threshold = 0
disables the feature; zerocopy_threshold > 0 enables it and uses that value as the threshold.

### rpc

Introduced the `zerocopy_threshold` sock option to enable zerocopy on send according to
the size of the data to be flushed (applies to both client and server sockets).

## v22.01

### accel
+9 −0
Original line number Diff line number Diff line
@@ -75,6 +75,9 @@ struct spdk_sock_request {
		void				*curr_list;
#endif
		uint32_t			offset;

		/* Indicate if the whole req or part of it is sent with zerocopy */
		bool				is_zcopy;
	} internal;

	int				iovcnt;
@@ -139,6 +142,12 @@ struct spdk_sock_impl_opts {
	 * Enable or disable use of zero copy flow on send for client sockets. Used by posix socket module.
	 */
	bool enable_zerocopy_send_client;

	/**
	 * Set zerocopy threshold in bytes. A consecutive sequence of requests' iovecs that fall below this
	 * threshold may be sent without zerocopy flag set.
	 */
	uint32_t zerocopy_threshold;
};

/**
+14 −2
Original line number Diff line number Diff line
@@ -66,6 +66,7 @@ struct spdk_sock {
	int				cb_cnt;
	spdk_sock_cb			cb_fn;
	void				*cb_arg;
	uint32_t			zerocopy_threshold;
	struct {
		uint8_t		closed		: 1;
		uint8_t		reserved	: 7;
@@ -174,6 +175,7 @@ spdk_sock_request_put(struct spdk_sock *sock, struct spdk_sock_request *req, int
#endif

	req->internal.offset = 0;
	req->internal.is_zcopy = 0;

	closed = sock->flags.closed;
	sock->cb_cnt++;
@@ -245,11 +247,12 @@ spdk_sock_abort_requests(struct spdk_sock *sock)

static inline int
spdk_sock_prep_reqs(struct spdk_sock *_sock, struct iovec *iovs, int index,
		    struct spdk_sock_request **last_req)
		    struct spdk_sock_request **last_req, int *flags)
{
	int iovcnt, i;
	struct spdk_sock_request *req;
	unsigned int offset;
	uint64_t total = 0;

	/* Gather an iov */
	iovcnt = index;
@@ -275,8 +278,9 @@ spdk_sock_prep_reqs(struct spdk_sock *_sock, struct iovec *iovs, int index,

			iovs[iovcnt].iov_base = SPDK_SOCK_REQUEST_IOV(req, i)->iov_base + offset;
			iovs[iovcnt].iov_len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
			iovcnt++;

			total += iovs[iovcnt].iov_len;
			iovcnt++;
			offset = 0;

			if (iovcnt >= IOV_BATCH_SIZE) {
@@ -294,6 +298,14 @@ spdk_sock_prep_reqs(struct spdk_sock *_sock, struct iovec *iovs, int index,
	}

end:

#if defined(MSG_ZEROCOPY)
	/* if data size < zerocopy_threshold, remove MSG_ZEROCOPY flag */
	if (total < _sock->zerocopy_threshold && flags != NULL) {
		*flags = *flags & (~MSG_ZEROCOPY);
	}
#endif

	return iovcnt;
}

+13 −0
Original line number Diff line number Diff line
@@ -320,6 +320,8 @@ spdk_sock_connect_ext(const char *ip, int port, char *_impl_name, struct spdk_so
	struct spdk_sock *sock;
	struct spdk_sock_opts opts_local;
	const char *impl_name = NULL;
	struct spdk_sock_impl_opts impl_opts = {};
	size_t len;

	if (opts == NULL) {
		SPDK_ERRLOG("the opts should not be NULL pointer\n");
@@ -346,6 +348,10 @@ spdk_sock_connect_ext(const char *ip, int port, char *_impl_name, struct spdk_so
			sock->net_impl = impl;
			TAILQ_INIT(&sock->queued_reqs);
			TAILQ_INIT(&sock->pending_reqs);

			len = sizeof(struct spdk_sock_impl_opts);
			spdk_sock_impl_get_opts(impl->name, &impl_opts, &len);
			sock->zerocopy_threshold = impl_opts.zerocopy_threshold;
			return sock;
		}
	}
@@ -407,6 +413,8 @@ struct spdk_sock *
spdk_sock_accept(struct spdk_sock *sock)
{
	struct spdk_sock *new_sock;
	struct spdk_sock_impl_opts impl_opts = {};
	size_t len;

	new_sock = sock->net_impl->accept(sock);
	if (new_sock != NULL) {
@@ -416,6 +424,10 @@ spdk_sock_accept(struct spdk_sock *sock)
		new_sock->net_impl = sock->net_impl;
		TAILQ_INIT(&new_sock->queued_reqs);
		TAILQ_INIT(&new_sock->pending_reqs);

		len = sizeof(struct spdk_sock_impl_opts);
		spdk_sock_impl_get_opts(sock->net_impl->name, &impl_opts, &len);
		new_sock->zerocopy_threshold = impl_opts.zerocopy_threshold;
	}

	return new_sock;
@@ -850,6 +862,7 @@ spdk_sock_write_config_json(struct spdk_json_write_ctx *w)
			spdk_json_write_named_uint32(w, "enable_placement_id", opts.enable_placement_id);
			spdk_json_write_named_bool(w, "enable_zerocopy_send_server", opts.enable_zerocopy_send_server);
			spdk_json_write_named_bool(w, "enable_zerocopy_send_client", opts.enable_zerocopy_send_client);
			spdk_json_write_named_uint32(w, "zerocopy_threshold", opts.zerocopy_threshold);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		} else {
+5 −0
Original line number Diff line number Diff line
@@ -79,6 +79,7 @@ rpc_sock_impl_get_options(struct spdk_jsonrpc_request *request,
	spdk_json_write_named_uint32(w, "enable_placement_id", sock_opts.enable_placement_id);
	spdk_json_write_named_bool(w, "enable_zerocopy_send_server", sock_opts.enable_zerocopy_send_server);
	spdk_json_write_named_bool(w, "enable_zerocopy_send_client", sock_opts.enable_zerocopy_send_client);
	spdk_json_write_named_uint32(w, "zerocopy_threshold", sock_opts.zerocopy_threshold);
	spdk_json_write_object_end(w);
	spdk_jsonrpc_end_result(request, w);
	free(impl_name);
@@ -123,6 +124,10 @@ static const struct spdk_json_object_decoder rpc_sock_impl_set_opts_decoders[] =
	{
		"enable_zerocopy_send_client", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_zerocopy_send_client),
		spdk_json_decode_bool, true
	},
	{
		"zerocopy_threshold", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.zerocopy_threshold),
		spdk_json_decode_uint32, true
	}
};

Loading