Commit a1f20e78 authored by Maciej Szulik, committed by Tomasz Zawadzki
Browse files

sock: fix premature completion of zcopy req



Fixes #3610.

Consider a case when zero-copy with some threshold is enabled.
For some large requests we can run into calling sendmsg multiple times.
If the last chunk was sent without MSG_ZEROCOPY (the threshold was not
met), but some previous chunks were sent with it, then we don't wait for
the zero-copy completion from the kernel and complete the socket request
prematurely.

This can lead, for example, to data integrity issues when the upper
layer reuses buffers soon enough.

To fix it, we need to keep the zcopy flag active until the notification is
received. The notification may arrive several sendmsg calls later, so cache
the index of the last zcopy chunk for a given req in the new zcopy_idx field
(note this new field breaks ABI).

However, such solution creates a new bug, consider a case:
1. Upper layer calls spdk_sock_flush, we have some chunk of a request
   sent with zcopy.
2. We call flush again, in the meantime zcopy notification arrived, but
   we don't check it, because the request is not on the pending list.
3. We call sendmsg with the last chunk of the request, without zcopy.
   The req could be completed now, but we ignored the notification,
   so we can't, because pending_zcopy flag is still active.

To fix it, clear the flag in check_zcopy for the first queued request
(the only request that can be partially sent) by comparing its cached
zcopy_idx with the last notification index (a partially sent request must
hold the last index). This way, if the next flush sends the last chunk,
the request can be completed.

Add two new unit tests for these cases.

Signed-off-by: Maciej Szulik <maciej.szulik@intel.com>
Change-Id: I6d8b96e8b9d49a860a48ddaebce4ee62437a20ba
Reviewed-on: https://review.spdk.io/c/spdk/spdk/+/25645


Reviewed-by: Jacek Kalwas <jacek.kalwas@nutanix.com>
Tested-by: SPDK Automated Test System <spdkbot@gmail.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Konrad Sztyber <ksztyber@nvidia.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
parent 21fa6a73
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -67,6 +67,11 @@ in milliseconds.

Add `spdk_reduce_vol_get_info()` to get the information for the compressed volume.

### sock

A new internal field was added to `spdk_sock_request`, increasing its size and moving `iovcnt` and
user provided `iov[]` offsets by 8 bytes, which breaks ABI.

### thread

Added `spdk_interrupt_register_ext()` API which can receive `spdk_event_handler_opts` structure.
+7 −6
Original line number Diff line number Diff line
@@ -44,9 +44,7 @@ struct spdk_sock_request {
	void	(*cb_fn)(void *cb_arg, int err);
	void				*cb_arg;

	/**
	 * These fields are used by the socket layer and should not be modified
	 */
	/* These fields are used by the socket layer and should not be modified. */
	struct __sock_request_internal {
		TAILQ_ENTRY(spdk_sock_request)	link;

@@ -59,15 +57,18 @@ struct spdk_sock_request {

		uint32_t			offset;

		/* Indicate if the whole req or part of it is sent with zerocopy */
		bool				is_zcopy;
		/* Last zero-copy sendmsg index. */
		uint32_t			zcopy_idx;

		/* Indicate if the whole req or part of it is pending zerocopy completion. */
		bool pending_zcopy;
	} internal;

	int				iovcnt;
	/* struct iovec			iov[]; */
};

SPDK_STATIC_ASSERT(sizeof(struct spdk_sock_request) == 56, "Incorrect size.");
SPDK_STATIC_ASSERT(sizeof(struct spdk_sock_request) == 64, "Incorrect size.");

#define SPDK_SOCK_REQUEST_IOV(req, i) ((struct iovec *)(((uint8_t *)req + sizeof(struct spdk_sock_request)) + (sizeof(struct iovec) * i)))

+2 −1
Original line number Diff line number Diff line
@@ -180,7 +180,8 @@ spdk_sock_request_complete(struct spdk_sock *sock, struct spdk_sock_request *req

	spdk_trace_record(TRACE_SOCK_REQ_COMPLETE, 0, 0, (uintptr_t)req, (uintptr_t)req->cb_arg);
	req->internal.offset = 0;
	req->internal.is_zcopy = 0;
	req->internal.zcopy_idx = 0;
	req->internal.pending_zcopy = false;

	closed = sock->flags.closed;
	sock->cb_cnt++;
+24 −9
Original line number Diff line number Diff line
@@ -1172,13 +1172,14 @@ _sock_check_zcopy(struct spdk_sock *sock)
		while (true) {
			found = false;
			TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) {
				if (!req->internal.is_zcopy) {
					/* This wasn't a zcopy request. It was just waiting in line to complete */
				if (!req->internal.pending_zcopy) {
					/* This wasn't a zcopy request. It was just waiting in line
					 * to complete. */
					rc = spdk_sock_request_put(sock, req, 0);
					if (rc < 0) {
						return rc;
					}
				} else if (req->internal.offset == idx) {
				} else if (req->internal.zcopy_idx == idx) {
					found = true;
					rc = spdk_sock_request_put(sock, req, 0);
					if (rc < 0) {
@@ -1195,6 +1196,17 @@ _sock_check_zcopy(struct spdk_sock *sock)

			idx++;
		}

		/* If the req is sent partially (still queued) and we just received its zcopy
		 * notification, next chunk may be sent without zcopy and should result in the req
		 * completion if it is the last chunk. Clear the pending flag to allow it.
		 * Checking the first queued req and the last index is enough, because only one req
		 * can be partially sent and it is the last one we can get notification for. */
		req = TAILQ_FIRST(&sock->queued_reqs);
		if (req && req->internal.pending_zcopy &&
		    req->internal.zcopy_idx == serr->ee_data) {
			req->internal.pending_zcopy = false;
		}
	}

	return 0;
@@ -1268,8 +1280,12 @@ _sock_flush(struct spdk_sock *sock)
	while (req) {
		offset = req->internal.offset;

		/* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */
		req->internal.is_zcopy = is_zcopy;
		if (is_zcopy) {
			/* Cache sendmsg_idx because full request might not be handled and next
			 * chunk may be sent without zero copy. */
			req->internal.pending_zcopy = true;
			req->internal.zcopy_idx = psock->sendmsg_idx;
		}

		for (i = 0; i < req->iovcnt; i++) {
			/* Advance by the offset first */
@@ -1295,16 +1311,15 @@ _sock_flush(struct spdk_sock *sock)
		/* Handled a full request. */
		spdk_sock_request_pend(sock, req);

		if (!req->internal.is_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) {
		/* We can't put the req if zero-copy is not completed or it is not first
		 * in the line. */
		if (!req->internal.pending_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) {
			/* The sendmsg syscall above isn't currently asynchronous,
			* so it's already done. */
			retval = spdk_sock_request_put(sock, req, 0);
			if (retval) {
				break;
			}
		} else {
			/* Re-use the offset field to hold the sendmsg call index. */
			req->internal.offset = psock->sendmsg_idx;
		}

		if (rc == 0) {
+23 −9
Original line number Diff line number Diff line
@@ -969,8 +969,12 @@ sock_complete_write_reqs(struct spdk_sock *_sock, ssize_t rc, bool is_zcopy)
	/* Consume the requests that were actually written */
	req = TAILQ_FIRST(&_sock->queued_reqs);
	while (req) {
		/* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */
		req->internal.is_zcopy = is_zcopy;
		if (is_zcopy) {
			/* Cache sendmsg_idx because full request might not be handled and next
			 * chunk may be sent without zero copy. */
			req->internal.pending_zcopy = true;
			req->internal.zcopy_idx = sock->sendmsg_idx;
		}

		rc = sock_request_advance_offset(req, rc);
		if (rc < 0) {
@@ -981,14 +985,12 @@ sock_complete_write_reqs(struct spdk_sock *_sock, ssize_t rc, bool is_zcopy)
		/* Handled a full request. */
		spdk_sock_request_pend(_sock, req);

		if (!req->internal.is_zcopy && req == TAILQ_FIRST(&_sock->pending_reqs)) {
		if (!req->internal.pending_zcopy &&
		    req == TAILQ_FIRST(&_sock->pending_reqs)) {
			retval = spdk_sock_request_put(_sock, req, 0);
			if (retval) {
				return retval;
			}
		} else {
			/* Re-use the offset field to hold the sendmsg call index. */
			req->internal.offset = sock->sendmsg_idx;
		}

		if (rc == 0) {
@@ -1048,13 +1050,14 @@ _sock_check_zcopy(struct spdk_sock *_sock, int status)
	while (true) {
		found = false;
		TAILQ_FOREACH_SAFE(req, &_sock->pending_reqs, internal.link, treq) {
			if (!req->internal.is_zcopy) {
				/* This wasn't a zcopy request. It was just waiting in line to complete */
			if (!req->internal.pending_zcopy) {
				/* This wasn't a zcopy request. It was just waiting in line
				 * to complete. */
				rc = spdk_sock_request_put(_sock, req, 0);
				if (rc < 0) {
					return rc;
				}
			} else if (req->internal.offset == idx) {
			} else if (req->internal.zcopy_idx == idx) {
				found = true;
				rc = spdk_sock_request_put(_sock, req, 0);
				if (rc < 0) {
@@ -1072,6 +1075,17 @@ _sock_check_zcopy(struct spdk_sock *_sock, int status)
		idx++;
	}

	/* If the req is sent partially (still queued) and we just received its zcopy
	 * notification, next chunk may be sent without zcopy and should result in the req
	 * completion if it is the last chunk. Clear the pending flag to allow it.
	 * Checking the first queued req and the last index is enough, because only one req
	 * can be partially sent and it is the last one we can get notification for. */
	req = TAILQ_FIRST(&_sock->queued_reqs);
	if (req && req->internal.pending_zcopy &&
	    req->internal.zcopy_idx == serr->ee_data) {
		req->internal.pending_zcopy = false;
	}

	return 0;
}

Loading