Commit 8e8a5f7c authored by Or Gerlitz, committed by Tomasz Zawadzki
Browse files

nvme/tcp: Use writev_async for sending data on sockets



Amortize the writev syscall cost by using the writev_async socket API.

This allows the socket layer to batch writes into one system call
and also apply further optimizations, such as the posix socket
module's MSG_ZEROCOPY support, when they are available. As part of
doing so we remove the error return in the socket layer writev_async
implementation for sockets that don't have a poll group.

Doing so eliminates the send queue processing.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Change-Id: I5432ae322afaff7b96c22269fc06b75f9ae60b81
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/475420


Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
parent e61b0904
Loading
Loading
Loading
Loading
+21 −69
Original line number Diff line number Diff line
@@ -290,81 +290,23 @@ nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
	return 0;
}

static int
nvme_tcp_qpair_process_send_queue(struct nvme_tcp_qpair *tqpair)
static void
_pdu_write_done(void *cb_arg, int err)
{
	const int array_size = 32;
	struct iovec iovs[array_size];
	int iovcnt = 0;
	int bytes = 0;
	uint32_t mapped_length;
	struct nvme_tcp_pdu *pdu;
	int pdu_length;
	TAILQ_HEAD(, nvme_tcp_pdu) completed_pdus_list;

	pdu = TAILQ_FIRST(&tqpair->send_queue);

	if (pdu == NULL) {
		return 0;
	}
	struct nvme_tcp_pdu *pdu = cb_arg;
	struct nvme_tcp_qpair *tqpair = pdu->qpair;

	/*
	 * Build up a list of iovecs for the first few PDUs in the
	 *  tqpair 's send_queue.
	 */
	while (pdu != NULL && ((array_size - iovcnt) >= (2 + (int)pdu->data_iovcnt))) {
		iovcnt += nvme_tcp_build_iovs(&iovs[iovcnt], array_size - iovcnt,
					      pdu, tqpair->host_hdgst_enable,
					      tqpair->host_ddgst_enable, &mapped_length);
		pdu = TAILQ_NEXT(pdu, tailq);
	}

	bytes = spdk_sock_writev(tqpair->sock, iovs, iovcnt);
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "bytes=%d are out\n", bytes);
	if (bytes == -1) {
		if (errno == EWOULDBLOCK || errno == EAGAIN) {
			return 1;
		} else {
			SPDK_ERRLOG("spdk_sock_writev() failed, errno %d: %s\n",
				    errno, spdk_strerror(errno));
			return -errno;
		}
	}

	pdu = TAILQ_FIRST(&tqpair->send_queue);

	/*
	 * Free any PDUs that were fully written.  If a PDU was only
	 *  partially written, update its writev_offset so that next
	 *  time only the unwritten portion will be sent to writev().
	 */
	TAILQ_INIT(&completed_pdus_list);
	while (bytes > 0) {
		pdu_length = pdu->hdr->common.plen - pdu->writev_offset;
		assert(pdu_length > 0);
		if (bytes >= pdu_length) {
			bytes -= pdu_length;
	TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
			TAILQ_INSERT_TAIL(&completed_pdus_list, pdu, tailq);
			pdu = TAILQ_FIRST(&tqpair->send_queue);

		} else {
			pdu->writev_offset += bytes;
			bytes = 0;
		}
	if (err != 0) {
		nvme_tcp_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair);
		return;
	}

	while (!TAILQ_EMPTY(&completed_pdus_list)) {
		pdu = TAILQ_FIRST(&completed_pdus_list);
		TAILQ_REMOVE(&completed_pdus_list, pdu, tailq);
	assert(pdu->cb_fn != NULL);
	pdu->cb_fn(pdu->cb_arg);
}

	return TAILQ_EMPTY(&tqpair->send_queue) ? 0 : 1;

}

static int
nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
			 struct nvme_tcp_pdu *pdu,
@@ -374,6 +316,7 @@ nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
	int enable_digest;
	int hlen;
	uint32_t crc32c;
	uint32_t mapped_length = 0;

	hlen = pdu->hdr->common.hlen;
	enable_digest = 1;
@@ -397,7 +340,16 @@ nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,

	pdu->cb_fn = cb_fn;
	pdu->cb_arg = cb_arg;

	pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu,
			       tqpair->host_hdgst_enable, tqpair->host_ddgst_enable,
			       &mapped_length);
	pdu->qpair = tqpair;
	pdu->sock_req.cb_fn = _pdu_write_done;
	pdu->sock_req.cb_arg = pdu;
	TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);
	spdk_sock_writev_async(tqpair->sock, &pdu->sock_req);

	return 0;
}

@@ -1437,7 +1389,7 @@ nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_c
	uint32_t reaped;
	int rc;

	rc = nvme_tcp_qpair_process_send_queue(tqpair);
	rc = spdk_sock_flush(tqpair->sock);
	if (rc < 0) {
		return rc;
	}
+0 −5
Original line number Diff line number Diff line
@@ -596,11 +596,6 @@ spdk_posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *r

	spdk_sock_request_queue(sock, req);

	if (sock->group_impl == NULL) {
		spdk_sock_request_put(sock, req, -ENOTSUP);
		return;
	}

	/* If there are a sufficient number queued, just flush them out immediately. */
	if (sock->queued_iovcnt >= IOV_BATCH_SIZE) {
		rc = _sock_flush(sock);