Commit d5073677 authored by Ziye Yang's avatar Ziye Yang Committed by Jim Harris
Browse files

nvmf/tcp: Use a big buffer for PDU receiving.



Purpose: Reduce the number of recv/readv system calls.
Method: Use a big recv buffer to conduct the read.
Although this introduces an additional buffer copy,
we expect the overhead of that copy to be smaller than
the overhead of frequent recv/readv system calls.
The design is a trade-off between the two.

Signed-off-by: Ziye Yang <ziye.yang@intel.com>
Change-Id: I9286fd9cec0b512cea8e3f2c335c5bf862b98573
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/464842


Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Broadcom SPDK FC-NVMe CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
parent ea5ad0b2
Loading
Loading
Loading
Loading
+117 −31
Original line number Diff line number Diff line
@@ -55,6 +55,7 @@
#define NVMF_TCP_PDU_MAX_C2H_DATA_SIZE	131072
#define NVMF_TCP_QPAIR_MAX_C2H_PDU_NUM  64  /* Maximal c2h_data pdu number for each tqpair */
#define SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY 6
/*
 * Multiplier used when sizing a qpair's PDU receive buffer: the buffer holds
 * this many times (in-capsule data size + command header + two digests).
 */
#define SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR 4

/* spdk nvmf related structure */
enum spdk_nvmf_tcp_req_state {
@@ -198,6 +199,13 @@ struct spdk_nvmf_tcp_req {
	TAILQ_ENTRY(spdk_nvmf_tcp_req)		state_link;
};

/*
 * Staging buffer for bytes read from the socket in bulk. PDU headers and
 * payload are subsequently copied out of it piece by piece.
 */
struct nvme_tcp_pdu_recv_buf {
	/* Heap-allocated backing storage */
	char					*buf;
	/* Offset into buf of the next unconsumed byte; reset to 0 once the buffer is drained */
	uint32_t				off;
	/* Number of bytes requested from the socket on each refill */
	uint32_t				size;
	/* Bytes received into buf that have not been consumed yet */
	uint32_t				remain_size;
};

struct spdk_nvmf_tcp_qpair {
	struct spdk_nvmf_qpair			qpair;
	struct spdk_nvmf_tcp_poll_group		*group;
@@ -209,6 +217,7 @@ struct spdk_nvmf_tcp_qpair {
	enum nvme_tcp_qpair_state		state;

	struct nvme_tcp_pdu			pdu_in_progress;
	struct nvme_tcp_pdu_recv_buf		pdu_recv_buf;

	TAILQ_HEAD(, nvme_tcp_pdu)		send_queue;
	TAILQ_HEAD(, nvme_tcp_pdu)		free_queue;
@@ -500,6 +509,7 @@ spdk_nvmf_tcp_qpair_destroy(struct spdk_nvmf_tcp_qpair *tqpair)
	free(tqpair->reqs);
	spdk_free(tqpair->buf);
	spdk_free(tqpair->bufs);
	free(tqpair->pdu_recv_buf.buf);
	free(tqpair);
	SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Leave\n");
}
@@ -982,6 +992,14 @@ spdk_nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair, uint16
			TAILQ_INSERT_TAIL(&tqpair->free_queue, &tqpair->pdu[i], tailq);
		}

		tqpair->pdu_recv_buf.size = (in_capsule_data_size + sizeof(struct spdk_nvme_tcp_cmd) + 2 *
					     SPDK_NVME_TCP_DIGEST_LEN) * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
		tqpair->pdu_recv_buf.buf = calloc(1, tqpair->pdu_recv_buf.size);
		if (!tqpair->pdu_recv_buf.buf) {
			SPDK_ERRLOG("Unable to allocate the pdu recv buf on tqpair=%p with size=%d\n", tqpair,
				    tqpair->pdu_recv_buf.size);
			return -1;
		}
	} else {
		tqpair->reqs = calloc(size, sizeof(*tqpair->reqs));
		if (!tqpair->reqs) {
@@ -1707,7 +1725,13 @@ spdk_nvmf_tcp_icreq_handle(struct spdk_nvmf_tcp_transport *ttransport,
	SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "maxr2t =%u\n", (ic_req->maxr2t + 1u));

	tqpair->host_hdgst_enable = ic_req->dgst.bits.hdgst_enable ? true : false;
	if (!tqpair->host_hdgst_enable) {
		tqpair->pdu_recv_buf.size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
	}
	tqpair->host_ddgst_enable = ic_req->dgst.bits.ddgst_enable ? true : false;
	if (!tqpair->host_ddgst_enable) {
		tqpair->pdu_recv_buf.size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
	}

	tqpair->cpda = spdk_min(ic_req->hpda, SPDK_NVME_TCP_CPDA_MAX);
	SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "cpda of tqpair=(%p) is : %u\n", tqpair, tqpair->cpda);
@@ -1902,7 +1926,65 @@ nvmf_tcp_pdu_payload_insert_dif(struct nvme_tcp_pdu *pdu, uint32_t read_offset,
	return rc;
}

#define MAX_NVME_TCP_PDU_LOOP_COUNT 32
/*
 * Refill a fully drained receive buffer from the socket.
 *
 * On entry the buffer must be empty (off == 0 and remain_size == 0). Up to
 * pdu_recv_buf->size bytes are requested through nvme_tcp_read_data().
 *
 * Returns whatever nvme_tcp_read_data() returned:
 *   > 0  that many bytes are now available and recorded in remain_size
 *   == 0 nothing was read; the buffer is left untouched
 *   < 0  the read failed; a debug message is logged and the value is passed on
 */
static int
nvme_tcp_recv_buf_read(struct spdk_sock *sock, struct nvme_tcp_pdu_recv_buf *pdu_recv_buf)
{
	int nbytes;

	assert(pdu_recv_buf->off == 0);
	assert(pdu_recv_buf->remain_size == 0);

	nbytes = nvme_tcp_read_data(sock, pdu_recv_buf->size, pdu_recv_buf->buf);
	if (nbytes > 0) {
		/* Fresh data arrived: publish how much can be consumed. */
		pdu_recv_buf->remain_size = nbytes;
		spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, 0, nbytes, 0, 0);
		return nbytes;
	}

	if (nbytes < 0) {
		SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "will disconnect sock=%p\n", sock);
	}

	return nbytes;
}

/*
 * Copy buffered bytes out of the receive buffer into dst.
 *
 * At most expected_size bytes are copied; fewer are copied when the buffer
 * holds less than that. The buffer's read offset and remaining byte count are
 * advanced accordingly, and the offset is rewound to 0 once the buffer is
 * completely drained so that it can be refilled from the start.
 *
 * The buffer must contain at least one unconsumed byte on entry.
 *
 * Returns the number of bytes actually copied.
 */
static uint32_t
nvme_tcp_read_data_from_pdu_recv_buf(struct nvme_tcp_pdu_recv_buf *pdu_recv_buf,
				     uint32_t expected_size,
				     char *dst)
{
	uint32_t size;

	assert(pdu_recv_buf->remain_size > 0);
	size = spdk_min(expected_size, pdu_recv_buf->remain_size);
	/*
	 * buf is a char *, so offset it directly. Casting to void * first and
	 * then adding relies on a GNU extension (arithmetic on void *).
	 */
	memcpy(dst, pdu_recv_buf->buf + pdu_recv_buf->off, size);
	pdu_recv_buf->off += size;
	pdu_recv_buf->remain_size -= size;
	if (spdk_unlikely(!pdu_recv_buf->remain_size)) {
		/* Drained: the next refill writes from the beginning of buf. */
		pdu_recv_buf->off = 0;
	}

	return size;
}

/*
 * Move buffered bytes from the receive buffer into the PDU's payload iovecs.
 *
 * The iovec list is produced by nvme_tcp_build_payload_iovs() for this PDU
 * (with the PDU's ddgst_enable flag). Each entry is filled in order until
 * either every entry has been visited or the receive buffer runs dry.
 *
 * The buffer must contain at least one unconsumed byte on entry.
 *
 * Returns the total number of bytes copied out of the receive buffer.
 */
static int
nvme_tcp_read_payload_data_from_pdu_recv_buf(struct nvme_tcp_pdu_recv_buf *pdu_recv_buf,
		struct nvme_tcp_pdu *pdu)
{
	struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS + 1];
	uint32_t copied = 0;
	int iovcnt;
	int idx = 0;

	assert(pdu_recv_buf->remain_size > 0);
	iovcnt = nvme_tcp_build_payload_iovs(iov, NVME_TCP_MAX_SGL_DESCRIPTORS + 1, pdu,
					     pdu->ddgst_enable, NULL);
	assert(iovcnt >= 0);

	/* Stop as soon as the staged data is exhausted, even if iovecs remain. */
	while (idx < iovcnt && pdu_recv_buf->remain_size != 0) {
		copied += nvme_tcp_read_data_from_pdu_recv_buf(pdu_recv_buf, iov[idx].iov_len,
				iov[idx].iov_base);
		idx++;
	}

	return copied;
}

static int
spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
@@ -1910,7 +1992,7 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
	int rc = 0;
	struct nvme_tcp_pdu *pdu;
	enum nvme_tcp_pdu_recv_state prev_state;
	uint32_t data_len, current_pdu_num = 0;
	uint32_t data_len;

	/* The loop here is to allow for several back-to-back state changes. */
	do {
@@ -1922,19 +2004,19 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
		/* Wait for the common header  */
		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
			rc = nvme_tcp_read_data(tqpair->sock,
			if (!tqpair->pdu_recv_buf.remain_size) {
				rc = nvme_tcp_recv_buf_read(tqpair->sock, &tqpair->pdu_recv_buf);
				if (rc <= 0) {
					return rc;
				}
			}
			rc = nvme_tcp_read_data_from_pdu_recv_buf(&tqpair->pdu_recv_buf,
					sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes,
					(void *)&pdu->hdr->common + pdu->ch_valid_bytes);
			if (rc < 0) {
				SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "will disconnect tqpair=%p\n", tqpair);
				return NVME_TCP_PDU_FATAL;
			} else if (rc > 0) {
			pdu->ch_valid_bytes += rc;
				spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, 0, rc, 0, 0);
			if (spdk_likely(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY)) {
				spdk_nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
			}
			}

			if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) {
				return NVME_TCP_PDU_IN_PROGRESS;
@@ -1945,25 +2027,23 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
			break;
		/* Wait for the pdu specific header  */
		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
			rc = nvme_tcp_read_data(tqpair->sock,
			if (!tqpair->pdu_recv_buf.remain_size) {
				rc = nvme_tcp_recv_buf_read(tqpair->sock, &tqpair->pdu_recv_buf);
				if (rc <= 0) {
					return rc;
				}
			}

			rc = nvme_tcp_read_data_from_pdu_recv_buf(&tqpair->pdu_recv_buf,
					pdu->psh_len - pdu->psh_valid_bytes,
					(void *)&pdu->hdr->raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
			if (rc < 0) {
				return NVME_TCP_PDU_FATAL;
			} else if (rc > 0) {
				spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE,
						  0, rc, 0, 0);
			pdu->psh_valid_bytes += rc;
			}
			if (pdu->psh_valid_bytes < pdu->psh_len) {
				return NVME_TCP_PDU_IN_PROGRESS;
			}

			/* All headers (ch, psh, header digest) of this PDU have now been read from the socket. */
			spdk_nvmf_tcp_pdu_psh_handle(tqpair);
			if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) {
				current_pdu_num++;
			}
			break;
		case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
			/* check whether the data is valid, if not we just return */
@@ -1979,11 +2059,18 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
				pdu->ddgst_enable = true;
			}

			if (tqpair->pdu_recv_buf.remain_size) {
				rc = nvme_tcp_read_payload_data_from_pdu_recv_buf(&tqpair->pdu_recv_buf, pdu);
				pdu->readv_offset += rc;
			}

			if (pdu->readv_offset < data_len) {
				rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
				if (rc < 0) {
					return NVME_TCP_PDU_IN_PROGRESS;
				}
				pdu->readv_offset += rc;
			}

			if (spdk_unlikely(pdu->dif_ctx != NULL)) {
				rc = nvmf_tcp_pdu_payload_insert_dif(pdu, pdu->readv_offset - rc, rc);
@@ -1998,7 +2085,6 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)

			/* All of this PDU has now been read from the socket. */
			spdk_nvmf_tcp_pdu_payload_handle(tqpair);
			current_pdu_num++;
			break;
		case NVME_TCP_PDU_RECV_STATE_ERROR:
			/* Check whether the connection is closed. Only 1 byte is read on each attempt. */
@@ -2012,7 +2098,7 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
			SPDK_ERRLOG("code should not come to here");
			break;
		}
	} while ((tqpair->recv_state != prev_state) && (current_pdu_num < MAX_NVME_TCP_PDU_LOOP_COUNT));
	} while (tqpair->recv_state != prev_state);

	return rc;
}