Commit 8126509c authored by Seth Howell's avatar Seth Howell Committed by Ben Walker
Browse files

rdma: replace improperly aligned buffers in requests.



It is a very rare thing for a buffer to be split over two memory
regions. In fact, it is only possible in dpdk versions where
--match-allocations is not passed as a startup parameter to dpdk but
dynamic memory allocation is enabled.

By adding a small helper function, we avoid failing an I/O because it
was assigned one of these improperly aligned buffers. Also, we try to
remove the buffer from circulation so that it doesn't get picked up
again by another request.

Also, add a unit test to catch this case.

Change-Id: Ia09865c2f77160a960571665b29c4533b11758ae
Signed-off-by: default avatarSeth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/467446


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: default avatarPaul Luse <paul.e.luse@intel.com>
parent 98233769
Loading
Loading
Loading
Loading
+54 −6
Original line number Diff line number Diff line
@@ -452,9 +452,17 @@ struct spdk_nvmf_rdma_poll_group_stat {
struct spdk_nvmf_rdma_poll_group {
	struct spdk_nvmf_transport_poll_group		group;

	/* Requests that are waiting to obtain a data buffer */

	TAILQ_HEAD(, spdk_nvmf_rdma_poller)		pollers;

	struct spdk_nvmf_rdma_poll_group_stat		stat;

	/*
	 * buffers which are split across multiple RDMA
	 * memory regions cannot be used by this transport.
	 */
	STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf)	retired_bufs;
};

/* Assuming rdma_cm uses just one protection domain per ibv_context. */
@@ -1456,6 +1464,34 @@ nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
	return 0;
}

/* This function is used in the rare case that we have a buffer split over multiple memory regions. */
static int
nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
{
	struct spdk_nvmf_transport_poll_group	*group = &rgroup->group;
	struct spdk_nvmf_transport		*transport = group->transport;
	struct spdk_nvmf_transport_pg_cache_buf	*old_buf;
	void					*new_buf;

	if (!(STAILQ_EMPTY(&group->buf_cache))) {
		group->buf_cache_count--;
		new_buf = STAILQ_FIRST(&group->buf_cache);
		STAILQ_REMOVE_HEAD(&group->buf_cache, link);
		assert(*buf != NULL);
	} else {
		new_buf = spdk_mempool_get(transport->data_buf_pool);
	}

	if (*buf == NULL) {
		return -ENOMEM;
	}

	old_buf = *buf;
	STAILQ_INSERT_HEAD(&rgroup->retired_bufs, old_buf, link);
	*buf = new_buf;
	return 0;
}

static int
nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
		       struct spdk_nvmf_rdma_poll_group *rgroup,
@@ -1483,9 +1519,13 @@ nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
							(uint64_t)req->iov[req->iovcnt].iov_base, &translation_len);
		}

		if (translation_len < req->iov[req->iovcnt].iov_len) {
			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n");
			return -EINVAL;
		/* This is a very rare case that can occur when using DPDK version < 19.05 */
		if (spdk_unlikely(translation_len < req->iov[req->iovcnt].iov_len)) {
			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
			if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[req->iovcnt]) == -ENOMEM) {
				return -ENOMEM;
			}
			continue;
		}

		length -= req->iov[req->iovcnt].iov_len;
@@ -2930,6 +2970,7 @@ spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
	}

	TAILQ_INIT(&rgroup->pollers);
	STAILQ_INIT(&rgroup->retired_bufs);

	pthread_mutex_lock(&rtransport->lock);
	TAILQ_FOREACH(device, &rtransport->devices, link) {
@@ -3012,6 +3053,7 @@ spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
	struct spdk_nvmf_rdma_poll_group	*rgroup;
	struct spdk_nvmf_rdma_poller		*poller, *tmp;
	struct spdk_nvmf_rdma_qpair		*qpair, *tmp_qpair;
	struct spdk_nvmf_transport_pg_cache_buf	*buf, *tmp_buf;

	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);

@@ -3019,6 +3061,12 @@ spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
		return;
	}

	/* free all retired buffers back to the transport so we don't short the mempool. */
	STAILQ_FOREACH_SAFE(buf, &rgroup->retired_bufs, link, tmp_buf) {
		STAILQ_REMOVE(&rgroup->retired_bufs, buf, spdk_nvmf_transport_pg_cache_buf, link);
		spdk_mempool_put(group->transport->data_buf_pool, buf);
	}

	TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
		TAILQ_REMOVE(&rgroup->pollers, poller, link);

+39 −3
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@
#include "nvmf/rdma.c"

uint64_t g_mr_size;
uint64_t g_mr_next_size;
struct ibv_mr g_rdma_mr;

#define RDMA_UT_UNITS_IN_MAX_IO 16
@@ -135,6 +136,9 @@ spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t
{
	if (g_mr_size != 0) {
		*(uint32_t *)size = g_mr_size;
		if (g_mr_next_size != 0) {
			g_mr_size = g_mr_next_size;
		}
	}

	return (uint64_t)&g_rdma_mr;
@@ -177,12 +181,16 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
	struct spdk_nvmf_transport_pg_cache_buf bufs[4];
	struct spdk_nvme_sgl_descriptor sgl_desc[SPDK_NVMF_MAX_SGL_ENTRIES] = {{0}};
	struct spdk_nvmf_rdma_request_data data;
	struct spdk_nvmf_transport_pg_cache_buf	buffer;
	struct spdk_nvmf_transport_pg_cache_buf	*buffer_ptr;
	int rc, i;

	data.wr.sg_list = data.sgl;
	STAILQ_INIT(&group.group.buf_cache);
	group.group.buf_cache_size = 0;
	group.group.buf_cache_count = 0;
	group.group.transport = &rtransport.transport;
	STAILQ_INIT(&group.retired_bufs);
	poller.group = &group;
	rqpair.poller = &poller;
	rqpair.max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
@@ -268,7 +276,6 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
	CU_ASSERT(rdma_req.data.wr.sg_list[0].length == 0);
	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == 0);


	rdma_req.recv->buf = (void *)0xDDDD;
	/* Test 2: sgl type: keyed data block subtype: offset (in capsule data) */
	sgl->generic.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
@@ -408,7 +415,6 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
	}

	/* part 1: use the four buffers from the pg cache */

	group.group.buf_cache_size = 4;
	group.group.buf_cache_count = 4;
	MOCK_SET(spdk_mempool_get, (void *)0x2000);
@@ -432,8 +438,8 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
				~NVMF_DATA_BUFFER_MASK));
		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size);
	}
	/* part 2: now that we have used the buffers from the cache, try again. We should get mempool buffers. */

	/* part 2: now that we have used the buffers from the cache, try again. We should get mempool buffers. */
	reset_nvmf_rdma_request(&rdma_req);
	rc = spdk_nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req);

@@ -482,6 +488,36 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000);
		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size);
	}

	reset_nvmf_rdma_request(&rdma_req);
	/* Test 5 dealing with a buffer split over two Memory Regions */
	MOCK_SET(spdk_mempool_get, (void *)&buffer);
	sgl->generic.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
	sgl->keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
	sgl->keyed.length = rtransport.transport.opts.io_unit_size / 2;
	g_mr_size = rtransport.transport.opts.io_unit_size / 4;
	g_mr_next_size = rtransport.transport.opts.io_unit_size / 2;

	rc = spdk_nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req);
	SPDK_CU_ASSERT_FATAL(rc == 0);
	CU_ASSERT(rdma_req.req.data_from_pool == true);
	CU_ASSERT(rdma_req.req.length == rtransport.transport.opts.io_unit_size / 2);
	CU_ASSERT((uint64_t)rdma_req.req.data == (((uint64_t)&buffer + NVMF_DATA_BUFFER_MASK) &
			~NVMF_DATA_BUFFER_MASK));
	CU_ASSERT(rdma_req.data.wr.num_sge == 1);
	CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE);
	CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF);
	CU_ASSERT(rdma_req.req.buffers[0] == &buffer);
	CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == (((uint64_t)&buffer + NVMF_DATA_BUFFER_MASK) &
			~NVMF_DATA_BUFFER_MASK));
	CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rtransport.transport.opts.io_unit_size / 2);
	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == g_rdma_mr.lkey);
	buffer_ptr = STAILQ_FIRST(&group.retired_bufs);
	CU_ASSERT(buffer_ptr == &buffer);
	STAILQ_REMOVE(&group.retired_bufs, buffer_ptr, spdk_nvmf_transport_pg_cache_buf, link);
	CU_ASSERT(STAILQ_EMPTY(&group.retired_bufs));

	reset_nvmf_rdma_request(&rdma_req);
}

static struct spdk_nvmf_rdma_recv *