Commit 01f2d0bd authored by Alexey Marchuk, committed by Tomasz Zawadzki

nvmf/rdma: Use RDMA provider memory translation

Signed-off-by: Alexey Marchuk <alexeymar@mellanox.com>
Change-Id: I78420118d3ec1bf78c0301a51ab320503b6f2fca
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/5123
Community-CI: Broadcom CI
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
parent b6efb964
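
For context on the change itself: the transport previously owned a per-device spdk_mem_map whose notify callbacks registered memory with ibv_reg_mr(); this commit swaps that for the shared spdk_rdma_* memory-map helpers, so the NVMe-oF target uses the common RDMA provider translation path. A minimal sketch of the new flow, using only functions that appear in this diff (error handling trimmed; not a complete program):

	/* Once per device: build the map. g_nvmf_hooks may supply a custom rkey provider. */
	device->map = spdk_rdma_create_mem_map(device->pd, &g_nvmf_hooks);

	/* Per I/O vector: translate the virtual address range, then pull the lkey for the SGE. */
	struct spdk_rdma_memory_translation mem_translation;
	int rc = spdk_rdma_get_translation(device->map, iov->iov_base, iov->iov_len,
					   &mem_translation);
	if (rc == 0) {
		sg_ele->lkey = spdk_rdma_memory_translation_get_lkey(&mem_translation);
	}

	/* Teardown; presumably NULL-safe, given that the NULL guard is dropped in this diff. */
	spdk_rdma_free_mem_map(&device->map);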
+9 −86
@@ -462,7 +462,7 @@ struct spdk_nvmf_rdma_device {
 	struct ibv_device_attr			attr;
 	struct ibv_context			*context;
 
-	struct spdk_mem_map			*map;
+	struct spdk_rdma_mem_map		*map;
 	struct ibv_pd				*pd;
 
 	int					num_srq;
@@ -1331,56 +1331,6 @@ nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *e
 	return 0;
 }
 
-static int
-nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
-		     enum spdk_mem_map_notify_action action,
-		     void *vaddr, size_t size)
-{
-	struct ibv_pd *pd = cb_ctx;
-	struct ibv_mr *mr;
-	int rc;
-
-	switch (action) {
-	case SPDK_MEM_MAP_NOTIFY_REGISTER:
-		if (!g_nvmf_hooks.get_rkey) {
-			mr = ibv_reg_mr(pd, vaddr, size,
-					IBV_ACCESS_LOCAL_WRITE |
-					IBV_ACCESS_REMOTE_READ |
-					IBV_ACCESS_REMOTE_WRITE);
-			if (mr == NULL) {
-				SPDK_ERRLOG("ibv_reg_mr() failed\n");
-				return -1;
-			} else {
-				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
-			}
-		} else {
-			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
-							  g_nvmf_hooks.get_rkey(pd, vaddr, size));
-		}
-		break;
-	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
-		if (!g_nvmf_hooks.get_rkey) {
-			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
-			if (mr) {
-				ibv_dereg_mr(mr);
-			}
-		}
-		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
-		break;
-	default:
-		SPDK_UNREACHABLE();
-	}
-
-	return rc;
-}
-
-static int
-nvmf_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
-{
-	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
-	return addr_1 == addr_2;
-}
-
 static inline void
 nvmf_rdma_setup_wr(struct ibv_send_wr *wr, struct ibv_send_wr *next,
 		   enum spdk_nvme_data_transfer xfer)
@@ -1490,31 +1440,6 @@ nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
 	return 0;
 }
 
-static bool
-nvmf_rdma_get_lkey(struct spdk_nvmf_rdma_device *device, struct iovec *iov,
-		   uint32_t *_lkey)
-{
-	uint64_t	translation_len;
-	uint32_t	lkey;
-
-	translation_len = iov->iov_len;
-
-	if (!g_nvmf_hooks.get_rkey) {
-		lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
-				(uint64_t)iov->iov_base, &translation_len))->lkey;
-	} else {
-		lkey = spdk_mem_map_translate(device->map,
-					      (uint64_t)iov->iov_base, &translation_len);
-	}
-
-	if (spdk_unlikely(translation_len < iov->iov_len)) {
-		return false;
-	}
-
-	*_lkey = lkey;
-	return true;
-}
-
 static bool
 nvmf_rdma_fill_wr_sge(struct spdk_nvmf_rdma_device *device,
 		      struct iovec *iov, struct ibv_send_wr **_wr,
@@ -1524,13 +1449,18 @@ nvmf_rdma_fill_wr_sge(struct spdk_nvmf_rdma_device *device,
 {
 	struct ibv_send_wr *wr = *_wr;
 	struct ibv_sge	*sg_ele = &wr->sg_list[wr->num_sge];
+	struct spdk_rdma_memory_translation mem_translation;
+	int		rc;
 	uint32_t	lkey = 0;
 	uint32_t	remaining, data_block_size, md_size, sge_len;
 
-	if (spdk_unlikely(!nvmf_rdma_get_lkey(device, iov, &lkey))) {
+	rc = spdk_rdma_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation);
+	if (spdk_unlikely(rc)) {
 		return false;
 	}
 
+	lkey = spdk_rdma_memory_translation_get_lkey(&mem_translation);
+
 	if (spdk_likely(!dif_ctx)) {
 		sg_ele->lkey = lkey;
 		sg_ele->addr = (uintptr_t)(iov->iov_base);
@@ -2285,11 +2215,6 @@ nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
 	opts->transport_specific =      NULL;
 }
 
-const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = {
-	.notify_cb = nvmf_rdma_mem_notify,
-	.are_contiguous = nvmf_rdma_check_contiguous_entries
-};
-
 static int nvmf_rdma_destroy(struct spdk_nvmf_transport *transport,
 			     spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg);
 
@@ -2513,7 +2438,7 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
 
 		assert(device->map == NULL);
 
-		device->map = spdk_mem_map_alloc(0, &g_nvmf_rdma_map_ops, device->pd);
+		device->map = spdk_rdma_create_mem_map(device->pd, &g_nvmf_hooks);
 		if (!device->map) {
 			SPDK_ERRLOG("Unable to allocate memory map for listen address\n");
 			rc = -ENOMEM;
@@ -2604,9 +2529,7 @@ nvmf_rdma_destroy(struct spdk_nvmf_transport *transport,
 
 	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
 		TAILQ_REMOVE(&rtransport->devices, device, link);
-		if (device->map) {
-			spdk_mem_map_free(&device->map);
-		}
+		spdk_rdma_free_mem_map(&device->map);
 		if (device->pd) {
 			if (!g_nvmf_hooks.get_ibv_pd) {
 				ibv_dealloc_pd(device->pd);
+4 −0
@@ -56,6 +56,7 @@ DEFINE_STUB_V(spdk_rdma_free_mem_map, (struct spdk_rdma_mem_map **map));
 
 /* used to mock out having to split an SGL over a memory region */
 size_t g_mr_size;
+uint64_t g_mr_next_size;
 struct ibv_mr g_rdma_mr = {
 	.addr = (void *)0xC0FFEE,
 	.lkey = RDMA_UT_LKEY,
@@ -72,6 +73,9 @@ spdk_rdma_get_translation(struct spdk_rdma_mem_map *map, void *address,
 	HANDLE_RETURN_MOCK(spdk_rdma_get_translation);
 
 	if (g_mr_size && length > g_mr_size) {
+		if (g_mr_next_size) {
+			g_mr_size = g_mr_next_size;
+		}
 		return -ERANGE;
 	}
 
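The reworked stub above lets a unit test simulate a buffer split across two memory regions: while g_mr_size is non-zero, any translation longer than g_mr_size fails with -ERANGE, and a non-zero g_mr_next_size becomes the new limit for subsequent calls, so the retry with a replacement buffer can succeed. A hypothetical fragment showing how a test might drive these knobs (the sizes here are illustrative, not taken from the tests):

	/* First translation of a full-size buffer fails once with -ERANGE ... */
	g_mr_size = 0x800;
	/* ... after which the limit grows, so the replacement buffer translates fine. */
	g_mr_next_size = 0x10000;

	/* Run the SGL-parsing code under test here, then reset the knobs
	 * so later tests see an unlimited memory region. */
	g_mr_size = 0;
	g_mr_next_size = 0;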
+17 −38
@@ -38,10 +38,6 @@
 #include "nvmf/rdma.c"
 #include "nvmf/transport.c"
 
-uint64_t g_mr_size;
-uint64_t g_mr_next_size;
-struct ibv_mr g_rdma_mr;
-
 #define RDMA_UT_UNITS_IN_MAX_IO 16
 
 struct spdk_nvmf_transport_opts g_rdma_ut_transport_opts = {
@@ -126,19 +122,6 @@ spdk_nvme_transport_id_populate_trstring(struct spdk_nvme_transport_id *trid, co
 	return 0;
 }
 
-uint64_t
-spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
-{
-	if (g_mr_size != 0) {
-		*(uint32_t *)size = g_mr_size;
-		if (g_mr_next_size != 0) {
-			g_mr_size = g_mr_next_size;
-		}
-	}
-
-	return (uint64_t)&g_rdma_mr;
-}
-
 static void reset_nvmf_rdma_request(struct spdk_nvmf_rdma_request *rdma_req)
 {
 	int i;
@@ -205,7 +188,6 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 	rtransport.transport.data_buf_pool = NULL;
 
 	device.attr.device_cap_flags = 0;
-	g_rdma_mr.lkey = 0xABCD;
 	sgl->keyed.key = 0xEEEE;
 	sgl->address = 0xFFFF;
 	rdma_req.recv->buf = (void *)0xDDDD;
@@ -231,7 +213,7 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 	CU_ASSERT((uint64_t)rdma_req.req.buffers[0] == 0x2000);
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == 0x2000);
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rtransport.transport.opts.io_unit_size / 2);
-	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == RDMA_UT_LKEY);
 
 	/* Part 2: simple I/O, one SGL larger than the transport io unit size (equal to the max io size) */
 	reset_nvmf_rdma_request(&rdma_req);
@@ -248,7 +230,7 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 		CU_ASSERT((uint64_t)rdma_req.req.buffers[i] == 0x2000);
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000);
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size);
-		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == g_rdma_mr.lkey);
+		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
 	}
 
 	/* Part 3: simple I/O one SGL larger than the transport max io size */
@@ -404,7 +386,6 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 	sgl->keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
 	sgl->address = 0xFFFF;
 	rdma_req.recv->buf = (void *)0xDDDD;
-	g_rdma_mr.lkey = 0xABCD;
 	sgl->keyed.key = 0xEEEE;
 
 	for (i = 0; i < 4; i++) {
@@ -508,7 +489,7 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == (((uint64_t)&buffer + NVMF_DATA_BUFFER_MASK) &
 			~NVMF_DATA_BUFFER_MASK));
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rtransport.transport.opts.io_unit_size / 2);
-	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == RDMA_UT_LKEY);
 	buffer_ptr = STAILQ_FIRST(&group.retired_bufs);
 	CU_ASSERT(buffer_ptr == &buffer);
 	STAILQ_REMOVE(&group.retired_bufs, buffer_ptr, spdk_nvmf_transport_pg_cache_buf, link);
@@ -637,7 +618,6 @@ test_spdk_nvmf_rdma_request_process(void)
 
 	device.attr.device_cap_flags = 0;
 	device.map = (void *)0x0;
-	g_rdma_mr.lkey = 0xABCD;
 
 	/* Test 1: single SGL READ request */
 	rdma_recv = create_recv(&rqpair, SPDK_NVME_OPC_READ);
@@ -923,7 +903,6 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
 
 	device.attr.device_cap_flags = 0;
 	device.map = NULL;
-	g_rdma_mr.lkey = 0xABCD;
 	sgl->keyed.key = 0xEEEE;
 	sgl->address = 0xFFFF;
 	rdma_req.recv->buf = (void *)0xDDDD;
@@ -958,7 +937,7 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
 	for (i = 0; i < 4; ++i) {
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size));
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
-		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == g_rdma_mr.lkey);
+		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
 	}
 
 	/* Part 2: simple I/O, one SGL equal to io unit size, io_unit_size is not aligned with md_size,
@@ -988,16 +967,16 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
 	for (i = 0; i < 3; ++i) {
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size));
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
-		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == g_rdma_mr.lkey);
+		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
 	}
 	CU_ASSERT(rdma_req.data.wr.sg_list[3].addr == 0x2000 + 3 * (data_bs + md_size));
 	CU_ASSERT(rdma_req.data.wr.sg_list[3].length == 488);
-	CU_ASSERT(rdma_req.data.wr.sg_list[3].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[3].lkey == RDMA_UT_LKEY);
 
 	/* 2nd buffer consumed */
 	CU_ASSERT(rdma_req.data.wr.sg_list[4].addr == 0x2000);
 	CU_ASSERT(rdma_req.data.wr.sg_list[4].length == 24);
-	CU_ASSERT(rdma_req.data.wr.sg_list[4].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[4].lkey == RDMA_UT_LKEY);
 
 	/* Part 3: simple I/O, one SGL equal io unit size, io_unit_size is equal to block size 512 bytes */
 	MOCK_SET(spdk_mempool_get, (void *)0x2000);
@@ -1024,7 +1003,7 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
 
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == 0x2000);
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].length == data_bs);
-	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == RDMA_UT_LKEY);
 
 	CU_ASSERT(rdma_req.req.iovcnt == 2);
 	CU_ASSERT(rdma_req.req.iov[0].iov_base == (void *)((unsigned long)0x2000));
@@ -1060,7 +1039,7 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
 	for (i = 0; i < 4; ++i) {
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size));
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
-		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == g_rdma_mr.lkey);
+		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
 	}
 
 	/* Part 5: simple I/O, one SGL equal to 2x io unit size, io_unit_size is aligned with md_size,
@@ -1123,24 +1102,24 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
 	for (i = 0; i < 3; ++i) {
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size));
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
-		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == g_rdma_mr.lkey);
+		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
 	}
 	CU_ASSERT(rdma_req.data.wr.sg_list[3].addr == 0x2000 + 3 * (data_bs + md_size));
 	CU_ASSERT(rdma_req.data.wr.sg_list[3].length == 488);
-	CU_ASSERT(rdma_req.data.wr.sg_list[3].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[3].lkey == RDMA_UT_LKEY);
 
 	/* 2nd IO buffer consumed */
 	CU_ASSERT(rdma_req.data.wr.sg_list[4].addr == 0x2000);
 	CU_ASSERT(rdma_req.data.wr.sg_list[4].length == 24);
-	CU_ASSERT(rdma_req.data.wr.sg_list[4].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[4].lkey == RDMA_UT_LKEY);
 
 	CU_ASSERT(rdma_req.data.wr.sg_list[5].addr == 0x2000 + 24 + md_size);
 	CU_ASSERT(rdma_req.data.wr.sg_list[5].length == 512);
-	CU_ASSERT(rdma_req.data.wr.sg_list[5].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[5].lkey == RDMA_UT_LKEY);
 
 	CU_ASSERT(rdma_req.data.wr.sg_list[6].addr == 0x2000 + 24 + 512 + md_size * 2);
 	CU_ASSERT(rdma_req.data.wr.sg_list[6].length == 512);
-	CU_ASSERT(rdma_req.data.wr.sg_list[6].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[6].lkey == RDMA_UT_LKEY);
 
 	/* Part 7: simple I/O, number of SGL entries exceeds the number of entries
 	   one WR can hold. Additional WR is chained */
@@ -1197,13 +1176,13 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
 
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == 0x2000);
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].length == 512);
-	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == RDMA_UT_LKEY);
 
 	/* 2nd IO buffer consumed, offset 4 bytes due to part of the metadata
 	  is located at the beginning of that buffer */
 	CU_ASSERT(rdma_req.data.wr.sg_list[1].addr == 0x2000 + 4);
 	CU_ASSERT(rdma_req.data.wr.sg_list[1].length == 512);
-	CU_ASSERT(rdma_req.data.wr.sg_list[1].lkey == g_rdma_mr.lkey);
+	CU_ASSERT(rdma_req.data.wr.sg_list[1].lkey == RDMA_UT_LKEY);
 
 	/* Test 9 dealing with a buffer split over two Memory Regions */
 	MOCK_SET(spdk_mempool_get, (void *)&buffer);
@@ -1231,7 +1210,7 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == (uint64_t)rdma_req.req.data + i *
 			  (data_bs + md_size));
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
-		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == g_rdma_mr.lkey);
+		CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
 	}
 	buffer_ptr = STAILQ_FIRST(&group.retired_bufs);
 	CU_ASSERT(buffer_ptr == &buffer);