Commit b5f360c4 authored by Shuhei Matsumoto's avatar Shuhei Matsumoto Committed by Tomasz Zawadzki
Browse files

rdma: Maintain per device PD which is persistent across reconnect



SPDK NVMe RDMA initiator used the default PD per RDMA device. Default PD
may be changed when all QPs for the RDMA device are destroyed and created
again.

For multipath, the RDMA zero copy feature requires the PD per RDMA device
to be persistent when all QPs for the RDMA device are destroyed and
created again.

Maintain such persistent PDs in this patch.

Add two APIs, spdk_rdma_get_pd() and spdk_rdma_put_pd().

On each call to either of the two APIs, synchronize the RDMA device list
with rdma_get_devices().

A context may be deleted at any time by rdma-core. To avoid such deletion,
hold the array returned by rdma_get_devices().

Each RDMA device has a PD, a context, a ref. count, and a removed flag. If
a context is missing from the array returned by rdma_get_devices(), set the
removed flag to true. Then, when the ref. count becomes zero, free the PD
and the RDMA device.

The ref. count of an RDMA device is incremented when spdk_rdma_get_pd()
is called and decremented when spdk_rdma_put_pd() is called.

To simplify synchronization, sort the array returned by
rdma_get_devices().

To avoid resource leakage, add destructor function and free all PDs
and related data at termination.

Signed-off-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I093cb4ec2c7d8432642edfbffa270797ccf3e715
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13769


Community-CI: Mellanox Build Bot
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
parent 3403be99
Loading
Loading
Loading
Loading
+16 −0
Original line number Diff line number Diff line
@@ -273,4 +273,20 @@ spdk_rdma_memory_translation_get_rkey(struct spdk_rdma_memory_translation
	       translation->mr_or_key.mr->rkey : (uint32_t)translation->mr_or_key.key;
}

/**
 * Get a Protection Domain for an RDMA device context.
 *
 * The Protection Domain is maintained per RDMA device and is reference
 * counted: every successful call takes one reference on the device, which
 * is dropped again by spdk_rdma_put_pd().
 *
 * \param context RDMA device context
 * \return Pointer to the Protection Domain of the device, or NULL if the
 * device list could not be synchronized or no usable device matches \c context
 */
struct ibv_pd *
spdk_rdma_get_pd(struct ibv_context *context);

/**
 * Return a Protection Domain.
 *
 * Drops one reference on the RDMA device that owns the Protection Domain.
 * The Protection Domain is intended to be one previously obtained from
 * spdk_rdma_get_pd().
 *
 * \param pd Pointer to the Protection Domain
 */
void spdk_rdma_put_pd(struct ibv_pd *pd);

#endif /* SPDK_RDMA_H */
+1 −1
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk

SO_VER := 4
SO_MINOR := 0
SO_MINOR := 1

SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_rdma.map)

+218 −0
Original line number Diff line number Diff line
@@ -13,6 +13,14 @@
#include "spdk_internal/rdma.h"
#include "spdk_internal/assert.h"

/*
 * Bookkeeping entry for one RDMA device, linked into g_dev_list.
 * It ties a device context to the protection domain handed out for it.
 */
struct spdk_rdma_device {
	/* Protection domain allocated with ibv_alloc_pd() when the entry is created. */
	struct ibv_pd				*pd;
	/* Device context the protection domain was allocated on. */
	struct ibv_context			*context;
	/* Incremented by spdk_rdma_get_pd(), decremented by spdk_rdma_put_pd(). */
	int					ref;
	/* Set once the context is no longer reported by rdma_get_devices(). */
	bool					removed;
	/* Linkage for g_dev_list. */
	TAILQ_ENTRY(spdk_rdma_device)		tailq;
};

struct spdk_rdma_mem_map {
	struct spdk_mem_map		*map;
	struct ibv_pd			*pd;
@@ -22,6 +30,10 @@ struct spdk_rdma_mem_map {
	LIST_ENTRY(spdk_rdma_mem_map) link;
};

/* Held by spdk_rdma_get_pd() and spdk_rdma_put_pd() while they touch g_ctx_list and g_dev_list. */
static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Most recent array returned by rdma_get_devices(), kept sorted by address; NULL until the first sync. */
static struct ibv_context **g_ctx_list = NULL;
/* One spdk_rdma_device entry per known context, including removed entries that are still referenced. */
static TAILQ_HEAD(, spdk_rdma_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

/* List of spdk_rdma_mem_map objects. NOTE(review): presumably guarded by g_rdma_mr_maps_mutex; its users are outside this view. */
static LIST_HEAD(, spdk_rdma_mem_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

@@ -350,3 +362,209 @@ spdk_rdma_qp_flush_recv_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_recv_w

	return rc;
}

/*
 * Create a device entry for the given context, allocate a protection domain
 * on it and append the entry to g_dev_list.
 *
 * The new entry starts with a zero reference count and the removed flag
 * cleared (the object is zero-initialized by calloc()).
 *
 * Returns the new entry, or NULL if either allocation fails; nothing is
 * added to g_dev_list in that case.
 */
static struct spdk_rdma_device *
rdma_add_dev(struct ibv_context *context)
{
	struct spdk_rdma_device *entry = calloc(1, sizeof(*entry));

	if (!entry) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	entry->context = context;
	entry->pd = ibv_alloc_pd(context);
	if (entry->pd != NULL) {
		/* Only fully initialized entries become visible in the list. */
		TAILQ_INSERT_TAIL(&g_dev_list, entry, tailq);
		return entry;
	}

	SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
	free(entry);
	return NULL;
}

/*
 * Release a device entry, but only once it has been flagged as removed and
 * nobody holds a reference to it any more. In every other case the call is
 * a no-op, so callers may invoke it unconditionally after changing either
 * the removed flag or the reference count.
 */
static void
rdma_remove_dev(struct spdk_rdma_device *dev)
{
	if (dev->removed && dev->ref <= 0) {
		/* Unlink the entry, then release its protection domain and memory. */
		TAILQ_REMOVE(&g_dev_list, dev, tailq);
		ibv_dealloc_pd(dev->pd);
		free(dev);
	}
}

/*
 * qsort() comparator that orders struct ibv_context pointers by address.
 *
 * _c1 and _c2 each point to an array element of type struct ibv_context *.
 * Returns -1, 0 or 1 when the first pointer is lower than, equal to, or
 * higher than the second one.
 */
static int
ctx_cmp(const void *_c1, const void *_c2)
{
	/* The comparator only reads the elements, so keep them const-qualified. */
	const struct ibv_context *c1 = *(struct ibv_context *const *)_c1;
	const struct ibv_context *c2 = *(struct ibv_context *const *)_c2;

	return (c1 > c2) - (c1 < c2);
}

static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL terminated array of opened RDMA devices,
	 * and sets num_devs to the number of the returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by addresses to update devices easily.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * or if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct spdk_rdma_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}

/*
 * Look up the protection domain kept for the given device context and take
 * a reference on its device entry.
 *
 * The device list is synchronized with rdma_get_devices() first. Entries
 * already flagged as removed are never handed out.
 *
 * Returns the protection domain, or NULL if the synchronization fails or no
 * matching live device entry exists.
 */
struct ibv_pd *
spdk_rdma_get_pd(struct ibv_context *context)
{
	struct spdk_rdma_device *dev;
	struct ibv_pd *pd = NULL;

	pthread_mutex_lock(&g_dev_mutex);

	if (rdma_sync_dev_list() != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->removed || dev->context != context) {
			continue;
		}

		dev->ref++;
		pd = dev->pd;
		break;
	}

	pthread_mutex_unlock(&g_dev_mutex);

	/* dev is NULL here only when the loop ran to the end without a match. */
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to get PD\n");
		return NULL;
	}

	return pd;
}

/*
 * Drop one reference on the device entry that owns the given protection
 * domain. If that entry was already flagged as removed and this was its last
 * reference, it is released. Afterwards the device list is synchronized with
 * rdma_get_devices(); the result of that synchronization is not checked.
 */
void
spdk_rdma_put_pd(struct ibv_pd *pd)
{
	struct spdk_rdma_device *dev, *next;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, next) {
		if (dev->pd != pd) {
			continue;
		}

		assert(dev->ref > 0);
		dev->ref -= 1;

		/* No-op unless the entry is flagged as removed and now unreferenced. */
		rdma_remove_dev(dev);
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}

__attribute__((destructor)) static void
_rdma_fini(void)
{
	struct spdk_rdma_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}
+2 −0
Original line number Diff line number Diff line
@@ -18,6 +18,8 @@
	spdk_rdma_get_translation;
	spdk_rdma_qp_queue_recv_wrs;
	spdk_rdma_qp_flush_recv_wrs;
	spdk_rdma_get_pd;
	spdk_rdma_put_pd;

	local: *;
};
+1 −0
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ DIRS-$(CONFIG_REDUCE) += reduce
ifeq ($(OS),Linux)
DIRS-$(CONFIG_VHOST) += vhost
DIRS-y += ftl
DIRS-$(CONFIG_RDMA) += rdma
endif

.PHONY: all clean $(DIRS-y)
Loading