Commit ba9853b9 authored by Jim Harris

Revert "env: Register external memory with DPDK"



This reverts commit aaac4888.

The reverted patch caused issues with SPDK vhost memory mappings
when handling larger numbers of VMs.

Fixes issue #1901.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I81bd311d26037dcb9340d85abcb4ea45b20a5170
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/7424
Community-CI: Broadcom CI
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
parent 476b6661
+0 −12
@@ -98,19 +98,7 @@ spdk_realloc(void *buf, size_t size, size_t align)
void
spdk_free(void *buf)
{
	struct spdk_mem_region *region, *tmp;

	rte_free(buf);
	pthread_mutex_lock(&g_spdk_mem_region_mutex);
	g_spdk_mem_do_not_notify = true;
	/* Perform the memory unregistrations previously postponed in memory_hotplug_cb() */
	TAILQ_FOREACH_SAFE(region, &g_spdk_mem_regions, tailq, tmp) {
		spdk_mem_unregister(region->addr, region->len);
		TAILQ_REMOVE(&g_spdk_mem_regions, region, tailq);
		free(region);
	}
	g_spdk_mem_do_not_notify = false;
	pthread_mutex_unlock(&g_spdk_mem_region_mutex);
}

void *
+0 −12
@@ -50,18 +50,6 @@
#error RTE_VERSION is too old! Minimum 19.11 is required.
#endif

extern __thread bool g_spdk_mem_do_not_notify;
extern struct spdk_mem_region_head g_spdk_mem_regions;
extern pthread_mutex_t g_spdk_mem_region_mutex;

struct spdk_mem_region {
	void *addr;
	size_t len;
	TAILQ_ENTRY(spdk_mem_region) tailq;
};

TAILQ_HEAD(spdk_mem_region_head, spdk_mem_region);

/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47],
 * which is enough to cover 256 TB.
 */
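
For reference, a minimal sketch of the 48-bit address check this comment backs; the
MASK_256TB definition below is an assumption for illustration, not copied from this
header:

#include <stdbool.h>
#include <stdint.h>

/* Assumed definition: low 48 bits set, covering 256 TB. */
#define MASK_256TB ((1ULL << 48) - 1)

static bool
is_valid_usermode_vaddr(const void *vaddr)
{
	/* Any bit set above bit 47 puts the address outside the 256 TB
	 * userspace range that the map code can represent. */
	return ((uintptr_t)vaddr & ~MASK_256TB) == 0;
}
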
+160 −103
@@ -146,10 +146,7 @@ struct spdk_mem_map {
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
struct spdk_mem_region_head g_spdk_mem_regions = TAILQ_HEAD_INITIALIZER(g_spdk_mem_regions);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t g_spdk_mem_region_mutex = PTHREAD_MUTEX_INITIALIZER;
__thread bool g_spdk_mem_do_not_notify;

static bool g_legacy_mem;

@@ -735,19 +732,7 @@ memory_hotplug_cb(enum rte_mem_event event_type,
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		/* We need to postpone the spdk_mem_unregister() call here to avoid
		 * a deadlock on the SPDK mutex (g_spdk_mem_map_mutex) and the DPDK
		 * mutex (memory_hotplug_lock), which may happen when one thread is
		 * calling spdk_free() and another is calling vhost_session_mem_unregister(). */
		struct spdk_mem_region *region;

		region = calloc(1, sizeof(*region));
		assert(region != NULL);
		region->addr = (void *)addr;
		region->len = len;
		pthread_mutex_lock(&g_spdk_mem_region_mutex);
		TAILQ_INSERT_TAIL(&g_spdk_mem_regions, region, tailq);
		pthread_mutex_unlock(&g_spdk_mem_region_mutex);
		spdk_mem_unregister((void *)addr, len);
	}
}
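
The deferral removed above existed to break a lock-ordering inversion: memory_hotplug_cb()
runs with DPDK's memory_hotplug_lock held and must not take g_spdk_mem_map_mutex, which
another thread (e.g. in vhost_session_mem_unregister()) may already hold while waiting on
the DPDK lock. A minimal sketch of that deferred-unregister pattern, with illustrative
names rather than the actual SPDK implementation:

#include <pthread.h>
#include <stddef.h>
#include <stdlib.h>
#include <sys/queue.h>

struct pending_region {
	void *addr;
	size_t len;
	TAILQ_ENTRY(pending_region) tailq;
};

static TAILQ_HEAD(, pending_region) g_pending = TAILQ_HEAD_INITIALIZER(g_pending);
static pthread_mutex_t g_pending_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Called from the allocator callback, with the allocator's hotplug
 * lock held: only queue the work, take no map locks here. */
static void
defer_unregister(void *addr, size_t len)
{
	struct pending_region *r = calloc(1, sizeof(*r));

	if (r == NULL) {
		return; /* illustrative only; real code must handle ENOMEM */
	}
	r->addr = addr;
	r->len = len;
	pthread_mutex_lock(&g_pending_mutex);
	TAILQ_INSERT_TAIL(&g_pending, r, tailq);
	pthread_mutex_unlock(&g_pending_mutex);
}

/* Called later (e.g. from spdk_free()), after the allocator lock has
 * been dropped: now it is safe to take the memory map mutex. */
static void
drain_deferred_unregisters(void (*unregister_fn)(void *addr, size_t len))
{
	struct pending_region *r;

	pthread_mutex_lock(&g_pending_mutex);
	while ((r = TAILQ_FIRST(&g_pending)) != NULL) {
		TAILQ_REMOVE(&g_pending, r, tailq);
		unregister_fn(r->addr, r->len);
		free(r);
	}
	pthread_mutex_unlock(&g_pending_mutex);
}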

@@ -800,6 +785,126 @@ static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
static struct spdk_mem_map *g_vtophys_map;
static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
	}

	dma_map->map.argsz = sizeof(dma_map->map);
	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	dma_map->map.vaddr = vaddr;
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
		 * operations on that container. This memory will be mapped once
		 * the first device (IOMMU group) is hotplugged.
		 *
		 * Since the vfio container is managed internally by DPDK, it is
		 * also possible that some device is already in that container, but
		 * it's not managed by SPDK, e.g. a NIC attached internally
		 * inside DPDK. We could map the memory straight away in such
		 * a scenario, but there's no need to do so. DPDK devices clearly
		 * don't need our mappings and hence we defer the mapping
		 * unconditionally until the first SPDK-managed device is
		 * hotplugged.
		 */
		goto out_insert;
	}

	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
	if (ret) {
		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		free(dma_map);
		return ret;
	}

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;
	struct vfio_iommu_type1_dma_unmap unmap = {};

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.iova == iova) {
			break;
		}
	}

	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
		pthread_mutex_unlock(&g_vfio.mutex);
		return -ENXIO;
	}

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
	}

	/* If we still have outstanding references, don't clear the mapping. */
	if (refcount > 1) {
		pthread_mutex_unlock(&g_vfio.mutex);
		return 0;
	}

	/* Partial or multiple-page unmaps are not supported for now. */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove its references. */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
		pthread_mutex_unlock(&g_vfio.mutex);
		return ret;
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
}
#endif
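
Together, these two helpers reference-count each IOVA on top of the raw VFIO ioctls:
only the first map of an IOVA issues VFIO_IOMMU_MAP_DMA (or queues the entry while
device_ref is 0), and only the last unmap issues VFIO_IOMMU_UNMAP_DMA. A hypothetical
call sequence for a single 2 MB page (addresses invented for illustration):

static void
example_refcounted_mapping(void)
{
	uint64_t vaddr = 0x7f0000000000, iova = 0x7f0000000000;

	vtophys_iommu_map_dma(vaddr, iova, VALUE_2MB);   /* refcount 0 -> 1: VFIO_IOMMU_MAP_DMA issued (or deferred) */
	vtophys_iommu_map_dma(vaddr, iova, VALUE_2MB);   /* refcount 1 -> 2: bookkeeping only, no ioctl */
	vtophys_iommu_unmap_dma(iova, VALUE_2MB);        /* refcount 2 -> 1: mapping kept alive */
	vtophys_iommu_unmap_dma(iova, VALUE_2MB);        /* refcount 1 -> 0: VFIO_IOMMU_UNMAP_DMA, entry freed */
}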

static uint64_t
vtophys_get_paddr_memseg(uint64_t vaddr)
{
@@ -883,14 +988,6 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
{
	int rc = 0, pci_phys = 0;
	uint64_t paddr;
#if VFIO_ENABLED
	uint32_t num_pages, i;
	rte_iova_t *iovas = NULL;
	rte_iova_t iova;
	struct rte_dev_iterator dev_iter;
	struct rte_device *dev;
	const char *devstr = "bus=pci";
#endif

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
@@ -909,9 +1006,7 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing currently.
			 * Let's register it. */

			/* This is not an address that DPDK is managing. */
#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

@@ -920,38 +1015,19 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
				/* We'll use the virtual address as the iova to match DPDK. */
				paddr = (uint64_t)vaddr;

				num_pages = len / VALUE_2MB;
				iovas = calloc(num_pages, sizeof(rte_iova_t));
				if (iovas == NULL) {
					return -ENOMEM;
				}

				for (i = 0; i < num_pages; i++) {
					iovas[i] = paddr;
					paddr += VALUE_2MB;
				}

				rc = rte_extmem_register(vaddr, len, iovas, num_pages, VALUE_2MB);
				if (rc != 0) {
					goto error;
				}

				/* For each device, map the memory */
				RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
					rte_dev_dma_map(dev, vaddr, (uint64_t)vaddr, len);
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}

				for (i = 0; i < num_pages; i++) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, iovas[i]);
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						goto error;
						return rc;
					}

					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				free(iovas);
			} else
#endif
			{
@@ -962,8 +1038,7 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						rc = -EFAULT;
						goto error;
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
@@ -982,39 +1057,29 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						rc = -EFAULT;
						goto error;

						return -EFAULT;
					}

					/* Since a PCI paddr can break the 2MiB physical alignment, skip this check for PCI memory. */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						rc = -EINVAL;
						goto error;

						return -EINVAL;
					}
#if VFIO_ENABLED
					/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
					 * with the IOMMU using the physical address to match. */
					if (spdk_iommu_is_enabled()) {
						iova = paddr;

						rc = rte_extmem_register(vaddr, VALUE_2MB, &iova, 1, VALUE_2MB);
						if (rc != 0) {
							goto error;
						}

						/* For each device, map the memory */
						RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
							rte_dev_dma_map(dev, vaddr, paddr, len);
						rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
							return -EFAULT;
						}
					}
#endif

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						goto error;
						return rc;
					}

					vaddr += VALUE_2MB;
@@ -1043,8 +1108,11 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
		if (g_spdk_mem_do_not_notify == false) {
			/* If vfio is enabled, we need to unmap the range from the IOMMU */
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
				uint64_t buffer_len = len;
				uint8_t *va = vaddr;
@@ -1063,18 +1131,14 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
							    va, len, paddr, buffer_len);
						return -EINVAL;
					}

					/* For each device, unmap the memory */
					RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
						rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
					}

					rc = rte_extmem_unregister(vaddr, len);
					if (rc != 0) {
						return rc;
					rc = vtophys_iommu_unmap_dma(paddr, len);
					if (rc) {
						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
						return -EFAULT;
					}
				} else if (iova_mode == RTE_IOVA_PA) {
					/* Get paddr for each 2MB chunk in this address range */
					while (buffer_len > 0) {
						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);

						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
@@ -1082,14 +1146,14 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
							return -EFAULT;
						}

					/* For each device, unmap the memory */
					RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
						rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
						if (rc) {
							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
							return -EFAULT;
						}

					rc = rte_extmem_unregister(vaddr, len);
					if (rc != 0) {
						return rc;
						va += VALUE_2MB;
						buffer_len -= VALUE_2MB;
					}
				}
			}
@@ -1110,13 +1174,6 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		SPDK_UNREACHABLE();
	}

	return rc;

error:
#if VFIO_ENABLED
	free(iovas);
#endif

	return rc;
}

+0 −7
@@ -65,13 +65,6 @@ DEFINE_STUB(rte_dev_event_callback_unregister, int, (const char *device_name,
		rte_dev_event_cb_fn cb_fn, void *cb_arg), 0);
DEFINE_STUB(rte_dev_event_monitor_start, int, (void), 0);
DEFINE_STUB(rte_dev_event_monitor_stop, int, (void), 0);
DEFINE_STUB(rte_extmem_register, int, (void *va_addr, size_t len, rte_iova_t iova_addrs[],
				       unsigned int n_pages, size_t page_sz), 0);
DEFINE_STUB(rte_extmem_unregister, int, (void *va_addr, size_t len), 0);
DEFINE_STUB(rte_dev_dma_map, int, (struct rte_device *dev, void *addr, uint64_t iova,
				   size_t len), 0);
DEFINE_STUB(rte_dev_dma_unmap, int, (struct rte_device *dev, void *addr, uint64_t iova,
				     size_t len), 0);
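
The stubs removed above were only needed while the env code still called
rte_extmem_register()/rte_extmem_unregister() and rte_dev_dma_map()/rte_dev_dma_unmap();
after the revert, nothing in the unit under test references them. For context, each
DEFINE_STUB supplies a canned definition so the test links without the real DPDK
symbols. A rough hand-rolled equivalent of one of the remaining stubs (a simplification;
the real macro in spdk_internal/mock.h adds per-stub override machinery):

int
rte_dev_event_monitor_stop(void)
{
	return 0; /* canned success so the env code under test links and runs */
}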

static int
test_mem_map_notify(void *cb_ctx, struct spdk_mem_map *map,