Commit a36bc251 authored by Ben Walker's avatar Ben Walker Committed by Tomasz Zawadzki
Browse files

env_dpdk: Automatically map PCI BARs into VFIO



By doing the registration immediately upon mapping the BAR instead of
when the memory is inserted into the spdk_mem_map, we're able to
register BARs that are not 2MB multiples in size and alignment. The SPDK
API for registering a BAR already returns the physical/io address in the
map call, and it can be used directly without a call to
spdk_mem_register().

If the user does elect to later register the BAR using
spdk_mem_register(), we attempt to insert the 2MB aligned segments we
can into the spdk_mem_map. Users may still need to register memory for a
few reasons, such as making spdk_vtophys() work, or for setting up the
BAR as a target for RDMA. These cases still require 2MB aligned and
sized segments.

Change-Id: I395ae8803ec4bf22703f6f76db54200949e82532
Signed-off-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14017


Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Reviewed-by: default avatarKonrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
parent 3d5971ec
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -8,6 +8,17 @@

#include "spdk/stdinc.h"

#ifndef __linux__
#define VFIO_ENABLED 0
#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_ENABLED 1
#else
#define VFIO_ENABLED 0
#endif
#endif

#ifdef __cplusplus
extern "C" {
#endif
+3 −0
Original line number Diff line number Diff line
@@ -33,6 +33,9 @@ void pci_env_fini(void);
int mem_map_init(bool legacy_mem);
int vtophys_init(void);

int vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size);
int vtophys_iommu_unmap_dma_bar(uint64_t vaddr);

struct rte_pci_device;

/**
+145 −54
Original line number Diff line number Diff line
@@ -22,12 +22,9 @@
#include "spdk/env_dpdk.h"
#include "spdk/log.h"

#ifndef __linux__
#define VFIO_ENABLED 0
#else
#ifdef __linux__
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_ENABLED 1
#include <linux/vfio.h>
#include <rte_vfio.h>

@@ -53,9 +50,6 @@ static struct vfio_cfg g_vfio = {
	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
	.mutex = PTHREAD_MUTEX_INITIALIZER
};

#else
#define VFIO_ENABLED 0
#endif
#endif

@@ -765,19 +759,11 @@ static struct spdk_mem_map *g_phys_ref_map;

#if VFIO_ENABLED
static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
_vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	dma_map = calloc(1, sizeof(*dma_map));
	if (dma_map == NULL) {
		return -ENOMEM;
@@ -789,7 +775,6 @@ vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
	dma_map->map.iova = iova;
	dma_map->map.size = size;

	pthread_mutex_lock(&g_vfio.mutex);
	if (g_vfio.device_ref == 0) {
		/* VFIO requires at least one device (IOMMU group) to be added to
		 * a VFIO container before it is possible to perform any IOMMU
@@ -816,18 +801,78 @@ vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)

out_insert:
	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
	return 0;
}


static int
vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	uint64_t refcount;
	int ret;

	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
	assert(refcount < UINT64_MAX);
	if (refcount > 0) {
		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
		return 0;
	}

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);
	if (ret) {
		return ret;
	}

	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
	return 0;
}

int
vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
{
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	ret = _vfio_iommu_map_dma(vaddr, iova, size);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

static int
_vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
{
	struct vfio_iommu_type1_dma_unmap unmap = {};
	int ret;

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove it's references */
		goto out_remove;
	}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	free(dma_map);
	return 0;
}

static int
vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
{
	struct spdk_vfio_dma_map *dma_map;
	uint64_t refcount;
	int ret;
	struct vfio_iommu_type1_dma_unmap unmap = {};

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
@@ -857,25 +902,34 @@ vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
	/** don't support partial or multiple-page unmap for now */
	assert(dma_map->map.size == size);

	if (g_vfio.device_ref == 0) {
		/* Memory is not mapped anymore, just remove it's references */
		goto out_remove;
	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);

	return ret;
}

	unmap.argsz = sizeof(unmap);
	unmap.flags = 0;
	unmap.iova = dma_map->map.iova;
	unmap.size = dma_map->map.size;
	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
	if (ret) {
		SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
int
vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
{
	struct spdk_vfio_dma_map *dma_map;
	int ret;

	pthread_mutex_lock(&g_vfio.mutex);
	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
		if (dma_map->map.vaddr == vaddr) {
			break;
		}
	}

out_remove:
	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
	if (dma_map == NULL) {
		DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
		pthread_mutex_unlock(&g_vfio.mutex);
	free(dma_map);
	return 0;
		return -ENXIO;
	}

	ret = _vfio_iommu_unmap_dma(dma_map);
	pthread_mutex_unlock(&g_vfio.mutex);
	return ret;
}
#endif

@@ -926,7 +980,7 @@ vtophys_get_paddr_pagemap(uint64_t vaddr)

/* Try to get the paddr from pci devices */
static uint64_t
vtophys_get_paddr_pci(uint64_t vaddr)
vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
{
	struct spdk_vtophys_pci_device *vtophys_dev;
	uintptr_t paddr;
@@ -935,7 +989,7 @@ vtophys_get_paddr_pci(uint64_t vaddr)
	pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
	TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
		dev = vtophys_dev->pci_device;
		paddr = dpdk_pci_device_vtophys(dev, vaddr);
		paddr = dpdk_pci_device_vtophys(dev, vaddr, len);
		if (paddr != SPDK_VTOPHYS_ERROR) {
			pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
			return paddr;
@@ -951,7 +1005,7 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action,
	       void *vaddr, size_t len)
{
	int rc = 0, pci_phys = 0;
	int rc = 0;
	uint64_t paddr;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
@@ -972,6 +1026,30 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/* This is not an address that DPDK is managing. */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

#if VFIO_ENABLED
			enum rte_iova_mode iova_mode;

@@ -998,35 +1076,22 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
			{
				/* Get the physical address from /proc/self/pagemap. */
				paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					/* Get the physical address from PCI devices */
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
					paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
					if (!pci_phys && (paddr & MASK_2MB)) {
					if (paddr & MASK_2MB) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
						return -EINVAL;
					}
@@ -1075,7 +1140,33 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
#if VFIO_ENABLED
		if (paddr == SPDK_VTOPHYS_ERROR) {
			/*
			 * This is not an address that DPDK is managing. If vfio is enabled,
			 * This is not an address that DPDK is managing.
			 */

			/* Check if this is a PCI BAR. They need special handling */
			paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
			if (paddr != SPDK_VTOPHYS_ERROR) {
				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}

				return 0;
			}

			/* If vfio is enabled,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
+44 −1
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@
#include "spdk/env.h"
#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/memory.h"

#define SYSFS_PCI_DRIVERS	"/sys/bus/pci/drivers"

@@ -712,12 +713,54 @@ int
spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
			void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
{
	return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	int rc;

	rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
	if (rc) {
		return rc;
	}

#if VFIO_ENABLED
	/* Automatically map the BAR to the IOMMU */
	if (!spdk_iommu_is_enabled()) {
		return 0;
	}

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* We'll use the virtual address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}

		*phys_addr = (uint64_t)(*mapped_addr);
	} else {
		/* We'll use the physical address as the iova to match DPDK. */
		rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size);
		if (rc) {
			dev->unmap_bar(dev, bar, *mapped_addr);
			return -EFAULT;
		}
	}
#endif
	return rc;
}

int
spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
{
#if VFIO_ENABLED
	int rc;

	if (spdk_iommu_is_enabled()) {
		rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr);
		if (rc) {
			return -EFAULT;
		}
	}
#endif

	return dev->unmap_bar(dev, bar, addr);
}

+2 −2
Original line number Diff line number Diff line
@@ -44,9 +44,9 @@ dpdk_pci_init(void)
}

uint64_t
dpdk_pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr)
dpdk_pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
{
	return g_dpdk_fn_table->pci_device_vtophys(dev, vaddr);
	return g_dpdk_fn_table->pci_device_vtophys(dev, vaddr, len);
}

const char *
Loading