Commit be04cfc3 authored by Darek Stojaczyk, committed by Jim Harris
Browse files

env_dpdk/memory: aggregate adjacent vfio mappings



In the past, memory in spdk could have been unregistered in
different chunks than it was registered, so to account
for that the vtophys code used to register each hugepage
(2MB chunk of memory) separately to the VFIO driver. This
kept the code simple.

Now that memory in spdk can only be unregistered in the same
chunks it was registered in, we no longer have to register
each hugepage to VFIO separately. We could register the
entire memory region with just a single VFIO ioctl instead,
so that's what we'll do now.

This serves as an optimization as we obviously send fewer
ioctls now, but most importantly it prevents SPDK from
reaching a VFIO registrations limit that was introduced
in Linux 5.1. [1]

The default limit is 65535, which results in SPDK being able to
make only the first 128GB of memory DMA-able. This is most
problematic for vhost where we need to register the memory
of all the VMs.

Fixes #915

[1] 492855939bdb59c6f947b0b5b44af9ad82b7e38c
("vfio/type1: Limit DMA mappings per container")

Change-Id: Ida40306b2684e20daa2fd8d12e0df2eef5a4bff1
Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/432442


Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
parent 43f4e393
Loading
Loading
Loading
Loading
+111 −58
Original line number Diff line number Diff line
@@ -1005,7 +1005,6 @@ spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
		return -EINVAL;
	}

	while (len > 0) {
	/* Get the physical address from the DPDK memsegs */
	paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);

@@ -1022,10 +1021,19 @@ spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
				 * addresses will never overlap.
				 */
				paddr = (uint64_t)vaddr;
					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
				if (rc) {
					return -EFAULT;
				}
				while (len > 0) {
					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}
					vaddr += VALUE_2MB;
					paddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			} else
#endif
			{
@@ -1038,10 +1046,26 @@ spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}
					/* The beginning of this address range points to a PCI resource,
					 * so the rest must point to a PCI resource as well.
					 */
					pci_phys = 1;
				}

				/* Get paddr for each 2MB chunk in this address range */
				while (len > 0) {
					/* Get the physical address from /proc/self/pagemap. */
					if (pci_phys) {
						paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
					} else {
						paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
					}

					if (paddr == SPDK_VTOPHYS_ERROR) {
						DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
						return -EFAULT;
					}

					/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
					if (!pci_phys && (paddr & MASK_2MB)) {
						DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
@@ -1049,6 +1073,33 @@ spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
					}

					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
					if (rc != 0) {
						return rc;
					}

					vaddr += VALUE_2MB;
					len -= VALUE_2MB;
				}
			}
		} else {
			/* This is an address managed by DPDK. Just setup the translations. */
			while (len > 0) {
				paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
				if (paddr == SPDK_VTOPHYS_ERROR) {
					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
					return -EFAULT;
				}

				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
				if (rc != 0) {
					return rc;
				}

				vaddr += VALUE_2MB;
				len -= VALUE_2MB;
			}
		}

		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if SPDK_VFIO_ENABLED
@@ -1058,31 +1109,33 @@ spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
			 * we need to unmap the range from the IOMMU
			 */
			if (spdk_iommu_is_enabled()) {
					uint64_t buffer_len = VALUE_2MB;
				uint64_t buffer_len = len;
				paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len);
					if (buffer_len != VALUE_2MB) {
				if (buffer_len != len) {
					return -EINVAL;
				}
					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
				rc = vtophys_iommu_unmap_dma(paddr, len);
				if (rc) {
					return -EFAULT;
				}
			}
		}
#endif
		while (len > 0) {
			rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
			break;
		default:
			SPDK_UNREACHABLE();
		}

			if (rc != 0) {
				return rc;
			}

			vaddr += VALUE_2MB;
			len -= VALUE_2MB;
		}

		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}