Commit 2b80955c authored by Michal Berger's avatar Michal Berger Committed by Tomasz Zawadzki
Browse files

scripts/setup: Refactor hugepages allocation for Linux



Main changes:

- By default, allocate all hugepages on node0. On NUMA-aware systems,
  processes will most often use default policy with local node as the
  preferred one for allocations. This usually is node0, thus splitting
  hugepages evenly across the nodes, in default setup, would force
  allocations from a remote node, impacting overall performance (in
  case hugepages on node0 run out). See 68740678 as a reference.

- Introduce HUGE_EVEN_ALLOC - force setup.sh to evenly distribute
  hugepages across all the nodes.

- Introduce HUGEPGSZ - overrides default page size

- Introduce CLEAR_HUGE - remove all hugepages on demand before
  allocation is performed.

- HUGENODE - this now can be a list of nodes to use. NRHUGE will be
  set across all the nodes from the list.

Change-Id: I084829edde3c416e7fc6b7b6abe369cc0631fcd7
Signed-off-by: default avatarMichal Berger <michalx.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/5042


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Reviewed-by: default avatarTomasz Zawadzki <tomasz.zawadzki@intel.com>
parent 07e251ef
Loading
Loading
Loading
Loading
+69 −20
Original line number Diff line number Diff line
@@ -44,12 +44,18 @@ function usage() {
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be evenly distributed"
	echo "                  between CPU nodes"
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. To allocate"
	echo "                  hugepages on multiple nodes run this script multiple times -"
	echo "                  once for each node."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with comas - NRHUGE will be applied on each node."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation".
	echo "PCI_WHITELIST"
	echo "PCI_BLACKLIST     Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
@@ -391,6 +397,57 @@ function cleanup_linux() {
	unset dirs_to_clean files_to_clean opened_files
}

# Write the requested hugepage count to the given kernel interface and verify
# the kernel actually granted it.
# $1 - path to an nr_hugepages interface (system-wide or per-node sysfs)
# $2 - optional NUMA node id, used only to enrich the error message
# Reads NRHUGE (negative values are clamped to 0). Returns 1 and prints a
# diagnostic when fewer pages than requested could be allocated.
check_hugepages_alloc() {
	local hp_path=$1
	local granted

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_path"

	granted=$(< "$hp_path")
	((granted >= NRHUGE)) && return 0

	# Read-back fell short of the request - most likely memory fragmentation.
	cat <<- ERROR

		## ERROR: requested $NRHUGE hugepages but $granted could be allocated ${2:+on node$2}.
		## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
	ERROR
	return 1
}

# Zero out /proc/sys/vm/nr_hugepages, removing all default-size hugepages
# from the system-wide pool (needs write access to /proc, i.e. root).
clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

# Allocate hugepages according to the CLEAR_HUGE, HUGE_EVEN_ALLOC, HUGENODE,
# HUGEPGSZ and NRHUGE environment variables.
# Fixes vs. original: drops the declared-but-unused `system_nodes` local and
# declares the `nodes` array local so it no longer leaks into global scope.
configure_linux_hugepages() {
	local node nodes_to_use
	local -a nodes=()

	# Optionally drop all currently allocated default-size hugepages first.
	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	# Even allocation ignores HUGENODE and writes to the system-wide knob,
	# letting the kernel spread pages across nodes.
	# NOTE(review): an allocation failure here is expected to abort via the
	# script's set -e rather than this function's status - confirm.
	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	# Map node id -> that node's nr_hugepages interface for HUGEPGSZ pages.
	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	# HUGENODE may hold a comma-separated list of node ids; default to node0.
	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use=(0)
	fi

	# NRHUGE is applied on each requested node; unknown nodes are skipped.
	for node in "${nodes_to_use[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)
@@ -402,20 +459,7 @@ function configure_linux() {
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	if [ -z "$HUGENODE" ]; then
		hugepages_target="/proc/sys/vm/nr_hugepages"
	else
		hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
	fi

	echo "$NRHUGE" > "$hugepages_target"
	allocated_hugepages=$(cat $hugepages_target)
	if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
		echo ""
		echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
		echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
		exit 1
	fi
	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
@@ -704,7 +748,12 @@ if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
fi

if [[ $os == Linux ]]; then
	HUGEPGSZ=$(($(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')))
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ingoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}