Commit c1e9ed6d authored by Michal Berger, committed by Jim Harris
Browse files

perf/vhost: Auto generate VM cpu and disk map configs

The configuration is generated based on existing host's NUMA
topology (as seen via sysfs) instead of a total number of cpus as
it was done before. New logic attempts to load balance VMs and
their cpus based on nvme drives' NUMA location. If there are not
enough cpus left under the target node, all remaining nodes are
checked. For the sake of the performance, cpus are not mixed
between different numa nodes.

Disk map is created by mapping VMs to existing nvme drives
based on their NUMA location. Extra VMs are assigned in bus
order of the nvme drives.

SPDK cpus are split by matching the VM-to-nvme NUMA ratio.
Static list can be defined as well to override this behavior.

https://trello.com/c/HSoRtQkO/401-deprecate-vhost-performance-python-helper-script



Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ia63c6f9a472a685d252efd110eaba7b114a87d2c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12401


Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
parent 5433004e
Loading
Loading
Loading
Loading
+371 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2022 Intel Corporation.
#  All rights reserved.

curdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$curdir/../../../")

source "$rootdir/scripts/common.sh"
source "$rootdir/test/scheduler/common.sh"

# Build the auto-generated VM cpu map and nvme disk map.
#
# Globals read:    vm_count, vm_cpu_num, nodes, cpus, node_<N>_cpu, nvme_numa_map
#                  (nodes/cpus/node_<N>_cpu are not defined in this file - presumably
#                  they are populated by map_cpus from the sourced common.sh; verify).
# Globals written: auto_cpu_map, auto_disk_map, spdk, vm_numa_map, vm<N>_node,
#                  node_<N>_vm, node_<N>_cpu (consumed as cpus are handed out).
# Returns:         1 when a VM cannot get vm_cpu_num cpus from a single node or when
#                  get_spdk_cpus fails.
get_auto_cfg() {
	local vm_cpus vm_node vm vms vms_per_nvme
	local cpu node nodes_idxs node_idx
	local nvmes nvme nvme_idx nvme_diff nvmes_per_node
	local vm_diff aligned_number_of_vms=0
	local diff iter

	local -g auto_cpu_map=() auto_disk_map=() spdk=()

	map_cpus
	get_nvme_numa_map

	nodes_idxs=("${!nodes[@]}")

	# Construct initial NUMA-aware setup by pinning VM to given nvme's node. First run is meant
	# to pin enough number of VMs (as per vm_count) to match the number of available nvme ctrls.
	# NOTE(review): the eval below (and the ones in the second run) assign vmN_node
	# unconditionally, so a vmN_node value inherited from the environment is replaced -
	# confirm this matches the "override" described in help().
	vm=0
	for node in "${nodes_idxs[@]}"; do
		# nvme_numa_map[node] holds the string "node_<N>_nvme[@]"; the indirect expansion
		# yields the drives recorded for that node (nothing if the node has no entry).
		nvmes=(${!nvme_numa_map[node]})
		for ((nvme_idx = 0; nvme_idx < ${#nvmes[@]} && vm < vm_count; vm++, nvme_idx++)); do
			eval "vm${vm}_node=$node"
		done
		nvmes_per_node[node]=${#nvmes[@]}
	done

	# Number of VMs which did not get a node in the first run.
	vm_diff=$((vm_count - vm))

	# Align extra number of VMs in case nvme ctrls are not distributed evenly across the existing
	# NUMA nodes.
	# FIXME: This is targeted for systems with only 2 NUMA nodes. Technically, kernel supports
	# more than that - it's possible to achieve setups with > 2 NUMA nodes under virtual env
	# for instance. Should this be of any concern?
	if ((nvmes_per_node[0] < nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[1] - nvmes_per_node[0]))
	elif ((nvmes_per_node[0] > nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[0] - nvmes_per_node[1]))
	else
		nvme_diff=0
	fi

	diff=$((vm_diff + nvme_diff))

	# The parity check is hardcoded to 2 while the divisor is the actual number of nodes;
	# aligned_number_of_vms stays 0 for an odd diff, which selects the rr branch below.
	if ((diff % 2 == 0)); then
		aligned_number_of_vms=$((diff / ${#nodes_idxs[@]}))
	fi

	# Second run distributes extra VMs across existing NUMA nodes. In case we can distribute even
	# number of extra VMs (as per vm_count) then simply assign them in bulk. In case there's an
	# odd number, do some simple rr balancing where we assign them one by one - first to node0,
	# second to node1, third to node0, etc.
	if ((aligned_number_of_vms)); then
		for node in "${nodes_idxs[@]}"; do
			for ((iter = 0; iter < aligned_number_of_vms && vm < vm_count; iter++, vm++)); do
				eval "vm${vm}_node=$node"
			done
		done
	else
		# The inner loop does not re-check vm < vm_count, so the last pass may also set
		# vmN_node for indexes >= vm_count; those are not read by the loops below.
		while ((vm < vm_count)); do
			for node in "${nodes_idxs[@]}"; do
				eval "vm${vm}_node=$node"
				((++vm))
			done
		done
	fi

	local -g vm_numa_map=()
	for ((vm = 0; vm < vm_count; vm++)); do
		# Load balance the cpus across available numa nodes based on the pinning
		# done prior. If there are no cpus left under selected node, iterate over
		# all available nodes. If no cpus are left, fail. We don't allow to mix
		# cpus from different nodes for the sake of the performance.
		# NOTE(review): node_idx_perc is not declared local and is not read anywhere in
		# this file - looks like a leftover.
		node_idx=0 node_idx_perc=0
		eval "vm_node=\$vm${vm}_node"

		# node_cpus -> cpus still free on the selected node, vm_nodes -> VMs recorded for it.
		local -n node_cpus=node_${vm_node}_cpu
		local -n vm_nodes=node_${vm_node}_vm

		vm_numa_map[vm_node]="node_${vm_node}_vm[@]"

		# Fallback: walk the nodes in index order until one has at least vm_cpu_num cpus
		# left. Only vm_node and node_cpus are re-targeted here; vm_nodes and the
		# vm_numa_map entry set above still refer to the originally selected node.
		while ((${#node_cpus[@]} < vm_cpu_num && node_idx < ${#nodes_idxs[@]})); do
			vm_node=${nodes_idxs[node_idx]}
			local -n node_cpus=node_${nodes_idxs[node_idx++]}_cpu
		done

		if ((${#node_cpus[@]} < vm_cpu_num)); then
			printf 'Not enough CPUs available for VM %u (CPUs: %u, Nodes: %u, CPUs per VM: %u)\n' \
				"$vm" "${#cpus[@]}" "${#nodes_idxs[@]}" "$vm_cpu_num" >&2
			return 1
		fi

		# Normalize indexes
		node_cpus=("${node_cpus[@]}")

		# Take the first vm_cpu_num cpus for this VM and leave the rest on the node.
		vm_cpus=("${node_cpus[@]::vm_cpu_num}")
		node_cpus=("${node_cpus[@]:vm_cpu_num}")

		# Each auto_cpu_map element is a two-line string: the comma-separated qemu cpu mask
		# and the NUMA node the cpus were taken from.
		auto_cpu_map+=("$(
			cat <<- CPU_VM
				VM_${vm}_qemu_mask=$(
				IFS=","
				echo "${vm_cpus[*]}"
				)
				VM_${vm}_qemu_numa_node=$vm_node
			CPU_VM
		)")

		# Save map of each VM->NUMA node to be able to construct a disk map in later steps.
		vm_nodes+=("$vm")
	done

	# auto_cpu_map is ready, all requested VMs should be balanced across all NUMA nodes
	# making sure each nvme drive will be bound to at least 1 VM placed on the
	# corresponding NUMA node. Now, construct disk_cfg and assign VMs, with proper
	# split value, to each nvme - extra VMs will be added to nvme drives in their
	# bus order.
	local -A nvme_vm_map=()
	local iter nvmes_no=0 vms_no=0
	for node in "${nodes_idxs[@]}"; do
		if [[ ! -v nvme_numa_map[node] ]]; then
			# There are no drives available on that node, skip it
			continue
		fi
		nvmes=(${!nvme_numa_map[node]}) nvmes_no=${#nvmes[@]}
		vms=(${!vm_numa_map[node]}) vms_no=${#vms[@]}
		# Each pass hands one VM to every drive of the node, in order; passes repeat until
		# the node's VM list is drained (see the continue below) or the bound is reached.
		for ((iter = 0; iter <= (vms_no - nvmes_no <= 0 ? 1 : vms_no - nvmes_no); iter++)); do
			for nvme in "${nvmes[@]}"; do
				if ((${#vms[@]} == 0)); then
					# No VMs on given node or they have been exhausted - skip all remaining drives.
					# "continue 3" resumes the outermost loop, i.e. moves on to the next node.
					continue 3
				fi
				# Per-drive VM list lives in a global array named after the drive address with
				# ':' and '.' replaced by '_'; nvme_vm_map keeps the "<name>[@]" reference to it.
				nvme_vm_map["$nvme"]="_${nvme//[:.]/_}_[@]"
				local -n nvme_vms=_${nvme//[:.]/_}_
				nvme_vms+=("${vms[0]}") vms=("${vms[@]:1}")
			done
		done
	done

	local sorted_nvmes=()
	sorted_nvmes=($(printf '%s\n' "${!nvme_vm_map[@]}" | sort))
	# Here nvme is the index into sorted_nvmes. Each entry is formatted as
	# <drive>,Nvme<index>,<number of VMs>,<space-separated VM ids>; the post-increment has
	# no lasting effect since the loop reassigns nvme on the next iteration.
	for nvme in "${!sorted_nvmes[@]}"; do
		vms=(${!nvme_vm_map["${sorted_nvmes[nvme]}"]})
		auto_disk_map+=("${sorted_nvmes[nvme]},Nvme$((nvme++)),${#vms[*]},${vms[*]}")
	done

	get_spdk_cpus || return 1

	# Append the SPDK entries: the reactor mask as a bracketed, comma-separated cpu list
	# and the first SPDK cpu as the master core.
	auto_cpu_map+=("vhost_0_reactor_mask=[$(
		IFS=","
		echo "${spdk[*]}"
	)]")
	auto_cpu_map+=("vhost_0_master_core=${spdk[0]}")
}

# Group nvme controllers by the NUMA node reported for them in sysfs.
#
# Globals read:    pci_bus_cache (entry 0x010802, filled by cache_pci_bus which is not
#                  defined in this file).
# Globals written: nvme_numa_map (reset; index = node, value = "node_<N>_nvme[@]") and
#                  the node_<N>_nvme arrays the entries refer to (appended to).
get_nvme_numa_map() {
	local nvme node
	local -g nvme_numa_map=()

	cache_pci_bus

	for nvme in ${pci_bus_cache[0x010802]}; do
		node=$(< "/sys/bus/pci/devices/$nvme/numa_node")
		# sysfs reports -1 when the device has no known NUMA affinity. A negative (or
		# empty) value can be used neither as an index of the still empty map nor as a
		# part of a variable name, so fold such drives into node 0.
		# NOTE(review): assumes node 0 is always among the nodes found by map_cpus.
		[[ $node =~ ^[0-9]+$ ]] || node=0
		nvme_numa_map[node]="node_${node}_nvme[@]"
		local -n node_nvmes=node_${node}_nvme
		node_nvmes+=("$nvme")
	done
}

# Select the cpus for the SPDK app and store them in the global spdk array.
#
# Globals read:    spdk_cpu_num, spdk_cpu_list, vm_count, vm_numa_map, node_<N>_cpu,
#                  ALIGN_FROM_ALL_NODES
# Globals written: spdk (reset), node_<N>_cpu (cpus taken in the per-node pass are removed)
# Outputs:         diagnostics on stderr
# Returns:         0
#
# With an empty spdk_cpu_num cpu 0 is used. A non-empty spdk_cpu_list is handed to
# parse_cpu_list (not defined in this file) and used as is. Otherwise each node with VMs
# contributes a share of spdk_cpu_num proportional to its number of VMs (at least 1 cpu),
# starting from the node with most VMs. If that is still short of spdk_cpu_num, the rest
# is taken from the cpus left on the dominant node, or - only when ALIGN_FROM_ALL_NODES
# is set - from the cpus left on all visited nodes.
get_spdk_cpus() {
	local -g spdk=()
	local node vms perc
	local node_sort=() _node
	local cpus_per_node cpus_exhausted=() cpus_remained=() cpus_extra=()

	if [[ -z $spdk_cpu_num ]]; then
		spdk=(0)
		return 0
	fi

	if [[ -n $spdk_cpu_list ]]; then
		spdk=($(parse_cpu_list <(echo "$spdk_cpu_list")))
		return 0
	fi

	# Start allocating from NUMA node with greater number of pinned VMs.
	# Entries have the form <number of VMs>:<node>.
	node_sort=($(for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "${#vms[@]}:$node"
	done | sort -rn))

	for _node in "${node_sort[@]}"; do
		node=${_node#*:} vms=${_node%:*}
		local -n node_all_cpus=node_${node}_cpu
		# Integer share of spdk_cpu_num matching this node's share of VMs, never below 1.
		perc=$((vms * 100 / vm_count))
		cpus_per_node=$((spdk_cpu_num * perc / 100))
		cpus_per_node=$((cpus_per_node == 0 ? 1 : cpus_per_node))

		if ((${#node_all_cpus[@]} == 0)); then
			printf 'No CPUs left to allocate for SPDK on node%u. Need %u CPUs\n' \
				"$node" "$cpus_per_node" >&2

			cpus_exhausted[node]=1
			continue
		fi
		if ((${#node_all_cpus[@]} < cpus_per_node)); then
			printf 'Not enough CPUs to allocate for SPDK on node%u. Need %u CPUs, getting %u\n' \
				"$node" "$cpus_per_node" "${#node_all_cpus[@]}" >&2
			# Everything left on the node is taken below, so mark it as drained.
			cpus_per_node=${#node_all_cpus[@]}
			cpus_exhausted[node]=1
		fi

		spdk+=("${node_all_cpus[@]::cpus_per_node}")
		node_all_cpus=("${node_all_cpus[@]:cpus_per_node}")
		cpus_remained+=("${node_all_cpus[@]}")
	done

	# If we didn't allocate the entire number of requested cpus in the initial run,
	# adjust it by adding the remaining portion from the node having greater number
	# of pinned VMs.
	if ((${#spdk[@]} < spdk_cpu_num)); then
		if [[ -n $ALIGN_FROM_ALL_NODES ]] && ((${#cpus_remained[@]} > 0)); then
			printf 'Trying to get extra CPUs from all nodes\n'
			cpus_extra=("${cpus_remained[@]}")
		else
			node=${node_sort[0]#*:}
			printf 'Trying to get extra CPUs from the dominant node%u to align: %u < %u\n' \
				"$node" "${#spdk[@]}" "$spdk_cpu_num"
			if ((cpus_exhausted[node])); then
				# Leave cpus_extra empty - cpus of other nodes are used only on explicit
				# request (ALIGN_FROM_ALL_NODES).
				printf 'No CPUs available on node%u\n' "$node"
			else
				local -n node_all_cpus=node_${node}_cpu
				cpus_extra=("${node_all_cpus[@]}")
			fi
		fi
		spdk+=("${cpus_extra[@]::spdk_cpu_num-${#spdk[@]}}")
	fi >&2
	if ((${#spdk[@]} != spdk_cpu_num)); then
		printf 'Different number of SPDK CPUs allocated to meet the requirements: requested %u, got %u\n' \
			"$spdk_cpu_num" "${#spdk[@]}"
	else
		printf 'Requested number of SPDK CPUs allocated: %u\n' "$spdk_cpu_num"
	fi >&2
}

# Print every auto_disk_map entry on its own line; prints nothing for an empty map.
_p_disk_map() {
	local entry

	for entry in "${auto_disk_map[@]}"; do
		printf '%s\n' "$entry"
	done
}

# Print every auto_cpu_map entry followed by a newline; prints nothing for an empty map.
_p_cpu_map() {
	local entry

	for entry in "${auto_cpu_map[@]}"; do
		printf '%s\n' "$entry"
	done
}

# Print the disk map preceded by a two-line comment header (script name, number of
# auto_disk_map entries and vm_count).
p_disk_map() {
	local entries

	entries=$(_p_disk_map)

	printf '# Generated automatically by %s\n' "${0##*/}"
	printf '# NVMe Drives: %s VM count: %s\n' "${#auto_disk_map[@]}" "$vm_count"
	printf '%s\n' "$entries"
}

# Print one "Node<N>: <count> VMs" line per vm_numa_map entry; prints nothing when the
# map is empty.
p_vms_in_node() {
	local idx members

	if ((${#vm_numa_map[@]} == 0)); then
		return 0
	fi

	for idx in "${!vm_numa_map[@]}"; do
		# The entry names the array holding the VMs of that node.
		members=(${!vm_numa_map[idx]})
		printf 'Node%s: %s VMs\n' "$idx" "${#members[@]}"
	done
}

# Print the cpu map preceded by a comment header: script name, a summary line and the
# per-node VM counts reported by p_vms_in_node.
p_cpu_map() {
	local node_stats stats_lines map_lines

	mapfile -t node_stats < <(p_vms_in_node)
	stats_lines=$(printf '#  - %s\n' "${node_stats[@]}")
	map_lines=$(_p_cpu_map)

	printf '%s\n' \
		"# Generated automatically by ${0##*/}" \
		"# VM NUMA Nodes: ${#vm_numa_map[@]} VM count: $vm_count CPU Per VM: $vm_cpu_num SPDK CPU count: ${#spdk[@]}" \
		"$stats_lines" \
		"$map_lines"
}

# Print the disk map and the cpu map, separated by a single empty line.
p_all() {
	p_disk_map
	echo
	p_cpu_map
}

# Fill in defaults for the parameters taken from the environment.
fetch_env() {
	# Unset or empty counters default to 1 ...
	: "${spdk_cpu_num:=1}" "${vm_count:=1}" "${vm_cpu_num:=1}"

	# ... and so does anything that does not evaluate to a positive number.
	spdk_cpu_num=$((spdk_cpu_num > 0 ? spdk_cpu_num : 1))
	vm_count=$((vm_count > 0 ? vm_count : 1))
	vm_cpu_num=$((vm_cpu_num > 0 ? vm_cpu_num : 1))

	# Output files land in the current working directory unless set by the caller.
	: "${cpu_out:=$PWD/auto-cpu.conf}" "${disk_out:=$PWD/auto-disk.conf}"
}

# Print the usage text to stdout. ${0##*/} expands to the name the script was invoked
# with; the leading tabs of the text are stripped by the <<- here-document.
help() {
	cat <<- HELP
		${0##*/}: [-p all|cpu|disk -s]

		Configuration is generated based on system's cpu and nvme topology. Parameters
		taken directly from the environment:

		spdk_cpu_list - list of CPUs to assign to a SPDK app
		spdk_cpu_num  - number of CPUs to use across all NUMA nodes
		                (spdk_cpu_list takes priority, default: 1)
		vm_count      - number of VMs to prepare the configuration for
		                (default: 1)
		vm_cpu_num    - number of CPUs to assign per VM (default: 1)

		Override parameters:
		vmN_node      - overrides selected NUMA node for VM N - by default,
		                this is allocated up to number of nvme drives
		cpu_out       - with -s, points at location where to save cpu conf
		disk_out      - with -s, points at location where to save disk conf

		Note: VMs are pinned to nvme drives based on their NUMA location.

		Example:
		# Allocate 6 cpus from node1 for SPDK. Configure 24 VMs, 2 CPUs per VM
		$ export spdk_cpu_num=6 vm_count=24 vm_cpu_num=2
		$ ${0##*/} -p all
	HELP
}

# What to print (-p all|cpu|disk) and whether to save the configs to files (-s).
print=""
save=no

fetch_env

# Unknown options are silently ignored.
while getopts :hsp: arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		s) save=yes ;;
		p) print=$OPTARG ;;
		*) ;;
	esac
done

if ! get_auto_cfg; then
	exit 1
fi

# Any other value of -p prints nothing.
if [[ $print == all ]]; then
	p_all
elif [[ $print == cpu ]]; then
	p_cpu_map
elif [[ $print == disk ]]; then
	p_disk_map
fi

if [[ $save == yes ]]; then
	p_cpu_map > "$cpu_out"
	p_disk_map > "$disk_out"
fi
+11 −0
Original line number Diff line number Diff line
@@ -65,6 +65,17 @@ perf_args+=(${split:+--use-split})
perf_args+=(${disk_map:+--disk-map="$disk_map"})
perf_args+=(${cpu_cfg:+--custom-cpu-cfg="$cpu_cfg"})

# Generate the cpu and disk configuration automatically via conf-generator.
if [[ $auto_cfg == yes || $auto_cfg_print == yes ]]; then
	if [[ $auto_cfg_print == yes ]]; then
		# Print-only mode: show the generated configuration and quit.
		"$curdir/conf-generator" -p all || exit 1
		exit 0
	fi
	# cpu_out and disk_out prefixed to the command are placed only in the environment
	# of conf-generator - they are not set in this shell, hence the paths are spelled
	# out again when passed down to the perf script.
	cpu_out=$curdir/auto-cpu.conf disk_out=$curdir/auto-disk.conf \
		"$curdir/conf-generator" -s || exit 1
	perf_args+=("--disk-map=$curdir/auto-disk.conf")
	perf_args+=("--custom-cpu-cfg=$curdir/auto-cpu.conf")
fi

if [[ -n $extra_params ]]; then
	perf_args+=($extra_params)
fi
+1 −1
Original line number Diff line number Diff line
@@ -65,7 +65,7 @@ map_cpus_node() {
			local -n _cpu_core_map=node_${node_idx}_core_${core_idx}
			_cpu_core_map+=("$cpu_idx") cpu_core_map[cpu_idx]=$core_idx
		fi
		_cpu_node_map+=("$cpu_idx") cpu_node_map[cpu_idx]=$node_idx
		_cpu_node_map[cpu_idx]=$cpu_idx cpu_node_map[cpu_idx]=$node_idx
		cpus+=("$cpu_idx")
	done