Commit ce25e81f authored by Michal Berger's avatar Michal Berger Committed by Konrad Sztyber
Browse files

nvme/perf: Use sqthread_poll_cpu for io_uring workloads



Under newer kernels the way how cpu affinity is applied to poll
threads changed, such that they are not bound to all online cpus
but rather the cpus of the job threads. When using cpus_allowed,
this means all the poll threads are bound to that particular mask,
unless dedicated cpu is selected via sqthread_poll_cpu.

This behavior shows significant performance drop since the cpu
time is now shared across all these threads. To mitigate, and to
make sure results, more or less, much the ones from older kernels,
define the dedicated cpu for each poll thread created per job.

Change-Id: I7e085729883ade099c5832e766f27cff22b18757
Signed-off-by: default avatarMichal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/23090


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarJim Harris <jim.harris@samsung.com>
Reviewed-by: default avatarKonrad Sztyber <konrad.sztyber@intel.com>
parent 632342a0
Loading
Loading
Loading
Loading
+53 −2
Original line number Diff line number Diff line
@@ -4,6 +4,11 @@
#  All rights reserved.
#
source "$rootdir/test/dd/common.sh"
source "$rootdir/test/scheduler/common.sh"

xtrace_disable
map_cpus
xtrace_restore

function discover_bdevs() {
	local rootdir=$1
@@ -196,6 +201,47 @@ function get_disks_on_numa() {
	echo $disks_on_numa
}

function set_potential_poll_threads() {
	[[ $PLUGIN =~ uring ]] || return 0

	local _cpus=("$@") all_fio_cpus=() cpu _cpu poll_thread
	local -g sqpoll_cpu_threads
	local node

	# Collect all siblings in case smt is enabled.
	for cpu in "${_cpus[@]}"; do
		for _cpu in "${!cpu_siblings[cpu]}"; do
			all_fio_cpus[_cpu]=$cpu
		done
	done

	# Move fio's polling thread to a different cpu. In case smt is enabled, we  try to move it to a completely
	# different core under the same numa node. Note that we can set sgthread_poll_cpu only to a single cpu -
	# in case NUMJOBS is > 1 we will have multiple poll threads using that particular cpu. FIXME: what would be
	# the potential impact here of spamming poll threads like that? In case of higher numjobs, does it make
	# sense to still use sqthread_poll?
	#
	# Here we build list of all potential candidates to hold sqpoll thread. Lists are per node and should hold
	# all cpus outside of the fio cpus that were requested. So each get_poll_thread() should return a cpu
	# thread which is guaranteed to be outside of the physical core of each of fio cpus (but still bound to
	# the same numa node as the fio cpu selected for given job).
	for cpu in "${cpus[@]}"; do
		[[ -n ${all_fio_cpus[cpu]} ]] && continue
		node=${cpu_node_map[cpu]}
		local -n sqpoll_node=sqpoll_threads_node_$node
		sqpoll_node+=("$cpu")
		sqpoll_cpu_threads[node]=sqpoll_threads_node_$node
	done
}

function get_poll_thread() {
	local node=$1 idx=$2
	local -n node=${sqpoll_cpu_threads[node]}

	# Default to the highest cpu
	echo "${node[idx]:-${cpus[-1]}}"
}

function create_fio_config() {
	local disk_no=$1
	local plugin=$2
@@ -249,6 +295,9 @@ function create_fio_config() {
		echo "iodepth_batch_complete=$IO_BATCH_COMPLETE" >> $testdir/config.fio
	fi

	# shellcheck disable=SC2068
	set_potential_poll_threads ${cores[@]//,/ }

	for i in "${!cores[@]}"; do
		local m=0 #Counter of disks per NUMA node
		local n=0 #Counter of all disks in test
@@ -273,7 +322,8 @@ function create_fio_config() {
			fio_job_section+=("")
			fio_job_section+=("[filename${i}]")
			fio_job_section+=("iodepth=$QD")
			fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
			fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]} ($FIO_FNAME_STRATEGY)")
			fio_job_section+=("sqthread_poll_cpu=$(get_poll_thread "${cores_numa[i]}" "$i")")
		fi

		while [[ "$m" -lt "$total_disks_per_core" ]]; do
@@ -285,6 +335,7 @@ function create_fio_config() {
					fio_job_section+=("[filename${m}-${cores[$i]}]")
					fio_job_section+=("iodepth=$QD")
					fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
					fio_job_section+=("sqthread_poll_cpu=$(get_poll_thread "${cores_numa[i]}" "$i")")
				fi

				if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
@@ -292,7 +343,7 @@ function create_fio_config() {
				elif [[ "$plugin" == "spdk-plugin-bdev" || "$plugin" == "spdk-plugin-bdev-xnvme" ]]; then
					fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" =~ "kernel" ]]; then
					fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
					fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]} ($FIO_FNAME_STRATEGY)")
				fi
				m=$((m + 1))