Commit 45c42ac2 authored by Michal Berger, committed by Jim Harris

scripts: Use core dump collector



This is done in order to track core dumps in a more efficient
manner. Until now, some cores could be missed if the binary was
executed outside of the cwd of the autotest (i.e. outside of
the spdk repo) but was part of the critical path of the actual
test (e.g. fio in vhost-initiator tests). Also, since
core_pattern was set to plain "core", the impact on the
underlying storage wasn't controlled either - if a core was 20G
in size, that is what we would get. This could easily exhaust
storage when error-prone patchsets were submitted on the CI side.

The collector will try to mitigate all the above by doing the
following:

  - collecting all the cores, regardless of their cwd
  - limiting size of the core to 2G
  - compressing the cores (gzip)

Also, a limit of 2 collectors executing at once is set - if more
processes crash at approximately the same time, the excess
crashes will only be noted in the kernel log instead.
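
As a rough sketch of the mechanism this builds on (paths purely
illustrative, not the autotest's actual ones): a pipe-style
core_pattern makes the kernel execute the given helper for every
crash, streaming the core image to the helper's stdin, regardless
of the crashing process's cwd:

  # install a hypothetical collector and cap concurrent helpers at 2
  echo '|/usr/local/bin/core-collector.sh %P %s %t /var/crash' > /proc/sys/kernel/core_pattern
  echo 2 > /proc/sys/kernel/core_pipe_limit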

Signed-off-by: Michal Berger <michalx.berger@intel.com>
Change-Id: I5956a9030c463ae85a21bfe95f28af5568c5c285
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/5369


Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
parent d8211848
+3 −1
@@ -30,8 +30,10 @@ fi

 if [ $(uname -s) = Linux ]; then
 	old_core_pattern=$(< /proc/sys/kernel/core_pattern)
+	mkdir -p "$output_dir/coredumps"
 	# set core_pattern to a known value to avoid ABRT, systemd-coredump, etc.
-	echo "core" > /proc/sys/kernel/core_pattern
+	echo "|$rootdir/scripts/core-collector.sh %P %s %t $output_dir/coredumps" > /proc/sys/kernel/core_pattern
+	echo 2 > /proc/sys/kernel/core_pipe_limit
 
 	# Make sure that the hugepage state for our VM is fresh so we don't fail
 	# hugepage allocation. Allow time for this action to complete.
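
For reference, in the core(5) template language used above, %P expands to the
global PID of the dumping process, %s to the number of the signal that caused
the dump, and %t to the time of the dump as a UNIX epoch, while the leading
"|" tells the kernel to execute the command and stream the core image to its
stdin. A SIGSEGV crash of PID 1234 could thus result in an invocation like
this (paths hypothetical):

  /path/to/spdk/scripts/core-collector.sh 1234 11 1600000000 /path/to/output/coredumps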
+89 −0
#!/usr/bin/env bash
# We don't want to tell the kernel to include %e or %E since these
# may contain whitespace or other funny characters, and working
# with those on the cmdline would be a nightmare. Use procfs for
# the remaining pieces we want to gather:
# |$rootdir/scripts/core-collector.sh %P %s %t $output_dir

get_rlimit() {
	local limit

	while read -ra limit; do
		[[ ${limit[1]} == core ]] && echo "${limit[4]}" # soft
	done < "/proc/$core_pid/limits"
}
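
# For reference, the "core" line matched above in /proc/<pid>/limits looks
# like this (columns are whitespace-separated, so ${limit[4]} is the soft
# limit):
#   Max core file size    unlimited   unlimited   bytes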

core_meta() {
	jq . <<- CORE
		{
		  "$exe_comm": {
		    "ts": "$core_time",
		    "size": "$core_size bytes",
		    "PID": $core_pid,
		    "signal": "$core_sig ($core_sig_name)",
		    "path": "$exe_path",
		    "statm": "$statm"
		  }
		}
	CORE
}
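
# Example of what core_meta() above may emit (values hypothetical):
# {
#   "my_app": {
#     "ts": "Tue Sep 15 12:00:00 UTC 2020",
#     "size": "2147483648 bytes",
#     "PID": 1234,
#     "signal": "11 (SEGV)",
#     "path": "/usr/local/bin/my_app",
#     "statm": "544 94 66 2 0 71 0"
#   }
# }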

# Dump a full backtrace from every thread, provided gdb is available
bt() { hash gdb && gdb -batch -ex "thread apply all bt full" "$1" "$2" 2>&1; }

stderr() {
	# Redirect all further stderr (including the set -x trace) to a
	# per-core log file
	exec 2> "$core.stderr.txt"
	set -x
}

args+=(core_pid)
args+=(core_sig)
args+=(core_ts)
args+=(output_dir)

read -r "${args[@]}" <<< "$*"
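# The read above maps the positional arguments passed by the kernel
# (%P %s %t, plus the output directory appended in core_pattern) onto
# the named variables from the args array.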

exe_path=$(readlink -f "/proc/$core_pid/exe")
exe_comm=$(< "/proc/$core_pid/comm")
statm=$(< "/proc/$core_pid/statm")
core_time=$(date -d@"$core_ts")
core_sig_name=$(kill -l "$core_sig")

core=$output_dir/${exe_path##*/}_$core_pid.core
stderr

# RLIMIT_CORE is not enforced when core is piped to us. To make
# sure we won't attempt to overload underlying storage, copy
# only the reasonable amount of bytes (systemd defaults to 2G
# so let's follow that). But first, check limits of terminating
# process to see if we need to make any adjustments.
max_core=$((1024 * 1024 * 1024 * 2))

rlimit=$(get_rlimit)
if [[ $rlimit == unlimited ]] || ((rlimit > max_core)); then
	rlimit=$max_core
fi

# Nothing to do
((rlimit == 0)) && exit 0

# Clear out any leftover artifacts from a previous dump of this PID
rm -f "$core"{,.{bin,bt.txt,gz,json}}

# Slurp the core
head -c "$rlimit" <&0 > "$core"
core_size=$(wc -c < "$core")

# Compress it
gzip -c "$core" > "$core.gz"

# Save the binary
cp "$exe_path" "$core.bin"

# Save the backtrace
bt "$exe_path" "$core" > "$core.bt.txt"

# Save the metadata of the core
core_meta > "$core.json"

# Nuke the original core
rm "$core"
+30 −18
@@ -600,24 +600,36 @@ function gdb_attach() {
 }
 
 function process_core() {
-	ret=0
-	while IFS= read -r -d '' core; do
-		exe=$(eu-readelf -n "$core" | grep psargs | sed "s/.*psargs: \([^ \'\" ]*\).*/\1/")
-		if [[ ! -f "$exe" ]]; then
-			exe=$(eu-readelf -n "$core" | grep -oP -m1 "$exe.+")
-		fi
-		echo "exe for $core is $exe"
-		if [[ -n "$exe" ]]; then
-			if hash gdb &> /dev/null; then
-				gdb -batch -ex "thread apply all bt full" $exe $core
-			fi
-			cp $exe $output_dir
-		fi
-		mv $core $output_dir
-		chmod a+r $output_dir/$core
-		ret=1
-	done < <(find . -type f \( -name 'core.[0-9]*' -o -name 'core' -o -name '*.core' \) -print0)
-	return $ret
+	# Note that this has always been racy, as we can't really sync with the
+	# kernel to see if there's any core still queued up for writing. We
+	# could check if the collector is running and wait for it explicitly,
+	# but it doesn't seem to be worth the effort. So assume that if we are
+	# called via trap, i.e. when some error has occurred, we should wait up
+	# to 5s for any potential cores. If we are called just for cleanup at
+	# the very end, don't wait, since all the tests ended successfully and
+	# having any critical cores lying around is unlikely.
+	local es=$?
+	((es != 0)) && sleep 5s
+
+	local coredumps core
+
+	shopt -s nullglob
+	coredumps=("$output_dir/coredumps/"*.bt.txt)
+	shopt -u nullglob
+
+	((${#coredumps[@]} > 0)) || return 0
+	chmod -R a+r "$output_dir/coredumps"
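+
+	# Each saved backtrace is echoed into the test log framed by a banner,
+	# e.g. (file name hypothetical):
+	#   ##### CORE BT my_app_1234.core.bt.txt #####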
+
+	for core in "${coredumps[@]}"; do
+		cat <<- BT
+			##### CORE BT ${core##*/} #####
+
+			$(<"$core")
+
+			--
+		BT
+	done
+	return 1
 }
 
 function process_shm() {