Commit 8ebaed2e authored by Karol Latecki's avatar Karol Latecki Committed by Daniel Verkamp
Browse files

test/vhost: live migration test case 3



To be run manually only, as the test pool does not
have DUTs connected in pairs.

Change-Id: If3ab3e671156b583adb35d5b23ee90003a6de732
Signed-off-by: default avatarKarol Latecki <karol.latecki@intel.com>
Reviewed-on: https://review.gerrithub.io/400988


Reviewed-by: default avatarPawel Wodkowski <pawelx.wodkowski@intel.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Tested-by: default avatarSPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: default avatarDaniel Verkamp <daniel.verkamp@intel.com>
parent 0d1c3aef
Loading
Loading
Loading
Loading
+7 −3
Original line number Diff line number Diff line
@@ -220,8 +220,9 @@ function spdk_vhost_kill()
			rm $vhost_pid_file
			rc=1
		else
			#check vhost return code, activate trap on error
			wait $vhost_pid
			while kill -0 $vhost_pid; do
				echo "."
			done
		fi
	elif /bin/kill -0 $vhost_pid; then
		error "vhost NOT killed - you need to kill it manually"
@@ -294,9 +295,12 @@ function vm_create_ssh_config()
		echo "  UserKnownHostsFile=/dev/null"
		echo "  StrictHostKeyChecking=no"
		echo "  User root"
		echo "  ControlPath=$VM_BASE_DIR/%r@%h:%p.ssh"
		echo "  ControlPath=/tmp/%r@%h:%p.ssh"
		echo ""
		) > $ssh_config
		# Control path created at /tmp because of live migration test case 3.
		# In case of using sshfs share for the test - control path cannot be
		# on share because remote server will fail on ssh commands.
	fi
}

+14 −0
Original line number Diff line number Diff line
# CPU affinity settings for live migration test case 3.
# Both vhost instances and all VM QEMU processes are pinned to core 0.
vhost_0_reactor_mask=0x1
vhost_0_master_core=0

vhost_1_reactor_mask=0x1
vhost_1_master_core=0

VM_0_qemu_mask=0x1
VM_0_qemu_numa_node=0

VM_1_qemu_mask=0x1
VM_1_qemu_numa_node=0

VM_2_qemu_mask=0x1
VM_2_qemu_numa_node=0
+20 −0
Original line number Diff line number Diff line
# fio job used inside the VM during live migration test case 3.
[global]
blocksize=4k-512k
iodepth=128
ioengine=libaio
# Target device is filled in at runtime by the test script.
filename=
group_reporting
thread
numjobs=1
direct=1
# Verify written data with md5 and fail hard (with a dump) on the first
# mismatch, so any corruption introduced by the migration is caught.
do_verify=1
verify=md5
verify_fatal=1
verify_dump=1
verify_backlog=8

# Random-write workload kept running while the VM is migrated.
[randwrite]
rw=randwrite
runtime=15
time_based
stonewall
+206 −0
Original line number Diff line number Diff line
# Live migration test case 3, part A - runs on the source (local) host.
source $SPDK_BUILD_DIR/test/nvmf/common.sh
source $BASE_DIR/autotest.config

# Management-network IPs of the two hosts (hard-coded; this test is run
# manually on a known pair of machines).
MGMT_TARGET_IP="10.102.17.181"
MGMT_INITIATOR_IP="10.102.17.180"
# RDMA-capable NIC IPs used for NVMe-oF traffic and the migration stream.
RDMA_TARGET_IP="10.0.0.1"
RDMA_INITIATOR_IP="10.0.0.2"
# VM 1 runs locally and is live-migrated into VM 2 on the remote host.
incoming_vm=1
target_vm=2
incoming_vm_ctrlr=naa.VhostScsi0.$incoming_vm
target_vm_ctrlr=naa.VhostScsi0.$target_vm
# Directory shared between the hosts (remote mounts it over sshfs).
share_dir=$TEST_DIR/share
job_file=$BASE_DIR/migration-tc3.job

function ssh_remote()
{
	# Execute a command on the given host as root over ssh.
	# $1 - target IP/hostname; remaining arguments - command to run.
	local host=$1
	shift

	ssh -i $SPDK_VHOST_SSH_KEY_FILE \
		-o UserKnownHostsFile=/dev/null \
		-o StrictHostKeyChecking=no \
		-o ControlMaster=auto \
		root@$host "$@"
}

function wait_for_remote()
{
	# Block until the remote side drops its DONE flag file into the shared
	# directory, polling once per second. Gives up via error() after ~40
	# seconds. The flag file is consumed on success so it can be re-used.
	local retries=40

	set +x
	until [[ -f $share_dir/DONE ]]; do
		echo -n "."
		if (( retries-- == 0 )); then
			error "timeout while waiting for FIO!"
		fi
		sleep 1
	done
	set -x

	rm -f $share_dir/DONE
}

function check_rdma_connection()
{
	# Verify that $RDMA_TARGET_IP is configured on a local NIC and that
	# this NIC is backed by an RDMA-capable (infiniband-class) device.
	local nic_name
	nic_name=$(ip -4 -o addr show to $RDMA_TARGET_IP up | cut -d' ' -f2)

	if [[ -z $nic_name ]]; then
		error "There is no NIC with IP address $RDMA_TARGET_IP configured"
	fi

	# An RDMA NIC exposes its net device under /sys/class/infiniband.
	ls /sys/class/infiniband/*/device/net/$nic_name &> /dev/null \
		|| error "$nic_name with IP $RDMA_TARGET_IP is not a RDMA capable NIC"
}

function host1_cleanup_nvmf()
{
	# Stop the local nvmf_tgt process identified by its pid file.
	# $1 (optional) - signal to deliver via pkill (e.g. SIGKILL);
	# pkill's default (SIGTERM) is used when omitted.
	local sig=$1

	notice "Shutting down nvmf_tgt on local server"
	if [[ -n "$sig" ]]; then
		pkill --signal $sig -F $nvmf_dir/nvmf_tgt.pid
	else
		pkill -F $nvmf_dir/nvmf_tgt.pid
	fi

	rm -f $nvmf_dir/nvmf_tgt.pid
}

function host1_cleanup_vhost()
{
	# Tear down the local vhost side in reverse order of setup: kill the
	# VM, remove the NVMe bdev and vhost-scsi controller, stop vhost, then
	# stop nvmf_tgt. On error/interrupt fall back to force-killing nvmf_tgt.
	trap 'host1_cleanup_nvmf SIGKILL; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
	notice "Shutting down VM $incoming_vm"
	vm_kill $incoming_vm

	notice "Removing bdev & controller from vhost on local server"
	$rpc_0 delete_bdev Nvme0n1
	$rpc_0 remove_vhost_controller $incoming_vm_ctrlr

	notice "Shutting down vhost app"
	spdk_vhost_kill 0

	host1_cleanup_nvmf
}

function host1_start_nvmf()
{
	# Launch a local nvmf_tgt exposing the NVMe drive over RDMA so both
	# the local and the remote vhost can attach to the same storage.
	# Sets globals used elsewhere: nvmf_dir, rpc_nvmf, nvmf_tgt_pid.
	nvmf_dir="$TEST_DIR/nvmf_tgt"
	rpc_nvmf="python $SPDK_BUILD_DIR/scripts/rpc.py -s $nvmf_dir/nvmf_rpc.sock"

	notice "Starting nvmf_tgt instance on local server"
	mkdir -p $nvmf_dir
	rm -rf $nvmf_dir/*

	# Base config plus an auto-generated section for the local NVMe device.
	cp $SPDK_BUILD_DIR/test/nvmf/nvmf.conf $nvmf_dir/nvmf.conf
	$SPDK_BUILD_DIR/scripts/gen_nvme.sh >> $nvmf_dir/nvmf.conf

	trap 'host1_cleanup_nvmf SIGKILL; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
	$SPDK_BUILD_DIR/app/nvmf_tgt/nvmf_tgt -s 512 -c $nvmf_dir/nvmf.conf -r $nvmf_dir/nvmf_rpc.sock &
	nvmf_tgt_pid=$!
	# Record the pid so the cleanup helpers can kill the target later.
	echo $nvmf_tgt_pid > $nvmf_dir/nvmf_tgt.pid
	waitforlisten "$nvmf_tgt_pid" "$nvmf_dir/nvmf_rpc.sock"

	# Export Nvme0n1 as an RDMA subsystem reachable from the initiator host.
	$rpc_nvmf construct_nvmf_subsystem nqn.2018-02.io.spdk:cnode1 \
		"trtype:RDMA traddr:$RDMA_TARGET_IP trsvcid:4420" "" -a -s SPDK01 -n Nvme0n1
}

function host1_start_vhost()
{
	# Start local vhost instance 0, attach the NVMe-oF namespace as a
	# vhost-scsi LUN, and boot the VM that will later be migrated away.
	rpc_0="python $SPDK_BUILD_DIR/scripts/rpc.py -s $(get_vhost_dir 0)/rpc.sock"

	notice "Starting vhost0 instance on local server"
	trap 'host1_cleanup_vhost; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
	spdk_vhost_run --conf-path=$BASE_DIR --vhost-num=0
	$rpc_0 construct_nvme_bdev -b Nvme0 -t rdma -f ipv4 -a $RDMA_TARGET_IP -s 4420 -n "nqn.2018-02.io.spdk:cnode1"
	$rpc_0 construct_vhost_scsi_controller $incoming_vm_ctrlr
	$rpc_0 add_vhost_scsi_lun $incoming_vm_ctrlr 0 Nvme0n1

	# Boot from the shared qcow2 image so the remote host can start an
	# identical VM; --migrate-to wires up the migration port.
	vm_setup --os="$share_dir/migration.qcow2" --force=$incoming_vm --disk-type=spdk_vhost_scsi --disks=VhostScsi0 \
		--migrate-to=$target_vm --memory=512 --queue_num=1

	# TODO: Fix loop calculating cpu_num in common.sh
	# We need -smp 1 and -queue_num 1 for this test to work, and this loop
	# in some cases calculates wrong cpu_num.
	sed -i "s#smp 2#smp 1#g" $VM_BASE_DIR/$incoming_vm/run.sh
	vm_run $incoming_vm
	vm_wait_for_boot 300 $incoming_vm
}

function cleanup_share()
{
	# Best-effort removal of the sshfs mounts and shared files on both
	# hosts; errors are tolerated (mounts/files may already be gone),
	# hence the set +e / set -e bracketing.
	set +e
	notice "Cleaning up share directory on remote and local server"
	ssh_remote $MGMT_INITIATOR_IP "umount $VM_BASE_DIR"
	ssh_remote $MGMT_INITIATOR_IP "umount $share_dir; rm -f $share_dir/*"
	rm -f $share_dir/migration.qcow2
	rm -f $share_dir/spdk.tar.gz
	set -e
}

function host_1_create_share()
{
	# Stage everything the remote host needs into $share_dir: the VM
	# image and a tarball of the SPDK sources (remote rebuilds from it).
	notice "Creating share directory on local server to re-use on remote"
	mkdir -p $share_dir
	mkdir -p $VM_BASE_DIR # This dir would've been created later but we need it now
	rm -rf $share_dir/spdk.tar.gz $share_dir/spdk || true
	cp $os_image $share_dir/migration.qcow2
	# Fix: the original had '--exclude="*.o"--exclude="*.d"' with no space,
	# which fused the two options into one bogus pattern, so object and
	# dependency files were not actually excluded from the tarball.
	tar --exclude="*.o" --exclude="*.d" --exclude="*.git" -C $SPDK_BUILD_DIR -zcf $share_dir/spdk.tar.gz .
}

function host_2_create_share()
{
	# Copy & compile the sources for later use on remote server.
	# The remote host mounts this host's VM dir and share dir over sshfs,
	# unpacks the tarball produced by host_1_create_share and builds SPDK.
	ssh_remote $MGMT_INITIATOR_IP "uname -a"
	ssh_remote $MGMT_INITIATOR_IP "mkdir -p $share_dir"
	ssh_remote $MGMT_INITIATOR_IP "mkdir -p $VM_BASE_DIR"
	ssh_remote $MGMT_INITIATOR_IP "sshfs -o ssh_command=\"ssh -i $SPDK_VHOST_SSH_KEY_FILE\" root@$MGMT_TARGET_IP:$VM_BASE_DIR $VM_BASE_DIR"
	ssh_remote $MGMT_INITIATOR_IP "sshfs -o ssh_command=\"ssh -i $SPDK_VHOST_SSH_KEY_FILE\" root@$MGMT_TARGET_IP:$share_dir $share_dir"
	ssh_remote $MGMT_INITIATOR_IP "mkdir -p $share_dir/spdk"
	ssh_remote $MGMT_INITIATOR_IP "tar -zxf $share_dir/spdk.tar.gz -C $share_dir/spdk --strip-components=1"
	ssh_remote $MGMT_INITIATOR_IP "cd $share_dir/spdk; make clean; ./configure --with-rdma --enable-debug; make -j40"
}

function host_2_start_vhost()
{
	# Launch migration-tc3b.sh on the remote host in the background and
	# wait for its DONE flag file; the remote script suspends itself once
	# its vhost instance and target VM are ready.
	ssh_remote $MGMT_INITIATOR_IP "nohup $share_dir/spdk/test/vhost/migration/migration.sh --test-cases=3b --work-dir=$TEST_DIR --os=$share_dir/migration.qcow2 &>$share_dir/output.log &"
	notice "Waiting for remote to be done with vhost & VM setup..."
	wait_for_remote
}

function setup_share()
{
	# Prepare the shared directory on both hosts; on any failure tear the
	# share back down before reporting the error.
	trap 'cleanup_share; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
	host_1_create_share
	host_2_create_share
}

function migration_tc3()
{
	# Main flow of test case 3: verify the RDMA link, stage shared files,
	# bring up nvmf target + vhost + VM locally, start vhost + VM on the
	# remote host, then live-migrate the VM while fio is running in it.
	check_rdma_connection
	setup_share
	host1_start_nvmf
	host1_start_vhost
	host_2_start_vhost

	# Do migration
	notice "Starting fio on local VM"
	vm_check_scsi_location $incoming_vm

	run_fio $fio_bin --job-file="$job_file" --local --vm="${incoming_vm}$(printf ':/dev/%s' $SCSI_DISK)"
	sleep 5

	if ! is_fio_running $incoming_vm; then
		# Fix: was 'vh_ssh', which is not a helper defined anywhere in this
		# suite - 'vm_ssh' is what every other call site uses. Dump the fio
		# log from the VM to aid debugging before failing.
		vm_ssh $incoming_vm "cat /root/$(basename ${job_file}).out"
		error "Fio not running on local VM before starting migration!"
	fi

	vm_migrate $incoming_vm $RDMA_INITIATOR_IP
	sleep 1

	# Verify migration on remote host and clean up vhost.
	# SIGCONT resumes the remote tc3b script, which suspended itself
	# after finishing its setup (see migration-tc3b.sh).
	ssh_remote $MGMT_INITIATOR_IP "pkill -CONT -F $TEST_DIR/tc3b.pid"
	notice "Waiting for remote to finish FIO on VM and clean up..."
	wait_for_remote

	# Clean up local stuff
	host1_cleanup_vhost
	cleanup_share
}

migration_tc3
+80 −0
Original line number Diff line number Diff line
# Live migration test case 3, part B - runs on the remote (destination) host.
# Set -m option is needed to be able to use "suspend" command
# as we are using a non-interactive session to connect to remote.
# Without -m it would be not possible to suspend the process.
set -m
source $BASE_DIR/autotest.config

# RDMA NIC IP of the source host, which serves the shared NVMe-oF namespace.
RDMA_TARGET_IP="10.0.0.1"
# VM 1 migrates in from the source host; VM 2 is the incoming target here.
incoming_vm=1
target_vm=2
target_vm_ctrl=naa.VhostScsi0.$target_vm
rpc="python $SPDK_BUILD_DIR/scripts/rpc.py -s $(get_vhost_dir 1)/rpc.sock"
# Directory shared with the source host; used for the DONE flag files.
share_dir=$TEST_DIR/share

function host_2_cleanup_vhost()
{
	# Tear down in reverse order of setup: VM first, then the bdev and
	# vhost-scsi controller, then the vhost app itself.
	notice "Shutting down VM $target_vm"
	vm_kill $target_vm

	notice "Removing bdev & controller from vhost 1 on remote server"
	$rpc delete_bdev Nvme0n1
	$rpc remove_vhost_controller $target_vm_ctrl

	notice "Shutting down vhost app"
	spdk_vhost_kill 1
	sleep 1
}

function host_2_start_vhost()
{
	# Start vhost instance 1 on this host, attach the same NVMe-oF
	# namespace the source host uses, and launch the target VM in
	# "incoming" mode so it waits for the live migration stream.
	echo "BASE DIR $TEST_DIR"
	vhost_work_dir=$TEST_DIR/vhost1
	mkdir -p $vhost_work_dir
	rm -f $vhost_work_dir/*

	notice "Starting vhost 1 instance on remote server"
	trap 'host_2_cleanup_vhost; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
	spdk_vhost_run --conf-path=$BASE_DIR --vhost-num=1

	$rpc construct_nvme_bdev -b Nvme0 -t rdma -f ipv4 -a $RDMA_TARGET_IP -s 4420 -n "nqn.2018-02.io.spdk:cnode1"
	$rpc construct_vhost_scsi_controller $target_vm_ctrl
	$rpc add_vhost_scsi_lun $target_vm_ctrl 0 Nvme0n1

	vm_setup --os="$os_image" --force=$target_vm --disk-type=spdk_vhost_scsi --disks=VhostScsi0 \
		--memory=512 --vhost-num=1 --incoming=$incoming_vm
	vm_run $target_vm
	sleep 1

	# Use this file as a flag to notify main script
	# that setup on remote server is done
	echo "DONE" > $share_dir/DONE
}

# Record our PID so the source host can resume us with SIGCONT, set up the
# vhost instance and incoming VM, then suspend until migration has begun.
echo $$ > $TEST_DIR/tc3b.pid
host_2_start_vhost
suspend -f

# We are resumed by the source host after vm_migrate - verify the VM
# arrived here and that fio kept running through the migration.
if ! vm_os_booted $target_vm; then
	fail "VM$target_vm is not running!"
fi

if ! is_fio_running $target_vm; then
	vm_ssh $target_vm "cat /root/migration-tc3.job.out"
	error "FIO is not running on remote server after migration!"
fi

notice "Waiting for FIO to finish on remote server VM"
# Poll once per second; give up after ~40s (the fio job runtime is 15s).
timeout=40
while is_fio_running $target_vm; do
	sleep 1
	echo -n "."
	if (( timeout-- == 0 )); then
		error "timeout while waiting for FIO!"
	fi
done

notice "FIO result after migration:"
vm_ssh $target_vm "cat /root/migration-tc3.job.out"

# Clean up and signal the source host that we are done.
host_2_cleanup_vhost
echo "DONE" > $share_dir/DONE
Loading