Commit e44d6317 authored by sijie.sun, committed by Tomasz Zawadzki

nvme_rdma: handle DEVICE_REMOVAL event in RDMA initiator



When IBV_EVENT_DEVICE_FATAL or RDMA_CM_EVENT_DEVICE_REMOVAL occurs,
destroy the qpair immediately and do not assume that no successful WQE
will be received after rdma_disconnect.
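
For context, a minimal sketch (not SPDK's actual code) of how an RDMA initiator observes this event on the librdmacm event channel; handle_removal() is a hypothetical hook standing in for the need_destroy logic added by this commit:

#include <rdma/rdma_cma.h>

/* Drain the CM event channel; blocks until the channel errors out.
 * handle_removal() is a hypothetical callback for this sketch. */
static int
poll_cm_events(struct rdma_event_channel *channel,
	       void (*handle_removal)(struct rdma_cm_id *id))
{
	struct rdma_cm_event *event;

	while (rdma_get_cm_event(channel, &event) == 0) {
		if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
			/* The device is going away: tear the connection down
			 * immediately rather than waiting for flush completions. */
			handle_removal(event->id);
		}
		rdma_ack_cm_event(event);
	}
	return -1;
}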

Signed-off-by: sijie.sun <sijie.sun@smartx.com>
Change-Id: I23e44dd32c8adea301e5251659b1be519f5dfdf7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16314


Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Community-CI: Mellanox Build Bot
parent 549be9ad
+27 −11
@@ -244,6 +244,7 @@ struct nvme_rdma_qpair {
	bool					in_connect_poll;

	uint8_t					stale_conn_retry_count;
	bool					need_destroy;
};

enum NVME_RDMA_COMPLETION_FLAGS {
@@ -509,6 +510,7 @@ nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair)
			break;
		case RDMA_CM_EVENT_DEVICE_REMOVAL:
			rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
			rqpair->need_destroy = true;
			break;
		case RDMA_CM_EVENT_MULTICAST_JOIN:
		case RDMA_CM_EVENT_MULTICAST_ERROR:
@@ -1889,9 +1891,6 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)
			spdk_rdma_qp_destroy(rqpair->rdma_qp);
			rqpair->rdma_qp = NULL;
		}

		rdma_destroy_id(rqpair->cm_id);
		rqpair->cm_id = NULL;
	}

	if (rqpair->poller) {
@@ -1916,6 +1915,12 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)
	nvme_rdma_free_reqs(rqpair);
	nvme_rdma_free_rsps(rqpair->rsps);
	rqpair->rsps = NULL;

	/* destroy cm_id last so cma device will not be freed before we destroy the cq. */
	if (rqpair->cm_id) {
		rdma_destroy_id(rqpair->cm_id);
		rqpair->cm_id = NULL;
	}
}

static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
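
The hunk above moves rdma_destroy_id() to the end of nvme_rdma_qpair_destroy() because the cm_id holds a reference to the CMA device; destroying it first can free the device while the CQ still needs it. A standalone sketch of the required order, using plain librdmacm/libibverbs calls rather than SPDK's wrappers:

#include <rdma/rdma_cma.h>
#include <infiniband/verbs.h>

/* Sketch: tear down in QP -> CQ -> cm_id order. */
static void
teardown_connection(struct rdma_cm_id *cm_id, struct ibv_cq *cq)
{
	if (cm_id->qp != NULL) {
		rdma_destroy_qp(cm_id);	/* QP first */
	}
	if (cq != NULL) {
		ibv_destroy_cq(cq);	/* CQ before the cm_id... */
	}
	/* ...and the cm_id last, so the CMA device outlives the CQ. */
	rdma_destroy_id(cm_id);
}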
@@ -1941,8 +1946,9 @@ nvme_rdma_qpair_disconnected(struct nvme_rdma_qpair *rqpair, int ret)
		goto quiet;
	}

	if (rqpair->current_num_sends != 0 ||
	    (!rqpair->srq && rqpair->rsps->current_num_recvs != 0)) {
	if (rqpair->need_destroy ||
	    (rqpair->current_num_sends != 0 ||
	     (!rqpair->srq && rqpair->rsps->current_num_recvs != 0))) {
		rqpair->state = NVME_RDMA_QPAIR_STATE_LINGERING;
		rqpair->evt_timeout_ticks = (NVME_RDMA_DISCONNECTED_QPAIR_TIMEOUT_US * spdk_get_ticks_hz()) /
					    SPDK_SEC_TO_USEC + spdk_get_ticks();
@@ -2570,13 +2576,13 @@ nvme_rdma_process_send_completion(struct nvme_rdma_poller *poller,
	struct spdk_nvme_rdma_req	*rdma_req;

	rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr);

	/* If we are flushing I/O */
	if (wc->status) {
	rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL;
	if (!rqpair) {
		rqpair = rdma_qpair != NULL ? rdma_qpair : get_rdma_qpair_from_wc(poller->group, wc);
	}

	/* If we are flushing I/O */
	if (wc->status) {
		if (!rqpair) {
			/* When poll_group is used, several qpairs share the same CQ and it is possible to
			 * receive a completion with error (e.g. IBV_WC_WR_FLUSH_ERR) for already disconnected qpair
@@ -2598,9 +2604,19 @@ nvme_rdma_process_send_completion(struct nvme_rdma_poller *poller,
	/* We do not support Soft Roce anymore. Other than Soft Roce's bug, we should not
	 * receive a completion without error status after qpair is disconnected/destroyed.
	 */
	assert(rdma_req->req != NULL);
	if (spdk_unlikely(rdma_req->req == NULL)) {
		/*
		 * Some InfiniBand drivers do not guarantee the previous assumption after we
		 * receive an RDMA_CM_EVENT_DEVICE_REMOVAL event.
		 */
		SPDK_ERRLOG("Received malformed completion: request 0x%"PRIx64" type %d\n", wc->wr_id,
			    rdma_wr->type);
		if (!rqpair || !rqpair->need_destroy) {
			assert(0);
		}
		return -ENXIO;
	}

	rqpair = nvme_rdma_qpair(rdma_req->req->qpair);
	rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED;
	assert(rqpair->current_num_sends > 0);
	rqpair->current_num_sends--;
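
To illustrate the defensive path added above: after a device-removal event some drivers can deliver a success completion whose request was already released, so the wr_id must be validated before use. A hedged sketch with hypothetical bookkeeping (send_ctx is not an SPDK type):

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <infiniband/verbs.h>

/* Hypothetical per-send bookkeeping, for this sketch only. */
struct send_ctx {
	void	*req;		/* NULL once the request has been released */
	bool	need_destroy;	/* set when RDMA_CM_EVENT_DEVICE_REMOVAL was seen */
};

static int
process_send_completion(struct ibv_wc *wc)
{
	struct send_ctx *ctx = (struct send_ctx *)(uintptr_t)wc->wr_id;

	if (ctx->req == NULL) {
		/* A completion for a released request is tolerable only while
		 * the device is being removed; otherwise it is a driver bug. */
		assert(ctx->need_destroy);
		return -ENXIO;
	}
	/* normal completion accounting would follow here */
	return 0;
}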
+56 −17
@@ -10,6 +10,11 @@ source $rootdir/test/setup/common.sh
source $rootdir/test/common/autotest_common.sh
source $rootdir/test/nvmf/common.sh

tgt_core_mask='0x3'
bdevperf_core_mask='0x4'
bdevperf_rpc_sock=/var/tmp/bdevperf.sock
bdevperf_rpc_pid=-1

nvmftestinit

function get_subsystem_nqn() {
@@ -33,15 +38,6 @@ function create_subsystem_and_connect_on_netdev() {
	$rpc_py nvmf_subsystem_add_ns $nqn $malloc_name
	$rpc_py nvmf_subsystem_add_listener $nqn -t $TEST_TRANSPORT -a $ip -s $NVMF_PORT

	if ! nvme connect -t $TEST_TRANSPORT -n $nqn -a $ip -s $NVMF_PORT; then
		exit 1
	fi

	waitforserial "$serial"
	nvme_name=$(lsblk -l -o NAME,SERIAL | grep -oP "([\w]*)(?=\s+${serial})")
	nvme_size=$(sec_size_to_bytes $nvme_name)

	echo "${nvme_name}"
	return 0
}

@@ -87,16 +83,56 @@ function get_rdma_dev_count_in_nvmf_tgt() {
	$rpc_py nvmf_get_stats | jq -r '.poll_groups[0].transports[].devices | length'
}

function generate_io_traffic_with_bdevperf() {
	local dev_names=("$@")

	mkdir -p $testdir
	$rootdir/build/examples/bdevperf -m $bdevperf_core_mask -z -r $bdevperf_rpc_sock -q 128 -o 4096 -w verify -t 90 &> $testdir/try.txt &
	bdevperf_pid=$!

	trap 'process_shm --id $NVMF_APP_SHM_ID; cat $testdir/try.txt; rm -f $testdir/try.txt; kill -9 $bdevperf_pid; nvmftestfini; exit 1' SIGINT SIGTERM EXIT
	waitforlisten $bdevperf_pid $bdevperf_rpc_sock

	# Create a controller and set multipath behavior
	# Setting bdev_retry_count to -1 means infinite reconnect retries
	$rpc_py -s $bdevperf_rpc_sock bdev_nvme_set_options -r -1

	for dev_name in "${dev_names[@]}"; do
		nqn=$(get_subsystem_nqn $dev_name)
		tgt_ip=$(get_ip_address "$dev_name")

		# -l -1: ctrlr_loss_timeout_sec of -1 means reconnect indefinitely
		# -o 1: reconnect_delay_sec of 1 limits the delay between reconnect retries to 1 second
		$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b Nvme_$dev_name -t $TEST_TRANSPORT -a $tgt_ip -s $NVMF_PORT -f ipv4 -n $nqn -l -1 -o 1
	done

	$rootdir/examples/bdev/bdevperf/bdevperf.py -t 120 -s $bdevperf_rpc_sock perform_tests &
	bdevperf_rpc_pid=$!

	sleep 5
}

function stop_bdevperf() {
	wait $bdevperf_rpc_pid

	# NOTE: rdma-core <= v43.0 has a memory-leak bug (fixed in commit 7720071f).
	killprocess $bdevperf_pid || true
	bdevperf_pid=

	cat $testdir/try.txt

	trap - SIGINT SIGTERM EXIT
	rm -f $testdir/try.txt
}

function test_remove_and_rescan() {
	nvmfappstart -m 0xF
	nvmfappstart -m "$tgt_core_mask"

	create_subsystem_and_connect "$@"

	for net_dev in "${!netdev_nvme_dict[@]}"; do
		$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 40 &
		fio_pid=$!
		sleep 3
	generate_io_traffic_with_bdevperf "${!netdev_nvme_dict[@]}"

	for net_dev in "${!netdev_nvme_dict[@]}"; do
		nvme_dev=${netdev_nvme_dict[$net_dev]}
		rdma_dev_name=$(get_rdma_device_name $net_dev)
		origin_ip=$(get_ip_address "$net_dev")
@@ -162,6 +198,8 @@ function test_remove_and_rescan() {
		done
	done

	stop_bdevperf

	# NOTE: rdma-core <= v43.0 has a memory-leak bug (fixed in commit 7720071f).
	killprocess $nvmfpid || true
	nvmfpid=
@@ -229,7 +267,7 @@ function test_bonding_slaves_on_nics() {
	# wait for the IB driver to become active on the bond device
	sleep 5

	nvmfappstart -m 0xF
	nvmfappstart -m "$tgt_core_mask"
	$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192

	create_subsystem_and_connect_on_netdev $BOND_NAME
@@ -237,8 +275,7 @@ function test_bonding_slaves_on_nics() {
	ib_count=$(get_rdma_dev_count_in_nvmf_tgt)
	echo "IB Count: " $ib_count

	$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 10 &
	fio_pid=$!
	generate_io_traffic_with_bdevperf $BOND_NAME

	sleep 2
	echo -$nic1 | sudo tee /sys/class/net/${BOND_NAME}/bonding/slaves
@@ -257,6 +294,8 @@ function test_bonding_slaves_on_nics() {
		exit 1
	fi

	stop_bdevperf

	# NOTE: rdma-core <= v43.0 has a memory-leak bug (fixed in commit 7720071f).
	killprocess $nvmfpid || true
	nvmfpid=