Commit 8ddc5cd4 authored by sijie.sun's avatar sijie.sun Committed by Tomasz Zawadzki
Browse files

nvmf/rdma: Destroy all related resources after IB device removed



When IBV_EVENT_DEVICE_FATAL & RDMA_CM_EVENT_DEVICE_REMOVAL occurs,
destory all userspace resources such as qp, poller and ibv_context.

Signed-off-by: default avatarsijie.sun <sijie.sun@smartx.com>
Change-Id: Ie4832e4804eb572d6ec3bdc44fb7f9339f443d7e
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15615


Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
parent 07be7ca0
Loading
Loading
Loading
Loading
+264 −22
Original line number Diff line number Diff line
@@ -283,6 +283,8 @@ struct spdk_nvmf_rdma_resources {

typedef void (*spdk_nvmf_rdma_qpair_ibv_event)(struct spdk_nvmf_rdma_qpair *rqpair);

typedef void (*spdk_poller_destroy_cb)(void *ctx);

struct spdk_nvmf_rdma_ibv_event_ctx {
	struct spdk_nvmf_rdma_qpair			*rqpair;
	spdk_nvmf_rdma_qpair_ibv_event			cb_fn;
@@ -395,6 +397,7 @@ struct spdk_nvmf_rdma_poller {

	/* The maximum number of I/O outstanding on the shared receive queue at one time */
	uint16_t				max_srq_depth;
	bool					need_destroy;

	/* Shared receive queue */
	struct spdk_rdma_srq			*srq;
@@ -402,6 +405,9 @@ struct spdk_nvmf_rdma_poller {
	struct spdk_nvmf_rdma_resources		*resources;
	struct spdk_nvmf_rdma_poller_stat	stat;

	spdk_poller_destroy_cb			destroy_cb;
	void					*destroy_cb_ctx;

	RB_HEAD(qpairs_tree, spdk_nvmf_rdma_qpair) qpairs;

	STAILQ_HEAD(, spdk_nvmf_rdma_qpair)	qpairs_pending_recv;
@@ -436,6 +442,8 @@ struct spdk_nvmf_rdma_device {
	struct ibv_pd				*pd;

	int					num_srq;
	bool					need_destroy;
	bool					ready_to_destroy;

	TAILQ_ENTRY(spdk_nvmf_rdma_device)	link;
};
@@ -476,6 +484,16 @@ struct spdk_nvmf_rdma_transport {
	TAILQ_HEAD(, spdk_nvmf_rdma_poll_group)	poll_groups;
};

struct poller_manage_ctx {
	struct spdk_nvmf_rdma_transport		*rtransport;
	struct spdk_nvmf_rdma_poll_group	*rgroup;
	struct spdk_nvmf_rdma_poller		*rpoller;
	struct spdk_nvmf_rdma_device		*device;

	struct spdk_thread			*thread;
	volatile int				*inflight_op_counter;
};

static const struct spdk_json_object_decoder rdma_transport_opts_decoder[] = {
	{
		"num_cqe", offsetof(struct rdma_transport_opts, num_cqe),
@@ -516,6 +534,8 @@ static void _poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
static void _poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
				 struct spdk_nvmf_rdma_poller *rpoller);

static void _nvmf_rdma_remove_destroyed_device(void *c);

static inline int
nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
{
@@ -831,6 +851,8 @@ nvmf_rdma_qpair_clean_ibv_events(struct spdk_nvmf_rdma_qpair *rqpair)
	}
}

static void nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller);

static void
nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
{
@@ -909,6 +931,9 @@ nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
		rqpair->destruct_channel = NULL;
	}

	if (rqpair->poller && rqpair->poller->need_destroy && RB_EMPTY(&rqpair->poller->qpairs)) {
		nvmf_rdma_poller_destroy(rqpair->poller);
	}
	free(rqpair);
}

@@ -2682,6 +2707,8 @@ destroy_ib_device(struct spdk_nvmf_rdma_transport *rtransport,
			ibv_dealloc_pd(device->pd);
		}
	}
	SPDK_NOTICELOG("IB device %s[%p] is destroyed.\n", ibv_get_device_name(device->context->device),
		       device);
	free(device);
}

@@ -2878,6 +2905,96 @@ nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport,
	}
}

static void _nvmf_rdma_remove_poller_in_group(void *c);

static bool
nvmf_rdma_all_pollers_are_destroyed(void *c)
{
	struct poller_manage_ctx	*ctx = c;
	int				counter;

	counter = __atomic_sub_fetch(ctx->inflight_op_counter, 1, __ATOMIC_SEQ_CST);
	SPDK_DEBUGLOG(rdma, "nvmf_rdma_all_pollers_are_destroyed called. counter: %d, poller: %p\n",
		      counter, ctx->rpoller);

	if (counter == 0) {
		free((void *)ctx->inflight_op_counter);
	}
	free(ctx);

	return counter == 0;
}

static int
nvmf_rdma_remove_pollers_on_dev(struct spdk_nvmf_rdma_transport *rtransport,
				struct spdk_nvmf_rdma_device *device,
				bool *has_inflight)
{
	struct spdk_nvmf_rdma_poll_group	*rgroup;
	struct spdk_nvmf_rdma_poller		*rpoller;
	struct spdk_nvmf_poll_group		*poll_group;
	struct poller_manage_ctx		*ctx;
	bool					found;
	int					*inflight_counter;
	spdk_msg_fn				do_fn;

	*has_inflight = false;
	do_fn = _nvmf_rdma_remove_poller_in_group;
	inflight_counter = calloc(1, sizeof(int));
	if (!inflight_counter) {
		SPDK_ERRLOG("Failed to allocate inflight counter when removing pollers\n");
		return -ENOMEM;
	}

	TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
		(*inflight_counter)++;
	}

	TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
		found = false;
		TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
			if (rpoller->device == device) {
				found = true;
				break;
			}
		}
		if (!found) {
			__atomic_fetch_sub(inflight_counter, 1, __ATOMIC_SEQ_CST);
			continue;
		}

		ctx = calloc(1, sizeof(struct poller_manage_ctx));
		if (!ctx) {
			SPDK_ERRLOG("Failed to allocate poller_manage_ctx when removing pollers\n");
			if (!*has_inflight) {
				free(inflight_counter);
			}
			return -ENOMEM;
		}

		ctx->rtransport = rtransport;
		ctx->rgroup = rgroup;
		ctx->rpoller = rpoller;
		ctx->device = device;
		ctx->thread = spdk_get_thread();
		ctx->inflight_op_counter = inflight_counter;
		*has_inflight = true;

		poll_group = rgroup->group.group;
		if (poll_group->thread != spdk_get_thread()) {
			spdk_thread_send_msg(poll_group->thread, do_fn, ctx);
		} else {
			do_fn(ctx);
		}
	}

	if (!*has_inflight) {
		free(inflight_counter);
	}

	return 0;
}

static void
nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport,
				struct spdk_nvmf_rdma_qpair *rqpair, bool drain)
@@ -2954,6 +3071,12 @@ nvmf_rdma_destroy_drained_qpair(struct spdk_nvmf_rdma_qpair *rqpair)
		return;
	}

	/* device is already destroyed and we should force destroy this qpair. */
	if (rqpair->poller && rqpair->poller->need_destroy) {
		nvmf_rdma_qpair_destroy(rqpair);
		return;
	}

	/* In non SRQ path, we will reach rqpair->max_queue_depth. In SRQ path, we will get the last_wqe event. */
	if (rqpair->current_send_depth != 0) {
		return;
@@ -3069,6 +3192,42 @@ nvmf_rdma_handle_cm_event_addr_change(struct spdk_nvmf_transport *transport,
	return event_acked;
}

static void
nvmf_rdma_handle_device_removal(struct spdk_nvmf_rdma_transport *rtransport,
				struct spdk_nvmf_rdma_device *device)
{
	struct spdk_nvmf_rdma_port	*port, *port_tmp;
	int				rc;
	bool				has_inflight;

	rc = nvmf_rdma_remove_pollers_on_dev(rtransport, device, &has_inflight);
	if (rc) {
		SPDK_ERRLOG("Failed to handle device removal, rc %d\n", rc);
		return;
	}

	if (!has_inflight) {
		/* no pollers, destroy the device */
		device->ready_to_destroy = true;
		spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_remove_destroyed_device, rtransport);
	}

	TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) {
		if (port->device == device) {
			SPDK_NOTICELOG("Port %s:%s on device %s is being removed.\n",
				       port->trid->traddr,
				       port->trid->trsvcid,
				       ibv_get_device_name(port->device->context->device));

			/* keep NVMF listener and only destroy structures of the
			 * RDMA transport. when the device comes back we can retry listening
			 * and the application's workflow will not be interrupted.
			 */
			nvmf_rdma_stop_listen(&rtransport->transport, port->trid);
		}
	}
}

static void
nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport,
				       struct rdma_cm_event *event)
@@ -3079,14 +3238,11 @@ nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport,
	port = event->id->context;
	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);

	SPDK_NOTICELOG("Port %s:%s is being removed\n", port->trid->traddr, port->trid->trsvcid);

	nvmf_rdma_disconnect_qpairs_on_port(rtransport, port);

	rdma_ack_cm_event(event);

	while (spdk_nvmf_transport_stop_listen(transport, port->trid) == 0) {
		;
	if (!port->device->need_destroy) {
		port->device->need_destroy = true;
		nvmf_rdma_handle_device_removal(rtransport, port->device);
	}
}

@@ -3159,15 +3315,11 @@ nvmf_process_cm_event(struct spdk_nvmf_transport *transport)
			 * don't make attempts to call any ibv_query/modify/create functions. We can only call
			 * ibv_destroy* functions to release user space memory allocated by IB. All kernel
			 * resources are already cleaned. */
			if (event->id->qp) {
			if (!event->id->qp) {
				/* If rdma_cm event has a valid `qp` pointer then the event refers to the
				 * corresponding qpair. Otherwise the event refers to a listening device */
				rc = nvmf_rdma_disconnect(event);
				if (rc < 0) {
					SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
					break;
				}
			} else {
				 * corresponding qpair. Otherwise the event refers to a listening device.
				 * Only handle this event on device because we will disconnect all qpairs
				 * when removing device */
				nvmf_rdma_handle_cm_event_port_removal(transport, event);
				event_acked = true;
			}
@@ -3323,8 +3475,12 @@ nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device)
			break;
		}
		break;
	case IBV_EVENT_CQ_ERR:
	case IBV_EVENT_DEVICE_FATAL:
		SPDK_ERRLOG("Device Fatal event[%s] received on %s. device: %p\n",
			    ibv_event_type_str(event.event_type), ibv_get_device_name(device->context->device), device);
		device->need_destroy = true;
		break;
	case IBV_EVENT_CQ_ERR:
	case IBV_EVENT_PORT_ACTIVE:
	case IBV_EVENT_PORT_ERR:
	case IBV_EVENT_LID_CHANGE:
@@ -3369,6 +3525,7 @@ nvmf_rdma_accept(void *ctx)
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_device *device, *tmp;
	uint32_t count;
	short revents;

	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
	count = nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0);
@@ -3389,8 +3546,17 @@ nvmf_rdma_accept(void *ctx)

	/* Second and subsequent poll descriptors are IB async events */
	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
		if (rtransport->poll_fds[i++].revents & POLLIN) {
		revents = rtransport->poll_fds[i++].revents;
		if (revents & POLLIN) {
			if (spdk_likely(!device->need_destroy)) {
				nvmf_process_ib_events(device, 32);
				if (spdk_unlikely(device->need_destroy)) {
					nvmf_rdma_handle_device_removal(rtransport, device);
				}
			}
			nfds--;
		} else if (revents & POLLNVAL || revents & POLLHUP) {
			SPDK_ERRLOG("Receive unknown revent %x on device %p\n", (int)revents, device);
			nfds--;
		}
	}
@@ -3632,6 +3798,9 @@ static void
nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller)
{
	struct spdk_nvmf_rdma_qpair	*qpair, *tmp_qpair;
	int				rc;

	TAILQ_REMOVE(&poller->group->pollers, poller, link);
	RB_FOREACH_SAFE(qpair, qpairs_tree, &poller->qpairs, tmp_qpair) {
		nvmf_rdma_qpair_destroy(qpair);
	}
@@ -3645,7 +3814,15 @@ nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller)
	}

	if (poller->cq) {
		ibv_destroy_cq(poller->cq);
		rc = ibv_destroy_cq(poller->cq);
		if (rc != 0) {
			SPDK_ERRLOG("Destroy cq return %d, error: %s\n", rc, strerror(errno));
		}
	}

	if (poller->destroy_cb) {
		poller->destroy_cb(poller->destroy_cb_ctx);
		poller->destroy_cb = NULL;
	}

	free(poller);
@@ -3664,7 +3841,6 @@ nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
	}

	TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
		TAILQ_REMOVE(&rgroup->pollers, poller, link);
		nvmf_rdma_poller_destroy(poller);
	}

@@ -4055,12 +4231,23 @@ nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
	struct spdk_nvmf_rdma_wr	*rdma_wr;
	struct spdk_nvmf_rdma_request	*rdma_req;
	struct spdk_nvmf_rdma_recv	*rdma_recv;
	struct spdk_nvmf_rdma_qpair	*rqpair;
	struct spdk_nvmf_rdma_qpair	*rqpair, *tmp_rqpair;
	int reaped, i;
	int count = 0;
	bool error = false;
	uint64_t poll_tsc = spdk_get_ticks();

	if (spdk_unlikely(rpoller->need_destroy)) {
		/* If qpair is closed before poller destroy, nvmf_rdma_destroy_drained_qpair may not
		 * be called because we cannot poll anything from cq. So we call that here to force
		 * destroy the qpair after to_close turning true.
		 */
		RB_FOREACH_SAFE(rqpair, qpairs_tree, &rpoller->qpairs, tmp_rqpair) {
			nvmf_rdma_destroy_drained_qpair(rqpair);
		}
		return 0;
	}

	/* Poll for completing operations. */
	reaped = ibv_poll_cq(rpoller->cq, 32, wc);
	if (reaped < 0) {
@@ -4203,19 +4390,74 @@ nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
	return count;
}

static void
_nvmf_rdma_remove_destroyed_device(void *c)
{
	struct spdk_nvmf_rdma_transport	*rtransport = c;
	struct spdk_nvmf_rdma_device	*device, *device_tmp;
	int				rc;

	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
		if (device->ready_to_destroy) {
			destroy_ib_device(rtransport, device);
		}
	}

	free_poll_fds(rtransport);
	rc = generate_poll_fds(rtransport);
	/* cannot handle fd allocation error here */
	if (rc != 0) {
		SPDK_ERRLOG("Failed to generate poll fds after remove ib device.\n");
	}
}

static void
_nvmf_rdma_remove_poller_in_group_cb(void *c)
{
	struct poller_manage_ctx	*ctx = c;
	struct spdk_nvmf_rdma_transport	*rtransport = ctx->rtransport;
	struct spdk_nvmf_rdma_device	*device = ctx->device;
	struct spdk_thread		*thread = ctx->thread;

	if (nvmf_rdma_all_pollers_are_destroyed(c)) {
		/* destroy device when last poller is destroyed */
		device->ready_to_destroy = true;
		spdk_thread_send_msg(thread, _nvmf_rdma_remove_destroyed_device, rtransport);
	}
}

static void
_nvmf_rdma_remove_poller_in_group(void *c)
{
	struct spdk_nvmf_rdma_qpair		*rqpair, *tmp_qpair;
	struct poller_manage_ctx		*ctx = c;

	ctx->rpoller->need_destroy = true;
	ctx->rpoller->destroy_cb_ctx = ctx;
	ctx->rpoller->destroy_cb = _nvmf_rdma_remove_poller_in_group_cb;

	if (RB_EMPTY(&ctx->rpoller->qpairs)) {
		nvmf_rdma_poller_destroy(ctx->rpoller);
	} else {
		RB_FOREACH_SAFE(rqpair, qpairs_tree, &ctx->rpoller->qpairs, tmp_qpair) {
			spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL);
		}
	}
}

static int
nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
	struct spdk_nvmf_rdma_transport *rtransport;
	struct spdk_nvmf_rdma_poll_group *rgroup;
	struct spdk_nvmf_rdma_poller	*rpoller;
	struct spdk_nvmf_rdma_poller	*rpoller, *tmp;
	int				count, rc;

	rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport);
	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);

	count = 0;
	TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
	TAILQ_FOREACH_SAFE(rpoller, &rgroup->pollers, link, tmp) {
		rc = nvmf_rdma_poller_poll(rtransport, rpoller);
		if (rc < 0) {
			return rc;
+2 −0
Original line number Diff line number Diff line
@@ -67,6 +67,8 @@ if [[ $NET_TYPE == phy ]]; then
		if ((${#TCP_INTERFACE_LIST[@]} > 0)); then
			run_test "nvmf_perf_adq" $rootdir/test/nvmf/target/perf_adq.sh "${TEST_ARGS[@]}"
		fi
	else
		run_test "nvmf_device_removal" test/nvmf/target/device_removal.sh "${TEST_ARGS[@]}"
	fi
	run_test "nvmf_shutdown" $rootdir/test/nvmf/target/shutdown.sh "${TEST_ARGS[@]}"
	# TODO: disabled due to intermittent failures. Need to triage.
+251 −0
Original line number Diff line number Diff line
#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2021 Intel Corporation
#  All rights reserved.
#

testdir=$(readlink -f $(dirname $0))
rootdir=$(readlink -f $testdir/../../..)
source $rootdir/test/setup/common.sh
source $rootdir/test/common/autotest_common.sh
source $rootdir/test/nvmf/common.sh

nvmftestinit

function get_subsystem_nqn() {
	echo nqn.2016-06.io.spdk:system_$1
}

function create_subsystem_and_connect_on_netdev() {
	local -a dev_name

	dev_name=$1
	malloc_name=$dev_name
	nqn=$(get_subsystem_nqn "$dev_name")
	ip=$(get_ip_address "$dev_name")
	serial=SPDK000$dev_name

	MALLOC_BDEV_SIZE=128
	MALLOC_BLOCK_SIZE=512

	$rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE -b $malloc_name
	$rpc_py nvmf_create_subsystem $nqn -a -s $serial
	$rpc_py nvmf_subsystem_add_ns $nqn $malloc_name
	$rpc_py nvmf_subsystem_add_listener $nqn -t $TEST_TRANSPORT -a $ip -s $NVMF_PORT

	if ! nvme connect -t $TEST_TRANSPORT -n $nqn -a $ip -s $NVMF_PORT; then
		exit 1
	fi

	waitforserial "$serial"
	nvme_name=$(lsblk -l -o NAME,SERIAL | grep -oP "([\w]*)(?=\s+${serial})")
	nvme_size=$(sec_size_to_bytes $nvme_name)

	echo "${nvme_name}"
	return 0
}

function create_subsystem_and_connect() {
	local -gA netdev_nvme_dict
	netdev_nvme_dict=()

	$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192 "$@"
	for net_dev in $(get_rdma_if_list); do
		netdev_nvme_dict[$net_dev]="$(create_subsystem_and_connect_on_netdev $net_dev)"
	done

	return 0
}

function rescan_pci() {
	echo 1 > /sys/bus/pci/rescan
}

function get_pci_dir() {
	dev_name=$1
	readlink -f /sys/bus/pci/devices/*/net/${dev_name}/device
}

function remove_one_nic() {
	dev_name=$1
	echo 1 > $(get_pci_dir $dev_name)/remove
}

function get_rdma_device_name() {
	dev_name=$1
	ls $(get_pci_dir $dev_name)/infiniband
}

function test_remove_and_rescan() {
	nvmfappstart -m 0xF

	create_subsystem_and_connect "$@"

	for net_dev in "${!netdev_nvme_dict[@]}"; do
		$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 40 &
		fio_pid=$!
		sleep 3

		nvme_dev=${netdev_nvme_dict[$net_dev]}
		rdma_dev_name=$(get_rdma_device_name $net_dev)
		origin_ip=$(get_ip_address "$net_dev")
		pci_dir=$(get_pci_dir $net_dev)

		if ! $rpc_py nvmf_get_stats | grep "\"name\": \"$rdma_dev_name\""; then
			echo "Device $rdma_dev_name is not registered in tgt".
			exit 1
		fi

		remove_one_nic $net_dev

		for i in $(seq 1 10); do
			if ! $rpc_py nvmf_get_stats | grep "\"name\": \"$rdma_dev_name\""; then
				break
			fi
			if [[ $i == 10 ]]; then
				# failed to remove this device
				exit 1
			fi
			sleep 1
		done

		rescan_pci

		for i in $(seq 1 10); do
			new_net_dev=$(ls ${pci_dir}/net || echo)
			if [[ -z $new_net_dev ]]; then
				sleep 1
			elif [[ $new_net_dev != "$net_dev" ]]; then
				echo "Device name changed after rescan, try rename."
				ip link set $new_net_dev down && ip link set $new_net_dev name $net_dev
				sleep 1
			else
				break
			fi
		done

		if [[ -z $new_net_dev ]]; then
			exit 1
		fi

		ip link set $net_dev up
		if [[ -z $(get_ip_address "$net_dev") ]]; then
			ip addr add $origin_ip/24 dev $net_dev
		fi
	done

	killprocess $nvmfpid
	nvmfpid=

	return 0
}

function check_env_for_test_bonding_slaves() {
	# only test with dual-port CX4/CX5.

	local -gA port_nic_map
	local -g target_nics

	# gather dev with same bus-device.
	for bdf in "${mlx[@]}"; do
		pci_net_devs=("/sys/bus/pci/devices/$bdf/net/"*)
		pci_net_devs=("${pci_net_devs[@]##*/}")

		bd=$(echo ${bdf} | cut -d '.' -f 1)

		port_nic_map[$bd]="${pci_net_devs[*]} ${port_nic_map[$bd]}"
	done

	for x in "${port_nic_map[@]}"; do
		ports=($x)
		if ((${#ports[@]} >= 2)); then
			target_nics=(${ports[@]})
			return 0
		fi
	done

	return 1
}

BOND_NAME="bond_nvmf"
BOND_IP="10.11.11.26"
BOND_MASK="24"

function clean_bond_device() {
	if ip link | grep $BOND_NAME; then
		ip link del $BOND_NAME
	fi
	for net_dev in "${target_nics[@]}"; do
		ip link set $net_dev up
	done
}

function test_bonding_slaves_on_nics() {
	nic1=$1
	nic2=$2

	clean_bond_device
	ip link add $BOND_NAME type bond mode 1
	ip link set $nic1 down && sudo ip link set $nic1 master $BOND_NAME
	ip link set $nic2 down && sudo ip link set $nic2 master $BOND_NAME
	ip link set $BOND_NAME up
	ip addr add ${BOND_IP}/${BOND_MASK} dev $BOND_NAME

	# check slaves here
	slaves=($(cat /sys/class/net/${BOND_NAME}/bonding/slaves))
	if ((${#slaves[@]} != 2)); then
		exit 1
	fi

	# wait ib driver activated on bond device
	sleep 5

	nvmfappstart -m 0xF
	$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192

	create_subsystem_and_connect_on_netdev $BOND_NAME

	ib_count=$($rpc_py nvmf_get_stats | grep devices -A 2 | grep -c name)
	echo "IB Count: " $ib_count

	$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 10 &
	fio_pid=$!

	sleep 2
	echo -$nic1 | sudo tee /sys/class/net/${BOND_NAME}/bonding/slaves

	ib_count2=$ib_count
	for i in $(seq 1 10); do
		ib_count2=$($rpc_py nvmf_get_stats | grep devices -A 2 | grep -c name)
		if ((ib_count2 < ib_count)); then
			break
		fi
		sleep 2
	done
	if ((ib_count2 == ib_count)); then
		exit 1
	fi

	# fio will exit when nvmf fin. do not wait here because it may be in D state.
	killprocess $nvmfpid
	nvmfpid=
	return 0
}

function test_bond_slaves() {
	check_env_for_test_bonding_slaves
	if [[ -z "$target_nics" ]]; then
		echo "No available nic ports to run this test."
		exit 0
	fi
	test_bonding_slaves_on_nics "${target_nics[@]}"
}

run_test "nvmf_device_removal_pci_remove_no_srq" test_remove_and_rescan --no-srq
run_test "nvmf_device_removal_pci_remove" test_remove_and_rescan
# bond slaves case needs lag_master & vport_manager are enabled by mlxconfig
# and not work on CI machine currently.
# run_test "nvmf_device_removal_bond_slaves" test_bond_slaves

nvmftestfini
clean_bond_device