Commit 3b26e2c5 authored by Evgeniy Kochetov's avatar Evgeniy Kochetov Committed by Konrad Sztyber
Browse files

nvme/rdma: Create poller and CQ on demand



The original implementation creates pollers and CQs for all discovered
devices at poll group creation. A device (ibv_context) that has no
references, i.e. no QPs, may be removed from the system and its
ibv_context may be closed by rdma_cm. In this case we would be left with
a CQ that refers to a closed ibv_context, and it may crash in ibv_poll_cq.

With this patch pollers are created on demand when we create the first
QP for a device. When there are no more QPs on the poller, we destroy
the poller. This also helps to avoid polling CQs that don't have any
QPs attached.

Signed-off-by: default avatarEvgeniy Kochetov <evgeniik@nvidia.com>
Change-Id: I46dd2c8b9b2902168dba24e139c904f51bd1b101
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13692


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
parent 3f922a68
Loading
Loading
Loading
Loading
+84 −39
Original line number Diff line number Diff line
@@ -137,6 +137,7 @@ struct nvme_rdma_poller_stats {
struct nvme_rdma_poller {
	struct ibv_context		*device;
	struct ibv_cq			*cq;
	uint32_t			refcnt;
	int				required_num_wc;
	int				current_num_wc;
	struct nvme_rdma_poller_stats	stats;
@@ -295,6 +296,10 @@ static const char *rdma_cm_event_str[] = {

struct nvme_rdma_qpair *nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group,
		uint32_t qp_num);
static struct nvme_rdma_poller *nvme_rdma_poll_group_get_poller(struct nvme_rdma_poll_group *group,
		struct ibv_context *device);
static void nvme_rdma_poll_group_put_poller(struct nvme_rdma_poll_group *group,
		struct nvme_rdma_poller *poller);

static TAILQ_HEAD(, nvme_rdma_memory_domain) g_memory_domains = TAILQ_HEAD_INITIALIZER(
			g_memory_domains);
@@ -711,22 +716,19 @@ nvme_rdma_poll_group_set_cq(struct spdk_nvme_qpair *qpair)

	assert(rqpair->cq == NULL);

	STAILQ_FOREACH(poller, &group->pollers, link) {
		if (poller->device == rqpair->cm_id->verbs) {
	poller = nvme_rdma_poll_group_get_poller(group, rqpair->cm_id->verbs);
	if (!poller) {
		SPDK_ERRLOG("Unable to find a cq for qpair %p on poll group %p\n", qpair, qpair->poll_group);
		return -EINVAL;
	}

	if (nvme_rdma_resize_cq(rqpair, poller)) {
		nvme_rdma_poll_group_put_poller(group, poller);
		return -EPROTO;
	}

	rqpair->cq = poller->cq;
	rqpair->poller = poller;
			break;
		}
	}

	if (rqpair->cq == NULL) {
		SPDK_ERRLOG("Unable to find a cq for qpair %p on poll group %p\n", qpair, qpair->poll_group);
		return -EINVAL;
	}

	return 0;
}

@@ -2155,6 +2157,13 @@ nvme_rdma_stale_conn_retry(struct nvme_rdma_qpair *rqpair)
		       rqpair->stale_conn_retry_count, qpair->ctrlr->cntlid, qpair->id);

	if (qpair->poll_group) {
		struct nvme_rdma_poll_group	*group;

		group = nvme_rdma_poll_group(qpair->poll_group);
		if (rqpair->poller) {
			nvme_rdma_poll_group_put_poller(group, rqpair->poller);
			rqpair->poller = NULL;
		}
		rqpair->cq = NULL;
	}

@@ -2816,7 +2825,7 @@ nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
	}
}

static int
static struct nvme_rdma_poller *
nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx)
{
	struct nvme_rdma_poller *poller;
@@ -2824,22 +2833,23 @@ nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *
	poller = calloc(1, sizeof(*poller));
	if (poller == NULL) {
		SPDK_ERRLOG("Unable to allocate poller.\n");
		return -ENOMEM;
		return NULL;
	}

	poller->device = ctx;
	poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0);

	if (poller->cq == NULL) {
		SPDK_ERRLOG("Unable to create CQ, errno %d.\n", errno);
		free(poller);
		return -EINVAL;
		return NULL;
	}

	STAILQ_INSERT_HEAD(&group->pollers, poller, link);
	group->num_pollers++;
	poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE;
	poller->required_num_wc = 0;
	return 0;
	return poller;
}

static void
@@ -2848,11 +2858,54 @@ nvme_rdma_poll_group_free_pollers(struct nvme_rdma_poll_group *group)
	struct nvme_rdma_poller	*poller, *tmp_poller;

	STAILQ_FOREACH_SAFE(poller, &group->pollers, link, tmp_poller) {
		assert(poller->refcnt == 0);
		if (poller->refcnt) {
			SPDK_WARNLOG("Destroying poller with non-zero ref count: poller %p, refcnt %d\n",
				     poller, poller->refcnt);
		}

		if (poller->cq) {
			ibv_destroy_cq(poller->cq);
		}
		STAILQ_REMOVE(&group->pollers, poller, nvme_rdma_poller, link);
		free(poller);
	}
}

static struct nvme_rdma_poller *
nvme_rdma_poll_group_get_poller(struct nvme_rdma_poll_group *group, struct ibv_context *device)
{
	struct nvme_rdma_poller *poller = NULL;

	STAILQ_FOREACH(poller, &group->pollers, link) {
		if (poller->device == device) {
			break;
		}
	}

	if (!poller) {
		poller = nvme_rdma_poller_create(group, device);
		if (!poller) {
			SPDK_ERRLOG("Failed to create a poller for device %p\n", device);
			return NULL;
		}
	}

	poller->refcnt++;
	return poller;
}

static void
nvme_rdma_poll_group_put_poller(struct nvme_rdma_poll_group *group, struct nvme_rdma_poller *poller)
{
	assert(poller->refcnt > 0);
	if (--poller->refcnt == 0) {
		if (poller->cq) {
			ibv_destroy_cq(poller->cq);
		}
		STAILQ_REMOVE(&group->pollers, poller, nvme_rdma_poller, link);
		free(poller);
		group->num_pollers--;
	}
}

@@ -2860,8 +2913,6 @@ static struct spdk_nvme_transport_poll_group *
nvme_rdma_poll_group_create(void)
{
	struct nvme_rdma_poll_group	*group;
	struct ibv_context		**contexts;
	int i = 0;

	group = calloc(1, sizeof(*group));
	if (group == NULL) {
@@ -2870,26 +2921,6 @@ nvme_rdma_poll_group_create(void)
	}

	STAILQ_INIT(&group->pollers);

	contexts = rdma_get_devices(NULL);
	if (contexts == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(group);
		return NULL;
	}

	while (contexts[i] != NULL) {
		if (nvme_rdma_poller_create(group, contexts[i])) {
			nvme_rdma_poll_group_free_pollers(group);
			free(group);
			rdma_free_devices(contexts);
			return NULL;
		}
		i++;
	}

	rdma_free_devices(contexts);

	return &group->group;
}

@@ -2927,7 +2958,13 @@ static int
nvme_rdma_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair		*rqpair = nvme_rdma_qpair(qpair);
	struct nvme_rdma_poll_group	*group;

	group = nvme_rdma_poll_group(qpair->poll_group);
	if (rqpair->poller) {
		nvme_rdma_poll_group_put_poller(group, rqpair->poller);
		rqpair->poller = NULL;
	}
	rqpair->cq = NULL;

	return 0;
@@ -3003,7 +3040,9 @@ nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *
	}

	completions_allowed = completions_per_qpair * num_qpairs;
	if (group->num_pollers) {
		completions_per_poller = spdk_max(completions_allowed / group->num_pollers, 1);
	}

	STAILQ_FOREACH(poller, &group->pollers, link) {
		poller_completions = 0;
@@ -3086,6 +3125,12 @@ nvme_rdma_poll_group_get_stats(struct spdk_nvme_transport_poll_group *tgroup,
	}
	stats->trtype = SPDK_NVME_TRANSPORT_RDMA;
	stats->rdma.num_devices = group->num_pollers;

	if (stats->rdma.num_devices == 0) {
		*_stats = stats;
		return 0;
	}

	stats->rdma.device_stats = calloc(stats->rdma.num_devices, sizeof(*stats->rdma.device_stats));
	if (!stats->rdma.device_stats) {
		SPDK_ERRLOG("Can't allocate memory for RDMA device stats\n");
+93 −63
Original line number Diff line number Diff line
@@ -601,20 +601,50 @@ static void
test_nvme_rdma_poller_create(void)
{
	struct nvme_rdma_poll_group	group = {};
	struct ibv_context *contexts = (struct ibv_context *)0xDEADBEEF;
	struct ibv_context context = {
		.device = (struct ibv_device *)0xDEADBEEF
	};
	struct ibv_context context_2 = {
		.device = (struct ibv_device *)0xBAADBEEF
	};
	struct nvme_rdma_poller *poller_1, *poller_2, *poller_3;

	/* Case: calloc and ibv not need to fail test */
	STAILQ_INIT(&group.pollers);
	group.num_pollers = 1;
	int rc = nvme_rdma_poller_create(&group, contexts);

	CU_ASSERT(rc == 0);
	CU_ASSERT(group.num_pollers = 2);
	CU_ASSERT(&group.pollers != NULL);
	CU_ASSERT(group.pollers.stqh_first->device == contexts);
	CU_ASSERT(group.pollers.stqh_first->cq == (struct ibv_cq *)0xFEEDBEEF);
	CU_ASSERT(group.pollers.stqh_first->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE);
	CU_ASSERT(group.pollers.stqh_first->required_num_wc == 0);
	poller_1 = nvme_rdma_poll_group_get_poller(&group, &context);
	SPDK_CU_ASSERT_FATAL(poller_1 != NULL);
	CU_ASSERT(group.num_pollers == 1);
	CU_ASSERT(STAILQ_FIRST(&group.pollers) == poller_1);
	CU_ASSERT(poller_1->refcnt == 1);
	CU_ASSERT(poller_1->device == &context);
	CU_ASSERT(poller_1->cq == (struct ibv_cq *)0xFEEDBEEF);
	CU_ASSERT(poller_1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE);
	CU_ASSERT(poller_1->required_num_wc == 0);

	poller_2 = nvme_rdma_poll_group_get_poller(&group, &context_2);
	SPDK_CU_ASSERT_FATAL(poller_2 != NULL);
	CU_ASSERT(group.num_pollers == 2);
	CU_ASSERT(STAILQ_FIRST(&group.pollers) == poller_2);
	CU_ASSERT(poller_2->refcnt == 1);
	CU_ASSERT(poller_2->device == &context_2);

	poller_3 = nvme_rdma_poll_group_get_poller(&group, &context);
	SPDK_CU_ASSERT_FATAL(poller_3 != NULL);
	CU_ASSERT(poller_3 == poller_1);
	CU_ASSERT(group.num_pollers == 2);
	CU_ASSERT(poller_3->refcnt == 2);

	nvme_rdma_poll_group_put_poller(&group, poller_2);
	CU_ASSERT(group.num_pollers == 1);

	nvme_rdma_poll_group_put_poller(&group, poller_1);
	CU_ASSERT(group.num_pollers == 1);
	CU_ASSERT(poller_3->refcnt == 1);

	nvme_rdma_poll_group_put_poller(&group, poller_3);
	CU_ASSERT(STAILQ_EMPTY(&group.pollers));
	CU_ASSERT(group.num_pollers == 0);

	nvme_rdma_poll_group_free_pollers(&group);
}
@@ -1321,20 +1351,15 @@ test_nvme_rdma_poll_group_get_stats(void)

	/* Initialization */
	STAILQ_INIT(&tgroup.pollers);
	rc = nvme_rdma_poller_create(&tgroup, &contexts1);
	CU_ASSERT(rc == 0);
	tpoller2 = nvme_rdma_poller_create(&tgroup, &contexts1);
	SPDK_CU_ASSERT_FATAL(tpoller2 != NULL);
	CU_ASSERT(tgroup.num_pollers == 1);

	rc = nvme_rdma_poller_create(&tgroup, &contexts2);
	CU_ASSERT(rc == 0);
	tpoller1 = nvme_rdma_poller_create(&tgroup, &contexts2);
	SPDK_CU_ASSERT_FATAL(tpoller1 != NULL);
	CU_ASSERT(tgroup.num_pollers == 2);
	CU_ASSERT(&tgroup.pollers != NULL);

	tpoller1 = STAILQ_FIRST(&tgroup.pollers);
	SPDK_CU_ASSERT_FATAL(tpoller1 != NULL);
	tpoller2 = STAILQ_NEXT(tpoller1, link);
	SPDK_CU_ASSERT_FATAL(tpoller2 != NULL);

	CU_ASSERT(tpoller1->device == &contexts2);
	CU_ASSERT(tpoller2->device == &contexts1);
	CU_ASSERT(strcmp(tpoller1->device->device->name, "/dev/test2") == 0);
@@ -1407,86 +1432,91 @@ test_nvme_rdma_poll_group_set_cq(void)
	int rc = -1;
	struct nvme_rdma_poll_group *group;
	struct spdk_nvme_transport_poll_group *tgroup;
	struct nvme_rdma_poller *poller1;
	struct nvme_rdma_poller *poller2;
	struct nvme_rdma_poller *poller;
	struct nvme_rdma_qpair rqpair = {};
	struct rdma_cm_id cm_id = {};

	/* Case1: Test function nvme_rdma_poll_group_create
	   Test1: Function rdma_get_devices failed */
	MOCK_SET(rdma_get_devices, NULL);

	tgroup = nvme_rdma_poll_group_create();
	CU_ASSERT(tgroup == NULL);

	/* Test2: Function nvme_rdma_poller_create failed */
	MOCK_CLEAR(rdma_get_devices);
	MOCK_SET(ibv_create_cq, NULL);

	tgroup = nvme_rdma_poll_group_create();
	CU_ASSERT(tgroup == NULL);

	/* Test3: Function nvme_rdma_poll_group_create success */
	MOCK_SET(ibv_create_cq, (struct ibv_cq *)0xFEEDBEEF);

	/* Case1: Test function nvme_rdma_poll_group_create */
	/* Test1: Function nvme_rdma_poll_group_create success */
	tgroup = nvme_rdma_poll_group_create();
	CU_ASSERT(tgroup != NULL);
	SPDK_CU_ASSERT_FATAL(tgroup != NULL);

	group = nvme_rdma_poll_group(tgroup);
	CU_ASSERT(group != NULL);

	poller1 = STAILQ_FIRST(&group->pollers);
	SPDK_CU_ASSERT_FATAL(poller1 != NULL);
	poller2 = STAILQ_NEXT(poller1, link);
	SPDK_CU_ASSERT_FATAL(poller2 != NULL);

	CU_ASSERT(poller1->device == (struct ibv_context *)0xFEEDBEEF);
	CU_ASSERT(poller1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE);
	CU_ASSERT(poller1->required_num_wc == 0);
	CU_ASSERT(poller2->device == (struct ibv_context *)0xDEADBEEF);
	CU_ASSERT(poller2->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE);
	CU_ASSERT(poller2->required_num_wc == 0);
	CU_ASSERT(STAILQ_EMPTY(&group->pollers));

	/* Case2: Test function nvme_rdma_poll_group_set_cq */
	rqpair.qpair.poll_group = tgroup;
	rqpair.qpair.trtype = SPDK_NVME_TRANSPORT_RDMA;
	rqpair.cm_id = &cm_id;

	/* Test1: Unable to find a cq for qpair on poll group */
	/* Test1: Function ibv_create_cq failed */
	cm_id.verbs = (void *)0xFEEDBEEF;
	MOCK_SET(ibv_create_cq, NULL);

	rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair);
	CU_ASSERT(rc == -EINVAL);
	CU_ASSERT(rqpair.cq == NULL);
	CU_ASSERT(STAILQ_EMPTY(&group->pollers));

	MOCK_CLEAR(ibv_create_cq);

	/* Test2: Unable to find a cq for qpair on poll group */
	cm_id.verbs = NULL;

	rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair);
	CU_ASSERT(rc == -EINVAL);
	CU_ASSERT(rqpair.cq == NULL);
	CU_ASSERT(STAILQ_EMPTY(&group->pollers));

	/* Test3: Match cq success, current_num_wc is enough */
	MOCK_SET(ibv_create_cq, (struct ibv_cq *)0xFEEDBEEF);

	/* Test2: Match cq success, current_num_wc is enough */
	cm_id.verbs = (void *)0xFEEDBEEF;
	rqpair.num_entries = 0;

	rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair);
	CU_ASSERT(rc == 0);
	CU_ASSERT(poller1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE);
	CU_ASSERT(poller1->required_num_wc == 0);
	CU_ASSERT(rqpair.cq == (void *)0xFEEDBEEF);

	poller = STAILQ_FIRST(&group->pollers);
	SPDK_CU_ASSERT_FATAL(poller != NULL);
	CU_ASSERT(STAILQ_NEXT(poller, link) == NULL);
	CU_ASSERT(poller->device == (struct ibv_context *)0xFEEDBEEF);
	CU_ASSERT(poller->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE);
	CU_ASSERT(poller->required_num_wc == 0);
	CU_ASSERT(rqpair.poller == poller);

	/* Test3: Match cq success, function ibv_resize_cq failed */
	rc = nvme_rdma_poll_group_disconnect_qpair(&rqpair.qpair);
	CU_ASSERT(rc == 0);
	CU_ASSERT(rqpair.cq == NULL);
	CU_ASSERT(rqpair.poller == NULL);
	CU_ASSERT(STAILQ_EMPTY(&group->pollers));

	/* Test4: Match cq success, function ibv_resize_cq failed */
	rqpair.cq = NULL;
	rqpair.num_entries = DEFAULT_NVME_RDMA_CQ_SIZE - 1;
	MOCK_SET(ibv_resize_cq, -1);

	rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair);
	CU_ASSERT(rc == -EPROTO);
	CU_ASSERT(poller1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE);
	CU_ASSERT(poller1->required_num_wc == 0);
	CU_ASSERT(STAILQ_EMPTY(&group->pollers));

	/* Test4: Current_num_wc is not enough, resize success */
	/* Test5: Current_num_wc is not enough, resize success */
	MOCK_SET(ibv_resize_cq, 0);

	rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair);
	CU_ASSERT(rc == 0);
	CU_ASSERT(poller1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE * 2);
	CU_ASSERT(poller1->required_num_wc == (DEFAULT_NVME_RDMA_CQ_SIZE - 1) * 2);
	CU_ASSERT(rqpair.cq == poller1->cq);
	CU_ASSERT(rqpair.poller == poller1);

	poller = STAILQ_FIRST(&group->pollers);
	SPDK_CU_ASSERT_FATAL(poller != NULL);
	CU_ASSERT(poller->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE * 2);
	CU_ASSERT(poller->required_num_wc == (DEFAULT_NVME_RDMA_CQ_SIZE - 1) * 2);
	CU_ASSERT(rqpair.cq == poller->cq);
	CU_ASSERT(rqpair.poller == poller);

	rc = nvme_rdma_poll_group_disconnect_qpair(&rqpair.qpair);
	CU_ASSERT(rc == 0);

	rc = nvme_rdma_poll_group_destroy(tgroup);
	CU_ASSERT(rc == 0);