Commit e9e3f615 authored by Seth Howell, committed by Tomasz Zawadzki
Browse files

nvme/rdma: add connect retry in edge case



If the initiator dies without disconnecting a qpair, the target can
possibly retain the state of the connection. In this case, it will
inform us that the connection is stale, and we need to try again.

Change-Id: I4d349c634aee59ce9ea4af795b07dd8649db56b3
Signed-off-by: Seth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/473063


Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
parent 1f3b2a74
Loading
Loading
Loading
Loading
+31 −5
Original line number Diff line number Diff line
@@ -72,6 +72,13 @@
/* CM event processing timeout */
#define NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US	100000

/*
 * In the special case of a stale connection we don't expose a mechanism
 * for the user to retry the connection so we need to handle it internally.
 *
 * NVME_RDMA_STALE_CONN_RETRY_MAX caps the number of reconnect attempts made by
 * the retry loop in nvme_rdma_ctrlr_create_qpair() while the connect call keeps
 * returning -EAGAIN.
 *
 * NVME_RDMA_STALE_CONN_RETRY_DELAY_US is the value handed to nvme_delay() before
 * each of those attempts (presumably microseconds, going by the _US suffix --
 * TODO confirm against nvme_delay()).
 */
#define NVME_RDMA_STALE_CONN_RETRY_MAX		5
#define NVME_RDMA_STALE_CONN_RETRY_DELAY_US	10000

struct spdk_nvmf_cmd {
	struct spdk_nvme_cmd cmd;
	struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS];
@@ -791,13 +798,16 @@ nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
	}

	ret = nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_ESTABLISHED);
	if (ret) {
	if (ret == -ESTALE) {
		SPDK_NOTICELOG("Received a stale connection notice during connection.\n");
		return -EAGAIN;
	} else if (ret) {
		SPDK_ERRLOG("RDMA connect error\n");
		return -1;
	}

	} else {
		return 0;
	}
}

static int
nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
@@ -1023,7 +1033,7 @@ nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair)
	rc = nvme_rdma_connect(rqpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect the rqpair\n");
		return -1;
		return rc;
	}

	rc = nvme_rdma_register_reqs(rqpair);
@@ -1441,7 +1451,7 @@ nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr,
{
	struct nvme_rdma_qpair *rqpair;
	struct spdk_nvme_qpair *qpair;
	int rc;
	int rc, retry_count = 0;

	rqpair = calloc(1, sizeof(struct nvme_rdma_qpair));
	if (!rqpair) {
@@ -1475,6 +1485,22 @@ nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr,
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n");

	rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair);

	/*
	 * -EAGAIN represents the special case where the target side still thought it was connected.
	 * Most NICs will fail the first connection attempt, and the NICs will clean up whatever
	 * state they need to. After that, subsequent connection attempts will succeed.
	 */
	if (rc == -EAGAIN) {
		SPDK_NOTICELOG("Detected stale connection on Target side for qpid: %d\n", rqpair->qpair.id);
		do {
			nvme_delay(NVME_RDMA_STALE_CONN_RETRY_DELAY_US);
			nvme_transport_ctrlr_disconnect_qpair(ctrlr, &rqpair->qpair);
			rc = nvme_transport_ctrlr_connect_qpair(ctrlr, &rqpair->qpair);
			retry_count++;
		} while (rc == -EAGAIN && retry_count < NVME_RDMA_STALE_CONN_RETRY_MAX);
	}

	if (rc < 0) {
		nvme_rdma_qpair_destroy(qpair);
		return NULL;