Commit 80e81273 authored by Shuhei Matsumoto, committed by Tomasz Zawadzki
Browse files

bdev/nvme: Do not use ctrlr for I/O submission if reconnect failed repeatedly



If ctrlr_loss_timeout_sec is set to -1, reconnect is retried
indefinitely, and I/Os continue to be queued.

This patch adds another option, fast_io_fail_timeout_sec, and a flag,
fast_io_fail_timedout, to nvme_ctrlr.

If fast_io_fail_timeout_sec seconds have passed since the reset started,
fast_io_fail_timedout is set to true so that the path is no longer used for
I/O submission.

fast_io_fail_timeout_sec is initialized to zero, the same as
ctrlr_loss_timeout_sec and reconnect_delay_sec.

The name of the parameter follows fast_io_fail_tmo, the corresponding option of the well-known DM-multipath.

Change-Id: Ib870cf8e2fd29300c47f1df69617776f4e67bd8c
Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/10301


Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
parent ae4e54fd
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -2956,9 +2956,10 @@ multipathing. This is done by specifying the `name` parameter as an existing con
path, the hostnqn, hostsvcid, hostaddr, prchk_reftag, and prchk_guard_arguments must not be specified and are assumed
to have the same value as the existing path.

The parameters, `ctrlr_loss_timeout_sec` and `reconnect_delay_sec`, are mutually dependent.
The parameters, `ctrlr_loss_timeout_sec`, `reconnect_delay_sec`, and `fast_io_fail_timeout_sec`, are mutually dependent.
If `reconnect_delay_sec` is non-zero, `ctrlr_loss_timeout_sec` has to be -1 or not less than `reconnect_delay_sec`.
If `reconnect_delay_sec` is zero, `ctrlr_loss_timeout_sec` has to be zero.
If `fast_io_fail_timeout_sec` is not zero, it has to be not less than `reconnect_delay_sec` and, if `ctrlr_loss_timeout_sec` is not -1, not greater than `ctrlr_loss_timeout_sec`.

#### Result

@@ -2986,6 +2987,7 @@ multipath | Optional | string | Multipathing behavior: dis
num_io_queues              | Optional | uint32_t    | The number of IO queues to request during initialization. Range: (0, UINT16_MAX + 1], Default is 1024.
ctrlr_loss_timeout_sec     | Optional | number      | Time to wait until ctrlr is reconnected before deleting ctrlr.  -1 means infinite reconnects. 0 means no reconnect.
reconnect_delay_sec        | Optional | number      | Time to delay a reconnect trial. 0 means no reconnect.
fast_io_fail_timeout_sec   | Optional | number      | Time to wait until ctrlr is reconnected before failing I/O to ctrlr. 0 means no such timeout.

#### Example

+48 −6
Original line number Diff line number Diff line
@@ -737,6 +737,10 @@ nvme_io_path_is_failed(struct nvme_io_path *io_path)
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->reconnect_delay_sec != 0) {
			return false;
@@ -1312,6 +1316,23 @@ bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
	}
}

/*
 * Report whether the fast I/O fail timeout of this ctrlr has expired.
 *
 * Returns false when fast_io_fail_timeout_sec is 0. Otherwise returns true
 * once the number of whole seconds elapsed since reset_start_tsc has reached
 * fast_io_fail_timeout_sec.
 */
static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t timeout_sec = nvme_ctrlr->fast_io_fail_timeout_sec;
	uint32_t elapsed_sec;

	if (timeout_sec == 0) {
		return false;
	}

	/* Convert the tick delta since reset start into whole seconds. */
	elapsed_sec = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();

	return elapsed_sec >= timeout_sec;
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
@@ -1333,6 +1354,9 @@ bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
	} else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
		return OP_DESTRUCT;
	} else {
		if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
			nvme_ctrlr->fast_io_fail_timedout = true;
		}
		bdev_nvme_failover_trid(nvme_ctrlr, false);
		return OP_DELAYED_RECONNECT;
	}
@@ -3370,6 +3394,7 @@ nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
		nvme_ctrlr->prchk_flags = ctx->prchk_flags;
		nvme_ctrlr->ctrlr_loss_timeout_sec = ctx->ctrlr_loss_timeout_sec;
		nvme_ctrlr->reconnect_delay_sec = ctx->reconnect_delay_sec;
		nvme_ctrlr->fast_io_fail_timeout_sec = ctx->fast_io_fail_timeout_sec;
	}

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
@@ -3837,7 +3862,8 @@ bdev_nvme_async_poll(void *arg)

static bool
bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
				 uint32_t reconnect_delay_sec)
				 uint32_t reconnect_delay_sec,
				 uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
@@ -3846,6 +3872,10 @@ bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
@@ -3854,9 +3884,17 @@ bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0) {
		SPDK_ERRLOG("reconnect_delay_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

@@ -3874,7 +3912,8 @@ bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		 struct spdk_nvme_ctrlr_opts *opts,
		 bool multipath,
		 int32_t ctrlr_loss_timeout_sec,
		 uint32_t reconnect_delay_sec)
		 uint32_t reconnect_delay_sec,
		 uint32_t fast_io_fail_timeout_sec)
{
	struct nvme_probe_skip_entry	*entry, *tmp;
	struct nvme_async_probe_ctx	*ctx;
@@ -3888,7 +3927,8 @@ bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		return -EEXIST;
	}

	if (!bdev_nvme_check_multipath_params(ctrlr_loss_timeout_sec, reconnect_delay_sec)) {
	if (!bdev_nvme_check_multipath_params(ctrlr_loss_timeout_sec, reconnect_delay_sec,
					      fast_io_fail_timeout_sec)) {
		return -EINVAL;
	}

@@ -3905,6 +3945,7 @@ bdev_nvme_create(struct spdk_nvme_transport_id *trid,
	ctx->trid = *trid;
	ctx->ctrlr_loss_timeout_sec = ctrlr_loss_timeout_sec;
	ctx->reconnect_delay_sec = reconnect_delay_sec;
	ctx->fast_io_fail_timeout_sec = fast_io_fail_timeout_sec;

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
@@ -4230,7 +4271,7 @@ discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl,
			snprintf(new_ctx->opts.hostnqn, sizeof(new_ctx->opts.hostnqn), "%s", ctx->hostnqn);
			rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 0,
					      discovery_attach_controller_done, new_ctx,
					      &new_ctx->opts, true, 0, 0);
					      &new_ctx->opts, true, 0, 0, 0);
			if (rc == 0) {
				TAILQ_INSERT_TAIL(&ctx->ctrlr_ctxs, new_ctx, tailq);
				ctx->attach_in_progress++;
@@ -5561,6 +5602,7 @@ nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
				   (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", nvme_ctrlr->fast_io_fail_timeout_sec);

	spdk_json_write_object_end(w);

+5 −1
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@ struct nvme_async_probe_ctx {
	uint32_t prchk_flags;
	int32_t ctrlr_loss_timeout_sec;
	uint32_t reconnect_delay_sec;
	uint32_t fast_io_fail_timeout_sec;
	struct spdk_poller *poller;
	struct spdk_nvme_transport_id trid;
	struct spdk_nvme_ctrlr_opts opts;
@@ -110,6 +111,7 @@ struct nvme_ctrlr {

	uint32_t				resetting : 1;
	uint32_t				reconnect_is_delayed : 1;
	uint32_t				fast_io_fail_timedout : 1;
	uint32_t				destruct : 1;
	uint32_t				ana_log_page_updating : 1;
	/**
@@ -148,6 +150,7 @@ struct nvme_ctrlr {

	uint32_t				reconnect_delay_sec;
	int32_t					ctrlr_loss_timeout_sec;
	uint32_t				fast_io_fail_timeout_sec;

	pthread_mutex_t				mutex;
};
@@ -267,7 +270,8 @@ int bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		     struct spdk_nvme_ctrlr_opts *opts,
		     bool multipath,
		     int32_t ctrlr_loss_timeout_sec,
		     uint32_t reconnect_delay_sec);
		     uint32_t reconnect_delay_sec,
		     uint32_t fast_io_fail_timeout_sec);

int bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, const char *base_name,
			      struct spdk_nvme_ctrlr_opts *opts,
+3 −1
Original line number Diff line number Diff line
@@ -186,6 +186,7 @@ struct rpc_bdev_nvme_attach_controller {
	char *multipath;
	int32_t ctrlr_loss_timeout_sec;
	uint32_t reconnect_delay_sec;
	uint32_t fast_io_fail_timeout_sec;
	struct spdk_nvme_ctrlr_opts opts;
};

@@ -227,6 +228,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_attach_controller_dec
	{"num_io_queues", offsetof(struct rpc_bdev_nvme_attach_controller, opts.num_io_queues), spdk_json_decode_uint32, true},
	{"ctrlr_loss_timeout_sec", offsetof(struct rpc_bdev_nvme_attach_controller, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true},
	{"reconnect_delay_sec", offsetof(struct rpc_bdev_nvme_attach_controller, reconnect_delay_sec), spdk_json_decode_uint32, true},
	{"fast_io_fail_timeout_sec", offsetof(struct rpc_bdev_nvme_attach_controller, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true},
};

#define NVME_MAX_BDEVS_PER_RPC 128
@@ -497,7 +499,7 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request,
	rc = bdev_nvme_create(&trid, ctx->req.name, ctx->names, ctx->count, prchk_flags,
			      rpc_bdev_nvme_attach_controller_done, ctx, &ctx->req.opts,
			      multipath, ctx->req.ctrlr_loss_timeout_sec,
			      ctx->req.reconnect_delay_sec);
			      ctx->req.reconnect_delay_sec, ctx->req.fast_io_fail_timeout_sec);
	if (rc) {
		spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
		goto cleanup;
+8 −1
Original line number Diff line number Diff line
@@ -540,7 +540,8 @@ if __name__ == "__main__":
                                                         multipath=args.multipath,
                                                         num_io_queues=args.num_io_queues,
                                                         ctrlr_loss_timeout_sec=args.ctrlr_loss_timeout_sec,
                                                         reconnect_delay_sec=args.reconnect_delay_sec))
                                                         reconnect_delay_sec=args.reconnect_delay_sec,
                                                         fast_io_fail_timeout_sec=args.fast_io_fail_timeout_sec))

    p = subparsers.add_parser('bdev_nvme_attach_controller', aliases=['construct_nvme_bdev'],
                              help='Add bdevs with nvme backend')
@@ -586,6 +587,12 @@ if __name__ == "__main__":
                   If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_delay_sec has to be non-zero and
                   less than ctrlr_loss_timeout_sec.""",
                   type=int)
    p.add_argument('-u', '--fast-io-fail-timeout-sec',
                   help="""Time to wait until ctrlr is reconnected before failing I/O to ctrlr.
                   0 means no such timeout.
                   If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and
                   less than ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1.""",
                   type=int)
    p.set_defaults(func=bdev_nvme_attach_controller)

    def bdev_nvme_get_controllers(args):
Loading