Commit acd0b457 authored by Ben Walker's avatar Ben Walker
Browse files

bdev/nvme: Allow the user to control the I/O timeout behavior



The user can now not only specify an optional timeout for
commands, but also the action to take when a timeout is
detected.

Change-Id: I7d7cdd846d580e0b3a5f733d398ee9b19d6fe034
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
parent 193f4f83
Loading
Loading
Loading
Loading
+7 −4
Original line number Diff line number Diff line
@@ -100,10 +100,13 @@
  # The number of attempts per I/O when an I/O fails. Do not include
  # this key to get the default behavior.
  NvmeRetryCount 4
  # Registers the application to receive timeout callback and to reset the controller.
  ResetControllerOnTimeout Yes
  # Timeout value.
  NvmeTimeoutValue 30
  # Timeout for each command, in seconds. If 0, don't track timeouts.
  NvmeTimeoutValue 0
  # Action to take on command timeout. Only valid when NvmeTimeoutValue is
  # greater than 0. This may be 'Reset' to reset the controller, 'Abort' to
  # abort the command, or 'None' to just print a message but do nothing.
  # Admin command timeouts will always result in a reset.
  ActionOnTimeout None
  # Set how often the admin queue is polled for asynchronous events.
  # Units in microseconds.
  AdminPollRate 100000
+7 −4
Original line number Diff line number Diff line
@@ -91,10 +91,13 @@
  # The number of attempts per I/O when an I/O fails. Do not include
  # this key to get the default behavior.
  NvmeRetryCount 4
  # Registers the application to receive timeout callback and to reset the controller.
  ResetControllerOnTimeout Yes
  # Timeout value.
  NvmeTimeoutValue 30
  # Timeout for each command, in seconds. If 0, don't track timeouts.
  NvmeTimeoutValue 0
  # Action to take on command timeout. Only valid when NvmeTimeoutValue is
  # greater than 0. This may be 'Reset' to reset the controller, 'Abort' to
  # abort the command, or 'None' to just print a message but do nothing.
  # Admin command timeouts will always result in a reset.
  ActionOnTimeout None
  # Set how often the admin queue is polled for asynchronous events.
  # Units in microseconds.
  AdminPollRate 100000
+7 −7
Original line number Diff line number Diff line
@@ -79,13 +79,13 @@
  # The number of attempts per I/O when an I/O fails. Do not include
  # this key to get the default behavior.
  NvmeRetryCount 4
  # The maximum number of NVMe controllers to claim. Do not include this key to
  # claim all of them.
  NumControllers 2
  # Registers the application to receive timeout callback and to reset the controller.
  ResetControllerOnTimeout Yes
  # Timeout value.
  NvmeTimeoutValue 30
  # Timeout for each command, in seconds. If 0, don't track timeouts.
  NvmeTimeoutValue 0
  # Action to take on command timeout. Only valid when NvmeTimeoutValue is
  # greater than 0. This may be 'Reset' to reset the controller, 'Abort' to
  # abort the command, or 'None' to just print a message but do nothing.
  # Admin command timeouts will always result in a reset.
  ActionOnTimeout None
  # Set how often the admin queue is polled for asynchronous events.
  # Units in microseconds.
  AdminPollRate 100000
+66 −8
Original line number Diff line number Diff line
@@ -111,8 +111,14 @@ struct nvme_probe_ctx {
	const char *names[NVME_MAX_CONTROLLERS];
};

/* Action to take when an NVMe command timeout is detected. */
enum timeout_action {
	/* Take no action on the controller or the command. */
	TIMEOUT_ACTION_NONE = 0,
	/* Reset the controller. */
	TIMEOUT_ACTION_RESET = 1,
	/*
	 * Try to abort the timed-out command; the controller is reset instead
	 * if the abort cannot be sent or completes with an error.
	 */
	TIMEOUT_ACTION_ABORT = 2,
};

static int g_hot_insert_nvme_controller_index = 0;
static bool g_reset_controller_on_timeout = false;
static enum timeout_action g_action_on_timeout = TIMEOUT_ACTION_NONE;
static int g_timeout = 0;
static int g_nvme_adminq_poll_timeout_us = 0;
static bool g_nvme_hotplug_enabled;
@@ -559,6 +565,21 @@ probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	return true;
}

/*
 * Completion callback for an abort command.
 *
 * ctx is the controller the abort was issued on; cpl is the completion of
 * the abort command. A successful abort needs no further handling. If the
 * abort completed with an error, the controller is reset instead.
 */
static void
spdk_nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct spdk_nvme_ctrlr *ctrlr = ctx;

	if (!spdk_nvme_cpl_is_error(cpl)) {
		return;
	}

	SPDK_WARNLOG("Abort failed. Resetting controller.\n");
	if (spdk_nvme_ctrlr_reset(ctrlr) != 0) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	}
}

static void
timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
	   struct spdk_nvme_qpair *qpair, uint16_t cid)
@@ -567,9 +588,27 @@ timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,

	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);

	switch (g_action_on_timeout) {
	case TIMEOUT_ACTION_ABORT:
		if (qpair) {
			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
						       spdk_nvme_abort_cpl, ctrlr);
			if (rc == 0) {
				return;
			}

			SPDK_ERRLOG("Unable to send abort. Resetting.\n");
		}

	/* Fallthrough */
	case TIMEOUT_ACTION_RESET:
		rc = spdk_nvme_ctrlr_reset(ctrlr);
		if (rc) {
		SPDK_ERRLOG("resetting controller failed\n");
			SPDK_ERRLOG("Resetting controller failed.\n");
		}
		break;
	case TIMEOUT_ACTION_NONE:
		break;
	}
}

@@ -621,7 +660,7 @@ attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
				sizeof(struct nvme_io_channel));
	TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq);

	if (g_reset_controller_on_timeout) {
	if (g_action_on_timeout != TIMEOUT_ACTION_NONE) {
		spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_timeout,
				timeout_cb, NULL);
	}
@@ -763,13 +802,32 @@ bdev_nvme_library_init(void)
		probe_ctx.count++;
	}

	g_reset_controller_on_timeout =
		spdk_conf_section_get_boolval(sp, "ResetControllerOnTimeout", false);

	if ((g_timeout = spdk_conf_section_get_intval(sp, "NvmeTimeoutValue")) < 0) {
		g_timeout = 0;
	}

	if (g_timeout > 0) {
		val = spdk_conf_section_get_val(sp, "ActionOnTimeout");
		if (val != NULL) {
			if (!strcasecmp(val, "Reset")) {
				g_action_on_timeout = TIMEOUT_ACTION_RESET;
			} else if (!strcasecmp(val, "Abort")) {
				g_action_on_timeout = TIMEOUT_ACTION_ABORT;
			}
		} else {
			/* Handle old name for backward compatibility */
			val = spdk_conf_section_get_val(sp, "ResetControllerOnTimeout");
			if (val) {
				SPDK_WARNLOG("ResetControllerOnTimeout was renamed to ActionOnTimeout\n");
				SPDK_WARNLOG("Please update your configuration file\n");

				if (spdk_conf_section_get_boolval(sp, "ResetControllerOnTimeout", false)) {
					g_action_on_timeout = TIMEOUT_ACTION_RESET;
				}
			}
		}
	}

	g_nvme_adminq_poll_timeout_us = spdk_conf_section_get_intval(sp, "AdminPollRate");
	if (g_nvme_adminq_poll_timeout_us <= 0) {
		g_nvme_adminq_poll_timeout_us = 1000000;