Commit 6aa4edc2 authored by Richael Zhuang's avatar Richael Zhuang Committed by Jim Harris
Browse files

bdev/nvme: select io path according to outstanding io number



Support selecting the io path according to the number of outstanding
I/Os on each path in a channel. It's optional, and can be enabled by
calling RPC "bdev_nvme_set_multipath_policy -s queue_depth".

Change-Id: I82cdfbd69b3e105c973844c4f34dc98f0dca2faf
Signed-off-by: default avatarRichael Zhuang <richael.zhuang@arm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14734


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
parent a8d21b9b
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -91,6 +91,9 @@ Added spdk_rpc_set_allowlist to restrict allowed RPCs to the specified list.

Changed `bdev_raid_get_bdevs` RPC output format to include raid_bdev details.

Added `selector` parameter to bdev_nvme_set_multipath_policy RPC to set the path selector for multipath.
Options `round_robin` and `queue_depth` are available.

### bdevperf

Promoted the application to example to match similar programs: fio_plugin and perf.
@@ -150,6 +153,10 @@ a specified qpair.

Updated `bdev_nvme_set_options` RPC (and rpc.py) to support the new `transport_tos` parameter.

For the active-active policy of the multipath mode, in addition to the default round-robin path
selector, the minimum queue depth path selector was added. The minimum queue depth path selector
selects an I/O path according to the number of outstanding requests of each nvme qpair.

## v22.09

### accel
+3 −1
Original line number Diff line number Diff line
@@ -4135,7 +4135,8 @@ Example response:

### bdev_nvme_set_multipath_policy {#rpc_bdev_nvme_set_multipath_policy}

Set multipath policy of the NVMe bdev in multipath mode.
Set multipath policy of the NVMe bdev in multipath mode or set multipath
selector for active-active multipath policy.

#### Parameters

@@ -4143,6 +4144,7 @@ Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
name                    | Required | string      | Name of the NVMe bdev
policy                  | Required | string      | Multipath policy: active_active or active_passive
selector                | Optional | string      | Multipath selector: round_robin or queue_depth, used in active-active mode. Default is round_robin

#### Example

+56 −2
Original line number Diff line number Diff line
@@ -655,6 +655,7 @@ bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
@@ -873,6 +874,51 @@ _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
	return non_optimized;
}

/* Select the usable I/O path with the fewest outstanding requests.
 * ANA-optimized paths are preferred; a non-optimized path is returned
 * only when no optimized path is available. The result is intentionally
 * not cached in nbdev_ch->current_io_path: queue depths change with every
 * submission, so the queue_depth selector re-evaluates on each I/O.
 */
static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *path;
	struct nvme_io_path *best_opt = NULL, *best_non_opt = NULL;
	uint32_t best_opt_qd = UINT32_MAX, best_non_opt_qd = UINT32_MAX;
	uint32_t qd;

	STAILQ_FOREACH(path, &nbdev_ch->io_path_list, stailq) {
		/* Skip paths whose qpair is disconnected (e.g. the device is
		 * resetting) or whose ANA state is currently being updated.
		 */
		if (spdk_unlikely(!nvme_io_path_is_connected(path)) ||
		    spdk_unlikely(path->nvme_ns->ana_state_updating)) {
			continue;
		}

		qd = spdk_nvme_qpair_get_num_outstanding_reqs(path->qpair->qpair);

		switch (path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (qd < best_opt_qd) {
				best_opt_qd = qd;
				best_opt = path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (qd < best_non_opt_qd) {
				best_non_opt_qd = qd;
				best_non_opt = path;
			}
			break;
		default:
			/* Other ANA states (inaccessible, etc.) are not usable. */
			break;
		}
	}

	return best_opt != NULL ? best_opt : best_non_opt;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
@@ -881,7 +927,12 @@ bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
		return nbdev_ch->current_io_path;
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
@@ -3301,6 +3352,7 @@ nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)

	bdev->ref = 1;
	bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
	bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN;
	TAILQ_INIT(&bdev->nvme_ns_list);
	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
	bdev->opal = nvme_ctrlr->opal_dev != NULL;
@@ -4110,6 +4162,7 @@ _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i)
	struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->current_io_path = NULL;

	spdk_for_each_channel_continue(i, 0);
@@ -4117,7 +4170,7 @@ _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i)

void
bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy,
			       bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
			       enum bdev_nvme_multipath_selector selector, bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
{
	struct bdev_nvme_set_multipath_policy_ctx *ctx;
	struct spdk_bdev *bdev;
@@ -4153,6 +4206,7 @@ bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy

	pthread_mutex_lock(&nbdev->mutex);
	nbdev->mp_policy = policy;
	nbdev->mp_selector = selector;
	pthread_mutex_unlock(&nbdev->mutex);

	spdk_for_each_channel(nbdev,
+9 −0
Original line number Diff line number Diff line
@@ -28,6 +28,11 @@ enum bdev_nvme_multipath_policy {
	BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
};

/* I/O path selector used by the active-active multipath policy. */
enum bdev_nvme_multipath_selector {
	/* Values start at 1 so that a zero-initialized field can represent
	 * "not specified" (the RPC layer treats selector > 0 as "set"). */
	BDEV_NVME_MP_SELECTOR_ROUND_ROBIN = 1,
	BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH,
};

typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status);
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);
@@ -158,6 +163,7 @@ struct nvme_bdev {
	pthread_mutex_t			mutex;
	int				ref;
	enum bdev_nvme_multipath_policy	mp_policy;
	enum bdev_nvme_multipath_selector mp_selector;
	TAILQ_HEAD(, nvme_ns)		nvme_ns_list;
	bool				opal;
	TAILQ_ENTRY(nvme_bdev)		tailq;
@@ -196,6 +202,7 @@ struct nvme_io_path {
struct nvme_bdev_channel {
	struct nvme_io_path			*current_io_path;
	enum bdev_nvme_multipath_policy		mp_policy;
	enum bdev_nvme_multipath_selector	mp_selector;
	STAILQ_HEAD(, nvme_io_path)		io_path_list;
	TAILQ_HEAD(retry_io_head, spdk_bdev_io)	retry_io_list;
	struct spdk_poller			*retry_io_poller;
@@ -345,10 +352,12 @@ typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc);
 *
 * \param name NVMe bdev name
 * \param policy Multipath policy (active-passive or active-active)
 * \param selector Multipath selector (round_robin, queue_depth)
 * \param cb_fn Function to be called back after completion.
 */
void bdev_nvme_set_multipath_policy(const char *name,
				    enum bdev_nvme_multipath_policy policy,
				    enum bdev_nvme_multipath_selector selector,
				    bdev_nvme_set_multipath_policy_cb cb_fn,
				    void *cb_arg);

+27 −1
Original line number Diff line number Diff line
@@ -2209,6 +2209,7 @@ SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_pa
struct rpc_set_multipath_policy {
	char *name;
	enum bdev_nvme_multipath_policy policy;
	enum bdev_nvme_multipath_selector selector;
};

static void
@@ -2234,9 +2235,27 @@ rpc_decode_mp_policy(const struct spdk_json_val *val, void *out)
	return 0;
}

/* JSON decoder callback for the optional "selector" RPC parameter.
 * Maps the strings "round_robin" and "queue_depth" onto the corresponding
 * enum bdev_nvme_multipath_selector values; any other value is rejected
 * with -EINVAL after logging a notice.
 */
static int
rpc_decode_mp_selector(const struct spdk_json_val *val, void *out)
{
	enum bdev_nvme_multipath_selector *sel = out;

	if (spdk_json_strequal(val, "round_robin")) {
		*sel = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN;
		return 0;
	}

	if (spdk_json_strequal(val, "queue_depth")) {
		*sel = BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH;
		return 0;
	}

	SPDK_NOTICELOG("Invalid parameter value: selector\n");
	return -EINVAL;
}

static const struct spdk_json_object_decoder rpc_set_multipath_policy_decoders[] = {
	{"name", offsetof(struct rpc_set_multipath_policy, name), spdk_json_decode_string},
	{"policy", offsetof(struct rpc_set_multipath_policy, policy), rpc_decode_mp_policy},
	{"selector", offsetof(struct rpc_set_multipath_policy, selector), rpc_decode_mp_selector, true},
};

struct rpc_set_multipath_policy_ctx {
@@ -2282,7 +2301,14 @@ rpc_bdev_nvme_set_multipath_policy(struct spdk_jsonrpc_request *request,

	ctx->request = request;

	bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy,
	if (ctx->req.policy != BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && ctx->req.selector > 0) {
		SPDK_ERRLOG("selector only works in active_active mode\n");
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "spdk_json_decode_object failed");
		goto cleanup;
	}

	bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy, ctx->req.selector,
				       rpc_bdev_nvme_set_multipath_policy_done, ctx);
	return;

Loading