Commit 8f9b9775 authored by Shuhei Matsumoto's avatar Shuhei Matsumoto Committed by Tomasz Zawadzki
Browse files

bdev/nvme: Add active/active policy for multipath mode



The NVMe bdev module supported active-passive policy for multipath mode
first. By this patch, the NVMe bdev module supports active-active policy
for multipath mode next. Following the Linux kernel native NVMe multipath,
the NVMe bdev module supports the round robin algorithm for the
active-active policy.

The multipath policy, active-passive or active-active, is managed per
nvme_bdev. The multipath policy is copied to all corresponding
nvme_bdev_channels.

Different from active-passive, active-active caches even non_optimized
path to provide load balance across multiple paths.

Signed-off-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Ie18b24db60d3da1ce2f83725b6cd3079f628f95b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12001


Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
parent 0869265d
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -26,6 +26,9 @@ A new RPC `bdev_nvme_get_io_paths` was added to get all active I/O paths.
A new RPC `bdev_nvme_set_preferred_path` was added to set preferred I/O path for an NVMe bdev
when in multipath mode. This RPC does not support NVMe bdevs in failover mode.

A new RPC `bdev_nvme_set_multipath_policy` was added to set multipath policy of an NVMe bdev
in multipath mode.

### idxd

A new parameter `flags` was added to all low level submission and preparation
+37 −0
Original line number Diff line number Diff line
@@ -3433,6 +3433,43 @@ Example response:
}
~~~

### bdev_nvme_set_multipath_policy {#rpc_bdev_nvme_set_multipath_policy}

Set multipath policy of the NVMe bdev in multipath mode.

#### Parameters

Name                    | Optional | Type        | Description
----------------------- | -------- | ----------- | -----------
name                    | Required | string      | Name of the NVMe bdev
policy                  | Required | string      | Multipath policy: active_active or active_passive

#### Example

Example request:

~~~json
{
  "jsonrpc": "2.0",
  "method": "bdev_nvme_set_multipath_policy",
  "id": 1,
  "params": {
    "name": "Nvme0n1",
    "policy": "active_passive"
  }
}
~~~

Example response:

~~~json
{
  "jsonrpc": "2.0",
  "id": 1,
  "result": true
}
~~~

### bdev_nvme_cuse_register {#rpc_bdev_nvme_cuse_register}

Register CUSE device on NVMe controller.
+152 −1
Original line number Diff line number Diff line
@@ -825,6 +825,55 @@ nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
	return true;
}

/* Treat the channel's io_path_list as a circular list: return the entry
 * following prev_path, wrapping around to the head after the last entry.
 */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path = STAILQ_NEXT(prev_path, stailq);

	return next_path != NULL ? next_path : STAILQ_FIRST(&nbdev_ch->io_path_list);
}

/* Select the next I/O path for the active-active (round-robin) policy.
 *
 * Walks the channel's io_path_list exactly once, treated as a circular list
 * starting at the entry after `prev`.  The first connected path whose ANA
 * state is not being updated and is OPTIMIZED wins immediately; otherwise
 * the first NON_OPTIMIZED candidate seen during the walk is used.  The
 * chosen path (which may be NULL if none qualified) is cached in
 * nbdev_ch->current_io_path so the next call continues the rotation from it.
 */
static struct nvme_io_path *
bdev_nvme_find_next_io_path(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_io_path *prev)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, prev);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_connected(io_path) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				/* Best case: cache it and stop the walk early. */
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				/* Remember only the first non-optimized candidate so the
				 * round-robin order is preserved among non-optimized paths.
				 */
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	/* We come here only if there is no optimized path. Cache even non_optimized
	 * path for load balance across multiple non_optimized paths.
	 */
	nbdev_ch->current_io_path = non_optimized;
	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
@@ -864,7 +913,11 @@ bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
		return _bdev_nvme_find_io_path(nbdev_ch);
	}

	if (spdk_likely(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) {
		return nbdev_ch->current_io_path;
	} else {
		return bdev_nvme_find_next_io_path(nbdev_ch, nbdev_ch->current_io_path);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
@@ -2600,6 +2653,20 @@ nvme_namespace_info_json(struct spdk_json_write_ctx *w,
	spdk_json_write_object_end(w);
}

/* Map the bdev's multipath policy enum to the string used in JSON output. */
static const char *
nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev)
{
	if (nbdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
		return "active_passive";
	}

	if (nbdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		return "active_active";
	}

	/* Unreachable for a correctly maintained mp_policy. */
	assert(false);
	return "invalid";
}

static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
@@ -2612,6 +2679,7 @@ bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
		nvme_namespace_info_json(w, nvme_ns);
	}
	spdk_json_write_array_end(w);
	spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev));
	pthread_mutex_unlock(&nvme_bdev->mutex);

	return 0;
@@ -2884,6 +2952,7 @@ nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
	}

	bdev->ref = 1;
	bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
	TAILQ_INIT(&bdev->nvme_ns_list);
	TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
	bdev->opal = nvme_ctrlr->opal_dev != NULL;
@@ -3635,6 +3704,88 @@ err_alloc:
	cb_fn(cb_arg, rc);
}

/* Context carried through the spdk_for_each_channel() iteration started by
 * bdev_nvme_set_multipath_policy().
 */
struct bdev_nvme_set_multipath_policy_ctx {
	/* Open descriptor keeping the bdev alive during the iteration;
	 * closed in bdev_nvme_set_multipath_policy_done().
	 */
	struct spdk_bdev_desc *desc;
	bdev_nvme_set_multipath_policy_cb cb_fn;
	void *cb_arg;
};

/* Completion of the per-channel iteration started by
 * bdev_nvme_set_multipath_policy(): close the bdev descriptor, report the
 * iteration status to the user callback and free the context.
 */
static void
bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status)
{
	struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	assert(ctx != NULL);
	assert(ctx->desc != NULL);
	assert(ctx->cb_fn != NULL);

	spdk_bdev_close(ctx->desc);

	ctx->cb_fn(ctx->cb_arg, status);

	free(ctx);
}

/* Per-channel step of bdev_nvme_set_multipath_policy(): copy the bdev's new
 * policy into this channel and drop the cached I/O path so the next I/O
 * re-selects a path under the new policy.
 */
static void
_bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
	struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	/* Invalidate the cached path; it may no longer be the right choice. */
	nbdev_ch->current_io_path = NULL;

	spdk_for_each_channel_continue(i, 0);
}

/* Set the multipath policy of the NVMe bdev named `name` and propagate it to
 * all of its I/O channels.
 *
 * The bdev is opened (read-only) to keep it alive while its channels are
 * iterated; the descriptor is closed in the completion callback.
 *
 * \param name NVMe bdev name.
 * \param policy Multipath policy to apply (active-passive or active-active).
 * \param cb_fn Called with 0 on success or a negative errno on failure.
 * \param cb_arg Argument passed back to cb_fn.
 */
void
bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy,
			       bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
{
	struct bdev_nvme_set_multipath_policy_ctx *ctx;
	struct spdk_bdev *bdev;
	struct nvme_bdev *nbdev;
	int rc;

	assert(cb_fn != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to alloc context.\n");
		rc = -ENOMEM;
		goto err_alloc;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
	if (rc != 0) {
		SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
		rc = -ENODEV;
		goto err_open;
	}

	bdev = spdk_bdev_desc_get_bdev(ctx->desc);

	/* SPDK_CONTAINEROF below is only valid if `bdev` really is embedded in
	 * a struct nvme_bdev.  Reject bdevs registered by other modules instead
	 * of computing a bogus nvme_bdev pointer from them.
	 */
	if (bdev->module != &nvme_if) {
		SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
		rc = -ENODEV;
		goto err_module;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);

	/* mp_policy is read under nbdev->mutex elsewhere (e.g. info JSON dump),
	 * so update it under the same lock.
	 */
	pthread_mutex_lock(&nbdev->mutex);
	nbdev->mp_policy = policy;
	pthread_mutex_unlock(&nbdev->mutex);

	/* Copy the new policy into every channel on its owning thread, then
	 * finish via bdev_nvme_set_multipath_policy_done().
	 */
	spdk_for_each_channel(nbdev,
			      _bdev_nvme_set_multipath_policy,
			      ctx,
			      bdev_nvme_set_multipath_policy_done);
	return;

err_module:
	spdk_bdev_close(ctx->desc);
err_open:
	free(ctx);
err_alloc:
	cb_fn(cb_arg, rc);
}

static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{
+29 −8
Original line number Diff line number Diff line
@@ -48,6 +48,11 @@ extern bool g_bdev_nvme_module_finish;

#define NVME_MAX_CONTROLLERS 1024

/* How I/O is distributed across the paths of an NVMe bdev in multipath mode. */
enum bdev_nvme_multipath_policy {
	/* All I/O goes to a single cached path until it becomes unusable. */
	BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE,
	/* I/O is spread across usable paths in round-robin order. */
	BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
};

typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx);
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);
@@ -171,6 +176,7 @@ struct nvme_bdev {
	struct nvme_bdev_ctrlr		*nbdev_ctrlr;
	pthread_mutex_t			mutex;
	int				ref;
	enum bdev_nvme_multipath_policy	mp_policy;
	TAILQ_HEAD(, nvme_ns)		nvme_ns_list;
	bool				opal;
	TAILQ_ENTRY(nvme_bdev)		tailq;
@@ -207,6 +213,7 @@ struct nvme_io_path {

struct nvme_bdev_channel {
	struct nvme_io_path			*current_io_path;
	enum bdev_nvme_multipath_policy		mp_policy;
	STAILQ_HEAD(, nvme_io_path)		io_path_list;
	TAILQ_HEAD(retry_io_head, spdk_bdev_io)	retry_io_list;
	struct spdk_poller			*retry_io_poller;
@@ -333,4 +340,18 @@ typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
				  bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);

/* Completion callback: rc is 0 on success or a negative errno on failure. */
typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc);

/**
 * Set multipath policy of the NVMe bdev.
 *
 * \param name NVMe bdev name
 * \param policy Multipath policy (active-passive or active-active)
 * \param cb_fn Function to be called back after completion.
 * \param cb_arg Argument passed to cb_fn.
 */
void bdev_nvme_set_multipath_policy(const char *name,
				    enum bdev_nvme_multipath_policy policy,
				    bdev_nvme_set_multipath_policy_cb cb_fn,
				    void *cb_arg);

#endif /* SPDK_BDEV_NVME_H */
+87 −0
Original line number Diff line number Diff line
@@ -2196,3 +2196,90 @@ cleanup:
}
SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path,
		  SPDK_RPC_RUNTIME)

/* Decoded parameters of the bdev_nvme_set_multipath_policy RPC. */
struct rpc_set_multipath_policy {
	/* NVMe bdev name; heap-allocated by the JSON decoder, freed in
	 * free_rpc_set_multipath_policy().
	 */
	char *name;
	enum bdev_nvme_multipath_policy policy;
};

/* Release memory owned by a decoded rpc_set_multipath_policy (the name string). */
static void
free_rpc_set_multipath_policy(struct rpc_set_multipath_policy *req)
{
	free(req->name);
}

/* spdk_json decoder callback: translate the JSON string value of "policy"
 * into the corresponding enum bdev_nvme_multipath_policy.  Returns 0 on
 * success or -EINVAL for any unrecognized string.
 */
static int
rpc_decode_mp_policy(const struct spdk_json_val *val, void *out)
{
	enum bdev_nvme_multipath_policy *policy = out;

	if (spdk_json_strequal(val, "active_passive")) {
		*policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
		return 0;
	}

	if (spdk_json_strequal(val, "active_active")) {
		*policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE;
		return 0;
	}

	SPDK_NOTICELOG("Invalid parameter value: policy\n");
	return -EINVAL;
}

/* JSON object decoders for the bdev_nvme_set_multipath_policy RPC parameters. */
static const struct spdk_json_object_decoder rpc_set_multipath_policy_decoders[] = {
	{"name", offsetof(struct rpc_set_multipath_policy, name), spdk_json_decode_string},
	{"policy", offsetof(struct rpc_set_multipath_policy, policy), rpc_decode_mp_policy},
};

/* Per-request context: the decoded parameters plus the JSON-RPC request to
 * answer once the asynchronous policy change completes.
 */
struct rpc_set_multipath_policy_ctx {
	struct rpc_set_multipath_policy req;
	struct spdk_jsonrpc_request *request;
};

/* Completion callback for bdev_nvme_set_multipath_policy(): turn the result
 * code into a JSON-RPC response, then release the request context.
 */
static void
rpc_bdev_nvme_set_multipath_policy_done(void *cb_arg, int rc)
{
	struct rpc_set_multipath_policy_ctx *ctx = cb_arg;

	if (rc != 0) {
		spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc));
	} else {
		spdk_jsonrpc_send_bool_response(ctx->request, true);
	}

	free_rpc_set_multipath_policy(&ctx->req);
	free(ctx);
}

/* Handler for the bdev_nvme_set_multipath_policy RPC: decode the parameters
 * and kick off the asynchronous policy change.  The JSON-RPC response is sent
 * from rpc_bdev_nvme_set_multipath_policy_done().
 */
static void
rpc_bdev_nvme_set_multipath_policy(struct spdk_jsonrpc_request *request,
				   const struct spdk_json_val *params)
{
	struct rpc_set_multipath_policy_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
		return;
	}

	if (spdk_json_decode_object(params, rpc_set_multipath_policy_decoders,
				    SPDK_COUNTOF(rpc_set_multipath_policy_decoders),
				    &ctx->req)) {
		SPDK_ERRLOG("spdk_json_decode_object failed\n");
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "spdk_json_decode_object failed");
		goto cleanup;
	}

	ctx->request = request;

	/* ctx ownership passes to the completion callback on success. */
	bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy,
				       rpc_bdev_nvme_set_multipath_policy_done, ctx);
	return;

cleanup:
	free_rpc_set_multipath_policy(&ctx->req);
	free(ctx);
}
SPDK_RPC_REGISTER("bdev_nvme_set_multipath_policy", rpc_bdev_nvme_set_multipath_policy,
		  SPDK_RPC_RUNTIME)
Loading