Commit 22b77a3c authored by Shuhei Matsumoto's avatar Shuhei Matsumoto Committed by Tomasz Zawadzki
Browse files

bdev/nvme: Set preferred I/O path in multipath mode



If we specify a preferred path manually for each NVMe bdev, we will
be able to realize a simple static load balancing and make the failover
more controllable in the multipath mode.

The idea is to move I/O path to the NVMe-oF controller to the head of
the list and then clear the I/O path cache for each NVMe bdev channel.
We can set the I/O path to the I/O path cache directly but it must be
conditional and make the code very complex. Hence, let find_io_path() do
that.

However, an NVMe bdev channel may be acquired after setting the preferred
path. To cover such a case, sort the nvme_ns list of the NVMe bdev too.

This feature supports only multipath mode. The NVMe bdev module supports
failover mode too. However, to support the latter, the new RPC would need to
take a trid as a parameter, and the code and the usage would become very
complex. Add a note for such a limitation.

To verify one by one exactly, add unit test.

Signed-off-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Ia51c74f530d6d7dc1f73d5b65f854967363e76b0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12262


Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: default avatar <tanl12@chinatelecom.cn>
Reviewed-by: default avatarGangCao <gang.cao@intel.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
parent f0935084
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -23,6 +23,9 @@ safe for race conditions.

A new RPC `bdev_nvme_get_io_paths` was added to get all active I/O paths.

A new RPC `bdev_nvme_set_preferred_path` was added to set preferred I/O path for an NVMe bdev
when in multipath mode. This RPC does not support NVMe bdevs in failover mode.

### idxd

A new parameter `flags` was added to all low level submission and preparation
+39 −0
Original line number Diff line number Diff line
@@ -3394,6 +3394,45 @@ Example response:
}
~~~

### bdev_nvme_set_preferred_path {#rpc_bdev_nvme_set_preferred_path}

Set the preferred I/O path for an NVMe bdev in multipath mode.

NOTE: This RPC does not support NVMe bdevs in failover mode.

#### Parameters

Name                    | Optional | Type        | Description
----------------------- | -------- | ----------- | -----------
name                    | Required | string      | Name of the NVMe bdev
cntlid                  | Required | number      | NVMe-oF controller ID

#### Example

Example request:

~~~json
{
  "jsonrpc": "2.0",
  "method": "bdev_nvme_set_preferred_path",
  "id": 1,
  "params": {
    "name": "Nvme0n1",
    "cntlid": 0
  }
}
~~~

Example response:

~~~json
{
  "jsonrpc": "2.0",
  "id": 1,
  "result": true
}
~~~

### bdev_nvme_cuse_register {#rpc_bdev_nvme_cuse_register}

Register CUSE device on NVMe controller.
+154 −0
Original line number Diff line number Diff line
@@ -3475,6 +3475,160 @@ nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
	return rc;
}

/* No-op bdev event callback. bdev_nvme_set_preferred_path() opens the bdev
 * only to keep it from being unregistered during the channel iteration, so
 * bdev events are ignored here.
 */
static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}

/* Context carried through the spdk_for_each_channel() iteration that applies
 * the new preferred I/O path to every channel of an NVMe bdev.
 */
struct bdev_nvme_set_preferred_path_ctx {
	struct spdk_bdev_desc *desc;	/* Keeps the bdev open until the iteration completes. */
	struct nvme_ns *nvme_ns;	/* Namespace whose I/O path becomes preferred. */
	bdev_nvme_set_preferred_path_cb cb_fn;	/* Caller's completion callback. */
	void *cb_arg;	/* Argument passed to cb_fn. */
};

/* Completion of the per-channel preferred-path update: release the bdev
 * descriptor, report the final status to the caller, and free the context.
 */
static void
bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status)
{
	struct bdev_nvme_set_preferred_path_ctx *path_ctx;

	path_ctx = spdk_io_channel_iter_get_ctx(i);

	assert(path_ctx != NULL);
	assert(path_ctx->desc != NULL);
	assert(path_ctx->cb_fn != NULL);

	spdk_bdev_close(path_ctx->desc);
	path_ctx->cb_fn(path_ctx->cb_arg, status);
	free(path_ctx);
}

/* Per-channel handler: move the I/O path that goes through the preferred
 * namespace to the head of this channel's I/O path list.
 */
static void
_bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i)
{
	struct bdev_nvme_set_preferred_path_ctx *path_ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_io_path *cur, *before;

	/* Find the matching I/O path, remembering its predecessor so the
	 * entry can be unlinked from the singly linked list.
	 */
	before = NULL;
	STAILQ_FOREACH(cur, &nbdev_ch->io_path_list, stailq) {
		if (cur->nvme_ns == path_ctx->nvme_ns) {
			break;
		}
		before = cur;
	}

	/* before == NULL means the path is absent or already at the head. */
	if (cur != NULL && before != NULL) {
		STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, before, stailq);
		STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, cur, stailq);

		/* We could assign cur to nbdev_ch->current_io_path here, but
		 * that would have to be conditional. Just invalidate the cache
		 * and let find_io_path() repopulate it from the new head.
		 */
		nbdev_ch->current_io_path = NULL;
	}

	spdk_for_each_channel_continue(i, 0);
}

/* Move the namespace reached through the controller identified by cntlid to
 * the head of nbdev's namespace list, so channels created later will build
 * their I/O path lists in the preferred order. Returns the namespace, or NULL
 * when no namespace of this bdev belongs to that controller. The caller holds
 * nbdev->mutex.
 */
static struct nvme_ns *
bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid)
{
	struct nvme_ns *ns, *before;
	const struct spdk_nvme_ctrlr_data *ctrlr_data;

	before = NULL;
	TAILQ_FOREACH(ns, &nbdev->nvme_ns_list, tailq) {
		ctrlr_data = spdk_nvme_ctrlr_get_data(ns->ctrlr->ctrlr);
		if (ctrlr_data->cntlid == cntlid) {
			break;
		}
		before = ns;
	}

	/* Relink only when found and not already the first entry. */
	if (ns != NULL && before != NULL) {
		TAILQ_REMOVE(&nbdev->nvme_ns_list, ns, tailq);
		TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, ns, tailq);
	}

	return ns;
}

/* This function supports only multipath mode, where each NVMe-oF controller
 * contributes exactly one I/O path. Hence it is enough to move the matched
 * I/O path to the head of the I/O path list of every NVMe bdev channel.
 *
 * An NVMe bdev channel may be acquired after this function completes, so the
 * matched namespace is also moved to the head of the NVMe bdev's namespace
 * list.
 *
 * cb_fn is invoked exactly once with the result code, on every path.
 */
void
bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
			     bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg)
{
	struct bdev_nvme_set_preferred_path_ctx *path_ctx;
	struct spdk_bdev *bdev;
	struct nvme_bdev *nbdev;
	int rc = 0;

	assert(cb_fn != NULL);

	path_ctx = calloc(1, sizeof(*path_ctx));
	if (path_ctx == NULL) {
		SPDK_ERRLOG("Failed to alloc context.\n");
		rc = -ENOMEM;
		goto err_alloc;
	}

	path_ctx->cb_fn = cb_fn;
	path_ctx->cb_arg = cb_arg;

	/* Hold the bdev open so it stays valid for the whole channel iteration. */
	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &path_ctx->desc);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to open bdev %s.\n", name);
		goto err_open;
	}

	bdev = spdk_bdev_desc_get_bdev(path_ctx->desc);
	if (bdev->module != &nvme_if) {
		SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
		rc = -ENODEV;
		goto err_bdev;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);

	/* Reorder the namespace list under the bdev mutex; future channels
	 * will then build their I/O path lists in the preferred order.
	 */
	pthread_mutex_lock(&nbdev->mutex);
	path_ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid);
	pthread_mutex_unlock(&nbdev->mutex);

	if (path_ctx->nvme_ns == NULL) {
		SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid);
		rc = -ENODEV;
		goto err_bdev;
	}

	/* Apply the new ordering to every existing channel of this bdev. */
	spdk_for_each_channel(nbdev,
			      _bdev_nvme_set_preferred_path,
			      path_ctx,
			      bdev_nvme_set_preferred_path_done);
	return;

err_bdev:
	spdk_bdev_close(path_ctx->desc);
err_open:
	free(path_ctx);
err_alloc:
	cb_fn(cb_arg, rc);
}

static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{
+15 −0
Original line number Diff line number Diff line
@@ -318,4 +318,19 @@ int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id);
 */
int bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg);

typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);

/**
 * Set the preferred I/O path for an NVMe bdev in multipath mode.
 *
 * NOTE: This function does not support NVMe bdevs in failover mode.
 *
 * \param name NVMe bdev name
 * \param cntlid NVMe-oF controller ID
 * \param cb_fn Function to be called back after completion.
 * \param cb_arg Argument for callback function.
 */
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
				  bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);

#endif /* SPDK_BDEV_NVME_H */
+70 −0
Original line number Diff line number Diff line
@@ -2126,3 +2126,73 @@ rpc_bdev_nvme_get_io_paths(struct spdk_jsonrpc_request *request,
			      rpc_bdev_nvme_get_io_paths_done);
}
SPDK_RPC_REGISTER("bdev_nvme_get_io_paths", rpc_bdev_nvme_get_io_paths, SPDK_RPC_RUNTIME)

/* Decoded parameters of the bdev_nvme_set_preferred_path RPC. */
struct rpc_bdev_nvme_set_preferred_path {
	char *name;	/* NVMe bdev name; heap-allocated by the JSON decoder. */
	uint16_t cntlid;	/* NVMe-oF controller ID of the preferred path. */
};

/* Release memory allocated while decoding the RPC parameters. */
static void
free_rpc_bdev_nvme_set_preferred_path(struct rpc_bdev_nvme_set_preferred_path *req)
{
	free(req->name);
}

/* JSON decoders for the RPC parameters; both fields are required. */
static const struct spdk_json_object_decoder rpc_bdev_nvme_set_preferred_path_decoders[] = {
	{"name", offsetof(struct rpc_bdev_nvme_set_preferred_path, name), spdk_json_decode_string},
	{"cntlid", offsetof(struct rpc_bdev_nvme_set_preferred_path, cntlid), spdk_json_decode_uint16},
};

/* Per-request state kept alive until the asynchronous operation completes. */
struct rpc_bdev_nvme_set_preferred_path_ctx {
	struct rpc_bdev_nvme_set_preferred_path req;	/* Decoded parameters. */
	struct spdk_jsonrpc_request *request;	/* Request to answer on completion. */
};

/* Completion callback: translate rc into a JSON-RPC bool or error response,
 * then release the per-request context.
 */
static void
rpc_bdev_nvme_set_preferred_path_done(void *cb_arg, int rc)
{
	struct rpc_bdev_nvme_set_preferred_path_ctx *rpc_ctx = cb_arg;

	if (rc != 0) {
		spdk_jsonrpc_send_error_response(rpc_ctx->request, rc, spdk_strerror(-rc));
	} else {
		spdk_jsonrpc_send_bool_response(rpc_ctx->request, true);
	}

	free_rpc_bdev_nvme_set_preferred_path(&rpc_ctx->req);
	free(rpc_ctx);
}

/* Handler for the bdev_nvme_set_preferred_path RPC: decode the parameters and
 * kick off the asynchronous preferred-path update. The response is sent from
 * rpc_bdev_nvme_set_preferred_path_done().
 */
static void
rpc_bdev_nvme_set_preferred_path(struct spdk_jsonrpc_request *request,
				 const struct spdk_json_val *params)
{
	struct rpc_bdev_nvme_set_preferred_path_ctx *rpc_ctx;

	rpc_ctx = calloc(1, sizeof(*rpc_ctx));
	if (rpc_ctx == NULL) {
		spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
		return;
	}

	if (spdk_json_decode_object(params, rpc_bdev_nvme_set_preferred_path_decoders,
				    SPDK_COUNTOF(rpc_bdev_nvme_set_preferred_path_decoders),
				    &rpc_ctx->req)) {
		SPDK_ERRLOG("spdk_json_decode_object failed\n");
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "spdk_json_decode_object failed");
		goto cleanup;
	}

	rpc_ctx->request = request;

	bdev_nvme_set_preferred_path(rpc_ctx->req.name, rpc_ctx->req.cntlid,
				     rpc_bdev_nvme_set_preferred_path_done, rpc_ctx);
	return;

cleanup:
	free_rpc_bdev_nvme_set_preferred_path(&rpc_ctx->req);
	free(rpc_ctx);
}
SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path,
		  SPDK_RPC_RUNTIME)
Loading