Commit a5f10654 authored by Mariusz Tkaczyk's avatar Mariusz Tkaczyk Committed by Jim Harris
Browse files

nvme_bdev: flush support



If flush is requested for an nvme device, spdk should guarantee that data
is not in the volatile write cache (VWC). Therefore, flush is sent only if
the device reported VWC support. A similar approach is used in the Linux kernel.

The cache check is added in nvme_bdev because this way it does not
prevent sending flush in passthrough mode.

Due to backward compatibility, bdev_nvme flush support is disabled by
default and must be turned on by setting the "enable_flush" rpc option.

Change-Id: I2912963e62596e2fa1410e4edc14921c7e25de9c
Signed-off-by: default avatarMariusz Tkaczyk <mariusz.tkaczyk@dell.com>
Reviewed-on: https://review.spdk.io/c/spdk/spdk/+/26141


Reviewed-by: default avatarJim Harris <jim.harris@nvidia.com>
Community-CI: Mellanox Build Bot
Tested-by: default avatarSPDK Automated Test System <spdkbot@gmail.com>
Reviewed-by: default avatarKonrad Sztyber <ksztyber@nvidia.com>
parent a083dd99
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -4130,6 +4130,7 @@ They can be overridden if they are given by the RPC bdev_nvme_attach_controller.
 dhchap_dhgroups            | Optional   | list    | List of allowed DH-HMAC-CHAP DH groups.
 rdma_umr_per_io            | Optional   | boolean | Enable/disable scatter-gather UMR per IO in RDMA transport if supported by system
 tcp_connect_timeout_ms     | Optional   | number  | Time to wait until TCP connection is done. Default: 0 (no timeout).
 enable_flush               | Optional   | boolean | If true, pass flush to nvme devices when volatile write cache is present. Default : `false`.

#### Example

+2 −1
Original line number Diff line number Diff line
@@ -108,8 +108,9 @@ struct spdk_bdev_nvme_opts {
	/* Hole at bytes 121-123. */
	uint8_t reserved121[3];
	uint32_t tcp_connect_timeout_ms;
	bool enable_flush;
};
SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 128, "Incorrect size");
SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 136, "Incorrect size");

/**
 * Connect to the NVMe controller and populate namespaces as bdevs.
+24 −3
Original line number Diff line number Diff line
@@ -188,6 +188,7 @@ static struct spdk_bdev_nvme_opts g_opts = {
	.dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
	.dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
	.rdma_umr_per_io = false,
	.enable_flush = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
@@ -3244,6 +3245,8 @@ static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
				  uint64_t num_blocks);

static int bdev_nvme_flush(struct nvme_bdev_io *bio);

static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks,
			  uint64_t src_offset_blocks,
			  uint64_t num_blocks);
@@ -3360,8 +3363,14 @@ _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_i
		return;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		/* No need to send flush if Volatile Write Cache is disabled */
		if (!bdev->write_cache || !g_opts.enable_flush) {
			bdev_nvme_io_complete(nbdev_io, 0);
			return;
		}

		rc = bdev_nvme_flush(nbdev_io);
		break;

	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = bdev_nvme_zone_appendv(nbdev_io,
@@ -6220,12 +6229,13 @@ spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts, size_t opts_size)
	SET_FIELD(dhchap_dhgroups, 0);
	SET_FIELD(rdma_umr_per_io, false);
	SET_FIELD(tcp_connect_timeout_ms, 0);
	SET_FIELD(enable_flush, false);

#undef SET_FIELD

	/* Do not remove this statement, you should always update this statement when you adding a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 128, "Incorrect size");
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 136, "Incorrect size");
}

static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
@@ -6338,6 +6348,7 @@ spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
	SET_FIELD(dhchap_digests, 0);
	SET_FIELD(dhchap_dhgroups, 0);
	SET_FIELD(tcp_connect_timeout_ms, 0);
	SET_FIELD(enable_flush, false);

	g_opts.opts_size = opts->opts_size;

@@ -8630,6 +8641,14 @@ bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_
					     0);
}

static int
bdev_nvme_flush(struct nvme_bdev_io *bio)
{
	return spdk_nvme_ns_cmd_flush(bio->io_path->nvme_ns->ns,
				      bio->io_path->qpair->qpair,
				      bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
			struct spdk_bdev_zone_info *info)
@@ -8937,6 +8956,8 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
	spdk_json_write_array_end(w);
	spdk_json_write_named_bool(w, "rdma_umr_per_io", g_opts.rdma_umr_per_io);
	spdk_json_write_named_uint32(w, "tcp_connect_timeout_ms", g_opts.tcp_connect_timeout_ms);
	spdk_json_write_named_bool(w, "enable_flush", g_opts.enable_flush);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
+1 −0
Original line number Diff line number Diff line
@@ -140,6 +140,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] =
	{"dhchap_dhgroups", offsetof(struct spdk_bdev_nvme_opts, dhchap_dhgroups), rpc_decode_dhgroup_array, true},
	{"rdma_umr_per_io", offsetof(struct spdk_bdev_nvme_opts, rdma_umr_per_io), spdk_json_decode_bool, true},
	{"tcp_connect_timeout_ms", offsetof(struct spdk_bdev_nvme_opts, tcp_connect_timeout_ms), spdk_json_decode_uint32, true},
	{"enable_flush", offsetof(struct spdk_bdev_nvme_opts, enable_flush), spdk_json_decode_bool, true},
};

static void
+2 −0
Original line number Diff line number Diff line
@@ -509,6 +509,8 @@ def add_parser(subparsers):
                   action='store_false', dest='rdma_umr_per_io')
    p.add_argument('--tcp-connect-timeout-ms',
                   help='Time to wait until TCP connection is done. Default: 0 (no timeout).', type=int)
    p.add_argument('--enable-flush', help='Pass flush to NVMe when volatile write cache is present',
                   action='store_true')

    p.set_defaults(func=bdev_nvme_set_options)

Loading