Commit f61b0041 authored by Richael Zhuang's avatar Richael Zhuang Committed by Jim Harris
Browse files

bdev_nvme: update nvme_io_path stat when IO completes



Currently we have stat per bdev I/O channel, but for NVMe bdev
multipath, we don't have stat per I/O path. Especially for
active-active mode, we may want to observe each path's statistics.

This patch supports I/O stat for nvme_io_path. Each nvme_io_path's
stat is recorded using structure spdk_bdev_io_stat.

The following is the comparison of bdevperf test.

Test on Arm server with the following basic configuration.
1 Null bdev: block size: 4K, num_blocks:16k
run bdevperf with io size=4k, qdepth=1/32/128, rw type=randwrite/mixed with 70% read/randread

Each run lasts 30 seconds; each item is run 16 times and the average is taken.

The result is as follows.

qdepth type   IOPS(default) IOPS(this patch)  diff
1   randwrite   7795157.27  7859909.78       0.83%
1   mix(70% r)  7418607.08  7404026.54      -0.20%
1   randread    8053560.83  8046315.44      -0.09%

32  randwrite   15409191.3  15327642.11	    -0.53%
32  mix(70% r)  13760145.97 13714666.28	    -0.33%
32  randread    16136922.98 16038855.39	    -0.61%

128 randwrite   14815647.56 14944902.74	     0.87%
128 mix(70% r)  13414858.59 13412317.46	    -0.02%
128 randread    15508642.43 15521752.41	     0.08%

Change-Id: I4eb5673f49d65d3ff9b930361d2f31ab0ccfa021
Signed-off-by: default avatarRichael Zhuang <richael.zhuang@arm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14743


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
parent 8ddc102a
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -66,6 +66,9 @@ collecting NVMe error counts.
New APIs `spdk_bdev_reset_io_stat`, `spdk_bdev_add_io_stat` and `spdk_bdev_dump_io_stat_json`
were added to process I/O statistics outside the generic bdev layer, especially in bdev modules.

Added I/O statistics per I/O path to the NVMe bdev module for NVMe bdev multipath. It can be
enabled by a new option io_path_stat of RPC bdev_nvme_set_options.

### event

Added core lock file mechanism to prevent the same CPU cores from being used by multiple
@@ -101,6 +104,8 @@ Added `rr_min_io` option to RPC bdev_nvme_set_multipath_policy. It switches I/O
another path after rr_min_io I/Os are routed to current io path for the round-robin
path selector.

Added option `--io-path-stat` for RPC bdev_nvme_set_options to enable collecting I/O path stat.

### bdevperf

Promoted the application to example to match similar programs: fio_plugin and perf.
+1 −0
Original line number Diff line number Diff line
@@ -3618,6 +3618,7 @@ generate_uuids | Optional | boolean | Enable generation of UUIDs
transport_tos              | Optional | number      | IPv4 Type of Service value. Only applicable for RDMA transport. Default: 0 (no TOS is applied).
nvme_error_stat            | Optional | boolean     | Enable collecting NVMe error counts.
rdma_srq_size              | Optional | number      | Set the size of a shared rdma receive queue. Default: 0 (disabled).
io_path_stat               | Optional | boolean     | Enable collecting I/O stat of each nvme bdev io path. Default: `false`.

#### Example

+122 −0
Original line number Diff line number Diff line
@@ -93,6 +93,9 @@ struct nvme_bdev_io {

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;
};

struct nvme_probe_skip_entry {
@@ -126,6 +129,7 @@ static struct spdk_bdev_nvme_opts g_opts = {
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
@@ -588,10 +592,21 @@ _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_
		return -ENOMEM;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return -ENOMEM;
		}
		spdk_bdev_reset_io_stat(io_path->stat, BDEV_RESET_STAT_MAXMIN);
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		free(io_path->stat);
		free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
@@ -635,6 +650,7 @@ _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_pat
	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	free(io_path->stat);
	free(io_path);
}

@@ -1122,6 +1138,99 @@ bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk
	pthread_mutex_unlock(&nbdev->mutex);
}

/*
 * Accumulate per-I/O-path statistics for a successfully completed I/O.
 *
 * Updates the spdk_bdev_io_stat attached to the nvme_io_path that serviced
 * this I/O: byte count, operation count, and total/max/min latency in TSC
 * ticks, bucketed by bdev I/O type. Latency is measured from bio->submit_tsc
 * (recorded at submission in bdev_nvme_submit_request) to now.
 *
 * io_path->stat is only allocated when the io_path_stat option of RPC
 * bdev_nvme_set_options is enabled, so the NULL check below makes this a
 * cheap no-op in the default configuration.
 */
static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	/* Stats are opt-in; stat is NULL unless g_opts.io_path_stat was set
	 * when this io_path was added.
	 */
	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	/* NOTE(review): the min-latency comparisons below rely on
	 * spdk_bdev_reset_io_stat(..., BDEV_RESET_STAT_MAXMIN) initializing
	 * the min_* fields to a maximal value — confirm against the bdev
	 * layer's reset semantics.
	 */
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		/* populate == reading into the zcopy buffer, so count it as a
		 * read; otherwise the commit carries written data.
		 */
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		/* Other I/O types (e.g. flush, reset, compare) have no
		 * dedicated fields in spdk_bdev_io_stat and are intentionally
		 * not tracked here.
		 */
		break;
	}
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
@@ -1136,6 +1245,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

@@ -1188,6 +1298,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

@@ -1223,6 +1334,7 @@ bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

@@ -2328,6 +2440,15 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_i
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;

	if (spdk_likely(nbdev_io->submit_tsc == 0)) {
		nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
	} else {
		/* There are cases where submit_tsc != 0, i.e. retry I/O.
		 * We need to update submit_tsc here.
		 */
		nbdev_io->submit_tsc = spdk_get_ticks();
	}

	spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
	nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
	if (spdk_unlikely(!nbdev_io->io_path)) {
@@ -6942,6 +7063,7 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
+4 −0
Original line number Diff line number Diff line
@@ -198,6 +198,9 @@ struct nvme_io_path {
	/* The following are used to update io_path cache of the nvme_bdev_channel. */
	struct nvme_bdev_channel	*nbdev_ch;
	TAILQ_ENTRY(nvme_io_path)	tailq;

	/* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */
	struct spdk_bdev_io_stat	*stat;
};

struct nvme_bdev_channel {
@@ -274,6 +277,7 @@ struct spdk_bdev_nvme_opts {
	uint8_t transport_tos;
	bool nvme_error_stat;
	uint32_t rdma_srq_size;
	bool io_path_stat;
};

struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
+1 −0
Original line number Diff line number Diff line
@@ -73,6 +73,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] =
	{"transport_tos", offsetof(struct spdk_bdev_nvme_opts, transport_tos), spdk_json_decode_uint8, true},
	{"nvme_error_stat", offsetof(struct spdk_bdev_nvme_opts, nvme_error_stat), spdk_json_decode_bool, true},
	{"rdma_srq_size", offsetof(struct spdk_bdev_nvme_opts, rdma_srq_size), spdk_json_decode_uint32, true},
	{"io_path_stat", offsetof(struct spdk_bdev_nvme_opts, io_path_stat), spdk_json_decode_bool, true},
};

static void
Loading