Commit 8de75f81 authored by Jim Harris
Browse files

nvme/perf: add software-based latency tracking



The latency tracking is done with ranges of bucket arrays.
The bucket for any given I/O is determined solely by TSC
deltas - any translation to microseconds is only done after
the test is finished and statistics are printed.

Each range has a number of buckets determined by a
NUM_BUCKETS_PER_RANGE value which is currently set to 128.
The buckets in ranges 0 and 1 each map to one specific TSC
delta.  The buckets in subsequent ranges each map to twice
as many TSC deltas as buckets in the previous range:

Range 0:  1 TSC each - 128 buckets cover deltas    0 to  127
Range 1:  1 TSC each - 128 buckets cover deltas  128 to  255
Range 2:  2 TSC each - 128 buckets cover deltas  256 to  511
Range 3:  4 TSC each - 128 buckets cover deltas  512 to 1023
Range 4:  8 TSC each - 128 buckets cover deltas 1024 to 2047
Range 5: 16 TSC each - 128 buckets cover deltas 2048 to 4095
etc.

While here, change some variable names and usage
messages to differentiate between the existing latency
tracking via vendor-specific NVMe log pages on Intel
NVMe SSDs, and the newly added latency tracking done
in software.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I299f1c1f6dbfa7ea0e73085f7a685e71fc687a2b
parent a591161c
Loading
Loading
Loading
Loading
+143 −8
Original line number Diff line number Diff line
@@ -81,6 +81,40 @@ struct ns_entry {
	char			name[1024];
};

/*
 * Latency tracking is done with ranges of bucket arrays.  The bucket
 * for any given I/O is determined solely by the TSC delta - any
 * translation to microseconds is only done after the test is finished
 * and statistics are printed.
 *
 * Each range has a number of buckets determined by NUM_BUCKETS_PER_RANGE
 * which is 128.  The buckets in ranges 0 and 1 each map to one specific
 * TSC delta.  The buckets in subsequent ranges each map to twice as many
 * TSC deltas as buckets in the range before it:
 *
 * Range 0:  1 TSC each - 128 buckets cover 0 to 127 (2^7-1)
 * Range 1:  1 TSC each - 128 buckets cover 128 to 255 (2^8-1)
 * Range 2:  2 TSC each - 128 buckets cover 256 to 511 (2^9-1)
 * Range 3:  4 TSC each - 128 buckets cover 512 to 1023 (2^10-1)
 * Range 4:  8 TSC each - 128 buckets cover 1024 to 2047 (2^11-1)
 * Range 5: 16 TSC each - 128 buckets cover 2048 to 4095 (2^12-1)
 * ...
 * Range 55: 2^54 TSC each - 128 buckets cover 2^61 to 2^62-1
 * Range 56: 2^55 TSC each - 128 buckets cover 2^62 to 2^63-1
 * Range 57: 2^56 TSC each - 128 buckets cover 2^63 to 2^64-1
 *
 * On a 2.3GHz processor, this strategy results in 50ns buckets in the
 * 7-14us range (sweet spot for Intel Optane SSD latency testing).
 *
 * Buckets can be made more granular by increasing BUCKET_SHIFT.  This
 * comes at the cost of additional storage per namespace context to
 * store the bucket data.
 */
#define BUCKET_SHIFT 7
#define NUM_BUCKETS_PER_RANGE (1ULL << BUCKET_SHIFT)
#define BUCKET_MASK (NUM_BUCKETS_PER_RANGE - 1)
#define NUM_BUCKET_RANGES (64 - BUCKET_SHIFT + 1)

struct ns_worker_ctx {
	struct ns_entry		*entry;
	uint64_t		io_completed;
@@ -105,6 +139,8 @@ struct ns_worker_ctx {
	} u;

	struct ns_worker_ctx	*next;

	uint64_t		bucket[NUM_BUCKET_RANGES][NUM_BUCKETS_PER_RANGE];
};

struct perf_task {
@@ -124,7 +160,8 @@ struct worker_thread {

static int g_outstanding_commands;

static bool g_latency_tracking_enable = false;
static bool g_latency_ssd_tracking_enable = false;
static int g_latency_sw_tracking_level = 0;

static struct rte_mempool *task_pool;

@@ -160,6 +197,63 @@ static int g_aio_optind; /* Index of first AIO filename in argv */
static void
task_complete(struct perf_task *task);

/*
 * Map a TSC delta to its bucket range.  Range 0 covers deltas
 * 0 to 2^BUCKET_SHIFT - 1; range r (r >= 1) covers deltas
 * 2^(BUCKET_SHIFT + r - 1) to 2^(BUCKET_SHIFT + r) - 1, so the
 * range is derived from the position of the delta's most
 * significant set bit, found via count-leading-zeros.
 */
static uint32_t
get_bucket_range(uint64_t tsc)
{
	uint32_t clz, range;

	/* __builtin_clzll(0) is undefined; callers never pass 0. */
	assert(tsc != 0);

	clz = __builtin_clzll(tsc);

	/*
	 * The highest valid range index is NUM_BUCKET_RANGES - 1.
	 * Without the "- 1" terms, a delta with bit 63 set (clz == 0)
	 * would produce range == NUM_BUCKET_RANGES and index one past
	 * the end of the per-namespace bucket array, and every delta
	 * would land one range higher than the mapping documented
	 * above (and assumed by get_us_from_bucket()).
	 */
	if (clz <= NUM_BUCKET_RANGES - 1) {
		range = NUM_BUCKET_RANGES - 1 - clz;
	} else {
		/* Deltas below 2^BUCKET_SHIFT all fall in range 0. */
		range = 0;
	}

	return range;
}

/*
 * Map a TSC delta to its bucket index within the given range.
 * Ranges 0 and 1 hold one TSC delta per bucket; each range r >= 2
 * holds 2^(r - 1) deltas per bucket, so the delta is scaled down
 * by that factor before masking off the low BUCKET_SHIFT bits.
 */
static uint32_t
get_bucket_index(uint64_t tsc, uint32_t range)
{
	uint32_t scale_shift = (range > 0) ? range - 1 : 0;

	return (tsc >> scale_shift) & BUCKET_MASK;
}

/*
 * Convert a (range, index) bucket pair back to the upper edge of
 * the bucket's TSC interval, expressed in microseconds.  This is
 * the inverse of get_bucket_range()/get_bucket_index() and is only
 * used when printing statistics after the test completes.
 */
static double
get_us_from_bucket(uint32_t range, uint32_t index)
{
	uint64_t ticks;
	/* Use index + 1 so the value reported is the bucket's upper edge. */
	uint32_t edge = index + 1;

	if (range == 0) {
		/* Range 0 buckets are one TSC delta wide starting at 0. */
		ticks = edge;
	} else {
		/*
		 * Range r starts at 2^(BUCKET_SHIFT + r - 1) and each of
		 * its buckets spans 2^(r - 1) TSC deltas.
		 */
		ticks = (1ULL << (range + BUCKET_SHIFT - 1)) +
			((uint64_t)edge << (range - 1));
	}

	return (double)ticks * 1000 * 1000 / g_tsc_rate;
}

/*
 * Record one I/O's TSC delta in the namespace context's software
 * latency histogram.  Called from the completion path when -L
 * latency tracking is enabled.
 */
static void
track_latency(struct ns_worker_ctx *ns_ctx, uint64_t tsc)
{
	uint32_t bucket_range, bucket_idx;

	bucket_range = get_bucket_range(tsc);
	bucket_idx = get_bucket_index(tsc, bucket_range);
	ns_ctx->bucket[bucket_range][bucket_idx]++;
}

static void
register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
{
@@ -277,7 +371,7 @@ register_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
	entry->next = g_controllers;
	g_controllers = entry;

	if (g_latency_tracking_enable &&
	if (g_latency_ssd_tracking_enable &&
	    spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
		set_latency_tracking_feature(ctrlr, true);

@@ -494,7 +588,9 @@ task_complete(struct perf_task *task)
	if (ns_ctx->max_tsc < tsc_diff) {
		ns_ctx->max_tsc = tsc_diff;
	}

	if (g_latency_sw_tracking_level > 0) {
		track_latency(ns_ctx, tsc_diff);
	}
	rte_mempool_put(task_pool, task);

	/*
@@ -655,7 +751,8 @@ static void usage(char *program_name)
	printf("\t[-w io pattern type, must be one of\n");
	printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
	printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
	printf("\t[-l enable latency tracking, default: disabled]\n");
	printf("\t[-L enable latency tracking via sw, default: disabled]\n");
	printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n");
	printf("\t[-t time in seconds]\n");
	printf("\t[-c core mask for I/O submission/completion.]\n");
	printf("\t\t(default: 1)]\n");
@@ -731,6 +828,41 @@ print_performance(void)
	       sum_ave_latency / ns_count, sum_min_latency / ns_count,
	       sum_max_latency / ns_count);
	printf("\n");

	if (g_latency_sw_tracking_level == 0) {
		return;
	}
	worker = g_workers;
	while (worker) {
		ns_ctx = worker->ns_ctx;
		while (ns_ctx) {
			uint64_t i, j, so_far = 0;
			float so_far_pct = 0;
			double last_bucket, bucket = 0;

			printf("Latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
			printf("=========================================================================\n");
			printf("       Range in us     Cumulative    IO count\n");

			for (i = 0; i < NUM_BUCKET_RANGES; i++) {
				for (j = 0; j < NUM_BUCKETS_PER_RANGE; j++) {
					so_far += ns_ctx->bucket[i][j];
					so_far_pct = (float)so_far * 100 / total_io_completed;
					last_bucket = bucket;
					bucket = get_us_from_bucket(i, j);
					if (ns_ctx->bucket[i][j] == 0) {
						continue;
					}
					printf("%9.3f - %9.3f: %9.4f%%  (%9ju)\n",
					       last_bucket, bucket, so_far_pct, ns_ctx->bucket[i][j]);
				}
			}
			printf("\n");
			ns_ctx = ns_ctx->next;
		}
		worker = worker->next;
	}

}

static void
@@ -804,7 +936,7 @@ static void
print_stats(void)
{
	print_performance();
	if (g_latency_tracking_enable) {
	if (g_latency_ssd_tracking_enable) {
		if (g_rw_percentage != 0) {
			print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
		}
@@ -866,7 +998,7 @@ parse_args(int argc, char **argv)
	g_core_mask = NULL;
	g_max_completions = 0;

	while ((op = getopt(argc, argv, "c:d:i:lm:q:r:s:t:w:M:")) != -1) {
	while ((op = getopt(argc, argv, "c:d:i:lm:q:r:s:t:w:LM:")) != -1) {
		switch (op) {
		case 'c':
			g_core_mask = optarg;
@@ -878,7 +1010,7 @@ parse_args(int argc, char **argv)
			g_shm_id = atoi(optarg);
			break;
		case 'l':
			g_latency_tracking_enable = true;
			g_latency_ssd_tracking_enable = true;
			break;
		case 'm':
			g_max_completions = atoi(optarg);
@@ -901,6 +1033,9 @@ parse_args(int argc, char **argv)
		case 'w':
			workload_type = optarg;
			break;
		case 'L':
			g_latency_sw_tracking_level++;
			break;
		case 'M':
			g_rw_percentage = atoi(optarg);
			mix_specified = true;
@@ -1126,7 +1261,7 @@ unregister_controllers(void)
	while (entry) {
		struct ctrlr_entry *next = entry->next;
		spdk_free(entry->latency_page);
		if (g_latency_tracking_enable &&
		if (g_latency_ssd_tracking_enable &&
		    spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
			set_latency_tracking_feature(entry->ctrlr, false);
		spdk_nvme_detach(entry->ctrlr);
+1 −1
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ done
timing_exit identify

timing_enter perf
$rootdir/examples/nvme/perf/perf -q 128 -w read -s 12288 -t 1
$rootdir/examples/nvme/perf/perf -q 128 -w read -s 12288 -t 1 -L
timing_exit perf

timing_enter reserve