Commit d1c17064 authored by Jim Harris's avatar Jim Harris Committed by Tomasz Zawadzki
Browse files

nvme_perf: rate limit I/O error messages



The nvme perf tool can be useful to generate I/O for
stress conditions.  But if we use it for situations
where we expect a high rate of failures, the excessive
spew can significantly clutter the log.

So add a new -Q option (meaning "quiet") which will
rate limit these error messages.

Signed-off-by: default avatarJim Harris <james.r.harris@intel.com>
Change-Id: Ia8ab2e1ea1cfab9f43d87bcabe8f3f7589b77cda
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6077


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@mellanox.com>
parent 9765e956
Loading
Loading
Loading
Loading
+45 −6
Original line number Diff line number Diff line
@@ -261,6 +261,29 @@ static bool g_mix_specified;
static bool g_exit;
/* Default to 10 seconds for the keep alive value. This value is arbitrary. */
static uint32_t g_keep_alive_timeout_in_ms = 10000;
static uint32_t g_quiet_count = 1;

/* When user specifies -Q, some error messages are rate limited.  When rate
 * limited, we only print the error message every g_quiet_count times the
 * error occurs.
 *
 * Note: the __count is not thread safe, meaning the rate limiting will not
 * be exact when running perf with multiple thread with lots of errors.
 * Thread-local __count would mean rate-limiting per thread which doesn't
 * seem as useful.
 */
#define RATELIMIT_LOG(...) \
	{								\
		static uint64_t __count = 0;				\
		if ((__count % g_quiet_count) == 0) {			\
			if (__count > 0 && g_quiet_count > 1) {		\
				fprintf(stderr, "Message suppressed %" PRIu32 " times: ",	\
					g_quiet_count - 1);		\
			}						\
			fprintf(stderr, __VA_ARGS__);			\
		}							\
		__count++;						\
	}

static const char *g_core_mask;

@@ -1260,7 +1283,7 @@ submit_single_io(struct perf_task *task)
	rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios);

	if (spdk_unlikely(rc != 0)) {
		fprintf(stderr, "starting I/O failed\n");
		RATELIMIT_LOG("starting I/O failed\n");
	} else {
		ns_ctx->current_queue_depth++;
	}
@@ -1317,10 +1340,10 @@ io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)

	if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
		if (task->is_read) {
			fprintf(stderr, "Read completed with error (sct=%d, sc=%d)\n",
			RATELIMIT_LOG("Read completed with error (sct=%d, sc=%d)\n",
				      cpl->status.sct, cpl->status.sc);
		} else {
			fprintf(stderr, "Write completed with error (sct=%d, sc=%d)\n",
			RATELIMIT_LOG("Write completed with error (sct=%d, sc=%d)\n",
				      cpl->status.sct, cpl->status.sc);
		}
	}
@@ -1551,6 +1574,7 @@ static void usage(char *program_name)
	printf("\t[-C max completions per poll]\n");
	printf("\t\t(default: 0 - unlimited)\n");
	printf("\t[-i shared memory group ID]\n");
	printf("\t[-Q log I/O errors every N times (default: 1)\n");
	printf("\t");
	spdk_log_usage(stdout, "-T");
	printf("\t[-V enable VMD enumeration]\n");
@@ -1990,7 +2014,8 @@ parse_args(int argc, char **argv)
	long int val;
	int rc;

	while ((op = getopt(argc, argv, "a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:RS:T:U:VZ:")) != -1) {
	while ((op = getopt(argc, argv,
			    "a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:Q:RS:T:U:VZ:")) != -1) {
		switch (op) {
		case 'a':
		case 'A':
@@ -2004,6 +2029,7 @@ parse_args(int argc, char **argv)
		case 's':
		case 't':
		case 'M':
		case 'Q':
		case 'U':
			val = spdk_strtol(optarg, 10);
			if (val < 0) {
@@ -2045,6 +2071,9 @@ parse_args(int argc, char **argv)
				g_rw_percentage = val;
				g_mix_specified = true;
				break;
			case 'Q':
				g_quiet_count = val;
				break;
			case 'U':
				g_nr_unused_io_queues = val;
				break;
@@ -2187,6 +2216,11 @@ parse_args(int argc, char **argv)
		usage(argv[0]);
		return 1;
	}
	if (!g_quiet_count) {
		fprintf(stderr, "-Q value must be greater than 0\n");
		usage(argv[0]);
		return 1;
	}

	if (strncmp(g_workload_type, "rand", 4) == 0) {
		g_is_random = 1;
@@ -2571,6 +2605,11 @@ int main(int argc, char **argv)
		goto cleanup;
	}

	if (g_num_workers > 1 && g_quiet_count > 1) {
		fprintf(stderr, "Error message rate-limiting enabled across multiple threads.\n");
		fprintf(stderr, "Error suppression count may not be exact.\n");
	}

	rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
	if (rc != 0) {
		fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");