Commit da214ab2 authored by Jim Harris's avatar Jim Harris Committed by Daniel Verkamp
Browse files

nvme: add new test application to measure SW overhead



Signed-off-by: default avatarJim Harris <james.r.harris@intel.com>
Change-Id: I330ca1577e5725b0e135422156328d5b165e79a3
parent 921bf342
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -34,7 +34,7 @@
SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk

DIRS-y = unit aer reset sgl e2edp
DIRS-y = unit aer reset sgl e2edp overhead

.PHONY: all clean $(DIRS-y)

+62 −0
Original line number Diff line number Diff line
#
#  BSD LICENSE
#
#  Copyright (c) Intel Corporation.
#  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Intel Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk

APP = overhead

C_SRCS := overhead.c

CFLAGS += -I. $(DPDK_INC)

SPDK_LIBS += $(SPDK_ROOT_DIR)/lib/nvme/libspdk_nvme.a \
	     $(SPDK_ROOT_DIR)/lib/util/libspdk_util.a \
	     $(SPDK_ROOT_DIR)/lib/memory/libspdk_memory.a

LIBS += $(SPDK_LIBS) $(PCIACCESS_LIB) -lpthread $(DPDK_LIB) -lrt

ifeq ($(OS),Linux)
LIBS += -laio
CFLAGS += -DHAVE_LIBAIO
endif

all : $(APP)

$(APP) : $(OBJS) $(SPDK_LIBS)
	$(LINK_C)

clean :
	$(CLEAN_C) $(APP)

include $(SPDK_ROOT_DIR)/mk/spdk.deps.mk
+24 −0
Original line number Diff line number Diff line
This application measures the software overhead of I/O submission
and completion for both the SPDK NVMe driver and an AIO file handle.
It runs a random read, queue depth = 1 workload to a single device,
and captures TSC as follows:

* Submission: capture TSC before and after the I/O submission
  call (SPDK or AIO).
* Completion: capture TSC before and after the I/O completion
  check.  Only record the TSC delta if the I/O completion check
  resulted in a completed I/O.  Also use heuristics in the AIO
  case to account for time spent in interrupt handling outside
  of the actual I/O completion check.

Usage:

To test software overhead for a 4KB I/O over a 10 second period:

SPDK:  overhead -s 4096 -t 10
AIO:   overhead -s 4096 -t 10 /dev/nvme0n1

Note that for the SPDK case, it will only use the first namespace
on the first controller found by SPDK.  If a different namespace is
desired, attach controllers individually to the kernel NVMe driver
to ensure they will not be enumerated by SPDK.
+654 −0
Original line number Diff line number Diff line
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_lcore.h>

#include "spdk/file.h"
#include "spdk/nvme.h"
#include "spdk/pci.h"
#include "spdk/string.h"
#include "spdk/nvme_intel.h"

#if HAVE_LIBAIO
#include <libaio.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct ctrlr_entry {
	struct spdk_nvme_ctrlr			*ctrlr;
	struct ctrlr_entry			*next;
	char					name[1024];
};

enum entry_type {
	ENTRY_TYPE_NVME_NS,
	ENTRY_TYPE_AIO_FILE,
};

struct ns_entry {
	enum entry_type		type;

	union {
		struct {
			struct spdk_nvme_ctrlr	*ctrlr;
			struct spdk_nvme_ns	*ns;
			struct spdk_nvme_qpair	*qpair;
		} nvme;
#if HAVE_LIBAIO
		struct {
			int			fd;
			struct io_event		*events;
			io_context_t		ctx;
		} aio;
#endif
	} u;

	uint32_t		io_size_blocks;
	uint64_t		size_in_ios;
	bool			is_draining;
	uint32_t		current_queue_depth;
	char			name[1024];
};

struct perf_task {
	void			*buf;
	uint64_t		submit_tsc;
#if HAVE_LIBAIO
	struct iocb		iocb;
#endif
};

struct rte_mempool *request_mempool;

static struct ctrlr_entry *g_ctrlr = NULL;
static struct ns_entry *g_ns = NULL;

static uint64_t g_tsc_rate;

static uint32_t g_io_size_bytes;
static int g_time_in_sec;

static int g_aio_optind; /* Index of first AIO filename in argv */

struct perf_task *g_task;
uint64_t g_tsc_submit = 0;
uint64_t g_tsc_complete = 0;
uint64_t g_io_completed = 0;

static void
register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
{
	struct ns_entry *entry;
	const struct spdk_nvme_ctrlr_data *cdata;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!spdk_nvme_ns_is_active(ns)) {
		printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
		       cdata->mn, cdata->sn,
		       spdk_nvme_ns_get_id(ns));
		return;
	}

	if (spdk_nvme_ns_get_size(ns) < g_io_size_bytes ||
	    spdk_nvme_ns_get_sector_size(ns) > g_io_size_bytes) {
		printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
		       "ns size %" PRIu64 " / block size %u for I/O size %u\n",
		       cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
		       spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
		return;
	}

	entry = calloc(1, sizeof(struct ns_entry));
	if (entry == NULL) {
		perror("ns_entry malloc");
		exit(1);
	}

	entry->type = ENTRY_TYPE_NVME_NS;
	entry->u.nvme.ctrlr = ctrlr;
	entry->u.nvme.ns = ns;

	entry->size_in_ios = spdk_nvme_ns_get_size(ns) /
			     g_io_size_bytes;
	entry->io_size_blocks = g_io_size_bytes / spdk_nvme_ns_get_sector_size(ns);

	snprintf(entry->name, 44, "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);

	g_ns = entry;
}

static void
register_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	int num_ns;
	struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry));
	const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (entry == NULL) {
		perror("ctrlr_entry malloc");
		exit(1);
	}

	snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);

	entry->ctrlr = ctrlr;
	g_ctrlr = entry;

	num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
	/* Only register the first namespace. */
	if (num_ns < 1) {
		fprintf(stderr, "controller found with no namespaces\n");
		exit(1);
	}

	register_ns(ctrlr, spdk_nvme_ctrlr_get_ns(ctrlr, 1));
}

#if HAVE_LIBAIO
static int
register_aio_file(const char *path)
{
	struct ns_entry *entry;

	int fd;
	uint64_t size;
	uint32_t blklen;

	fd = open(path, O_RDWR | O_DIRECT);
	if (fd < 0) {
		fprintf(stderr, "Could not open AIO device %s: %s\n", path, strerror(errno));
		return -1;
	}

	size = spdk_file_get_size(fd);
	if (size == 0) {
		fprintf(stderr, "Could not determine size of AIO device %s\n", path);
		close(fd);
		return -1;
	}

	blklen = spdk_dev_get_blocklen(fd);
	if (blklen == 0) {
		fprintf(stderr, "Could not determine block size of AIO device %s\n", path);
		close(fd);
		return -1;
	}

	entry = calloc(1, sizeof(struct ns_entry));
	if (entry == NULL) {
		close(fd);
		perror("aio ns_entry malloc");
		return -1;
	}

	entry->type = ENTRY_TYPE_AIO_FILE;
	entry->u.aio.fd = fd;
	entry->size_in_ios = size / g_io_size_bytes;
	entry->io_size_blocks = g_io_size_bytes / blklen;

	snprintf(entry->name, sizeof(entry->name), "%s", path);

	g_ns = entry;

	return 0;
}

static int
aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, void *buf,
	   unsigned long nbytes, uint64_t offset, void *cb_ctx)
{
	iocb->aio_fildes = fd;
	iocb->aio_reqprio = 0;
	iocb->aio_lio_opcode = cmd;
	iocb->u.c.buf = buf;
	iocb->u.c.nbytes = nbytes;
	iocb->u.c.offset = offset;
	iocb->data = cb_ctx;

	if (io_submit(aio_ctx, 1, &iocb) < 0) {
		printf("io_submit");
		return -1;
	}

	return 0;
}

static void
aio_check_io(void)
{
	int count, i;
	struct timespec timeout;

	timeout.tv_sec = 0;
	timeout.tv_nsec = 0;

	count = io_getevents(g_ns->u.aio.ctx, 1, 1, g_ns->u.aio.events, &timeout);
	if (count < 0) {
		fprintf(stderr, "io_getevents error\n");
		exit(1);
	}

	for (i = 0; i < count; i++) {
		g_ns->current_queue_depth--;
	}
}
#endif /* HAVE_LIBAIO */

static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion);

static __thread unsigned int seed = 0;

static void
submit_single_io(void)
{
	uint64_t		offset_in_ios;
	uint64_t		start;
	int			rc;
	struct ns_entry		*entry = g_ns;

	offset_in_ios = rand_r(&seed) % entry->size_in_ios;

	start = rte_get_tsc_cycles();
	rte_mb();
#if HAVE_LIBAIO
	if (entry->type == ENTRY_TYPE_AIO_FILE) {
		rc = aio_submit(g_ns->u.aio.ctx, &g_task->iocb, entry->u.aio.fd, IO_CMD_PREAD, g_task->buf,
				g_io_size_bytes, offset_in_ios * g_io_size_bytes, g_task);
	} else
#endif
	{
		rc = spdk_nvme_ns_cmd_read(entry->u.nvme.ns, g_ns->u.nvme.qpair, g_task->buf,
					   offset_in_ios * entry->io_size_blocks,
					   entry->io_size_blocks, io_complete, g_task, 0);
	}

	rte_mb();
	g_tsc_submit += rte_get_tsc_cycles() - start;

	if (rc != 0) {
		fprintf(stderr, "starting I/O failed\n");
	}

	g_ns->current_queue_depth++;
}

static void
io_complete(void *ctx, const struct spdk_nvme_cpl *completion)
{
	g_ns->current_queue_depth--;
}

uint64_t g_complete_tsc_start;

static void
check_io(void)
{
	uint64_t end;
	rte_mb();
#if HAVE_LIBAIO
	if (g_ns->type == ENTRY_TYPE_AIO_FILE) {
		aio_check_io();
	} else
#endif
	{
		spdk_nvme_qpair_process_completions(g_ns->u.nvme.qpair, 0);
	}
	rte_mb();
	end = rte_get_tsc_cycles();
	if (g_ns->current_queue_depth == 1) {
		/*
		 * Account for race condition in AIO case where interrupt occurs
		 *  after checking for queue depth.  If the timestamp capture
		 *  is too big compared to the last capture, assume that an
		 *  interrupt fired, and do not bump the start tsc forward.  This
		 *  will ensure this extra time is accounted for next time through
		 *  when we see current_queue_depth drop to 0.
		 */
		if ((end - g_complete_tsc_start) < 500) {
			g_complete_tsc_start = end;
		}
	} else {
		g_tsc_complete += end - g_complete_tsc_start;
		g_io_completed++;
		if (!g_ns->is_draining) {
			submit_single_io();
		}
		g_complete_tsc_start = rte_get_tsc_cycles();
	}
}

static void
drain_io(void)
{
	g_ns->is_draining = true;
	while (g_ns->current_queue_depth > 0) {
		check_io();
	}
}

static int
init_ns_worker_ctx(void)
{
	if (g_ns->type == ENTRY_TYPE_AIO_FILE) {
#ifdef HAVE_LIBAIO
		g_ns->u.aio.events = calloc(1, sizeof(struct io_event));
		if (!g_ns->u.aio.events) {
			return -1;
		}
		g_ns->u.aio.ctx = 0;
		if (io_setup(1, &g_ns->u.aio.ctx) < 0) {
			free(g_ns->u.aio.events);
			perror("io_setup");
			return -1;
		}
#endif
	} else {
		/*
		 * TODO: If a controller has multiple namespaces, they could all use the same queue.
		 *  For now, give each namespace/thread combination its own queue.
		 */
		g_ns->u.nvme.qpair = spdk_nvme_ctrlr_alloc_io_qpair(g_ns->u.nvme.ctrlr, 0);
		if (!g_ns->u.nvme.qpair) {
			printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
			return -1;
		}
	}

	return 0;
}

static void
cleanup_ns_worker_ctx(void)
{
	if (g_ns->type == ENTRY_TYPE_AIO_FILE) {
#ifdef HAVE_LIBAIO
		io_destroy(g_ns->u.aio.ctx);
		free(g_ns->u.aio.events);
#endif
	} else {
		spdk_nvme_ctrlr_free_io_qpair(g_ns->u.nvme.qpair);
	}
}

static int
work_fn(void)
{
	uint64_t tsc_end;

	printf("Starting work_fn on core %u\n", rte_lcore_id());

	/* Allocate a queue pair for each namespace. */
	if (init_ns_worker_ctx() != 0) {
		printf("ERROR: init_ns_worker_ctx() failed\n");
		return 1;
	}

	tsc_end = rte_get_tsc_cycles() + g_time_in_sec * g_tsc_rate;

	/* Submit initial I/O for each namespace. */
	submit_single_io();
	g_complete_tsc_start = rte_get_tsc_cycles();

	while (1) {
		/*
		 * Check for completed I/O for each controller. A new
		 * I/O will be submitted in the io_complete callback
		 * to replace each I/O that is completed.
		 */
		check_io();

		if (rte_get_tsc_cycles() > tsc_end) {
			break;
		}
	}

	drain_io();
	cleanup_ns_worker_ctx();

	return 0;
}

static void usage(char *program_name)
{
	printf("%s options", program_name);
#if HAVE_LIBAIO
	printf(" [AIO device(s)]...");
#endif
	printf("\n");
	printf("\t[-s io size in bytes]\n");
	printf("\t[-t time in seconds]\n");
	printf("\t\t(default: 1)]\n");
}

static void
print_stats(void)
{
	printf("g_tsc_submit = %ju\n", g_tsc_submit);
	printf("g_tsc_complete = %ju\n", g_tsc_complete);
	printf("g_io_completed = %ju\n", g_io_completed);

	printf("avg submit = %8.1f\n", (float)g_tsc_submit / g_io_completed);
	printf("avg complete = %8.1f\n", (float)g_tsc_complete / g_io_completed);
}

static int
parse_args(int argc, char **argv)
{
	int op;

	/* default value*/
	g_io_size_bytes = 0;
	g_time_in_sec = 0;

	while ((op = getopt(argc, argv, "s:t:")) != -1) {
		switch (op) {
		case 's':
			g_io_size_bytes = atoi(optarg);
			break;
		case 't':
			g_time_in_sec = atoi(optarg);
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	if (!g_io_size_bytes) {
		usage(argv[0]);
		return 1;
	}
	if (!g_time_in_sec) {
		usage(argv[0]);
		return 1;
	}

	g_aio_optind = optind;
	optind = 1;
	return 0;
}

static bool
probe_cb(void *cb_ctx, struct spdk_pci_device *dev, struct spdk_nvme_ctrlr_opts *opts)
{
	static uint32_t ctrlr_found = 0;

	if (spdk_pci_device_has_non_uio_driver(dev)) {
		fprintf(stderr, "non-uio kernel driver attached to NVMe\n");
		fprintf(stderr, " controller at PCI address %04x:%02x:%02x.%02x\n",
			spdk_pci_device_get_domain(dev),
			spdk_pci_device_get_bus(dev),
			spdk_pci_device_get_dev(dev),
			spdk_pci_device_get_func(dev));
		fprintf(stderr, " skipping...\n");
		return false;
	}

	if (ctrlr_found == 1) {
		fprintf(stderr, "only attching to one controller, so skipping\n");
		fprintf(stderr, " controller at PCI address %04x:%02x:%02x.%02x\n",
			spdk_pci_device_get_domain(dev),
			spdk_pci_device_get_bus(dev),
			spdk_pci_device_get_dev(dev),
			spdk_pci_device_get_func(dev));
		return false;
	}
	ctrlr_found = 1;

	printf("Attaching to %04x:%02x:%02x.%02x\n",
	       spdk_pci_device_get_domain(dev),
	       spdk_pci_device_get_bus(dev),
	       spdk_pci_device_get_dev(dev),
	       spdk_pci_device_get_func(dev));

	return true;
}

static void
attach_cb(void *cb_ctx, struct spdk_pci_device *dev, struct spdk_nvme_ctrlr *ctrlr,
	  const struct spdk_nvme_ctrlr_opts *opts)
{
	printf("Attached to %04x:%02x:%02x.%02x\n",
	       spdk_pci_device_get_domain(dev),
	       spdk_pci_device_get_bus(dev),
	       spdk_pci_device_get_dev(dev),
	       spdk_pci_device_get_func(dev));

	register_ctrlr(ctrlr);
}

static int
register_controllers(void)
{
	printf("Initializing NVMe Controllers\n");

	if (spdk_nvme_probe(NULL, probe_cb, attach_cb, NULL) != 0) {
		fprintf(stderr, "spdk_nvme_probe() failed\n");
		return 1;
	}

	return 0;
}

static char *ealargs[] = {
	"perf",
	"-c 0x1",
	"-n 4",
};

int main(int argc, char **argv)
{
	int rc;

	rc = parse_args(argc, argv);
	if (rc != 0) {
		return rc;
	}

	rc = rte_eal_init(sizeof(ealargs) / sizeof(ealargs[0]), ealargs);

	if (rc < 0) {
		fprintf(stderr, "could not initialize dpdk\n");
		return 1;
	}

	request_mempool = rte_mempool_create("nvme_request", 8192,
					     spdk_nvme_request_size(), 128, 0,
					     NULL, NULL, NULL, NULL,
					     SOCKET_ID_ANY, 0);

	if (request_mempool == NULL) {
		fprintf(stderr, "could not initialize request mempool\n");
		return 1;
	}

	g_task = rte_zmalloc("task", sizeof(struct perf_task), 0);
	if (g_task == NULL) {
		fprintf(stderr, "g_task alloc failed\n");
		exit(1);
	}

	g_task->buf = rte_malloc(NULL, g_io_size_bytes, 0x1000);
	if (g_task->buf == NULL) {
		fprintf(stderr, "g_task->buf rte_malloc failed\n");
		exit(1);
	}

	g_tsc_rate = rte_get_tsc_hz();

#if HAVE_LIBAIO
	if (g_aio_optind < argc) {
		printf("Measuring overhead for AIO device %s.\n", argv[g_aio_optind]);
		if (register_aio_file(argv[g_aio_optind]) != 0) {
			rc = -1;
			goto cleanup;
		}
	} else
#endif
	{
		if (register_controllers() != 0) {
			rc = -1;
			goto cleanup;
		}
	}

	printf("Initialization complete. Launching workers.\n");

	rc = work_fn();

	print_stats();

cleanup:
	free(g_ns);
	if (g_ctrlr) {
		spdk_nvme_detach(g_ctrlr->ctrlr);
		free(g_ctrlr);
	}

	if (rc != 0) {
		fprintf(stderr, "%s: errors occured\n", argv[0]);
	}

	return rc;
}