Commit 2371a070 authored by Ben Walker's avatar Ben Walker Committed by Tomasz Zawadzki
Browse files

idxd: For kernel mode, handle IOMMU+SM mode



If the kernel is booted with the IOMMU enabled and Shared Memory mode
enabled (which are the expected boot parameters for production servers),
then the kernel idxd driver will automatically register a dedicated work
queue with the PASID for the process that opens it. This means that the
descriptors written into the portal for that work queue should be
*virtual* addresses.

If the IOMMU is enabled but Shared Memory mode is disabled, then the
kernel has registered the device with the IOMMU and assigned it I/O
virtual addresses. We have no way to get those addresses from user
space, so we cannot use the kernel driver in this mode. Add a check to
catch that.

If the IOMMU is disabled, then physical addresses are used everywhere.

Change-Id: I0bf079835ad4df1128ef9db54f5564050327e9f7
Signed-off-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14019


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
Reviewed-by: default avatarPaul Luse <paul.e.luse@intel.com>
Reviewed-by: default avatarKonrad Sztyber <konrad.sztyber@intel.com>
parent 1c098401
Loading
Loading
Loading
Loading
+49 −24
Original line number Diff line number Diff line
@@ -45,10 +45,16 @@ _submit_to_hw(struct spdk_idxd_io_channel *chan, struct idxd_ops *op)
}

inline static int
_vtophys(const void *buf, uint64_t *buf_addr, uint64_t size)
_vtophys(struct spdk_idxd_io_channel *chan, const void *buf, uint64_t *buf_addr, uint64_t size)
{
	uint64_t updated_size = size;

	if (chan->pasid_enabled) {
		/* We can just use virtual addresses */
		*buf_addr = (uint64_t)buf;
		return 0;
	}

	*buf_addr = spdk_vtophys(buf, &updated_size);

	if (*buf_addr == SPDK_VTOPHYS_ERROR) {
@@ -70,16 +76,20 @@ struct idxd_vtophys_iter {
	uint64_t	len;

	uint64_t	offset;

	bool		pasid_enabled;
};

static void
idxd_vtophys_iter_init(struct idxd_vtophys_iter *iter,
idxd_vtophys_iter_init(struct spdk_idxd_io_channel *chan,
		       struct idxd_vtophys_iter *iter,
		       const void *src, void *dst, uint64_t len)
{
	iter->src = src;
	iter->dst = dst;
	iter->len = len;
	iter->offset = 0;
	iter->pasid_enabled = chan->pasid_enabled;
}

static uint64_t
@@ -97,6 +107,12 @@ idxd_vtophys_iter_next(struct idxd_vtophys_iter *iter,
		return 0;
	}

	if (iter->pasid_enabled) {
		*src_phys = (uint64_t)src;
		*dst_phys = (uint64_t)dst;
		return iter->len;
	}

	len = iter->len - iter->offset;

	src_off = len;
@@ -145,7 +161,7 @@ _dsa_alloc_batches(struct spdk_idxd_io_channel *chan, int num_descriptors)
			goto error_user;
		}

		rc = _vtophys(batch->user_desc, &batch->user_desc_addr,
		rc = _vtophys(chan, batch->user_desc, &batch->user_desc_addr,
			      DESC_PER_BATCH * sizeof(struct idxd_hw_desc));
		if (rc) {
			SPDK_ERRLOG("Failed to translate batch descriptor memory\n");
@@ -161,7 +177,7 @@ _dsa_alloc_batches(struct spdk_idxd_io_channel *chan, int num_descriptors)
		}

		for (j = 0; j < DESC_PER_BATCH; j++) {
			rc = _vtophys(&op->hw, &desc->completion_addr, sizeof(struct dsa_hw_comp_record));
			rc = _vtophys(chan, &op->hw, &desc->completion_addr, sizeof(struct dsa_hw_comp_record));
			if (rc) {
				SPDK_ERRLOG("Failed to translate batch entry completion memory\n");
				goto error_user;
@@ -208,6 +224,7 @@ spdk_idxd_get_channel(struct spdk_idxd_device *idxd)
	}

	chan->idxd = idxd;
	chan->pasid_enabled = idxd->pasid_enabled;
	STAILQ_INIT(&chan->ops_pool);
	TAILQ_INIT(&chan->batch_pool);
	STAILQ_INIT(&chan->ops_outstanding);
@@ -258,7 +275,7 @@ spdk_idxd_get_channel(struct spdk_idxd_device *idxd)
	for (i = 0; i < num_descriptors; i++) {
		STAILQ_INSERT_TAIL(&chan->ops_pool, op, link);
		op->desc = desc;
		rc = _vtophys(&op->hw, &desc->completion_addr, comp_rec_size);
		rc = _vtophys(chan, &op->hw, &desc->completion_addr, comp_rec_size);
		if (rc) {
			SPDK_ERRLOG("Failed to translate completion memory\n");
			goto error;
@@ -650,7 +667,7 @@ spdk_idxd_submit_copy(struct spdk_idxd_io_channel *chan,
	     len > 0;
	     len = spdk_ioviter_next(&iter, &src, &dst)) {

		idxd_vtophys_iter_init(&vtophys_iter, src, dst, len);
		idxd_vtophys_iter_init(chan, &vtophys_iter, src, dst, len);

		while (len > 0) {
			if (first_op == NULL) {
@@ -726,7 +743,7 @@ spdk_idxd_submit_dualcast(struct spdk_idxd_io_channel *chan, void *dst1, void *d
		return rc;
	}

	idxd_vtophys_iter_init(&iter_outer, src, dst1, nbytes);
	idxd_vtophys_iter_init(chan, &iter_outer, src, dst1, nbytes);

	first_op = NULL;
	count = 0;
@@ -738,7 +755,7 @@ spdk_idxd_submit_dualcast(struct spdk_idxd_io_channel *chan, void *dst1, void *d
			goto error;
		}

		idxd_vtophys_iter_init(&iter_inner, src, dst2, nbytes);
		idxd_vtophys_iter_init(chan, &iter_inner, src, dst2, nbytes);

		src += outer_seg_len;
		nbytes -= outer_seg_len;
@@ -824,7 +841,7 @@ spdk_idxd_submit_compare(struct spdk_idxd_io_channel *chan,
	     len > 0;
	     len = spdk_ioviter_next(&iter, &src1, &src2)) {

		idxd_vtophys_iter_init(&vtophys_iter, src1, src2, len);
		idxd_vtophys_iter_init(chan, &vtophys_iter, src1, src2, len);

		while (len > 0) {
			if (first_op == NULL) {
@@ -919,12 +936,16 @@ spdk_idxd_submit_fill(struct spdk_idxd_io_channel *chan,
			count++;

			seg_len = len;
			if (chan->pasid_enabled) {
				dst_addr = (uint64_t)dst;
			} else {
				dst_addr = spdk_vtophys(dst, &seg_len);
				if (dst_addr == SPDK_VTOPHYS_ERROR) {
					SPDK_ERRLOG("Error translating address\n");
					rc = -EFAULT;
					goto error;
				}
			}

			seg_len = spdk_min(seg_len, len);

@@ -997,12 +1018,16 @@ spdk_idxd_submit_crc32c(struct spdk_idxd_io_channel *chan,
			count++;

			seg_len = len;
			if (chan->pasid_enabled) {
				src_addr = (uint64_t)src;
			} else {
				src_addr = spdk_vtophys(src, &seg_len);
				if (src_addr == SPDK_VTOPHYS_ERROR) {
					SPDK_ERRLOG("Error translating address\n");
					rc = -EFAULT;
					goto error;
				}
			}

			seg_len = spdk_min(seg_len, len);

@@ -1069,7 +1094,7 @@ spdk_idxd_submit_copy_crc32c(struct spdk_idxd_io_channel *chan,
	     len = spdk_ioviter_next(&iter, &src, &dst)) {


		idxd_vtophys_iter_init(&vtophys_iter, src, dst, len);
		idxd_vtophys_iter_init(chan, &vtophys_iter, src, dst, len);

		while (len > 0) {
			if (first_op == NULL) {
@@ -1145,12 +1170,12 @@ _idxd_submit_compress_single(struct spdk_idxd_io_channel *chan, void *dst, const
		return rc;
	}

	rc = _vtophys(src, &src_addr, nbytes_src);
	rc = _vtophys(chan, src, &src_addr, nbytes_src);
	if (rc) {
		goto error;
	}

	rc = _vtophys(dst, &dst_addr, nbytes_dst);
	rc = _vtophys(chan, dst, &dst_addr, nbytes_dst);
	if (rc) {
		goto error;
	}
@@ -1213,12 +1238,12 @@ _idxd_submit_decompress_single(struct spdk_idxd_io_channel *chan, void *dst, con
		return rc;
	}

	rc = _vtophys(src, &src_addr, nbytes);
	rc = _vtophys(chan, src, &src_addr, nbytes);
	if (rc) {
		goto error;
	}

	rc = _vtophys(dst, &dst_addr, nbytes_dst);
	rc = _vtophys(chan, dst, &dst_addr, nbytes_dst);
	if (rc) {
		goto error;
	}
+3 −0
Original line number Diff line number Diff line
@@ -74,6 +74,8 @@ struct spdk_idxd_io_channel {
	void					*portal;
	uint32_t				portal_offset;

	bool					pasid_enabled;

	/* The currently open batch */
	struct idxd_batch			*batch;

@@ -138,6 +140,7 @@ struct spdk_idxd_device {
	uint32_t			total_wq_size;
	uint32_t			chan_per_device;
	pthread_mutex_t			num_channels_lock;
	bool				pasid_enabled;
	enum idxd_dev			type;
	struct iaa_aecs			*aecs;
	uint32_t			version;
+13 −2
Original line number Diff line number Diff line
@@ -71,6 +71,7 @@ kernel_idxd_probe(void *cb_ctx, spdk_idxd_attach_cb attach_cb, spdk_idxd_probe_c
		enum accfg_device_state dstate;
		struct spdk_kernel_idxd_device *kernel_idxd;
		struct accfg_wq *wq;
		bool pasid_enabled;

		/* Make sure that the device is enabled */
		dstate = accfg_device_get_state(device);
@@ -78,6 +79,17 @@ kernel_idxd_probe(void *cb_ctx, spdk_idxd_attach_cb attach_cb, spdk_idxd_probe_c
			continue;
		}

		pasid_enabled = accfg_device_get_pasid_enabled(device);
		if (!pasid_enabled && spdk_iommu_is_enabled()) {
			/*
			 * If the IOMMU is enabled but shared memory mode is not on,
			 * then we have no way to get the IOVA from userspace to use this
			 * device or any kernel device. Return an error.
			 */
			SPDK_ERRLOG("Found kernel IDXD device, but cannot use it when IOMMU is enabled but SM is disabled\n");
			return -ENOTSUP;
		}

		kernel_idxd = calloc(1, sizeof(struct spdk_kernel_idxd_device));
		if (kernel_idxd == NULL) {
			SPDK_ERRLOG("Failed to allocate memory for kernel_idxd device.\n");
@@ -91,6 +103,7 @@ kernel_idxd_probe(void *cb_ctx, spdk_idxd_attach_cb attach_cb, spdk_idxd_probe_c
		kernel_idxd->idxd.impl = &g_kernel_idxd_impl;
		kernel_idxd->fd = -1;
		kernel_idxd->idxd.version = accfg_device_get_version(device);
		kernel_idxd->idxd.pasid_enabled = pasid_enabled;

		accfg_wq_foreach(device, wq) {
			enum accfg_wq_state wstate;
@@ -146,8 +159,6 @@ kernel_idxd_probe(void *cb_ctx, spdk_idxd_attach_cb attach_cb, spdk_idxd_probe_c
			/* Since we only use a single WQ, the total size is the size of this WQ */
			kernel_idxd->idxd.total_wq_size = accfg_wq_get_size(wq);
			kernel_idxd->idxd.chan_per_device = (kernel_idxd->idxd.total_wq_size >= 128) ? 8 : 4;
			/* TODO: Handle BOF when we add support for shared WQ */
			/* wq_ctx->bof = accfg_wq_get_block_on_fault(wq); */

			/* We only use a single WQ, so once we've found one we can stop looking. */
			break;