Commit e3da0e84 authored by Alexey Marchuk's avatar Alexey Marchuk Committed by Konrad Sztyber
Browse files

lib/mlx5: Add RDMA_READ/WRITE and CQ polling API



Signed-off-by: default avatarAlexey Marchuk <alexeymar@nvidia.com>
Change-Id: I30d79b809539d90173228ed54eda0ed139a76d22
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/23103


Community-CI: Mellanox Build Bot
Reviewed-by: default avatarShuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: default avatarBen Walker <ben@nvidia.com>
Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
parent b557c9c8
Loading
Loading
Loading
Loading
+48 −0
Original line number Diff line number Diff line
@@ -125,6 +125,54 @@ int spdk_mlx5_qp_set_error_state(struct spdk_mlx5_qp *qp);
 */
void spdk_mlx5_qp_destroy(struct spdk_mlx5_qp *qp);

/**
 * Poll Completion Queue, save up to \b max_completions into \b comp array
 *
 * \param cq Completion Queue
 * \param comp Array of completions to be filled by this function
 * \param max_completions Maximum number of entries in \b comp to be filled
 * \return Number of completions saved in \b comp on success (0 if none), negated errno on failure
 */
int spdk_mlx5_cq_poll_completions(struct spdk_mlx5_cq *cq,
				  struct spdk_mlx5_cq_completion *comp, int max_completions);

/**
 * Ring Send Queue doorbell, submits all previously posted WQEs to HW
 *
 * \param qp qpair pointer
 */
void spdk_mlx5_qp_complete_send(struct spdk_mlx5_qp *qp);

/**
 * Submit RDMA_WRITE operations on the qpair
 *
 * \param qp qpair pointer
 * \param sge Memory layout of the local data to be written
 * \param sge_count Number of \b sge entries
 * \param dstaddr Remote address to write \b sge to
 * \param rkey Remote memory key
 * \param wrid wrid which is returned in the CQE
 * \param flags SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE to have a signaled completion or 0
 * \return 0 on success, negated errno on failure
 */
int spdk_mlx5_qp_rdma_write(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
			    uint64_t dstaddr, uint32_t rkey, uint64_t wrid, uint32_t flags);

/**
 * Submit RDMA_READ operations on the qpair
 *
 * \param qp qpair pointer
 * \param sge Memory layout of the local buffers for reading remote data
 * \param sge_count Number of \b sge entries
 * \param dstaddr Remote address to read into \b sge
 * \param rkey Remote memory key
 * \param wrid wrid which is returned in the CQE
 * \param flags SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE to have a signaled completion or 0
 * \return 0 on success, negated errno on failure
 */
int spdk_mlx5_qp_rdma_read(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
			   uint64_t dstaddr, uint32_t rkey, uint64_t wrid, uint32_t flags);

/**
 * Return a NULL terminated array of devices which support crypto operation on Nvidia NICs
 *
+1 −1
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
SO_VER := 3
SO_MINOR := 0

C_SRCS = mlx5_crypto.c mlx5_qp.c
C_SRCS = mlx5_crypto.c mlx5_qp.c mlx5_dma.c
LIBNAME = mlx5

LOCAL_SYS_LIBS += -lmlx5 -libverbs -lrdmacm

lib/mlx5/mlx5_dma.c

0 → 100644
+451 −0
Original line number Diff line number Diff line
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "mlx5_priv.h"
#include "mlx5_ifc.h"
#include "spdk/log.h"
#include "spdk/util.h"
#include "spdk/barrier.h"
#include "spdk/likely.h"

#include "spdk_internal/rdma_utils.h"
#include "spdk_internal/mlx5.h"

#define MLX5_DMA_Q_TX_CQE_SIZE  64

/* Layout of an mlx5 error CQE (64 bytes). Multi-byte fields are big-endian
 * on the wire and converted with be*toh() before use.
 * NOTE(review): offsets assumed to match the device PRM error CQE format —
 * confirm against the PRM revision targeted by this library. */
struct _mlx5_err_cqe {
	uint8_t		rsvd0[32];
	uint32_t	srqn;
	uint8_t		rsvd1[16];
	uint8_t		hw_err_synd;
	uint8_t		rsvd2[1];
	uint8_t		vendor_err_synd;
	uint8_t		syndrome;	/* MLX5_CQE_SYNDROME_* value */
	uint32_t	s_wqe_opcode_qpn;	/* bits 31:24 WQE opcode, bits 23:0 QP number */
	uint16_t	wqe_counter;
	uint8_t		signature;
	uint8_t		op_own;		/* bits 7:4 CQE opcode (MLX5_CQE_REQ_ERR etc.) */
};

/* Layout of an mlx5 signature (T10-DIF style) error CQE.
 * Not referenced by the code visible in this file; presumably reserved for
 * signature-offload error reporting — verify against the consumer. */
struct mlx5_sigerr_cqe {
	uint8_t		rsvd0[16];
	uint32_t	expected_trans_sig;
	uint32_t	actual_trans_sig;
	uint32_t	expected_ref_tag;
	uint32_t	actual_ref_tag;
	uint16_t	syndrome;
	uint8_t		sig_type;
	uint8_t		domain;
	uint32_t	mkey;
	uint64_t	sig_err_offset;
	uint8_t		rsvd30[14];
	uint8_t		signature;
	uint8_t		op_own;
};

/* Map the opcode bits of an error CQE to a human-readable operation name
 * for diagnostics. Returns "" for unrecognized opcodes. */
static const char *
mlx5_cqe_err_opcode(struct _mlx5_err_cqe *ecqe)
{
	uint8_t cqe_opcode = ecqe->op_own >> 4;
	uint8_t wqe_opcode = be32toh(ecqe->s_wqe_opcode_qpn) >> 24;

	if (cqe_opcode == MLX5_CQE_RESP_ERR) {
		return "RECV";
	}
	if (cqe_opcode != MLX5_CQE_REQ_ERR) {
		return "";
	}

	/* Requester-side error: decode the opcode of the failed WQE */
	switch (wqe_opcode) {
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_WRITE:
		return "RDMA_WRITE";
	case MLX5_OPCODE_SEND_IMM:
	case MLX5_OPCODE_SEND:
	case MLX5_OPCODE_SEND_INVAL:
		return "SEND";
	case MLX5_OPCODE_RDMA_READ:
		return "RDMA_READ";
	case MLX5_OPCODE_ATOMIC_CS:
		return "COMPARE_SWAP";
	case MLX5_OPCODE_ATOMIC_FA:
		return "FETCH_ADD";
	case MLX5_OPCODE_ATOMIC_MASKED_CS:
		return "MASKED_COMPARE_SWAP";
	case MLX5_OPCODE_ATOMIC_MASKED_FA:
		return "MASKED_FETCH_ADD";
	case MLX5_OPCODE_MMO:
		return "GGA_DMA";
	default:
		return "";
	}
}

/* Decode and log an error CQE.
 *
 * \param cqe Completion queue entry reported in error state
 * \return The raw (positive) HW syndrome value from the CQE
 *
 * Flush errors are expected during QP teardown, so they are logged at
 * debug level only; everything else gets a warning with full details. */
static int
mlx5_cqe_err(struct mlx5_cqe64 *cqe)
{
	struct _mlx5_err_cqe *ecqe = (struct _mlx5_err_cqe *)cqe;
	uint16_t wqe_counter;
	uint32_t qp_num;
	const char *info;

	wqe_counter = be16toh(ecqe->wqe_counter);
	/* low 24 bits carry the QP number */
	qp_num = be32toh(ecqe->s_wqe_opcode_qpn) & ((1 << 24) - 1);

	if (ecqe->syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
		SPDK_DEBUGLOG(mlx5, "QP 0x%x wqe[%d] is flushed\n", qp_num, wqe_counter);
		return ecqe->syndrome;
	}

	switch (ecqe->syndrome) {
	case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
		info = "Local length";
		break;
	case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
		info = "Local QP operation";
		break;
	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
		info = "Local protection";
		break;
	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
		/* unreachable: handled above; kept for completeness */
		info = "WR flushed because QP in error state";
		break;
	case MLX5_CQE_SYNDROME_MW_BIND_ERR:
		info = "Memory window bind";
		break;
	case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
		info = "Bad response";
		break;
	case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
		info = "Local access";
		break;
	case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
		info = "Invalid request";
		break;
	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
		info = "Remote access";
		break;
	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
		info = "Remote QP";
		break;
	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
		info = "Transport retry count exceeded";
		break;
	case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
		/* Fixed typo: RNR stands for "Receiver Not Ready" */
		info = "Receiver-not-ready retry count exceeded";
		break;
	case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
		info = "Remote side aborted";
		break;
	default:
		info = "Generic";
		break;
	}
	SPDK_WARNLOG("Error on QP 0x%x wqe[%03d]: %s (synd 0x%x vend 0x%x hw 0x%x) opcode %s\n",
		     qp_num, wqe_counter, info, ecqe->syndrome, ecqe->vendor_err_synd, ecqe->hw_err_synd,
		     mlx5_cqe_err_opcode(ecqe));

	return ecqe->syndrome;
}

/*
 * DATA WQE LAYOUT:
 * ----------------------------------
 * | gen_ctrl |   rseg   |   dseg   |
 * ----------------------------------
 *   16bytes    16bytes    16bytes * sge_count
 */

/* Build and queue one RDMA WQE that fits contiguously in the SQ buffer
 * (no wrap-around); the caller has verified that \b bb_count building
 * blocks fit before the end of the queue. The doorbell is NOT rung here —
 * submission is delayed until spdk_mlx5_qp_complete_send(). */
static inline void
mlx5_dma_xfer_full(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count, uint64_t raddr,
		   uint32_t rkey, int op, uint32_t flags, uint64_t wr_id, uint32_t bb_count)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	struct mlx5_wqe_ctrl_seg *ctrl;
	struct mlx5_wqe_raddr_seg *rseg;
	struct mlx5_wqe_data_seg *dseg;
	uint8_t fm_ce_se;
	uint32_t i, pi;

	/* Translate requested completion flags per the QP signaling mode */
	fm_ce_se = mlx5_qp_fm_ce_se_update(qp, (uint8_t)flags);

	/* SQ slot index: producer index masked to the (power-of-two) queue size */
	pi = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	SPDK_DEBUGLOG(mlx5, "opc %d, sge_count %u, bb_count %u, orig pi %u, fm_ce_se %x\n", op, sge_count,
		      bb_count, hw_qp->sq_pi, fm_ce_se);

	ctrl = (struct mlx5_wqe_ctrl_seg *) mlx5_qp_get_wqe_bb(hw_qp);
	/* WQE size in octowords (16-byte units). DS (data segment) accounts for all the segments in the WQE
	 * as summarized in WQE construction: 1 ctrl + 1 raddr + sge_count data segments */
	mlx5_set_ctrl_seg(ctrl, hw_qp->sq_pi, op, 0, hw_qp->qp_num, fm_ce_se, 2 + sge_count, 0, 0);

	/* Remote address segment follows the control segment */
	rseg = (struct mlx5_wqe_raddr_seg *)(ctrl + 1);
	rseg->raddr = htobe64(raddr);
	rseg->rkey  = htobe32(rkey);
	rseg->reserved = 0;

	/* Data segments, one per SGE, laid out back to back */
	dseg = (struct mlx5_wqe_data_seg *)(rseg + 1);
	for (i = 0; i < sge_count; i++) {
		mlx5dv_set_data_seg(dseg, sge[i].length, sge[i].lkey, sge[i].addr);
		dseg = dseg + 1;
	}

	mlx5_qp_wqe_submit(qp, ctrl, bb_count, pi);

	/* Record wr_id for completion matching and consume SQ space */
	mlx5_qp_set_comp(qp, pi, wr_id, fm_ce_se, bb_count);
	assert(qp->tx_available >= bb_count);
	qp->tx_available -= bb_count;
}

/* Same as mlx5_dma_xfer_full() but for a WQE whose data segments cross the
 * end of the circular SQ buffer: when \b to_end bytes are exhausted, dseg
 * continues from the start of the SQ. The ctrl and raddr segments are
 * assumed to fit before the wrap (each building block holds both plus two
 * data segments). */
static inline void
mlx5_dma_xfer_wrap_around(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
			  uint64_t raddr, uint32_t rkey, int op, uint32_t flags, uint64_t wr_id, uint32_t bb_count)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	struct mlx5_wqe_ctrl_seg *ctrl;
	struct mlx5_wqe_raddr_seg *rseg;
	struct mlx5_wqe_data_seg *dseg;
	uint8_t fm_ce_se;
	uint32_t i, to_end, pi;

	fm_ce_se = mlx5_qp_fm_ce_se_update(qp, (uint8_t)flags);

	/* SQ slot index: producer index masked to the (power-of-two) queue size */
	pi = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	SPDK_DEBUGLOG(mlx5, "opc %d, sge_count %u, bb_count %u, orig pi %u, fm_ce_se %x\n", op, sge_count,
		      bb_count, pi, fm_ce_se);

	/* Bytes remaining until the physical end of the SQ buffer */
	to_end = (hw_qp->sq_wqe_cnt - pi) * MLX5_SEND_WQE_BB;
	ctrl = (struct mlx5_wqe_ctrl_seg *) mlx5_qp_get_wqe_bb(hw_qp);
	/* WQE size in octowords (16-byte units). DS (data segment) accounts for all the segments in the WQE
	 * as summarized in WQE construction */
	mlx5_set_ctrl_seg(ctrl, hw_qp->sq_pi, op, 0, hw_qp->qp_num, fm_ce_se, 2 + sge_count, 0, 0);
	to_end -= sizeof(struct mlx5_wqe_ctrl_seg); /* 16 bytes */

	rseg = (struct mlx5_wqe_raddr_seg *)(ctrl + 1);
	rseg->raddr = htobe64(raddr);
	rseg->rkey  = htobe32(rkey);
	rseg->reserved = 0;
	to_end -= sizeof(struct mlx5_wqe_raddr_seg); /* 16 bytes */

	dseg = (struct mlx5_wqe_data_seg *)(rseg + 1);
	for (i = 0; i < sge_count; i++) {
		mlx5dv_set_data_seg(dseg, sge[i].length, sge[i].lkey, sge[i].addr);
		to_end -= sizeof(struct mlx5_wqe_data_seg); /* 16 bytes */
		if (to_end != 0) {
			dseg = dseg + 1;
		} else {
			/* Start from the beginning of SQ */
			dseg = (struct mlx5_wqe_data_seg *)(hw_qp->sq_addr);
			to_end = hw_qp->sq_wqe_cnt * MLX5_SEND_WQE_BB;
		}
	}

	mlx5_qp_wqe_submit(qp, ctrl, bb_count, pi);

	mlx5_qp_set_comp(qp, pi, wr_id, fm_ce_se, bb_count);
	assert(qp->tx_available >= bb_count);
	qp->tx_available -= bb_count;
}

/* Common entry point for RDMA_READ/RDMA_WRITE submission: size the WQE,
 * validate queue capacity and SGE limits, then dispatch to the contiguous
 * or wrap-around builder.
 *
 * \return 0 on success, -ENOMEM when the SQ lacks space, -E2BIG when
 *         sge_count exceeds the QP limit. */
static inline int
mlx5_qp_rdma_op(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count, uint64_t dstaddr,
		uint32_t rkey, uint64_t wrid, uint32_t flags, int op)
{
	struct mlx5_hw_qp *hw_qp = &qp->hw;
	uint32_t wqe_bbs, sq_idx, bbs_to_end;

	/* A building block (BB) is 64 bytes (4 octowords). The first BB holds
	 * the control segment, the remote-address segment and up to 2 data
	 * segments; each extra BB holds up to 4 more data segments. */
	if (sge_count <= 2) {
		wqe_bbs = 1;
	} else {
		wqe_bbs = 1 + SPDK_CEIL_DIV(sge_count - 2, 4);
	}

	if (spdk_unlikely(wqe_bbs > qp->tx_available)) {
		return -ENOMEM;
	}
	if (spdk_unlikely(sge_count > qp->max_send_sge)) {
		return -E2BIG;
	}

	sq_idx = hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1);
	bbs_to_end = hw_qp->sq_wqe_cnt - sq_idx;

	if (spdk_likely(bbs_to_end >= wqe_bbs)) {
		mlx5_dma_xfer_full(qp, sge, sge_count, dstaddr, rkey, op, flags, wrid, wqe_bbs);
	} else {
		mlx5_dma_xfer_wrap_around(qp, sge, sge_count, dstaddr, rkey, op, flags, wrid, wqe_bbs);
	}

	return 0;
}

/* Public API: submit an RDMA_WRITE WQE; thin wrapper over mlx5_qp_rdma_op().
 * See the doxygen comment in spdk_internal/mlx5.h for parameter details. */
int
spdk_mlx5_qp_rdma_write(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
			uint64_t dstaddr, uint32_t rkey, uint64_t wrid, uint32_t flags)
{
	return mlx5_qp_rdma_op(qp, sge, sge_count, dstaddr, rkey, wrid, flags, MLX5_OPCODE_RDMA_WRITE);
}

/* Public API: submit an RDMA_READ WQE; thin wrapper over mlx5_qp_rdma_op().
 * See the doxygen comment in spdk_internal/mlx5.h for parameter details. */
int
spdk_mlx5_qp_rdma_read(struct spdk_mlx5_qp *qp, struct ibv_sge *sge, uint32_t sge_count,
		       uint64_t dstaddr, uint32_t rkey, uint64_t wrid, uint32_t flags)
{
	return mlx5_qp_rdma_op(qp, sge, sge_count, dstaddr, rkey, wrid, flags, MLX5_OPCODE_RDMA_READ);
}

/* polling start */

/* Fold the accumulated non-signaled WQE count into the completion entry of
 * the last posted WQE (qp->last_pi), which is about to be force-signaled by
 * mlx5_qp_tx_complete() in SIG_LAST mode. */
static inline void
mlx5_qp_update_comp(struct spdk_mlx5_qp *qp)
{
	qp->completions[qp->last_pi].completions = qp->nonsignaled_outstanding;
	qp->nonsignaled_outstanding = 0;
}

/* Flush all delayed WQEs to HW. In SIG_LAST mode, first patch the last
 * control segment to request a CQ update so exactly one completion covers
 * the whole batch, then ring the doorbell. */
static inline void
mlx5_qp_tx_complete(struct spdk_mlx5_qp *qp)
{
	if (qp->sigmode == SPDK_MLX5_QP_SIG_LAST) {
		qp->ctrl->fm_ce_se &= ~SPDK_MLX5_WQE_CTRL_CE_MASK;
		qp->ctrl->fm_ce_se |= SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE;
		mlx5_qp_update_comp(qp);
	}
	mlx5_ring_tx_db(qp, qp->ctrl);
}

/* Return a pointer to the 64-byte CQE at the current consumer index.
 * cqe_size is a compile-time constant at every call site, so the
 * multiplication and the branch below fold away when this is inlined. */
static inline struct mlx5_cqe64 *
mlx5_cq_get_cqe(struct mlx5_hw_cq *hw_cq, int cqe_size)
{
	struct mlx5_cqe64 *cqe;

	cqe = (struct mlx5_cqe64 *)(hw_cq->cq_addr +
				    (hw_cq->ci & (hw_cq->cqe_cnt - 1)) * cqe_size);
	if (cqe_size == 64) {
		return cqe;
	}

	/* 128-byte CQEs: the 64-byte completion lives in the second half */
	return cqe + 1;
}


/* Try to consume one CQE from \b hw_cq.
 *
 * \return pointer to the CQE and advances the consumer index, or NULL when
 *         no new CQE is available.
 *
 * Ownership is detected by the owner-bit parity trick: the bit expected in
 * a SW-owned CQE flips each time the CQ consumer index wraps, so no CQE
 * memset is needed between laps. */
static inline struct mlx5_cqe64 *
mlx5_cq_poll_one(struct mlx5_hw_cq *hw_cq, int cqe_size)
{
	struct mlx5_cqe64 *cqe;

	cqe = mlx5_cq_get_cqe(hw_cq, cqe_size);

	/* cqe is hw owned */
	if (mlx5dv_get_cqe_owner(cqe) == !(hw_cq->ci & hw_cq->cqe_cnt)) {
		return NULL;
	}

	/* and must have valid opcode */
	if (mlx5dv_get_cqe_opcode(cqe) == MLX5_CQE_INVALID) {
		return NULL;
	}

	hw_cq->ci++;

	SPDK_DEBUGLOG(mlx5,
		      "cq: 0x%x ci: %d CQ opcode %d size %d wqe_counter %d scatter32 %d scatter64 %d\n",
		      hw_cq->cq_num, hw_cq->ci,
		      mlx5dv_get_cqe_opcode(cqe),
		      be32toh(cqe->byte_cnt),
		      be16toh(cqe->wqe_counter),
		      cqe->op_own & MLX5_INLINE_SCATTER_32,
		      cqe->op_own & MLX5_INLINE_SCATTER_64);
	return cqe;
}

/* Look up the wr_id recorded for the WQE this CQE completes and reclaim
 * the SQ building blocks it (and any preceding non-signaled WQEs) consumed.
 *
 * \return the user wr_id stored at submission time */
static inline uint64_t
mlx5_qp_get_comp_wr_id(struct spdk_mlx5_qp *qp, struct mlx5_cqe64 *cqe)
{
	uint16_t comp_idx;
	uint32_t sq_mask;

	sq_mask = qp->hw.sq_wqe_cnt - 1;
	comp_idx = be16toh(cqe->wqe_counter) & sq_mask;
	/* Fix: log the CPU-order counter, not the raw big-endian wire value */
	SPDK_DEBUGLOG(mlx5, "got cpl, wqe_counter %u, comp_idx %u; wrid %"PRIx64", cpls %u\n",
		      be16toh(cqe->wqe_counter), comp_idx, qp->completions[comp_idx].wr_id,
		      qp->completions[comp_idx].completions);
	/* If we have several unsignaled WRs, we accumulate them in the completion of the next signaled WR */
	qp->tx_available += qp->completions[comp_idx].completions;

	return qp->completions[comp_idx].wr_id;
}

/* Public API: poll \b cq for up to \b max_completions CQEs and translate
 * each into an spdk_mlx5_cq_completion (wr_id + status).
 *
 * \return number of completions written to \b comp, or -ENODEV when a CQE
 *         references a QP not registered on this CQ.
 * NOTE(review): on -ENODEV, completions already gathered in this call are
 * discarded and their CQEs have been consumed — confirm callers treat this
 * condition as fatal for the CQ. */
int
spdk_mlx5_cq_poll_completions(struct spdk_mlx5_cq *cq, struct spdk_mlx5_cq_completion *comp,
			      int max_completions)
{
	struct spdk_mlx5_qp *qp;
	struct mlx5_cqe64 *cqe;
	uint8_t opcode;
	int n = 0;

	do {
		cqe = mlx5_cq_poll_one(&cq->hw, MLX5_DMA_Q_TX_CQE_SIZE);
		if (!cqe) {
			break;
		}

		/* low 24 bits of sop_drop_qpn carry the QP number */
		qp = mlx5_cq_find_qp(cq, be32toh(cqe->sop_drop_qpn) & 0xffffff);
		if (spdk_unlikely(!qp)) {
			return -ENODEV;
		}

		opcode = mlx5dv_get_cqe_opcode(cqe);
		comp[n].wr_id = mlx5_qp_get_comp_wr_id(qp, cqe);
		if (spdk_likely(opcode == MLX5_CQE_REQ)) {
			comp[n].status = IBV_WC_SUCCESS;
		} else {
			/* error CQE: log details and report the HW syndrome as status */
			comp[n].status = mlx5_cqe_err(cqe);
		}
		n++;
	} while (n < max_completions);

	return n;
}

/* Public API: ring the SQ doorbell, flushing all WQEs queued since the last
 * call. Thin wrapper over mlx5_qp_tx_complete(). */
void
spdk_mlx5_qp_complete_send(struct spdk_mlx5_qp *qp)
{
	mlx5_qp_tx_complete(qp);
}

#ifdef DEBUG
/* Debug-only: hex-dump \b n_wqe_bb building blocks starting at the current
 * SQ producer slot, 16 big-endian dwords per 64-byte block. No-op unless
 * the "mlx5_sq" log flag is enabled. */
void
mlx5_qp_dump_wqe(struct spdk_mlx5_qp *qp, int n_wqe_bb)
{
	struct mlx5_hw_qp *hw = &qp->hw;
	uint32_t pi;
	uint32_t to_end;
	uint32_t *wqe;
	int i;
	extern struct spdk_log_flag SPDK_LOG_mlx5_sq;

	if (!SPDK_LOG_mlx5_sq.enabled) {
		return;
	}

	pi = hw->sq_pi & (hw->sq_wqe_cnt - 1);
	to_end = (hw->sq_wqe_cnt - pi) * MLX5_SEND_WQE_BB;
	wqe = mlx5_qp_get_wqe_bb(hw);

	SPDK_DEBUGLOG(mlx5_sq, "QP: qpn 0x%" PRIx32 ", wqe_index 0x%" PRIx32 ", addr %p\n",
		      hw->qp_num, pi, wqe);
	for (i = 0; i < n_wqe_bb; i++) {
		fprintf(stderr,
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n"
			"%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n",
			be32toh(wqe[0]),  be32toh(wqe[1]),  be32toh(wqe[2]),  be32toh(wqe[3]),
			be32toh(wqe[4]),  be32toh(wqe[5]),  be32toh(wqe[6]),  be32toh(wqe[7]),
			be32toh(wqe[8]),  be32toh(wqe[9]),  be32toh(wqe[10]), be32toh(wqe[11]),
			be32toh(wqe[12]), be32toh(wqe[13]), be32toh(wqe[14]), be32toh(wqe[15]));
		/* follow the circular SQ buffer across the wrap boundary */
		wqe = mlx5_qp_get_next_wqebb(hw, &to_end, wqe);
	}
}
#endif

SPDK_LOG_REGISTER_COMPONENT(mlx5_sq)
+172 −0
Original line number Diff line number Diff line
@@ -80,3 +80,175 @@ enum {
	/* Enable completion only for the last control WQE segment, regardless of the flags passed by the user */
	SPDK_MLX5_QP_SIG_LAST = 2,
};

/**
 * Completion and Event mode map (SPDK_MLX5_WQE_CTRL_CE_*).
 *
 * Translates the per-WQE completion request into the PRM value written to
 * the control segment, according to the QP signaling mode:
 * g_mlx5_ce_map[qp->sigmode][(fm_ce_se >> 2) & 0x3] */
static uint8_t g_mlx5_ce_map[3][4] = {
	/* SPDK_MLX5_QP_SIG_NONE: pass the user's CE request through */
	[0] = {
		[0] = SPDK_MLX5_WQE_CTRL_CE_CQ_NO_FLUSH_ERROR,
		[1] = SPDK_MLX5_WQE_CTRL_CE_CQ_NO_FLUSH_ERROR,
		[2] = SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE,
		[3] = SPDK_MLX5_WQE_CTRL_CE_CQ_ECE
	},
	/* SPDK_MLX5_QP_SIG_ALL: upgrade the default (0) to a CQ update */
	[1] = {
		[0] = SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE,
		[1] = SPDK_MLX5_WQE_CTRL_CE_CQ_NO_FLUSH_ERROR,
		[2] = SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE,
		[3] = SPDK_MLX5_WQE_CTRL_CE_CQ_ECE
	},
	/* SPDK_MLX5_QP_SIG_LAST: suppress per-WQE completions; only the last
	 * control segment is signaled, patched in mlx5_qp_tx_complete() */
	[2] = {
		[0] = SPDK_MLX5_WQE_CTRL_CE_CQ_NO_FLUSH_ERROR,
		[1] = SPDK_MLX5_WQE_CTRL_CE_CQ_NO_FLUSH_ERROR,
		[2] = SPDK_MLX5_WQE_CTRL_CE_CQ_NO_FLUSH_ERROR,
		[3] = SPDK_MLX5_WQE_CTRL_CE_CQ_ECE
	}

};

static inline uint8_t
mlx5_qp_fm_ce_se_update(struct spdk_mlx5_qp *qp, uint8_t fm_ce_se)
{
	uint8_t ce = (fm_ce_se >> 2) & 0x3;

	assert((ce & (~0x3)) == 0);
	fm_ce_se &= ~SPDK_MLX5_WQE_CTRL_CE_MASK;
	fm_ce_se |= g_mlx5_ce_map[qp->sigmode][ce];

	return fm_ce_se;
}

/* Return the address of the SQ building block at the current producer
 * index (sq_pi masked to the power-of-two queue size). */
static inline void *
mlx5_qp_get_wqe_bb(struct mlx5_hw_qp *hw_qp)
{
	/* Avoid arithmetic on `void *` (a GNU extension); cast through
	 * uintptr_t the same way mlx5_qp_get_next_wqebb() does. */
	return (char *)(uintptr_t)hw_qp->sq_addr +
	       (hw_qp->sq_pi & (hw_qp->sq_wqe_cnt - 1)) * MLX5_SEND_WQE_BB;
}

/* Advance \b cur by one 64-byte building block, wrapping to the start of
 * the circular SQ buffer when the end is reached. \b to_end holds the
 * bytes remaining to the end of the buffer and is updated in place. */
static inline void *
mlx5_qp_get_next_wqebb(struct mlx5_hw_qp *hw_qp, uint32_t *to_end, void *cur)
{
	*to_end -= MLX5_SEND_WQE_BB;
	if (*to_end == 0) { /* wqe buffer wrap around */
		*to_end = hw_qp->sq_wqe_cnt * MLX5_SEND_WQE_BB;
		return (void *)(uintptr_t)hw_qp->sq_addr;
	}

	return ((char *)cur) + MLX5_SEND_WQE_BB;
}

/* Record completion bookkeeping for the WQE just written at slot \b pi.
 * Signaled WQEs (CE_CQ_UPDATE) absorb the building-block count of all
 * preceding non-signaled WQEs so the CQ handler can reclaim SQ space in
 * one step; non-signaled WQEs just accumulate. */
static inline void
mlx5_qp_set_comp(struct spdk_mlx5_qp *qp, uint16_t pi,
		 uint64_t wr_id, uint32_t fm_ce_se, uint32_t n_bb)
{
	qp->completions[pi].wr_id = wr_id;
	if ((fm_ce_se & SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE) != SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE) {
		/* non-signaled WQE, accumulate it in outstanding */
		qp->nonsignaled_outstanding += n_bb;
		qp->completions[pi].completions = 0;
		return;
	}

	/* Store number of previous nonsignaled WQEs */
	qp->completions[pi].completions = qp->nonsignaled_outstanding + n_bb;
	qp->nonsignaled_outstanding = 0;
}

/* Store fence ordering prior writes before the doorbell/BF store below.
 * NOTE(review): no fallback definition for other architectures — builds on
 * anything but aarch64/x86 would fail at the use site; confirm intended. */
#if defined(__aarch64__)
#define spdk_memory_bus_store_fence()  asm volatile("dmb oshst" ::: "memory")
#elif defined(__i386__) || defined(__x86_64__)
#define spdk_memory_bus_store_fence() spdk_wmb()
#endif

/* Publish the SQ producer index to the doorbell record so HW sees all
 * WQEs written so far. */
static inline void
mlx5_update_tx_db(struct spdk_mlx5_qp *qp)
{
	/*
	 * Use cpu barrier to prevent code reordering: WQE stores must be
	 * visible before the doorbell record update
	 */
	spdk_smp_wmb();

	((uint32_t *)qp->hw.dbr_addr)[MLX5_SND_DBR] = htobe32(qp->hw.sq_pi);
}

/* Ring the doorbell: write the first 8 bytes of the control segment to the
 * BlueFlame/UAR register.
 * NOTE(review): relies on a single 64-bit store to the UAR being acceptable
 * to the device — confirm against the PRM doorbell requirements. */
static inline void
mlx5_flush_tx_db(struct spdk_mlx5_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl)
{
	*(uint64_t *)(qp->hw.sq_bf_addr) = *(uint64_t *)ctrl;
}

/* Full doorbell sequence: doorbell-record update, bus fence, UAR write,
 * and (on x86 with write-combined UAR) a flushing fence. */
static inline void
mlx5_ring_tx_db(struct spdk_mlx5_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl)
{
	/* 8.9.3.1  Posting a Work Request to Work Queue
	 * 1. Write WQE to the WQE buffer sequentially to previously-posted
	 *    WQE (on WQEBB granularity)
	 *
	 * 2. Update Doorbell Record associated with that queue by writing
	 *    the sq_wqebb_counter or wqe_counter for send and RQ respectively
	 */
	mlx5_update_tx_db(qp);

	/* Make sure that doorbell record is written before ringing the doorbell */
	spdk_memory_bus_store_fence();

	/* 3. For send request ring DoorBell by writing to the Doorbell
	 *    Register field in the UAR associated with that queue
	 */
	mlx5_flush_tx_db(qp, ctrl);

	/* If UAR is mapped as WC (write combined) we need another fence to
	 * force write. Otherwise it may take a long time.
	 * On BF2/1 uar is mapped as NC (non combined) and fence is not needed
	 * here.
	 */
#if !defined(__aarch64__)
	if (!qp->hw.sq_tx_db_nc) {
		spdk_memory_bus_store_fence();
	}
#endif
}

#ifdef DEBUG
void mlx5_qp_dump_wqe(struct spdk_mlx5_qp *qp, int n_wqe_bb);
#else
#define mlx5_qp_dump_wqe(...) do { } while (0)
#endif

/* Queue the just-built WQE: advance the producer index and remember the
 * control segment / slot of the last WQE so the doorbell (delayed until
 * spdk_mlx5_qp_complete_send()) can be rung and, in SIG_LAST mode, the
 * last WQE can be retro-signaled. */
static inline void
mlx5_qp_wqe_submit(struct spdk_mlx5_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl, uint16_t n_wqe_bb,
		   uint16_t ctrlr_pi)
{
	mlx5_qp_dump_wqe(qp, n_wqe_bb);

	/* Delay ringing the doorbell */
	qp->hw.sq_pi += n_wqe_bb;
	qp->last_pi = ctrlr_pi;
	qp->ctrl = ctrl;
}

/* Fill a WQE control segment via mlx5dv_set_ctrl_seg(), first zeroing the
 * dword at offset 8.
 * NOTE(review): presumably clears the signature/fm_ce_se dword that
 * mlx5dv_set_ctrl_seg() does not fully initialize — confirm against the
 * rdma-core implementation. */
static inline void
mlx5_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *ctrl, uint16_t pi,
		  uint8_t opcode, uint8_t opmod, uint32_t qp_num,
		  uint8_t fm_ce_se, uint8_t ds,
		  uint8_t signature, uint32_t imm)
{
	*(uint32_t *)((void *)ctrl + 8) = 0;
	mlx5dv_set_ctrl_seg(ctrl, pi, opcode, opmod, qp_num,
			    fm_ce_se, ds, signature, imm);
}

/* Resolve a QP number (from a CQE) to the spdk_mlx5_qp registered on this
 * CQ, using the two-level lookup table; NULL when the bucket is empty. */
static inline struct spdk_mlx5_qp *
mlx5_cq_find_qp(struct spdk_mlx5_cq *cq, uint32_t qp_num)
{
	uint32_t bucket = qp_num >> SPDK_MLX5_QP_NUM_UPPER_SHIFT;
	uint32_t idx = qp_num & SPDK_MLX5_QP_NUM_LOWER_MASK;

	if (spdk_likely(cq->qps[bucket].count)) {
		return cq->qps[bucket].table[idx];
	}

	return NULL;
}
+5 −0
Original line number Diff line number Diff line
@@ -6,6 +6,11 @@
    spdk_mlx5_qp_create;
    spdk_mlx5_qp_destroy;
    spdk_mlx5_qp_set_error_state;
    spdk_mlx5_qp_complete_send;
    spdk_mlx5_cq_poll_completions;

    spdk_mlx5_qp_rdma_write;
    spdk_mlx5_qp_rdma_read;

    spdk_mlx5_crypto_devs_get;
    spdk_mlx5_crypto_devs_release;
Loading