Jason Wang
2020-Jul-16 09:14 UTC
[PATCH vhost next 10/10] vdpa/mlx5: Add VDPA driver for supported mlx5 devices
On 2020/7/16 下午3:23, Eli Cohen wrote: > Add a front end VDPA driver that registers in the VDPA bus and provides > networking to a guest. The VDPA driver creates the necessary resources > on the VF it is driving such that data path will be offloaded. > > Notifications are being communicated through the driver. > > Currently, only VFs are supported. In subsequent patches we will have > devlink support to control which VF is used for VDPA and which function > is used for regular networking. > > Reviewed-by: Parav Pandit <parav at mellanox.com> > Signed-off-by: Eli Cohen <eli at mellanox.com> Looks good overall. Few nits inline. > --- > drivers/vdpa/Kconfig | 10 + > drivers/vdpa/mlx5/Makefile | 5 +- > drivers/vdpa/mlx5/core/mr.c | 2 +- > drivers/vdpa/mlx5/net/main.c | 76 ++ > drivers/vdpa/mlx5/net/mlx5_vnet.c | 1966 +++++++++++++++++++++++++++++ > drivers/vdpa/mlx5/net/mlx5_vnet.h | 32 + > 6 files changed, 2089 insertions(+), 2 deletions(-) > create mode 100644 drivers/vdpa/mlx5/net/main.c > create mode 100644 drivers/vdpa/mlx5/net/mlx5_vnet.c > create mode 100644 drivers/vdpa/mlx5/net/mlx5_vnet.h > > diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig > index 48a1a776dd86..809cb4c2eecf 100644 > --- a/drivers/vdpa/Kconfig > +++ b/drivers/vdpa/Kconfig > @@ -36,4 +36,14 @@ config MLX5_VDPA > Support library for Mellanox VDPA drivers. Provides code that is > common for all types of VDPA drivers. > > +config MLX5_VDPA_NET > + tristate "vDPA driver for ConnectX devices" > + depends on MLX5_VDPA > + default n > + help > + VDPA network driver for ConnectX6 and newer. Provides offloading > + of virtio net datapath such that descriptors put on the ring will > + be executed by the hardware. It also supports a variety of stateless > + offloads depending on the actual device used and firmware version. 
> + > endif # VDPA > diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile > index b347c62032ea..3f8850c1a300 100644 > --- a/drivers/vdpa/mlx5/Makefile > +++ b/drivers/vdpa/mlx5/Makefile > @@ -1 +1,4 @@ > -obj-$(CONFIG_MLX5_VDPA) += core/resources.o core/mr.o > +subdir-ccflags-y += -I$(src)/core > + > +obj-$(CONFIG_MLX5_VDPA_NET) += mlx5_vdpa.o > +mlx5_vdpa-$(CONFIG_MLX5_VDPA_NET) += net/main.o net/mlx5_vnet.o core/resources.o core/mr.o > diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c > index 975aa45fd78b..34e3bbb80df8 100644 > --- a/drivers/vdpa/mlx5/core/mr.c > +++ b/drivers/vdpa/mlx5/core/mr.c > @@ -453,7 +453,7 @@ int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *io > bool *change_map) > { > struct mlx5_vdpa_mr *mr = &mvdev->mr; > - int err; > + int err = 0; > > *change_map = false; > if (map_empty(iotlb)) { > diff --git a/drivers/vdpa/mlx5/net/main.c b/drivers/vdpa/mlx5/net/main.c > new file mode 100644 > index 000000000000..838cd98386ff > --- /dev/null > +++ b/drivers/vdpa/mlx5/net/main.c > @@ -0,0 +1,76 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > +/* Copyright (c) 2020 Mellanox Technologies Ltd. 
*/ > + > +#include <linux/module.h> > +#include <linux/mlx5/driver.h> > +#include <linux/mlx5/device.h> > +#include "mlx5_vdpa_ifc.h" > +#include "mlx5_vnet.h" > + > +MODULE_AUTHOR("Eli Cohen <eli at mellanox.com>"); > +MODULE_DESCRIPTION("Mellanox VDPA driver"); > +MODULE_LICENSE("Dual BSD/GPL"); > + > +static bool required_caps_supported(struct mlx5_core_dev *mdev) > +{ > + u8 event_mode; > + u64 got; > + > + got = MLX5_CAP_GEN_64(mdev, general_obj_types); > + > + if (!(got & MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q)) > + return false; > + > + event_mode = MLX5_CAP_DEV_VDPA_EMULATION(mdev, event_mode); > + if (!(event_mode & MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE)) > + return false; > + > + if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, eth_frame_offload_type)) > + return false; > + > + return true; > +} > + > +static void *mlx5_vdpa_add(struct mlx5_core_dev *mdev) > +{ > + struct mlx5_vdpa_dev *vdev; > + > + if (mlx5_core_is_pf(mdev)) > + return NULL; > + > + if (!required_caps_supported(mdev)) { > + dev_info(mdev->device, "virtio net emulation not supported\n"); > + return NULL; > + } > + vdev = mlx5_vdpa_add_dev(mdev); > + if (IS_ERR(vdev)) > + return NULL; > + > + return vdev; > +} > + > +static void mlx5_vdpa_remove(struct mlx5_core_dev *mdev, void *context) > +{ > + struct mlx5_vdpa_dev *vdev = context; > + > + mlx5_vdpa_remove_dev(vdev); > +} > + > +static struct mlx5_interface mlx5_vdpa_interface = { > + .add = mlx5_vdpa_add, > + .remove = mlx5_vdpa_remove, > + .protocol = MLX5_INTERFACE_PROTOCOL_VDPA, > +}; > + > +static int __init mlx5_vdpa_init(void) > +{ > + return mlx5_register_interface(&mlx5_vdpa_interface); > +} > + > +static void __exit mlx5_vdpa_exit(void) > +{ > + mlx5_unregister_interface(&mlx5_vdpa_interface); > +} > + > +module_init(mlx5_vdpa_init); > +module_exit(mlx5_vdpa_exit); > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c > new file mode 100644 > index 000000000000..f7d3c7a0eb92 > --- /dev/null > +++ 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c > @@ -0,0 +1,1966 @@ > +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > +/* Copyright (c) 2020 Mellanox Technologies Ltd. */ > + > +#include <linux/vdpa.h> > +#include <uapi/linux/virtio_ids.h> > +#include <linux/virtio_config.h> > +#include <linux/mlx5/qp.h> > +#include <linux/mlx5/device.h> > +#include <linux/mlx5/vport.h> > +#include <linux/mlx5/fs.h> > +#include <linux/mlx5/device.h> > +#include "mlx5_vnet.h" > +#include "../core/mlx5_vdpa_ifc.h" > +#include "../core/mlx5_vdpa.h" > + > +#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev) > + > +#define VALID_FEATURES_MASK \ > + (BIT(VIRTIO_NET_F_CSUM) | BIT(VIRTIO_NET_F_GUEST_CSUM) | \ > + BIT(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT(VIRTIO_NET_F_MTU) | BIT(VIRTIO_NET_F_MAC) | \ > + BIT(VIRTIO_NET_F_GUEST_TSO4) | BIT(VIRTIO_NET_F_GUEST_TSO6) | \ > + BIT(VIRTIO_NET_F_GUEST_ECN) | BIT(VIRTIO_NET_F_GUEST_UFO) | BIT(VIRTIO_NET_F_HOST_TSO4) | \ > + BIT(VIRTIO_NET_F_HOST_TSO6) | BIT(VIRTIO_NET_F_HOST_ECN) | BIT(VIRTIO_NET_F_HOST_UFO) | \ > + BIT(VIRTIO_NET_F_MRG_RXBUF) | BIT(VIRTIO_NET_F_STATUS) | BIT(VIRTIO_NET_F_CTRL_VQ) | \ > + BIT(VIRTIO_NET_F_CTRL_RX) | BIT(VIRTIO_NET_F_CTRL_VLAN) | \ > + BIT(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT(VIRTIO_NET_F_GUEST_ANNOUNCE) | \ > + BIT(VIRTIO_NET_F_MQ) | BIT(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT(VIRTIO_NET_F_HASH_REPORT) | \ > + BIT(VIRTIO_NET_F_RSS) | BIT(VIRTIO_NET_F_RSC_EXT) | BIT(VIRTIO_NET_F_STANDBY) | \ > + BIT(VIRTIO_NET_F_SPEED_DUPLEX) | BIT(VIRTIO_F_NOTIFY_ON_EMPTY) | \ > + BIT(VIRTIO_F_ANY_LAYOUT) | BIT(VIRTIO_F_VERSION_1) | BIT(VIRTIO_F_IOMMU_PLATFORM) | \ > + BIT(VIRTIO_F_RING_PACKED) | BIT(VIRTIO_F_ORDER_PLATFORM) | BIT(VIRTIO_F_SR_IOV)) > + > +#define VALID_STATUS_MASK \ > + (VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK | \ > + VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED) > + > +struct mlx5_vdpa_net_resources { > + u32 tisn; > + u32 
tdn; > + u32 tirn; > + u32 rqtn; > + bool valid; > +}; > + > +struct mlx5_vdpa_cq_buf { > + struct mlx5_frag_buf_ctrl fbc; > + struct mlx5_frag_buf frag_buf; > + int cqe_size; > + int nent; > +}; > + > +struct mlx5_vdpa_cq { > + struct mlx5_core_cq mcq; > + struct mlx5_vdpa_cq_buf buf; > + struct mlx5_db db; > + int cqe; > +}; > + > +struct mlx5_vdpa_umem { > + struct mlx5_frag_buf_ctrl fbc; > + struct mlx5_frag_buf frag_buf; > + int size; > + u32 id; > +}; > + > +struct mlx5_vdpa_qp { > + struct mlx5_core_qp mqp; > + struct mlx5_frag_buf frag_buf; > + struct mlx5_db db; > + u16 head; > + bool fw; > +}; > + > +struct mlx5_vq_restore_info { > + u32 num_ent; > + u64 desc_addr; > + u64 device_addr; > + u64 driver_addr; > + u16 avail_index; > + bool ready; > + struct vdpa_callback cb; > + bool restore; > +}; > + > +struct mlx5_vdpa_virtqueue { > + bool ready; > + u64 desc_addr; > + u64 device_addr; > + u64 driver_addr; > + u32 num_ent; > + struct vdpa_callback event_cb; > + > + /* Resources for implementing the notification channel from the device > + * to the driver. fwqp is the firmware end of an RC connection; the > + * other end is vqqp used by the driver. cq is where completions are > + * reported. > + */ > + struct mlx5_vdpa_cq cq; > + struct mlx5_vdpa_qp fwqp; > + struct mlx5_vdpa_qp vqqp; > + > + /* umem resources are required for the virtqueue operation. Their use > + * is internal and they must be provided by the driver. 
> + */ > + struct mlx5_vdpa_umem umem1; > + struct mlx5_vdpa_umem umem2; > + struct mlx5_vdpa_umem umem3; > + > + bool initialized; > + int index; > + u32 virtq_id; > + struct mlx5_vdpa_net *ndev; > + u16 avail_idx; > + int fw_state; > + > + /* keep last in the struct */ > + struct mlx5_vq_restore_info ri; > +}; > + > +/* We will remove this limitation once mlx5_vdpa_alloc_resources() > + * provides for driver space allocation > + */ > +#define MLX5_MAX_SUPPORTED_VQS 16 > + > +struct mlx5_vdpa_net { > + struct mlx5_vdpa_dev mvdev; > + struct mlx5_vdpa_net_resources res; > + struct virtio_net_config config; > + struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS]; > + > + /* Serialize vq resources creation and destruction. This is required > + * since memory map might change and we need to destroy and create > + * resources while the driver is operational. > + */ > + struct mutex reslock; > + struct mlx5_flow_table *rxft; > + struct mlx5_fc *rx_counter; > + struct mlx5_flow_handle *rx_rule; > + bool setup; > +}; > + > +static void free_resources(struct mlx5_vdpa_net *ndev); > +static void init_mvqs(struct mlx5_vdpa_net *ndev); > +static int setup_driver(struct mlx5_vdpa_net *ndev); > +static void teardown_driver(struct mlx5_vdpa_net *ndev); > + > +static bool mlx5_vdpa_debug; > + > +#define MLX5_LOG_VIO_FLAG(_feature) \ > + do { \ > + if (features & BIT(_feature)) \ > + mlx5_vdpa_info(mvdev, "%s\n", #_feature); \ > + } while (0) > + > +#define MLX5_LOG_VIO_STAT(_status) \ > + do { \ > + if (status & (_status)) \ > + mlx5_vdpa_info(mvdev, "%s\n", #_status); \ > + } while (0) > + > +static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set) > +{ > + if (status & ~VALID_STATUS_MASK) > + mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n", > + status & ~VALID_STATUS_MASK); > + > + if (!mlx5_vdpa_debug) > + return; > + > + mlx5_vdpa_info(mvdev, "driver status %s", set ? 
"set" : "get"); > + if (set && !status) { > + mlx5_vdpa_info(mvdev, "driver resets the device\n"); > + return; > + } > + > + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE); > + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER); > + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK); > + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK); > + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET); > + MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED); > +} > + > +static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set) > +{ > + if (features & ~VALID_FEATURES_MASK) > + mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n", > + features & ~VALID_FEATURES_MASK); > + > + if (!mlx5_vdpa_debug) > + return; > + > + mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads"); > + if (!features) > + mlx5_vdpa_info(mvdev, "all feature bits are cleared\n"); > + > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS); > + 
MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY); > + MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX); > + MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY); > + MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT); > + MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1); > + MLX5_LOG_VIO_FLAG(VIRTIO_F_IOMMU_PLATFORM); > + MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED); > + MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM); > + MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV); > +} > + > +static int create_tis(struct mlx5_vdpa_net *ndev) > +{ > + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; > + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; > + void *tisc; > + int err; > + > + tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); > + MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn); > + err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn); > + if (err) > + mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err); > + > + return err; > +} > + > +static void destroy_tis(struct mlx5_vdpa_net *ndev) > +{ > + mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn); > +} > + > +#define MLX5_VDPA_CQE_SIZE 64 > +#define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE) > + > +static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent) > +{ > + struct mlx5_frag_buf *frag_buf = &buf->frag_buf; > + u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE; > + u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE; > + int err; > + > + err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf, > + ndev->mvdev.mdev->priv.numa_node); > + if (err) > + return err; > + > + mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc); > + > + buf->cqe_size = MLX5_VDPA_CQE_SIZE; > + buf->nent = nent; > + > + return 0; > +} > + > +static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size) > +{ > + struct mlx5_frag_buf *frag_buf = &umem->frag_buf; > + > + return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf, > + ndev->mvdev.mdev->priv.numa_node); 
> +} > + > +static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf) > +{ > + mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf); > +} > + > +static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n) > +{ > + return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n); > +} > + > +static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf) > +{ > + struct mlx5_cqe64 *cqe64; > + void *cqe; > + int i; > + > + for (i = 0; i < buf->nent; i++) { > + cqe = get_cqe(vcq, i); > + cqe64 = cqe; > + cqe64->op_own = MLX5_CQE_INVALID << 4; > + } > +} > + > +static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n) > +{ > + struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1)); > + > + if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && > + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe))) > + return cqe64; > + > + return NULL; > +} > + > +static void rx_post(struct mlx5_vdpa_qp *vqp, int n) > +{ > + vqp->head += n; > + vqp->db.db[0] = cpu_to_be32(vqp->head); > +} > + > +static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in, > + struct mlx5_vdpa_virtqueue *mvq, u32 num_ent) > +{ > + struct mlx5_vdpa_qp *vqp; > + __be64 *pas; > + void *qpc; > + > + vqp = fw ? 
&mvq->fwqp : &mvq->vqqp; > + MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid); > + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); > + if (vqp->fw) { > + /* Firmware QP is allocated by the driver for the firmware's > + * use so we can skip part of the params as they will be chosen by firmware > + */ > + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); > + MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ); > + MLX5_SET(qpc, qpc, no_sq, 1); > + return; > + } > + > + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); > + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); > + MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn); > + MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES); > + MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index); > + MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); > + MLX5_SET(qpc, qpc, no_sq, 1); > + MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn); > + MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent)); > + MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); > + pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas); > + mlx5_fill_page_frag_array(&vqp->frag_buf, pas); > +} > + > +static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent) > +{ > + return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, > + num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf, > + ndev->mvdev.mdev->priv.numa_node); > +} > + > +static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp) > +{ > + mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf); > +} > + > +static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, > + struct mlx5_vdpa_qp *vqp) > +{ > + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; > + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); > + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; > + void *qpc; > + void *in; > + int err; > + > + if (!vqp->fw) { > + vqp = &mvq->vqqp; > + err = rq_buf_alloc(ndev, vqp, mvq->num_ent); > + if (err) > + return err; > + > + err = 
mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db); > + if (err) > + goto err_db; > + inlen += vqp->frag_buf.npages * sizeof(__be64); > + } > + > + in = kzalloc(inlen, GFP_KERNEL); > + if (!in) { > + err = -ENOMEM; > + goto err_kzalloc; > + } > + > + qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent); > + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); > + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); > + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); > + MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn); > + MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES); > + if (!vqp->fw) > + MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma); > + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); > + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); > + kfree(in); > + if (err) > + goto err_kzalloc; > + > + vqp->mqp.uid = MLX5_GET(create_qp_in, in, uid); > + vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn); > + > + if (!vqp->fw) > + rx_post(vqp, mvq->num_ent); > + > + return 0; > + > +err_kzalloc: > + if (!vqp->fw) > + mlx5_db_free(ndev->mvdev.mdev, &vqp->db); > +err_db: > + if (!vqp->fw) > + rq_buf_free(ndev, vqp); > + > + return err; > +} > + > +static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp) > +{ > + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; > + > + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); > + MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn); > + MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid); > + if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in)) > + mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn); > + if (!vqp->fw) { > + mlx5_db_free(ndev->mvdev.mdev, &vqp->db); > + rq_buf_free(ndev, vqp); > + } > +} > + > +static void *next_cqe_sw(struct mlx5_vdpa_cq *cq) > +{ > + return get_sw_cqe(cq, cq->mcq.cons_index); > +} > + > +static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq) > +{ > + struct mlx5_cqe64 *cqe64; > + > + cqe64 = next_cqe_sw(vcq); > + if (!cqe64) > + return -EAGAIN; > + > + vcq->mcq.cons_index++; > + 
return 0; > +} > + > +static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num) > +{ > + mlx5_cq_set_ci(&mvq->cq.mcq); > + rx_post(&mvq->vqqp, num); > + if (mvq->event_cb.callback) > + mvq->event_cb.callback(mvq->event_cb.private); > +} > + > +static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) > +{ > + struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq); > + struct mlx5_vdpa_net *ndev = mvq->ndev; > + void __iomem *uar_page = ndev->mvdev.res.uar->map; > + int num = 0; > + > + while (!mlx5_vdpa_poll_one(&mvq->cq)) { > + num++; > + if (num > mvq->num_ent / 2) { > + /* If completions keep coming while we poll, we want to > + * let the hardware know that we consumed them by > + * updating the doorbell record. We also let vdpa core > + * know about this so it passes it on the virtio driver > + * on the guest. > + */ > + mlx5_vdpa_handle_completions(mvq, num); > + num = 0; > + } > + } > + > + if (num) > + mlx5_vdpa_handle_completions(mvq, num); > + > + mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); > +} > + > +static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) > +{ > + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; > + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; > + void __iomem *uar_page = ndev->mvdev.res.uar->map; > + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; > + struct mlx5_vdpa_cq *vcq = &mvq->cq; > + unsigned int irqn; > + __be64 *pas; > + int inlen; > + void *cqc; > + void *in; > + int err; > + int eqn; > + > + err = mlx5_db_alloc(mdev, &vcq->db); > + if (err) > + return err; > + > + vcq->mcq.set_ci_db = vcq->db.db; > + vcq->mcq.arm_db = vcq->db.db + 1; > + vcq->mcq.cqe_sz = 64; > + > + err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent); > + if (err) > + goto err_db; > + > + cq_frag_buf_init(vcq, &vcq->buf); > + > + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + > + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * 
vcq->buf.frag_buf.npages; > + in = kzalloc(inlen, GFP_KERNEL); > + if (!in) { > + err = -ENOMEM; > + goto err_vzalloc; > + } > + > + MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid); > + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); > + mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas); > + > + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); > + MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); > + > + /* Use vector 0 by default. Consider adding code to choose least used > + * vector. > + */ > + err = mlx5_vector2eqn(mdev, 0, &eqn, &irqn); > + if (err) > + goto err_vec; > + > + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); > + MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent)); > + MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index); > + MLX5_SET(cqc, cqc, c_eqn, eqn); > + MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma); > + > + err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out)); > + if (err) > + goto err_vec; > + > + vcq->mcq.comp = mlx5_vdpa_cq_comp; > + vcq->cqe = num_ent; > + vcq->mcq.set_ci_db = vcq->db.db; > + vcq->mcq.arm_db = vcq->db.db + 1; > + mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); > + kfree(in); > + return 0; > + > +err_vec: > + kfree(in); > +err_vzalloc: > + cq_frag_buf_free(ndev, &vcq->buf); > +err_db: > + mlx5_db_free(ndev->mvdev.mdev, &vcq->db); > + return err; > +} > + > +static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx) > +{ > + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; > + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; > + struct mlx5_vdpa_cq *vcq = &mvq->cq; > + > + if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) { > + mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn); > + return; > + } > + cq_frag_buf_free(ndev, &vcq->buf); > + mlx5_db_free(ndev->mvdev.mdev, &vcq->db); > +} > + > +static int umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, > + struct mlx5_vdpa_umem 
**umemp) > +{ > + struct mlx5_core_dev *mdev = ndev->mvdev.mdev; > + int p_a; > + int p_b; > + > + switch (num) { > + case 1: > + p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a); > + p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b); > + *umemp = &mvq->umem1; > + break; > + case 2: > + p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a); > + p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b); > + *umemp = &mvq->umem2; > + break; > + case 3: > + p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a); > + p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b); > + *umemp = &mvq->umem3; > + break; > + } > + return p_a * mvq->num_ent + p_b; > +} > + > +static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem) > +{ > + mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf); > +} > + > +static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num) > +{ > + int inlen; > + u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {}; > + void *um; > + void *in; > + int err; > + __be64 *pas; > + int size; > + struct mlx5_vdpa_umem *umem; > + > + size = umem_size(ndev, mvq, num, &umem); > + if (size < 0) > + return size; > + > + umem->size = size; > + err = umem_frag_buf_alloc(ndev, umem, size); > + if (err) > + return err; > + > + inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages; > + > + in = kzalloc(inlen, GFP_KERNEL); > + if (!in) { > + err = -ENOMEM; > + goto err_in; > + } > + > + MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM); > + MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid); > + um = MLX5_ADDR_OF(create_umem_in, in, umem); > + MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); > + MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages); > + > + pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]); > + mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, 
MLX5_MTT_PERM_RW); > + > + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); > + if (err) { > + mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err); > + goto err_cmd; > + } > + > + kfree(in); > + umem->id = MLX5_GET(create_umem_out, out, umem_id); > + > + return 0; > + > +err_cmd: > + kfree(in); > +err_in: > + umem_frag_buf_free(ndev, umem); > + return err; > +} > + > +static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num) > +{ > + u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {}; > + u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {}; > + struct mlx5_vdpa_umem *umem; > + > + switch (num) { > + case 1: > + umem = &mvq->umem1; > + break; > + case 2: > + umem = &mvq->umem2; > + break; > + case 3: > + umem = &mvq->umem3; > + break; > + } > + > + MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM); > + MLX5_SET(destroy_umem_in, in, umem_id, umem->id); > + if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) > + return; > + > + umem_frag_buf_free(ndev, umem); > +} > + > +static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + int num; > + int err; > + > + for (num = 1; num <= 3; num++) { > + err = create_umem(ndev, mvq, num); > + if (err) > + goto err_umem; > + } > + return 0; > + > +err_umem: > + for (num--; num > 0; num--) > + umem_destroy(ndev, mvq, num); > + > + return err; > +} > + > +static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + int num; > + > + for (num = 3; num > 0; num--) > + umem_destroy(ndev, mvq, num); > +} > + > +static int get_queue_type(struct mlx5_vdpa_net *ndev) > +{ > + u32 type_mask; > + > + type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type); > + > + /* prefer split queue */ > + if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED) > + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED; > + > + WARN_ON(!(type_mask & 
MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)); > + > + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT; > +} > + > +static bool vq_is_tx(u16 idx) > +{ > + return idx % 2; > +} > + > +static u16 get_features_12_3(u64 features) > +{ > + return (!!(features & BIT(VIRTIO_NET_F_HOST_TSO4)) << 9) | > + (!!(features & BIT(VIRTIO_NET_F_HOST_TSO6)) << 8) | > + (!!(features & BIT(VIRTIO_NET_F_CSUM)) << 7) | > + (!!(features & BIT(VIRTIO_NET_F_GUEST_CSUM)) << 6); > +} > + > +static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in); > + u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {}; > + void *obj_context; > + void *cmd_hdr; > + void *vq_ctx; > + void *in; > + int err; > + > + err = umems_create(ndev, mvq); > + if (err) > + return err; > + > + in = kzalloc(inlen, GFP_KERNEL); > + if (!in) { > + err = -ENOMEM; > + goto err_alloc; > + } > + > + cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr); > + > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); > + > + obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context); > + MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx); > + MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3, > + get_features_12_3(ndev->mvdev.actual_features)); > + vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context); > + MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev)); > + > + if (vq_is_tx(mvq->index)) > + MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn); > + > + MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE); > + MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index); > + MLX5_SET(virtio_q, 
vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn); > + MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent); > + MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, > + !!(ndev->mvdev.actual_features & VIRTIO_F_VERSION_1)); > + MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr); > + MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr); > + MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); > + MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey.key); > + MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id); > + MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size); > + MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id); > + MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem1.size); > + MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id); > + MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem1.size); > + MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn); > + if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type)) > + MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1); > + > + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); > + if (err) > + goto err_cmd; > + > + kfree(in); > + mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); > + > + return 0; > + > +err_cmd: > + kfree(in); > +err_alloc: > + umems_destroy(ndev, mvq); > + return err; > +} > + > +static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {}; > + u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {}; > + > + MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode, > + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); > + MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id); > + MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid); > + MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type, > + MLX5_OBJ_TYPE_VIRTIO_NET_Q); > + if 
(mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) { > + mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id); > + return; > + } > + umems_destroy(ndev, mvq); > +} > + > +static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw) > +{ > + return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn; > +} > + > +static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw) > +{ > + return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn; > +} > + > +static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out, > + int *outlen, u32 qpn, u32 rqpn) > +{ > + void *qpc; > + void *pp; > + > + switch (cmd) { > + case MLX5_CMD_OP_2RST_QP: > + *inlen = MLX5_ST_SZ_BYTES(qp_2rst_in); > + *outlen = MLX5_ST_SZ_BYTES(qp_2rst_out); > + *in = kzalloc(*inlen, GFP_KERNEL); > + *out = kzalloc(*outlen, GFP_KERNEL); > + if (!in || !out) > + goto outerr; > + > + MLX5_SET(qp_2rst_in, *in, opcode, cmd); > + MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid); > + MLX5_SET(qp_2rst_in, *in, qpn, qpn); > + break; > + case MLX5_CMD_OP_RST2INIT_QP: > + *inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in); > + *outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out); > + *in = kzalloc(*inlen, GFP_KERNEL); > + *out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL); > + if (!in || !out) > + goto outerr; > + > + MLX5_SET(rst2init_qp_in, *in, opcode, cmd); > + MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid); > + MLX5_SET(rst2init_qp_in, *in, qpn, qpn); > + qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc); > + MLX5_SET(qpc, qpc, remote_qpn, rqpn); > + MLX5_SET(qpc, qpc, rwe, 1); > + pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path); > + MLX5_SET(ads, pp, vhca_port_num, 1); > + break; > + case MLX5_CMD_OP_INIT2RTR_QP: > + *inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in); > + *outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out); > + *in = kzalloc(*inlen, GFP_KERNEL); > + *out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL); > + if (!in || !out) > + goto outerr; > + > 
+ MLX5_SET(init2rtr_qp_in, *in, opcode, cmd); > + MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid); > + MLX5_SET(init2rtr_qp_in, *in, qpn, qpn); > + qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc); > + MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES); > + MLX5_SET(qpc, qpc, log_msg_max, 30); > + MLX5_SET(qpc, qpc, remote_qpn, rqpn); > + pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path); > + MLX5_SET(ads, pp, fl, 1); > + break; > + case MLX5_CMD_OP_RTR2RTS_QP: > + *inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in); > + *outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out); > + *in = kzalloc(*inlen, GFP_KERNEL); > + *out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL); > + if (!in || !out) > + goto outerr; > + > + MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd); > + MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid); > + MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn); > + qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc); > + pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path); > + MLX5_SET(ads, pp, ack_timeout, 14); > + MLX5_SET(qpc, qpc, retry_count, 7); > + MLX5_SET(qpc, qpc, rnr_retry, 7); > + break; > + default: > + goto outerr; > + } > + if (!*in || !*out) > + goto outerr; > + > + return; > + > +outerr: > + kfree(*in); > + kfree(*out); > + *in = NULL; > + *out = NULL; > +} > + > +static void free_inout(void *in, void *out) > +{ > + kfree(in); > + kfree(out); > +} > + > +/* Two QPs are used by each virtqueue. One is used by the driver and one by > + * firmware. The fw argument indicates whether the subjected QP is the one used > + * by firmware. 
> + */ > +static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd) > +{ > + int outlen; > + int inlen; > + void *out; > + void *in; > + int err; > + > + alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw)); > + if (!in || !out) > + return -ENOMEM; > + > + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen); > + free_inout(in, out); > + return err; > +} > + > +static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + int err; > + > + err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP); > + if (err) > + return err; > + > + err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP); > + if (err) > + return err; > + > + err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP); > + if (err) > + return err; > + > + err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP); > + if (err) > + return err; > + > + err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP); > + if (err) > + return err; > + > + err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP); > + if (err) > + return err; > + > + return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP); > +} > + > +struct mlx5_virtq_attr { > + u8 state; > + u16 available_index; > +}; > + > +static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, > + struct mlx5_virtq_attr *attr) > +{ > + int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out); > + u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {}; > + void *out; > + void *obj_context; > + void *cmd_hdr; > + int err; > + > + out = kzalloc(outlen, GFP_KERNEL); > + if (!out) > + return -ENOMEM; > + > + cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr); > + > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id); > 
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); > + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen); > + if (err) > + goto err_cmd; > + > + obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context); > + memset(attr, 0, sizeof(*attr)); > + attr->state = MLX5_GET(virtio_net_q_object, obj_context, state); > + attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index); > + kfree(out); > + return 0; > + > +err_cmd: > + kfree(out); > + return err; > +} > + > +static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state) > +{ > + int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); > + u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {}; > + void *obj_context; > + void *cmd_hdr; > + void *in; > + int err; > + > + in = kzalloc(inlen, GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr); > + > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id); > + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); > + > + obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context); > + MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, > + MLX5_VIRTQ_MODIFY_MASK_STATE); > + MLX5_SET(virtio_net_q_object, obj_context, state, state); > + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); > + kfree(in); > + if (!err) > + mvq->fw_state = state; > + > + return err; > +} > + > +static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + u16 idx = mvq->index; > + int err; > + > + if (!mvq->num_ent) > + return 0; > + > + if (mvq->initialized) { > + mlx5_vdpa_warn(&ndev->mvdev, "attempt re init\n"); > + return -EINVAL; > + } > + > + 
err = cq_create(ndev, idx, mvq->num_ent); > + if (err) > + return err; > + > + err = qp_create(ndev, mvq, &mvq->fwqp); > + if (err) > + goto err_fwqp; > + > + err = qp_create(ndev, mvq, &mvq->vqqp); > + if (err) > + goto err_vqqp; > + > + err = connect_qps(ndev, mvq); > + if (err) > + goto err_connect; > + > + err = create_virtqueue(ndev, mvq); > + if (err) > + goto err_connect; > + > + if (mvq->ready) { > + err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); > + if (err) { > + mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n", > + idx, err); > + goto err_connect; > + } > + } > + > + mvq->initialized = true; > + return 0; > + > +err_connect: > + qp_destroy(ndev, &mvq->vqqp); > +err_vqqp: > + qp_destroy(ndev, &mvq->fwqp); > +err_fwqp: > + cq_destroy(ndev, idx); > + return err; > +} > + > +static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + struct mlx5_virtq_attr attr; > + > + if (!mvq->initialized) > + return; > + > + if (query_virtqueue(ndev, mvq, &attr)) { > + mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n"); > + return; > + } > + if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) > + return; > + > + if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)) > + mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n"); > +} > + > +static void suspend_vqs(struct mlx5_vdpa_net *ndev) > +{ > + int i; > + > + for (i = 0; i < MLX5_MAX_SUPPORTED_VQS; i++) > + suspend_vq(ndev, &ndev->vqs[i]); teardown_virtqueues() has a check of mvq->num_ent; is there any reason for not doing the same check here? > +} > + > +static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + if (!mvq->initialized || !mvq->num_ent) > + return; > + > + suspend_vq(ndev, mvq); > + destroy_virtqueue(ndev, mvq); > + qp_destroy(ndev, &mvq->vqqp); > + qp_destroy(ndev, &mvq->fwqp); > + cq_destroy(ndev, mvq->index); > + mvq->initialized = false; > +} > + > +static int 
create_rqt(struct mlx5_vdpa_net *ndev) > +{ > + int log_max_rqt; > + int acutal_rqt; > + __be32 *list; > + void *rqtc; > + int inlen; > + void *in; > + int i, j; > + int err; > + > + log_max_rqt = min_t(int, 1, MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size)); > + if (log_max_rqt < 1) > + return -EOPNOTSUPP; > + > + acutal_rqt = 1; > + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + (1 << log_max_rqt) * MLX5_ST_SZ_BYTES(rq_num); > + in = kzalloc(inlen, GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid); > + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); > + > + MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q); > + MLX5_SET(rqtc, rqtc, rqt_max_size, 1 << log_max_rqt); > + MLX5_SET(rqtc, rqtc, rqt_actual_size, 1); > + list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]); > + for (i = 0, j = 0; j < ndev->mvdev.max_vqs; j++) { > + if (!ndev->vqs[j].initialized) > + continue; > + > + if (!vq_is_tx(ndev->vqs[j].index)) { > + list[i] = cpu_to_be32(ndev->vqs[j].virtq_id); > + i++; > + } > + } > + > + err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn); > + kfree(in); > + if (err) > + return err; > + > + return 0; > +} > + > +static void destroy_rqt(struct mlx5_vdpa_net *ndev) > +{ > + mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn); > +} > + > +static int create_tir(struct mlx5_vdpa_net *ndev) > +{ > +#define HASH_IP_L4PORTS \ > + (MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT | \ > + MLX5_HASH_FIELD_SEL_L4_DPORT) > + static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7, > + 0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94, > + 0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1, > + 0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59, > + 0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a }; > + void *rss_key; > + void *outer; > + void *tirc; > + void *in; > + int err; > + > + in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), 
GFP_KERNEL); > + if (!in) > + return -ENOMEM; > + > + MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid); > + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); > + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); > + > + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); > + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ); > + rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); > + memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key)); > + > + outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); > + MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); > + MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); > + MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS); > + > + MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn); > + MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn); > + > + err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn); > + kfree(in); > + return err; > +} > + > +static void destroy_tir(struct mlx5_vdpa_net *ndev) > +{ > + mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn); > +} > + > +static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev) > +{ > + struct mlx5_flow_destination dest[2] = {}; > + struct mlx5_flow_table_attr ft_attr = {}; > + struct mlx5_flow_act flow_act = {}; > + struct mlx5_flow_namespace *ns; > + int err; > + > + /* for now, one entry, match all, forward to tir */ > + ft_attr.max_fte = 1; > + ft_attr.autogroup.max_num_groups = 1; > + > + ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS); > + if (!ns) { > + mlx5_vdpa_warn(&ndev->mvdev, "get flow namespace\n"); > + return -EOPNOTSUPP; > + } > + > + ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); > + if (IS_ERR(ndev->rxft)) > + return PTR_ERR(ndev->rxft); > + > + ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false); > + if (IS_ERR(ndev->rx_counter)) { > + err = PTR_ERR(ndev->rx_counter); > + goto err_fc; > + } > 
+ > + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT; > + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR; > + dest[0].tir_num = ndev->res.tirn; > + dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; > + dest[1].counter_id = mlx5_fc_id(ndev->rx_counter); > + ndev->rx_rule = mlx5_add_flow_rules(ndev->rxft, NULL, &flow_act, dest, 2); > + if (IS_ERR(ndev->rx_rule)) { > + err = PTR_ERR(ndev->rx_rule); > + ndev->rx_rule = NULL; > + goto err_rule; > + } > + > + return 0; > + > +err_rule: > + mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter); > +err_fc: > + mlx5_destroy_flow_table(ndev->rxft); > + return err; > +} > + > +static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev) > +{ > + if (!ndev->rx_rule) > + return; > + > + mlx5_del_flow_rules(ndev->rx_rule); > + mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter); > + mlx5_destroy_flow_table(ndev->rxft); > + > + ndev->rx_rule = NULL; > +} > + > +static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; > + > + if (unlikely(!mvq->ready)) > + return; > + > + iowrite16(idx, ndev->mvdev.res.kick_addr); > +} > + > +static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area, > + u64 driver_area, u64 device_area) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; > + > + mvq->desc_addr = desc_area; > + mvq->device_addr = device_area; > + mvq->driver_addr = driver_area; > + return 0; > +} > + > +static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + struct mlx5_vdpa_virtqueue *mvq; > + > + mvq = &ndev->vqs[idx]; > + mvq->num_ent = num; > 
+} > + > +static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[idx]; > + > + vq->event_cb = *cb; > +} > + > +static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; > + int err; > + > + if (!mvq->ready && ready && mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) { > + err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); > + if (err) { > + mlx5_vdpa_warn(mvdev, "failed to modify virtqueue(%d)\n", err); > + return; > + } > + }I wonder what's the reason of changing vq state on the hardware here. I think we can defer it to set_status(). (Anyhow we don't sync vq address in set_vq_address()).> + > + mvq->ready = ready; > +} > + > +static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; > + > + return mvq->ready; > +} > + > +static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx, > + const struct vdpa_vq_state *state) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; > + > + if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) { > + mlx5_vdpa_warn(mvdev, "can't modify available index\n"); > + return -EINVAL; > + } > + > + mvq->avail_idx = state->avail_index; > + return 0; > +} > + > +static void mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct 
mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; > + struct mlx5_virtq_attr attr; > + int err; > + > + if (!mvq->initialized) > + goto not_ready; > + > + err = query_virtqueue(ndev, mvq, &attr); > + if (err) { > + mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n"); > + goto not_ready; > + } > + state->avail_index = attr.available_index; > + > +not_ready: > + state->state = VQ_STATE_NOT_READY; > +} > + > +static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev) > +{ > + return PAGE_SIZE; > +} > + > +enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9, > + MLX5_VIRTIO_NET_F_CSUM = 1 << 10, > + MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11, > + MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12, > +}; > + > +static u64 mlx_to_vritio_features(u16 dev_features) > +{ > + u64 result = 0; > + > + if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM) > + result |= BIT(VIRTIO_NET_F_GUEST_CSUM); > + if (dev_features & MLX5_VIRTIO_NET_F_CSUM) > + result |= BIT(VIRTIO_NET_F_CSUM); > + if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6) > + result |= BIT(VIRTIO_NET_F_HOST_TSO6); > + if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4) > + result |= BIT(VIRTIO_NET_F_HOST_TSO4); > + > + return result; > +} > + > +static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + u16 dev_features; > + > + dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask); > + ndev->mvdev.mlx_features = mlx_to_vritio_features(dev_features); > + if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0)) > + ndev->mvdev.mlx_features |= BIT(VIRTIO_F_VERSION_1); This is interesting. This suggests !VIRTIO_F_VERSION_1 && VIRTIO_F_IOMMU_PLATFORM is valid. But the virtio spec doesn't allow such a configuration. 
So I think you need to do one of the following: 1) fail vDPA device probe when VERSION_1 is not supported, or 2) clear IOMMU_PLATFORM if VERSION_1 is not negotiated. For 2) I guess it can't work: according to the spec, without IOMMU_PLATFORM the device needs to use PA to access the memory. > + ndev->mvdev.mlx_features |= BIT(VIRTIO_F_ANY_LAYOUT); > + ndev->mvdev.mlx_features |= BIT(VIRTIO_F_IOMMU_PLATFORM); > + if (mlx5_vdpa_max_qps(ndev->mvdev.max_vqs) > 1) > + ndev->mvdev.mlx_features |= BIT(VIRTIO_NET_F_MQ); > + > + print_features(mvdev, ndev->mvdev.mlx_features, false); > + return ndev->mvdev.mlx_features; > +} > + > +static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features) > +{ > + /* FIXME: qemu currently does not set all the feaures due to a bug. > + * Add checks when this is fixed. > + */ I think we should add the check now so that qemu can get notified (e.g. for IOMMU_PLATFORM). > + return 0; > +} > + > +static int setup_virtqueues(struct mlx5_vdpa_net *ndev) > +{ > + int err; > + int i; > + > + for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); i++) { > + err = setup_vq(ndev, &ndev->vqs[i]); > + if (err) > + goto err_vq; > + } > + > + return 0; > + > +err_vq: > + for (--i; i >= 0; i--) > + teardown_vq(ndev, &ndev->vqs[i]); > + > + return err; > +} > + > +static void teardown_virtqueues(struct mlx5_vdpa_net *ndev) > +{ > + struct mlx5_vdpa_virtqueue *mvq; > + int i; > + > + for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) { > + mvq = &ndev->vqs[i]; > + if (!mvq->num_ent) > + continue; > + > + teardown_vq(ndev, mvq); > + } > +} > + > +static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + int err; > + > + print_features(mvdev, features, true); > + > + err = verify_min_features(mvdev, features); > + if (err) > + return err; > + > + ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features; > + return err; > +} > + > +static void 
mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb) > +{ > + /* not implemented */ > + mlx5_vdpa_warn(to_mvdev(vdev), "set config callback not supported\n"); > +} > + > +#define MLX5_VDPA_MAX_VQ_ENTRIES 256 Is this a hardware limitation? qemu can support up to 1K, which is the requirement of some NFV cases. > +static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev) > +{ > + return MLX5_VDPA_MAX_VQ_ENTRIES; > +} > + > +static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev) > +{ > + return VIRTIO_ID_NET; > +} > + > +static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev) > +{ > + return PCI_VENDOR_ID_MELLANOX; > +} > + > +static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + > + print_status(mvdev, ndev->mvdev.status, false); > + return ndev->mvdev.status; > +} > + > +static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) > +{ > + struct mlx5_vq_restore_info *ri = &mvq->ri; > + struct mlx5_virtq_attr attr; > + int err; > + > + if (!mvq->initialized) > + return 0; > + > + err = query_virtqueue(ndev, mvq, &attr); > + if (err) > + return err; > + > + ri->avail_index = attr.available_index; > + ri->ready = mvq->ready; > + ri->num_ent = mvq->num_ent; > + ri->desc_addr = mvq->desc_addr; > + ri->device_addr = mvq->device_addr; > + ri->driver_addr = mvq->driver_addr; > + ri->cb = mvq->event_cb; > + ri->restore = true; > + return 0; > +} > + > +static int save_channels_info(struct mlx5_vdpa_net *ndev) > +{ > + int i; > + > + for (i = 0; i < ndev->mvdev.max_vqs; i++) { > + memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri)); > + save_channel_info(ndev, &ndev->vqs[i]); > + } > + return 0; > +} > + > +static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev) > +{ > + int i; > + > + for (i = 0; i < ndev->mvdev.max_vqs; i++) > + memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri)); 
> +} > + > +static void restore_channels_info(struct mlx5_vdpa_net *ndev) > +{ > + struct mlx5_vdpa_virtqueue *mvq; > + struct mlx5_vq_restore_info *ri; > + int i; > + > + mlx5_clear_vqs(ndev); > + init_mvqs(ndev); > + for (i = 0; i < ndev->mvdev.max_vqs; i++) { > + mvq = &ndev->vqs[i]; > + ri = &mvq->ri; > + if (!ri->restore) > + continue; > + > + mvq->avail_idx = ri->avail_index; > + mvq->ready = ri->ready; > + mvq->num_ent = ri->num_ent; > + mvq->desc_addr = ri->desc_addr; > + mvq->device_addr = ri->device_addr; > + mvq->driver_addr = ri->driver_addr; > + mvq->event_cb = ri->cb; > + } > +} > + > +static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *iotlb) > +{ > + int err; > + > + suspend_vqs(ndev); > + err = save_channels_info(ndev); > + if (err) > + goto err_mr; > + > + teardown_driver(ndev); > + mlx5_vdpa_destroy_mr(&ndev->mvdev); > + err = mlx5_vdpa_create_mr(&ndev->mvdev, iotlb); > + if (err) > + goto err_mr; > + > + restore_channels_info(ndev); > + err = setup_driver(ndev); > + if (err) > + goto err_setup; > + > + return 0; > + > +err_setup: > + mlx5_vdpa_destroy_mr(&ndev->mvdev); > +err_mr: > + return err; > +} > + > +static int setup_driver(struct mlx5_vdpa_net *ndev) > +{ > + int err; > + > + mutex_lock(&ndev->reslock); > + if (ndev->setup) { > + mlx5_vdpa_warn(&ndev->mvdev, "setup driver called for already setup driver\n"); > + err = 0; > + goto out; > + } > + err = setup_virtqueues(ndev); > + if (err) { > + mlx5_vdpa_warn(&ndev->mvdev, "setup_virtqueues\n"); > + goto out; > + } > + > + err = create_rqt(ndev); > + if (err) { > + mlx5_vdpa_warn(&ndev->mvdev, "create_rqt\n"); > + goto err_rqt; > + } > + > + err = create_tir(ndev); > + if (err) { > + mlx5_vdpa_warn(&ndev->mvdev, "create_tir\n"); > + goto err_tir; > + } > + > + err = add_fwd_to_tir(ndev); > + if (err) { > + mlx5_vdpa_warn(&ndev->mvdev, "add_fwd_to_tir\n"); > + goto err_fwd; > + } > + ndev->setup = true; > + mutex_unlock(&ndev->reslock); > + > + return 0; > + > 
+err_fwd: > + destroy_tir(ndev); > +err_tir: > + destroy_rqt(ndev); > +err_rqt: > + teardown_virtqueues(ndev); > +out: > + mutex_unlock(&ndev->reslock); > + return err; > +} > + > +static void teardown_driver(struct mlx5_vdpa_net *ndev) > +{ > + mutex_lock(&ndev->reslock); > + if (!ndev->setup) > + goto out; > + > + remove_fwd_to_tir(ndev); > + destroy_tir(ndev); > + destroy_rqt(ndev); > + teardown_virtqueues(ndev); > + ndev->setup = false; > +out: > + mutex_unlock(&ndev->reslock); > +} > + > +static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + int err; > + > + print_status(mvdev, status, true); > + if (!status) { > + mlx5_vdpa_info(mvdev, "performing device reset\n"); > + teardown_driver(ndev); > + mlx5_vdpa_destroy_mr(&ndev->mvdev); > + ndev->mvdev.status = 0; > + ndev->mvdev.mlx_features = 0; > + ++mvdev->generation; > + return; > + } > + > + if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) { > + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { > + err = setup_driver(ndev); > + if (err) { > + mlx5_vdpa_warn(mvdev, "failed to setup driver\n"); > + goto err_setup; > + } > + } else { > + mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n"); > + return; > + } > + } > + > + ndev->mvdev.status = status; > + return; > + > +err_setup: > + mlx5_vdpa_destroy_mr(&ndev->mvdev); > + ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; > +} > + > +static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, > + unsigned int len) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + > + if (offset + len < sizeof(struct virtio_net_config)) > + memcpy(buf, &ndev->config + offset, len); Note that the guest expects LE; what is the endianness of ndev->config? > +} > + > +static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, 
const void *buf, > + unsigned int len) > +{ > + /* not supported */ > +} > + > +static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + > + return mvdev->generation; > +} > + > +static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb) > +{ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + bool change_map; > + int err; > + > + err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map); > + if (err) { > + mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err); > + return err; > + } > + > + if (change_map) > + return mlx5_vdpa_change_map(ndev, iotlb);Any reason for not doing this inside mlx5_handle_set_map()?> + > + return 0; > +} > + > +static void mlx5_vdpa_free(struct vdpa_device *vdev) > +{ > + /* not used */ > +} > + > +static const struct vdpa_config_ops mlx5_vdpa_ops = { > + .set_vq_address = mlx5_vdpa_set_vq_address, > + .set_vq_num = mlx5_vdpa_set_vq_num, > + .kick_vq = mlx5_vdpa_kick_vq, > + .set_vq_cb = mlx5_vdpa_set_vq_cb, > + .set_vq_ready = mlx5_vdpa_set_vq_ready, > + .get_vq_ready = mlx5_vdpa_get_vq_ready, > + .set_vq_state = mlx5_vdpa_set_vq_state, > + .get_vq_state = mlx5_vdpa_get_vq_state, > + .get_vq_align = mlx5_vdpa_get_vq_align, > + .get_features = mlx5_vdpa_get_features, > + .set_features = mlx5_vdpa_set_features, > + .set_config_cb = mlx5_vdpa_set_config_cb, > + .get_vq_num_max = mlx5_vdpa_get_vq_num_max, > + .get_device_id = mlx5_vdpa_get_device_id, > + .get_vendor_id = mlx5_vdpa_get_vendor_id, > + .get_status = mlx5_vdpa_get_status, > + .set_status = mlx5_vdpa_set_status, > + .get_config = mlx5_vdpa_get_config, > + .set_config = mlx5_vdpa_set_config, > + .get_generation = mlx5_vdpa_get_generation, > + .set_map = mlx5_vdpa_set_map, > + .free = mlx5_vdpa_free, > +}; > + > +static int alloc_resources(struct mlx5_vdpa_net *ndev) > +{ > + struct mlx5_vdpa_net_resources *res = &ndev->res; > + int err; > 
+ > + if (res->valid) { > + mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n"); > + return -EEXIST; > + } > + > + err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn); > + if (err) > + return err; > + > + err = create_tis(ndev); > + if (err) > + goto err_tis; > + > + res->valid = true; > + > + return 0; > + > +err_tis: > + mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn); > + return err; > +} > + > +static void free_resources(struct mlx5_vdpa_net *ndev) > +{ > + struct mlx5_vdpa_net_resources *res = &ndev->res; > + > + if (!res->valid) > + return; > + > + destroy_tis(ndev); > + mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn); > + res->valid = false; > +} > + > +static void init_mvqs(struct mlx5_vdpa_net *ndev) > +{ > + struct mlx5_vdpa_virtqueue *mvq; > + int i; > + > + for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) { > + mvq = &ndev->vqs[i]; > + memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri)); > + mvq->index = i; > + mvq->ndev = ndev; > + mvq->fwqp.fw = true; > + } > + for (; i < ndev->mvdev.max_vqs; i++) { > + mvq = &ndev->vqs[i]; > + memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri)); > + mvq->index = i; > + mvq->ndev = ndev; > + } > +} > + > +void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) > +{ > + struct virtio_net_config *config; > + struct mlx5_vdpa_dev *mvdev; > + struct mlx5_vdpa_net *ndev; > + u32 max_vqs; > + int err; > + > + /* we save one virtqueue for control virtqueue should we require it */ > + max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues); > + max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); > + > + ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, > + 2 * mlx5_vdpa_max_qps(max_vqs)); > + if (IS_ERR(ndev)) > + return ndev; > + > + ndev->mvdev.max_vqs = max_vqs; > + mvdev = &ndev->mvdev; > + mvdev->mdev = mdev; > + init_mvqs(ndev); > + mutex_init(&ndev->reslock); > + config = &ndev->config; > + err = 
mlx5_query_nic_vport_mtu(mdev, &config->mtu); > + if (err) > + goto err_mtu; > + > + err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac); > + if (err) > + goto err_mtu; > + > + mvdev->vdev.dma_dev = mdev->device; > + err = mlx5_vdpa_alloc_resources(&ndev->mvdev); > + if (err) > + goto err_mtu; > + > + err = alloc_resources(ndev); > + if (err) > + goto err_res; > + > + err = vdpa_register_device(&mvdev->vdev); > + if (err) > + goto err_reg; > + > + return ndev; > + > +err_reg: > + free_resources(ndev); > +err_res: > + mlx5_vdpa_free_resources(&ndev->mvdev); > +err_mtu: > + mutex_destroy(&ndev->reslock); > + put_device(&mvdev->vdev.dev); > + return ERR_PTR(err); > +} > + > +void mlx5_vdpa_remove_dev(struct mlx5_vdpa_dev *mvdev) > +{ > + struct mlx5_vdpa_net *ndev; > + > + mvdev->status = 0;This is probably unnecessary. Thanks> + ndev = container_of(mvdev, struct mlx5_vdpa_net, mvdev); > + vdpa_unregister_device(&ndev->mvdev.vdev); > + free_resources(ndev); > + mlx5_vdpa_free_resources(&ndev->mvdev); > + mutex_destroy(&ndev->reslock); > + put_device(&mvdev->vdev.dev); > +} > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.h b/drivers/vdpa/mlx5/net/mlx5_vnet.h > new file mode 100644 > index 000000000000..38bb9adaadc1 > --- /dev/null > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.h > @@ -0,0 +1,32 @@ > +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ > +/* Copyright (c) 2020 Mellanox Technologies Ltd. */ > + > +#ifndef __MLX5_VNET_H_ > +#define __MLX5_VNET_H_ > + > +#include <linux/vdpa.h> > +#include <linux/virtio_net.h> > +#include <linux/vringh.h> > +#include <linux/mlx5/driver.h> > +#include <linux/mlx5/cq.h> > +#include <linux/mlx5/qp.h> > +#include "../core/mlx5_vdpa.h" > + > +/* we want to have one virtqueue reserved for control queue. We do this > + * reservation only if we have more than two vqs available. 
> + */ > +static inline u32 mlx5_vdpa_max_qps(int max_vqs) > +{ > + if (max_vqs > 2) > + return (max_vqs - 1) / 2; > + > + return max_vqs / 2; > +} > + > +#define to_mlx5_vdpa_ndev(__mvdev) container_of(__mvdev, struct mlx5_vdpa_net, mvdev) > +void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev); > +void mlx5_vdpa_remove_dev(struct mlx5_vdpa_dev *mvdev); > +int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb); > +void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev); > + > +#endif /* __MLX5_VNET_H_ */