XDP socket (AF_XDP) is an excellent kernel-bypass networking framework.
The zero-copy feature of xsk (XDP socket) needs support from the driver,
and the performance of zero copy is very good.

ENV: QEMU with vhost.
                   vhost CPU | Guest APP CPU | Guest Softirq CPU |     PPS
-----------------------------|---------------|-------------------|---------
xmit by sockperf:     90%    |     100%      |                   |  318967
xmit by xsk:          100%   |      30%      |        33%        | 1192064
recv by sockperf:     100%   |      68%      |       100%        |  692288
recv by xsk:          100%   |      33%      |        43%        |  771670
Before implementing AF_XDP support in virtio-net, we first have to make
the virtio core support these features:

1. virtio core supports premapped buffers
2. virtio core supports per-queue reset
3. introduce DMA APIs to the virtio core
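
To make the usage concrete, here is a rough sketch of the driver-side
flow this series enables (umem_area, umem_size, frame_off, frame_len and
frame_token are illustrative names, not code from this series):

    struct device *dev = virtqueue_dma_dev(vq);
    struct scatterlist sg;
    dma_addr_t base;
    int err;

    /* Map the whole region once, up front. */
    base = dma_map_single(dev, umem_area, umem_size, DMA_BIDIRECTIONAL);
    if (dma_mapping_error(dev, base))
            return -ENOMEM;

    /* Per packet: no map/unmap. A non-zero sg.dma_address tells the
     * virtio core that the buffer is premapped. */
    sg_init_table(&sg, 1);
    sg.dma_address = base + frame_off;
    sg.length = frame_len;
    err = virtqueue_add_outbuf(vq, &sg, 1, frame_token, GFP_ATOMIC);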
Please review.
Thanks.
v2:
 1. use sgs[0]->dma_address to determine whether the buffers are premapped
 2. use extra.addr to decide whether to unmap non-indirect descs
 3. use indir_desc to decide whether to unmap indirect descs
 4. rename virtqueue_get_dma_dev to virtqueue_dma_dev

v1:
 1. expose the DMA device; do NOT introduce APIs for dma map and sync
 2. split some commits for easier review
Xuan Zhuo (12):
  virtio_ring: split: separate dma codes
  virtio_ring: packed: separate dma codes
  virtio_ring: packed-indirect: separate dma codes
  virtio_ring: split: support premapped
  virtio_ring: packed: support premapped
  virtio_ring: split-indirect: support premapped
  virtio_ring: packed-indirect: support premapped
  virtio_ring: update document for virtqueue_add_*
  virtio_ring: introduce virtqueue_dma_dev()
  virtio_ring: correct the expression of the description of
    virtqueue_resize()
  virtio_ring: separate the logic of reset/enable from virtqueue_resize
  virtio_ring: introduce virtqueue_reset()
 drivers/virtio/virtio.c      |   6 +
 drivers/virtio/virtio_ring.c | 354 +++++++++++++++++++++++++----------
 include/linux/virtio.h       |   4 +
 3 files changed, 260 insertions(+), 104 deletions(-)
--
2.32.0.3.g01195cf9f
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 01/12] virtio_ring: split: separate dma codes
DMA-related logic is separated out of virtqueue_add_split() into a new
function. The DMA address is saved as sg->dma_address, and
virtqueue_add_split() uses it directly, which makes the unmap operation
simpler.

The purpose of this is to facilitate subsequent support for receiving
DMA addresses mapped by the drivers.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 110 ++++++++++++++++++++++++++---------
 1 file changed, 82 insertions(+), 28 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 41144b5246a8..8ace2f503953 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -520,6 +520,77 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
 	return next;
 }
 
+static void virtqueue_unmap_sgs(struct vring_virtqueue *vq,
+				struct scatterlist *sgs[],
+				unsigned int total_sg,
+				unsigned int out_sgs,
+				unsigned int in_sgs)
+{
+	struct scatterlist *sg;
+	unsigned int n;
+
+	if (!vq->use_dma_api)
+		return;
+
+	for (n = 0; n < out_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			if (!sg->dma_address)
+				return;
+
+			dma_unmap_page(vring_dma_dev(vq), sg->dma_address,
+				       sg->length, DMA_TO_DEVICE);
+		}
+	}
+
+	for (; n < (out_sgs + in_sgs); n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			if (!sg->dma_address)
+				return;
+
+			dma_unmap_page(vring_dma_dev(vq), sg->dma_address,
+				       sg->length, DMA_FROM_DEVICE);
+		}
+	}
+}
+
+static int virtqueue_map_sgs(struct vring_virtqueue *vq,
+			     struct scatterlist *sgs[],
+			     unsigned int total_sg,
+			     unsigned int out_sgs,
+			     unsigned int in_sgs)
+{
+	struct scatterlist *sg;
+	unsigned int n;
+
+	for (n = 0; n < out_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
+
+			if (vring_mapping_error(vq, addr))
+				goto err;
+
+			sg->dma_address = addr;
+		}
+	}
+
+	for (; n < (out_sgs + in_sgs); n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
+
+			if (vring_mapping_error(vq, addr))
+				goto err;
+
+			sg->dma_address = addr;
+		}
+	}
+
+	return 0;
+
+err:
+	virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
+	return -ENOMEM;
+}
+
 static inline int virtqueue_add_split(struct virtqueue *_vq,
 				      struct scatterlist *sgs[],
 				      unsigned int total_sg,
@@ -532,9 +603,9 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
 	struct vring_desc *desc;
-	unsigned int i, n, avail, descs_used, prev, err_idx;
-	int head;
+	unsigned int i, n, avail, descs_used, prev;
 	bool indirect;
+	int head;
 
 	START_USE(vq);
 
@@ -586,32 +657,30 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 		return -ENOSPC;
 	}
 
+	if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
+		return -ENOMEM;
+
 	for (n = 0; n < out_sgs; n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
-			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
-			if (vring_mapping_error(vq, addr))
-				goto unmap_release;
-
 			prev = i;
 			/* Note that we trust indirect descriptor
 			 * table since it use stream DMA mapping.
 			 */
-			i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length,
+			i = virtqueue_add_desc_split(_vq, desc, i,
+						     sg->dma_address,
+						     sg->length,
 						     VRING_DESC_F_NEXT,
 						     indirect);
 		}
 	}
 	for (; n < (out_sgs + in_sgs); n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
-			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
-			if (vring_mapping_error(vq, addr))
-				goto unmap_release;
-
 			prev = i;
 			/* Note that we trust indirect descriptor
 			 * table since it use stream DMA mapping.
 			 */
-			i = virtqueue_add_desc_split(_vq, desc, i, addr,
+			i = virtqueue_add_desc_split(_vq, desc, i,
+						     sg->dma_address,
 						     sg->length,
 						     VRING_DESC_F_NEXT |
 						     VRING_DESC_F_WRITE,
@@ -679,22 +748,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 	return 0;
 
 unmap_release:
-	err_idx = i;
-
-	if (indirect)
-		i = 0;
-	else
-		i = head;
-
-	for (n = 0; n < total_sg; n++) {
-		if (i == err_idx)
-			break;
-		if (indirect) {
-			vring_unmap_one_split_indirect(vq, &desc[i]);
-			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
-		} else
-			i = vring_unmap_one_split(vq, i);
-	}
+	virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
 
 	if (indirect)
 		kfree(desc);
-- 
2.32.0.3.g01195cf9f
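
One subtlety worth noting about the pair above (a sketch with
illustrative buf0/buf1 names, not part of the patch):
virtqueue_unmap_sgs() stops at the first sg whose dma_address is zero.
That is safe because virtqueue_map_sgs() fills the addresses in
submission order and sg_init_table() zero-initializes the entries, so on
failure everything after the last successfully mapped sg still carries 0:

    struct scatterlist sg[2], *sgs[1] = { sg };

    sg_init_table(sg, 2);           /* memset()s: dma_address == 0 */
    sg_set_buf(&sg[0], buf0, len0);
    sg_set_buf(&sg[1], buf1, len1);

    /* If mapping sg[1] fails inside virtqueue_map_sgs(), only
     * sg[0].dma_address is non-zero; the unwind unmaps sg[0] and
     * returns as soon as it reaches sg[1]'s zero dma_address. */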
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 02/12] virtio_ring: packed: separate dma codes
DMA-related logic is separated out of virtqueue_add_packed(). The DMA
address is saved as sg->dma_address, and virtqueue_add_packed() uses it
directly, which makes the unmap operation simpler.

The purpose of this is to facilitate subsequent support for receiving
DMA addresses mapped by the drivers.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 37 +++++++-----------------------------
 1 file changed, 7 insertions(+), 30 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 8ace2f503953..b4beb51072f7 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1419,9 +1419,9 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct vring_packed_desc *desc;
 	struct scatterlist *sg;
-	unsigned int i, n, c, descs_used, err_idx;
+	unsigned int i, n, c, descs_used;
 	__le16 head_flags, flags;
-	u16 head, id, prev, curr, avail_used_flags;
+	u16 head, id, prev, curr;
 	int err;
 
 	START_USE(vq);
@@ -1450,7 +1450,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	}
 
 	head = vq->packed.next_avail_idx;
-	avail_used_flags = vq->packed.avail_used_flags;
 
 	WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect);
 
@@ -1468,15 +1467,13 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	id = vq->free_head;
 	BUG_ON(id == vq->packed.vring.num);
 
+	if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
+		return -ENOMEM;
+
 	curr = id;
 	c = 0;
 	for (n = 0; n < out_sgs + in_sgs; n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
-			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
-					DMA_TO_DEVICE : DMA_FROM_DEVICE);
-			if (vring_mapping_error(vq, addr))
-				goto unmap_release;
-
 			flags = cpu_to_le16(vq->packed.avail_used_flags |
 				    (++c == total_sg ? 0 : VRING_DESC_F_NEXT) |
 				    (n < out_sgs ? 0 : VRING_DESC_F_WRITE));
@@ -1485,12 +1482,12 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 			else
 				desc[i].flags = flags;
 
-			desc[i].addr = cpu_to_le64(addr);
+			desc[i].addr = cpu_to_le64(sg->dma_address);
 			desc[i].len = cpu_to_le32(sg->length);
 			desc[i].id = cpu_to_le16(id);
 
 			if (unlikely(vq->use_dma_api)) {
-				vq->packed.desc_extra[curr].addr = addr;
+				vq->packed.desc_extra[curr].addr = sg->dma_address;
 				vq->packed.desc_extra[curr].len = sg->length;
 				vq->packed.desc_extra[curr].flags =
 					le16_to_cpu(flags);
@@ -1536,26 +1533,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	END_USE(vq);
 
 	return 0;
-
-unmap_release:
-	err_idx = i;
-	i = head;
-	curr = vq->free_head;
-
-	vq->packed.avail_used_flags = avail_used_flags;
-
-	for (n = 0; n < total_sg; n++) {
-		if (i == err_idx)
-			break;
-		vring_unmap_extra_packed(vq, &vq->packed.desc_extra[curr]);
-		curr = vq->packed.desc_extra[curr].next;
-		i++;
-		if (i >= vq->packed.vring.num)
-			i = 0;
-	}
-
-	END_USE(vq);
-	return -EIO;
 }
 
 static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
-- 
2.32.0.3.g01195cf9f
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 03/12] virtio_ring: packed-indirect: separate dma codes
DMA-related logic is separated out of virtqueue_add_indirect_packed().
The DMA address is saved as sg->dma_address, and
virtqueue_add_indirect_packed() uses it directly, which makes the unmap
operation simpler.

The purpose of this is to facilitate subsequent support for receiving
DMA addresses mapped by the drivers.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index b4beb51072f7..221ff54fe58b 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1303,7 +1303,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 {
 	struct vring_packed_desc *desc;
 	struct scatterlist *sg;
-	unsigned int i, n, err_idx;
+	unsigned int i, n;
 	u16 head, id;
 	dma_addr_t addr;
 
@@ -1323,16 +1323,14 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	id = vq->free_head;
 	BUG_ON(id == vq->packed.vring.num);
 
+	if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
+		return -ENOMEM;
+
 	for (n = 0; n < out_sgs + in_sgs; n++) {
 		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
-			addr = vring_map_one_sg(vq, sg, n < out_sgs ?
-					DMA_TO_DEVICE : DMA_FROM_DEVICE);
-			if (vring_mapping_error(vq, addr))
-				goto unmap_release;
-
 			desc[i].flags = cpu_to_le16(n < out_sgs ?
 						0 : VRING_DESC_F_WRITE);
-			desc[i].addr = cpu_to_le64(addr);
+			desc[i].addr = cpu_to_le64(sg->dma_address);
 			desc[i].len = cpu_to_le32(sg->length);
 			i++;
 		}
@@ -1396,10 +1394,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	return 0;
 
 unmap_release:
-	err_idx = i;
-
-	for (i = 0; i < err_idx; i++)
-		vring_unmap_desc_packed(vq, &desc[i]);
+	virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
 
 	kfree(desc);
 
-- 
2.32.0.3.g01195cf9f
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 04/12] virtio_ring: split: support premapped
The virtio core only accepts virtual addresses and does the DMA mapping
itself.

In some scenarios (such as AF_XDP), the memory is allocated and the DMA
mapping is done in advance, so it is necessary for us to support passing
the DMA address to the virtio core.

Drivers can use sg->dma_address to pass the mapped DMA address to the
virtio core. If one sg->dma_address is used, then all sgs must use
sg->dma_address; otherwise all dma_address must be NULL.

On the non-indirect path, if dma_address is used, extra.addr is set to
DMA_MAPPING_ERROR, so we can skip those entries when unmapping.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 221ff54fe58b..61deaf0a4faf 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -457,6 +457,9 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
 				 (flags & VRING_DESC_F_WRITE) ?
 				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
 	} else {
+		if (extra[i].addr == DMA_MAPPING_ERROR)
+			goto out;
+
 		dma_unmap_page(vring_dma_dev(vq),
 			       extra[i].addr,
 			       extra[i].len,
@@ -497,7 +500,8 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
 						    dma_addr_t addr,
 						    unsigned int len,
 						    u16 flags,
-						    bool indirect)
+						    bool indirect,
+						    bool do_map)
 {
 	struct vring_virtqueue *vring = to_vvq(vq);
 	struct vring_desc_extra *extra = vring->split.desc_extra;
@@ -511,7 +515,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
 		next = extra[i].next;
 		desc[i].next = cpu_to_virtio16(vq->vdev, next);
 
-		extra[i].addr = addr;
+		extra[i].addr = do_map ? addr : DMA_MAPPING_ERROR;
 		extra[i].len = len;
 		extra[i].flags = flags;
 	} else
@@ -604,7 +608,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 	struct scatterlist *sg;
 	struct vring_desc *desc;
 	unsigned int i, n, avail, descs_used, prev;
-	bool indirect;
+	bool indirect, do_map;
 	int head;
 
 	START_USE(vq);
@@ -657,7 +661,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 		return -ENOSPC;
 	}
 
-	if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
+	do_map = !sgs[0]->dma_address;
+	if (do_map && virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
 		return -ENOMEM;
 
 	for (n = 0; n < out_sgs; n++) {
@@ -670,7 +675,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 						     sg->dma_address,
 						     sg->length,
 						     VRING_DESC_F_NEXT,
-						     indirect);
+						     indirect, do_map);
 		}
 	}
 	for (; n < (out_sgs + in_sgs); n++) {
@@ -684,7 +689,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 						     sg->length,
 						     VRING_DESC_F_NEXT |
 						     VRING_DESC_F_WRITE,
-						     indirect);
+						     indirect, do_map);
 		}
 	}
 	/* Last one doesn't continue. */
@@ -705,7 +710,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 					 head, addr,
 					 total_sg * sizeof(struct vring_desc),
 					 VRING_DESC_F_INDIRECT,
-					 false);
+					 false, true);
 	}
 
 	/* We're using some buffers from the free list. */
@@ -748,7 +753,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 	return 0;
 
 unmap_release:
-	virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
+	if (do_map)
+		virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
 
 	if (indirect)
 		kfree(desc);
-- 
2.32.0.3.g01195cf9f
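
Note that only sgs[0]->dma_address is inspected (do_map =
!sgs[0]->dma_address), so the all-or-nothing rule is entirely the
caller's responsibility. A hypothetical debug helper, not part of this
series, that a driver could use to assert the contract:

    static bool sgs_premapped_consistent(struct scatterlist *sgs[],
                                         unsigned int out_sgs,
                                         unsigned int in_sgs)
    {
            bool premapped = !!sgs[0]->dma_address;
            struct scatterlist *sg;
            unsigned int n;

            for (n = 0; n < out_sgs + in_sgs; n++)
                    for (sg = sgs[n]; sg; sg = sg_next(sg))
                            if (!!sg->dma_address != premapped)
                                    return false;   /* mixed: not allowed */
            return true;
    }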
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 05/12] virtio_ring: packed: support premapped
The virtio core only accepts virtual addresses and does the DMA mapping
itself.

In some scenarios (such as AF_XDP), the memory is allocated and the DMA
mapping is done in advance, so it is necessary for us to support passing
the DMA address to the virtio core.

Drivers can use sg->dma_address to pass the mapped DMA address to the
virtio core. If one sg->dma_address is used, then all sgs must use
sg->dma_address; otherwise all dma_address must be NULL.

On the non-indirect path, if dma_address is used, extra.addr is set to
DMA_MAPPING_ERROR, so we can skip those entries when unmapping.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 61deaf0a4faf..66a071e3bdef 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1258,6 +1258,9 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
 				 (flags & VRING_DESC_F_WRITE) ?
 				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
 	} else {
+		if (extra->addr == DMA_MAPPING_ERROR)
+			return;
+
 		dma_unmap_page(vring_dma_dev(vq),
 			       extra->addr, extra->len,
 			       (flags & VRING_DESC_F_WRITE) ?
@@ -1423,6 +1426,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	unsigned int i, n, c, descs_used;
 	__le16 head_flags, flags;
 	u16 head, id, prev, curr;
+	bool do_map;
 	int err;
 
 	START_USE(vq);
@@ -1468,7 +1472,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	id = vq->free_head;
 	BUG_ON(id == vq->packed.vring.num);
 
-	if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
+	do_map = !sgs[0]->dma_address;
+	if (do_map && virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
 		return -ENOMEM;
 
 	curr = id;
@@ -1488,7 +1493,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 			desc[i].id = cpu_to_le16(id);
 
 			if (unlikely(vq->use_dma_api)) {
-				vq->packed.desc_extra[curr].addr = sg->dma_address;
+				vq->packed.desc_extra[curr].addr =
+					do_map ? sg->dma_address : DMA_MAPPING_ERROR;
 				vq->packed.desc_extra[curr].len = sg->length;
 				vq->packed.desc_extra[curr].flags =
 					le16_to_cpu(flags);
-- 
2.32.0.3.g01195cf9f
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 06/12] virtio_ring: split-indirect: support premapped
The virtio core only accepts virtual addresses and does the DMA mapping
itself.

In some scenarios (such as AF_XDP), the memory is allocated and the DMA
mapping is done in advance, so it is necessary for us to support passing
the DMA address to the virtio core.

Drivers can use sg->dma_address to pass the mapped DMA address to the
virtio core. If one sg->dma_address is used, then all sgs must use
sg->dma_address; otherwise all dma_address must be NULL.

On the indirect path, if dma_address is used, desc_state.indir_desc is
tagged with VRING_INDIRECT_PREMAPPED, so we can skip the indirect table
when unmapping.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 66a071e3bdef..11827d2e56a8 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -231,6 +231,18 @@ static void vring_free(struct virtqueue *_vq);
  * Helpers.
  */
 
+#define VRING_INDIRECT_PREMAPPED  BIT(0)
+
+#define desc_mix_dma_map(do_map, desc) \
+	(do_map ? desc : (typeof(desc))((unsigned long)(desc) | VRING_INDIRECT_PREMAPPED))
+
+#define desc_rm_dma_map(desc) \
+	((typeof(desc))((unsigned long)(desc) & ~VRING_INDIRECT_PREMAPPED))
+
+#define desc_map_inter(desc) \
+	!((unsigned long)(desc) & VRING_INDIRECT_PREMAPPED)
+
+
 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
 
 static inline bool virtqueue_use_indirect(struct vring_virtqueue *vq,
@@ -725,7 +737,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 	/* Store token and indirect buffer state. */
 	vq->split.desc_state[head].data = data;
 	if (indirect)
-		vq->split.desc_state[head].indir_desc = desc;
+		vq->split.desc_state[head].indir_desc = desc_mix_dma_map(do_map, desc);
 	else
 		vq->split.desc_state[head].indir_desc = ctx;
 
@@ -820,22 +832,26 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
 	vq->vq.num_free++;
 
 	if (vq->indirect) {
-		struct vring_desc *indir_desc =
-				vq->split.desc_state[head].indir_desc;
+		struct vring_desc *mix = vq->split.desc_state[head].indir_desc;
+		struct vring_desc *indir_desc;
 		u32 len;
 
 		/* Free the indirect table, if any, now that it's unmapped. */
-		if (!indir_desc)
+		if (!mix)
 			return;
 
+		indir_desc = desc_rm_dma_map(mix);
+
 		len = vq->split.desc_extra[head].len;
 
 		BUG_ON(!(vq->split.desc_extra[head].flags &
 				VRING_DESC_F_INDIRECT));
 		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
 
-		for (j = 0; j < len / sizeof(struct vring_desc); j++)
-			vring_unmap_one_split_indirect(vq, &indir_desc[j]);
+		if (desc_map_inter(mix)) {
+			for (j = 0; j < len / sizeof(struct vring_desc); j++)
+				vring_unmap_one_split_indirect(vq, &indir_desc[j]);
+		}
 
 		kfree(indir_desc);
 		vq->split.desc_state[head].indir_desc = NULL;
-- 
2.32.0.3.g01195cf9f
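
The desc_mix_dma_map()/desc_rm_dma_map() macros above rely on pointer
tagging: the indirect table comes from kmalloc(), whose results are at
least pointer-aligned, so bit 0 of the pointer is guaranteed to be zero
and can carry the premapped flag. A standalone sketch of the same
technique (illustrative names only):

    #define TAG_PREMAPPED	BIT(0)

    struct vring_desc *desc = kmalloc_array(total_sg, sizeof(*desc),
                                            GFP_ATOMIC);
    unsigned long tagged;
    bool premapped;

    /* bit 0 of a kmalloc()ed pointer is always zero, so it is free. */
    tagged = (unsigned long)desc | TAG_PREMAPPED;

    /* Later, recover both pieces. */
    desc = (struct vring_desc *)(tagged & ~TAG_PREMAPPED);
    premapped = tagged & TAG_PREMAPPED;	/* if set, skip the unmap loop */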
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 07/12] virtio_ring: packed-indirect: support premapped
The virtio core only accepts virtual addresses and does the DMA mapping
itself.

In some scenarios (such as AF_XDP), the memory is allocated and the DMA
mapping is done in advance, so it is necessary for us to support passing
the DMA address to the virtio core.

Drivers can use sg->dma_address to pass the mapped DMA address to the
virtio core. If one sg->dma_address is used, then all sgs must use
sg->dma_address; otherwise all dma_address must be NULL.

On the indirect path, if dma_address is used, desc_state.indir_desc is
tagged with VRING_INDIRECT_PREMAPPED, so we can skip the indirect table
when unmapping.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 11827d2e56a8..b23d301effb5 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1331,6 +1331,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	unsigned int i, n;
 	u16 head, id;
 	dma_addr_t addr;
+	bool do_map;
 
 	head = vq->packed.next_avail_idx;
 	desc = alloc_indirect_packed(total_sg, gfp);
@@ -1348,7 +1349,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	id = vq->free_head;
 	BUG_ON(id == vq->packed.vring.num);
 
-	if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
+	do_map = !sgs[0]->dma_address;
+	if (do_map && virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
 		return -ENOMEM;
 
 	for (n = 0; n < out_sgs + in_sgs; n++) {
@@ -1408,7 +1410,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	/* Store token and indirect buffer state. */
 	vq->packed.desc_state[id].num = 1;
 	vq->packed.desc_state[id].data = data;
-	vq->packed.desc_state[id].indir_desc = desc;
+	vq->packed.desc_state[id].indir_desc = desc_mix_dma_map(do_map, desc);
 	vq->packed.desc_state[id].last = id;
 
 	vq->num_added += 1;
@@ -1419,7 +1421,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	return 0;
 
 unmap_release:
-	virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
+	if (do_map)
+		virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
 
 	kfree(desc);
 
@@ -1633,14 +1636,17 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
 	}
 
 	if (vq->indirect) {
+		struct vring_packed_desc *mix;
 		u32 len;
 
 		/* Free the indirect table, if any, now that it's unmapped. */
-		desc = state->indir_desc;
-		if (!desc)
+		mix = state->indir_desc;
+		if (!mix)
 			return;
 
-		if (vq->use_dma_api) {
+		desc = desc_rm_dma_map(mix);
+
+		if (vq->use_dma_api && desc_map_inter(mix)) {
 			len = vq->packed.desc_extra[id].len;
 			for (i = 0; i < len / sizeof(struct vring_packed_desc);
 					i++)
-- 
2.32.0.3.g01195cf9f
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 08/12] virtio_ring: update document for virtqueue_add_*
Update the documentation of the virtqueue_add_* series of APIs, allowing
the callers to use sg->dma_address to pass the DMA address to the virtio
core.

Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index b23d301effb5..216ac8654982 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2190,6 +2190,10 @@ static inline int virtqueue_add(struct virtqueue *_vq,
  * Caller must ensure we don't call this with other virtqueue operations
  * at the same time (except where noted).
  *
+ * If the caller has done dma map then use sg->dma_address to pass dma address.
+ * If one sg->dma_address is used, then all sgs must use sg->dma_address;
+ * otherwise all sg->dma_address must be NULL.
+ *
  * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
  */
 int virtqueue_add_sgs(struct virtqueue *_vq,
@@ -2224,6 +2228,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
  * Caller must ensure we don't call this with other virtqueue operations
  * at the same time (except where noted).
  *
+ * If the caller has done dma map then use sg->dma_address to pass dma address.
+ * If one sg->dma_address is used, then all sgs must use sg->dma_address;
+ * otherwise all sg->dma_address must be NULL.
+ *
  * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
  */
 int virtqueue_add_outbuf(struct virtqueue *vq,
@@ -2246,6 +2254,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
  * Caller must ensure we don't call this with other virtqueue operations
  * at the same time (except where noted).
  *
+ * If the caller has done dma map then use sg->dma_address to pass dma address.
+ * If one sg->dma_address is used, then all sgs must use sg->dma_address;
+ * otherwise all sg->dma_address must be NULL.
+ *
  * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
  */
 int virtqueue_add_inbuf(struct virtqueue *vq,
@@ -2269,6 +2281,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
  * Caller must ensure we don't call this with other virtqueue operations
  * at the same time (except where noted).
  *
+ * If the caller has done dma map then use sg->dma_address to pass dma address.
+ * If one sg->dma_address is used, then all sgs must use sg->dma_address;
+ * otherwise all sg->dma_address must be NULL.
+ *
  * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
  */
 int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
-- 
2.32.0.3.g01195cf9f
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 09/12] virtio_ring: introduce virtqueue_dma_dev()
Add virtqueue_dma_dev() to get the DMA device for virtio, so that the
caller can do DMA operations in advance. The purpose is to keep memory
mapped across multiple add/get buf operations.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
Acked-by: Jason Wang <jasowang at redhat.com>
---
 drivers/virtio/virtio.c      |  6 ++++++
 drivers/virtio/virtio_ring.c | 17 +++++++++++++++++
 include/linux/virtio.h       |  2 ++
 3 files changed, 25 insertions(+)
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 3893dc29eb26..11c5035369e2 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
+#include <linux/dma-mapping.h>
 #include <linux/virtio.h>
 #include <linux/spinlock.h>
 #include <linux/virtio_config.h>
@@ -243,6 +244,11 @@ static int virtio_dev_probe(struct device *_d)
 	u64 driver_features;
 	u64 driver_features_legacy;
 
+	_d->dma_mask = &_d->coherent_dma_mask;
+	err = dma_set_mask_and_coherent(_d, DMA_BIT_MASK(64));
+	if (err)
+		return err;
+
 	/* We have a driver! */
 	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
 
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 216ac8654982..f63637c288a0 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2297,6 +2297,23 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
 
+/**
+ * virtqueue_dma_dev - get the dma dev
+ * @_vq: the struct virtqueue we're talking about.
+ *
+ * Returns the dma dev, which can be used with the DMA API.
+ */
+struct device *virtqueue_dma_dev(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (vq->use_dma_api)
+		return vring_dma_dev(vq);
+	else
+		return &vq->vq.vdev->dev;
+}
+EXPORT_SYMBOL_GPL(virtqueue_dma_dev);
+
 /**
  * virtqueue_kick_prepare - first half of split virtqueue_kick call.
  * @_vq: the struct virtqueue
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 2b472514c49b..1fa50191cf0a 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -61,6 +61,8 @@ int virtqueue_add_sgs(struct virtqueue *vq,
 		      void *data,
 		      gfp_t gfp);
 
+struct device *virtqueue_dma_dev(struct virtqueue *vq);
+
 bool virtqueue_kick(struct virtqueue *vq);
 
 bool virtqueue_kick_prepare(struct virtqueue *vq);
-- 
2.32.0.3.g01195cf9f
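
A hedged usage sketch (buf and size are illustrative): all premapped
addresses must be mapped against the device returned here, and the
mapping can then outlive any number of add/get cycles:

    struct device *dma_dev = virtqueue_dma_dev(vq);
    dma_addr_t addr;

    addr = dma_map_single(dma_dev, buf, size, DMA_FROM_DEVICE);
    if (dma_mapping_error(dma_dev, addr))
            return -ENOMEM;

    /* ... any number of virtqueue_add_* / virtqueue_get_buf() cycles,
     * passing addr via sg->dma_address and syncing with
     * dma_sync_single_for_cpu()/dma_sync_single_for_device() ... */

    dma_unmap_single(dma_dev, addr, size, DMA_FROM_DEVICE);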
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 10/12] virtio_ring: correct the expression of the description of virtqueue_resize()
Modify the "useless" to a more accurate "unused". Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com> Acked-by: Jason Wang <jasowang at redhat.com> --- drivers/virtio/virtio_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f63637c288a0..a705485fea47 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2702,7 +2702,7 @@ EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); * virtqueue_resize - resize the vring of vq * @_vq: the struct virtqueue we're talking about. * @num: new ring num - * @recycle: callback for recycle the useless buffer + * @recycle: callback to recycle unused buffers * * When it is really necessary to create a new vring, it will set the current vq * into the reset state. Then call the passed callback to recycle the buffer -- 2.32.0.3.g01195cf9f
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 11/12] virtio_ring: separate the logic of reset/enable from virtqueue_resize
The subsequent reset function will reuse this logic.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 58 ++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 19 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a705485fea47..f26bd7bbff5e 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2156,6 +2156,43 @@ static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num)
 	return -ENOMEM;
 }
 
+static int virtqueue_disable_and_recycle(struct virtqueue *_vq,
+					 void (*recycle)(struct virtqueue *vq, void *buf))
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct virtio_device *vdev = vq->vq.vdev;
+	void *buf;
+	int err;
+
+	if (!vq->we_own_ring)
+		return -EPERM;
+
+	if (!vdev->config->disable_vq_and_reset)
+		return -ENOENT;
+
+	if (!vdev->config->enable_vq_after_reset)
+		return -ENOENT;
+
+	err = vdev->config->disable_vq_and_reset(_vq);
+	if (err)
+		return err;
+
+	while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL)
+		recycle(_vq, buf);
+
+	return 0;
+}
+
+static int virtqueue_enable_after_reset(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct virtio_device *vdev = vq->vq.vdev;
+
+	if (vdev->config->enable_vq_after_reset(_vq))
+		return -EBUSY;
+
+	return 0;
+}
 
 /*
  * Generic functions and exported symbols.
@@ -2726,13 +2763,8 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
 		     void (*recycle)(struct virtqueue *vq, void *buf))
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	struct virtio_device *vdev = vq->vq.vdev;
-	void *buf;
 	int err;
 
-	if (!vq->we_own_ring)
-		return -EPERM;
-
 	if (num > vq->vq.num_max)
 		return -E2BIG;
 
@@ -2742,28 +2774,16 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
 	if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num)
 		return 0;
 
-	if (!vdev->config->disable_vq_and_reset)
-		return -ENOENT;
-
-	if (!vdev->config->enable_vq_after_reset)
-		return -ENOENT;
-
-	err = vdev->config->disable_vq_and_reset(_vq);
+	err = virtqueue_disable_and_recycle(_vq, recycle);
 	if (err)
 		return err;
 
-	while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL)
-		recycle(_vq, buf);
-
 	if (vq->packed_ring)
 		err = virtqueue_resize_packed(_vq, num);
 	else
 		err = virtqueue_resize_split(_vq, num);
 
-	if (vdev->config->enable_vq_after_reset(_vq))
-		return -EBUSY;
-
-	return err;
+	return virtqueue_enable_after_reset(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_resize);
 
-- 
2.32.0.3.g01195cf9f
Xuan Zhuo
2023-Mar-08  06:44 UTC
[PATCH vhost v2 12/12] virtio_ring: introduce virtqueue_reset()
Introduce virtqueue_reset() to release all buffers inside the vq.
Signed-off-by: Xuan Zhuo <xuanzhuo at linux.alibaba.com>
---
 drivers/virtio/virtio_ring.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/virtio.h       |  2 ++
 2 files changed, 35 insertions(+)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index f26bd7bbff5e..1a8de916bb20 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2787,6 +2787,39 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
 }
 EXPORT_SYMBOL_GPL(virtqueue_resize);
 
+/**
+ * virtqueue_reset - detach and recycle all unused buffers
+ * @_vq: the struct virtqueue we're talking about.
+ * @recycle: callback to recycle unused buffers
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error.
+ * 0: success.
+ * -EBUSY: Failed to sync with device, vq may not work properly
+ * -ENOENT: Transport or device not supported
+ * -EPERM: Operation not permitted
+ */
+int virtqueue_reset(struct virtqueue *_vq,
+		    void (*recycle)(struct virtqueue *vq, void *buf))
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	int err;
+
+	err = virtqueue_disable_and_recycle(_vq, recycle);
+	if (err)
+		return err;
+
+	if (vq->packed_ring)
+		virtqueue_reinit_packed(vq);
+	else
+		virtqueue_reinit_split(vq);
+
+	return virtqueue_enable_after_reset(_vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_reset);
+
 /* Only available for split ring */
 struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      unsigned int num,
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 1fa50191cf0a..22bbd06ef8c8 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -97,6 +97,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
 
 int virtqueue_resize(struct virtqueue *vq, u32 num,
 		     void (*recycle)(struct virtqueue *vq, void *buf));
+int virtqueue_reset(struct virtqueue *vq,
+		    void (*recycle)(struct virtqueue *vq, void *buf));
 
 /**
  * struct virtio_device - representation of a device using virtio
-- 
2.32.0.3.g01195cf9f
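
For illustration, a driver detaching an AF_XDP socket could reset the
queue to reclaim every outstanding buffer (a sketch; virtnet_xsk_free_buf()
and virtnet_xsk_detach() are hypothetical):

    static void virtnet_xsk_free_buf(struct virtqueue *vq, void *buf)
    {
            /* hand the buffer back to whichever pool owns it */
    }

    static int virtnet_xsk_detach(struct virtqueue *vq)
    {
            /* detaches all unused buffers, then re-enables the empty vq;
             * may fail with -EBUSY, -ENOENT or -EPERM as documented above */
            return virtqueue_reset(vq, virtnet_xsk_free_buf);
    }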
Michael S. Tsirkin
2023-Mar-10  09:05 UTC
[PATCH vhost v2 00/12] virtio core prepares for AF_XDP
On Wed, Mar 08, 2023 at 02:44:31PM +0800, Xuan Zhuo wrote:
> XDP socket (AF_XDP) is an excellent kernel-bypass networking framework.
> The zero-copy feature of xsk (XDP socket) needs support from the driver,
> and the performance of zero copy is very good.
>
> ENV: QEMU with vhost.
>
>                    vhost CPU | Guest APP CPU | Guest Softirq CPU |     PPS
> -----------------------------|---------------|-------------------|---------
> xmit by sockperf:     90%    |     100%      |                   |  318967
> xmit by xsk:          100%   |      30%      |        33%        | 1192064
> recv by sockperf:     100%   |      68%      |       100%        |  692288
> recv by xsk:          100%   |      33%      |        43%        |  771670
>
> Before implementing AF_XDP support in virtio-net, we first have to make
> the virtio core support these features:
>
> 1. virtio core supports premapped buffers
> 2. virtio core supports per-queue reset
> 3. introduce DMA APIs to the virtio core
>
> Please review.

Jason can I get some acks on this?

> Thanks.