Andy Lutomirski
2014-Aug-27  21:50 UTC
[PATCH v2 0/4] virtio: Clean up scatterlists and use the DMA API
This fixes virtio on Xen guests as well as on any other platform
that uses virtio_pci on which physical addresses don't match bus
addresses.
This can be tested with:
    virtme-run --xen xen --kimg arch/x86/boot/bzImage --console
using virtme from here:
    https://git.kernel.org/cgit/utils/kernel/virtme/virtme.git
Without these patches, the guest hangs forever.  With these patches,
everything works.
This should be safe on all platforms that I'm aware of.  That
doesn't mean that there isn't anything that I missed.
Changes from v1:
 - Using the DMA API is optional now.  It would be nice to improve the
   DMA API to the point that it could be used unconditionally, but s390
   proves that we're not there yet.
 - Includes patch 4, which fixes DMA debugging warnings from virtio_net.
Andy Lutomirski (4):
  virtio_ring: Remove sg_next indirection
  virtio_ring: Support DMA APIs if requested
  virtio_pci: Use the DMA API for virtqueues
  virtio_net: Stop doing DMA from the stack
 drivers/lguest/lguest_device.c         |   3 +-
 drivers/misc/mic/card/mic_virtio.c     |   2 +-
 drivers/net/virtio_net.c               |  53 +++++---
 drivers/remoteproc/remoteproc_virtio.c |   4 +-
 drivers/s390/kvm/kvm_virtio.c          |   2 +-
 drivers/s390/kvm/virtio_ccw.c          |   4 +-
 drivers/virtio/virtio_mmio.c           |   5 +-
 drivers/virtio/virtio_pci.c            |  35 ++++--
 drivers/virtio/virtio_ring.c           | 219 ++++++++++++++++++++++++++-------
 include/linux/virtio_ring.h            |   1 +
 tools/virtio/linux/virtio.h            |   1 +
 tools/virtio/virtio_test.c             |   2 +-
 tools/virtio/vringh_test.c             |   3 +-
 13 files changed, 253 insertions(+), 81 deletions(-)
-- 
1.9.3
Andy Lutomirski
2014-Aug-27  21:50 UTC
[PATCH v2 1/4] virtio_ring: Remove sg_next indirection
The only unusual thing about virtio's use of scatterlists is that
two of the APIs accept scatterlists that might not be terminated.
Using function pointers to handle this case is overkill; for_each_sg
can do it.
There's a small subtlely here: for_each_sg assumes that the provided
count is correct, but, because of the way that virtio_ring handles
multiple scatterlists at once, the count is only an upper bound if
there is more than one scatterlist.  This is easily solved by
checking the sg pointer for NULL on each iteration.
Signed-off-by: Andy Lutomirski <luto at amacapital.net>
---
 drivers/virtio/virtio_ring.c | 46 +++++++++++++++++---------------------------
 1 file changed, 18 insertions(+), 28 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 4d08f45a9c29..d356a701c9c2 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -99,25 +99,9 @@ struct vring_virtqueue
 
 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
 
-static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
-						  unsigned int *count)
-{
-	return sg_next(sg);
-}
-
-static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
-					      unsigned int *count)
-{
-	if (--(*count) == 0)
-		return NULL;
-	return sg + 1;
-}
-
 /* Set up an indirect table of descriptors and add it to the queue. */
 static inline int vring_add_indirect(struct vring_virtqueue *vq,
 				     struct scatterlist *sgs[],
-				     struct scatterlist *(*next)
-				       (struct scatterlist *, unsigned int *),
 				     unsigned int total_sg,
 				     unsigned int total_out,
 				     unsigned int total_in,
@@ -128,7 +112,7 @@ static inline int vring_add_indirect(struct vring_virtqueue
*vq,
 	struct vring_desc *desc;
 	unsigned head;
 	struct scatterlist *sg;
-	int i, n;
+	int i, j, n;
 
 	/*
 	 * We require lowmem mappings for the descriptors because
@@ -144,7 +128,9 @@ static inline int vring_add_indirect(struct vring_virtqueue
*vq,
 	/* Transfer entries from the sg lists into the indirect page */
 	i = 0;
 	for (n = 0; n < out_sgs; n++) {
-		for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+		for_each_sg(sgs[n], sg, total_out, j) {
+			if (!sg)
+				break;
 			desc[i].flags = VRING_DESC_F_NEXT;
 			desc[i].addr = sg_phys(sg);
 			desc[i].len = sg->length;
@@ -153,7 +139,9 @@ static inline int vring_add_indirect(struct vring_virtqueue
*vq,
 		}
 	}
 	for (; n < (out_sgs + in_sgs); n++) {
-		for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+		for_each_sg(sgs[n], sg, total_in, j) {
+			if (!sg)
+				break;
 			desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
 			desc[i].addr = sg_phys(sg);
 			desc[i].len = sg->length;
@@ -186,8 +174,6 @@ static inline int vring_add_indirect(struct vring_virtqueue
*vq,
 
 static inline int virtqueue_add(struct virtqueue *_vq,
 				struct scatterlist *sgs[],
-				struct scatterlist *(*next)
-				  (struct scatterlist *, unsigned int *),
 				unsigned int total_out,
 				unsigned int total_in,
 				unsigned int out_sgs,
@@ -197,7 +183,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
-	unsigned int i, n, avail, uninitialized_var(prev), total_sg;
+	unsigned int i, j, n, avail, uninitialized_var(prev), total_sg;
 	int head;
 
 	START_USE(vq);
@@ -227,7 +213,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	/* If the host supports indirect descriptor tables, and we have multiple
 	 * buffers, then go indirect. FIXME: tune this threshold */
 	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
{
-		head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
+		head = vring_add_indirect(vq, sgs, total_sg, total_out,
 					  total_in,
 					  out_sgs, in_sgs, gfp);
 		if (likely(head >= 0))
@@ -254,7 +240,9 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 
 	head = i = vq->free_head;
 	for (n = 0; n < out_sgs; n++) {
-		for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+		for_each_sg(sgs[n], sg, total_out, j) {
+			if (!sg)
+				break;
 			vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
 			vq->vring.desc[i].addr = sg_phys(sg);
 			vq->vring.desc[i].len = sg->length;
@@ -263,7 +251,9 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 		}
 	}
 	for (; n < (out_sgs + in_sgs); n++) {
-		for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+		for_each_sg(sgs[n], sg, total_in, j) {
+			if (!sg)
+				break;
 			vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
 			vq->vring.desc[i].addr = sg_phys(sg);
 			vq->vring.desc[i].len = sg->length;
@@ -337,7 +327,7 @@ int virtqueue_add_sgs(struct virtqueue *_vq,
 		for (sg = sgs[i]; sg; sg = sg_next(sg))
 			total_in++;
 	}
-	return virtqueue_add(_vq, sgs, sg_next_chained,
+	return virtqueue_add(_vq, sgs,
 			     total_out, total_in, out_sgs, in_sgs, data, gfp);
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
@@ -360,7 +350,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq,
 			 void *data,
 			 gfp_t gfp)
 {
-	return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
+	return virtqueue_add(vq, &sg, num, 0, 1, 0, data, gfp);
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
 
@@ -382,7 +372,7 @@ int virtqueue_add_inbuf(struct virtqueue *vq,
 			void *data,
 			gfp_t gfp)
 {
-	return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
+	return virtqueue_add(vq, &sg, 0, num, 0, 1, data, gfp);
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
 
-- 
1.9.3
Andy Lutomirski
2014-Aug-27  21:50 UTC
[PATCH v2 2/4] virtio_ring: Support DMA APIs if requested
virtio_ring currently sends the device (usually a hypervisor)
physical addresses of its I/O buffers.  This is okay when DMA
addresses and physical addresses are the same thing, but this isn't
always the case.  For example, this never works on Xen guests, and
it is likely to fail if a physical "virtio" device ever ends up
behind an IOMMU or swiotlb.
The immediate use case for me is to enable virtio on Xen guests.
For that to work, we need vring to support DMA address translation
as well as a corresponding change to virtio_pci or to another
driver.
With this patch, if enabled, virtfs survives kmemleak and
CONFIG_DMA_API_DEBUG.  virtio-net warns (correctly) about DMA from
the stack in virtnet_set_rx_mode.
This explicitly supports !CONFIG_HAS_DMA.  If vring is asked to use
the DMA API and CONFIG_HAS_DMA is not set, then vring will refuse to
create the virtqueue.
Signed-off-by: Andy Lutomirski <luto at amacapital.net>
---
 drivers/lguest/lguest_device.c         |   3 +-
 drivers/misc/mic/card/mic_virtio.c     |   2 +-
 drivers/remoteproc/remoteproc_virtio.c |   4 +-
 drivers/s390/kvm/kvm_virtio.c          |   2 +-
 drivers/s390/kvm/virtio_ccw.c          |   4 +-
 drivers/virtio/virtio_mmio.c           |   5 +-
 drivers/virtio/virtio_pci.c            |   3 +-
 drivers/virtio/virtio_ring.c           | 187 +++++++++++++++++++++++++++++----
 include/linux/virtio_ring.h            |   1 +
 tools/virtio/linux/virtio.h            |   1 +
 tools/virtio/virtio_test.c             |   2 +-
 tools/virtio/vringh_test.c             |   3 +-
 12 files changed, 182 insertions(+), 35 deletions(-)
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index d0a1d8a45c81..f0eafbe82ed4 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -301,7 +301,8 @@ static struct virtqueue *lg_find_vq(struct virtio_device
*vdev,
 	 * barriers.
 	 */
 	vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
-				 true, lvq->pages, lg_notify, callback, name);
+				 true, false, lvq->pages,
+				 lg_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto unmap;
diff --git a/drivers/misc/mic/card/mic_virtio.c
b/drivers/misc/mic/card/mic_virtio.c
index f14b60080c21..d633964417b1 100644
--- a/drivers/misc/mic/card/mic_virtio.c
+++ b/drivers/misc/mic/card/mic_virtio.c
@@ -256,7 +256,7 @@ static struct virtqueue *mic_find_vq(struct virtio_device
*vdev,
 	mvdev->vr[index] = va;
 	memset_io(va, 0x0, _vr_size);
 	vq = vring_new_virtqueue(index, le16_to_cpu(config.num),
-				 MIC_VIRTIO_RING_ALIGN, vdev, false,
+				 MIC_VIRTIO_RING_ALIGN, vdev, false, false,
 				 (void __force *)va, mic_notify, callback,
 				 name);
 	if (!vq) {
diff --git a/drivers/remoteproc/remoteproc_virtio.c
b/drivers/remoteproc/remoteproc_virtio.c
index a34b50690b4e..e31f2fefa76e 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -107,8 +107,8 @@ static struct virtqueue *rp_find_vq(struct virtio_device
*vdev,
 	 * Create the new vq, and tell virtio we're not interested in
 	 * the 'weak' smp barriers, since we're talking with a real
device.
 	 */
-	vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, addr,
-					rproc_virtio_notify, callback, name);
+	vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, false,
+				 addr, rproc_virtio_notify, callback, name);
 	if (!vq) {
 		dev_err(dev, "vring_new_virtqueue %s failed\n", name);
 		rproc_free_vring(rvring);
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index a1349653c6d9..91abcdc196d0 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -206,7 +206,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device
*vdev,
 		goto out;
 
 	vq = vring_new_virtqueue(index, config->num, KVM_S390_VIRTIO_RING_ALIGN,
-				 vdev, true, (void *) config->address,
+				 vdev, true, false, (void *) config->address,
 				 kvm_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c
index d2c0b442bce5..2462a443358a 100644
--- a/drivers/s390/kvm/virtio_ccw.c
+++ b/drivers/s390/kvm/virtio_ccw.c
@@ -478,8 +478,8 @@ static struct virtqueue *virtio_ccw_setup_vq(struct
virtio_device *vdev,
 	}
 
 	vq = vring_new_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev,
-				 true, info->queue, virtio_ccw_kvm_notify,
-				 callback, name);
+				 true, false, info->queue,
+				 virtio_ccw_kvm_notify, callback, name);
 	if (!vq) {
 		/* For now, we fail if we can't get the requested size. */
 		dev_warn(&vcdev->cdev->dev, "no vq\n");
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index c600ccfd6922..693254e52a5d 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -366,8 +366,9 @@ static struct virtqueue *vm_setup_vq(struct virtio_device
*vdev, unsigned index,
 			vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
 
 	/* Create the vring */
-	vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev,
-				 true, info->queue, vm_notify, callback, name);
+	vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN,
+				 vdev, true, false, info->queue,
+				 vm_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto error_new_virtqueue;
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 3d1463c6b120..a1f299fa4626 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -430,7 +430,8 @@ static struct virtqueue *setup_vq(struct virtio_device
*vdev, unsigned index,
 
 	/* create the vring */
 	vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
-				 true, info->queue, vp_notify, callback, name);
+				 true, false, info->queue,
+				 vp_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto out_activate_queue;
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index d356a701c9c2..8f200aee0fd8 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/hrtimer.h>
 #include <linux/kmemleak.h>
+#include <linux/dma-mapping.h>
 
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
@@ -54,6 +55,12 @@
 #define END_USE(vq)
 #endif
 
+struct vring_desc_state
+{
+	void *data;			/* Data for callback. */
+	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
+};
+
 struct vring_virtqueue
 {
 	struct virtqueue vq;
@@ -64,6 +71,9 @@ struct vring_virtqueue
 	/* Can we use weak barriers? */
 	bool weak_barriers;
 
+	/* Should we use the DMA API? */
+	bool use_dma_api;
+
 	/* Other side has made a mess, don't try any more. */
 	bool broken;
 
@@ -93,12 +103,89 @@ struct vring_virtqueue
 	ktime_t last_add_time;
 #endif
 
-	/* Tokens for callbacks. */
-	void *data[];
+	/* Per-descriptor state. */
+	struct vring_desc_state desc_state[];
 };
 
 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
 
+/* Map one sg entry. */
+static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
+				   struct scatterlist *sg,
+				   enum dma_data_direction direction)
+{
+#ifdef CONFIG_HAS_DMA
+	/*
+	 * We can't use dma_map_sg, because we don't use scatterlists in
+	 * the way it expects (we sometimes use unterminated
+	 * scatterlists, and we don't guarantee that the scatterlist
+	 * will exist for the lifetime of the mapping.
+	 */
+	if (vq->use_dma_api)
+		return dma_map_page(vq->vq.vdev->dev.parent,
+				    sg_page(sg), sg->offset, sg->length,
+				    direction);
+#endif
+
+	return sg_phys(sg);
+}
+
+static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
+				   void *cpu_addr, size_t size,
+				   enum dma_data_direction direction)
+{
+#ifdef CONFIG_HAS_DMA
+	if (vq->use_dma_api)
+		return dma_map_single(vq->vq.vdev->dev.parent,
+				      cpu_addr, size,
+				      direction);
+#endif
+
+	return virt_to_phys(cpu_addr);
+}
+
+static void vring_unmap_one(const struct vring_virtqueue *vq,
+			    struct vring_desc *desc)
+{
+#ifdef CONFIG_HAS_DMA
+	if (!vq->use_dma_api)
+		return;		/* Nothing to do. */
+
+	if (desc->flags & VRING_DESC_F_INDIRECT) {
+		dma_unmap_single(vq->vq.vdev->dev.parent,
+				 desc->addr, desc->len,
+				 (desc->flags & VRING_DESC_F_WRITE) ?
+				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	} else {
+		dma_unmap_page(vq->vq.vdev->dev.parent,
+			       desc->addr, desc->len,
+			       (desc->flags & VRING_DESC_F_WRITE) ?
+			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	}
+#endif
+}
+
+static void vring_unmap_indirect(const struct vring_virtqueue *vq,
+				 struct vring_desc *desc, int total)
+{
+	int i;
+
+	if (vq->use_dma_api)
+		for (i = 0; i < total; i++)
+			vring_unmap_one(vq, &desc[i]);
+}
+
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+			       dma_addr_t addr)
+{
+#ifdef CONFIG_HAS_DMA
+	return vq->use_dma_api &&
+		dma_mapping_error(vq->vq.vdev->dev.parent, addr);
+#else
+	return 0;
+#endif
+}
+
 /* Set up an indirect table of descriptors and add it to the queue. */
 static inline int vring_add_indirect(struct vring_virtqueue *vq,
 				     struct scatterlist *sgs[],
@@ -132,7 +219,10 @@ static inline int vring_add_indirect(struct vring_virtqueue
*vq,
 			if (!sg)
 				break;
 			desc[i].flags = VRING_DESC_F_NEXT;
-			desc[i].addr = sg_phys(sg);
+			desc[i].addr +				vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
+			if (vring_mapping_error(vq, desc[i].addr))
+				goto unmap_free;
 			desc[i].len = sg->length;
 			desc[i].next = i+1;
 			i++;
@@ -143,7 +233,10 @@ static inline int vring_add_indirect(struct vring_virtqueue
*vq,
 			if (!sg)
 				break;
 			desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
-			desc[i].addr = sg_phys(sg);
+			desc[i].addr +				vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
+			if (vring_mapping_error(vq, desc[i].addr))
+				goto unmap_free;
 			desc[i].len = sg->length;
 			desc[i].next = i+1;
 			i++;
@@ -161,15 +254,26 @@ static inline int vring_add_indirect(struct
vring_virtqueue *vq,
 	/* Use a single buffer which doesn't continue */
 	head = vq->free_head;
 	vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
-	vq->vring.desc[head].addr = virt_to_phys(desc);
-	/* kmemleak gives a false positive, as it's hidden by virt_to_phys */
-	kmemleak_ignore(desc);
+	vq->vring.desc[head].addr +		vring_map_single(vq,
+				 desc, i * sizeof(struct vring_desc),
+				 DMA_TO_DEVICE);
+	if (vring_mapping_error(vq, vq->vring.desc[head].addr))
+		goto unmap_free;
 	vq->vring.desc[head].len = i * sizeof(struct vring_desc);
 
 	/* Update free pointer */
 	vq->free_head = vq->vring.desc[head].next;
 
+	/* Save the indirect block */
+	vq->desc_state[head].indir_desc = desc;
+
 	return head;
+
+unmap_free:
+	vring_unmap_indirect(vq, desc, i);
+	kfree(desc);
+	return -ENOMEM;
 }
 
 static inline int virtqueue_add(struct virtqueue *_vq,
@@ -183,7 +287,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
-	unsigned int i, j, n, avail, uninitialized_var(prev), total_sg;
+	unsigned int i, j, n, avail, uninitialized_var(prev), total_sg, err_idx;
 	int head;
 
 	START_USE(vq);
@@ -244,7 +348,10 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 			if (!sg)
 				break;
 			vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
-			vq->vring.desc[i].addr = sg_phys(sg);
+			vq->vring.desc[i].addr +				vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
+			if (vring_mapping_error(vq, vq->vring.desc[i].addr))
+				goto unmap_release;
 			vq->vring.desc[i].len = sg->length;
 			prev = i;
 			i = vq->vring.desc[i].next;
@@ -255,7 +362,10 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 			if (!sg)
 				break;
 			vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
-			vq->vring.desc[i].addr = sg_phys(sg);
+			vq->vring.desc[i].addr +				vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
+			if (vring_mapping_error(vq, vq->vring.desc[i].addr))
+				goto unmap_release;
 			vq->vring.desc[i].len = sg->length;
 			prev = i;
 			i = vq->vring.desc[i].next;
@@ -269,7 +379,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 
 add_head:
 	/* Set token. */
-	vq->data[head] = data;
+	vq->desc_state[head].data = data;
 
 	/* Put entry in available array (but don't update avail->idx until they
 	 * do sync). */
@@ -291,6 +401,20 @@ add_head:
 	END_USE(vq);
 
 	return 0;
+
+unmap_release:
+	err_idx = i;
+	i = head;
+
+	for (n = 0; n < total_sg; n++) {
+		if (i == err_idx)
+			break;
+		vring_unmap_one(vq, &vq->vring.desc[i]);
+		i = vq->vring.desc[i].next;
+	}
+
+	vq->vq.num_free += total_sg;
+	return -EIO;
 }
 
 /**
@@ -470,22 +594,33 @@ static void detach_buf(struct vring_virtqueue *vq,
unsigned int head)
 	unsigned int i;
 
 	/* Clear data ptr. */
-	vq->data[head] = NULL;
+	vq->desc_state[head].data = NULL;
 
 	/* Put back on free list: find end */
 	i = head;
 
 	/* Free the indirect table */
-	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
-		kfree(phys_to_virt(vq->vring.desc[i].addr));
+	if (vq->desc_state[head].indir_desc) {
+		u32 len = vq->vring.desc[i].len;
+
+		BUG_ON(!(vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT));
+		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+		vring_unmap_indirect(vq, vq->desc_state[head].indir_desc,
+				     len / sizeof(struct vring_desc));
+		kfree(vq->desc_state[head].indir_desc);
+		vq->desc_state[head].indir_desc = NULL;
+	}
 
 	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
+		vring_unmap_one(vq, &vq->vring.desc[i]);
 		i = vq->vring.desc[i].next;
 		vq->vq.num_free++;
 	}
 
+	vring_unmap_one(vq, &vq->vring.desc[i]);
 	vq->vring.desc[i].next = vq->free_head;
 	vq->free_head = head;
+
 	/* Plus final descriptor */
 	vq->vq.num_free++;
 }
@@ -542,13 +677,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned
int *len)
 		BAD_RING(vq, "id %u out of range\n", i);
 		return NULL;
 	}
-	if (unlikely(!vq->data[i])) {
+	if (unlikely(!vq->desc_state[i].data)) {
 		BAD_RING(vq, "id %u is not a head!\n", i);
 		return NULL;
 	}
 
 	/* detach_buf clears data, so grab it now. */
-	ret = vq->data[i];
+	ret = vq->desc_state[i].data;
 	detach_buf(vq, i);
 	vq->last_used_idx++;
 	/* If we expect an interrupt for the next entry, tell host
@@ -709,10 +844,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 	START_USE(vq);
 
 	for (i = 0; i < vq->vring.num; i++) {
-		if (!vq->data[i])
+		if (!vq->desc_state[i].data)
 			continue;
 		/* detach_buf clears data, so grab it now. */
-		buf = vq->data[i];
+		buf = vq->desc_state[i].data;
 		detach_buf(vq, i);
 		vq->vring.avail->idx--;
 		END_USE(vq);
@@ -751,6 +886,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
 				      bool weak_barriers,
+				      bool use_dma_api,
 				      void *pages,
 				      bool (*notify)(struct virtqueue *),
 				      void (*callback)(struct virtqueue *),
@@ -765,7 +901,13 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 		return NULL;
 	}
 
-	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
+#ifndef CONFIG_HAS_DMA
+	if (use_dma_api)
+		return NULL;
+#endif
+
+	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
+		     GFP_KERNEL);
 	if (!vq)
 		return NULL;
 
@@ -777,6 +919,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 	vq->vq.index = index;
 	vq->notify = notify;
 	vq->weak_barriers = weak_barriers;
+	vq->use_dma_api = use_dma_api;
 	vq->broken = false;
 	vq->last_used_idx = 0;
 	vq->num_added = 0;
@@ -795,11 +938,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 
 	/* Put everything in free lists. */
 	vq->free_head = 0;
-	for (i = 0; i < num-1; i++) {
+	for (i = 0; i < num-1; i++)
 		vq->vring.desc[i].next = i+1;
-		vq->data[i] = NULL;
-	}
-	vq->data[i] = NULL;
+	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
 
 	return &vq->vq;
 }
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 67e06fe18c03..60f761a38a09 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -70,6 +70,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
 				      bool weak_barriers,
+				      bool use_dma_api,
 				      void *pages,
 				      bool (*notify)(struct virtqueue *vq),
 				      void (*callback)(struct virtqueue *vq),
diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
index 5a2d1f0f6bc7..5d42dc6a6201 100644
--- a/tools/virtio/linux/virtio.h
+++ b/tools/virtio/linux/virtio.h
@@ -78,6 +78,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
 				      bool weak_barriers,
+				      bool use_dma_api,
 				      void *pages,
 				      bool (*notify)(struct virtqueue *vq),
 				      void (*callback)(struct virtqueue *vq),
diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
index 00ea679b3826..860cc89900a7 100644
--- a/tools/virtio/virtio_test.c
+++ b/tools/virtio/virtio_test.c
@@ -99,7 +99,7 @@ static void vq_info_add(struct vdev_info *dev, int num)
 	vring_init(&info->vring, num, info->ring, 4096);
 	info->vq = vring_new_virtqueue(info->idx,
 				       info->vring.num, 4096, &dev->vdev,
-				       true, info->ring,
+				       true, false, info->ring,
 				       vq_notify, vq_callback, "test");
 	assert(info->vq);
 	info->vq->priv = info;
diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c
index 14a4f4cab5b9..67d3c3a1ba88 100644
--- a/tools/virtio/vringh_test.c
+++ b/tools/virtio/vringh_test.c
@@ -312,7 +312,8 @@ static int parallel_test(unsigned long features,
 		if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
 			err(1, "Could not set affinity to cpu %u", first_cpu);
 
-		vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true,
+		vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev,
+					 true, false,
 					 guest_map, fast_vringh ? no_notify_host
 					 : parallel_notify_host,
 					 never_callback_guest, "guest vq");
-- 
1.9.3
Andy Lutomirski
2014-Aug-27  21:50 UTC
[PATCH v2 3/4] virtio_pci: Use the DMA API for virtqueues
A virtqueue is a coherent DMA mapping.  Use the DMA API for it.
This fixes virtio_pci on Xen.
Signed-off-by: Andy Lutomirski <luto at amacapital.net>
---
 drivers/virtio/virtio_pci.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index a1f299fa4626..d0eee85496b5 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -80,8 +80,9 @@ struct virtio_pci_vq_info
 	/* the number of entries in the queue */
 	int num;
 
-	/* the virtual address of the ring queue */
-	void *queue;
+	/* the ring queue */
+	void *queue;			/* virtual address */
+	dma_addr_t queue_dma_addr;	/* bus address */
 
 	/* the list node for the virtqueues list */
 	struct list_head node;
@@ -417,20 +418,26 @@ static struct virtqueue *setup_vq(struct virtio_device
*vdev, unsigned index,
 	info->num = num;
 	info->msix_vector = msix_vec;
 
-	size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
-	info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
+	size = vring_size(num, VIRTIO_PCI_VRING_ALIGN);
+	info->queue = dma_zalloc_coherent(vdev->dev.parent, size,
+					  &info->queue_dma_addr, GFP_KERNEL);
 	if (info->queue == NULL) {
 		err = -ENOMEM;
 		goto out_info;
 	}
 
 	/* activate the queue */
-	iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
+	iowrite32(info->queue_dma_addr >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
 		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
-	/* create the vring */
+	/*
+	 * create the vring.  If there is an IOMMU of any sort, including
+	 * Xen paravirt's ersatz IOMMU, use it.  If the host wants physical
+	 * addresses instead of bus addresses, the host shouldn't expose
+	 * an IOMMU.
+	 */
 	vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
-				 true, false, info->queue,
+				 true, true, info->queue,
 				 vp_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
@@ -463,7 +470,8 @@ out_assign:
 	vring_del_virtqueue(vq);
 out_activate_queue:
 	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
-	free_pages_exact(info->queue, size);
+	dma_free_coherent(vdev->dev.parent, size,
+			  info->queue, info->queue_dma_addr);
 out_info:
 	kfree(info);
 	return ERR_PTR(err);
@@ -494,7 +502,8 @@ static void vp_del_vq(struct virtqueue *vq)
 	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
 	size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
-	free_pages_exact(info->queue, size);
+	dma_free_coherent(vq->vdev->dev.parent, size,
+			  info->queue, info->queue_dma_addr);
 	kfree(info);
 }
 
@@ -713,6 +722,13 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
 	if (err)
 		goto out;
 
+	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+	if (err)
+		err = dma_set_mask_and_coherent(&pci_dev->dev,
+						DMA_BIT_MASK(32));
+	if (err)
+		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. 
Trying to continue, but this might not work.\n");
+
 	err = pci_request_regions(pci_dev, "virtio-pci");
 	if (err)
 		goto out_enable_device;
-- 
1.9.3
Andy Lutomirski
2014-Aug-27  21:50 UTC
[PATCH v2 4/4] virtio_net: Stop doing DMA from the stack
Now that virtio supports real DMA, drivers should play by the rules.
For virtio_net, that means that DMA should be done to and from
dynamically-allocated memory, not the kernel stack.
This should have no effect on any performance-critical code paths.
Signed-off-by: Andy Lutomirski <luto at amacapital.net>
---
 drivers/net/virtio_net.c | 53 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 17 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 59caa06f34a6..960a2172b07a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -966,31 +966,43 @@ static bool virtnet_send_command(struct virtnet_info *vi,
u8 class, u8 cmd,
 				 struct scatterlist *out)
 {
 	struct scatterlist *sgs[4], hdr, stat;
-	struct virtio_net_ctrl_hdr ctrl;
-	virtio_net_ctrl_ack status = ~0;
+
+	struct {
+		struct virtio_net_ctrl_hdr ctrl;
+		virtio_net_ctrl_ack status;
+	} *buf;
+
 	unsigned out_num = 0, tmp;
+	bool ret;
 
 	/* Caller should know better */
 	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
 
-	ctrl.class = class;
-	ctrl.cmd = cmd;
+	buf = kmalloc(sizeof(*buf), GFP_ATOMIC);
+	if (!buf)
+		return false;
+	buf->status = ~0;
+
+	buf->ctrl.class = class;
+	buf->ctrl.cmd = cmd;
 	/* Add header */
-	sg_init_one(&hdr, &ctrl, sizeof(ctrl));
+	sg_init_one(&hdr, &buf->ctrl, sizeof(buf->ctrl));
 	sgs[out_num++] = &hdr;
 
 	if (out)
 		sgs[out_num++] = out;
 
 	/* Add return status. */
-	sg_init_one(&stat, &status, sizeof(status));
+	sg_init_one(&stat, &buf->status, sizeof(buf->status));
 	sgs[out_num] = &stat;
 
 	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
 	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
 
-	if (unlikely(!virtqueue_kick(vi->cvq)))
-		return status == VIRTIO_NET_OK;
+	if (unlikely(!virtqueue_kick(vi->cvq))) {
+		ret = (buf->status == VIRTIO_NET_OK);
+		goto out;
+	}
 
 	/* Spin for a response, the kick causes an ioport write, trapping
 	 * into the hypervisor, so the request should be handled immediately.
@@ -999,7 +1011,11 @@ static bool virtnet_send_command(struct virtnet_info *vi,
u8 class, u8 cmd,
 	       !virtqueue_is_broken(vi->cvq))
 		cpu_relax();
 
-	return status == VIRTIO_NET_OK;
+	ret = (buf->status == VIRTIO_NET_OK);
+
+out:
+	kfree(buf);
+	return ret;
 }
 
 static int virtnet_set_mac_address(struct net_device *dev, void *p)
@@ -1140,7 +1156,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct scatterlist sg[2];
-	u8 promisc, allmulti;
+	u8 *cmdbyte;
 	struct virtio_net_ctrl_mac *mac_data;
 	struct netdev_hw_addr *ha;
 	int uc_count;
@@ -1152,22 +1168,25 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
 		return;
 
-	promisc = ((dev->flags & IFF_PROMISC) != 0);
-	allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
+	cmdbyte = kmalloc(sizeof(*cmdbyte), GFP_ATOMIC);
+	if (!cmdbyte)
+		return;
 
-	sg_init_one(sg, &promisc, sizeof(promisc));
+	sg_init_one(sg, cmdbyte, sizeof(*cmdbyte));
 
+	*cmdbyte = ((dev->flags & IFF_PROMISC) != 0);
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
 		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
-			 promisc ? "en" : "dis");
-
-	sg_init_one(sg, &allmulti, sizeof(allmulti));
+			 *cmdbyte ? "en" : "dis");
 
+	*cmdbyte = ((dev->flags & IFF_ALLMULTI) != 0);
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
 		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
-			 allmulti ? "en" : "dis");
+			 *cmdbyte ? "en" : "dis");
+
+	kfree(cmdbyte);
 
 	uc_count = netdev_uc_count(dev);
 	mc_count = netdev_mc_count(dev);
-- 
1.9.3
Christian Borntraeger
2014-Aug-28  07:44 UTC
[PATCH v2 0/4] virtio: Clean up scatterlists and use the DMA API
On 27/08/14 23:50, Andy Lutomirski wrote:> This fixes virtio on Xen guests as well as on any other platform > that uses virtio_pci on which physical addresses don't match bus > addresses. > > This can be tested with: > > virtme-run --xen xen --kimg arch/x86/boot/bzImage --console > > using virtme from here: > > https://git.kernel.org/cgit/utils/kernel/virtme/virtme.git > > Without these patches, the guest hangs forever. With these patches, > everything works. > > This should be safe on all platforms that I'm aware of. That > doesn't mean that there isn't anything that I missed. > > Changes from v1: > - Using the DMA API is optional now. It would be nice to improve the > DMA API to the point that it could be used unconditionally, but s390 > proves that we're not there yet. > - Includes patch 4, which fixes DMA debugging warnings from virtio_net. > > Andy Lutomirski (4): > virtio_ring: Remove sg_next indirection > virtio_ring: Support DMA APIs if requested > virtio_pci: Use the DMA API for virtqueues > virtio_net: Stop doing DMA from the stack > > drivers/lguest/lguest_device.c | 3 +- > drivers/misc/mic/card/mic_virtio.c | 2 +- > drivers/net/virtio_net.c | 53 +++++--- > drivers/remoteproc/remoteproc_virtio.c | 4 +- > drivers/s390/kvm/kvm_virtio.c | 2 +- > drivers/s390/kvm/virtio_ccw.c | 4 +- > drivers/virtio/virtio_mmio.c | 5 +- > drivers/virtio/virtio_pci.c | 35 ++++-- > drivers/virtio/virtio_ring.c | 219 ++++++++++++++++++++++++++------- > include/linux/virtio_ring.h | 1 + > tools/virtio/linux/virtio.h | 1 + > tools/virtio/virtio_test.c | 2 +- > tools/virtio/vringh_test.c | 3 +- > 13 files changed, 253 insertions(+), 81 deletions(-) >Patches were applied on top of 3.16. block seems to work, with net a simple ping works, iperf causes this: [ 8.643981] ------------[ cut here ]------------ [ 8.643986] kernel BUG at drivers/virtio/virtio_ring.c:245! [ 8.644036] illegal operation: 0001 [#1] SMP [ 8.644039] Modules linked in: virtio_net virtio_blk dm_multipath sunrpc [ 8.644046] CPU: 0 PID: 1291 Comm: iperf Not tainted 3.16.0+ #130 [ 8.644048] task: 00000000018ee4c0 ti: 0000000022098000 task.ti: 0000000022098000 [ 8.644051] Krnl PSW : 0704d00180000000 0000000000421618 (virtqueue_add_outbuf+0x2e4/0x340) [ 8.644066] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3 Krnl GPRS: 00000000299f8a80 0000000000000000 0000000000000001 000003d100000000 [ 8.644070] 00000000004214de 0000000000000e2c 0000000000000001 0000000000000002 [ 8.644072] 00000000299f8a40 0000000000000001 00000000299f8a40 0000000000000000 [ 8.644074] 0000000028077608 00000000287ec000 0000000000421548 000000002209b7b0 [ 8.644085] Krnl Code: 000000000042160a: d060a7a90000 trtr 1961(97,%r10),0 0000000000421610: a7f4ff07 brc 15,42141e #0000000000421614: a7f40001 brc 15,421616 >0000000000421618: a7c8fffb lhi %r12,-5 000000000042161c: a7f4ff46 brc 15,4214a8 0000000000421620: a7f40001 brc 15,421622 0000000000421624: b9040021 lgr %r2,%r1 0000000000421628: c030001c3569 larl %r3,7a80fa [ 8.644106] Call Trace: [ 8.644110] ([<00000000004214de>] virtqueue_add_outbuf+0x1aa/0x340) [ 8.644114] [<000003ff80174466>] start_xmit+0x1e6/0x49c [virtio_net] [ 8.644119] [<000000000050571a>] dev_hard_start_xmit+0x346/0x600 [ 8.644123] [<000000000052a43c>] sch_direct_xmit+0xe8/0x1e8 [ 8.644126] [<0000000000505be8>] __dev_queue_xmit+0x214/0x4ec [ 8.644131] [<000000000054a476>] ip_finish_output+0x436/0x90c [ 8.644134] [<000000000054b7c0>] ip_queue_xmit+0x170/0x3e8 [ 8.644137] [<00000000005636ca>] tcp_transmit_skb+0x462/0x984 [ 8.644140] [<0000000000564870>] tcp_write_xmit+0x220/0xd0c [ 8.644143] [<00000000005653a2>] tcp_push_one+0x46/0x58 [ 8.644145] [<00000000005568b4>] tcp_sendmsg+0xb7c/0xc9c [ 8.644148] [<00000000004e7514>] sock_aio_write+0x12c/0x158 [ 8.644153] [<000000000027033c>] do_sync_write+0x80/0xc8 [ 8.644156] [<0000000000271444>] vfs_write+0x144/0x1d8 [ 8.644158] [<000000000027192a>] SyS_write+0x62/0xd0 [ 8.644163] [<000000000062781c>] sysc_tracego+0x14/0x1a [ 8.644170] [<000003fffd51203c>] 0x3fffd51203c [ 8.644171] Last Breaking-Event-Address: [ 8.644174] [<0000000000421614>] virtqueue_add_outbuf+0x2e0/0x340
Andy Lutomirski
2014-Aug-28  18:06 UTC
[PATCH v2 0/4] virtio: Clean up scatterlists and use the DMA API
On Thu, Aug 28, 2014 at 12:44 AM, Christian Borntraeger <borntraeger at de.ibm.com> wrote:> On 27/08/14 23:50, Andy Lutomirski wrote: >> This fixes virtio on Xen guests as well as on any other platform >> that uses virtio_pci on which physical addresses don't match bus >> addresses. >> >> This can be tested with: >> >> virtme-run --xen xen --kimg arch/x86/boot/bzImage --console >> >> using virtme from here: >> >> https://git.kernel.org/cgit/utils/kernel/virtme/virtme.git >> >> Without these patches, the guest hangs forever. With these patches, >> everything works. >> >> This should be safe on all platforms that I'm aware of. That >> doesn't mean that there isn't anything that I missed. >> >> Changes from v1: >> - Using the DMA API is optional now. It would be nice to improve the >> DMA API to the point that it could be used unconditionally, but s390 >> proves that we're not there yet. >> - Includes patch 4, which fixes DMA debugging warnings from virtio_net. >> >> Andy Lutomirski (4): >> virtio_ring: Remove sg_next indirection >> virtio_ring: Support DMA APIs if requested >> virtio_pci: Use the DMA API for virtqueues >> virtio_net: Stop doing DMA from the stack >> >> drivers/lguest/lguest_device.c | 3 +- >> drivers/misc/mic/card/mic_virtio.c | 2 +- >> drivers/net/virtio_net.c | 53 +++++--- >> drivers/remoteproc/remoteproc_virtio.c | 4 +- >> drivers/s390/kvm/kvm_virtio.c | 2 +- >> drivers/s390/kvm/virtio_ccw.c | 4 +- >> drivers/virtio/virtio_mmio.c | 5 +- >> drivers/virtio/virtio_pci.c | 35 ++++-- >> drivers/virtio/virtio_ring.c | 219 ++++++++++++++++++++++++++------- >> include/linux/virtio_ring.h | 1 + >> tools/virtio/linux/virtio.h | 1 + >> tools/virtio/virtio_test.c | 2 +- >> tools/virtio/vringh_test.c | 3 +- >> 13 files changed, 253 insertions(+), 81 deletions(-) >> > > Patches were applied on top of 3.16. > > block seems to work, with net a simple ping works, iperf causes this:I neither see the bug, nor can I reproduce it on x86_64 on KVM. I doubt that dma vs. non-dma is relevant. I tried -net user a -net tap with iperf running in both directions. I also tried switching virtio_pci into non-DMA-API mode. No errors. Is there any chance that you could instrument that BUG_ON to print n, i, in_sgs, out_sgs, total_sg, total_in, and total_out? I assume this is some oddity with patch 1, but I'm mystified. Thanks, Andy> > [ 8.643981] ------------[ cut here ]------------ > [ 8.643986] kernel BUG at drivers/virtio/virtio_ring.c:245! > [ 8.644036] illegal operation: 0001 [#1] SMP > [ 8.644039] Modules linked in: virtio_net virtio_blk dm_multipath sunrpc > [ 8.644046] CPU: 0 PID: 1291 Comm: iperf Not tainted 3.16.0+ #130 > [ 8.644048] task: 00000000018ee4c0 ti: 0000000022098000 task.ti: 0000000022098000 > [ 8.644051] Krnl PSW : 0704d00180000000 0000000000421618 (virtqueue_add_outbuf+0x2e4/0x340) > [ 8.644066] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3 > Krnl GPRS: 00000000299f8a80 0000000000000000 0000000000000001 000003d100000000 > [ 8.644070] 00000000004214de 0000000000000e2c 0000000000000001 0000000000000002 > [ 8.644072] 00000000299f8a40 0000000000000001 00000000299f8a40 0000000000000000 > [ 8.644074] 0000000028077608 00000000287ec000 0000000000421548 000000002209b7b0 > [ 8.644085] Krnl Code: 000000000042160a: d060a7a90000 trtr 1961(97,%r10),0 > 0000000000421610: a7f4ff07 brc 15,42141e > #0000000000421614: a7f40001 brc 15,421616 > >0000000000421618: a7c8fffb lhi %r12,-5 > 000000000042161c: a7f4ff46 brc 15,4214a8 > 0000000000421620: a7f40001 brc 15,421622 > 0000000000421624: b9040021 lgr %r2,%r1 > 0000000000421628: c030001c3569 larl %r3,7a80fa > [ 8.644106] Call Trace: > [ 8.644110] ([<00000000004214de>] virtqueue_add_outbuf+0x1aa/0x340) > [ 8.644114] [<000003ff80174466>] start_xmit+0x1e6/0x49c [virtio_net] > [ 8.644119] [<000000000050571a>] dev_hard_start_xmit+0x346/0x600 > [ 8.644123] [<000000000052a43c>] sch_direct_xmit+0xe8/0x1e8 > [ 8.644126] [<0000000000505be8>] __dev_queue_xmit+0x214/0x4ec > [ 8.644131] [<000000000054a476>] ip_finish_output+0x436/0x90c > [ 8.644134] [<000000000054b7c0>] ip_queue_xmit+0x170/0x3e8 > [ 8.644137] [<00000000005636ca>] tcp_transmit_skb+0x462/0x984 > [ 8.644140] [<0000000000564870>] tcp_write_xmit+0x220/0xd0c > [ 8.644143] [<00000000005653a2>] tcp_push_one+0x46/0x58 > [ 8.644145] [<00000000005568b4>] tcp_sendmsg+0xb7c/0xc9c > [ 8.644148] [<00000000004e7514>] sock_aio_write+0x12c/0x158 > [ 8.644153] [<000000000027033c>] do_sync_write+0x80/0xc8 > [ 8.644156] [<0000000000271444>] vfs_write+0x144/0x1d8 > [ 8.644158] [<000000000027192a>] SyS_write+0x62/0xd0 > [ 8.644163] [<000000000062781c>] sysc_tracego+0x14/0x1a > [ 8.644170] [<000003fffd51203c>] 0x3fffd51203c > [ 8.644171] Last Breaking-Event-Address: > [ 8.644174] [<0000000000421614>] virtqueue_add_outbuf+0x2e0/0x340 >-- Andy Lutomirski AMA Capital Management, LLC
Reasonably Related Threads
- [PATCH v2 0/4] virtio: Clean up scatterlists and use the DMA API
- [PATCH v2 0/4] virtio: Clean up scatterlists and use the DMA API
- [PATCH v2 0/4] virtio: Clean up scatterlists and use the DMA API
- [PATCH v2 0/4] virtio: Clean up scatterlists and use the DMA API
- [PATCH v2 0/4] virtio: Clean up scatterlists and use the DMA API