Jason Wang
2018-Dec-13  10:10 UTC
[PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
Hi: This series tries to access virtqueue metadata through kernel virtual address instead of copy_user() friends since they had too much overheads like checks, spec barriers or even hardware feature toggling. Test shows about 24% improvement on TX PPS. It should benefit other cases as well. Please review Jason Wang (3): vhost: generalize adding used elem vhost: fine grain userspace memory accessors vhost: access vq metadata through kernel virtual address drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++---- drivers/vhost/vhost.h | 11 ++ 2 files changed, 266 insertions(+), 26 deletions(-) -- 2.17.1
Use one generic vhost_copy_to_user() instead of two dedicated
accessor. This will simplify the conversion to fine grain accessors.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/vhost/vhost.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 3a5f81a66d34..1c54ec1b82f8 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2164,16 +2164,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 
 	start = vq->last_used_idx & (vq->num - 1);
 	used = vq->used->ring + start;
-	if (count == 1) {
-		if (vhost_put_user(vq, heads[0].id, &used->id)) {
-			vq_err(vq, "Failed to write used id");
-			return -EFAULT;
-		}
-		if (vhost_put_user(vq, heads[0].len, &used->len)) {
-			vq_err(vq, "Failed to write used len");
-			return -EFAULT;
-		}
-	} else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
+	if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
 		vq_err(vq, "Failed to write used");
 		return -EFAULT;
 	}
-- 
2.17.1
Jason Wang
2018-Dec-13  10:10 UTC
[PATCH net-next 2/3] vhost: fine grain userspace memory accessors
This is used to hide the metadata address from virtqueue helpers. This
will allow to implement a vmap based fast accessing to metadata.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/vhost/vhost.c | 94 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 77 insertions(+), 17 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1c54ec1b82f8..bafe39d2e637 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -871,6 +871,34 @@ static inline void __user *__vhost_get_user(struct
vhost_virtqueue *vq,
 	ret; \
 })
 
+static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
+{
+	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
+			      vhost_avail_event(vq));
+}
+
+static inline int vhost_put_used(struct vhost_virtqueue *vq,
+				 struct vring_used_elem *head, int idx,
+				 int count)
+{
+	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
+				  count * sizeof(*head));
+}
+
+static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
+
+{
+	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
+			      &vq->used->flags);
+}
+
+static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
+
+{
+	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
+			      &vq->used->idx);
+}
+
 #define vhost_get_user(vq, x, ptr, type)		\
 ({ \
 	int ret; \
@@ -895,6 +923,43 @@ static inline void __user *__vhost_get_user(struct
vhost_virtqueue *vq,
 #define vhost_get_used(vq, x, ptr) \
 	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
 
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
+				      __virtio16 *idx)
+{
+	return vhost_get_avail(vq, *idx, &vq->avail->idx);
+}
+
+static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
+				       __virtio16 *head, int idx)
+{
+	return vhost_get_avail(vq, *head,
+			       &vq->avail->ring[idx & (vq->num - 1)]);
+}
+
+static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
+					__virtio16 *flags)
+{
+	return vhost_get_avail(vq, *flags, &vq->avail->flags);
+}
+
+static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
+				       __virtio16 *event)
+{
+	return vhost_get_avail(vq, *event, vhost_used_event(vq));
+}
+
+static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
+				     __virtio16 *idx)
+{
+	return vhost_get_used(vq, *idx, &vq->used->idx);
+}
+
+static inline int vhost_get_desc(struct vhost_virtqueue *vq,
+				 struct vring_desc *desc, int idx)
+{
+	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
+}
+
 static int vhost_new_umem_range(struct vhost_umem *umem,
 				u64 start, u64 size, u64 end,
 				u64 userspace_addr, int perm)
@@ -1751,8 +1816,7 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
 	void __user *used;
-	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-			   &vq->used->flags) < 0)
+	if (vhost_put_used_flags(vq))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		/* Make sure the flag is seen before log. */
@@ -1770,8 +1834,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue
*vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16
avail_event)
 {
-	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-			   vhost_avail_event(vq)))
+	if (vhost_put_avail_event(vq))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		void __user *used;
@@ -1808,7 +1871,7 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
 		r = -EFAULT;
 		goto err;
 	}
-	r = vhost_get_used(vq, last_used_idx, &vq->used->idx);
+	r = vhost_get_used_idx(vq, &last_used_idx);
 	if (r) {
 		vq_err(vq, "Can't access used idx at %p\n",
 		       &vq->used->idx);
@@ -2007,7 +2070,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	last_avail_idx = vq->last_avail_idx;
 
 	if (vq->avail_idx == vq->last_avail_idx) {
-		if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
+		if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
 			vq_err(vq, "Failed to access avail idx at %p\n",
 				&vq->avail->idx);
 			return -EFAULT;
@@ -2034,8 +2097,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
 	/* Grab the next descriptor number they're advertising, and increment
 	 * the index we've seen. */
-	if (unlikely(vhost_get_avail(vq, ring_head,
-		     &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) {
+	if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
 		vq_err(vq, "Failed to read head: idx %d address %p\n",
 		       last_avail_idx,
 		       &vq->avail->ring[last_avail_idx % vq->num]);
@@ -2070,8 +2132,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			       i, vq->num, head);
 			return -EINVAL;
 		}
-		ret = vhost_copy_from_user(vq, &desc, vq->desc + i,
-					   sizeof desc);
+		ret = vhost_get_desc(vq, &desc, i);
 		if (unlikely(ret)) {
 			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
 			       i, vq->desc + i);
@@ -2164,7 +2225,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 
 	start = vq->last_used_idx & (vq->num - 1);
 	used = vq->used->ring + start;
-	if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
+	if (vhost_put_used(vq, heads, start, count)) {
 		vq_err(vq, "Failed to write used");
 		return -EFAULT;
 	}
@@ -2208,8 +2269,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct
vring_used_elem *heads,
 
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
-	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
-			   &vq->used->idx)) {
+	if (vhost_put_used_idx(vq)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
@@ -2241,7 +2301,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct
vhost_virtqueue *vq)
 
 	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
 		__virtio16 flags;
-		if (vhost_get_avail(vq, flags, &vq->avail->flags)) {
+		if (vhost_get_avail_flags(vq, &flags)) {
 			vq_err(vq, "Failed to get flags");
 			return true;
 		}
@@ -2255,7 +2315,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct
vhost_virtqueue *vq)
 	if (unlikely(!v))
 		return true;
 
-	if (vhost_get_avail(vq, event, vhost_used_event(vq))) {
+	if (vhost_get_used_event(vq, &event)) {
 		vq_err(vq, "Failed to get used event idx");
 		return true;
 	}
@@ -2300,7 +2360,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct
vhost_virtqueue *vq)
 	if (vq->avail_idx != vq->last_avail_idx)
 		return false;
 
-	r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
+	r = vhost_get_avail_idx(vq, &avail_idx);
 	if (unlikely(r))
 		return false;
 	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
@@ -2336,7 +2396,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct
vhost_virtqueue *vq)
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again. */
 	smp_mb();
-	r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
+	r = vhost_get_avail_idx(vq, &avail_idx);
 	if (r) {
 		vq_err(vq, "Failed to check avail idx at %p: %d\n",
 		       &vq->avail->idx, r);
-- 
2.17.1
Jason Wang
2018-Dec-13  10:10 UTC
[PATCH net-next 3/3] vhost: access vq metadata through kernel virtual address
It was noticed that the copy_user() friends that was used to access
virtqueue metdata tends to be very expensive for dataplane
implementation like vhost since it involves lots of software check,
speculation barrier, hardware feature toggling (e.g SMAP). The
extra cost will be more obvious when transferring small packets.
This patch tries to eliminate those overhead by pin vq metadata pages
and access them through vmap(). During SET_VRING_ADDR, we will setup
those mappings and memory accessors are modified to use pointers to
access the metadata directly.
Note, this was only done when device IOTLB is not enabled. We could
use similar method to optimize it in the future.
Tests shows about ~24% improvement on TX PPS when using virtio-user +
vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
Before: ~5.0Mpps
After:  ~6.1Mpps
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  11 +++
 2 files changed, 189 insertions(+)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index bafe39d2e637..1bd24203afb6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->indirect = NULL;
 		vq->heads = NULL;
 		vq->dev = dev;
+		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
+		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
+		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
 		mutex_init(&vq->mutex);
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
@@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
 	spin_unlock(&dev->iotlb_lock);
 }
 
+static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
+			   size_t size, int write)
+{
+	struct page **pages;
+	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
+	int npinned;
+	void *vaddr;
+
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	npinned = get_user_pages_fast(uaddr, npages, write, pages);
+	if (npinned != npages)
+		goto err;
+
+	vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
+	if (!vaddr)
+		goto err;
+
+	map->pages = pages;
+	map->addr = vaddr + (uaddr & (PAGE_SIZE - 1));
+	map->npages = npages;
+
+	return 0;
+
+err:
+	if (npinned > 0)
+		release_pages(pages, npinned);
+	kfree(pages);
+	return -EFAULT;
+}
+
+static void vhost_uninit_vmap(struct vhost_vmap *map)
+{
+	if (!map->addr)
+		return;
+
+	vunmap(map->addr);
+	release_pages(map->pages, map->npages);
+	kfree(map->pages);
+
+	map->addr = NULL;
+	map->pages = NULL;
+	map->npages = 0;
+}
+
+static void vhost_clean_vmaps(struct vhost_virtqueue *vq)
+{
+	vhost_uninit_vmap(&vq->avail_ring);
+	vhost_uninit_vmap(&vq->desc_ring);
+	vhost_uninit_vmap(&vq->used_ring);
+}
+
+static int vhost_setup_vmaps(struct vhost_virtqueue *vq, unsigned long avail,
+			     unsigned long desc, unsigned long used)
+{
+	size_t event = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+	size_t avail_size, desc_size, used_size;
+	int ret;
+
+	vhost_clean_vmaps(vq);
+
+	avail_size = sizeof(*vq->avail) +
+		     sizeof(*vq->avail->ring) * vq->num + event;
+	ret = vhost_init_vmap(&vq->avail_ring, avail, avail_size, false);
+	if (ret) {
+		vq_err(vq, "Fail to setup vmap for avail ring!\n");
+		goto err_avail;
+	}
+
+	desc_size = sizeof(*vq->desc) * vq->num;
+	ret = vhost_init_vmap(&vq->desc_ring, desc, desc_size, false);
+	if (ret) {
+		vq_err(vq, "Fail to setup vmap for desc ring!\n");
+		goto err_desc;
+	}
+
+	used_size = sizeof(*vq->used) +
+		    sizeof(*vq->used->ring) * vq->num + event;
+	ret = vhost_init_vmap(&vq->used_ring, used, used_size, true);
+	if (ret) {
+		vq_err(vq, "Fail to setup vmap for used ring!\n");
+		goto err_used;
+	}
+
+	return 0;
+
+err_used:
+	vhost_uninit_vmap(&vq->used_ring);
+err_desc:
+	vhost_uninit_vmap(&vq->avail_ring);
+err_avail:
+	return -EFAULT;
+}
+
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
@@ -626,6 +725,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		if (dev->vqs[i]->call_ctx)
 			eventfd_ctx_put(dev->vqs[i]->call_ctx);
 		vhost_vq_reset(dev, dev->vqs[i]);
+		vhost_clean_vmaps(dev->vqs[i]);
 	}
 	vhost_dev_free_iovecs(dev);
 	if (dev->log_ctx)
@@ -873,6 +973,14 @@ static inline void __user *__vhost_get_user(struct
vhost_virtqueue *vq,
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		*((__virtio16 *)&used->ring[vq->num]) +			cpu_to_vhost16(vq,
vq->avail_idx);
+		return 0;
+	}
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
 			      vhost_avail_event(vq));
 }
@@ -881,6 +989,13 @@ static inline int vhost_put_used(struct vhost_virtqueue
*vq,
 				 struct vring_used_elem *head, int idx,
 				 int count)
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		memcpy(used->ring + idx, head, count * sizeof(*head));
+		return 0;
+	}
+
 	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
 				  count * sizeof(*head));
 }
@@ -888,6 +1003,13 @@ static inline int vhost_put_used(struct vhost_virtqueue
*vq,
 static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		used->flags = cpu_to_vhost16(vq, vq->used_flags);
+		return 0;
+	}
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
 			      &vq->used->flags);
 }
@@ -895,6 +1017,13 @@ static inline int vhost_put_used_flags(struct
vhost_virtqueue *vq)
 static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
+		return 0;
+	}
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
 			      &vq->used->idx);
 }
@@ -926,12 +1055,26 @@ static inline int vhost_put_used_idx(struct
vhost_virtqueue *vq)
 static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 				      __virtio16 *idx)
 {
+	if (!vq->iotlb) {
+		struct vring_avail *avail = vq->avail_ring.addr;
+
+		*idx = avail->idx;
+		return 0;
+	}
+
 	return vhost_get_avail(vq, *idx, &vq->avail->idx);
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 				       __virtio16 *head, int idx)
 {
+	if (!vq->iotlb) {
+		struct vring_avail *avail = vq->avail_ring.addr;
+
+		*head = avail->ring[idx & (vq->num - 1)];
+		return 0;
+	}
+
 	return vhost_get_avail(vq, *head,
 			       &vq->avail->ring[idx & (vq->num - 1)]);
 }
@@ -939,24 +1082,52 @@ static inline int vhost_get_avail_head(struct
vhost_virtqueue *vq,
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 					__virtio16 *flags)
 {
+	if (!vq->iotlb) {
+		struct vring_avail *avail = vq->avail_ring.addr;
+
+		*flags = avail->flags;
+		return 0;
+	}
+
 	return vhost_get_avail(vq, *flags, &vq->avail->flags);
 }
 
 static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 				       __virtio16 *event)
 {
+	if (!vq->iotlb) {
+		struct vring_avail *avail = vq->avail_ring.addr;
+
+		*event = (__virtio16)avail->ring[vq->num];
+		return 0;
+	}
+
 	return vhost_get_avail(vq, *event, vhost_used_event(vq));
 }
 
 static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 				     __virtio16 *idx)
 {
+	if (!vq->iotlb) {
+		struct vring_used *used = vq->used_ring.addr;
+
+		*idx = used->idx;
+		return 0;
+	}
+
 	return vhost_get_used(vq, *idx, &vq->used->idx);
 }
 
 static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 				 struct vring_desc *desc, int idx)
 {
+	if (!vq->iotlb) {
+		struct vring_desc *d = vq->desc_ring.addr;
+
+		*desc = *(d + idx);
+		return 0;
+	}
+
 	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
 }
 
@@ -1551,6 +1722,13 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int
ioctl, void __user *arg
 			}
 		}
 
+		if (!vq->iotlb && vhost_setup_vmaps(vq, a.avail_user_addr,
+							a.desc_user_addr,
+							a.used_user_addr)) {
+			r = -EINVAL;
+			break;
+		}
+
 		vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
 		vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
 		vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 466ef7542291..89dc0ad3d055 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -80,6 +80,12 @@ enum vhost_uaddr_type {
 	VHOST_NUM_ADDRS = 3,
 };
 
+struct vhost_vmap {
+	struct page **pages;
+	void *addr;
+	int npages;
+};
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -90,6 +96,11 @@ struct vhost_virtqueue {
 	struct vring_desc __user *desc;
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
+
+	struct vhost_vmap avail_ring;
+	struct vhost_vmap desc_ring;
+	struct vhost_vmap used_ring;
+
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
-- 
2.17.1
Michael S. Tsirkin
2018-Dec-13  15:27 UTC
[PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote:> Hi: > > This series tries to access virtqueue metadata through kernel virtual > address instead of copy_user() friends since they had too much > overheads like checks, spec barriers or even hardware feature > toggling.Userspace accesses through remapping tricks and next time there's a need for a new barrier we are left to figure it out by ourselves. I don't like the idea I have to say. As a first step, why don't we switch to unsafe_put_user/unsafe_get_user etc? That would be more of an apples to apples comparison, would it not?> Test shows about 24% improvement on TX PPS. It should benefit other > cases as well. > > Please review > > Jason Wang (3): > vhost: generalize adding used elem > vhost: fine grain userspace memory accessors > vhost: access vq metadata through kernel virtual address > > drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++---- > drivers/vhost/vhost.h | 11 ++ > 2 files changed, 266 insertions(+), 26 deletions(-) > > -- > 2.17.1
Michael S. Tsirkin
2018-Dec-13  15:44 UTC
[PATCH net-next 3/3] vhost: access vq metadata through kernel virtual address
On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:> It was noticed that the copy_user() friends that was used to access > virtqueue metdata tends to be very expensive for dataplane > implementation like vhost since it involves lots of software check, > speculation barrier, hardware feature toggling (e.g SMAP). The > extra cost will be more obvious when transferring small packets. > > This patch tries to eliminate those overhead by pin vq metadata pages > and access them through vmap(). During SET_VRING_ADDR, we will setup > those mappings and memory accessors are modified to use pointers to > access the metadata directly. > > Note, this was only done when device IOTLB is not enabled. We could > use similar method to optimize it in the future. > > Tests shows about ~24% improvement on TX PPS when using virtio-user + > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled): > > Before: ~5.0Mpps > After: ~6.1Mpps > > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++ > drivers/vhost/vhost.h | 11 +++ > 2 files changed, 189 insertions(+) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index bafe39d2e637..1bd24203afb6 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev, > vq->indirect = NULL; > vq->heads = NULL; > vq->dev = dev; > + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring)); > + memset(&vq->used_ring, 0, sizeof(vq->used_ring)); > + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring)); > mutex_init(&vq->mutex); > vhost_vq_reset(dev, vq); > if (vq->handle_kick) > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev) > spin_unlock(&dev->iotlb_lock); > } > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr, > + size_t size, int write) > +{ > + struct page **pages; > + int npages = DIV_ROUND_UP(size, PAGE_SIZE); > + int npinned; > + void *vaddr; > + > + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); > + if (!pages) > + return -ENOMEM; > + > + npinned = get_user_pages_fast(uaddr, npages, write, pages); > + if (npinned != npages) > + goto err; > +As I said I have doubts about the whole approach, but this implementation in particular isn't a good idea as it keeps the page around forever. So no THP, no NUMA rebalancing, userspace-controlled amount of memory locked up and not accounted for. Don't get me wrong it's a great patch in an ideal world. But then in an ideal world no barriers smap etc are necessary at all.> + vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL); > + if (!vaddr) > + goto err; > + > + map->pages = pages; > + map->addr = vaddr + (uaddr & (PAGE_SIZE - 1)); > + map->npages = npages; > + > + return 0; > + > +err: > + if (npinned > 0) > + release_pages(pages, npinned); > + kfree(pages); > + return -EFAULT; > +} > + > +static void vhost_uninit_vmap(struct vhost_vmap *map) > +{ > + if (!map->addr) > + return; > + > + vunmap(map->addr); > + release_pages(map->pages, map->npages); > + kfree(map->pages); > + > + map->addr = NULL; > + map->pages = NULL; > + map->npages = 0; > +} > + > +static void vhost_clean_vmaps(struct vhost_virtqueue *vq) > +{ > + vhost_uninit_vmap(&vq->avail_ring); > + vhost_uninit_vmap(&vq->desc_ring); > + vhost_uninit_vmap(&vq->used_ring); > +} > + > +static int vhost_setup_vmaps(struct vhost_virtqueue *vq, unsigned long avail, > + unsigned long desc, unsigned long used) > +{ > + size_t event = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; > + size_t avail_size, desc_size, used_size; > + int ret; > + > + vhost_clean_vmaps(vq); > + > + avail_size = sizeof(*vq->avail) + > + sizeof(*vq->avail->ring) * vq->num + event; > + ret = vhost_init_vmap(&vq->avail_ring, avail, avail_size, false); > + if (ret) { > + vq_err(vq, "Fail to setup vmap for avail ring!\n"); > + goto err_avail; > + } > + > + desc_size = sizeof(*vq->desc) * vq->num; > + ret = vhost_init_vmap(&vq->desc_ring, desc, desc_size, false); > + if (ret) { > + vq_err(vq, "Fail to setup vmap for desc ring!\n"); > + goto err_desc; > + } > + > + used_size = sizeof(*vq->used) + > + sizeof(*vq->used->ring) * vq->num + event; > + ret = vhost_init_vmap(&vq->used_ring, used, used_size, true); > + if (ret) { > + vq_err(vq, "Fail to setup vmap for used ring!\n"); > + goto err_used; > + } > + > + return 0; > + > +err_used: > + vhost_uninit_vmap(&vq->used_ring); > +err_desc: > + vhost_uninit_vmap(&vq->avail_ring); > +err_avail: > + return -EFAULT; > +} > + > void vhost_dev_cleanup(struct vhost_dev *dev) > { > int i; > @@ -626,6 +725,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev) > if (dev->vqs[i]->call_ctx) > eventfd_ctx_put(dev->vqs[i]->call_ctx); > vhost_vq_reset(dev, dev->vqs[i]); > + vhost_clean_vmaps(dev->vqs[i]); > } > vhost_dev_free_iovecs(dev); > if (dev->log_ctx) > @@ -873,6 +973,14 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, > > static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + *((__virtio16 *)&used->ring[vq->num]) > + cpu_to_vhost16(vq, vq->avail_idx); > + return 0; > + } > + > return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx), > vhost_avail_event(vq)); > } > @@ -881,6 +989,13 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, > struct vring_used_elem *head, int idx, > int count) > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + memcpy(used->ring + idx, head, count * sizeof(*head)); > + return 0; > + } > + > return vhost_copy_to_user(vq, vq->used->ring + idx, head, > count * sizeof(*head)); > } > @@ -888,6 +1003,13 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, > static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) > > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + used->flags = cpu_to_vhost16(vq, vq->used_flags); > + return 0; > + } > + > return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags), > &vq->used->flags); > } > @@ -895,6 +1017,13 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) > static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) > > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + used->idx = cpu_to_vhost16(vq, vq->last_used_idx); > + return 0; > + } > + > return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx), > &vq->used->idx); > } > @@ -926,12 +1055,26 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) > static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, > __virtio16 *idx) > { > + if (!vq->iotlb) { > + struct vring_avail *avail = vq->avail_ring.addr; > + > + *idx = avail->idx; > + return 0; > + } > + > return vhost_get_avail(vq, *idx, &vq->avail->idx); > } > > static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, > __virtio16 *head, int idx) > { > + if (!vq->iotlb) { > + struct vring_avail *avail = vq->avail_ring.addr; > + > + *head = avail->ring[idx & (vq->num - 1)]; > + return 0; > + } > + > return vhost_get_avail(vq, *head, > &vq->avail->ring[idx & (vq->num - 1)]); > } > @@ -939,24 +1082,52 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, > static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, > __virtio16 *flags) > { > + if (!vq->iotlb) { > + struct vring_avail *avail = vq->avail_ring.addr; > + > + *flags = avail->flags; > + return 0; > + } > + > return vhost_get_avail(vq, *flags, &vq->avail->flags); > } > > static inline int vhost_get_used_event(struct vhost_virtqueue *vq, > __virtio16 *event) > { > + if (!vq->iotlb) { > + struct vring_avail *avail = vq->avail_ring.addr; > + > + *event = (__virtio16)avail->ring[vq->num]; > + return 0; > + } > + > return vhost_get_avail(vq, *event, vhost_used_event(vq)); > } > > static inline int vhost_get_used_idx(struct vhost_virtqueue *vq, > __virtio16 *idx) > { > + if (!vq->iotlb) { > + struct vring_used *used = vq->used_ring.addr; > + > + *idx = used->idx; > + return 0; > + } > + > return vhost_get_used(vq, *idx, &vq->used->idx); > } > > static inline int vhost_get_desc(struct vhost_virtqueue *vq, > struct vring_desc *desc, int idx) > { > + if (!vq->iotlb) { > + struct vring_desc *d = vq->desc_ring.addr; > + > + *desc = *(d + idx); > + return 0; > + } > + > return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc)); > } > > @@ -1551,6 +1722,13 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg > } > } > > + if (!vq->iotlb && vhost_setup_vmaps(vq, a.avail_user_addr, > + a.desc_user_addr, > + a.used_user_addr)) { > + r = -EINVAL; > + break; > + } > + > vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); > vq->desc = (void __user *)(unsigned long)a.desc_user_addr; > vq->avail = (void __user *)(unsigned long)a.avail_user_addr; > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index 466ef7542291..89dc0ad3d055 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -80,6 +80,12 @@ enum vhost_uaddr_type { > VHOST_NUM_ADDRS = 3, > }; > > +struct vhost_vmap { > + struct page **pages; > + void *addr; > + int npages; > +}; > + > /* The virtqueue structure describes a queue attached to a device. */ > struct vhost_virtqueue { > struct vhost_dev *dev; > @@ -90,6 +96,11 @@ struct vhost_virtqueue { > struct vring_desc __user *desc; > struct vring_avail __user *avail; > struct vring_used __user *used; > + > + struct vhost_vmap avail_ring; > + struct vhost_vmap desc_ring; > + struct vhost_vmap used_ring; > + > const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; > struct file *kick; > struct eventfd_ctx *call_ctx; > -- > 2.17.1
Michael S. Tsirkin
2018-Dec-13  19:41 UTC
[PATCH net-next 1/3] vhost: generalize adding used elem
On Thu, Dec 13, 2018 at 06:10:20PM +0800, Jason Wang wrote:> Use one generic vhost_copy_to_user() instead of two dedicated > accessor. This will simplify the conversion to fine grain accessors. > > Signed-off-by: Jason Wang <jasowang at redhat.com>The reason we did it like this is because it was faster. Want to try benchmarking before we change it?> --- > drivers/vhost/vhost.c | 11 +---------- > 1 file changed, 1 insertion(+), 10 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 3a5f81a66d34..1c54ec1b82f8 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -2164,16 +2164,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, > > start = vq->last_used_idx & (vq->num - 1); > used = vq->used->ring + start; > - if (count == 1) { > - if (vhost_put_user(vq, heads[0].id, &used->id)) { > - vq_err(vq, "Failed to write used id"); > - return -EFAULT; > - } > - if (vhost_put_user(vq, heads[0].len, &used->len)) { > - vq_err(vq, "Failed to write used len"); > - return -EFAULT; > - } > - } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) { > + if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) { > vq_err(vq, "Failed to write used"); > return -EFAULT; > } > -- > 2.17.1
Michael S. Tsirkin
2018-Dec-13  20:12 UTC
[PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote:> Hi: > > This series tries to access virtqueue metadata through kernel virtual > address instead of copy_user() friends since they had too much > overheads like checks, spec barriers or even hardware feature > toggling. > > Test shows about 24% improvement on TX PPS. It should benefit other > cases as well. > > Please reviewI think the idea of speeding up userspace access is a good one. However I think that moving all checks to start is way too aggressive. Instead, let's batch things up but let's not keep them around forever. Here are some ideas: 1. Disable preemption, process a small number of small packets directly in an atomic context. This should cut latency down significantly, the tricky part is to only do it on a light load and disable this for the streaming case otherwise it's unfair. This might fail, if it does just bounce things out to a thread. 2. Switch to unsafe_put_user/unsafe_get_user, and batch up multiple accesses. 3. Allow adding a fixup point manually, such that multiple independent get_user accesses can get a single fixup (will allow better compiler optimizations).> Jason Wang (3): > vhost: generalize adding used elem > vhost: fine grain userspace memory accessors > vhost: access vq metadata through kernel virtual address > > drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++---- > drivers/vhost/vhost.h | 11 ++ > 2 files changed, 266 insertions(+), 26 deletions(-) > > -- > 2.17.1
Jason Wang
2018-Dec-14  03:42 UTC
[PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
On 2018/12/13 ??11:27, Michael S. Tsirkin wrote:> On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote: >> Hi: >> >> This series tries to access virtqueue metadata through kernel virtual >> address instead of copy_user() friends since they had too much >> overheads like checks, spec barriers or even hardware feature >> toggling. > Userspace accesses through remapping tricks and next time there's a need > for a new barrier we are left to figure it out by ourselves.I don't get here, do you mean spec barriers? It's completely unnecessary for vhost which is kernel thread. And even if you're right, vhost is not the only place, there's lots of vmap() based accessing in kernel. Think in another direction, this means we won't suffer form unnecessary barriers for kthread like vhost in the future, we will manually pick the one we really need (but it should have little possibility). Please notice we only access metdata through remapping not the data itself. This idea has been used for high speed userspace backend for years, e.g packet socket or recent AF_XDP. The only difference is the page was remap to from kernel to userspace.> I don't > like the idea I have to say. As a first step, why don't we switch to > unsafe_put_user/unsafe_get_user etc?Several reasons: - They only have x86 variant, it won't have any difference for the rest of architecture. - unsafe_put_user/unsafe_get_user is not sufficient for accessing structures (e.g accessing descriptor) or arrays (batching). - Unless we can batch at least the accessing of two places in three of avail, used and descriptor in one run. There will be no difference. E.g we can batch updating used ring, but it won't make any difference in this case.> That would be more of an apples to apples comparison, would it not?Apples to apples comparison only help if we are the No.1. But the fact is we are not. If we want to compete with e.g dpdk or AF_XDP, vmap() is the fastest method AFAIK. Thanks> > >> Test shows about 24% improvement on TX PPS. It should benefit other >> cases as well. >> >> Please review >> >> Jason Wang (3): >> vhost: generalize adding used elem >> vhost: fine grain userspace memory accessors >> vhost: access vq metadata through kernel virtual address >> >> drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++---- >> drivers/vhost/vhost.h | 11 ++ >> 2 files changed, 266 insertions(+), 26 deletions(-) >> >> -- >> 2.17.1
Jason Wang
2018-Dec-14  04:29 UTC
[PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
On 2018/12/14 ??4:12, Michael S. Tsirkin wrote:> On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote: >> Hi: >> >> This series tries to access virtqueue metadata through kernel virtual >> address instead of copy_user() friends since they had too much >> overheads like checks, spec barriers or even hardware feature >> toggling. >> >> Test shows about 24% improvement on TX PPS. It should benefit other >> cases as well. >> >> Please review > I think the idea of speeding up userspace access is a good one. > However I think that moving all checks to start is way too aggressive.So did packet and AF_XDP. Anyway, sharing address space and access them directly is the fastest way. Performance is the major consideration for people to choose backend. Compare to userspace implementation, vhost does not have security advantages at any level. If vhost is still slow, people will start to develop backends based on e.g AF_XDP.> Instead, let's batch things up but let's not keep them > around forever. > Here are some ideas: > > > 1. Disable preemption, process a small number of small packets > directly in an atomic context. This should cut latency > down significantly, the tricky part is to only do it > on a light load and disable this > for the streaming case otherwise it's unfair. > This might fail, if it does just bounce things out to > a thread.I'm not sure what context you meant here. Is this for TX path of TUN? But a fundamental difference is my series is targeted for extreme heavy load not light one, 100% cpu for vhost is expected.> > 2. Switch to unsafe_put_user/unsafe_get_user, > and batch up multiple accesses.As I said, unless we can batch accessing of two difference places of three of avail, descriptor and used. It won't help for batching the accessing of a single place like used. I'm even not sure this can be done consider the case of packed virtqueue, we have a single descriptor ring. Batching through unsafe helpers may not help in this case since it's equivalent to safe ones . And This requires non trivial refactoring of vhost. And such refactoring itself make give us noticeable impact (e.g it may lead regression).> > 3. Allow adding a fixup point manually, > such that multiple independent get_user accesses > can get a single fixup (will allow better compiler > optimizations). >So for metadata access, I don't see how you suggest here can help in the case of heavy workload. For data access, this may help but I've played to batch the data copy to reduce SMAP/spec barriers in vhost-net but I don't see performance improvement. Thanks> > > >> Jason Wang (3): >> vhost: generalize adding used elem >> vhost: fine grain userspace memory accessors >> vhost: access vq metadata through kernel virtual address >> >> drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++---- >> drivers/vhost/vhost.h | 11 ++ >> 2 files changed, 266 insertions(+), 26 deletions(-) >> >> -- >> 2.17.1
kbuild test robot
2018-Dec-14  14:48 UTC
[PATCH net-next 3/3] vhost: access vq metadata through kernel virtual address
Hi Jason,
I love your patch! Yet something to improve:
[auto build test ERROR on net-next/master]
url:   
https://github.com/0day-ci/linux/commits/Jason-Wang/vhost-accelerate-metadata-access-through-vmap/20181214-200417
config: mips-malta_kvm_defconfig (attached as .config)
compiler: mipsel-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O
~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=mips 
All errors (new ones prefixed by >>):
   drivers//vhost/vhost.c: In function
'vhost_init_vmap':>> drivers//vhost/vhost.c:648:3: error: implicit declaration of function
'release_pages'; did you mean 'release_task'?
[-Werror=implicit-function-declaration]
      release_pages(pages, npinned);
      ^~~~~~~~~~~~~
      release_task
   cc1: some warnings being treated as errors
vim +648 drivers//vhost/vhost.c
   619	
   620	static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
   621				   size_t size, int write)
   622	{
   623		struct page **pages;
   624		int npages = DIV_ROUND_UP(size, PAGE_SIZE);
   625		int npinned;
   626		void *vaddr;
   627	
   628		pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
   629		if (!pages)
   630			return -ENOMEM;
   631	
   632		npinned = get_user_pages_fast(uaddr, npages, write, pages);
   633		if (npinned != npages)
   634			goto err;
   635	
   636		vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
   637		if (!vaddr)
   638			goto err;
   639	
   640		map->pages = pages;
   641		map->addr = vaddr + (uaddr & (PAGE_SIZE - 1));
   642		map->npages = npages;
   643	
   644		return 0;
   645	
   646	err:
   647		if (npinned > 0)
 > 648			release_pages(pages, npinned);
   649		kfree(pages);
   650		return -EFAULT;
   651	}
   652	
---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
-------------- next part --------------
A non-text attachment was scrubbed...
Name: .config.gz
Type: application/gzip
Size: 19468 bytes
Desc: not available
URL:
<http://lists.linuxfoundation.org/pipermail/virtualization/attachments/20181214/1c4c947c/attachment-0001.bin>
Michael S. Tsirkin
2018-Dec-14  15:16 UTC
[PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote:> Hi: > > This series tries to access virtqueue metadata through kernel virtual > address instead of copy_user() friends since they had too much > overheads like checks, spec barriers or even hardware feature > toggling. > > Test shows about 24% improvement on TX PPS. It should benefit other > cases as well.BTW if the issue is all the error checking, maybe we should consider using something like uaccess_catch and check in a single place.> Please review > > Jason Wang (3): > vhost: generalize adding used elem > vhost: fine grain userspace memory accessors > vhost: access vq metadata through kernel virtual address > > drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++---- > drivers/vhost/vhost.h | 11 ++ > 2 files changed, 266 insertions(+), 26 deletions(-) > > -- > 2.17.1
Seemingly Similar Threads
- [PATCH net-next 1/3] vhost: generalize adding used elem
- [RFC PATCH V3 1/5] vhost: generalize adding used elem
- [RFC PATCH V2 2/5] vhost: fine grain userspace memory accessors
- [RFC PATCH V2 2/5] vhost: fine grain userspace memory accessors
- [RFC PATCH V2 2/5] vhost: fine grain userspace memory accessors