Jason Wang
2016-Jan-18 02:42 UTC
[PATCH RFC] vhost: convert pre sorted vhost memory array to interval tree
Current pre-sorted memory region array has some limitations for future
device IOTLB conversion:

1) need extra work for adding and removing a single region, and it's
   expected to be slow because of sorting or memory re-allocation.
2) need extra work for removing a large range which may intersect
   several regions of different sizes.
3) need tricks for a replacement policy like LRU

To overcome the above shortcomings, this patch converts it to an
interval tree, which can easily address the above issues with almost no
extra work.

The patch could be used for:

- Extending the current API so that userspace only sends diffs of the
  memory table.
- Simplifying the Device IOTLB implementation.

Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/vhost/net.c   |   8 +--
 drivers/vhost/vhost.c | 179 +++++++++++++++++++++++++++-----------------------
 drivers/vhost/vhost.h |  27 ++++++--
 3 files changed, 125 insertions(+), 89 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9eda69e..481db96 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -968,20 +968,20 @@ static long vhost_net_reset_owner(struct vhost_net *n)
 	struct socket *tx_sock = NULL;
 	struct socket *rx_sock = NULL;
 	long err;
-	struct vhost_memory *memory;
+	struct vhost_umem *umem;
 
 	mutex_lock(&n->dev.mutex);
 	err = vhost_dev_check_owner(&n->dev);
 	if (err)
 		goto done;
-	memory = vhost_dev_reset_owner_prepare();
-	if (!memory) {
+	umem = vhost_dev_reset_owner_prepare();
+	if (!umem) {
 		err = -ENOMEM;
 		goto done;
 	}
 	vhost_net_stop(n, &tx_sock, &rx_sock);
 	vhost_net_flush(n);
-	vhost_dev_reset_owner(&n->dev, memory);
+	vhost_dev_reset_owner(&n->dev, umem);
 	vhost_net_vq_reset(n);
 done:
 	mutex_unlock(&n->dev.mutex);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ad2146a..851dce8 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -27,6 +27,7 @@
 #include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sort.h>
+#include <linux/interval_tree_generic.h>
 
 #include "vhost.h"
 
@@ -42,6 +43,10 @@ enum {
 #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
 
+INTERVAL_TREE_DEFINE(struct vhost_umem_node,
+		     rb, __u64, __subtree_last,
+		     START, LAST, , vhost_umem_interval_tree);
+
 #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
 static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
 {
@@ -275,7 +280,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
-	vq->memory = NULL;
+	vq->umem = NULL;
 	vq->is_le = virtio_legacy_is_little_endian();
 	vhost_vq_reset_user_be(vq);
 }
@@ -381,7 +386,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	mutex_init(&dev->mutex);
 	dev->log_ctx = NULL;
 	dev->log_file = NULL;
-	dev->memory = NULL;
+	dev->umem = NULL;
 	dev->mm = NULL;
 	spin_lock_init(&dev->work_lock);
 	INIT_LIST_HEAD(&dev->work_list);
@@ -486,27 +491,36 @@ err_mm:
 }
 EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
 
-struct vhost_memory *vhost_dev_reset_owner_prepare(void)
+static void *vhost_kvzalloc(unsigned long size)
+{
+	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+
+	if (!n)
+		n = vzalloc(size);
+	return n;
+}
+
+struct vhost_umem *vhost_dev_reset_owner_prepare(void)
 {
-	return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
+	return vhost_kvzalloc(sizeof(struct vhost_umem));
 }
 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
 
 /* Caller should have device mutex */
-void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory)
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
 {
 	int i;
 
 	vhost_dev_cleanup(dev, true);
 
 	/* Restore memory to default empty mapping. */
-	memory->nregions = 0;
-	dev->memory = memory;
+	INIT_LIST_HEAD(&umem->umem_list);
+	dev->umem = umem;
 	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
 	 * VQs aren't running.
 	 */
 	for (i = 0; i < dev->nvqs; ++i)
-		dev->vqs[i]->memory = memory;
+		dev->vqs[i]->umem = umem;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
 
@@ -523,6 +537,19 @@ void vhost_dev_stop(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_stop);
 
+static void vhost_umem_clean(struct vhost_dev *dev)
+{
+	struct vhost_umem_node *node, *tmp;
+	struct vhost_umem *umem = dev->umem;
+
+	list_for_each_entry_safe(node, tmp, &umem->umem_list, link) {
+		vhost_umem_interval_tree_remove(node, &umem->umem_tree);
+		list_del(&node->link);
+		kvfree(node);
+	}
+	kvfree(umem);
+}
+
 /* Caller should have device mutex if and only if locked is set */
 void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 {
@@ -549,8 +576,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
 		fput(dev->log_file);
 	dev->log_file = NULL;
 	/* No one will access memory at this point */
-	kvfree(dev->memory);
-	dev->memory = NULL;
+	vhost_umem_clean(dev);
+	dev->umem = NULL;
 	WARN_ON(!list_empty(&dev->work_list));
 	if (dev->worker) {
 		kthread_stop(dev->worker);
@@ -576,25 +603,25 @@ static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 }
 
 /* Caller should have vq mutex and device mutex. */
-static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
+static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
 			       int log_all)
 {
-	int i;
+	struct vhost_umem_node *node;
 
-	if (!mem)
+	if (!umem)
 		return 0;
 
-	for (i = 0; i < mem->nregions; ++i) {
-		struct vhost_memory_region *m = mem->regions + i;
-		unsigned long a = m->userspace_addr;
-		if (m->memory_size > ULONG_MAX)
+	list_for_each_entry(node, &umem->umem_list, link) {
+		unsigned long a = node->userspace_addr;
+
+		if (node->size > ULONG_MAX)
 			return 0;
 		else if (!access_ok(VERIFY_WRITE, (void __user *)a,
-				    m->memory_size))
+				    node->size))
 			return 0;
 		else if (log_all && !log_access_ok(log_base,
-						   m->guest_phys_addr,
-						   m->memory_size))
+						   node->start,
+						   node->size))
 			return 0;
 	}
 	return 1;
@@ -602,7 +629,7 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
 
 /* Can we switch to this memory table? */
 /* Caller should have device mutex but not vq mutex */
-static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
+static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
 			    int log_all)
 {
 	int i;
@@ -615,7 +642,8 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
 		log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
 		/* If ring is inactive, will check when it's enabled. */
 		if (d->vqs[i]->private_data)
-			ok = vq_memory_access_ok(d->vqs[i]->log_base, mem, log);
+			ok = vq_memory_access_ok(d->vqs[i]->log_base,
+						 umem, log);
 		else
 			ok = 1;
 		mutex_unlock(&d->vqs[i]->mutex);
@@ -642,7 +670,7 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 /* Caller should have device mutex but not vq mutex */
 int vhost_log_access_ok(struct vhost_dev *dev)
 {
-	return memory_access_ok(dev, dev->memory, 1);
+	return memory_access_ok(dev, dev->umem, 1);
 }
 EXPORT_SYMBOL_GPL(vhost_log_access_ok);
 
@@ -653,7 +681,7 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
 {
 	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 
-	return vq_memory_access_ok(log_base, vq->memory,
+	return vq_memory_access_ok(log_base, vq->umem,
 				   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
 		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
 					sizeof *vq->used +
@@ -669,28 +697,12 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
 
-static int vhost_memory_reg_sort_cmp(const void *p1, const void *p2)
-{
-	const struct vhost_memory_region *r1 = p1, *r2 = p2;
-	if (r1->guest_phys_addr < r2->guest_phys_addr)
-		return 1;
-	if (r1->guest_phys_addr > r2->guest_phys_addr)
-		return -1;
-	return 0;
-}
-
-static void *vhost_kvzalloc(unsigned long size)
-{
-	void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
-
-	if (!n)
-		n = vzalloc(size);
-	return n;
-}
-
 static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 {
-	struct vhost_memory mem, *newmem, *oldmem;
+	struct vhost_memory mem, *newmem;
+	struct vhost_memory_region *region;
+	struct vhost_umem_node *node;
+	struct vhost_umem *newumem, *oldumem;
 	unsigned long size = offsetof(struct vhost_memory, regions);
 	int i;
 
@@ -710,24 +722,51 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 		kvfree(newmem);
 		return -EFAULT;
 	}
-	sort(newmem->regions, newmem->nregions, sizeof(*newmem->regions),
-		vhost_memory_reg_sort_cmp, NULL);
 
-	if (!memory_access_ok(d, newmem, 0)) {
+	newumem = vhost_kvzalloc(sizeof(*newumem));
+	if (!newumem) {
 		kvfree(newmem);
-		return -EFAULT;
+		return -ENOMEM;
+	}
+
+	newumem->umem_tree = RB_ROOT;
+	INIT_LIST_HEAD(&newumem->umem_list);
+
+	for (region = newmem->regions;
+	     region < newmem->regions + mem.nregions;
+	     region++) {
+		node = vhost_kvzalloc(sizeof(*node));
+		if (!node)
+			goto err;
+		node->start = region->guest_phys_addr;
+		node->size = region->memory_size;
+		node->last = node->start + node->size - 1;
+		node->userspace_addr = region->userspace_addr;
+		INIT_LIST_HEAD(&node->link);
+		list_add_tail(&node->link, &newumem->umem_list);
+		vhost_umem_interval_tree_insert(node, &newumem->umem_tree);
 	}
-	oldmem = d->memory;
-	d->memory = newmem;
+
+	if (!memory_access_ok(d, newumem, 0))
+		goto err;
+
+	oldumem = d->umem;
+	d->umem = newumem;
 
 	/* All memory accesses are done under some VQ mutex. */
 	for (i = 0; i < d->nvqs; ++i) {
 		mutex_lock(&d->vqs[i]->mutex);
-		d->vqs[i]->memory = newmem;
+		d->vqs[i]->umem = newumem;
 		mutex_unlock(&d->vqs[i]->mutex);
 	}
-	kvfree(oldmem);
+
+	kvfree(newmem);
 	return 0;
+
+err:
+	vhost_umem_clean(d);
+	kvfree(newmem);
+	return -EFAULT;
 }
 
 long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
@@ -1017,28 +1056,6 @@ done:
 }
 EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
 
-static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
-						     __u64 addr, __u32 len)
-{
-	const struct vhost_memory_region *reg;
-	int start = 0, end = mem->nregions;
-
-	while (start < end) {
-		int slot = start + (end - start) / 2;
-		reg = mem->regions + slot;
-		if (addr >= reg->guest_phys_addr)
-			end = slot;
-		else
-			start = slot + 1;
-	}
-
-	reg = mem->regions + start;
-	if (addr >= reg->guest_phys_addr &&
-	    reg->guest_phys_addr + reg->memory_size > addr)
-		return reg;
-	return NULL;
-}
-
 /* TODO: This is really inefficient. We need something like get_user()
  * (instruction directly accesses the data, with an exception table entry
  * returning -EFAULT). See Documentation/x86/exception-tables.txt.
@@ -1180,29 +1197,29 @@ EXPORT_SYMBOL_GPL(vhost_init_used);
 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 			  struct iovec iov[], int iov_size)
 {
-	const struct vhost_memory_region *reg;
-	struct vhost_memory *mem;
+	const struct vhost_umem_node *node;
+	struct vhost_umem *umem = vq->umem;
 	struct iovec *_iov;
 	u64 s = 0;
 	int ret = 0;
 
-	mem = vq->memory;
 	while ((u64)len > s) {
 		u64 size;
 		if (unlikely(ret >= iov_size)) {
 			ret = -ENOBUFS;
 			break;
 		}
-		reg = find_region(mem, addr, len);
-		if (unlikely(!reg)) {
+		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
+							   addr, addr + len - 1);
+		if (node == NULL || node->start > addr) {
 			ret = -EFAULT;
 			break;
 		}
 		_iov = iov + ret;
-		size = reg->memory_size - addr + reg->guest_phys_addr;
+		size = node->size - addr + node->start;
 		_iov->iov_len = min((u64)len - s, size);
 		_iov->iov_base = (void __user *)(unsigned long)
-			(reg->userspace_addr + addr - reg->guest_phys_addr);
+			(node->userspace_addr + addr - node->start);
 		s += size;
 		addr += size;
 		++ret;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d3f7674..5d64393 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -52,6 +52,25 @@ struct vhost_log {
 	u64 len;
 };
 
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+
+struct vhost_umem_node {
+	struct rb_node rb;
+	struct list_head link;
+	__u64 start;
+	__u64 last;
+	__u64 size;
+	__u64 userspace_addr;
+	__u64 flags_padding;
+	__u64 __subtree_last;
+};
+
+struct vhost_umem {
+	struct rb_root umem_tree;
+	struct list_head umem_list;
+};
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -100,7 +119,7 @@ struct vhost_virtqueue {
 	struct iovec *indirect;
 	struct vring_used_elem *heads;
 	/* Protected by virtqueue mutex. */
-	struct vhost_memory *memory;
+	struct vhost_umem *umem;
 	void *private_data;
 	u64 acked_features;
 	/* Log write descriptors */
@@ -117,7 +136,6 @@ struct vhost_virtqueue {
 };
 
 struct vhost_dev {
-	struct vhost_memory *memory;
 	struct mm_struct *mm;
 	struct mutex mutex;
 	struct vhost_virtqueue **vqs;
@@ -127,14 +145,15 @@ struct vhost_dev {
 	spinlock_t work_lock;
 	struct list_head work_list;
 	struct task_struct *worker;
+	struct vhost_umem *umem;
 };
 
 void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
 long vhost_dev_set_owner(struct vhost_dev *dev);
 bool vhost_dev_has_owner(struct vhost_dev *dev);
 long vhost_dev_check_owner(struct vhost_dev *);
-struct vhost_memory *vhost_dev_reset_owner_prepare(void);
-void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_memory *);
+struct vhost_umem *vhost_dev_reset_owner_prepare(void);
+void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
 void vhost_dev_cleanup(struct vhost_dev *, bool locked);
 void vhost_dev_stop(struct vhost_dev *);
 long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
-- 
2.5.0
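For readers who have not used the kernel's generic interval-tree helpers, the sketch below shows the lookup pattern that the INTERVAL_TREE_DEFINE() invocation above generates and that translate_desc() relies on. It is an illustration only; the wrapper name vhost_umem_find() is hypothetical and is not added by the patch.

/*
 * Illustration only (hypothetical helper, not part of the patch above):
 * the generated iterator returns the first node whose [start, last]
 * interval overlaps [addr, addr + len - 1].  The extra node->start check
 * mirrors translate_desc(): an overlapping node may still begin above
 * addr, in which case the head of the requested range is unmapped.
 */
static const struct vhost_umem_node *
vhost_umem_find(struct vhost_umem *umem, __u64 addr, __u32 len)
{
	const struct vhost_umem_node *node;

	node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
						   addr, addr + len - 1);
	if (node == NULL || node->start > addr)
		return NULL;	/* guest physical address not covered */
	return node;
}

Insertion and removal use the matching vhost_umem_interval_tree_insert() and vhost_umem_interval_tree_remove() calls, as seen in vhost_set_memory() and vhost_umem_clean() in the patch.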
Igor Mammedov
2016-Jan-19 15:24 UTC
[PATCH RFC] vhost: convert pre sorted vhost memory array to interval tree
On Mon, 18 Jan 2016 10:42:29 +0800
Jason Wang <jasowang at redhat.com> wrote:

> Current pre-sorted memory region array has some limitations for future
> device IOTLB conversion:
>
> 1) need extra work for adding and removing a single region, and it's
>    expected to be slow because of sorting or memory re-allocation.
> 2) need extra work for removing a large range which may intersect
>    several regions of different sizes.
> 3) need tricks for a replacement policy like LRU
>
> To overcome the above shortcomings, this patch converts it to an
> interval tree, which can easily address the above issues with almost no
> extra work.
>
> The patch could be used for:
>
> - Extending the current API so that userspace only sends diffs of the
>   memory table.
> - Simplifying the Device IOTLB implementation.

I'm curious how performance changes on the translate_desc() hot path in
the default case and in the case of 256 memory regions?

> Signed-off-by: Jason Wang <jasowang at redhat.com>
[remainder of the quoted patch trimmed]
Jason Wang
2016-Jan-20 02:50 UTC
[PATCH RFC] vhost: convert pre sorted vhost memory array to interval tree
On 01/19/2016 11:24 PM, Igor Mammedov wrote:
> On Mon, 18 Jan 2016 10:42:29 +0800
> Jason Wang <jasowang at redhat.com> wrote:
>
>> Current pre-sorted memory region array has some limitations for future
>> device IOTLB conversion:
>>
>> 1) need extra work for adding and removing a single region, and it's
>>    expected to be slow because of sorting or memory re-allocation.
>> 2) need extra work for removing a large range which may intersect
>>    several regions of different sizes.
>> 3) need tricks for a replacement policy like LRU
>>
>> To overcome the above shortcomings, this patch converts it to an
>> interval tree, which can easily address the above issues with almost no
>> extra work.
>>
>> The patch could be used for:
>>
>> - Extending the current API so that userspace only sends diffs of the
>>   memory table.
>> - Simplifying the Device IOTLB implementation.
>
> I'm curious how performance changes on the translate_desc() hot path in
> the default case and in the case of 256 memory regions?

Haven't measured this, but since both methods are O(log N) there should
be no noticeable difference. Will measure it in the future.

The main user is the device IOTLB API [1], which:

- has thousands of userspace memory region sections
- may dynamically add or remove one or several of the regions
- needs a replacement algorithm like LRU

[1] https://lkml.org/lkml/2015/12/31/16
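To make the IOTLB argument above concrete, here is a hedged sketch of what a range invalidation could look like on top of this data structure. The helper name vhost_umem_remove_range() and the absence of locking are assumptions for illustration; nothing like it is part of the posted patch.

/*
 * Hypothetical sketch (not in the posted patch): drop every mapping that
 * intersects [start, last].  With the interval tree this is just
 * iterate-and-remove; with the pre-sorted array it would mean splitting
 * regions, re-sorting and re-allocating (shortcoming 2 in the commit log).
 * Any locking is assumed to be handled by the caller.
 */
static void vhost_umem_remove_range(struct vhost_umem *umem,
				    __u64 start, __u64 last)
{
	struct vhost_umem_node *node;

	while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
							   start, last))) {
		vhost_umem_interval_tree_remove(node, &umem->umem_tree);
		list_del(&node->link);
		kvfree(node);
	}
}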