Hi all: This series try to fix several issues introduced by meta data accelreation series. Please review. Jason Wang (6): vhost: don't set uaddr for invalid address vhost: validate MMU notifier registration vhost: fix vhost map leak vhost: reset invalidate_count in vhost_set_vring_num_addr() vhost: mark dirty pages during map uninit vhost: don't do synchronize_rcu() in vhost_uninit_vq_maps() drivers/vhost/vhost.c | 56 +++++++++++++++++++++++++++++++------------ drivers/vhost/vhost.h | 1 + 2 files changed, 42 insertions(+), 15 deletions(-) -- 2.18.1
We should not setup uaddr for the invalid address, otherwise we may try to pin or prefetch mapping of wrong pages. Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index dc9301d31f12..34c0d970bcbc 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2083,7 +2083,8 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, } #if VHOST_ARCH_CAN_ACCEL_UACCESS - vhost_setup_vq_uaddr(vq); + if (r == 0) + vhost_setup_vq_uaddr(vq); if (d->mm) mmu_notifier_register(&d->mmu_notifier, d->mm); -- 2.18.1
The return value of mmu_notifier_register() is not checked in vhost_vring_set_num_addr(). This will cause an out of sync between mm and MMU notifier thus a double free. To solve this, introduce a boolean flag to track whether MMU notifier is registered and only do unregistering when it was true. Reported-and-tested-by: syzbot+e58112d71f77113ddb7b at syzkaller.appspotmail.com Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 19 +++++++++++++++---- drivers/vhost/vhost.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 34c0d970bcbc..058191d5efad 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -630,6 +630,7 @@ void vhost_dev_init(struct vhost_dev *dev, dev->iov_limit = iov_limit; dev->weight = weight; dev->byte_weight = byte_weight; + dev->has_notifier = false; init_llist_head(&dev->work_list); init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); @@ -731,6 +732,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) if (err) goto err_mmu_notifier; #endif + dev->has_notifier = true; return 0; @@ -960,7 +962,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev) } if (dev->mm) { #if VHOST_ARCH_CAN_ACCEL_UACCESS - mmu_notifier_unregister(&dev->mmu_notifier, dev->mm); + if (dev->has_notifier) { + mmu_notifier_unregister(&dev->mmu_notifier, + dev->mm); + dev->has_notifier = false; + } #endif mmput(dev->mm); } @@ -2065,8 +2071,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, /* Unregister MMU notifer to allow invalidation callback * can access vq->uaddrs[] without holding a lock. */ - if (d->mm) + if (d->has_notifier) { mmu_notifier_unregister(&d->mmu_notifier, d->mm); + d->has_notifier = false; + } vhost_uninit_vq_maps(vq); #endif @@ -2086,8 +2094,11 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, if (r == 0) vhost_setup_vq_uaddr(vq); - if (d->mm) - mmu_notifier_register(&d->mmu_notifier, d->mm); + if (d->mm) { + r = mmu_notifier_register(&d->mmu_notifier, d->mm); + if (!r) + d->has_notifier = true; + } #endif mutex_unlock(&vq->mutex); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 819296332913..a62f56a4cf72 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -214,6 +214,7 @@ struct vhost_dev { int iov_limit; int weight; int byte_weight; + bool has_notifier; }; bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); -- 2.18.1
We don't free map during vhost_map_unprefetch(). This means it could be leaked. Fixing by free the map. Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 058191d5efad..03666b702498 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -303,9 +303,7 @@ static void vhost_vq_meta_reset(struct vhost_dev *d) static void vhost_map_unprefetch(struct vhost_map *map) { kfree(map->pages); - map->pages = NULL; - map->npages = 0; - map->addr = NULL; + kfree(map); } static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) -- 2.18.1
Jason Wang
2019-Jul-23 07:57 UTC
[PATCH 4/6] vhost: reset invalidate_count in vhost_set_vring_num_addr()
The vhost_set_vring_num_addr() could be called in the middle of invalidate_range_start() and invalidate_range_end(). If we don't reset invalidate_count after the un-registering of MMU notifier, the invalidate_cont will run out of sync (e.g never reach zero). This will in fact disable the fast accessor path. Fixing by reset the count to zero. Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 03666b702498..89c9f08b5146 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2074,6 +2074,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, d->has_notifier = false; } + /* reset invalidate_count in case we are in the middle of + * invalidate_start() and invalidate_end(). + */ + vq->invalidate_count = 0; vhost_uninit_vq_maps(vq); #endif -- 2.18.1
We don't mark dirty pages if the map was teared down outside MMU notifier. This will lead untracked dirty pages. Fixing by marking dirty pages during map uninit. Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 89c9f08b5146..5b8821d00fe4 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -306,6 +306,18 @@ static void vhost_map_unprefetch(struct vhost_map *map) kfree(map); } +static void vhost_set_map_dirty(struct vhost_virtqueue *vq, + struct vhost_map *map, int index) +{ + struct vhost_uaddr *uaddr = &vq->uaddrs[index]; + int i; + + if (uaddr->write) { + for (i = 0; i < map->npages; i++) + set_page_dirty(map->pages[i]); + } +} + static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) { struct vhost_map *map[VHOST_NUM_ADDRS]; @@ -315,8 +327,10 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) for (i = 0; i < VHOST_NUM_ADDRS; i++) { map[i] = rcu_dereference_protected(vq->maps[i], lockdep_is_held(&vq->mmu_lock)); - if (map[i]) + if (map[i]) { + vhost_set_map_dirty(vq, map[i], i); rcu_assign_pointer(vq->maps[i], NULL); + } } spin_unlock(&vq->mmu_lock); @@ -354,7 +368,6 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, { struct vhost_uaddr *uaddr = &vq->uaddrs[index]; struct vhost_map *map; - int i; if (!vhost_map_range_overlap(uaddr, start, end)) return; @@ -365,10 +378,7 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, map = rcu_dereference_protected(vq->maps[index], lockdep_is_held(&vq->mmu_lock)); if (map) { - if (uaddr->write) { - for (i = 0; i < map->npages; i++) - set_page_dirty(map->pages[i]); - } + vhost_set_map_dirty(vq, map, index); rcu_assign_pointer(vq->maps[index], NULL); } spin_unlock(&vq->mmu_lock); -- 2.18.1
Jason Wang
2019-Jul-23 07:57 UTC
[PATCH 6/6] vhost: don't do synchronize_rcu() in vhost_uninit_vq_maps()
There's no need for RCU synchronization in vhost_uninit_vq_maps() since we've already serialized with readers (memory accessors). This also avoid the possible userspace DOS through ioctl() because of the possible high latency caused by synchronize_rcu(). Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5b8821d00fe4..a17df1f4069a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -334,7 +334,9 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) } spin_unlock(&vq->mmu_lock); - synchronize_rcu(); + /* No need for synchronize_rcu() or kfree_rcu() since we are + * serialized with memory accessors (e.g vq mutex held). + */ for (i = 0; i < VHOST_NUM_ADDRS; i++) if (map[i]) -- 2.18.1
Michael S. Tsirkin
2019-Jul-23 09:16 UTC
[PATCH 6/6] vhost: don't do synchronize_rcu() in vhost_uninit_vq_maps()
On Tue, Jul 23, 2019 at 03:57:18AM -0400, Jason Wang wrote:> There's no need for RCU synchronization in vhost_uninit_vq_maps() > since we've already serialized with readers (memory accessors). This > also avoid the possible userspace DOS through ioctl() because of the > possible high latency caused by synchronize_rcu(). > > Reported-by: Michael S. Tsirkin <mst at redhat.com> > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com>I agree synchronize_rcu in both mmu notifiers and ioctl is a problem we must fix.> --- > drivers/vhost/vhost.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 5b8821d00fe4..a17df1f4069a 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -334,7 +334,9 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) > } > spin_unlock(&vq->mmu_lock); > > - synchronize_rcu(); > + /* No need for synchronize_rcu() or kfree_rcu() since we are > + * serialized with memory accessors (e.g vq mutex held). > + */ > > for (i = 0; i < VHOST_NUM_ADDRS; i++) > if (map[i]) > -- > 2.18.1.. however we can not RCU with no synchronization in sight. Sometimes there are hacks like using a lock/unlock pair instead of sync, but here no one bothers. specifically notifiers call reset vq maps which calls uninit vq maps which is not under any lock. You will get use after free when map is then accessed. If you always have a lock then just take that lock and no need for RCU. -- MST
Michael S. Tsirkin
2019-Jul-23 09:17 UTC
[PATCH 5/6] vhost: mark dirty pages during map uninit
On Tue, Jul 23, 2019 at 03:57:17AM -0400, Jason Wang wrote:> We don't mark dirty pages if the map was teared down outside MMU > notifier. This will lead untracked dirty pages. Fixing by marking > dirty pages during map uninit. > > Reported-by: Michael S. Tsirkin <mst at redhat.com> > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/vhost.c | 22 ++++++++++++++++------ > 1 file changed, 16 insertions(+), 6 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 89c9f08b5146..5b8821d00fe4 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -306,6 +306,18 @@ static void vhost_map_unprefetch(struct vhost_map *map) > kfree(map); > } > > +static void vhost_set_map_dirty(struct vhost_virtqueue *vq, > + struct vhost_map *map, int index) > +{ > + struct vhost_uaddr *uaddr = &vq->uaddrs[index]; > + int i; > + > + if (uaddr->write) { > + for (i = 0; i < map->npages; i++) > + set_page_dirty(map->pages[i]); > + } > +} > + > static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) > { > struct vhost_map *map[VHOST_NUM_ADDRS]; > @@ -315,8 +327,10 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) > for (i = 0; i < VHOST_NUM_ADDRS; i++) { > map[i] = rcu_dereference_protected(vq->maps[i], > lockdep_is_held(&vq->mmu_lock)); > - if (map[i]) > + if (map[i]) { > + vhost_set_map_dirty(vq, map[i], i); > rcu_assign_pointer(vq->maps[i], NULL); > + } > } > spin_unlock(&vq->mmu_lock); > > @@ -354,7 +368,6 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > { > struct vhost_uaddr *uaddr = &vq->uaddrs[index]; > struct vhost_map *map; > - int i; > > if (!vhost_map_range_overlap(uaddr, start, end)) > return; > @@ -365,10 +378,7 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > map = rcu_dereference_protected(vq->maps[index], > lockdep_is_held(&vq->mmu_lock)); > if (map) { > - if (uaddr->write) { > - for (i = 0; i < map->npages; i++) > - set_page_dirty(map->pages[i]); > - } > + vhost_set_map_dirty(vq, map, index); > rcu_assign_pointer(vq->maps[index], NULL); > } > spin_unlock(&vq->mmu_lock);OK and the reason it's safe is because the invalidate counter got incremented so we know page will not get mapped again. But we *do* need to wait for page not to be mapped. And if that means waiting for VQ processing to finish, then I worry that is a very log time.> -- > 2.18.1
Michael S. Tsirkin
2019-Jul-23 09:17 UTC
[PATCH 4/6] vhost: reset invalidate_count in vhost_set_vring_num_addr()
On Tue, Jul 23, 2019 at 03:57:16AM -0400, Jason Wang wrote:> The vhost_set_vring_num_addr() could be called in the middle of > invalidate_range_start() and invalidate_range_end(). If we don't reset > invalidate_count after the un-registering of MMU notifier, the > invalidate_cont will run out of sync (e.g never reach zero). This will > in fact disable the fast accessor path. Fixing by reset the count to > zero. > > Reported-by: Michael S. Tsirkin <mst at redhat.com> > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/vhost.c | 4 ++++ > 1 file changed, 4 insertions(+) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 03666b702498..89c9f08b5146 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -2074,6 +2074,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, > d->has_notifier = false; > } > > + /* reset invalidate_count in case we are in the middle of > + * invalidate_start() and invalidate_end(). > + */ > + vq->invalidate_count = 0;I think that the code is ok but the comments are not very clear: - we are never in the middle since we just removed the notifier - the result is not just disabling optimization: if notifier becomes negative, then later we can think it's ok to map when it isn't since notifier is active.> vhost_uninit_vq_maps(vq); > #endif > > -- > 2.18.1
Michael S. Tsirkin
2019-Jul-23 09:17 UTC
[PATCH 2/6] vhost: validate MMU notifier registration
On Tue, Jul 23, 2019 at 03:57:14AM -0400, Jason Wang wrote:> The return value of mmu_notifier_register() is not checked in > vhost_vring_set_num_addr(). This will cause an out of sync between mm > and MMU notifier thus a double free. To solve this, introduce a > boolean flag to track whether MMU notifier is registered and only do > unregistering when it was true. > > Reported-and-tested-by: > syzbot+e58112d71f77113ddb7b at syzkaller.appspotmail.com > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com>Right. This fixes the bug. But it's not great that simple things like setting vq address put pressure on memory allocator. Also, if we get a single during processing notifier register will fail, disabling optimization permanently. In fact, see below:> --- > drivers/vhost/vhost.c | 19 +++++++++++++++---- > drivers/vhost/vhost.h | 1 + > 2 files changed, 16 insertions(+), 4 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 34c0d970bcbc..058191d5efad 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -630,6 +630,7 @@ void vhost_dev_init(struct vhost_dev *dev, > dev->iov_limit = iov_limit; > dev->weight = weight; > dev->byte_weight = byte_weight; > + dev->has_notifier = false; > init_llist_head(&dev->work_list); > init_waitqueue_head(&dev->wait); > INIT_LIST_HEAD(&dev->read_list); > @@ -731,6 +732,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) > if (err) > goto err_mmu_notifier; > #endif > + dev->has_notifier = true; > > return 0; >I just noticed that set owner now fails if we get a signal. Userspace could retry in theory but it does not: this is userspace abi breakage since it used to only fail on invalid input.> @@ -960,7 +962,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev) > } > if (dev->mm) { > #if VHOST_ARCH_CAN_ACCEL_UACCESS > - mmu_notifier_unregister(&dev->mmu_notifier, dev->mm); > + if (dev->has_notifier) { > + mmu_notifier_unregister(&dev->mmu_notifier, > + dev->mm); > + dev->has_notifier = false; > + } > #endif > mmput(dev->mm); > } > @@ -2065,8 +2071,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, > /* Unregister MMU notifer to allow invalidation callback > * can access vq->uaddrs[] without holding a lock. > */ > - if (d->mm) > + if (d->has_notifier) { > mmu_notifier_unregister(&d->mmu_notifier, d->mm); > + d->has_notifier = false; > + } > > vhost_uninit_vq_maps(vq); > #endif > @@ -2086,8 +2094,11 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, > if (r == 0) > vhost_setup_vq_uaddr(vq); > > - if (d->mm) > - mmu_notifier_register(&d->mmu_notifier, d->mm); > + if (d->mm) { > + r = mmu_notifier_register(&d->mmu_notifier, d->mm); > + if (!r) > + d->has_notifier = true; > + } > #endif > > mutex_unlock(&vq->mutex); > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index 819296332913..a62f56a4cf72 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -214,6 +214,7 @@ struct vhost_dev { > int iov_limit; > int weight; > int byte_weight; > + bool has_notifier; > }; > > bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); > -- > 2.18.1