Hi all: This series try to fix several issues introduced by meta data accelreation series. Please review. Changes from V1: - Try not use RCU to syncrhonize MMU notifier with vhost worker - set dirty pages after no readers - return -EAGAIN only when we find the range is overlapped with metadata Jason Wang (9): vhost: don't set uaddr for invalid address vhost: validate MMU notifier registration vhost: fix vhost map leak vhost: reset invalidate_count in vhost_set_vring_num_addr() vhost: mark dirty pages during map uninit vhost: don't do synchronize_rcu() in vhost_uninit_vq_maps() vhost: do not use RCU to synchronize MMU notifier with worker vhost: correctly set dirty pages in MMU notifiers callback vhost: do not return -EAGIAN for non blocking invalidation too early drivers/vhost/vhost.c | 232 +++++++++++++++++++++++++++--------------- drivers/vhost/vhost.h | 8 +- 2 files changed, 154 insertions(+), 86 deletions(-) -- 2.18.1
Jason Wang
2019-Jul-31 08:46 UTC
[PATCH V2 1/9] vhost: don't set uaddr for invalid address
We should not setup uaddr for the invalid address, otherwise we may try to pin or prefetch mapping of wrong pages. Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 0536f8526359..488380a581dc 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2082,7 +2082,8 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, } #if VHOST_ARCH_CAN_ACCEL_UACCESS - vhost_setup_vq_uaddr(vq); + if (r == 0) + vhost_setup_vq_uaddr(vq); if (d->mm) mmu_notifier_register(&d->mmu_notifier, d->mm); -- 2.18.1
The return value of mmu_notifier_register() is not checked in vhost_vring_set_num_addr(). This will cause an out of sync between mm and MMU notifier thus a double free. To solve this, introduce a boolean flag to track whether MMU notifier is registered and only do unregistering when it was true. Reported-and-tested-by: syzbot+e58112d71f77113ddb7b at syzkaller.appspotmail.com Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 19 +++++++++++++++---- drivers/vhost/vhost.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 488380a581dc..17f6abea192e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -629,6 +629,7 @@ void vhost_dev_init(struct vhost_dev *dev, dev->iov_limit = iov_limit; dev->weight = weight; dev->byte_weight = byte_weight; + dev->has_notifier = false; init_llist_head(&dev->work_list); init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); @@ -730,6 +731,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) if (err) goto err_mmu_notifier; #endif + dev->has_notifier = true; return 0; @@ -959,7 +961,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev) } if (dev->mm) { #if VHOST_ARCH_CAN_ACCEL_UACCESS - mmu_notifier_unregister(&dev->mmu_notifier, dev->mm); + if (dev->has_notifier) { + mmu_notifier_unregister(&dev->mmu_notifier, + dev->mm); + dev->has_notifier = false; + } #endif mmput(dev->mm); } @@ -2064,8 +2070,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, /* Unregister MMU notifer to allow invalidation callback * can access vq->uaddrs[] without holding a lock. */ - if (d->mm) + if (d->has_notifier) { mmu_notifier_unregister(&d->mmu_notifier, d->mm); + d->has_notifier = false; + } vhost_uninit_vq_maps(vq); #endif @@ -2085,8 +2093,11 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, if (r == 0) vhost_setup_vq_uaddr(vq); - if (d->mm) - mmu_notifier_register(&d->mmu_notifier, d->mm); + if (d->mm) { + r = mmu_notifier_register(&d->mmu_notifier, d->mm); + if (!r) + d->has_notifier = true; + } #endif mutex_unlock(&vq->mutex); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 42a8c2a13ab1..a9a2a93857d2 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -214,6 +214,7 @@ struct vhost_dev { int iov_limit; int weight; int byte_weight; + bool has_notifier; }; bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); -- 2.18.1
We don't free map during vhost_map_unprefetch(). This means it could be leaked. Fixing by free the map. Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 17f6abea192e..2a3154976277 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -302,9 +302,7 @@ static void vhost_vq_meta_reset(struct vhost_dev *d) static void vhost_map_unprefetch(struct vhost_map *map) { kfree(map->pages); - map->pages = NULL; - map->npages = 0; - map->addr = NULL; + kfree(map); } static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) -- 2.18.1
Jason Wang
2019-Jul-31 08:46 UTC
[PATCH V2 4/9] vhost: reset invalidate_count in vhost_set_vring_num_addr()
The vhost_set_vring_num_addr() could be called in the middle of invalidate_range_start() and invalidate_range_end(). If we don't reset invalidate_count after the un-registering of MMU notifier, the invalidate_cont will run out of sync (e.g never reach zero). This will in fact disable the fast accessor path. Fixing by reset the count to zero. Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 2a3154976277..2a7217c33668 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2073,6 +2073,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, d->has_notifier = false; } + /* reset invalidate_count in case we are in the middle of + * invalidate_start() and invalidate_end(). + */ + vq->invalidate_count = 0; vhost_uninit_vq_maps(vq); #endif -- 2.18.1
We don't mark dirty pages if the map was teared down outside MMU notifier. This will lead untracked dirty pages. Fixing by marking dirty pages during map uninit. Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 2a7217c33668..c12cdadb0855 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -305,6 +305,18 @@ static void vhost_map_unprefetch(struct vhost_map *map) kfree(map); } +static void vhost_set_map_dirty(struct vhost_virtqueue *vq, + struct vhost_map *map, int index) +{ + struct vhost_uaddr *uaddr = &vq->uaddrs[index]; + int i; + + if (uaddr->write) { + for (i = 0; i < map->npages; i++) + set_page_dirty(map->pages[i]); + } +} + static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) { struct vhost_map *map[VHOST_NUM_ADDRS]; @@ -314,8 +326,10 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) for (i = 0; i < VHOST_NUM_ADDRS; i++) { map[i] = rcu_dereference_protected(vq->maps[i], lockdep_is_held(&vq->mmu_lock)); - if (map[i]) + if (map[i]) { + vhost_set_map_dirty(vq, map[i], i); rcu_assign_pointer(vq->maps[i], NULL); + } } spin_unlock(&vq->mmu_lock); @@ -353,7 +367,6 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, { struct vhost_uaddr *uaddr = &vq->uaddrs[index]; struct vhost_map *map; - int i; if (!vhost_map_range_overlap(uaddr, start, end)) return; @@ -364,10 +377,7 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, map = rcu_dereference_protected(vq->maps[index], lockdep_is_held(&vq->mmu_lock)); if (map) { - if (uaddr->write) { - for (i = 0; i < map->npages; i++) - set_page_dirty(map->pages[i]); - } + vhost_set_map_dirty(vq, map, index); rcu_assign_pointer(vq->maps[index], NULL); } spin_unlock(&vq->mmu_lock); -- 2.18.1
Jason Wang
2019-Jul-31 08:46 UTC
[PATCH V2 6/9] vhost: don't do synchronize_rcu() in vhost_uninit_vq_maps()
There's no need for RCU synchronization in vhost_uninit_vq_maps() since we've already serialized with readers (memory accessors). This also avoid the possible userspace DOS through ioctl() because of the possible high latency caused by synchronize_rcu(). Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c12cdadb0855..cfc11f9ed9c9 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -333,7 +333,9 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) } spin_unlock(&vq->mmu_lock); - synchronize_rcu(); + /* No need for synchronize_rcu() or kfree_rcu() since we are + * serialized with memory accessors (e.g vq mutex held). + */ for (i = 0; i < VHOST_NUM_ADDRS; i++) if (map[i]) -- 2.18.1
Jason Wang
2019-Jul-31 08:46 UTC
[PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
We used to use RCU to synchronize MMU notifier with worker. This leads calling synchronize_rcu() in invalidate_range_start(). But on a busy system, there would be many factors that may slow down the synchronize_rcu() which makes it unsuitable to be called in MMU notifier. A solution is SRCU but its overhead is obvious with the expensive full memory barrier. Another choice is to use seqlock, but it doesn't provide a synchronization method between readers and writers. The last choice is to use vq mutex, but it need to deal with the worst case that MMU notifier must be blocked and wait for the finish of swap in. So this patch switches use a counter to track whether or not the map was used. The counter was increased when vq try to start or finish uses the map. This means, when it was even, we're sure there's no readers and MMU notifier is synchronized. When it was odd, it means there's a reader we need to wait it to be even again then we are synchronized. To avoid full memory barrier, store_release + load_acquire on the counter is used. Consider the read critical section is pretty small the synchronization should be done very fast. Note the patch lead about 3% PPS dropping. Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++---------------- drivers/vhost/vhost.h | 7 +- 2 files changed, 94 insertions(+), 58 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index cfc11f9ed9c9..db2c81cb1e90 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) spin_lock(&vq->mmu_lock); for (i = 0; i < VHOST_NUM_ADDRS; i++) { - map[i] = rcu_dereference_protected(vq->maps[i], - lockdep_is_held(&vq->mmu_lock)); + map[i] = vq->maps[i]; if (map[i]) { vhost_set_map_dirty(vq, map[i], i); - rcu_assign_pointer(vq->maps[i], NULL); + vq->maps[i] = NULL; } } spin_unlock(&vq->mmu_lock); - /* No need for synchronize_rcu() or kfree_rcu() since we are - * serialized with memory accessors (e.g vq mutex held). + /* No need for synchronization since we are serialized with + * memory accessors (e.g vq mutex held). */ for (i = 0; i < VHOST_NUM_ADDRS; i++) @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr, return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size); } +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq) +{ + int ref = READ_ONCE(vq->ref); + + smp_store_release(&vq->ref, ref + 1); + /* Make sure ref counter is visible before accessing the map */ + smp_load_acquire(&vq->ref); +} + +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq) +{ + int ref = READ_ONCE(vq->ref); + + /* Make sure vq access is done before increasing ref counter */ + smp_store_release(&vq->ref, ref + 1); +} + +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq) +{ + int ref; + + /* Make sure map change was done before checking ref counter */ + smp_mb(); + + ref = READ_ONCE(vq->ref); + if (ref & 0x1) { + /* When ref change, we are sure no reader can see + * previous map */ + while (READ_ONCE(vq->ref) == ref) { + set_current_state(TASK_RUNNING); + schedule(); + } + } + /* Make sure ref counter was checked before any other + * operations that was dene on map. */ + smp_mb(); +} + static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, int index, unsigned long start, @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, spin_lock(&vq->mmu_lock); ++vq->invalidate_count; - map = rcu_dereference_protected(vq->maps[index], - lockdep_is_held(&vq->mmu_lock)); + map = vq->maps[index]; if (map) { vhost_set_map_dirty(vq, map, index); - rcu_assign_pointer(vq->maps[index], NULL); + vq->maps[index] = NULL; } spin_unlock(&vq->mmu_lock); if (map) { - synchronize_rcu(); + vhost_vq_sync_access(vq); vhost_map_unprefetch(map); } } @@ -457,7 +493,7 @@ static void vhost_init_maps(struct vhost_dev *dev) for (i = 0; i < dev->nvqs; ++i) { vq = dev->vqs[i]; for (j = 0; j < VHOST_NUM_ADDRS; j++) - RCU_INIT_POINTER(vq->maps[j], NULL); + vq->maps[j] = NULL; } } #endif @@ -655,6 +691,7 @@ void vhost_dev_init(struct vhost_dev *dev, vq->indirect = NULL; vq->heads = NULL; vq->dev = dev; + vq->ref = 0; mutex_init(&vq->mutex); spin_lock_init(&vq->mmu_lock); vhost_vq_reset(dev, vq); @@ -921,7 +958,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq, map->npages = npages; map->pages = pages; - rcu_assign_pointer(vq->maps[index], map); + vq->maps[index] = map; /* No need for a synchronize_rcu(). This function should be * called by dev->worker so we are serialized with all * readers. @@ -1216,18 +1253,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) struct vring_used *used; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); + map = vq->maps[VHOST_ADDR_USED]; if (likely(map)) { used = map->addr; *((__virtio16 *)&used->ring[vq->num]) cpu_to_vhost16(vq, vq->avail_idx); - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1245,18 +1282,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, size_t size; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); + map = vq->maps[VHOST_ADDR_USED]; if (likely(map)) { used = map->addr; size = count * sizeof(*head); memcpy(used->ring + idx, head, size); - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1272,17 +1309,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) struct vring_used *used; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); + map = vq->maps[VHOST_ADDR_USED]; if (likely(map)) { used = map->addr; used->flags = cpu_to_vhost16(vq, vq->used_flags); - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1298,17 +1335,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) struct vring_used *used; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); + map = vq->maps[VHOST_ADDR_USED]; if (likely(map)) { used = map->addr; used->idx = cpu_to_vhost16(vq, vq->last_used_idx); - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1362,17 +1399,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, struct vring_avail *avail; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); + map = vq->maps[VHOST_ADDR_AVAIL]; if (likely(map)) { avail = map->addr; *idx = avail->idx; - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1387,17 +1424,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, struct vring_avail *avail; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); + map = vq->maps[VHOST_ADDR_AVAIL]; if (likely(map)) { avail = map->addr; *head = avail->ring[idx & (vq->num - 1)]; - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1413,17 +1450,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, struct vring_avail *avail; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); + map = vq->maps[VHOST_ADDR_AVAIL]; if (likely(map)) { avail = map->addr; *flags = avail->flags; - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1438,15 +1475,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq, struct vring_avail *avail; if (!vq->iotlb) { - rcu_read_lock(); - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); + vhost_vq_access_map_begin(vq); + map = vq->maps[VHOST_ADDR_AVAIL]; if (likely(map)) { avail = map->addr; *event = (__virtio16)avail->ring[vq->num]; - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1461,17 +1498,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq, struct vring_used *used; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); + map = vq->maps[VHOST_ADDR_USED]; if (likely(map)) { used = map->addr; *idx = used->idx; - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1486,17 +1523,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq, struct vring_desc *d; if (!vq->iotlb) { - rcu_read_lock(); + vhost_vq_access_map_begin(vq); - map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]); + map = vq->maps[VHOST_ADDR_DESC]; if (likely(map)) { d = map->addr; *desc = *(d + idx); - rcu_read_unlock(); + vhost_vq_access_map_end(vq); return 0; } - rcu_read_unlock(); + vhost_vq_access_map_end(vq); } #endif @@ -1843,13 +1880,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq, #if VHOST_ARCH_CAN_ACCEL_UACCESS static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq) { - struct vhost_map __rcu *map; + struct vhost_map *map; int i; for (i = 0; i < VHOST_NUM_ADDRS; i++) { - rcu_read_lock(); - map = rcu_dereference(vq->maps[i]); - rcu_read_unlock(); + map = vq->maps[i]; if (unlikely(!map)) vhost_map_prefetch(vq, i); } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index a9a2a93857d2..f9e9558a529d 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -115,16 +115,17 @@ struct vhost_virtqueue { #if VHOST_ARCH_CAN_ACCEL_UACCESS /* Read by memory accessors, modified by meta data * prefetching, MMU notifier and vring ioctl(). - * Synchonrized through mmu_lock (writers) and RCU (writers - * and readers). + * Synchonrized through mmu_lock (writers) and ref counters, + * see vhost_vq_access_map_begin()/vhost_vq_access_map_end(). */ - struct vhost_map __rcu *maps[VHOST_NUM_ADDRS]; + struct vhost_map *maps[VHOST_NUM_ADDRS]; /* Read by MMU notifier, modified by vring ioctl(), * synchronized through MMU notifier * registering/unregistering. */ struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS]; #endif + int ref; const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; struct file *kick; -- 2.18.1
Jason Wang
2019-Jul-31 08:46 UTC
[PATCH V2 8/9] vhost: correctly set dirty pages in MMU notifiers callback
We need make sure there's no reference on the map before trying to mark set dirty pages. Reported-by: Michael S. Tsirkin <mst at redhat.com> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index db2c81cb1e90..fc2da8a0c671 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -414,14 +414,13 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, ++vq->invalidate_count; map = vq->maps[index]; - if (map) { - vhost_set_map_dirty(vq, map, index); + if (map) vq->maps[index] = NULL; - } spin_unlock(&vq->mmu_lock); if (map) { vhost_vq_sync_access(vq); + vhost_set_map_dirty(vq, map, index); vhost_map_unprefetch(map); } } -- 2.18.1
Jason Wang
2019-Jul-31 08:46 UTC
[PATCH V2 9/9] vhost: do not return -EAGIAN for non blocking invalidation too early
Instead of returning -EAGAIN unconditionally, we'd better do that only we're sure the range is overlapped with the metadata area. Reported-by: Jason Gunthorpe <jgg at ziepe.ca> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index fc2da8a0c671..96c6aeb1871f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -399,16 +399,19 @@ static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq) smp_mb(); } -static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, - int index, - unsigned long start, - unsigned long end) +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq, + int index, + unsigned long start, + unsigned long end, + bool blockable) { struct vhost_uaddr *uaddr = &vq->uaddrs[index]; struct vhost_map *map; if (!vhost_map_range_overlap(uaddr, start, end)) - return; + return 0; + else if (!blockable) + return -EAGAIN; spin_lock(&vq->mmu_lock); ++vq->invalidate_count; @@ -423,6 +426,8 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, vhost_set_map_dirty(vq, map, index); vhost_map_unprefetch(map); } + + return 0; } static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq, @@ -443,18 +448,19 @@ static int vhost_invalidate_range_start(struct mmu_notifier *mn, { struct vhost_dev *dev = container_of(mn, struct vhost_dev, mmu_notifier); - int i, j; - - if (!mmu_notifier_range_blockable(range)) - return -EAGAIN; + bool blockable = mmu_notifier_range_blockable(range); + int i, j, ret; for (i = 0; i < dev->nvqs; i++) { struct vhost_virtqueue *vq = dev->vqs[i]; - for (j = 0; j < VHOST_NUM_ADDRS; j++) - vhost_invalidate_vq_start(vq, j, - range->start, - range->end); + for (j = 0; j < VHOST_NUM_ADDRS; j++) { + ret = vhost_invalidate_vq_start(vq, j, + range->start, + range->end, blockable); + if (ret) + return ret; + } } return 0; -- 2.18.1
Jason Wang
2019-Jul-31 08:50 UTC
[PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
On 2019/7/31 ??4:46, Jason Wang wrote:> We used to use RCU to synchronize MMU notifier with worker. This leads > calling synchronize_rcu() in invalidate_range_start(). But on a busy > system, there would be many factors that may slow down the > synchronize_rcu() which makes it unsuitable to be called in MMU > notifier. > > A solution is SRCU but its overhead is obvious with the expensive full > memory barrier. Another choice is to use seqlock, but it doesn't > provide a synchronization method between readers and writers. The last > choice is to use vq mutex, but it need to deal with the worst case > that MMU notifier must be blocked and wait for the finish of swap in. > > So this patch switches use a counter to track whether or not the map > was used. The counter was increased when vq try to start or finish > uses the map. This means, when it was even, we're sure there's no > readers and MMU notifier is synchronized. When it was odd, it means > there's a reader we need to wait it to be even again then we are > synchronized. To avoid full memory barrier, store_release + > load_acquire on the counter is used.For reviewers, I try hard to avoid e.g smp_mb(), please double check whether or not this trick work. Thanks> > Consider the read critical section is pretty small the synchronization > should be done very fast. > > Note the patch lead about 3% PPS dropping. > > Reported-by: Michael S. Tsirkin <mst at redhat.com> > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++---------------- > drivers/vhost/vhost.h | 7 +- > 2 files changed, 94 insertions(+), 58 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index cfc11f9ed9c9..db2c81cb1e90 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) > > spin_lock(&vq->mmu_lock); > for (i = 0; i < VHOST_NUM_ADDRS; i++) { > - map[i] = rcu_dereference_protected(vq->maps[i], > - lockdep_is_held(&vq->mmu_lock)); > + map[i] = vq->maps[i]; > if (map[i]) { > vhost_set_map_dirty(vq, map[i], i); > - rcu_assign_pointer(vq->maps[i], NULL); > + vq->maps[i] = NULL; > } > } > spin_unlock(&vq->mmu_lock); > > - /* No need for synchronize_rcu() or kfree_rcu() since we are > - * serialized with memory accessors (e.g vq mutex held). > + /* No need for synchronization since we are serialized with > + * memory accessors (e.g vq mutex held). > */ > > for (i = 0; i < VHOST_NUM_ADDRS; i++) > @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr, > return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size); > } > > +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq) > +{ > + int ref = READ_ONCE(vq->ref); > + > + smp_store_release(&vq->ref, ref + 1); > + /* Make sure ref counter is visible before accessing the map */ > + smp_load_acquire(&vq->ref); > +} > + > +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq) > +{ > + int ref = READ_ONCE(vq->ref); > + > + /* Make sure vq access is done before increasing ref counter */ > + smp_store_release(&vq->ref, ref + 1); > +} > + > +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq) > +{ > + int ref; > + > + /* Make sure map change was done before checking ref counter */ > + smp_mb(); > + > + ref = READ_ONCE(vq->ref); > + if (ref & 0x1) { > + /* When ref change, we are sure no reader can see > + * previous map */ > + while (READ_ONCE(vq->ref) == ref) { > + set_current_state(TASK_RUNNING); > + schedule(); > + } > + } > + /* Make sure ref counter was checked before any other > + * operations that was dene on map. */ > + smp_mb(); > +} > + > static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > int index, > unsigned long start, > @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > spin_lock(&vq->mmu_lock); > ++vq->invalidate_count; > > - map = rcu_dereference_protected(vq->maps[index], > - lockdep_is_held(&vq->mmu_lock)); > + map = vq->maps[index]; > if (map) { > vhost_set_map_dirty(vq, map, index); > - rcu_assign_pointer(vq->maps[index], NULL); > + vq->maps[index] = NULL; > } > spin_unlock(&vq->mmu_lock); > > if (map) { > - synchronize_rcu(); > + vhost_vq_sync_access(vq); > vhost_map_unprefetch(map); > } > } > @@ -457,7 +493,7 @@ static void vhost_init_maps(struct vhost_dev *dev) > for (i = 0; i < dev->nvqs; ++i) { > vq = dev->vqs[i]; > for (j = 0; j < VHOST_NUM_ADDRS; j++) > - RCU_INIT_POINTER(vq->maps[j], NULL); > + vq->maps[j] = NULL; > } > } > #endif > @@ -655,6 +691,7 @@ void vhost_dev_init(struct vhost_dev *dev, > vq->indirect = NULL; > vq->heads = NULL; > vq->dev = dev; > + vq->ref = 0; > mutex_init(&vq->mutex); > spin_lock_init(&vq->mmu_lock); > vhost_vq_reset(dev, vq); > @@ -921,7 +958,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq, > map->npages = npages; > map->pages = pages; > > - rcu_assign_pointer(vq->maps[index], map); > + vq->maps[index] = map; > /* No need for a synchronize_rcu(). This function should be > * called by dev->worker so we are serialized with all > * readers. > @@ -1216,18 +1253,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) > struct vring_used *used; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > *((__virtio16 *)&used->ring[vq->num]) > cpu_to_vhost16(vq, vq->avail_idx); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1245,18 +1282,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, > size_t size; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > size = count * sizeof(*head); > memcpy(used->ring + idx, head, size); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1272,17 +1309,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) > struct vring_used *used; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > used->flags = cpu_to_vhost16(vq, vq->used_flags); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1298,17 +1335,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) > struct vring_used *used; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > used->idx = cpu_to_vhost16(vq, vq->last_used_idx); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1362,17 +1399,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, > struct vring_avail *avail; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); > + map = vq->maps[VHOST_ADDR_AVAIL]; > if (likely(map)) { > avail = map->addr; > *idx = avail->idx; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1387,17 +1424,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, > struct vring_avail *avail; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); > + map = vq->maps[VHOST_ADDR_AVAIL]; > if (likely(map)) { > avail = map->addr; > *head = avail->ring[idx & (vq->num - 1)]; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1413,17 +1450,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, > struct vring_avail *avail; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); > + map = vq->maps[VHOST_ADDR_AVAIL]; > if (likely(map)) { > avail = map->addr; > *flags = avail->flags; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1438,15 +1475,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq, > struct vring_avail *avail; > > if (!vq->iotlb) { > - rcu_read_lock(); > - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); > + vhost_vq_access_map_begin(vq); > + map = vq->maps[VHOST_ADDR_AVAIL]; > if (likely(map)) { > avail = map->addr; > *event = (__virtio16)avail->ring[vq->num]; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1461,17 +1498,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq, > struct vring_used *used; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > *idx = used->idx; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1486,17 +1523,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq, > struct vring_desc *d; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]); > + map = vq->maps[VHOST_ADDR_DESC]; > if (likely(map)) { > d = map->addr; > *desc = *(d + idx); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1843,13 +1880,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq, > #if VHOST_ARCH_CAN_ACCEL_UACCESS > static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq) > { > - struct vhost_map __rcu *map; > + struct vhost_map *map; > int i; > > for (i = 0; i < VHOST_NUM_ADDRS; i++) { > - rcu_read_lock(); > - map = rcu_dereference(vq->maps[i]); > - rcu_read_unlock(); > + map = vq->maps[i]; > if (unlikely(!map)) > vhost_map_prefetch(vq, i); > } > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index a9a2a93857d2..f9e9558a529d 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -115,16 +115,17 @@ struct vhost_virtqueue { > #if VHOST_ARCH_CAN_ACCEL_UACCESS > /* Read by memory accessors, modified by meta data > * prefetching, MMU notifier and vring ioctl(). > - * Synchonrized through mmu_lock (writers) and RCU (writers > - * and readers). > + * Synchonrized through mmu_lock (writers) and ref counters, > + * see vhost_vq_access_map_begin()/vhost_vq_access_map_end(). > */ > - struct vhost_map __rcu *maps[VHOST_NUM_ADDRS]; > + struct vhost_map *maps[VHOST_NUM_ADDRS]; > /* Read by MMU notifier, modified by vring ioctl(), > * synchronized through MMU notifier > * registering/unregistering. > */ > struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS]; > #endif > + int ref; > const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; > > struct file *kick;
Stefano Garzarella
2019-Jul-31 09:59 UTC
[PATCH V2 9/9] vhost: do not return -EAGIAN for non blocking invalidation too early
A little typo in the title: s/EAGIAN/EAGAIN Thanks, Stefano On Wed, Jul 31, 2019 at 04:46:55AM -0400, Jason Wang wrote:> Instead of returning -EAGAIN unconditionally, we'd better do that only > we're sure the range is overlapped with the metadata area. > > Reported-by: Jason Gunthorpe <jgg at ziepe.ca> > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/vhost.c | 32 +++++++++++++++++++------------- > 1 file changed, 19 insertions(+), 13 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index fc2da8a0c671..96c6aeb1871f 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -399,16 +399,19 @@ static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq) > smp_mb(); > } > > -static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > - int index, > - unsigned long start, > - unsigned long end) > +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > + int index, > + unsigned long start, > + unsigned long end, > + bool blockable) > { > struct vhost_uaddr *uaddr = &vq->uaddrs[index]; > struct vhost_map *map; > > if (!vhost_map_range_overlap(uaddr, start, end)) > - return; > + return 0; > + else if (!blockable) > + return -EAGAIN; > > spin_lock(&vq->mmu_lock); > ++vq->invalidate_count; > @@ -423,6 +426,8 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > vhost_set_map_dirty(vq, map, index); > vhost_map_unprefetch(map); > } > + > + return 0; > } > > static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq, > @@ -443,18 +448,19 @@ static int vhost_invalidate_range_start(struct mmu_notifier *mn, > { > struct vhost_dev *dev = container_of(mn, struct vhost_dev, > mmu_notifier); > - int i, j; > - > - if (!mmu_notifier_range_blockable(range)) > - return -EAGAIN; > + bool blockable = mmu_notifier_range_blockable(range); > + int i, j, ret; > > for (i = 0; i < dev->nvqs; i++) { > struct vhost_virtqueue *vq = dev->vqs[i]; > > - for (j = 0; j < VHOST_NUM_ADDRS; j++) > - vhost_invalidate_vq_start(vq, j, > - range->start, > - range->end); > + for (j = 0; j < VHOST_NUM_ADDRS; j++) { > + ret = vhost_invalidate_vq_start(vq, j, > + range->start, > + range->end, blockable); > + if (ret) > + return ret; > + } > } > > return 0; > -- > 2.18.1 >--
Jason Gunthorpe
2019-Jul-31 12:39 UTC
[PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:> We used to use RCU to synchronize MMU notifier with worker. This leads > calling synchronize_rcu() in invalidate_range_start(). But on a busy > system, there would be many factors that may slow down the > synchronize_rcu() which makes it unsuitable to be called in MMU > notifier. > > A solution is SRCU but its overhead is obvious with the expensive full > memory barrier. Another choice is to use seqlock, but it doesn't > provide a synchronization method between readers and writers. The last > choice is to use vq mutex, but it need to deal with the worst case > that MMU notifier must be blocked and wait for the finish of swap in. > > So this patch switches use a counter to track whether or not the map > was used. The counter was increased when vq try to start or finish > uses the map. This means, when it was even, we're sure there's no > readers and MMU notifier is synchronized. When it was odd, it means > there's a reader we need to wait it to be even again then we are > synchronized.You just described a seqlock. We've been talking about providing this as some core service from mmu notifiers because nearly every use of this API needs it. IMHO this gets the whole thing backwards, the common pattern is to protect the 'shadow pte' data with a seqlock (usually open coded), such that the mmu notififer side has the write side of that lock and the read side is consumed by the thread accessing or updating the SPTE.> Reported-by: Michael S. Tsirkin <mst at redhat.com> > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com> > drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++---------------- > drivers/vhost/vhost.h | 7 +- > 2 files changed, 94 insertions(+), 58 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index cfc11f9ed9c9..db2c81cb1e90 100644 > +++ b/drivers/vhost/vhost.c > @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) > > spin_lock(&vq->mmu_lock); > for (i = 0; i < VHOST_NUM_ADDRS; i++) { > - map[i] = rcu_dereference_protected(vq->maps[i], > - lockdep_is_held(&vq->mmu_lock)); > + map[i] = vq->maps[i]; > if (map[i]) { > vhost_set_map_dirty(vq, map[i], i); > - rcu_assign_pointer(vq->maps[i], NULL); > + vq->maps[i] = NULL; > } > } > spin_unlock(&vq->mmu_lock); > > - /* No need for synchronize_rcu() or kfree_rcu() since we are > - * serialized with memory accessors (e.g vq mutex held). > + /* No need for synchronization since we are serialized with > + * memory accessors (e.g vq mutex held). > */ > > for (i = 0; i < VHOST_NUM_ADDRS; i++) > @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr, > return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size); > } > > +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq) > +{ > + int ref = READ_ONCE(vq->ref);Is a lock/single threaded supposed to be held for this?> + > + smp_store_release(&vq->ref, ref + 1); > + /* Make sure ref counter is visible before accessing the map */ > + smp_load_acquire(&vq->ref);release/acquire semantics are intended to protect blocks of related data, so reading something with acquire and throwing away the result is nonsense.> +} > + > +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq) > +{ > + int ref = READ_ONCE(vq->ref);If the write to vq->ref is not locked this algorithm won't work, if it is locked the READ_ONCE is not needed.> + /* Make sure vq access is done before increasing ref counter */ > + smp_store_release(&vq->ref, ref + 1); > +} > + > +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq) > +{ > + int ref; > + > + /* Make sure map change was done before checking ref counter */ > + smp_mb();This is probably smp_rmb after reading ref, and if you are setting ref with smp_store_release then this should be smp_load_acquire() without an explicit mb.> + ref = READ_ONCE(vq->ref); > + if (ref & 0x1) { > + /* When ref change, we are sure no reader can see > + * previous map */ > + while (READ_ONCE(vq->ref) == ref) { > + set_current_state(TASK_RUNNING); > + schedule(); > + } > + }This is basically read_seqcount_begin()' with a schedule instead of cpu_relax> + /* Make sure ref counter was checked before any other > + * operations that was dene on map. */ > + smp_mb();should be in a smp_load_acquire()> +} > + > static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > int index, > unsigned long start, > @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > spin_lock(&vq->mmu_lock); > ++vq->invalidate_count; > > - map = rcu_dereference_protected(vq->maps[index], > - lockdep_is_held(&vq->mmu_lock)); > + map = vq->maps[index]; > if (map) { > vhost_set_map_dirty(vq, map, index); > - rcu_assign_pointer(vq->maps[index], NULL); > + vq->maps[index] = NULL; > } > spin_unlock(&vq->mmu_lock); > > if (map) { > - synchronize_rcu(); > + vhost_vq_sync_access(vq);What prevents racing with vhost_vq_access_map_end here?> vhost_map_unprefetch(map); > } > }Overall I don't like it. We are trying to get rid of these botique mmu notifier patterns in drivers. Jason
Jason Gunthorpe
2019-Jul-31 12:41 UTC
[PATCH V2 4/9] vhost: reset invalidate_count in vhost_set_vring_num_addr()
On Wed, Jul 31, 2019 at 04:46:50AM -0400, Jason Wang wrote:> The vhost_set_vring_num_addr() could be called in the middle of > invalidate_range_start() and invalidate_range_end(). If we don't reset > invalidate_count after the un-registering of MMU notifier, the > invalidate_cont will run out of sync (e.g never reach zero). This will > in fact disable the fast accessor path. Fixing by reset the count to > zero. > > Reported-by: Michael S. Tsirkin <mst at redhat.com>Did Michael report this as well?> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com> > drivers/vhost/vhost.c | 4 ++++ > 1 file changed, 4 insertions(+) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 2a3154976277..2a7217c33668 100644 > +++ b/drivers/vhost/vhost.c > @@ -2073,6 +2073,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d, > d->has_notifier = false; > } > > + /* reset invalidate_count in case we are in the middle of > + * invalidate_start() and invalidate_end(). > + */ > + vq->invalidate_count = 0; > vhost_uninit_vq_maps(vq); > #endif >
Michael S. Tsirkin
2019-Jul-31 18:29 UTC
[PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:> We used to use RCU to synchronize MMU notifier with worker. This leads > calling synchronize_rcu() in invalidate_range_start(). But on a busy > system, there would be many factors that may slow down the > synchronize_rcu() which makes it unsuitable to be called in MMU > notifier. > > A solution is SRCU but its overhead is obvious with the expensive full > memory barrier. Another choice is to use seqlock, but it doesn't > provide a synchronization method between readers and writers. The last > choice is to use vq mutex, but it need to deal with the worst case > that MMU notifier must be blocked and wait for the finish of swap in. > > So this patch switches use a counter to track whether or not the map > was used. The counter was increased when vq try to start or finish > uses the map. This means, when it was even, we're sure there's no > readers and MMU notifier is synchronized. When it was odd, it means > there's a reader we need to wait it to be even again then we are > synchronized. To avoid full memory barrier, store_release + > load_acquire on the counter is used.Unfortunately this needs a lot of review and testing, so this can't make rc2, and I don't think this is the kind of patch I can merge after rc3. Subtle memory barrier tricks like this can introduce new bugs while they are fixing old ones.> > Consider the read critical section is pretty small the synchronization > should be done very fast. > > Note the patch lead about 3% PPS dropping.Sorry what do you mean by this last sentence? This degrades performance compared to what?> > Reported-by: Michael S. Tsirkin <mst at redhat.com> > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address") > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++---------------- > drivers/vhost/vhost.h | 7 +- > 2 files changed, 94 insertions(+), 58 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index cfc11f9ed9c9..db2c81cb1e90 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq) > > spin_lock(&vq->mmu_lock); > for (i = 0; i < VHOST_NUM_ADDRS; i++) { > - map[i] = rcu_dereference_protected(vq->maps[i], > - lockdep_is_held(&vq->mmu_lock)); > + map[i] = vq->maps[i]; > if (map[i]) { > vhost_set_map_dirty(vq, map[i], i); > - rcu_assign_pointer(vq->maps[i], NULL); > + vq->maps[i] = NULL; > } > } > spin_unlock(&vq->mmu_lock); > > - /* No need for synchronize_rcu() or kfree_rcu() since we are > - * serialized with memory accessors (e.g vq mutex held). > + /* No need for synchronization since we are serialized with > + * memory accessors (e.g vq mutex held). > */ > > for (i = 0; i < VHOST_NUM_ADDRS; i++) > @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr, > return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size); > } > > +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq) > +{ > + int ref = READ_ONCE(vq->ref); > + > + smp_store_release(&vq->ref, ref + 1); > + /* Make sure ref counter is visible before accessing the map */ > + smp_load_acquire(&vq->ref);The map access is after this sequence, correct? Just going by the rules in Documentation/memory-barriers.txt, I think that this pair will not order following accesses with ref store. Documentation/memory-barriers.txt says: + In addition, a RELEASE+ACQUIRE + pair is -not- guaranteed to act as a full memory barrier. The guarantee that is made is this: after an ACQUIRE on a given variable, all memory accesses preceding any prior RELEASE on that same variable are guaranteed to be visible. And if we also had the reverse rule we'd end up with a full barrier, won't we? Cc Paul in case I missed something here. And if I'm right, maybe we should call this out, adding "The opposite is not true: a prior RELEASE is not guaranteed to be visible before memory accesses following the subsequent ACQUIRE".> +} > + > +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq) > +{ > + int ref = READ_ONCE(vq->ref); > + > + /* Make sure vq access is done before increasing ref counter */ > + smp_store_release(&vq->ref, ref + 1); > +} > + > +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq) > +{ > + int ref; > + > + /* Make sure map change was done before checking ref counter */ > + smp_mb(); > + > + ref = READ_ONCE(vq->ref); > + if (ref & 0x1) {Please document the even/odd trick here too, not just in the commit log.> + /* When ref change,changes> we are sure no reader can see > + * previous map */ > + while (READ_ONCE(vq->ref) == ref) {what is the below line in aid of?> + set_current_state(TASK_RUNNING); > + schedule();if (need_resched()) schedule(); ?> + }On an interruptible kernel, there's a risk here is that a task got preempted with an odd ref. So I suspect we'll have to disable preemption when we make ref odd.> + } > + /* Make sure ref counter was checked before any other > + * operations that was dene on map. */was dene -> were done?> + smp_mb(); > +} > + > static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > int index, > unsigned long start, > @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq, > spin_lock(&vq->mmu_lock); > ++vq->invalidate_count; > > - map = rcu_dereference_protected(vq->maps[index], > - lockdep_is_held(&vq->mmu_lock)); > + map = vq->maps[index]; > if (map) { > vhost_set_map_dirty(vq, map, index); > - rcu_assign_pointer(vq->maps[index], NULL); > + vq->maps[index] = NULL; > } > spin_unlock(&vq->mmu_lock); > > if (map) { > - synchronize_rcu(); > + vhost_vq_sync_access(vq); > vhost_map_unprefetch(map); > } > } > @@ -457,7 +493,7 @@ static void vhost_init_maps(struct vhost_dev *dev) > for (i = 0; i < dev->nvqs; ++i) { > vq = dev->vqs[i]; > for (j = 0; j < VHOST_NUM_ADDRS; j++) > - RCU_INIT_POINTER(vq->maps[j], NULL); > + vq->maps[j] = NULL; > } > } > #endif > @@ -655,6 +691,7 @@ void vhost_dev_init(struct vhost_dev *dev, > vq->indirect = NULL; > vq->heads = NULL; > vq->dev = dev; > + vq->ref = 0; > mutex_init(&vq->mutex); > spin_lock_init(&vq->mmu_lock); > vhost_vq_reset(dev, vq); > @@ -921,7 +958,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq, > map->npages = npages; > map->pages = pages; > > - rcu_assign_pointer(vq->maps[index], map); > + vq->maps[index] = map; > /* No need for a synchronize_rcu(). This function should be > * called by dev->worker so we are serialized with all > * readers. > @@ -1216,18 +1253,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) > struct vring_used *used; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > *((__virtio16 *)&used->ring[vq->num]) > cpu_to_vhost16(vq, vq->avail_idx); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1245,18 +1282,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq, > size_t size; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > size = count * sizeof(*head); > memcpy(used->ring + idx, head, size); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1272,17 +1309,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) > struct vring_used *used; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > used->flags = cpu_to_vhost16(vq, vq->used_flags); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1298,17 +1335,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) > struct vring_used *used; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > used->idx = cpu_to_vhost16(vq, vq->last_used_idx); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1362,17 +1399,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, > struct vring_avail *avail; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); > + map = vq->maps[VHOST_ADDR_AVAIL]; > if (likely(map)) { > avail = map->addr; > *idx = avail->idx; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1387,17 +1424,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, > struct vring_avail *avail; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); > + map = vq->maps[VHOST_ADDR_AVAIL]; > if (likely(map)) { > avail = map->addr; > *head = avail->ring[idx & (vq->num - 1)]; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1413,17 +1450,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, > struct vring_avail *avail; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); > + map = vq->maps[VHOST_ADDR_AVAIL]; > if (likely(map)) { > avail = map->addr; > *flags = avail->flags; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1438,15 +1475,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq, > struct vring_avail *avail; > > if (!vq->iotlb) { > - rcu_read_lock(); > - map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]); > + vhost_vq_access_map_begin(vq); > + map = vq->maps[VHOST_ADDR_AVAIL]; > if (likely(map)) { > avail = map->addr; > *event = (__virtio16)avail->ring[vq->num]; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1461,17 +1498,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq, > struct vring_used *used; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_USED]); > + map = vq->maps[VHOST_ADDR_USED]; > if (likely(map)) { > used = map->addr; > *idx = used->idx; > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1486,17 +1523,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq, > struct vring_desc *d; > > if (!vq->iotlb) { > - rcu_read_lock(); > + vhost_vq_access_map_begin(vq); > > - map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]); > + map = vq->maps[VHOST_ADDR_DESC]; > if (likely(map)) { > d = map->addr; > *desc = *(d + idx); > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > return 0; > } > > - rcu_read_unlock(); > + vhost_vq_access_map_end(vq); > } > #endif > > @@ -1843,13 +1880,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq, > #if VHOST_ARCH_CAN_ACCEL_UACCESS > static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq) > { > - struct vhost_map __rcu *map; > + struct vhost_map *map; > int i; > > for (i = 0; i < VHOST_NUM_ADDRS; i++) { > - rcu_read_lock(); > - map = rcu_dereference(vq->maps[i]); > - rcu_read_unlock(); > + map = vq->maps[i]; > if (unlikely(!map)) > vhost_map_prefetch(vq, i); > } > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index a9a2a93857d2..f9e9558a529d 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -115,16 +115,17 @@ struct vhost_virtqueue { > #if VHOST_ARCH_CAN_ACCEL_UACCESS > /* Read by memory accessors, modified by meta data > * prefetching, MMU notifier and vring ioctl(). > - * Synchonrized through mmu_lock (writers) and RCU (writers > - * and readers). > + * Synchonrized through mmu_lock (writers) and ref counters, > + * see vhost_vq_access_map_begin()/vhost_vq_access_map_end(). > */ > - struct vhost_map __rcu *maps[VHOST_NUM_ADDRS]; > + struct vhost_map *maps[VHOST_NUM_ADDRS]; > /* Read by MMU notifier, modified by vring ioctl(), > * synchronized through MMU notifier > * registering/unregistering. > */ > struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS]; > #endif > + int ref;Is it important that this is signed? If not I'd do unsigned here: even though kernel does compile with 2s complement sign overflow, it seems cleaner not to depend on that.> const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; > > struct file *kick; > -- > 2.18.1
Reasonably Related Threads
- [PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
- [PATCH V4 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
- [PATCH V4 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
- [PATCH V4 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
- [PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker