Jason Wang
2021-Mar-16 08:15 UTC
[RFC v2 11/13] vhost: Shadow virtqueue buffers forwarding
On 2021/3/16 3:48 PM, Eugenio Pérez wrote:
> Initial version of shadow virtqueue that actually forward buffers.
>
> It reuses the VirtQueue code for the device part. The driver part is
> based on Linux's virtio_ring driver, but with stripped functionality
> and optimizations so it's easier to review.
>
> These will be added in later commits.
>
> Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
> ---
>  hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++--
>  hw/virtio/vhost.c                  | 113 ++++++++++++++-
>  2 files changed, 312 insertions(+), 13 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index 1460d1d5d1..68ed0f2740 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -9,6 +9,7 @@
>
>  #include "hw/virtio/vhost-shadow-virtqueue.h"
>  #include "hw/virtio/vhost.h"
> +#include "hw/virtio/virtio-access.h"
>
>  #include "standard-headers/linux/vhost_types.h"
>
> @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue {
>      /* Virtio device */
>      VirtIODevice *vdev;
>
> +    /* Map for returning guest's descriptors */
> +    VirtQueueElement **ring_id_maps;
> +
> +    /* Next head to expose to device */
> +    uint16_t avail_idx_shadow;
> +
> +    /* Next free descriptor */
> +    uint16_t free_head;
> +
> +    /* Last seen used idx */
> +    uint16_t shadow_used_idx;
> +
> +    /* Next head to consume from device */
> +    uint16_t used_idx;
> +
>      /* Descriptors copied from guest */
>      vring_desc_t descs[];
>  } VhostShadowVirtqueue;
>
> -/* Forward guest notifications */
> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> +                                    const struct iovec *iovec,
> +                                    size_t num, bool more_descs, bool write)
> +{
> +    uint16_t i = svq->free_head, last = svq->free_head;
> +    unsigned n;
> +    uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0;
> +    vring_desc_t *descs = svq->vring.desc;
> +
> +    if (num == 0) {
> +        return;
> +    }
> +
> +    for (n = 0; n < num; n++) {
> +        if (more_descs || (n + 1 < num)) {
> +            descs[i].flags = flags | virtio_tswap16(svq->vdev,
> +                                                    VRING_DESC_F_NEXT);
> +        } else {
> +            descs[i].flags = flags;
> +        }
> +        descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base);


So using virtio_tswap() is probably not correct, since we're talking to
vhost backends which have their own endianness. For vhost-vDPA, we can
assume that it's a 1.0 device.
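Something like the following, maybe (just a sketch on top of this patch,
assuming the backend negotiated VIRTIO_F_VERSION_1 so the ring layout is
always little-endian; cpu_to_le*()/le16_to_cpu() are the helpers from
"qemu/bswap.h"):

/*
 * Sketch only: fixed little-endian accessors instead of guest-endian
 * virtio_tswap*(), valid under the assumption that the vhost backend
 * is a VIRTIO 1.0 device.
 */
static void vhost_vring_write_descs_le(VhostShadowVirtqueue *svq,
                                       const struct iovec *iovec,
                                       size_t num, bool more_descs, bool write)
{
    uint16_t i = svq->free_head, last = svq->free_head;
    unsigned n;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    vring_desc_t *descs = svq->vring.desc;

    if (num == 0) {
        return;
    }

    for (n = 0; n < num; n++) {
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
        } else {
            descs[i].flags = flags;
        }
        /* Addresses and lengths are stored little-endian in the ring */
        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);

        last = i;
        i = le16_to_cpu(descs[i].next);
    }

    svq->free_head = le16_to_cpu(descs[last].next);
}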
> +        descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len);
> +
> +        last = i;
> +        i = virtio_tswap16(svq->vdev, descs[i].next);
> +    }
> +
> +    svq->free_head = virtio_tswap16(svq->vdev, descs[last].next);
> +}
> +
> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq,
> +                                          VirtQueueElement *elem)
> +{
> +    int head;
> +    unsigned avail_idx;
> +    vring_avail_t *avail = svq->vring.avail;
> +
> +    head = svq->free_head;
> +
> +    /* We need some descriptors here */
> +    assert(elem->out_num || elem->in_num);
> +
> +    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> +                            elem->in_num > 0, false);
> +    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> +
> +    /*
> +     * Put entry in available array (but don't update avail->idx until they
> +     * do sync).
> +     */
> +    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> +    avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head);
> +    svq->avail_idx_shadow++;
> +
> +    /* Expose descriptors to device */
> +    smp_wmb();
> +    avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow);
> +
> +    return head;
> +
> +}
> +
> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq,
> +                                VirtQueueElement *elem)
> +{
> +    unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem);
> +
> +    svq->ring_id_maps[qemu_head] = elem;
> +}
> +
> +/* Handle guest->device notifications */
>  static void vhost_handle_guest_kick(EventNotifier *n)
>  {
>      VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n)
>          return;
>      }
>
> -    event_notifier_set(&svq->kick_notifier);
> +    /* Make available as many buffers as possible */
> +    do {
> +        if (virtio_queue_get_notification(svq->vq)) {
> +            /* No more notifications until process all available */
> +            virtio_queue_set_notification(svq->vq, false);
> +        }
> +
> +        while (true) {
> +            VirtQueueElement *elem;
> +            if (virtio_queue_full(svq->vq)) {
> +                break;


So we've disabled guest notification. If a buffer has been consumed, we
need to retry the handle_guest_kick here. But I didn't find the code?
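E.g. I'd expect a hypothetical helper along these lines (just a sketch;
the function name is made up, it simply repeats the add loop of
vhost_handle_guest_kick() using the helpers introduced in this patch):

/*
 * Sketch: re-run the avail-making loop once the device has returned some
 * descriptors, so guest buffers that could not be added while the shadow
 * ring was full get another chance.
 */
static void vhost_shadow_vq_retry_avail(VhostShadowVirtqueue *svq)
{
    while (!virtio_queue_full(svq->vq)) {
        VirtQueueElement *elem = virtqueue_pop(svq->vq, sizeof(*elem));
        if (!elem) {
            break;
        }

        vhost_shadow_vq_add(svq, elem);
        event_notifier_set(&svq->kick_notifier);
    }
}

Then the used buffer handling could call it after virtqueue_flush() when
the guest's VirtQueue is still not empty.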
> +            }
> +
> +            elem = virtqueue_pop(svq->vq, sizeof(*elem));
> +            if (!elem) {
> +                break;
> +            }
> +
> +            vhost_shadow_vq_add(svq, elem);
> +            event_notifier_set(&svq->kick_notifier);
> +        }
> +
> +        virtio_queue_set_notification(svq->vq, true);
> +    } while (!virtio_queue_empty(svq->vq));
> +}
> +
> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq)
> +{
> +    if (svq->used_idx != svq->shadow_used_idx) {
> +        return true;
> +    }
> +
> +    /* Get used idx must not be reordered */
> +    smp_rmb();
> +    svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx);
> +
> +    return svq->used_idx != svq->shadow_used_idx;
> +}
> +
> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq)
> +{
> +    vring_desc_t *descs = svq->vring.desc;
> +    const vring_used_t *used = svq->vring.used;
> +    vring_used_elem_t used_elem;
> +    uint16_t last_used;
> +
> +    if (!vhost_shadow_vq_more_used(svq)) {
> +        return NULL;
> +    }
> +
> +    last_used = svq->used_idx & (svq->vring.num - 1);
> +    used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id);
> +    used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len);
> +
> +    if (unlikely(used_elem.id >= svq->vring.num)) {
> +        error_report("Device %s says index %u is available", svq->vdev->name,
> +                     used_elem.id);
> +        return NULL;
> +    }
> +
> +    descs[used_elem.id].next = svq->free_head;
> +    svq->free_head = used_elem.id;
> +
> +    svq->used_idx++;
> +    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> +    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
>  }
>
>  /* Forward vhost notifications */
> @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
>      VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>                                               call_notifier);
>      EventNotifier *masked_notifier;
> +    VirtQueue *vq = svq->vq;
>
>      /* Signal start of using masked notifier */
>      qemu_event_reset(&svq->masked_notifier.is_free);
> @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
>          qemu_event_set(&svq->masked_notifier.is_free);
>      }
>
> -    if (!masked_notifier) {
> -        unsigned n = virtio_get_queue_index(svq->vq);
> -        virtio_queue_invalidate_signalled_used(svq->vdev, n);
> -        virtio_notify_irqfd(svq->vdev, svq->vq);
> -    } else if (!svq->masked_notifier.signaled) {
> -        svq->masked_notifier.signaled = true;
> -        event_notifier_set(svq->masked_notifier.n);
> -    }
> +    /* Make as many buffers as possible used. */
> +    do {
> +        unsigned i = 0;
> +
> +        /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */
> +        while (true) {
> +            g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq);
> +            if (!elem) {
> +                break;
> +            }
> +
> +            assert(i < svq->vring.num);
> +            virtqueue_fill(vq, elem, elem->len, i++);
> +        }
> +
> +        virtqueue_flush(vq, i);
> +        if (!masked_notifier) {
> +            virtio_notify_irqfd(svq->vdev, svq->vq);
> +        } else if (!svq->masked_notifier.signaled) {
> +            svq->masked_notifier.signaled = true;
> +            event_notifier_set(svq->masked_notifier.n);
> +        }
> +    } while (vhost_shadow_vq_more_used(svq));
>
>      if (masked_notifier) {
>          /* Signal not using it anymore */
> @@ -103,7 +270,6 @@
>
>  static void vhost_shadow_vq_handle_call(EventNotifier *n)
>  {
> -
>      if (likely(event_notifier_test_and_clear(n))) {
>          vhost_shadow_vq_handle_call_no_test(n);
>      }
> @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev,
>                            unsigned idx,
>                            VhostShadowVirtqueue *svq)
>  {
> +    int i;
>      int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq);
> +
> +    assert(!dev->shadow_vqs_enabled);
> +
>      if (unlikely(r < 0)) {
>          error_report("Couldn't restore vq kick fd: %s", strerror(-r));
>      }
> @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev,
>      /* Restore vhost call */
>      vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx,
>                           dev->vqs[idx].notifier_is_masked);
> +
> +
> +    for (i = 0; i < svq->vring.num; ++i) {
> +        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> +        /*
> +         * Although the doc says we must unpop in order, it's ok to unpop
> +         * everything.
> +         */
> +        if (elem) {
> +            virtqueue_unpop(svq->vq, elem, elem->len);


Shouldn't we wait until all pending requests have been drained? Or we
may end up with duplicated requests?
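Something like the below, maybe, before giving the elements back to the
guest (only a sketch: it completes the in-flight requests instead of
unpopping them, and it polls the shadow used ring with no timeout, just
to illustrate the idea; the helpers are the ones introduced in this
patch):

/*
 * Sketch: wait for the device to return every outstanding descriptor and
 * report those elements to the guest as used, so none of them can be
 * re-submitted later and complete twice.
 */
static void vhost_shadow_vq_drain(VhostShadowVirtqueue *svq)
{
    while (svq->used_idx != svq->avail_idx_shadow) {
        g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq);

        if (!elem) {
            continue; /* Device still owns some buffers, keep polling */
        }

        virtqueue_push(svq->vq, elem, elem->len);
    }
}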
Thanks


> +        }
> +    }
>  }
>
>  /*
> @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
>      unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
>      size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE);
>      g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size);
> -    int r;
> +    int r, i;
>
>      r = event_notifier_init(&svq->kick_notifier, 0);
>      if (r != 0) {
> @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
>      vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE);
>      svq->vq = virtio_get_queue(dev->vdev, vq_idx);
>      svq->vdev = dev->vdev;
> +    for (i = 0; i < num - 1; i++) {
> +        svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1);
> +    }
> +
> +    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
>      event_notifier_set_handler(&svq->call_notifier,
>                                 vhost_shadow_vq_handle_call);
>      qemu_event_init(&svq->masked_notifier.is_free, true);
> @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq)
>      event_notifier_cleanup(&vq->kick_notifier);
>      event_notifier_set_handler(&vq->call_notifier, NULL);
>      event_notifier_cleanup(&vq->call_notifier);
> +    g_free(vq->ring_id_maps);
>      g_free(vq);
>  }
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index eab3e334f2..a373999bc4 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
>
>      trace_vhost_iotlb_miss(dev, 1);
>
> +    if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) {
> +        uaddr = iova;
> +        len = 4096;
> +        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len,
> +                                                IOMMU_RW);
> +        if (ret) {
> +            trace_vhost_iotlb_miss(dev, 2);
> +            error_report("Fail to update device iotlb");
> +        }
> +
> +        return ret;
> +    }
> +
>      iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
>                                            iova, write,
>                                            MEMTXATTRS_UNSPECIFIED);
> @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
>      /* Can be read by vhost_virtqueue_mask, from vm exit */
>      qatomic_store_release(&dev->shadow_vqs_enabled, false);
>
> +    dev->vhost_ops->vhost_set_vring_enable(dev, false);
> +    if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) {
> +        error_report("Fail to invalidate device iotlb");
> +    }
> +
>      for (idx = 0; idx < dev->nvqs; ++idx) {
> +        /*
> +         * Update used ring information for IOTLB to work correctly,
> +         * vhost-kernel code requires for this.
> +         */
> +        struct vhost_virtqueue *vq = dev->vqs + idx;
> +        vhost_device_iotlb_miss(dev, vq->used_phys, true);
> +
>          vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]);
> +        vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx],
> +                              dev->vq_index + idx);
> +    }
> +
> +    /* Enable guest's vq vring */
> +    dev->vhost_ops->vhost_set_vring_enable(dev, true);
> +
> +    for (idx = 0; idx < dev->nvqs; ++idx) {
>          vhost_shadow_vq_free(dev->shadow_vqs[idx]);
>      }
>
> @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
>      return 0;
>  }
>
> +/*
> + * Start shadow virtqueue in a given queue.
> + * In failure case, this function leaves queue working as regular vhost mode.
> + */
> +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev,
> +                                             unsigned idx)
> +{
> +    struct vhost_vring_addr addr = {
> +        .index = idx,
> +    };
> +    struct vhost_vring_state s = {
> +        .index = idx,
> +    };
> +    int r;
> +    bool ok;
> +
> +    vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx);
> +    ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
> +    if (unlikely(!ok)) {
> +        return false;
> +    }
> +
> +    /* From this point, vhost_virtqueue_start can reset these changes */
> +    vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr);
> +    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
> +    if (unlikely(r != 0)) {
> +        VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed");
> +        goto err;
> +    }
> +
> +    r = dev->vhost_ops->vhost_set_vring_base(dev, &s);
> +    if (unlikely(r != 0)) {
> +        VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed");
> +        goto err;
> +    }
> +
> +    /*
> +     * Update used ring information for IOTLB to work correctly,
> +     * vhost-kernel code requires for this.
> +     */
> +    r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true);
> +    if (unlikely(r != 0)) {
> +        /* Debug message already printed */
> +        goto err;
> +    }
> +
> +    return true;
> +
> +err:
> +    vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx);
> +    return false;
> +}
> +
>  static int vhost_sw_live_migration_start(struct vhost_dev *dev)
>  {
>      int idx, stop_idx;
> @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev)
>          }
>      }
>
> +    dev->vhost_ops->vhost_set_vring_enable(dev, false);
> +    if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) {
> +        error_report("Fail to invalidate device iotlb");
> +    }
> +
>      /* Can be read by vhost_virtqueue_mask, from vm exit */
>      qatomic_store_release(&dev->shadow_vqs_enabled, true);
>      for (idx = 0; idx < dev->nvqs; ++idx) {
> -        bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
> +        bool ok = vhost_sw_live_migration_start_vq(dev, idx);
>          if (unlikely(!ok)) {
>              goto err_start;
>          }
>      }
>
> +    /* Enable shadow vq vring */
> +    dev->vhost_ops->vhost_set_vring_enable(dev, true);
>      return 0;
>
>  err_start:
>      qatomic_store_release(&dev->shadow_vqs_enabled, false);
>      for (stop_idx = 0; stop_idx < idx; stop_idx++) {
>          vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]);
> +        vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx],
> +                              dev->vq_index + stop_idx);
>      }
>
>  err_new:
> +    /* Enable guest's vring */
> +    dev->vhost_ops->vhost_set_vring_enable(dev, true);
>      for (idx = 0; idx < dev->nvqs; ++idx) {
>          vhost_shadow_vq_free(dev->shadow_vqs[idx]);
>      }
> @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp)
>
>      if (!hdev->started) {
>          err_cause = "Device is not started";
> +    } else if (!vhost_dev_has_iommu(hdev)) {
> +        err_cause = "Does not support iommu";
> +    } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
> +        err_cause = "Is packed";
> +    } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) {
> +        err_cause = "Have event idx";
> +    } else if (hdev->acked_features &
> +               BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) {
> +        err_cause = "Supports indirect descriptors";
> +    } else if (!hdev->vhost_ops->vhost_set_vring_enable) {
> +        err_cause = "Cannot pause device";
> +    }
> +
> +    if (err_cause) {
>          goto err;
>      }
>