Jason Wang
2021-Nov-02 07:59 UTC
[RFC PATCH v5 22/26] vhost: Shadow virtqueue buffers forwarding
On 2021/10/30 2:35 AM, Eugenio Pérez wrote:
> Initial version of shadow virtqueue that actually forwards buffers. There
> is no IOMMU support at the moment, and that will be addressed in future
> patches of this series. Since all vhost-vdpa devices use forced IOMMU,
> this means that SVQ is not usable at this point of the series on any
> device.
>
> For simplicity it only supports modern devices, which expect a vring in
> little endian, with a split ring and no event idx or indirect
> descriptors. Support for them will not be added in this series.
>
> It reuses the VirtQueue code for the device part. The driver part is
> based on Linux's virtio_ring driver, but with stripped functionality
> and optimizations so it's easier to review. Later commits add simpler
> ones.
>
> However, forwarding buffers has some particular pieces: one of the most
> unexpected ones is that a guest's buffer can expand through more than
> one descriptor in SVQ. While this is handled gracefully by qemu's
> emulated virtio devices, it may cause unexpected SVQ queue full. This
> patch also solves it by checking for this condition at both guest's
> kicks and device's calls. The code may be more elegant in the future if
> SVQ code runs in its own iocontext.
>
> Note that vhost_vdpa_get_vq_state trusts the device to write its status
> to used_idx at pause(), finishing all in-flight descriptors. This may
> not be enough for complex devices, but other development like usage of
> inflight_fd on top of this solution may extend the usage in the future.
>
> In particular, SVQ trusts it to recover the guest's virtqueue at start,
> and to mark as used the latest descriptors used by the device in the
> meantime.
>
> Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
> ---
>  qapi/net.json                      |   5 +-
>  hw/virtio/vhost-shadow-virtqueue.c | 400 +++++++++++++++++++++++++++--
>  hw/virtio/vhost-vdpa.c             | 144 ++++++++++-
>  3 files changed, 521 insertions(+), 28 deletions(-)
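To make the queue-full corner case concrete (a sketch only, using the
helpers this patch adds below): with a 256-entry SVQ, avail_idx_shadow =
300 and shadow_used_idx = 50 leave 256 - (300 - 50) = 6 free slots, so a
guest buffer that fragments into, say, 8 host-vaddr iovecs cannot be
forwarded until the device marks some buffers as used:

    /* Sketch: mirrors the check vhost_handle_guest_kick() does below */
    if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
        svq->next_guest_avail_elem = elem; /* retry at next device call */
        return;
    }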
> diff --git a/qapi/net.json b/qapi/net.json
> index fca2f6ebca..1c6d3b2179 100644
> --- a/qapi/net.json
> +++ b/qapi/net.json
> @@ -84,12 +84,9 @@
>  #
>  # Use vhost shadow virtqueue.
>  #
> -# SVQ can just forward notifications between the device and the guest at this
> -# moment. This will expand in future changes.
> -#
>  # @name: the device name of the VirtIO device
>  #
> -# @set: true to use the alternate shadow VQ notifications
> +# @set: true to use the alternate shadow VQ
>  #
>  # Since: 6.2
>  #
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index cb9ffcb015..ad1b2342be 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -9,6 +9,9 @@
>
>  #include "qemu/osdep.h"
>  #include "hw/virtio/vhost-shadow-virtqueue.h"
> +#include "hw/virtio/vhost.h"
> +#include "hw/virtio/virtio-access.h"
> +
>  #include "standard-headers/linux/vhost_types.h"
>
>  #include "qemu/error-report.h"
> @@ -45,6 +48,27 @@ typedef struct VhostShadowVirtqueue {
>
>      /* Virtio device */
>      VirtIODevice *vdev;
> +
> +    /* Map for returning guest's descriptors */
> +    VirtQueueElement **ring_id_maps;
> +
> +    /* Next VirtQueue element that guest made available */
> +    VirtQueueElement *next_guest_avail_elem;
> +
> +    /* Next head to expose to device */
> +    uint16_t avail_idx_shadow;
> +
> +    /* Next free descriptor */
> +    uint16_t free_head;
> +
> +    /* Last seen used idx */
> +    uint16_t shadow_used_idx;
> +
> +    /* Next head to consume from device */
> +    uint16_t last_used_idx;
> +
> +    /* Cache for the exposed notification flag */
> +    bool notification;
>  } VhostShadowVirtqueue;
>
>  /**
> @@ -56,25 +80,174 @@ const EventNotifier *vhost_svq_get_dev_kick_notifier(
>      return &svq->hdev_kick;
>  }
>
> -/* If the device is using some of these, SVQ cannot communicate */
> +/**
> + * VirtIO transport device feature acknowledge
> + *
> + * @dev_features  The device features. On success, the acknowledged features.
> + *
> + * Returns true if SVQ can go with a subset of these, false otherwise.
> + */
>  bool vhost_svq_valid_device_features(uint64_t *dev_features)
>  {
> -    return true;
> +    uint64_t b;
> +    bool r = true;
> +
> +    for (b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; ++b) {
> +        switch (b) {
> +        case VIRTIO_F_NOTIFY_ON_EMPTY:
> +        case VIRTIO_F_ANY_LAYOUT:
> +            continue;
> +
> +        case VIRTIO_F_ACCESS_PLATFORM:
> +            /* SVQ does not know how to translate addresses */
> +            if (*dev_features & BIT_ULL(b)) {
> +                clear_bit(b, dev_features);
> +                r = false;
> +            }
> +            break;
> +
> +        case VIRTIO_F_VERSION_1:
> +            /* SVQ trusts that the guest vring is little endian */
> +            if (!(*dev_features & BIT_ULL(b))) {
> +                set_bit(b, dev_features);
> +                r = false;
> +            }
> +            continue;
> +
> +        default:
> +            if (*dev_features & BIT_ULL(b)) {
> +                clear_bit(b, dev_features);
> +            }
> +        }
> +    }
> +
> +    return r;
>  }
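A quick usage sketch (illustrative only, not part of the patch): a device
offering VIRTIO_F_ACCESS_PLATFORM fails validation and has the bit
cleared, since SVQ cannot translate addresses yet:

    uint64_t features = BIT_ULL(VIRTIO_F_VERSION_1) |
                        BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);

    if (!vhost_svq_valid_device_features(&features)) {
        /*
         * ACCESS_PLATFORM has been cleared; features is now only
         * BIT_ULL(VIRTIO_F_VERSION_1). A device that did not offer
         * VERSION_1 would fail too, with that bit set instead.
         */
    }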
>
> -/* If the guest is using some of these, SVQ cannot communicate */
> +/**
> + * Check the guest's acknowledged features.
> + *
> + * @guest_features  The guest's acknowledged features
> + *
> + * Returns true if SVQ can handle them, false otherwise.
> + */
>  bool vhost_svq_valid_guest_features(uint64_t *guest_features)
>  {
> -    return true;
> +    static const uint64_t transport = MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
> +                            VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
> +
> +    /* These transport features are handled by VirtQueue */
> +    static const uint64_t valid = (BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC) |
> +                                   BIT_ULL(VIRTIO_F_VERSION_1));
> +
> +    /* We are only interested in transport-related feature bits */
> +    uint64_t guest_transport_features = (*guest_features) & transport;
> +
> +    *guest_features &= (valid | ~transport);
> +    return !(guest_transport_features & (transport ^ valid));
>  }
>
> -/* Forward guest notifications */
> -static void vhost_handle_guest_kick(EventNotifier *n)
> +/**
> + * Number of descriptors that SVQ can make available from the guest.
> + *
> + * @svq  The svq
> + */
> +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
>  {
> -    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> -                                             svq_kick);
> +    return svq->vring.num - (svq->avail_idx_shadow - svq->shadow_used_idx);
> +}
>
> -    if (unlikely(!event_notifier_test_and_clear(n))) {
> +static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> +{
> +    uint16_t notification_flag;
> +
> +    if (svq->notification == enable) {
> +        return;
> +    }
> +
> +    notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
> +
> +    svq->notification = enable;
> +    if (enable) {
> +        svq->vring.avail->flags &= ~notification_flag;
> +    } else {
> +        svq->vring.avail->flags |= notification_flag;
> +    }
> +}
> +
> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> +                                    const struct iovec *iovec,
> +                                    size_t num, bool more_descs, bool write)
> +{
> +    uint16_t i = svq->free_head, last = svq->free_head;
> +    unsigned n;
> +    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> +    vring_desc_t *descs = svq->vring.desc;
> +
> +    if (num == 0) {
> +        return;
> +    }
> +
> +    for (n = 0; n < num; n++) {
> +        if (more_descs || (n + 1 < num)) {
> +            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> +        } else {
> +            descs[i].flags = flags;
> +        }
> +        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> +        descs[i].len = cpu_to_le32(iovec[n].iov_len);
> +
> +        last = i;
> +        i = cpu_to_le16(descs[i].next);
> +    }
> +
> +    svq->free_head = le16_to_cpu(descs[last].next);
> +}
> +
> +static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> +                                    VirtQueueElement *elem)
> +{
> +    int head;
> +    unsigned avail_idx;
> +    vring_avail_t *avail = svq->vring.avail;
> +
> +    head = svq->free_head;
> +
> +    /* We need some descriptors here */
> +    assert(elem->out_num || elem->in_num);
> +
> +    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> +                            elem->in_num > 0, false);
> +    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> +
> +    /*
> +     * Put entry in available array (but don't update avail->idx until they
> +     * do sync).
> +     */
> +    avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> +    avail->ring[avail_idx] = cpu_to_le16(head);
> +    svq->avail_idx_shadow++;
> +
> +    /* Update avail index after the descriptor is written */
> +    smp_wmb();

A question: since we may talk with real hardware, is smp_wmb() sufficient
in this case, or do we need to honor VIRTIO_F_ORDER_PLATFORM?
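For comparison, Linux's virtio_ring driver picks the barrier with
virtio_wmb(vq->weak_barriers). A comparable sketch here (the
svq->weak_ordering field and platform_wmb() helper are hypothetical, not
existing QEMU APIs) would be:

    /*
     * Hypothetical: publish avail->idx with a DMA-ordering barrier when
     * VIRTIO_F_ORDER_PLATFORM was negotiated, i.e. when the consumer may
     * be real hardware rather than another CPU.
     */
    if (svq->weak_ordering) {
        smp_wmb();        /* software device: SMP ordering is enough */
    } else {
        platform_wmb();   /* e.g. dmb(oshst) on ARM */
    }
    avail->idx = cpu_to_le16(svq->avail_idx_shadow);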
> +    avail->idx = cpu_to_le16(svq->avail_idx_shadow);
> +
> +    return head;
> +
> +}
> +
> +static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> +{
> +    unsigned qemu_head = vhost_svq_add_split(svq, elem);
> +
> +    svq->ring_id_maps[qemu_head] = elem;
> +}
> +
> +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> +{
> +    /* We need to expose available array entries before checking used flags */
> +    smp_mb();
> +    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
>          return;
>      }
>
> @@ -86,25 +259,188 @@ static void vhost_handle_guest_kick(EventNotifier *n)
>      }
>  }
>
> -/*
> - * Set the device's memory region notifier. addr = NULL clear it.
> +/**
> + * Forward available buffers.
> + *
> + * @svq  Shadow VirtQueue
> + *
> + * Note that this function does not guarantee that all guest's available
> + * buffers are available to the device in SVQ avail ring. The guest may have
> + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
> + * qemu vaddr.
> + *
> + * If that happens, guest's kick notifications will be disabled until the
> + * device makes some buffers used.
>   */
> -void vhost_svq_set_host_mr_notifier(VhostShadowVirtqueue *svq, void *addr)
> +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
>  {
> -    svq->host_notifier_mr = addr;
> +    /* Clear event notifier */
> +    event_notifier_test_and_clear(&svq->svq_kick);
> +
> +    /* Make available as many buffers as possible */
> +    do {
> +        if (virtio_queue_get_notification(svq->vq)) {
> +            virtio_queue_set_notification(svq->vq, false);
> +        }
> +
> +        while (true) {
> +            VirtQueueElement *elem;
> +
> +            if (svq->next_guest_avail_elem) {
> +                elem = g_steal_pointer(&svq->next_guest_avail_elem);
> +            } else {
> +                elem = virtqueue_pop(svq->vq, sizeof(*elem));
> +            }
> +
> +            if (!elem) {
> +                break;
> +            }
> +
> +            if (elem->out_num + elem->in_num >
> +                vhost_svq_available_slots(svq)) {
> +                /*
> +                 * This condition is possible since a contiguous buffer in
> +                 * GPA does not imply a contiguous buffer in qemu's VA
> +                 * scatter-gather segments. If that happens, the buffer
> +                 * exposed to the device needs to be a chain of descriptors
> +                 * at this moment.
> +                 *
> +                 * SVQ cannot hold more available buffers if we are here:
> +                 * queue the current guest descriptor and ignore further
> +                 * kicks until some elements are used.
> +                 */

I wonder what's the advantage of tracking the pending elem like this. It
looks to me we could simply rewind last_avail_idx in this case?
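Something like this, if the rewind route were taken (a sketch; it assumes
virtqueue_unpop() bookkeeping is compatible with how SVQ pops elements):

    if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
        /* Return the element to the guest's avail ring and bail out */
        virtqueue_unpop(svq->vq, elem, 0);
        g_free(elem);
        return;
    }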
> +                svq->next_guest_avail_elem = elem;
> +                return;
> +            }
> +
> +            vhost_svq_add(svq, elem);
> +            vhost_svq_kick(svq);
> +        }
> +
> +        virtio_queue_set_notification(svq->vq, true);
> +    } while (!virtio_queue_empty(svq->vq));
> +}
> +
> +/**
> + * Handle guest's kick.
> + *
> + * @n  guest kick event notifier, the one that guest set to notify svq.
> + */
> +static void vhost_handle_guest_kick_notifier(EventNotifier *n)
> +{
> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> +                                             svq_kick);
> +    vhost_handle_guest_kick(svq);
> +}
> +
> +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> +{
> +    if (svq->last_used_idx != svq->shadow_used_idx) {
> +        return true;
> +    }
> +
> +    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
> +
> +    return svq->last_used_idx != svq->shadow_used_idx;
> +}
> +
> +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> +{
> +    vring_desc_t *descs = svq->vring.desc;
> +    const vring_used_t *used = svq->vring.used;
> +    vring_used_elem_t used_elem;
> +    uint16_t last_used;
> +
> +    if (!vhost_svq_more_used(svq)) {
> +        return NULL;
> +    }
> +
> +    /* Only get used array entries after they have been exposed by dev */
> +    smp_rmb();
> +    last_used = svq->last_used_idx & (svq->vring.num - 1);
> +    used_elem.id = le32_to_cpu(used->ring[last_used].id);
> +    used_elem.len = le32_to_cpu(used->ring[last_used].len);
> +
> +    svq->last_used_idx++;
> +    if (unlikely(used_elem.id >= svq->vring.num)) {
> +        error_report("Device %s says index %u is used", svq->vdev->name,
> +                     used_elem.id);
> +        return NULL;
> +    }
> +
> +    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
> +        error_report(
> +            "Device %s says index %u is used, but it was not available",
> +            svq->vdev->name, used_elem.id);
> +        return NULL;
> +    }
> +
> +    descs[used_elem.id].next = svq->free_head;
> +    svq->free_head = used_elem.id;
> +
> +    svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> +    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
>  }
>
> -/* Forward vhost notifications */
> +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
> +                            bool check_for_avail_queue)
> +{
> +    VirtQueue *vq = svq->vq;
> +
> +    /* Make as many buffers as possible used. */
> +    do {
> +        unsigned i = 0;
> +
> +        vhost_svq_set_notification(svq, false);
> +        while (true) {
> +            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> +            if (!elem) {
> +                break;
> +            }
> +
> +            if (unlikely(i >= svq->vring.num)) {
> +                virtio_error(svq->vdev,
> +                        "More than %u used buffers obtained in a %u size SVQ",
> +                        i, svq->vring.num);
> +                virtqueue_fill(vq, elem, elem->len, i);
> +                virtqueue_flush(vq, i);
> +                i = 0;
> +            }
> +            virtqueue_fill(vq, elem, elem->len, i++);
> +        }
> +
> +        virtqueue_flush(vq, i);
> +        event_notifier_set(&svq->svq_call);
> +
> +        if (check_for_avail_queue && svq->next_guest_avail_elem) {
> +            /*
> +             * Avail ring was full when vhost_svq_flush was called, so it's a
> +             * good moment to make more descriptors available if possible
> +             */
> +            vhost_handle_guest_kick(svq);
> +        }
> +
> +        vhost_svq_set_notification(svq, true);
> +    } while (vhost_svq_more_used(svq));

So this actually doesn't make sure all the buffers were processed by the
device? Is this intended? (I see it is called by vhost_svq_stop().) Note
that it means some buffers might not be submitted to the device after
migration.
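If a full drain were wanted at stop time instead, a sketch could look like
this (vhost_svq_drain() is hypothetical, and it assumes the device
eventually uses every descriptor SVQ made available):

    static void vhost_svq_drain(VhostShadowVirtqueue *svq)
    {
        /* Busy-wait until the device catches up with avail_idx_shadow */
        while (svq->last_used_idx != svq->avail_idx_shadow) {
            vhost_svq_flush(svq, false);
        }
    }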
> +}
> +
> +/**
> + * Forward used buffers.
> + *
> + * @n  hdev call event notifier, the one that device set to notify svq.
> + *
> + * Note that we are not making any buffers available in the loop; there is
> + * no way that it runs more than virtqueue size times.
> + */
>  static void vhost_svq_handle_call(EventNotifier *n)
>  {
>      VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>                                               hdev_call);
>
> -    if (unlikely(!event_notifier_test_and_clear(n))) {
> -        return;
> -    }
> +    /* Clear event notifier */
> +    event_notifier_test_and_clear(n);
>
> -    event_notifier_set(&svq->svq_call);
> +    vhost_svq_flush(svq, true);
>  }
>
>  /*
> @@ -132,6 +468,14 @@ void vhost_svq_set_guest_call_notifier(VhostShadowVirtqueue *svq, int call_fd)
>      event_notifier_init_fd(&svq->svq_call, call_fd);
>  }
>
> +/*
> + * Set the device's memory region notifier. addr = NULL clear it.
> + */
> +void vhost_svq_set_host_mr_notifier(VhostShadowVirtqueue *svq, void *addr)
> +{
> +    svq->host_notifier_mr = addr;
> +}
> +
>  /*
>   * Get the shadow vq vring address.
>   * @svq Shadow virtqueue
> @@ -185,7 +529,8 @@ static void vhost_svq_set_svq_kick_fd_internal(VhostShadowVirtqueue *svq,
>       * need to explicitely check for them.
>       */
>      event_notifier_init_fd(&svq->svq_kick, svq_kick_fd);
> -    event_notifier_set_handler(&svq->svq_kick, vhost_handle_guest_kick);
> +    event_notifier_set_handler(&svq->svq_kick,
> +                               vhost_handle_guest_kick_notifier);
>
>      /*
>       * !check_old means that we are starting SVQ, taking the descriptor from
> @@ -233,7 +578,16 @@ void vhost_svq_start(struct vhost_dev *dev, unsigned idx,
>  void vhost_svq_stop(struct vhost_dev *dev, unsigned idx,
>                      VhostShadowVirtqueue *svq)
>  {
> +    unsigned i;
>      event_notifier_set_handler(&svq->svq_kick, NULL);
> +    vhost_svq_flush(svq, false);
> +
> +    for (i = 0; i < svq->vring.num; ++i) {
> +        g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
> +        if (elem) {
> +            virtqueue_detach_element(svq->vq, elem, elem->len);
> +        }
> +    }
>  }
>
>  /*
> @@ -248,7 +602,7 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
>      size_t driver_size;
>      size_t device_size;
>      g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> -    int r;
> +    int r, i;
>
>      r = event_notifier_init(&svq->hdev_kick, 0);
>      if (r != 0) {
> @@ -274,6 +628,11 @@ VhostShadowVirtqueue *vhost_svq_new(struct vhost_dev *dev, int idx)
>      memset(svq->vring.desc, 0, driver_size);
>      svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
>      memset(svq->vring.used, 0, device_size);
> +    for (i = 0; i < num - 1; i++) {
> +        svq->vring.desc[i].next = cpu_to_le16(i + 1);
> +    }
> +
> +    svq->ring_id_maps = g_new0(VirtQueueElement *, num);
>      event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
>      return g_steal_pointer(&svq);
>
> @@ -292,6 +651,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
>      event_notifier_cleanup(&vq->hdev_kick);
>      event_notifier_set_handler(&vq->hdev_call, NULL);
>      event_notifier_cleanup(&vq->hdev_call);
> +    g_free(vq->ring_id_maps);
>      qemu_vfree(vq->vring.desc);
>      qemu_vfree(vq->vring.used);
>      g_free(vq);
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index fc8396ba8a..e1c55e43e7 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -19,6 +19,7 @@
>  #include "hw/virtio/virtio-net.h"
>  #include "hw/virtio/vhost-shadow-virtqueue.h"
>  #include "hw/virtio/vhost-vdpa.h"
> +#include "hw/virtio/vhost-shadow-virtqueue.h"
>  #include "exec/address-spaces.h"
>  #include "qemu/main-loop.h"
>  #include "cpu.h"
> @@ -821,6 +822,19 @@ static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
>      return true;
>  }
>
> +static int vhost_vdpa_vring_pause(struct vhost_dev *dev)
> +{
> +    int r;
> +    uint8_t status;
> +
> +    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DEVICE_STOPPED);
> +    do {
> +        r = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &status);
> +    } while (r == 0 && !(status & VIRTIO_CONFIG_S_DEVICE_STOPPED));
> +
> +    return 0;
> +}
> +
>  /*
>   * Start or stop a shadow virtqueue in a vdpa device
>   *
> @@ -844,7 +858,14 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx,
>          .index = vq_index,
>      };
>      struct vhost_vring_file vhost_call_file = {
> -        .index = idx + dev->vq_index,
> +        .index = vq_index,
> +    };
> +    struct vhost_vring_addr addr = {
> +        .index = vq_index,
> +    };
> +    struct vhost_vring_state num = {
> +        .index = vq_index,
> +        .num = virtio_queue_get_num(dev->vdev, vq_index),
>      };
>      int r;
>
> @@ -852,6 +873,7 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx,
>      const EventNotifier *vhost_kick = vhost_svq_get_dev_kick_notifier(svq);
>      const EventNotifier *vhost_call = vhost_svq_get_svq_call_notifier(svq);
>
> +    vhost_svq_get_vring_addr(svq, &addr);
>      if (n->addr) {
>          r = virtio_queue_set_host_notifier_mr(dev->vdev, idx, &n->mr,
>                                                false);
> @@ -870,8 +892,20 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx,
>          vhost_kick_file.fd = event_notifier_get_fd(vhost_kick);
>          vhost_call_file.fd = event_notifier_get_fd(vhost_call);
>      } else {
> +        struct vhost_vring_state state = {
> +            .index = vq_index,
> +        };
> +
>          vhost_svq_stop(dev, idx, svq);
>
> +        state.num = virtio_queue_get_last_avail_idx(dev->vdev, idx);
> +        r = vhost_vdpa_set_vring_base(dev, &state);
> +        if (unlikely(r)) {
> +            error_setg_errno(errp, -r, "vhost_set_vring_base failed");
> +            return false;
> +        }
> +
> +        vhost_vdpa_vq_get_addr(dev, &addr, &dev->vqs[idx]);
>          if (n->addr) {
>              r = virtio_queue_set_host_notifier_mr(dev->vdev, idx, &n->mr,
>                                                    true);
> @@ -885,6 +919,17 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx,
>          vhost_call_file.fd = v->call_fd[idx];
>      }
>
> +    r = vhost_vdpa_set_vring_addr(dev, &addr);
> +    if (unlikely(r)) {
> +        error_setg_errno(errp, -r, "vhost_set_vring_addr failed");
> +        return false;
> +    }
> +    r = vhost_vdpa_set_vring_num(dev, &num);
> +    if (unlikely(r)) {
> +        error_setg_errno(errp, -r, "vhost_set_vring_num failed");
> +        return false;
> +    }
> +
>      r = vhost_vdpa_set_vring_dev_kick(dev, &vhost_kick_file);
>      if (unlikely(r)) {
>          error_setg_errno(errp, -r, "vhost_vdpa_set_vring_kick failed");
> @@ -899,6 +944,50 @@ static bool vhost_vdpa_svq_start_vq(struct vhost_dev *dev, unsigned idx,
>      return true;
>  }
>
> +static void vhost_vdpa_get_vq_state(struct vhost_dev *dev, unsigned idx)
> +{
> +    struct VirtIODevice *vdev = dev->vdev;
> +
> +    virtio_queue_restore_last_avail_idx(vdev, idx);
> +    virtio_queue_invalidate_signalled_used(vdev, idx);
> +    virtio_queue_update_used_idx(vdev, idx);
> +}

Do we need to change vhost_vdpa_get_vring_base() to return
vq->last_avail_idx as well?
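Something along these lines, for instance (a sketch; the
shadow_vqs_enabled field is hypothetical here):

    static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
    {
        struct vhost_vdpa *v = dev->opaque;

        if (v->shadow_vqs_enabled) {
            /* Report the guest-visible index SVQ tracks, not the device's */
            ring->num = virtio_queue_get_last_avail_idx(dev->vdev,
                                                        ring->index);
            return 0;
        }

        return vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    }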
Thanks