Jason Wang
2021-Mar-17  02:50 UTC
[RFC v2 11/13] vhost: Shadow virtqueue buffers forwarding
? 2021/3/17 ??12:05, Eugenio Perez Martin ??:> On Tue, Mar 16, 2021 at 9:15 AM Jason Wang <jasowang at redhat.com> wrote: >> >> ? 2021/3/16 ??3:48, Eugenio P?rez ??: >>> Initial version of shadow virtqueue that actually forward buffers. >>> >>> It reuses the VirtQueue code for the device part. The driver part is >>> based on Linux's virtio_ring driver, but with stripped functionality >>> and optimizations so it's easier to review. >>> >>> These will be added in later commits. >>> >>> Signed-off-by: Eugenio P?rez <eperezma at redhat.com> >>> --- >>> hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++-- >>> hw/virtio/vhost.c | 113 ++++++++++++++- >>> 2 files changed, 312 insertions(+), 13 deletions(-) >>> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c >>> index 1460d1d5d1..68ed0f2740 100644 >>> --- a/hw/virtio/vhost-shadow-virtqueue.c >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c >>> @@ -9,6 +9,7 @@ >>> >>> #include "hw/virtio/vhost-shadow-virtqueue.h" >>> #include "hw/virtio/vhost.h" >>> +#include "hw/virtio/virtio-access.h" >>> >>> #include "standard-headers/linux/vhost_types.h" >>> >>> @@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue { >>> /* Virtio device */ >>> VirtIODevice *vdev; >>> >>> + /* Map for returning guest's descriptors */ >>> + VirtQueueElement **ring_id_maps; >>> + >>> + /* Next head to expose to device */ >>> + uint16_t avail_idx_shadow; >>> + >>> + /* Next free descriptor */ >>> + uint16_t free_head; >>> + >>> + /* Last seen used idx */ >>> + uint16_t shadow_used_idx; >>> + >>> + /* Next head to consume from device */ >>> + uint16_t used_idx; >>> + >>> /* Descriptors copied from guest */ >>> vring_desc_t descs[]; >>> } VhostShadowVirtqueue; >>> >>> -/* Forward guest notifications */ >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, >>> + const struct iovec *iovec, >>> + size_t num, bool more_descs, bool write) >>> +{ >>> + uint16_t i = svq->free_head, last = svq->free_head; >>> + unsigned n; >>> + uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0; >>> + vring_desc_t *descs = svq->vring.desc; >>> + >>> + if (num == 0) { >>> + return; >>> + } >>> + >>> + for (n = 0; n < num; n++) { >>> + if (more_descs || (n + 1 < num)) { >>> + descs[i].flags = flags | virtio_tswap16(svq->vdev, >>> + VRING_DESC_F_NEXT); >>> + } else { >>> + descs[i].flags = flags; >>> + } >>> + descs[i].addr = virtio_tswap64(svq->vdev, (hwaddr)iovec[n].iov_base); >> >> So unsing virtio_tswap() is probably not correct since we're talking >> with vhost backends which has its own endiness. >> > I was trying to expose the buffer with the same endianness as the > driver originally offered, so if guest->qemu requires a bswap, I think > it will always require a bswap again to expose to the device again.So assumes vhost-vDPA always provide a non-transitional device[1]. Then if Qemu present a transitional device, we need to do the endian conversion when necessary, if Qemu present a non-transitional device, we don't need to do that, guest driver will do that for us. But it looks to me the virtio_tswap() can't be used for this since it: static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) { #if defined(LEGACY_VIRTIO_IS_BIENDIAN) ??? return virtio_is_big_endian(vdev); #elif defined(TARGET_WORDS_BIGENDIAN) ??? if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { ??????? /* Devices conforming to VIRTIO 1.0 or later are always LE. */ ??????? return false; ??? } ??? return true; #else ??? return false; #endif } So if we present a legacy device on top of a non-transitiaonl vDPA device. The VIRITIO_F_VERSION_1 check is wrong.> >> For vhost-vDPA, we can assume that it's a 1.0 device. > Isn't it needed if the host is big endian?[1] So non-transitional device always assume little endian. For vhost-vDPA, we don't want to present transitional device which may end up with a lot of burdens. I suspect the legacy driver plust vhost vDPA already break, so I plan to mandate VERSION_1 for all vDPA devices.> >> >>> + descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len); >>> + >>> + last = i; >>> + i = virtio_tswap16(svq->vdev, descs[i].next); >>> + } >>> + >>> + svq->free_head = virtio_tswap16(svq->vdev, descs[last].next); >>> +} >>> + >>> +static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq, >>> + VirtQueueElement *elem) >>> +{ >>> + int head; >>> + unsigned avail_idx; >>> + vring_avail_t *avail = svq->vring.avail; >>> + >>> + head = svq->free_head; >>> + >>> + /* We need some descriptors here */ >>> + assert(elem->out_num || elem->in_num); >>> + >>> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, >>> + elem->in_num > 0, false); >>> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); >>> + >>> + /* >>> + * Put entry in available array (but don't update avail->idx until they >>> + * do sync). >>> + */ >>> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1); >>> + avail->ring[avail_idx] = virtio_tswap16(svq->vdev, head); >>> + svq->avail_idx_shadow++; >>> + >>> + /* Expose descriptors to device */ >>> + smp_wmb(); >>> + avail->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow); >>> + >>> + return head; >>> + >>> +} >>> + >>> +static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq, >>> + VirtQueueElement *elem) >>> +{ >>> + unsigned qemu_head = vhost_shadow_vq_add_split(svq, elem); >>> + >>> + svq->ring_id_maps[qemu_head] = elem; >>> +} >>> + >>> +/* Handle guest->device notifications */ >>> static void vhost_handle_guest_kick(EventNotifier *n) >>> { >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>> @@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n) >>> return; >>> } >>> >>> - event_notifier_set(&svq->kick_notifier); >>> + /* Make available as many buffers as possible */ >>> + do { >>> + if (virtio_queue_get_notification(svq->vq)) { >>> + /* No more notifications until process all available */ >>> + virtio_queue_set_notification(svq->vq, false); >>> + } >>> + >>> + while (true) { >>> + VirtQueueElement *elem; >>> + if (virtio_queue_full(svq->vq)) { >>> + break; >> >> So we've disabled guest notification. If buffer has been consumed, we >> need to retry the handle_guest_kick here. But I didn't find the code? >> > This code follows the pattern of virtio_blk_handle_vq: we jump out of > the inner while, and we re-enable the notifications. After that, we > check for updates on guest avail_idx.Ok, but this will end up with a lot of unnecessary kicks without event index.> >>> + } >>> + >>> + elem = virtqueue_pop(svq->vq, sizeof(*elem)); >>> + if (!elem) { >>> + break; >>> + } >>> + >>> + vhost_shadow_vq_add(svq, elem); >>> + event_notifier_set(&svq->kick_notifier); >>> + } >>> + >>> + virtio_queue_set_notification(svq->vq, true); >>> + } while (!virtio_queue_empty(svq->vq)); >>> +} >>> + >>> +static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq) >>> +{ >>> + if (svq->used_idx != svq->shadow_used_idx) { >>> + return true; >>> + } >>> + >>> + /* Get used idx must not be reordered */ >>> + smp_rmb(); >>> + svq->shadow_used_idx = virtio_tswap16(svq->vdev, svq->vring.used->idx); >>> + >>> + return svq->used_idx != svq->shadow_used_idx; >>> +} >>> + >>> +static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq) >>> +{ >>> + vring_desc_t *descs = svq->vring.desc; >>> + const vring_used_t *used = svq->vring.used; >>> + vring_used_elem_t used_elem; >>> + uint16_t last_used; >>> + >>> + if (!vhost_shadow_vq_more_used(svq)) { >>> + return NULL; >>> + } >>> + >>> + last_used = svq->used_idx & (svq->vring.num - 1); >>> + used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id); >>> + used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len); >>> + >>> + if (unlikely(used_elem.id >= svq->vring.num)) { >>> + error_report("Device %s says index %u is available", svq->vdev->name, >>> + used_elem.id); >>> + return NULL; >>> + } >>> + >>> + descs[used_elem.id].next = svq->free_head; >>> + svq->free_head = used_elem.id; >>> + >>> + svq->used_idx++; >>> + svq->ring_id_maps[used_elem.id]->len = used_elem.len; >>> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); >>> } >>> >>> /* Forward vhost notifications */ >>> @@ -78,6 +229,7 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, >>> call_notifier); >>> EventNotifier *masked_notifier; >>> + VirtQueue *vq = svq->vq; >>> >>> /* Signal start of using masked notifier */ >>> qemu_event_reset(&svq->masked_notifier.is_free); >>> @@ -86,14 +238,29 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>> qemu_event_set(&svq->masked_notifier.is_free); >>> } >>> >>> - if (!masked_notifier) { >>> - unsigned n = virtio_get_queue_index(svq->vq); >>> - virtio_queue_invalidate_signalled_used(svq->vdev, n); >>> - virtio_notify_irqfd(svq->vdev, svq->vq); >>> - } else if (!svq->masked_notifier.signaled) { >>> - svq->masked_notifier.signaled = true; >>> - event_notifier_set(svq->masked_notifier.n); >>> - } >>> + /* Make as many buffers as possible used. */ >>> + do { >>> + unsigned i = 0; >>> + >>> + /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */ >>> + while (true) { >>> + g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq); >>> + if (!elem) { >>> + break; >>> + } >>> + >>> + assert(i < svq->vring.num); >>> + virtqueue_fill(vq, elem, elem->len, i++); >>> + } >>> + >>> + virtqueue_flush(vq, i); >>> + if (!masked_notifier) { >>> + virtio_notify_irqfd(svq->vdev, svq->vq); >>> + } else if (!svq->masked_notifier.signaled) { >>> + svq->masked_notifier.signaled = true; >>> + event_notifier_set(svq->masked_notifier.n); >>> + } >>> + } while (vhost_shadow_vq_more_used(svq)); >>> >>> if (masked_notifier) { >>> /* Signal not using it anymore */ >>> @@ -103,7 +270,6 @@ static void vhost_shadow_vq_handle_call_no_test(EventNotifier *n) >>> >>> static void vhost_shadow_vq_handle_call(EventNotifier *n) >>> { >>> - >>> if (likely(event_notifier_test_and_clear(n))) { >>> vhost_shadow_vq_handle_call_no_test(n); >>> } >>> @@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, >>> unsigned idx, >>> VhostShadowVirtqueue *svq) >>> { >>> + int i; >>> int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq); >>> + >>> + assert(!dev->shadow_vqs_enabled); >>> + >>> if (unlikely(r < 0)) { >>> error_report("Couldn't restore vq kick fd: %s", strerror(-r)); >>> } >>> @@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev, >>> /* Restore vhost call */ >>> vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx, >>> dev->vqs[idx].notifier_is_masked); >>> + >>> + >>> + for (i = 0; i < svq->vring.num; ++i) { >>> + g_autofree VirtQueueElement *elem = svq->ring_id_maps[i]; >>> + /* >>> + * Although the doc says we must unpop in order, it's ok to unpop >>> + * everything. >>> + */ >>> + if (elem) { >>> + virtqueue_unpop(svq->vq, elem, elem->len); >> >> Shouldn't we need to wait until all pending requests to be drained? Or >> we may end up duplicated requests? >> > Do you mean pending as in-flight/processing in the device? The device > must be paused at this point.Ok. I see there's a vhost_set_vring_enable(dev, false) in vhost_sw_live_migration_start().> Currently there is no assertion for > this, maybe we can track the device status for it. > > For the queue handlers to be running at this point, the main event > loop should serialize QMP and handlers as far as I know (and they > would make all state inconsistent if the device stops suddenly). It > would need to be synchronized if the handlers run in their own AIO > context. That would be nice to have but it's not included here.That's why I suggest to just drop the QMP stuffs and use cli parameters to enable shadow virtqueue. Things would be greatly simplified I guess. Thanks> >> Thanks >> >> >>> + } >>> + } >>> } >>> >>> /* >>> @@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) >>> unsigned num = virtio_queue_get_num(dev->vdev, vq_idx); >>> size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE); >>> g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) + ring_size); >>> - int r; >>> + int r, i; >>> >>> r = event_notifier_init(&svq->kick_notifier, 0); >>> if (r != 0) { >>> @@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx) >>> vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE); >>> svq->vq = virtio_get_queue(dev->vdev, vq_idx); >>> svq->vdev = dev->vdev; >>> + for (i = 0; i < num - 1; i++) { >>> + svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1); >>> + } >>> + >>> + svq->ring_id_maps = g_new0(VirtQueueElement *, num); >>> event_notifier_set_handler(&svq->call_notifier, >>> vhost_shadow_vq_handle_call); >>> qemu_event_init(&svq->masked_notifier.is_free, true); >>> @@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq) >>> event_notifier_cleanup(&vq->kick_notifier); >>> event_notifier_set_handler(&vq->call_notifier, NULL); >>> event_notifier_cleanup(&vq->call_notifier); >>> + g_free(vq->ring_id_maps); >>> g_free(vq); >>> } >>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c >>> index eab3e334f2..a373999bc4 100644 >>> --- a/hw/virtio/vhost.c >>> +++ b/hw/virtio/vhost.c >>> @@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) >>> >>> trace_vhost_iotlb_miss(dev, 1); >>> >>> + if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) { >>> + uaddr = iova; >>> + len = 4096; >>> + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, >>> + IOMMU_RW); >>> + if (ret) { >>> + trace_vhost_iotlb_miss(dev, 2); >>> + error_report("Fail to update device iotlb"); >>> + } >>> + >>> + return ret; >>> + } >>> + >>> iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, >>> iova, write, >>> MEMTXATTRS_UNSPECIFIED); >>> @@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) >>> /* Can be read by vhost_virtqueue_mask, from vm exit */ >>> qatomic_store_release(&dev->shadow_vqs_enabled, false); >>> >>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); >>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { >>> + error_report("Fail to invalidate device iotlb"); >>> + } >>> + >>> for (idx = 0; idx < dev->nvqs; ++idx) { >>> + /* >>> + * Update used ring information for IOTLB to work correctly, >>> + * vhost-kernel code requires for this. >>> + */ >>> + struct vhost_virtqueue *vq = dev->vqs + idx; >>> + vhost_device_iotlb_miss(dev, vq->used_phys, true); >>> + >>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]); >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], >>> + dev->vq_index + idx); >>> + } >>> + >>> + /* Enable guest's vq vring */ >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>> + >>> + for (idx = 0; idx < dev->nvqs; ++idx) { >>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); >>> } >>> >>> @@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev) >>> return 0; >>> } >>> >>> +/* >>> + * Start shadow virtqueue in a given queue. >>> + * In failure case, this function leaves queue working as regular vhost mode. >>> + */ >>> +static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev, >>> + unsigned idx) >>> +{ >>> + struct vhost_vring_addr addr = { >>> + .index = idx, >>> + }; >>> + struct vhost_vring_state s = { >>> + .index = idx, >>> + }; >>> + int r; >>> + bool ok; >>> + >>> + vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); >>> + ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); >>> + if (unlikely(!ok)) { >>> + return false; >>> + } >>> + >>> + /* From this point, vhost_virtqueue_start can reset these changes */ >>> + vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr); >>> + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); >>> + if (unlikely(r != 0)) { >>> + VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed"); >>> + goto err; >>> + } >>> + >>> + r = dev->vhost_ops->vhost_set_vring_base(dev, &s); >>> + if (unlikely(r != 0)) { >>> + VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed"); >>> + goto err; >>> + } >>> + >>> + /* >>> + * Update used ring information for IOTLB to work correctly, >>> + * vhost-kernel code requires for this. >>> + */ >>> + r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true); >>> + if (unlikely(r != 0)) { >>> + /* Debug message already printed */ >>> + goto err; >>> + } >>> + >>> + return true; >>> + >>> +err: >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx); >>> + return false; >>> +} >>> + >>> static int vhost_sw_live_migration_start(struct vhost_dev *dev) >>> { >>> int idx, stop_idx; >>> @@ -1249,24 +1335,35 @@ static int vhost_sw_live_migration_start(struct vhost_dev *dev) >>> } >>> } >>> >>> + dev->vhost_ops->vhost_set_vring_enable(dev, false); >>> + if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) { >>> + error_report("Fail to invalidate device iotlb"); >>> + } >>> + >>> /* Can be read by vhost_virtqueue_mask, from vm exit */ >>> qatomic_store_release(&dev->shadow_vqs_enabled, true); >>> for (idx = 0; idx < dev->nvqs; ++idx) { >>> - bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]); >>> + bool ok = vhost_sw_live_migration_start_vq(dev, idx); >>> if (unlikely(!ok)) { >>> goto err_start; >>> } >>> } >>> >>> + /* Enable shadow vq vring */ >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>> return 0; >>> >>> err_start: >>> qatomic_store_release(&dev->shadow_vqs_enabled, false); >>> for (stop_idx = 0; stop_idx < idx; stop_idx++) { >>> vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]); >>> + vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], >>> + dev->vq_index + stop_idx); >>> } >>> >>> err_new: >>> + /* Enable guest's vring */ >>> + dev->vhost_ops->vhost_set_vring_enable(dev, true); >>> for (idx = 0; idx < dev->nvqs; ++idx) { >>> vhost_shadow_vq_free(dev->shadow_vqs[idx]); >>> } >>> @@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp) >>> >>> if (!hdev->started) { >>> err_cause = "Device is not started"; >>> + } else if (!vhost_dev_has_iommu(hdev)) { >>> + err_cause = "Does not support iommu"; >>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { >>> + err_cause = "Is packed"; >>> + } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) { >>> + err_cause = "Have event idx"; >>> + } else if (hdev->acked_features & >>> + BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) { >>> + err_cause = "Supports indirect descriptors"; >>> + } else if (!hdev->vhost_ops->vhost_set_vring_enable) { >>> + err_cause = "Cannot pause device"; >>> + } >>> + >>> + if (err_cause) { >>> goto err; >>> } >>> >