On 2022/1/22 4:27 PM, Eugenio Pérez wrote:
> Initial version of shadow virtqueue that actually forwards buffers. There
> is no iommu support at the moment, and that will be addressed in future
> patches of this series. Since all vhost-vdpa devices use forced IOMMU,
> this means that SVQ is not usable at this point of the series on any
> device.
>
> For simplicity it only supports modern devices, which expect the vring
> in little endian, with a split ring and no event idx or indirect
> descriptors. Support for them will not be added in this series.
>
> It reuses the VirtQueue code for the device part. The driver part is
> based on Linux's virtio_ring driver, but with stripped functionality
> and optimizations so it's easier to review.
>
> However, forwarding buffers has some particular pieces: one of the most
> unexpected ones is that a guest's buffer can span more than one
> descriptor in SVQ. While this is handled gracefully by qemu's emulated
> virtio devices, it may cause an unexpected SVQ queue full condition.
> This patch also solves it by checking for this condition at both
> guest's kicks and device's calls. The code may be more elegant in the
> future if SVQ code runs in its own iocontext.
>
> Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
> ---
> hw/virtio/vhost-shadow-virtqueue.h | 2 +
> hw/virtio/vhost-shadow-virtqueue.c | 365 ++++++++++++++++++++++++++++-
> hw/virtio/vhost-vdpa.c | 111 ++++++++-
> 3 files changed, 462 insertions(+), 16 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.h
b/hw/virtio/vhost-shadow-virtqueue.h
> index 39aef5ffdf..19c934af49 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.h
> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> @@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue
*svq);
> size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
> size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
>
> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> + VirtQueue *vq);
> void vhost_svq_stop(VhostShadowVirtqueue *svq);
>
> VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c
b/hw/virtio/vhost-shadow-virtqueue.c
> index 7c168075d7..a1a404f68f 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -9,6 +9,8 @@
>
> #include "qemu/osdep.h"
> #include "hw/virtio/vhost-shadow-virtqueue.h"
> +#include "hw/virtio/vhost.h"
> +#include "hw/virtio/virtio-access.h"
> #include "standard-headers/linux/vhost_types.h"
>
> #include "qemu/error-report.h"
> @@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue {
>
> /* Guest's call notifier, where SVQ calls guest. */
> EventNotifier svq_call;
> +
> + /* Virtio queue shadowing */
> + VirtQueue *vq;
> +
> + /* Virtio device */
> + VirtIODevice *vdev;
> +
> + /* Map for returning guest's descriptors */
> + VirtQueueElement **ring_id_maps;
> +
> + /* Next VirtQueue element that guest made available */
> + VirtQueueElement *next_guest_avail_elem;
> +
> + /* Next head to expose to device */
> + uint16_t avail_idx_shadow;
> +
> + /* Next free descriptor */
> + uint16_t free_head;
> +
> + /* Last seen used idx */
> + uint16_t shadow_used_idx;
> +
> + /* Next head to consume from device */
> + uint16_t last_used_idx;
> +
> + /* Cache for the exposed notification flag */
> + bool notification;
> } VhostShadowVirtqueue;
>
> #define INVALID_SVQ_KICK_FD -1
> @@ -148,30 +177,294 @@ bool vhost_svq_ack_guest_features(uint64_t
dev_features,
> return true;
> }
>
> -/* Forward guest notifications */
> -static void vhost_handle_guest_kick(EventNotifier *n)
> +/**
> + * Number of descriptors that SVQ can make available from the guest.
> + *
> + * @svq The svq
> + */
> +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
> {
> - VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> - svq_kick);
> + return svq->vring.num - (svq->avail_idx_shadow -
svq->shadow_used_idx);
> +}
> +
> +static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool
enable)
> +{
> + uint16_t notification_flag;
>
> - if (unlikely(!event_notifier_test_and_clear(n))) {
> + if (svq->notification == enable) {
> + return;
> + }
> +
> + notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
> +
> + svq->notification = enable;
> + if (enable) {
> + svq->vring.avail->flags &= ~notification_flag;
> + } else {
> + svq->vring.avail->flags |= notification_flag;
> + }
> +}
> +
> +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> + const struct iovec *iovec,
> + size_t num, bool more_descs, bool
write)
> +{
> + uint16_t i = svq->free_head, last = svq->free_head;
> + unsigned n;
> + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> + vring_desc_t *descs = svq->vring.desc;
> +
> + if (num == 0) {
> + return;
> + }
> +
> + for (n = 0; n < num; n++) {
> + if (more_descs || (n + 1 < num)) {
> + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> + } else {
> + descs[i].flags = flags;
> + }
> + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> + descs[i].len = cpu_to_le32(iovec[n].iov_len);
> +
> + last = i;
> + i = cpu_to_le16(descs[i].next);
> + }
> +
> + svq->free_head = le16_to_cpu(descs[last].next);
> +}
> +
> +static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> + VirtQueueElement *elem)
> +{
> + int head;
> + unsigned avail_idx;
> + vring_avail_t *avail = svq->vring.avail;
> +
> + head = svq->free_head;
> +
> + /* We need some descriptors here */
> + assert(elem->out_num || elem->in_num);
Looks like this could be triggered by the guest, so we need to fail here
instead of asserting.
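Something like this, perhaps (just a sketch; it assumes vhost_svq_add_split()
and its callers are reworked to propagate the error):

    if (unlikely(!elem->out_num && !elem->in_num)) {
        virtio_error(svq->vdev, "Guest provided element with no descriptors");
        return -EINVAL;
    }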
> +
> + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> + elem->in_num > 0, false);
> + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false,
true);
> +
> + /*
> + * Put entry in available array (but don't update avail->idx
until they
> + * do sync).
> + */
> + avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> + avail->ring[avail_idx] = cpu_to_le16(head);
> + svq->avail_idx_shadow++;
> +
> + /* Update avail index after the descriptor is wrote */
> + smp_wmb();
> + avail->idx = cpu_to_le16(svq->avail_idx_shadow);
> +
> + return head;
> +}
> +
> +static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement
*elem)
> +{
> + unsigned qemu_head = vhost_svq_add_split(svq, elem);
> +
> + svq->ring_id_maps[qemu_head] = elem;
> +}
> +
> +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> +{
> + /* We need to expose available array entries before checking used
flags */
> + smp_mb();
> + if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
> return;
> }
>
> event_notifier_set(&svq->hdev_kick);
> }
>
> -/* Forward vhost notifications */
> +/**
> + * Forward available buffers.
> + *
> + * @svq Shadow VirtQueue
> + *
> + * Note that this function does not guarantee that all guest's
available
> + * buffers are available to the device in SVQ avail ring. The guest may
have
> + * exposed a GPA / GIOVA congiuous buffer, but it may not be contiguous in
qemu
> + * vaddr.
> + *
> + * If that happens, guest's kick notifications will be disabled until
device
> + * makes some buffers used.
> + */
> +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
> +{
> + /* Clear event notifier */
> + event_notifier_test_and_clear(&svq->svq_kick);
> +
> + /* Make available as many buffers as possible */
> + do {
> + if (virtio_queue_get_notification(svq->vq)) {
> + virtio_queue_set_notification(svq->vq, false);
This looks like an optimization that should belong to
virtio_queue_set_notification() itself.
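I.e. something like this (untested sketch; it relies on the state VirtQueue
already tracks in vq->notification):

    void virtio_queue_set_notification(VirtQueue *vq, int enable)
    {
        if (vq->notification == enable) {
            return;
        }
        vq->notification = enable;
        ...
    }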
> + }
> +
> + while (true) {
> + VirtQueueElement *elem;
> +
> + if (svq->next_guest_avail_elem) {
> + elem =
g_steal_pointer(&svq->next_guest_avail_elem);
> + } else {
> + elem = virtqueue_pop(svq->vq, sizeof(*elem));
> + }
> +
> + if (!elem) {
> + break;
> + }
> +
> + if (elem->out_num + elem->in_num >
> + vhost_svq_available_slots(svq)) {
> + /*
> + * This condition is possible since a contiguous buffer in
GPA
> + * does not imply a contiguous buffer in qemu's VA
> + * scatter-gather segments. If that happen, the buffer
exposed
> + * to the device needs to be a chain of descriptors at
this
> + * moment.
> + *
> + * SVQ cannot hold more available buffers if we are here:
> + * queue the current guest descriptor and ignore further
kicks
> + * until some elements are used.
> + */
> + svq->next_guest_avail_elem = elem;
> + return;
> + }
> +
> + vhost_svq_add(svq, elem);
> + vhost_svq_kick(svq);
> + }
> +
> + virtio_queue_set_notification(svq->vq, true);
> + } while (!virtio_queue_empty(svq->vq));
> +}
> +
> +/**
> + * Handle guest's kick.
> + *
> + * @n guest kick event notifier, the one that guest set to notify svq.
> + */
> +static void vhost_handle_guest_kick_notifier(EventNotifier *n)
> +{
> + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> + svq_kick);
> + vhost_handle_guest_kick(svq);
> +}
> +
> +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
> +{
> + if (svq->last_used_idx != svq->shadow_used_idx) {
> + return true;
> + }
> +
> + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
> +
> + return svq->last_used_idx != svq->shadow_used_idx;
> +}
> +
> +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> +{
> + vring_desc_t *descs = svq->vring.desc;
> + const vring_used_t *used = svq->vring.used;
> + vring_used_elem_t used_elem;
> + uint16_t last_used;
> +
> + if (!vhost_svq_more_used(svq)) {
> + return NULL;
> + }
> +
> + /* Only get used array entries after they have been exposed by dev */
> + smp_rmb();
> + last_used = svq->last_used_idx & (svq->vring.num - 1);
> + used_elem.id = le32_to_cpu(used->ring[last_used].id);
> + used_elem.len = le32_to_cpu(used->ring[last_used].len);
> +
> + svq->last_used_idx++;
> + if (unlikely(used_elem.id >= svq->vring.num)) {
> + error_report("Device %s says index %u is used",
svq->vdev->name,
> + used_elem.id);
> + return NULL;
> + }
> +
> + if (unlikely(!svq->ring_id_maps[used_elem.id])) {
> + error_report(
> + "Device %s says index %u is used, but it was not
available",
> + svq->vdev->name, used_elem.id);
> + return NULL;
> + }
> +
> + descs[used_elem.id].next = svq->free_head;
> + svq->free_head = used_elem.id;
> +
> + svq->ring_id_maps[used_elem.id]->len = used_elem.len;
> + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> +}
> +
> +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
> + bool check_for_avail_queue)
> +{
> + VirtQueue *vq = svq->vq;
> +
> + /* Make as many buffers as possible used. */
> + do {
> + unsigned i = 0;
> +
> + vhost_svq_set_notification(svq, false);
> + while (true) {
> + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq);
> + if (!elem) {
> + break;
> + }
> +
> + if (unlikely(i >= svq->vring.num)) {
> + virtio_error(svq->vdev,
> + "More than %u used buffers obtained in a %u
size SVQ",
> + i, svq->vring.num);
> + virtqueue_fill(vq, elem, elem->len, i);
> + virtqueue_flush(vq, i);
Let's simply use virtqueue_push() here?
> + i = 0;
Do we need to bail out here?
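I.e. maybe something like this (rough sketch only; returning seems fine
since the device has already been marked broken via virtio_error()):

    if (unlikely(i >= svq->vring.num)) {
        virtio_error(svq->vdev,
                     "More than %u used buffers obtained in a %u size SVQ",
                     i, svq->vring.num);
        virtqueue_push(vq, elem, elem->len);
        return;
    }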
> + }
> + virtqueue_fill(vq, elem, elem->len, i++);
> + }
> +
> + virtqueue_flush(vq, i);
> + event_notifier_set(&svq->svq_call);
> +
> + if (check_for_avail_queue &&
svq->next_guest_avail_elem) {
> + /*
> + * Avail ring was full when vhost_svq_flush was called, so
it's a
> + * good moment to make more descriptors available if possible
> + */
> + vhost_handle_guest_kick(svq);
Would it be better to have a check similar to the one
vhost_handle_guest_kick() does?

    if (elem->out_num + elem->in_num >
        vhost_svq_available_slots(svq)) {
> + }
> +
> + vhost_svq_set_notification(svq, true);
Is a mb() needed here? Otherwise we may lose a call (if
vhost_svq_more_used() runs before vhost_svq_set_notification()).
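I.e. roughly (sketch of the intended ordering only):

    vhost_svq_set_notification(svq, true);
    /* Make the avail flag visible before re-checking the used idx */
    smp_mb();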
> + } while (vhost_svq_more_used(svq));
> +}
> +
> +/**
> + * Forward used buffers.
> + *
> + * @n hdev call event notifier, the one that device set to notify svq.
> + *
> + * Note that we are not making any buffers available in the loop, there is
no
> + * way that it runs more than virtqueue size times.
> + */
> static void vhost_svq_handle_call(EventNotifier *n)
> {
> VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> hdev_call);
>
> - if (unlikely(!event_notifier_test_and_clear(n))) {
> - return;
> - }
> + /* Clear event notifier */
> + event_notifier_test_and_clear(n);
Is there any reason for removing the above check?
>
> - event_notifier_set(&svq->svq_call);
> + vhost_svq_flush(svq, true);
> }
>
> /**
> @@ -258,13 +551,38 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue
*svq, int svq_kick_fd)
> * need to explicitely check for them.
> */
> event_notifier_init_fd(&svq->svq_kick, svq_kick_fd);
> - event_notifier_set_handler(&svq->svq_kick,
vhost_handle_guest_kick);
> + event_notifier_set_handler(&svq->svq_kick,
> + vhost_handle_guest_kick_notifier);
>
> if (!check_old || event_notifier_test_and_clear(&tmp)) {
> event_notifier_set(&svq->hdev_kick);
> }
> }
>
> +/**
> + * Start shadow virtqueue operation.
> + *
> + * @svq Shadow Virtqueue
> + * @vdev VirtIO device
> + * @vq Virtqueue to shadow
> + */
> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> + VirtQueue *vq)
> +{
> + svq->next_guest_avail_elem = NULL;
> + svq->avail_idx_shadow = 0;
> + svq->shadow_used_idx = 0;
> + svq->last_used_idx = 0;
> + svq->vdev = vdev;
> + svq->vq = vq;
> +
> + memset(svq->vring.avail, 0, sizeof(*svq->vring.avail));
> + memset(svq->vring.used, 0, sizeof(*svq->vring.avail));
> + for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> + svq->vring.desc[i].next = cpu_to_le16(i + 1);
> + }
> +}
> +
> /**
> * Stop shadow virtqueue operation.
> * @svq Shadow Virtqueue
> @@ -272,6 +590,28 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue
*svq, int svq_kick_fd)
> void vhost_svq_stop(VhostShadowVirtqueue *svq)
> {
> event_notifier_set_handler(&svq->svq_kick, NULL);
> + g_autofree VirtQueueElement *next_avail_elem = NULL;
> +
> + if (!svq->vq) {
> + return;
> + }
> +
> + /* Send all pending used descriptors to guest */
> + vhost_svq_flush(svq, false);
> +
> + for (unsigned i = 0; i < svq->vring.num; ++i) {
> + g_autofree VirtQueueElement *elem = NULL;
> + elem = g_steal_pointer(&svq->ring_id_maps[i]);
> + if (elem) {
> + virtqueue_detach_element(svq->vq, elem, elem->len);
> + }
> + }
> +
> + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
> + if (next_avail_elem) {
> + virtqueue_detach_element(svq->vq, next_avail_elem,
> + next_avail_elem->len);
> + }
> }
>
> /**
> @@ -316,7 +656,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> memset(svq->vring.desc, 0, driver_size);
> svq->vring.used = qemu_memalign(qemu_real_host_page_size,
device_size);
> memset(svq->vring.used, 0, device_size);
> -
> + svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
> event_notifier_set_handler(&svq->hdev_call,
vhost_svq_handle_call);
> return g_steal_pointer(&svq);
>
> @@ -335,6 +675,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
> event_notifier_cleanup(&vq->hdev_kick);
> event_notifier_set_handler(&vq->hdev_call, NULL);
> event_notifier_cleanup(&vq->hdev_call);
> + g_free(vq->ring_id_maps);
> qemu_vfree(vq->vring.desc);
> qemu_vfree(vq->vring.used);
> g_free(vq);
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 53e14bafa0..0e5c00ed7e 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -752,9 +752,9 @@ static int vhost_vdpa_set_vring_call(struct vhost_dev
*dev,
> * Note that this function does not rewind kick file descriptor if cannot
set
> * call one.
> */
> -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> - VhostShadowVirtqueue *svq,
> - unsigned idx)
> +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
> + VhostShadowVirtqueue *svq,
> + unsigned idx)
> {
> struct vhost_vring_file file = {
> .index = dev->vq_index + idx,
> @@ -767,7 +767,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> r = vhost_vdpa_set_vring_dev_kick(dev, &file);
> if (unlikely(r != 0)) {
> error_report("Can't set device kick fd (%d)", -r);
> - return false;
> + return r;
> }
>
> event_notifier = vhost_svq_get_svq_call_notifier(svq);
> @@ -777,6 +777,99 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev
*dev,
> error_report("Can't set device call fd (%d)", -r);
> }
>
> + return r;
> +}
> +
> +/**
> + * Unmap SVQ area in the device
> + */
> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> + hwaddr size)
> +{
> + int r;
> +
> + size = ROUND_UP(size, qemu_real_host_page_size);
> + r = vhost_vdpa_dma_unmap(v, iova, size);
> + return r == 0;
> +}
> +
> +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> + const VhostShadowVirtqueue *svq)
> +{
> + struct vhost_vdpa *v = dev->opaque;
> + struct vhost_vring_addr svq_addr;
> + size_t device_size = vhost_svq_device_area_size(svq);
> + size_t driver_size = vhost_svq_driver_area_size(svq);
> + bool ok;
> +
> + vhost_svq_get_vring_addr(svq, &svq_addr);
> +
> + ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr,
driver_size);
> + if (unlikely(!ok)) {
> + return false;
> + }
> +
> + return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr,
device_size);
> +}
> +
> +/**
> + * Map shadow virtqueue rings in device
> + *
> + * @dev The vhost device
> + * @svq The shadow virtqueue
> + */
> +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
> + const VhostShadowVirtqueue *svq)
> +{
> + struct vhost_vdpa *v = dev->opaque;
> + struct vhost_vring_addr svq_addr;
> + size_t device_size = vhost_svq_device_area_size(svq);
> + size_t driver_size = vhost_svq_driver_area_size(svq);
> + int r;
> +
> + vhost_svq_get_vring_addr(svq, &svq_addr);
> +
> + r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
> + (void *)svq_addr.desc_user_addr, true);
> + if (unlikely(r != 0)) {
> + return false;
> + }
> +
> + r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
> + (void *)svq_addr.used_user_addr, false);
Do we need to unmap the driver area if we fail here?
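E.g. (just a sketch):

    if (unlikely(r != 0)) {
        vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
        return false;
    }
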
Thanks
> + return r == 0;
> +}
> +
> +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> + VhostShadowVirtqueue *svq,
> + unsigned idx)
> +{
> + uint16_t vq_index = dev->vq_index + idx;
> + struct vhost_vring_state s = {
> + .index = vq_index,
> + };
> + int r;
> + bool ok;
> +
> + r = vhost_vdpa_set_dev_vring_base(dev, &s);
> + if (unlikely(r)) {
> + error_report("Can't set vring base (%d)", r);
> + return false;
> + }
> +
> + s.num = vhost_svq_get_num(svq);
> + r = vhost_vdpa_set_dev_vring_num(dev, &s);
> + if (unlikely(r)) {
> + error_report("Can't set vring num (%d)", r);
> + return false;
> + }
> +
> + ok = vhost_vdpa_svq_map_rings(dev, svq);
> + if (unlikely(!ok)) {
> + return false;
> + }
> +
> + r = vhost_vdpa_svq_set_fds(dev, svq, idx);
> return r == 0;
> }
>
> @@ -788,14 +881,24 @@ static int vhost_vdpa_dev_start(struct vhost_dev
*dev, bool started)
> if (started) {
> vhost_vdpa_host_notifiers_init(dev);
> for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> + VirtQueue *vq = virtio_get_queue(dev->vdev,
dev->vq_index + i);
> VhostShadowVirtqueue *svq =
g_ptr_array_index(v->shadow_vqs, i);
> bool ok = vhost_vdpa_svq_setup(dev, svq, i);
> if (unlikely(!ok)) {
> return -1;
> }
> + vhost_svq_start(svq, dev->vdev, vq);
> }
> vhost_vdpa_set_vring_ready(dev);
> } else {
> + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> + VhostShadowVirtqueue *svq =
g_ptr_array_index(v->shadow_vqs,
> + i);
> + bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
> + if (unlikely(!ok)) {
> + return -1;
> + }
> + }
> vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
> }
>