On Wed, Feb 23, 2022 at 3:01 AM Eugenio Perez Martin
<eperezma at redhat.com> wrote:
>
> On Tue, Feb 8, 2022 at 9:11 AM Jason Wang <jasowang at redhat.com> wrote:
> >
> >
> > On 2022/2/2 1:08 PM, Eugenio Perez Martin wrote:
> > > On Sun, Jan 30, 2022 at 5:43 AM Jason Wang <jasowang at redhat.com> wrote:
> > >>
> > >> On 2022/1/22 4:27 PM, Eugenio Pérez wrote:
> > >>> Initial version of shadow virtqueue that actually forwards buffers.
> > >>> There is no iommu support at the moment, and that will be addressed in
> > >>> future patches of this series. Since all vhost-vdpa devices use forced
> > >>> IOMMU, this means that SVQ is not usable at this point of the series on
> > >>> any device.
> > >>>
> > >>> For simplicity it only supports modern devices, which expect the vring
> > >>> in little endian, with a split ring and no event idx or indirect
> > >>> descriptors. Support for them will not be added in this series.
> > >>>
> > >>> It reuses the VirtQueue code for the device part. The driver part is
> > >>> based on Linux's virtio_ring driver, but with stripped functionality
> > >>> and optimizations so it's easier to review.
> > >>>
> > >>> However, forwarding buffers has some particular pieces: one of the most
> > >>> unexpected ones is that a guest's buffer can span more than one
> > >>> descriptor in SVQ. While this is handled gracefully by qemu's emulated
> > >>> virtio devices, it may cause an unexpected SVQ queue full. This patch
> > >>> also solves it by checking for this condition at both guest's kicks and
> > >>> device's calls. The code may be more elegant in the future if SVQ code
> > >>> runs in its own iocontext.
> > >>>
> > >>> Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
> > >>> ---
> > >>> hw/virtio/vhost-shadow-virtqueue.h | 2 +
> > >>> hw/virtio/vhost-shadow-virtqueue.c | 365 ++++++++++++++++++++++++++++-
> > >>> hw/virtio/vhost-vdpa.c | 111 ++++++++-
> > >>> 3 files changed, 462 insertions(+), 16 deletions(-)
> > >>>
> > >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > >>> index 39aef5ffdf..19c934af49 100644
> > >>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> > >>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > >>> @@ -33,6 +33,8 @@ uint16_t vhost_svq_get_num(const VhostShadowVirtqueue *svq);
> > >>> size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
> > >>> size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> > >>>
> > >>> +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > >>> +                     VirtQueue *vq);
> > >>> void vhost_svq_stop(VhostShadowVirtqueue *svq);
> > >>>
> > >>> VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> > >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > >>> index 7c168075d7..a1a404f68f 100644
> > >>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> > >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > >>> @@ -9,6 +9,8 @@
> > >>>
> > >>> #include "qemu/osdep.h"
> > >>> #include "hw/virtio/vhost-shadow-virtqueue.h"
> > >>> +#include "hw/virtio/vhost.h"
> > >>> +#include "hw/virtio/virtio-access.h"
> > >>> #include "standard-headers/linux/vhost_types.h"
> > >>>
> > >>> #include "qemu/error-report.h"
> > >>> @@ -36,6 +38,33 @@ typedef struct VhostShadowVirtqueue {
> > >>>
> > >>> /* Guest's call notifier, where SVQ calls
guest. */
> > >>> EventNotifier svq_call;
> > >>> +
> > >>> + /* Virtio queue shadowing */
> > >>> + VirtQueue *vq;
> > >>> +
> > >>> + /* Virtio device */
> > >>> + VirtIODevice *vdev;
> > >>> +
> > >>> + /* Map for returning guest's descriptors */
> > >>> + VirtQueueElement **ring_id_maps;
> > >>> +
> > >>> + /* Next VirtQueue element that guest made available
*/
> > >>> + VirtQueueElement *next_guest_avail_elem;
> > >>> +
> > >>> + /* Next head to expose to device */
> > >>> + uint16_t avail_idx_shadow;
> > >>> +
> > >>> + /* Next free descriptor */
> > >>> + uint16_t free_head;
> > >>> +
> > >>> + /* Last seen used idx */
> > >>> + uint16_t shadow_used_idx;
> > >>> +
> > >>> + /* Next head to consume from device */
> > >>> + uint16_t last_used_idx;
> > >>> +
> > >>> + /* Cache for the exposed notification flag */
> > >>> + bool notification;
> > >>> } VhostShadowVirtqueue;
> > >>>
> > >>> #define INVALID_SVQ_KICK_FD -1
> > >>> @@ -148,30 +177,294 @@ bool
vhost_svq_ack_guest_features(uint64_t dev_features,
> > >>> return true;
> > >>> }
> > >>>
> > >>> -/* Forward guest notifications */
> > >>> -static void vhost_handle_guest_kick(EventNotifier *n)
> > >>> +/**
> > >>> + * Number of descriptors that SVQ can make available
from the guest.
> > >>> + *
> > >>> + * @svq The svq
> > >>> + */
> > >>> +static uint16_t vhost_svq_available_slots(const
VhostShadowVirtqueue *svq)
> > >>> {
> > >>> - VhostShadowVirtqueue *svq = container_of(n,
VhostShadowVirtqueue,
> > >>> - svq_kick);
> > >>> + return svq->vring.num - (svq->avail_idx_shadow
- svq->shadow_used_idx);
> > >>> +}
> > >>> +
> > >>> +static void
vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > >>> +{
> > >>> + uint16_t notification_flag;
> > >>>
> > >>> - if (unlikely(!event_notifier_test_and_clear(n))) {
> > >>> + if (svq->notification == enable) {
> > >>> + return;
> > >>> + }
> > >>> +
> > >>> + notification_flag =
cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
> > >>> +
> > >>> + svq->notification = enable;
> > >>> + if (enable) {
> > >>> + svq->vring.avail->flags &=
~notification_flag;
> > >>> + } else {
> > >>> + svq->vring.avail->flags |=
notification_flag;
> > >>> + }
> > >>> +}
> > >>> +
> > >>> +static void vhost_vring_write_descs(VhostShadowVirtqueue
*svq,
> > >>> + const struct iovec
*iovec,
> > >>> + size_t num, bool
more_descs, bool write)
> > >>> +{
> > >>> + uint16_t i = svq->free_head, last =
svq->free_head;
> > >>> + unsigned n;
> > >>> + uint16_t flags = write ?
cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> > >>> + vring_desc_t *descs = svq->vring.desc;
> > >>> +
> > >>> + if (num == 0) {
> > >>> + return;
> > >>> + }
> > >>> +
> > >>> + for (n = 0; n < num; n++) {
> > >>> + if (more_descs || (n + 1 < num)) {
> > >>> + descs[i].flags = flags |
cpu_to_le16(VRING_DESC_F_NEXT);
> > >>> + } else {
> > >>> + descs[i].flags = flags;
> > >>> + }
> > >>> + descs[i].addr =
cpu_to_le64((hwaddr)iovec[n].iov_base);
> > >>> + descs[i].len = cpu_to_le32(iovec[n].iov_len);
> > >>> +
> > >>> + last = i;
> > >>> + i = cpu_to_le16(descs[i].next);
> > >>> + }
> > >>> +
> > >>> + svq->free_head = le16_to_cpu(descs[last].next);
> > >>> +}
> > >>> +
> > >>> +static unsigned vhost_svq_add_split(VhostShadowVirtqueue
*svq,
> > >>> + VirtQueueElement
*elem)
> > >>> +{
> > >>> + int head;
> > >>> + unsigned avail_idx;
> > >>> + vring_avail_t *avail = svq->vring.avail;
> > >>> +
> > >>> + head = svq->free_head;
> > >>> +
> > >>> + /* We need some descriptors here */
> > >>> + assert(elem->out_num || elem->in_num);
> > >>
> > >> Looks like this could be triggered by the guest, so we need to fail
> > >> instead of asserting here.
> > >>
> > > My understanding was that virtqueue_pop already sanitized that case,
> > > but I'm not able to find where now. I will recheck and, in case it's
> > > not, I will move to a failure.
> > >
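
If it turns out virtqueue_pop() does not already reject that case, the
failure path could look roughly like this (a sketch only; the bool return
and its handling in the caller are assumptions, not part of the patch):

/*
 * Hypothetical sketch: reject an element with no descriptors instead of
 * asserting, since the condition may be guest-triggerable.
 */
static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
{
    unsigned qemu_head;

    if (unlikely(elem->out_num == 0 && elem->in_num == 0)) {
        /* virtio_error marks the device broken instead of aborting qemu */
        virtio_error(svq->vdev, "Guest element with no descriptors");
        return false;
    }

    qemu_head = vhost_svq_add_split(svq, elem);
    svq->ring_id_maps[qemu_head] = elem;
    return true;
}
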
> > >>> +
> > >>> + vhost_vring_write_descs(svq, elem->out_sg,
elem->out_num,
> > >>> + elem->in_num > 0,
false);
> > >>> + vhost_vring_write_descs(svq, elem->in_sg,
elem->in_num, false, true);
> > >>> +
> > >>> + /*
> > >>> + * Put entry in available array (but don't
update avail->idx until they
> > >>> + * do sync).
> > >>> + */
> > >>> + avail_idx = svq->avail_idx_shadow &
(svq->vring.num - 1);
> > >>> + avail->ring[avail_idx] = cpu_to_le16(head);
> > >>> + svq->avail_idx_shadow++;
> > >>> +
> > >>> + /* Update avail index after the descriptor is written */
> > >>> + smp_wmb();
> > >>> + avail->idx =
cpu_to_le16(svq->avail_idx_shadow);
> > >>> +
> > >>> + return head;
> > >>> +}
> > >>> +
> > >>> +static void vhost_svq_add(VhostShadowVirtqueue *svq,
VirtQueueElement *elem)
> > >>> +{
> > >>> + unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > >>> +
> > >>> + svq->ring_id_maps[qemu_head] = elem;
> > >>> +}
> > >>> +
> > >>> +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> + /* We need to expose available array entries before
checking used flags */
> > >>> + smp_mb();
> > >>> + if (svq->vring.used->flags &
VRING_USED_F_NO_NOTIFY) {
> > >>> return;
> > >>> }
> > >>>
> > >>> event_notifier_set(&svq->hdev_kick);
> > >>> }
> > >>>
> > >>> -/* Forward vhost notifications */
> > >>> +/**
> > >>> + * Forward available buffers.
> > >>> + *
> > >>> + * @svq Shadow VirtQueue
> > >>> + *
> > >>> + * Note that this function does not guarantee that all
guest's available
> > >>> + * buffers are available to the device in SVQ avail
ring. The guest may have
> > >>> + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in qemu
> > >>> + * vaddr.
> > >>> + *
> > >>> + * If that happens, guest's kick notifications will
be disabled until device
> > >>> + * makes some buffers used.
> > >>> + */
> > >>> +static void vhost_handle_guest_kick(VhostShadowVirtqueue
*svq)
> > >>> +{
> > >>> + /* Clear event notifier */
> > >>> +
event_notifier_test_and_clear(&svq->svq_kick);
> > >>> +
> > >>> + /* Make available as many buffers as possible */
> > >>> + do {
> > >>> + if (virtio_queue_get_notification(svq->vq)) {
> > >>> + virtio_queue_set_notification(svq->vq,
false);
> > >>
> > >> This looks like an optimization that should belong to
> > >> virtio_queue_set_notification() itself.
> > >>
> > > Sure, we can move it.
> > >
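
The idea is that the "skip when already in the requested state" check lives
inside virtio_queue_set_notification() itself. As a rough illustration of
that check (shown here as a standalone wrapper, an assumption rather than
the real virtio.c change):

/*
 * Sketch only: skip the vring update when the notification state is
 * already the requested one, so callers can call this unconditionally.
 */
static void svq_vq_notification_set(VirtQueue *vq, bool enable)
{
    if (virtio_queue_get_notification(vq) == (int)enable) {
        return;
    }
    virtio_queue_set_notification(vq, enable);
}
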
> > >>> + }
> > >>> +
> > >>> + while (true) {
> > >>> + VirtQueueElement *elem;
> > >>> +
> > >>> + if (svq->next_guest_avail_elem) {
> > >>> + elem =
g_steal_pointer(&svq->next_guest_avail_elem);
> > >>> + } else {
> > >>> + elem = virtqueue_pop(svq->vq,
sizeof(*elem));
> > >>> + }
> > >>> +
> > >>> + if (!elem) {
> > >>> + break;
> > >>> + }
> > >>> +
> > >>> + if (elem->out_num + elem->in_num >
> > >>> + vhost_svq_available_slots(svq)) {
> > >>> + /*
> > >>> + * This condition is possible since a
contiguous buffer in GPA
> > >>> + * does not imply a contiguous buffer in
qemu's VA
> > >>> + * scatter-gather segments. If that happens, the buffer exposed
> > >>> + * to the device needs to be a chain of
descriptors at this
> > >>> + * moment.
> > >>> + *
> > >>> + * SVQ cannot hold more available
buffers if we are here:
> > >>> + * queue the current guest descriptor
and ignore further kicks
> > >>> + * until some elements are used.
> > >>> + */
> > >>> + svq->next_guest_avail_elem = elem;
> > >>> + return;
> > >>> + }
> > >>> +
> > >>> + vhost_svq_add(svq, elem);
> > >>> + vhost_svq_kick(svq);
> > >>> + }
> > >>> +
> > >>> + virtio_queue_set_notification(svq->vq, true);
> > >>> + } while (!virtio_queue_empty(svq->vq));
> > >>> +}
> > >>> +
> > >>> +/**
> > >>> + * Handle guest's kick.
> > >>> + *
> > >>> + * @n guest kick event notifier, the one that guest set
to notify svq.
> > >>> + */
> > >>> +static void
vhost_handle_guest_kick_notifier(EventNotifier *n)
> > >>> +{
> > >>> + VhostShadowVirtqueue *svq = container_of(n,
VhostShadowVirtqueue,
> > >>> + svq_kick);
> > >>> + vhost_handle_guest_kick(svq);
> > >>> +}
> > >>> +
> > >>> +static bool vhost_svq_more_used(VhostShadowVirtqueue
*svq)
> > >>> +{
> > >>> + if (svq->last_used_idx !=
svq->shadow_used_idx) {
> > >>> + return true;
> > >>> + }
> > >>> +
> > >>> + svq->shadow_used_idx =
cpu_to_le16(svq->vring.used->idx);
> > >>> +
> > >>> + return svq->last_used_idx !=
svq->shadow_used_idx;
> > >>> +}
> > >>> +
> > >>> +static VirtQueueElement
*vhost_svq_get_buf(VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> + vring_desc_t *descs = svq->vring.desc;
> > >>> + const vring_used_t *used = svq->vring.used;
> > >>> + vring_used_elem_t used_elem;
> > >>> + uint16_t last_used;
> > >>> +
> > >>> + if (!vhost_svq_more_used(svq)) {
> > >>> + return NULL;
> > >>> + }
> > >>> +
> > >>> + /* Only get used array entries after they have been
exposed by dev */
> > >>> + smp_rmb();
> > >>> + last_used = svq->last_used_idx &
(svq->vring.num - 1);
> > >>> + used_elem.id =
le32_to_cpu(used->ring[last_used].id);
> > >>> + used_elem.len =
le32_to_cpu(used->ring[last_used].len);
> > >>> +
> > >>> + svq->last_used_idx++;
> > >>> + if (unlikely(used_elem.id >= svq->vring.num))
{
> > >>> + error_report("Device %s says index %u is
used", svq->vdev->name,
> > >>> + used_elem.id);
> > >>> + return NULL;
> > >>> + }
> > >>> +
> > >>> + if (unlikely(!svq->ring_id_maps[used_elem.id])) {
> > >>> + error_report(
> > >>> + "Device %s says index %u is used, but
it was not available",
> > >>> + svq->vdev->name, used_elem.id);
> > >>> + return NULL;
> > >>> + }
> > >>> +
> > >>> + descs[used_elem.id].next = svq->free_head;
> > >>> + svq->free_head = used_elem.id;
> > >>> +
> > >>> + svq->ring_id_maps[used_elem.id]->len =
used_elem.len;
> > >>> + return
g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
> > >>> +}
> > >>> +
> > >>> +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
> > >>> + bool check_for_avail_queue)
> > >>> +{
> > >>> + VirtQueue *vq = svq->vq;
> > >>> +
> > >>> + /* Make as many buffers as possible used. */
> > >>> + do {
> > >>> + unsigned i = 0;
> > >>> +
> > >>> + vhost_svq_set_notification(svq, false);
> > >>> + while (true) {
> > >>> + g_autofree VirtQueueElement *elem =
vhost_svq_get_buf(svq);
> > >>> + if (!elem) {
> > >>> + break;
> > >>> + }
> > >>> +
> > >>> + if (unlikely(i >= svq->vring.num)) {
> > >>> + virtio_error(svq->vdev,
> > >>> + "More than %u used buffers
obtained in a %u size SVQ",
> > >>> + i, svq->vring.num);
> > >>> + virtqueue_fill(vq, elem, elem->len,
i);
> > >>> + virtqueue_flush(vq, i);
> > >>
> > >> Let's simply use virtqueue_push() here?
> > >>
> > > virtqueue_push supports filling and flushing only one element, instead
> > > of a batch. I'm fine with either, but I think the fewer updates to the
> > > used idx, the better.
> >
> >
> > Fine.
> >
> >
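
For reference, the two call patterns being compared, using QEMU's existing
virtqueue_fill()/virtqueue_flush()/virtqueue_push() (the helpers below are
only illustrative, not code from the patch):

/* Return a batch of filled elements with a single used-idx update. */
static void svq_return_batch(VirtQueue *vq, VirtQueueElement **elems,
                             unsigned n)
{
    for (unsigned i = 0; i < n; i++) {
        virtqueue_fill(vq, elems[i], elems[i]->len, i);
    }
    /* One update of used->idx for the whole batch */
    virtqueue_flush(vq, n);
}

/* Return elements one by one: used->idx is updated n times. */
static void svq_return_one_by_one(VirtQueue *vq, VirtQueueElement **elems,
                                  unsigned n)
{
    for (unsigned i = 0; i < n; i++) {
        /* virtqueue_push() is fill + flush of a single element */
        virtqueue_push(vq, elems[i], elems[i]->len);
    }
}
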
> > >
> > >>> + i = 0;
> > >>
> > >> Do we need to bail out here?
> > >>
> > > Yes, I guess we can simply return.
> > >
> > >>> + }
> > >>> + virtqueue_fill(vq, elem, elem->len, i++);
> > >>> + }
> > >>> +
> > >>> + virtqueue_flush(vq, i);
> > >>> + event_notifier_set(&svq->svq_call);
> > >>> +
> > >>> + if (check_for_avail_queue &&
svq->next_guest_avail_elem) {
> > >>> + /*
> > >>> + * Avail ring was full when vhost_svq_flush
was called, so it's a
> > >>> + * good moment to make more descriptors
available if possible
> > >>> + */
> > >>> + vhost_handle_guest_kick(svq);
> > >>
> > >> Is it better to have a similar check here to the one
> > >> vhost_handle_guest_kick() does?
> > >>
> > >> if (elem->out_num + elem->in_num >
> > >>     vhost_svq_available_slots(svq)) {
> > >>
> > > It will be duplicated when we call vhost_handle_guest_kick, won't it?
> >
> >
> > Right, I mis-read the code.
> >
> >
> > >
> > >>> + }
> > >>> +
> > >>> + vhost_svq_set_notification(svq, true);
> > >>
> > >> Is an mb() needed here? Otherwise we may lose a call here (where
> > >> vhost_svq_more_used() is run before vhost_svq_set_notification()).
> > >>
> > > I'm confused here then; I thought you said this is just a hint, so
> > > there was no need? [1]. I think the memory barrier is needed too.
> >
> >
> > Yes, it's a hint, but:
> >
> > 1) When we disable the notification: since disabling the notification
> > is just a hint, the device can still raise an interrupt, so the ordering
> > is meaningless and a memory barrier is not necessary (the
> > vhost_svq_set_notification(svq, false) case).
> >
> > 2) When we enable the notification: though it's a hint, the device can
> > choose to implement it by enabling the interrupt. In this case, the
> > notification enable should be done before checking the used ring.
> > Otherwise, the check for more used buffers might be done before enabling
> > the notification:
> >
> > 1) driver checks for more used buffers
> > 2) device adds more used buffers but sends no notification
> > 3) driver enables the notification, and we lose a notification here
> >
>
> That was my understanding too. So the right way is to only add the
> memory barrier in case 2), when setting the flag, right?
Yes.
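
For case 2), the resulting shape could be something like the sketch below
(the function is the one from this patch; exactly where the barrier ends up
in the next revision is an assumption):

static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
{
    uint16_t notification_flag = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);

    if (svq->notification == enable) {
        return;
    }

    svq->notification = enable;
    if (enable) {
        svq->vring.avail->flags &= ~notification_flag;
        /*
         * Make the re-enable visible before the caller checks the used
         * ring again, so a buffer used in between is not missed (case 2).
         */
        smp_mb();
    } else {
        /* Just a hint: no ordering needed when disabling */
        svq->vring.avail->flags |= notification_flag;
    }
}
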
>
> >
> > >>> + } while (vhost_svq_more_used(svq));
> > >>> +}
> > >>> +
> > >>> +/**
> > >>> + * Forward used buffers.
> > >>> + *
> > >>> + * @n hdev call event notifier, the one that device set
to notify svq.
> > >>> + *
> > >>> + * Note that we are not making any buffers available in
the loop, there is no
> > >>> + * way that it runs more than virtqueue size times.
> > >>> + */
> > >>> static void vhost_svq_handle_call(EventNotifier *n)
> > >>> {
> > >>> VhostShadowVirtqueue *svq = container_of(n,
VhostShadowVirtqueue,
> > >>>
hdev_call);
> > >>>
> > >>> - if (unlikely(!event_notifier_test_and_clear(n))) {
> > >>> - return;
> > >>> - }
> > >>> + /* Clear event notifier */
> > >>> + event_notifier_test_and_clear(n);
> > >>
> > >> Any reason that we remove the above check?
> > >>
> > > This comes from the previous versions, where this made sure we missed
> > > no used buffers in the process of switching to SVQ mode.
> >
> >
> > I'm not sure I follow. Even for the switching case, isn't it safer to
> > handle the flush unconditionally?
> >
>
> Yes, I also think it's better to forward and kick/call unconditionally.
>
> Thanks!
Ok.
Thanks
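
In that case the call handler can stay essentially as in this patch, i.e.:

static void vhost_svq_handle_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             hdev_call);

    /*
     * Clear the eventfd, but forward used buffers regardless of its state,
     * so nothing is missed around the switch to SVQ mode.
     */
    event_notifier_test_and_clear(n);
    vhost_svq_flush(svq, true);
}
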
>
> > Thanks
> >
> >
> > >
> > > If we enable SVQ from the beginning I think we can rely on getting all
> > > the device's used buffer notifications, so let me think a little bit
> > > and I can move to checking the eventfd.
> > >
> > >>> - event_notifier_set(&svq->svq_call);
> > >>> + vhost_svq_flush(svq, true);
> > >>> }
> > >>>
> > >>> /**
> > >>> @@ -258,13 +551,38 @@ void
vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> > >>> * need to explicitly check for them.
> > >>> */
> > >>> event_notifier_init_fd(&svq->svq_kick,
svq_kick_fd);
> > >>> - event_notifier_set_handler(&svq->svq_kick,
vhost_handle_guest_kick);
> > >>> + event_notifier_set_handler(&svq->svq_kick,
> > >>> +
vhost_handle_guest_kick_notifier);
> > >>>
> > >>> if (!check_old ||
event_notifier_test_and_clear(&tmp)) {
> > >>> event_notifier_set(&svq->hdev_kick);
> > >>> }
> > >>> }
> > >>>
> > >>> +/**
> > >>> + * Start shadow virtqueue operation.
> > >>> + *
> > >>> + * @svq Shadow Virtqueue
> > >>> + * @vdev VirtIO device
> > >>> + * @vq Virtqueue to shadow
> > >>> + */
> > >>> +void vhost_svq_start(VhostShadowVirtqueue *svq,
VirtIODevice *vdev,
> > >>> + VirtQueue *vq)
> > >>> +{
> > >>> + svq->next_guest_avail_elem = NULL;
> > >>> + svq->avail_idx_shadow = 0;
> > >>> + svq->shadow_used_idx = 0;
> > >>> + svq->last_used_idx = 0;
> > >>> + svq->vdev = vdev;
> > >>> + svq->vq = vq;
> > >>> +
> > >>> + memset(svq->vring.avail, 0,
sizeof(*svq->vring.avail));
> > >>> + memset(svq->vring.used, 0,
sizeof(*svq->vring.avail));
> > >>> + for (unsigned i = 0; i < svq->vring.num - 1;
i++) {
> > >>> + svq->vring.desc[i].next = cpu_to_le16(i + 1);
> > >>> + }
> > >>> +}
> > >>> +
> > >>> /**
> > >>> * Stop shadow virtqueue operation.
> > >>> * @svq Shadow Virtqueue
> > >>> @@ -272,6 +590,28 @@ void
vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
> > >>> void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > >>> {
> > >>> event_notifier_set_handler(&svq->svq_kick,
NULL);
> > >>> + g_autofree VirtQueueElement *next_avail_elem = NULL;
> > >>> +
> > >>> + if (!svq->vq) {
> > >>> + return;
> > >>> + }
> > >>> +
> > >>> + /* Send all pending used descriptors to guest */
> > >>> + vhost_svq_flush(svq, false);
> > >>> +
> > >>> + for (unsigned i = 0; i < svq->vring.num; ++i)
{
> > >>> + g_autofree VirtQueueElement *elem = NULL;
> > >>> + elem =
g_steal_pointer(&svq->ring_id_maps[i]);
> > >>> + if (elem) {
> > >>> + virtqueue_detach_element(svq->vq, elem,
elem->len);
> > >>> + }
> > >>> + }
> > >>> +
> > >>> + next_avail_elem =
g_steal_pointer(&svq->next_guest_avail_elem);
> > >>> + if (next_avail_elem) {
> > >>> + virtqueue_detach_element(svq->vq,
next_avail_elem,
> > >>> +
next_avail_elem->len);
> > >>> + }
> > >>> }
> > >>>
> > >>> /**
> > >>> @@ -316,7 +656,7 @@ VhostShadowVirtqueue
*vhost_svq_new(uint16_t qsize)
> > >>> memset(svq->vring.desc, 0, driver_size);
> > >>> svq->vring.used =
qemu_memalign(qemu_real_host_page_size, device_size);
> > >>> memset(svq->vring.used, 0, device_size);
> > >>> -
> > >>> + svq->ring_id_maps = g_new0(VirtQueueElement *,
qsize);
> > >>> event_notifier_set_handler(&svq->hdev_call,
vhost_svq_handle_call);
> > >>> return g_steal_pointer(&svq);
> > >>>
> > >>> @@ -335,6 +675,7 @@ void
vhost_svq_free(VhostShadowVirtqueue *vq)
> > >>> event_notifier_cleanup(&vq->hdev_kick);
> > >>> event_notifier_set_handler(&vq->hdev_call,
NULL);
> > >>> event_notifier_cleanup(&vq->hdev_call);
> > >>> + g_free(vq->ring_id_maps);
> > >>> qemu_vfree(vq->vring.desc);
> > >>> qemu_vfree(vq->vring.used);
> > >>> g_free(vq);
> > >>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > >>> index 53e14bafa0..0e5c00ed7e 100644
> > >>> --- a/hw/virtio/vhost-vdpa.c
> > >>> +++ b/hw/virtio/vhost-vdpa.c
> > >>> @@ -752,9 +752,9 @@ static int
vhost_vdpa_set_vring_call(struct vhost_dev *dev,
> > >>> * Note that this function does not rewind the kick file descriptor
> > >>> * if it cannot set the call one.
> > >>> */
> > >>> -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > >>> - VhostShadowVirtqueue
*svq,
> > >>> - unsigned idx)
> > >>> +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
> > >>> + VhostShadowVirtqueue
*svq,
> > >>> + unsigned idx)
> > >>> {
> > >>> struct vhost_vring_file file = {
> > >>> .index = dev->vq_index + idx,
> > >>> @@ -767,7 +767,7 @@ static bool
vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > >>> r = vhost_vdpa_set_vring_dev_kick(dev, &file);
> > >>> if (unlikely(r != 0)) {
> > >>> error_report("Can't set device kick
fd (%d)", -r);
> > >>> - return false;
> > >>> + return r;
> > >>> }
> > >>>
> > >>> event_notifier =
vhost_svq_get_svq_call_notifier(svq);
> > >>> @@ -777,6 +777,99 @@ static bool
vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > >>> error_report("Can't set device call
fd (%d)", -r);
> > >>> }
> > >>>
> > >>> + return r;
> > >>> +}
> > >>> +
> > >>> +/**
> > >>> + * Unmap SVQ area in the device
> > >>> + */
> > >>> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa
*v, hwaddr iova,
> > >>> + hwaddr size)
> > >>> +{
> > >>> + int r;
> > >>> +
> > >>> + size = ROUND_UP(size, qemu_real_host_page_size);
> > >>> + r = vhost_vdpa_dma_unmap(v, iova, size);
> > >>> + return r == 0;
> > >>> +}
> > >>> +
> > >>> +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev
*dev,
> > >>> + const
VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> + struct vhost_vdpa *v = dev->opaque;
> > >>> + struct vhost_vring_addr svq_addr;
> > >>> + size_t device_size =
vhost_svq_device_area_size(svq);
> > >>> + size_t driver_size =
vhost_svq_driver_area_size(svq);
> > >>> + bool ok;
> > >>> +
> > >>> + vhost_svq_get_vring_addr(svq, &svq_addr);
> > >>> +
> > >>> + ok = vhost_vdpa_svq_unmap_ring(v,
svq_addr.desc_user_addr, driver_size);
> > >>> + if (unlikely(!ok)) {
> > >>> + return false;
> > >>> + }
> > >>> +
> > >>> + return vhost_vdpa_svq_unmap_ring(v,
svq_addr.used_user_addr, device_size);
> > >>> +}
> > >>> +
> > >>> +/**
> > >>> + * Map shadow virtqueue rings in device
> > >>> + *
> > >>> + * @dev The vhost device
> > >>> + * @svq The shadow virtqueue
> > >>> + */
> > >>> +static bool vhost_vdpa_svq_map_rings(struct vhost_dev
*dev,
> > >>> + const
VhostShadowVirtqueue *svq)
> > >>> +{
> > >>> + struct vhost_vdpa *v = dev->opaque;
> > >>> + struct vhost_vring_addr svq_addr;
> > >>> + size_t device_size =
vhost_svq_device_area_size(svq);
> > >>> + size_t driver_size =
vhost_svq_driver_area_size(svq);
> > >>> + int r;
> > >>> +
> > >>> + vhost_svq_get_vring_addr(svq, &svq_addr);
> > >>> +
> > >>> + r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr,
driver_size,
> > >>> + (void
*)svq_addr.desc_user_addr, true);
> > >>> + if (unlikely(r != 0)) {
> > >>> + return false;
> > >>> + }
> > >>> +
> > >>> + r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr,
device_size,
> > >>> + (void
*)svq_addr.used_user_addr, false);
> > >>
> > >> Do we need to unmap the driver area if we fail here?
> > >>
> > > Yes, this used to rely on unmapping them when SVQ is disabled. Now I
> > > think we need to unmap it here as you say.
> > >
> > > Thanks!
> > >
> > > [1] https://lists.linuxfoundation.org/pipermail/virtualization/2021-March/053322.html
> > >
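
A sketch of that error path, reusing the vhost_vdpa_svq_unmap_ring() helper
introduced in this patch (illustrative only, not the next revision):

static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq)
{
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    int r;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
                           (void *)svq_addr.desc_user_addr, true);
    if (unlikely(r != 0)) {
        return false;
    }

    r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
                           (void *)svq_addr.used_user_addr, false);
    if (unlikely(r != 0)) {
        /* Roll back the driver area mapping so no stale entry is left */
        vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
        return false;
    }

    return true;
}
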
> > >> Thanks
> > >>
> > >>
> > >>> + return r == 0;
> > >>> +}
> > >>> +
> > >>> +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > >>> + VhostShadowVirtqueue
*svq,
> > >>> + unsigned idx)
> > >>> +{
> > >>> + uint16_t vq_index = dev->vq_index + idx;
> > >>> + struct vhost_vring_state s = {
> > >>> + .index = vq_index,
> > >>> + };
> > >>> + int r;
> > >>> + bool ok;
> > >>> +
> > >>> + r = vhost_vdpa_set_dev_vring_base(dev, &s);
> > >>> + if (unlikely(r)) {
> > >>> + error_report("Can't set vring base
(%d)", r);
> > >>> + return false;
> > >>> + }
> > >>> +
> > >>> + s.num = vhost_svq_get_num(svq);
> > >>> + r = vhost_vdpa_set_dev_vring_num(dev, &s);
> > >>> + if (unlikely(r)) {
> > >>> + error_report("Can't set vring num
(%d)", r);
> > >>> + return false;
> > >>> + }
> > >>> +
> > >>> + ok = vhost_vdpa_svq_map_rings(dev, svq);
> > >>> + if (unlikely(!ok)) {
> > >>> + return false;
> > >>> + }
> > >>> +
> > >>> + r = vhost_vdpa_svq_set_fds(dev, svq, idx);
> > >>> return r == 0;
> > >>> }
> > >>>
> > >>> @@ -788,14 +881,24 @@ static int
vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> > >>> if (started) {
> > >>> vhost_vdpa_host_notifiers_init(dev);
> > >>> for (unsigned i = 0; i <
v->shadow_vqs->len; ++i) {
> > >>> + VirtQueue *vq =
virtio_get_queue(dev->vdev, dev->vq_index + i);
> > >>> VhostShadowVirtqueue *svq =
g_ptr_array_index(v->shadow_vqs, i);
> > >>> bool ok = vhost_vdpa_svq_setup(dev, svq,
i);
> > >>> if (unlikely(!ok)) {
> > >>> return -1;
> > >>> }
> > >>> + vhost_svq_start(svq, dev->vdev, vq);
> > >>> }
> > >>> vhost_vdpa_set_vring_ready(dev);
> > >>> } else {
> > >>> + for (unsigned i = 0; i <
v->shadow_vqs->len; ++i) {
> > >>> + VhostShadowVirtqueue *svq =
g_ptr_array_index(v->shadow_vqs,
> > >>> +
i);
> > >>> + bool ok = vhost_vdpa_svq_unmap_rings(dev,
svq);
> > >>> + if (unlikely(!ok)) {
> > >>> + return -1;
> > >>> + }
> > >>> + }
> > >>> vhost_vdpa_host_notifiers_uninit(dev,
dev->nvqs);
> > >>> }
> > >>>
> >
>