Jason Wang
2022-Mar-03 07:33 UTC
[PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ
On 2022/3/1 4:50 PM, Eugenio Perez Martin wrote:
> On Mon, Feb 28, 2022 at 8:37 AM Jason Wang <jasowang at redhat.com> wrote:
>>
>> On 2022/2/27 9:41 PM, Eugenio Pérez wrote:
>>> Use translations added in VhostIOVATree in SVQ.
>>>
>>> Only introduce usage here, not allocation and deallocation. As with
>>> previous patches, we use the dead code paths of shadow_vqs_enabled to
>>> avoid committing too many changes at once. These are impossible to take
>>> at the moment.
>>>
>>> Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
>>> ---
>>>   hw/virtio/vhost-shadow-virtqueue.h |   6 +-
>>>   include/hw/virtio/vhost-vdpa.h     |   3 +
>>>   hw/virtio/vhost-shadow-virtqueue.c |  76 ++++++++++++++++-
>>>   hw/virtio/vhost-vdpa.c             | 128 ++++++++++++++++++++++++-----
>>>   4 files changed, 187 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>> index 04c67685fd..b2f722d101 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>> @@ -13,6 +13,7 @@
>>>   #include "qemu/event_notifier.h"
>>>   #include "hw/virtio/virtio.h"
>>>   #include "standard-headers/linux/vhost_types.h"
>>> +#include "hw/virtio/vhost-iova-tree.h"
>>>
>>>   /* Shadow virtqueue to relay notifications */
>>>   typedef struct VhostShadowVirtqueue {
>>> @@ -43,6 +44,9 @@ typedef struct VhostShadowVirtqueue {
>>>       /* Virtio device */
>>>       VirtIODevice *vdev;
>>>
>>> +    /* IOVA mapping */
>>> +    VhostIOVATree *iova_tree;
>>> +
>>>       /* Map for use the guest's descriptors */
>>>       VirtQueueElement **ring_id_maps;
>>>
>>> @@ -78,7 +82,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>>>                        VirtQueue *vq);
>>>   void vhost_svq_stop(VhostShadowVirtqueue *svq);
>>>
>>> -VhostShadowVirtqueue *vhost_svq_new(void);
>>> +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
>>>
>>>   void vhost_svq_free(gpointer vq);
>>>   G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
>>> index 009a9f3b6b..ee8e939ad0 100644
>>> --- a/include/hw/virtio/vhost-vdpa.h
>>> +++ b/include/hw/virtio/vhost-vdpa.h
>>> @@ -14,6 +14,7 @@
>>>
>>>   #include <gmodule.h>
>>>
>>> +#include "hw/virtio/vhost-iova-tree.h"
>>>   #include "hw/virtio/virtio.h"
>>>   #include "standard-headers/linux/vhost_types.h"
>>>
>>> @@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
>>>       MemoryListener listener;
>>>       struct vhost_vdpa_iova_range iova_range;
>>>       bool shadow_vqs_enabled;
>>> +    /* IOVA mapping used by the Shadow Virtqueue */
>>> +    VhostIOVATree *iova_tree;
>>>       GPtrArray *shadow_vqs;
>>>       struct vhost_dev *dev;
>>>       VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>> index a38d313755..7e073773d1 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>> @@ -11,6 +11,7 @@
>>>   #include "hw/virtio/vhost-shadow-virtqueue.h"
>>>
>>>   #include "qemu/error-report.h"
>>> +#include "qemu/log.h"
>>>   #include "qemu/main-loop.h"
>>>   #include "qemu/log.h"
>>>   #include "linux-headers/linux/vhost.h"
>>> @@ -84,7 +85,58 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
>>>       }
>>>   }
>>>
>>> +/**
>>> + * Translate addresses between the qemu's virtual address and the SVQ IOVA
>>> + *
>>> + * @svq    Shadow VirtQueue
>>> + * @vaddr  Translated IOVA addresses
>>> + * @iovec  Source qemu's VA addresses
>>> + * @num    Length of iovec and minimum length of vaddr
>>> + */
>>> +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
>>> +                                     void **addrs, const struct iovec *iovec,
>>> +                                     size_t num)
>>> +{
>>> +    if (num == 0) {
>>> +        return true;
>>> +    }
>>> +
>>> +    for (size_t i = 0; i < num; ++i) {
>>> +        DMAMap needle = {
>>> +            .translated_addr = (hwaddr)iovec[i].iov_base,
>>> +            .size = iovec[i].iov_len,
>>> +        };
>>> +        size_t off;
>>> +
>>> +        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
>>> +        /*
>>> +         * Map cannot be NULL since iova map contains all guest space and
>>> +         * qemu already has a physical address mapped
>>> +         */
>>> +        if (unlikely(!map)) {
>>> +            qemu_log_mask(LOG_GUEST_ERROR,
>>> +                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
>>> +                          needle.translated_addr);
>>> +            return false;
>>> +        }
>>> +
>>> +        off = needle.translated_addr - map->translated_addr;
>>> +        addrs[i] = (void *)(map->iova + off);
>>> +
>>> +        if (unlikely(int128_gt(int128_add(needle.translated_addr,
>>> +                                          iovec[i].iov_len),
>>> +                               map->translated_addr + map->size))) {
>>> +            qemu_log_mask(LOG_GUEST_ERROR,
>>> +                          "Guest buffer expands over iova range");
>>> +            return false;
>>> +        }
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> +
>>>   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>>> +                                    void * const *vaddr_sg,
>>
>> Nit: it looks to me we are not passing vaddr but iova here, so it might
>> be better to use "sg"?
>>
> Sure, this is a leftover of the pre-IOVA translations. I agree it's
> better to write it as you say.
>
>>>                                       const struct iovec *iovec,
>>>                                       size_t num, bool more_descs, bool write)
>>>   {
>>> @@ -103,7 +155,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>>>           } else {
>>>               descs[i].flags = flags;
>>>           }
>>> -        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
>>> +        descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
>>>           descs[i].len = cpu_to_le32(iovec[n].iov_len);
>>>
>>>           last = i;
>>> @@ -119,6 +171,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>>>   {
>>>       unsigned avail_idx;
>>>       vring_avail_t *avail = svq->vring.avail;
>>> +    bool ok;
>>> +    g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num));
>>>
>>>       *head = svq->free_head;
>>>
>>> @@ -129,9 +183,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>>>           return false;
>>>       }
>>>
>>> -    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
>>> +    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
>>> +    if (unlikely(!ok)) {
>>> +        return false;
>>> +    }
>>> +    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
>>>                               elem->in_num > 0, false);
>>> +
>>> +
>>> +    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
>>> +    if (unlikely(!ok)) {
>>> +        return false;
>>> +    }
>>> +
>>> +    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
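[The loop in vhost_svq_translate_addr() above is the heart of the patch:
each descriptor's QEMU VA is looked up in the IOVA tree and rewritten as an
SVQ IOVA before it reaches the device. A minimal, self-contained model of
that step follows; names are illustrative only, and the real code uses
VhostIOVATree's interval tree plus Int128 arithmetic to avoid the overflow
that this sketch ignores (it assumes len >= 1).]

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One mapping, using the patch's inclusive-size convention:
     * the range covers [translated_addr, translated_addr + size]. */
    typedef struct {
        uint64_t iova;            /* start of the range in SVQ IOVA space */
        uint64_t translated_addr; /* start of the range in QEMU VA space */
        uint64_t size;            /* inclusive: last valid offset */
    } Map;

    /* VA -> IOVA, mirroring vhost_svq_translate_addr(): find the mapping
     * containing the buffer, then apply the same offset on the IOVA side. */
    static bool translate(const Map *maps, size_t n_maps,
                          uint64_t va, uint64_t len, uint64_t *iova)
    {
        for (size_t i = 0; i < n_maps; i++) {
            const Map *m = &maps[i];
            if (va < m->translated_addr ||
                va > m->translated_addr + m->size) {
                continue;        /* buffer does not start in this mapping */
            }
            if (va + len - 1 > m->translated_addr + m->size) {
                return false;    /* "Guest buffer expands over iova range" */
            }
            *iova = m->iova + (va - m->translated_addr);
            return true;
        }
        return false;            /* "Invalid address ... given by guest" */
    }

    int main(void)
    {
        /* 1 MiB of guest memory at VA 0x7f0000000000, IOVA 0x1000 */
        Map maps[] = {{ 0x1000, 0x7f0000000000ULL, 0x100000 - 1 }};
        uint64_t iova;

        if (translate(maps, 1, 0x7f0000000800ULL, 512, &iova)) {
            printf("iova = 0x%" PRIx64 "\n", iova);   /* prints 0x1800 */
        }
        return 0;
    }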
>>>
>>>       /*
>>>        * Put the entry in the available array (but don't update avail->idx until
>>> @@ -514,11 +579,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>    * Creates vhost shadow virtqueue, and instructs the vhost device to use the
>>>    * shadow methods and file descriptors.
>>>    *
>>> + * @iova_tree  Tree to perform descriptors translations
>>> + *
>>>    * Returns the new virtqueue or NULL.
>>>    *
>>>    * In case of error, reason is reported through error_report.
>>>    */
>>> -VhostShadowVirtqueue *vhost_svq_new(void)
>>> +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
>>>   {
>>>       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
>>>       int r;
>>> @@ -539,6 +606,7 @@ VhostShadowVirtqueue *vhost_svq_new(void)
>>>
>>>       event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
>>>       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
>>> +    svq->iova_tree = iova_tree;
>>>       return g_steal_pointer(&svq);
>>>
>>>   err_init_hdev_call:
>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
>>> index 435b9c2e9e..56f9f125cd 100644
>>> --- a/hw/virtio/vhost-vdpa.c
>>> +++ b/hw/virtio/vhost-vdpa.c
>>> @@ -209,6 +209,21 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>>>                                            vaddr, section->readonly);
>>>
>>>       llsize = int128_sub(llend, int128_make64(iova));
>>> +    if (v->shadow_vqs_enabled) {
>>> +        DMAMap mem_region = {
>>> +            .translated_addr = (hwaddr)vaddr,
>>> +            .size = int128_get64(llsize) - 1,
>>> +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
>>> +        };
>>> +
>>> +        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
>>> +        if (unlikely(r != IOVA_OK)) {
>>> +            error_report("Can't allocate a mapping (%d)", r);
>>> +            goto fail;
>>> +        }
>>> +
>>> +        iova = mem_region.iova;
>>> +    }
>>>
>>>       vhost_vdpa_iotlb_batch_begin_once(v);
>>>       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
>>> @@ -261,6 +276,20 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
>>>
>>>       llsize = int128_sub(llend, int128_make64(iova));
>>>
>>> +    if (v->shadow_vqs_enabled) {
>>> +        const DMAMap *result;
>>> +        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
>>> +            section->offset_within_region +
>>> +            (iova - section->offset_within_address_space);
>>> +        DMAMap mem_region = {
>>> +            .translated_addr = (hwaddr)vaddr,
>>> +            .size = int128_get64(llsize) - 1,
>>> +        };
>>> +
>>> +        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
>>> +        iova = result->iova;
>>> +        vhost_iova_tree_remove(v->iova_tree, &mem_region);
>>> +    }
>>>       vhost_vdpa_iotlb_batch_begin_once(v);
>>>       ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
>>>       if (ret) {
>>> @@ -383,7 +412,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
>>>
>>>       shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
>>>       for (unsigned n = 0; n < hdev->nvqs; ++n) {
>>> -        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
>>> +        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
>>>
>>>           if (unlikely(!svq)) {
>>>               error_setg(errp, "Cannot create svq %u", n);
>>> @@ -834,37 +863,78 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
>>>   /**
>>>    * Unmap a SVQ area in the device
>>>    */
>>> -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
>>> -                                      hwaddr size)
>>> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
>>> +                                      const DMAMap *needle)
>>>   {
>>> +    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
>>> +    hwaddr size;
>>>       int r;
>>>
>>> -    size = ROUND_UP(size, qemu_real_host_page_size);
>>> -    r = vhost_vdpa_dma_unmap(v, iova, size);
>>> +    if (unlikely(!result)) {
>>> +        error_report("Unable to find SVQ address to unmap");
>>> +        return false;
>>> +    }
>>> +
>>> +    size = ROUND_UP(result->size, qemu_real_host_page_size);
>>> +    r = vhost_vdpa_dma_unmap(v, result->iova, size);
>>>       return r == 0;
>>>   }
>>>
>>>   static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
>>>                                          const VhostShadowVirtqueue *svq)
>>>   {
>>> +    DMAMap needle;
>>>       struct vhost_vdpa *v = dev->opaque;
>>>       struct vhost_vring_addr svq_addr;
>>> -    size_t device_size = vhost_svq_device_area_size(svq);
>>> -    size_t driver_size = vhost_svq_driver_area_size(svq);
>>>       bool ok;
>>>
>>>       vhost_svq_get_vring_addr(svq, &svq_addr);
>>>
>>> -    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
>>> +    needle = (DMAMap) {
>>> +        .translated_addr = svq_addr.desc_user_addr,
>>> +    };
>>
>> Let's simply initialize the member to zero at the start of this function;
>> then we can use needle->translated_addr = XXX here.
>>
> Sure
>
>>> +    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
>>>       if (unlikely(!ok)) {
>>>           return false;
>>>       }
>>>
>>> -    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
>>> +    needle = (DMAMap) {
>>> +        .translated_addr = svq_addr.used_user_addr,
>>> +    };
>>> +    return vhost_vdpa_svq_unmap_ring(v, &needle);
>>> +}
>>> +
>>> +/**
>>> + * Map the SVQ area in the device
>>> + *
>>> + * @v       Vhost-vdpa device
>>> + * @needle  The area to search iova
>>> + * @errorp  Error pointer
>>> + */
>>> +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
>>> +                                    Error **errp)
>>> +{
>>> +    int r;
>>> +
>>> +    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
>>> +    if (unlikely(r != IOVA_OK)) {
>>> +        error_setg(errp, "Cannot allocate iova (%d)", r);
>>> +        return false;
>>> +    }
>>> +
>>> +    r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
>>> +                           (void *)needle->translated_addr,
>>> +                           !(needle->perm & IOMMU_ACCESS_FLAG(0, 1)));
>>
>> Let's simply use needle->perm == IOMMU_RO here?
>>
> The motivation to do it this way is to be more resilient to the future.
> For example, if a new flag is added.
>
> But I'm totally ok with comparing with IOMMU_RO; I see that scenario
> as unlikely at the moment.
>
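[An aside on the IOMMU_RO exchange just above: the two checks agree for
the permissions SVQ actually passes today and differ only for IOMMU_NONE.
A compilable sketch, borrowing the flag encoding from QEMU's
include/exec/memory.h; the helper names are made up.]

    #include <assert.h>

    /* QEMU's encoding (include/exec/memory.h): */
    enum { IOMMU_NONE = 0, IOMMU_RO = 1, IOMMU_WO = 2, IOMMU_RW = 3 };
    #define IOMMU_ACCESS_FLAG(r, w) \
        (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0))

    /* The patch's form: "readonly unless the write bit is set", which
     * keeps working if more flag bits appear later. */
    static int readonly_mask_form(unsigned perm)
    {
        return !(perm & IOMMU_ACCESS_FLAG(0, 1));
    }

    /* The reviewer's suggested form: exact equality with IOMMU_RO. */
    static int readonly_equality_form(unsigned perm)
    {
        return perm == IOMMU_RO;
    }

    int main(void)
    {
        /* The two agree on the values SVQ passes today... */
        assert(readonly_mask_form(IOMMU_RO) == readonly_equality_form(IOMMU_RO));
        assert(readonly_mask_form(IOMMU_RW) == readonly_equality_form(IOMMU_RW));
        /* ...and differ only for IOMMU_NONE, which just the mask form
         * treats as "not writable": */
        assert(readonly_mask_form(IOMMU_NONE) &&
               !readonly_equality_form(IOMMU_NONE));
        return 0;
    }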
>>> +    if (unlikely(r != 0)) {
>>> +        error_setg_errno(errp, -r, "Cannot map region to device");
>>> +        vhost_iova_tree_remove(v->iova_tree, needle);
>>> +    }
>>> +
>>> +    return r == 0;
>>>   }
>>>
>>>   /**
>>> -  * Map shadow virtqueue rings in device
>>> +  * Map the shadow virtqueue rings in the device
>>>    *
>>>    * @dev   The vhost device
>>>    * @svq   The shadow virtqueue
>>> @@ -876,28 +946,44 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
>>>                                        struct vhost_vring_addr *addr,
>>>                                        Error **errp)
>>>   {
>>> +    DMAMap device_region, driver_region;
>>> +    struct vhost_vring_addr svq_addr;
>>>       struct vhost_vdpa *v = dev->opaque;
>>>       size_t device_size = vhost_svq_device_area_size(svq);
>>>       size_t driver_size = vhost_svq_driver_area_size(svq);
>>> -    int r;
>>> +    size_t avail_offset;
>>> +    bool ok;
>>>
>>>       ERRP_GUARD();
>>> -    vhost_svq_get_vring_addr(svq, addr);
>>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
>>>
>>> -    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
>>> -                           (void *)addr->desc_user_addr, true);
>>> -    if (unlikely(r != 0)) {
>>> -        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
>>> +    driver_region = (DMAMap) {
>>> +        .translated_addr = svq_addr.desc_user_addr,
>>> +        .size = driver_size - 1,
>>
>> Any reason for the "-1" here? I see several places do things like that,
>> it's probably a hint of a wrong API somewhere.
>>
> The "problem" is the API mismatch between _end and _last, i.e. whether
> to include the last member in the size or not.
>
> IOVA tree needs to use _end so we can allocate the last page in case
> of an available range ending in (uint64_t)-1 [1].
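[To make the quoted _end/_last point concrete: with an exclusive "end"
bound, a range that finishes at the top of the 64-bit space cannot be
represented because the bound wraps, while the inclusive "last" form that
DMAMap's size encodes can represent it. A standalone illustration in plain
C, not QEMU code:]

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t start = UINT64_MAX - 0xfff;   /* last 4 KiB page */
        uint64_t len   = 0x1000;

        uint64_t end  = start + len;        /* exclusive bound: wraps to 0 */
        uint64_t last = start + (len - 1);  /* inclusive bound: UINT64_MAX */

        printf("end  = 0x%" PRIx64 " (wrapped, unusable as a bound)\n", end);
        printf("last = 0x%" PRIx64 "\n", last);
        return 0;
    }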
> But if we change vhost_svq_{device,driver}_area_size to make it
> inclusive,

These functions look sane since they don't return a range. It's up to
the caller to decide how to use the size.

> we need to use "+1" in calls like qemu_memalign and memset at
> vhost_svq_start. Probably in more places too.

I'm not sure I get it here. Maybe you can show which code may suffer
if we don't decrease it by one here.

But the current code may end up passing qemu_real_host_page_size - 1
to vhost-vDPA, which seems wrong. E.g. vhost_svq_device_area_size()
returns qemu_real_host_page_size, but it is decreased by 1 here for
size, and then we pass size to vhost_vdpa_dma_map().

Thanks

> QEMU's emulated Intel iommu code solves it using the address mask as
> the size, something that does not fit 100% with vhost devices since
> they can allocate an arbitrary address of arbitrary size when using
> vIOMMU. It's not a problem for vhost-vdpa at this moment since we make
> sure we expose unaligned and whole pages with vrings, but I feel it
> would only move the problem somewhere else.
>
> Thanks!
>
> [1] There are alternatives: to use Int128_t, etc. But I think it's
> better not to change that in this patch series.
>
>> Thanks
>>
>>
>>> +        .perm = IOMMU_RO,
>>> +    };
>>> +    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
>>> +    if (unlikely(!ok)) {
>>> +        error_prepend(errp, "Cannot create vq driver region: ");
>>>           return false;
>>>       }
>>> +    addr->desc_user_addr = driver_region.iova;
>>> +    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
>>> +    addr->avail_user_addr = driver_region.iova + avail_offset;
>>>
>>> -    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
>>> -                           (void *)addr->used_user_addr, false);
>>> -    if (unlikely(r != 0)) {
>>> -        error_setg_errno(errp, -r, "Cannot create vq device region: ");
>>> +    device_region = (DMAMap) {
>>> +        .translated_addr = svq_addr.used_user_addr,
>>> +        .size = device_size - 1,
>>> +        .perm = IOMMU_RW,
>>> +    };
>>> +    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
>>> +    if (unlikely(!ok)) {
>>> +        error_prepend(errp, "Cannot create vq device region: ");
>>> +        vhost_vdpa_svq_unmap_ring(v, &driver_region);
>>>       }
>>> +    addr->used_user_addr = device_region.iova;
>>>
>>> -    return r == 0;
>>> +    return ok;
>>>   }
>>>
>>>   static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
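[A closing note on the size-mismatch question raised above: if
vhost_vdpa_dma_map() takes an exclusive byte count, forwarding the tree's
inclusive size unchanged maps one byte short. A toy sketch of that
arithmetic; the names and the 4 KiB page size are illustrative assumptions,
not QEMU API.]

    #include <assert.h>
    #include <stdint.h>

    #define HOST_PAGE_SIZE 4096u   /* stand-in for qemu_real_host_page_size */

    int main(void)
    {
        /* vhost_svq_device_area_size() returns a plain byte count: */
        uint64_t device_size = HOST_PAGE_SIZE;

        /* The patch stores an inclusive size in the DMAMap needle: */
        uint64_t needle_size = device_size - 1;

        /* vhost_vdpa_svq_map_ring() then forwards needle->size to the
         * device unchanged, i.e. 4095 bytes instead of the full 4096: */
        assert(needle_size == HOST_PAGE_SIZE - 1);

        /* Converting back at the boundary ("+ 1") restores the length: */
        assert(needle_size + 1 == device_size);
        return 0;
    }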