Jason Wang
2022-Mar-03 07:33 UTC
[PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ
On 2022/3/1 4:50 PM, Eugenio Perez Martin wrote:
> On Mon, Feb 28, 2022 at 8:37 AM Jason Wang <jasowang at redhat.com> wrote:
>>
>> On 2022/2/27 9:41 PM, Eugenio Pérez wrote:
>>> Use translations added in VhostIOVATree in SVQ.
>>>
>>> Only introduce usage here, not allocation and deallocation. As with
>>> previous patches, we use the dead code paths of shadow_vqs_enabled to
>>> avoid committing too many changes at once. These are impossible to take
>>> at the moment.
>>>
>>> Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
>>> ---
>>>   hw/virtio/vhost-shadow-virtqueue.h |   6 +-
>>>   include/hw/virtio/vhost-vdpa.h     |   3 +
>>>   hw/virtio/vhost-shadow-virtqueue.c |  76 ++++++++++++++++-
>>>   hw/virtio/vhost-vdpa.c             | 128 ++++++++++++++++++++++++-----
>>>   4 files changed, 187 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>> index 04c67685fd..b2f722d101 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>> @@ -13,6 +13,7 @@
>>>   #include "qemu/event_notifier.h"
>>>   #include "hw/virtio/virtio.h"
>>>   #include "standard-headers/linux/vhost_types.h"
>>> +#include "hw/virtio/vhost-iova-tree.h"
>>>
>>>   /* Shadow virtqueue to relay notifications */
>>>   typedef struct VhostShadowVirtqueue {
>>> @@ -43,6 +44,9 @@ typedef struct VhostShadowVirtqueue {
>>>       /* Virtio device */
>>>       VirtIODevice *vdev;
>>>
>>> +    /* IOVA mapping */
>>> +    VhostIOVATree *iova_tree;
>>> +
>>>       /* Map for use the guest's descriptors */
>>>       VirtQueueElement **ring_id_maps;
>>>
>>> @@ -78,7 +82,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>>>                        VirtQueue *vq);
>>>   void vhost_svq_stop(VhostShadowVirtqueue *svq);
>>>
>>> -VhostShadowVirtqueue *vhost_svq_new(void);
>>> +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
>>>
>>>   void vhost_svq_free(gpointer vq);
>>>   G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
>>> index 009a9f3b6b..ee8e939ad0 100644
>>> --- a/include/hw/virtio/vhost-vdpa.h
>>> +++ b/include/hw/virtio/vhost-vdpa.h
>>> @@ -14,6 +14,7 @@
>>>
>>>   #include <gmodule.h>
>>>
>>> +#include "hw/virtio/vhost-iova-tree.h"
>>>   #include "hw/virtio/virtio.h"
>>>   #include "standard-headers/linux/vhost_types.h"
>>>
>>> @@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
>>>       MemoryListener listener;
>>>       struct vhost_vdpa_iova_range iova_range;
>>>       bool shadow_vqs_enabled;
>>> +    /* IOVA mapping used by the Shadow Virtqueue */
>>> +    VhostIOVATree *iova_tree;
>>>       GPtrArray *shadow_vqs;
>>>       struct vhost_dev *dev;
>>>       VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>> index a38d313755..7e073773d1 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>> @@ -11,6 +11,7 @@
>>>   #include "hw/virtio/vhost-shadow-virtqueue.h"
>>>
>>>   #include "qemu/error-report.h"
>>> +#include "qemu/log.h"
>>>   #include "qemu/main-loop.h"
>>>   #include "qemu/log.h"
>>>   #include "linux-headers/linux/vhost.h"
>>> @@ -84,7 +85,58 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
>>>       }
>>>   }
>>>
>>> +/**
>>> + * Translate addresses between the qemu's virtual address and the SVQ IOVA
>>> + *
>>> + * @svq    Shadow VirtQueue
>>> + * @vaddr  Translated IOVA addresses
>>> + * @iovec  Source qemu's VA addresses
>>> + * @num    Length of iovec and minimum length of vaddr
>>> + */
>>> +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
>>> +                                     void **addrs, const struct iovec *iovec,
>>> +                                     size_t num)
>>> +{
>>> +    if (num == 0) {
>>> +        return true;
>>> +    }
>>> +
>>> +    for (size_t i = 0; i < num; ++i) {
>>> +        DMAMap needle = {
>>> +            .translated_addr = (hwaddr)iovec[i].iov_base,
>>> +            .size = iovec[i].iov_len,
>>> +        };
>>> +        size_t off;
>>> +
>>> +        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
>>> +        /*
>>> +         * Map cannot be NULL since iova map contains all guest space and
>>> +         * qemu already has a physical address mapped
>>> +         */
>>> +        if (unlikely(!map)) {
>>> +            qemu_log_mask(LOG_GUEST_ERROR,
>>> +                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
>>> +                          needle.translated_addr);
>>> +            return false;
>>> +        }
>>> +
>>> +        off = needle.translated_addr - map->translated_addr;
>>> +        addrs[i] = (void *)(map->iova + off);
>>> +
>>> +        if (unlikely(int128_gt(int128_add(needle.translated_addr,
>>> +                                          iovec[i].iov_len),
>>> +                               map->translated_addr + map->size))) {
>>> +            qemu_log_mask(LOG_GUEST_ERROR,
>>> +                          "Guest buffer expands over iova range");
>>> +            return false;
>>> +        }
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> +
>>>   static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>>> +                                    void * const *vaddr_sg,
>>
>> Nit: it looks to me we are not passing vaddr but iova here, so it might
>> be better to use "sg"?
>>
> Sure, this is a leftover of the pre-IOVA translations. I agree it's
> better to write it as you say.
>
>>>                                       const struct iovec *iovec,
>>>                                       size_t num, bool more_descs, bool write)
>>>   {
>>> @@ -103,7 +155,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>>>           } else {
>>>               descs[i].flags = flags;
>>>           }
>>> -        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
>>> +        descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
>>>           descs[i].len = cpu_to_le32(iovec[n].iov_len);
>>>
>>>           last = i;
>>> @@ -119,6 +171,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>>>   {
>>>       unsigned avail_idx;
>>>       vring_avail_t *avail = svq->vring.avail;
>>> +    bool ok;
>>> +    g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num));
>>>
>>>       *head = svq->free_head;
>>>
>>> @@ -129,9 +183,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>>>           return false;
>>>       }
>>>
>>> -    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
>>> +    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
>>> +    if (unlikely(!ok)) {
>>> +        return false;
>>> +    }
>>> +    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
>>>                               elem->in_num > 0, false);
>>> +
>>> +
>>> +    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
>>> +    if (unlikely(!ok)) {
>>> +        return false;
>>> +    }
>>> +
>>> +    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
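[The loop in vhost_svq_translate_addr() above is the heart of the patch:
each descriptor's QEMU VA is looked up in the IOVA tree and rewritten as an
SVQ IOVA before it reaches the device. A minimal, self-contained model of
that step follows; names are illustrative only, and the real code uses
VhostIOVATree's interval tree plus Int128 arithmetic to avoid the overflow
that this sketch ignores (it assumes len >= 1).]

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One mapping, using the patch's inclusive-size convention:
     * the range covers [translated_addr, translated_addr + size]. */
    typedef struct {
        uint64_t iova;            /* start of the range in SVQ IOVA space */
        uint64_t translated_addr; /* start of the range in QEMU VA space */
        uint64_t size;            /* inclusive: last valid offset */
    } Map;

    /* VA -> IOVA, mirroring vhost_svq_translate_addr(): find the mapping
     * containing the buffer, then apply the same offset on the IOVA side. */
    static bool translate(const Map *maps, size_t n_maps,
                          uint64_t va, uint64_t len, uint64_t *iova)
    {
        for (size_t i = 0; i < n_maps; i++) {
            const Map *m = &maps[i];
            if (va < m->translated_addr ||
                va > m->translated_addr + m->size) {
                continue;        /* buffer does not start in this mapping */
            }
            if (va + len - 1 > m->translated_addr + m->size) {
                return false;    /* "Guest buffer expands over iova range" */
            }
            *iova = m->iova + (va - m->translated_addr);
            return true;
        }
        return false;            /* "Invalid address ... given by guest" */
    }

    int main(void)
    {
        /* 1 MiB of guest memory at VA 0x7f0000000000, IOVA 0x1000 */
        Map maps[] = {{ 0x1000, 0x7f0000000000ULL, 0x100000 - 1 }};
        uint64_t iova;

        if (translate(maps, 1, 0x7f0000000800ULL, 512, &iova)) {
            printf("iova = 0x%" PRIx64 "\n", iova);   /* prints 0x1800 */
        }
        return 0;
    }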
>>>
>>>       /*
>>>        * Put the entry in the available array (but don't update avail->idx until
>>> @@ -514,11 +579,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>    * Creates vhost shadow virtqueue, and instructs the vhost device to use the
>>>    * shadow methods and file descriptors.
>>>    *
>>> + * @iova_tree  Tree to perform descriptors translations
>>> + *
>>>    * Returns the new virtqueue or NULL.
>>>    *
>>>    * In case of error, reason is reported through error_report.
>>>    */
>>> -VhostShadowVirtqueue *vhost_svq_new(void)
>>> +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
>>>   {
>>>       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
>>>       int r;
>>> @@ -539,6 +606,7 @@ VhostShadowVirtqueue *vhost_svq_new(void)
>>>
>>>       event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
>>>       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
>>> +    svq->iova_tree = iova_tree;
>>>       return g_steal_pointer(&svq);
>>>
>>>   err_init_hdev_call:
>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
>>> index 435b9c2e9e..56f9f125cd 100644
>>> --- a/hw/virtio/vhost-vdpa.c
>>> +++ b/hw/virtio/vhost-vdpa.c
>>> @@ -209,6 +209,21 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>>>                                            vaddr, section->readonly);
>>>
>>>       llsize = int128_sub(llend, int128_make64(iova));
>>> +    if (v->shadow_vqs_enabled) {
>>> +        DMAMap mem_region = {
>>> +            .translated_addr = (hwaddr)vaddr,
>>> +            .size = int128_get64(llsize) - 1,
>>> +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
>>> +        };
>>> +
>>> +        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
>>> +        if (unlikely(r != IOVA_OK)) {
>>> +            error_report("Can't allocate a mapping (%d)", r);
>>> +            goto fail;
>>> +        }
>>> +
>>> +        iova = mem_region.iova;
>>> +    }
>>>
>>>       vhost_vdpa_iotlb_batch_begin_once(v);
>>>       ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
>>> @@ -261,6 +276,20 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
>>>
>>>       llsize = int128_sub(llend, int128_make64(iova));
>>>
>>> +    if (v->shadow_vqs_enabled) {
>>> +        const DMAMap *result;
>>> +        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
>>> +            section->offset_within_region +
>>> +            (iova - section->offset_within_address_space);
>>> +        DMAMap mem_region = {
>>> +            .translated_addr = (hwaddr)vaddr,
>>> +            .size = int128_get64(llsize) - 1,
>>> +        };
>>> +
>>> +        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
>>> +        iova = result->iova;
>>> +        vhost_iova_tree_remove(v->iova_tree, &mem_region);
>>> +    }
>>>       vhost_vdpa_iotlb_batch_begin_once(v);
>>>       ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
>>>       if (ret) {
>>> @@ -383,7 +412,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
>>>
>>>       shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
>>>       for (unsigned n = 0; n < hdev->nvqs; ++n) {
>>> -        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
>>> +        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
>>>
>>>           if (unlikely(!svq)) {
>>>               error_setg(errp, "Cannot create svq %u", n);
>>> @@ -834,37 +863,78 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
>>>   /**
>>>    * Unmap a SVQ area in the device
>>>    */
>>> -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
>>> -                                      hwaddr size)
>>> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
>>> +                                      const DMAMap *needle)
>>>   {
>>> +    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
>>> +    hwaddr size;
>>>       int r;
>>>
>>> -    size = ROUND_UP(size, qemu_real_host_page_size);
>>> -    r = vhost_vdpa_dma_unmap(v, iova, size);
>>> +    if (unlikely(!result)) {
>>> +        error_report("Unable to find SVQ address to unmap");
>>> +        return false;
>>> +    }
>>> +
>>> +    size = ROUND_UP(result->size, qemu_real_host_page_size);
>>> +    r = vhost_vdpa_dma_unmap(v, result->iova, size);
>>>       return r == 0;
>>>   }
>>>
>>>   static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
>>>                                          const VhostShadowVirtqueue *svq)
>>>   {
>>> +    DMAMap needle;
>>>       struct vhost_vdpa *v = dev->opaque;
>>>       struct vhost_vring_addr svq_addr;
>>> -    size_t device_size = vhost_svq_device_area_size(svq);
>>> -    size_t driver_size = vhost_svq_driver_area_size(svq);
>>>       bool ok;
>>>
>>>       vhost_svq_get_vring_addr(svq, &svq_addr);
>>>
>>> -    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
>>> +    needle = (DMAMap) {
>>> +        .translated_addr = svq_addr.desc_user_addr,
>>> +    };
>>
>> Let's simply initialize the member to zero at the start of this function;
>> then we can use needle->translated_addr = XXX here.
>>
> Sure
>
>>> +    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
>>>       if (unlikely(!ok)) {
>>>           return false;
>>>       }
>>>
>>> -    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
>>> +    needle = (DMAMap) {
>>> +        .translated_addr = svq_addr.used_user_addr,
>>> +    };
>>> +    return vhost_vdpa_svq_unmap_ring(v, &needle);
>>> +}
>>> +
>>> +/**
>>> + * Map the SVQ area in the device
>>> + *
>>> + * @v       Vhost-vdpa device
>>> + * @needle  The area to search iova
>>> + * @errorp  Error pointer
>>> + */
>>> +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
>>> +                                    Error **errp)
>>> +{
>>> +    int r;
>>> +
>>> +    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
>>> +    if (unlikely(r != IOVA_OK)) {
>>> +        error_setg(errp, "Cannot allocate iova (%d)", r);
>>> +        return false;
>>> +    }
>>> +
>>> +    r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
>>> +                           (void *)needle->translated_addr,
>>> +                           !(needle->perm & IOMMU_ACCESS_FLAG(0, 1)));
>>
>> Let's simply use needle->perm == IOMMU_RO here?
>>
> The motivation to do it this way is to be more resilient to the future.
> For example, if a new flag is added.
>
> But I'm totally ok with comparing with IOMMU_RO; I see that scenario
> as unlikely at the moment.
>
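[An aside on the IOMMU_RO exchange just above: the two checks agree for
the permissions SVQ actually passes today and differ only for IOMMU_NONE.
A compilable sketch, borrowing the flag encoding from QEMU's
include/exec/memory.h; the helper names are made up.]

    #include <assert.h>

    /* QEMU's encoding (include/exec/memory.h): */
    enum { IOMMU_NONE = 0, IOMMU_RO = 1, IOMMU_WO = 2, IOMMU_RW = 3 };
    #define IOMMU_ACCESS_FLAG(r, w) \
        (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0))

    /* The patch's form: "readonly unless the write bit is set", which
     * keeps working if more flag bits appear later. */
    static int readonly_mask_form(unsigned perm)
    {
        return !(perm & IOMMU_ACCESS_FLAG(0, 1));
    }

    /* The reviewer's suggested form: exact equality with IOMMU_RO. */
    static int readonly_equality_form(unsigned perm)
    {
        return perm == IOMMU_RO;
    }

    int main(void)
    {
        /* The two agree on the values SVQ passes today... */
        assert(readonly_mask_form(IOMMU_RO) == readonly_equality_form(IOMMU_RO));
        assert(readonly_mask_form(IOMMU_RW) == readonly_equality_form(IOMMU_RW));
        /* ...and differ only for IOMMU_NONE, which just the mask form
         * treats as "not writable": */
        assert(readonly_mask_form(IOMMU_NONE) &&
               !readonly_equality_form(IOMMU_NONE));
        return 0;
    }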
>>> +    if (unlikely(r != 0)) {
>>> +        error_setg_errno(errp, -r, "Cannot map region to device");
>>> +        vhost_iova_tree_remove(v->iova_tree, needle);
>>> +    }
>>> +
>>> +    return r == 0;
>>>   }
>>>
>>>   /**
>>> -  * Map shadow virtqueue rings in device
>>> +  * Map the shadow virtqueue rings in the device
>>>    *
>>>    * @dev   The vhost device
>>>    * @svq   The shadow virtqueue
>>> @@ -876,28 +946,44 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
>>>                                        struct vhost_vring_addr *addr,
>>>                                        Error **errp)
>>>   {
>>> +    DMAMap device_region, driver_region;
>>> +    struct vhost_vring_addr svq_addr;
>>>       struct vhost_vdpa *v = dev->opaque;
>>>       size_t device_size = vhost_svq_device_area_size(svq);
>>>       size_t driver_size = vhost_svq_driver_area_size(svq);
>>> -    int r;
>>> +    size_t avail_offset;
>>> +    bool ok;
>>>
>>>       ERRP_GUARD();
>>> -    vhost_svq_get_vring_addr(svq, addr);
>>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
>>>
>>> -    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
>>> -                           (void *)addr->desc_user_addr, true);
>>> -    if (unlikely(r != 0)) {
>>> -        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
>>> +    driver_region = (DMAMap) {
>>> +        .translated_addr = svq_addr.desc_user_addr,
>>> +        .size = driver_size - 1,
>>
>> Any reason for the "-1" here? I see several places do things like that,
>> it's probably a hint of a wrong API somewhere.
>>
> The "problem" is the API mismatch between _end and _last, i.e. whether
> to include the last member in the size or not.
>
> IOVA tree needs to use _end so we can allocate the last page in case
> of an available range ending in (uint64_t)-1 [1].
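[To make the quoted _end/_last point concrete: with an exclusive "end"
bound, a range that finishes at the top of the 64-bit space cannot be
represented because the bound wraps, while the inclusive "last" form that
DMAMap's size encodes can represent it. A standalone illustration in plain
C, not QEMU code:]

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t start = UINT64_MAX - 0xfff;   /* last 4 KiB page */
        uint64_t len   = 0x1000;

        uint64_t end  = start + len;        /* exclusive bound: wraps to 0 */
        uint64_t last = start + (len - 1);  /* inclusive bound: UINT64_MAX */

        printf("end  = 0x%" PRIx64 " (wrapped, unusable as a bound)\n", end);
        printf("last = 0x%" PRIx64 "\n", last);
        return 0;
    }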
> But if we change vhost_svq_{device,driver}_area_size to make it
> inclusive,

These functions look sane since they don't return a range. It's up to
the caller to decide how to use the size.

> we need to use "+1" in calls like qemu_memalign and memset at
> vhost_svq_start. Probably in more places too.

I'm not sure I get it here. Maybe you can show which code may suffer
if we don't decrease it by one here.

But the current code may end up passing qemu_real_host_page_size - 1
to vhost-vDPA, which seems wrong. E.g. vhost_svq_device_area_size()
returns qemu_real_host_page_size, but it is decreased by 1 here for
size, and then we pass size to vhost_vdpa_dma_map().

Thanks

> QEMU's emulated Intel iommu code solves it using the address mask as
> the size, something that does not fit 100% with vhost devices since
> they can allocate an arbitrary address of arbitrary size when using
> vIOMMU. It's not a problem for vhost-vdpa at this moment since we make
> sure we expose unaligned and whole pages with vrings, but I feel it
> would only move the problem somewhere else.
>
> Thanks!
>
> [1] There are alternatives: to use Int128_t, etc. But I think it's
> better not to change that in this patch series.
>
>> Thanks
>>
>>
>>> +        .perm = IOMMU_RO,
>>> +    };
>>> +    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
>>> +    if (unlikely(!ok)) {
>>> +        error_prepend(errp, "Cannot create vq driver region: ");
>>>           return false;
>>>       }
>>> +    addr->desc_user_addr = driver_region.iova;
>>> +    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
>>> +    addr->avail_user_addr = driver_region.iova + avail_offset;
>>>
>>> -    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
>>> -                           (void *)addr->used_user_addr, false);
>>> -    if (unlikely(r != 0)) {
>>> -        error_setg_errno(errp, -r, "Cannot create vq device region: ");
>>> +    device_region = (DMAMap) {
>>> +        .translated_addr = svq_addr.used_user_addr,
>>> +        .size = device_size - 1,
>>> +        .perm = IOMMU_RW,
>>> +    };
>>> +    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
>>> +    if (unlikely(!ok)) {
>>> +        error_prepend(errp, "Cannot create vq device region: ");
>>> +        vhost_vdpa_svq_unmap_ring(v, &driver_region);
>>>       }
>>> +    addr->used_user_addr = device_region.iova;
>>>
>>> -    return r == 0;
>>> +    return ok;
>>>   }
>>>
>>>   static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
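[A closing note on the size-mismatch question raised above: if
vhost_vdpa_dma_map() takes an exclusive byte count, forwarding the tree's
inclusive size unchanged maps one byte short. A toy sketch of that
arithmetic; the names and the 4 KiB page size are illustrative assumptions,
not QEMU API.]

    #include <assert.h>
    #include <stdint.h>

    #define HOST_PAGE_SIZE 4096u   /* stand-in for qemu_real_host_page_size */

    int main(void)
    {
        /* vhost_svq_device_area_size() returns a plain byte count: */
        uint64_t device_size = HOST_PAGE_SIZE;

        /* The patch stores an inclusive size in the DMAMap needle: */
        uint64_t needle_size = device_size - 1;

        /* vhost_vdpa_svq_map_ring() then forwards needle->size to the
         * device unchanged, i.e. 4095 bytes instead of the full 4096: */
        assert(needle_size == HOST_PAGE_SIZE - 1);

        /* Converting back at the boundary ("+ 1") restores the length: */
        assert(needle_size + 1 == device_size);
        return 0;
    }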