Jason Wang
2022-Mar-07 04:24 UTC
[PATCH v2 10/14] vdpa: Add custom IOTLB translations to SVQ
On 2022/3/3 7:35 PM, Eugenio Perez Martin wrote:
> On Thu, Mar 3, 2022 at 8:33 AM Jason Wang <jasowang at redhat.com> wrote:
>>
>> On 2022/3/1 4:50 PM, Eugenio Perez Martin wrote:
>>> On Mon, Feb 28, 2022 at 8:37 AM Jason Wang <jasowang at redhat.com> wrote:
>>>> On 2022/2/27 9:41 PM, Eugenio Pérez wrote:
>>>>> Use translations added in VhostIOVATree in SVQ.
>>>>>
>>>>> Only introduce usage here, not allocation and deallocation. As with
>>>>> previous patches, we use the dead code paths of shadow_vqs_enabled to
>>>>> avoid committing too many changes at once. These are impossible to take
>>>>> at the moment.
>>>>>
>>>>> Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
>>>>> ---
>>>>>   hw/virtio/vhost-shadow-virtqueue.h |   6 +-
>>>>>   include/hw/virtio/vhost-vdpa.h     |   3 +
>>>>>   hw/virtio/vhost-shadow-virtqueue.c |  76 ++++++++++++++++-
>>>>>   hw/virtio/vhost-vdpa.c             | 128 ++++++++++++++++++++++++-----
>>>>>   4 files changed, 187 insertions(+), 26 deletions(-)
>>>>>
>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>>>> index 04c67685fd..b2f722d101 100644
>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>>>> @@ -13,6 +13,7 @@
>>>>>  #include "qemu/event_notifier.h"
>>>>>  #include "hw/virtio/virtio.h"
>>>>>  #include "standard-headers/linux/vhost_types.h"
>>>>> +#include "hw/virtio/vhost-iova-tree.h"
>>>>>
>>>>>  /* Shadow virtqueue to relay notifications */
>>>>>  typedef struct VhostShadowVirtqueue {
>>>>> @@ -43,6 +44,9 @@ typedef struct VhostShadowVirtqueue {
>>>>>      /* Virtio device */
>>>>>      VirtIODevice *vdev;
>>>>>
>>>>> +    /* IOVA mapping */
>>>>> +    VhostIOVATree *iova_tree;
>>>>> +
>>>>>      /* Map for use the guest's descriptors */
>>>>>      VirtQueueElement **ring_id_maps;
>>>>>
>>>>> @@ -78,7 +82,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>>>>>                      VirtQueue *vq);
>>>>>  void vhost_svq_stop(VhostShadowVirtqueue *svq);
>>>>>
>>>>> -VhostShadowVirtqueue *vhost_svq_new(void);
>>>>> +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
>>>>>
>>>>>  void vhost_svq_free(gpointer vq);
>>>>>  G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
>>>>> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
>>>>> index 009a9f3b6b..ee8e939ad0 100644
>>>>> --- a/include/hw/virtio/vhost-vdpa.h
>>>>> +++ b/include/hw/virtio/vhost-vdpa.h
>>>>> @@ -14,6 +14,7 @@
>>>>>
>>>>>  #include <gmodule.h>
>>>>>
>>>>> +#include "hw/virtio/vhost-iova-tree.h"
>>>>>  #include "hw/virtio/virtio.h"
>>>>>  #include "standard-headers/linux/vhost_types.h"
>>>>>
>>>>> @@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
>>>>>      MemoryListener listener;
>>>>>      struct vhost_vdpa_iova_range iova_range;
>>>>>      bool shadow_vqs_enabled;
>>>>> +    /* IOVA mapping used by the Shadow Virtqueue */
>>>>> +    VhostIOVATree *iova_tree;
>>>>>      GPtrArray *shadow_vqs;
>>>>>      struct vhost_dev *dev;
>>>>>      VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>>>> index a38d313755..7e073773d1 100644
>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>>>> @@ -11,6 +11,7 @@
>>>>>  #include "hw/virtio/vhost-shadow-virtqueue.h"
>>>>>
>>>>>  #include "qemu/error-report.h"
>>>>> +#include "qemu/log.h"
>>>>>  #include "qemu/main-loop.h"
>>>>>  #include "qemu/log.h"
>>>>>  #include "linux-headers/linux/vhost.h"
>>>>> @@ -84,7 +85,58 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
>>>>>      }
>>>>>  }
>>>>>
>>>>> +/**
>>>>> + * Translate addresses between the qemu's virtual address and the SVQ IOVA
>>>>> + *
>>>>> + * @svq    Shadow VirtQueue
>>>>> + * @vaddr  Translated IOVA addresses
>>>>> + * @iovec  Source qemu's VA addresses
>>>>> + * @num    Length of iovec and minimum length of vaddr
>>>>> + */
>>>>> +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
>>>>> +                                     void **addrs, const struct iovec *iovec,
>>>>> +                                     size_t num)
>>>>> +{
>>>>> +    if (num == 0) {
>>>>> +        return true;
>>>>> +    }
>>>>> +
>>>>> +    for (size_t i = 0; i < num; ++i) {
>>>>> +        DMAMap needle = {
>>>>> +            .translated_addr = (hwaddr)iovec[i].iov_base,
>>>>> +            .size = iovec[i].iov_len,
>>>>> +        };
>>>>> +        size_t off;
>>>>> +
>>>>> +        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
>>>>> +        /*
>>>>> +         * Map cannot be NULL since iova map contains all guest space and
>>>>> +         * qemu already has a physical address mapped
>>>>> +         */
>>>>> +        if (unlikely(!map)) {
>>>>> +            qemu_log_mask(LOG_GUEST_ERROR,
>>>>> +                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
>>>>> +                          needle.translated_addr);
>>>>> +            return false;
>>>>> +        }
>>>>> +
>>>>> +        off = needle.translated_addr - map->translated_addr;
>>>>> +        addrs[i] = (void *)(map->iova + off);
>>>>> +
>>>>> +        if (unlikely(int128_gt(int128_add(needle.translated_addr,
>>>>> +                                          iovec[i].iov_len),
>>>>> +                               map->translated_addr + map->size))) {
>>>>> +            qemu_log_mask(LOG_GUEST_ERROR,
>>>>> +                          "Guest buffer expands over iova range");
>>>>> +            return false;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    return true;
>>>>> +}
>>>>> +
>>>>>  static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>>>>> +                                    void * const *vaddr_sg,
>>>> Nit: it looks to me we are not passing vaddr but iova here, so it might
>>>> be better to use "sg"?
>>>>
>>> Sure, this is a leftover of pre-IOVA translations. I think it's better
>>> to write it as you say.
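An aside on the arithmetic in vhost_svq_translate_addr() above: the tree
lookup returns the mapping that contains the qemu VA, and the resulting
IOVA is the same offset into that mapping. Below is a minimal,
self-contained sketch of that scheme, with a hypothetical Map type and a
linear scan standing in for VhostIOVATree (an illustration only, not the
QEMU API; the inclusive .size convention it assumes is debated further
down the thread):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef struct {
    uint64_t iova;            /* first IOVA of the mapping */
    uint64_t translated_addr; /* first qemu VA of the mapping */
    uint64_t size;            /* inclusive: last byte is at addr + size */
} Map;

/* Translate a buffer [vaddr, vaddr + len) to an IOVA; len must be >= 1. */
static bool translate(const Map *maps, size_t n, uint64_t vaddr,
                      uint64_t len, uint64_t *iova)
{
    for (size_t i = 0; i < n; i++) {
        const Map *m = &maps[i];
        if (vaddr >= m->translated_addr &&
            vaddr <= m->translated_addr + m->size) {
            /* The buffer must not run past the end of the mapping */
            if (vaddr + len - 1 > m->translated_addr + m->size) {
                return false;
            }
            /* Same offset within the mapping, different address space */
            *iova = m->iova + (vaddr - m->translated_addr);
            return true;
        }
    }
    return false; /* no mapping contains vaddr */
}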
>>>
>>>>>                                      const struct iovec *iovec,
>>>>>                                      size_t num, bool more_descs, bool write)
>>>>>  {
>>>>> @@ -103,7 +155,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
>>>>>          } else {
>>>>>              descs[i].flags = flags;
>>>>>          }
>>>>> -        descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
>>>>> +        descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
>>>>>          descs[i].len = cpu_to_le32(iovec[n].iov_len);
>>>>>
>>>>>          last = i;
>>>>> @@ -119,6 +171,8 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>>>>>  {
>>>>>      unsigned avail_idx;
>>>>>      vring_avail_t *avail = svq->vring.avail;
>>>>> +    bool ok;
>>>>> +    g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num));
>>>>>
>>>>>      *head = svq->free_head;
>>>>>
>>>>> @@ -129,9 +183,20 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>>>>>          return false;
>>>>>      }
>>>>>
>>>>> -    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
>>>>> +    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
>>>>> +    if (unlikely(!ok)) {
>>>>> +        return false;
>>>>> +    }
>>>>> +    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
>>>>>                              elem->in_num > 0, false);
>>>>> -    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
>>>>> +
>>>>> +
>>>>> +    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
>>>>> +    if (unlikely(!ok)) {
>>>>> +        return false;
>>>>> +    }
>>>>> +
>>>>> +    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
>>>>>
>>>>>      /*
>>>>>       * Put the entry in the available array (but don't update avail->idx until
>>>>> @@ -514,11 +579,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>>>   * Creates vhost shadow virtqueue, and instructs the vhost device to use the
>>>>>   * shadow methods and file descriptors.
>>>>>   *
>>>>> + * @iova_tree Tree to perform descriptors translations
>>>>> + *
>>>>>   * Returns the new virtqueue or NULL.
>>>>>   *
>>>>>   * In case of error, reason is reported through error_report.
>>>>>   */
>>>>> -VhostShadowVirtqueue *vhost_svq_new(void)
>>>>> +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
>>>>>  {
>>>>>      g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
>>>>>      int r;
>>>>> @@ -539,6 +606,7 @@ VhostShadowVirtqueue *vhost_svq_new(void)
>>>>>
>>>>>      event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
>>>>>      event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
>>>>> +    svq->iova_tree = iova_tree;
>>>>>      return g_steal_pointer(&svq);
>>>>>
>>>>>  err_init_hdev_call:
>>>>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
>>>>> index 435b9c2e9e..56f9f125cd 100644
>>>>> --- a/hw/virtio/vhost-vdpa.c
>>>>> +++ b/hw/virtio/vhost-vdpa.c
>>>>> @@ -209,6 +209,21 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>>>>>                                           vaddr, section->readonly);
>>>>>
>>>>>      llsize = int128_sub(llend, int128_make64(iova));
>>>>> +    if (v->shadow_vqs_enabled) {
>>>>> +        DMAMap mem_region = {
>>>>> +            .translated_addr = (hwaddr)vaddr,
>>>>> +            .size = int128_get64(llsize) - 1,
>>>>> +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
>>>>> +        };
>>>>> +
>>>>> +        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
>>>>> +        if (unlikely(r != IOVA_OK)) {
>>>>> +            error_report("Can't allocate a mapping (%d)", r);
>>>>> +            goto fail;
>>>>> +        }
>>>>> +
>>>>> +        iova = mem_region.iova;
>>>>> +    }
>>>>>
>>>>>      vhost_vdpa_iotlb_batch_begin_once(v);
>>>>>      ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
>>>>> @@ -261,6 +276,20 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
>>>>>
>>>>>      llsize = int128_sub(llend, int128_make64(iova));
>>>>>
>>>>> +    if (v->shadow_vqs_enabled) {
>>>>> +        const DMAMap *result;
>>>>> +        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
>>>>> +                            section->offset_within_region +
>>>>> +                            (iova - section->offset_within_address_space);
>>>>> +        DMAMap mem_region = {
>>>>> +            .translated_addr = (hwaddr)vaddr,
>>>>> +            .size = int128_get64(llsize) - 1,
>>>>> +        };
>>>>> +
>>>>> +        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
>>>>> +        iova = result->iova;
>>>>> +        vhost_iova_tree_remove(v->iova_tree, &mem_region);
>>>>> +    }
>>>>>      vhost_vdpa_iotlb_batch_begin_once(v);
>>>>>      ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
>>>>>      if (ret) {
>>>>> @@ -383,7 +412,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
>>>>>
>>>>>      shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
>>>>>      for (unsigned n = 0; n < hdev->nvqs; ++n) {
>>>>> -        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
>>>>> +        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
>>>>>
>>>>>          if (unlikely(!svq)) {
>>>>>              error_setg(errp, "Cannot create svq %u", n);
>>>>> @@ -834,37 +863,78 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
>>>>>  /**
>>>>>   * Unmap a SVQ area in the device
>>>>>   */
>>>>> -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
>>>>> -                                      hwaddr size)
>>>>> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
>>>>> +                                      const DMAMap *needle)
>>>>>  {
>>>>> +    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
>>>>> +    hwaddr size;
>>>>>      int r;
>>>>>
>>>>> -    size = ROUND_UP(size, qemu_real_host_page_size);
>>>>> -    r = vhost_vdpa_dma_unmap(v, iova, size);
>>>>> +    if (unlikely(!result)) {
>>>>> +        error_report("Unable to find SVQ address to unmap");
>>>>> +        return false;
>>>>> +    }
>>>>> +
>>>>> +    size = ROUND_UP(result->size, qemu_real_host_page_size);
>>>>> +    r = vhost_vdpa_dma_unmap(v, result->iova, size);
>>>>>      return r == 0;
>>>>>  }
>>>>>
>>>>>  static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
>>>>>                                         const VhostShadowVirtqueue *svq)
>>>>>  {
>>>>> +    DMAMap needle;
>>>>>      struct vhost_vdpa *v = dev->opaque;
>>>>>      struct vhost_vring_addr svq_addr;
>>>>> -    size_t device_size = vhost_svq_device_area_size(svq);
>>>>> -    size_t driver_size = vhost_svq_driver_area_size(svq);
>>>>>      bool ok;
>>>>>
>>>>>      vhost_svq_get_vring_addr(svq, &svq_addr);
>>>>>
>>>>> -    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
>>>>> +    needle = (DMAMap) {
>>>>> +        .translated_addr = svq_addr.desc_user_addr,
>>>>> +    };
>>>> Let's simply initialize the member to zero at the start of this function,
>>>> then we can use needle->translated_addr = XXX here.
>>>>
>>> Sure
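A quick C note on the two styles being discussed: an assignment from a
compound literal zero-initializes every member that is not named
explicitly, so both versions leave the unnamed fields cleared. A tiny
standalone sketch (FakeMap is a hypothetical stand-in for DMAMap):

#include <assert.h>
#include <stdint.h>

typedef struct {
    uint64_t iova;
    uint64_t translated_addr;
    uint32_t perm;
} FakeMap;

int main(void)
{
    FakeMap needle;

    /* Patch style: the compound literal clears iova and perm each time */
    needle = (FakeMap) { .translated_addr = 0x1000 };
    assert(needle.iova == 0 && needle.perm == 0);

    /* Suggested style: zero once, then assign the single member */
    needle = (FakeMap) { 0 };
    needle.translated_addr = 0x2000;
    assert(needle.iova == 0 && needle.perm == 0);

    return 0;
}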
>>>
>>>>> +    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
>>>>>      if (unlikely(!ok)) {
>>>>>          return false;
>>>>>      }
>>>>>
>>>>> -    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
>>>>> +    needle = (DMAMap) {
>>>>> +        .translated_addr = svq_addr.used_user_addr,
>>>>> +    };
>>>>> +    return vhost_vdpa_svq_unmap_ring(v, &needle);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * Map the SVQ area in the device
>>>>> + *
>>>>> + * @v      Vhost-vdpa device
>>>>> + * @needle The area to search iova
>>>>> + * @errorp Error pointer
>>>>> + */
>>>>> +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
>>>>> +                                    Error **errp)
>>>>> +{
>>>>> +    int r;
>>>>> +
>>>>> +    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
>>>>> +    if (unlikely(r != IOVA_OK)) {
>>>>> +        error_setg(errp, "Cannot allocate iova (%d)", r);
>>>>> +        return false;
>>>>> +    }
>>>>> +
>>>>> +    r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
>>>>> +                           (void *)needle->translated_addr,
>>>>> +                           !(needle->perm & IOMMU_ACCESS_FLAG(0, 1)));
>>>> Let's simply use needle->perm == IOMMU_RO here?
>>>>
>>> The motivation to write it this way is to be more resilient to the
>>> future. For example, if a new flag is added.
>>>
>>> But I'm totally ok with comparing with IOMMU_RO; I see that scenario as
>>> unlikely at the moment.
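For readers following this exchange, the difference between the two
checks is easy to see from the flag definitions. The enum and macro
below mirror QEMU's include/exec/memory.h; the rest is a standalone
sketch to compile on its own, not SVQ code:

#include <assert.h>

/* Flag definitions as in QEMU's include/exec/memory.h */
typedef enum {
    IOMMU_NONE = 0,
    IOMMU_RO   = 1,
    IOMMU_WO   = 2,
    IOMMU_RW   = 3,
} IOMMUAccessFlags;

#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0))

int main(void)
{
    /*
     * The patch's predicate masks out everything but the write bit, so
     * it keeps meaning "read-only" even if more flag bits appear later.
     */
    for (int perm = IOMMU_NONE; perm <= IOMMU_RW; perm++) {
        int patch_ro  = !(perm & IOMMU_ACCESS_FLAG(0, 1)); /* write bit clear */
        int strict_ro = (perm == IOMMU_RO);
        /* The two agree except for IOMMU_NONE, which SVQ never maps. */
        assert(patch_ro == strict_ro || perm == IOMMU_NONE);
    }
    return 0;
}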
>>>
>>>>> +    if (unlikely(r != 0)) {
>>>>> +        error_setg_errno(errp, -r, "Cannot map region to device");
>>>>> +        vhost_iova_tree_remove(v->iova_tree, needle);
>>>>> +    }
>>>>> +
>>>>> +    return r == 0;
>>>>>  }
>>>>>
>>>>>  /**
>>>>> - * Map shadow virtqueue rings in device
>>>>> + * Map the shadow virtqueue rings in the device
>>>>>   *
>>>>>   * @dev The vhost device
>>>>>   * @svq The shadow virtqueue
>>>>> @@ -876,28 +946,44 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
>>>>>                                       struct vhost_vring_addr *addr,
>>>>>                                       Error **errp)
>>>>>  {
>>>>> +    DMAMap device_region, driver_region;
>>>>> +    struct vhost_vring_addr svq_addr;
>>>>>      struct vhost_vdpa *v = dev->opaque;
>>>>>      size_t device_size = vhost_svq_device_area_size(svq);
>>>>>      size_t driver_size = vhost_svq_driver_area_size(svq);
>>>>> -    int r;
>>>>> +    size_t avail_offset;
>>>>> +    bool ok;
>>>>>
>>>>>      ERRP_GUARD();
>>>>> -    vhost_svq_get_vring_addr(svq, addr);
>>>>> +    vhost_svq_get_vring_addr(svq, &svq_addr);
>>>>>
>>>>> -    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
>>>>> -                           (void *)addr->desc_user_addr, true);
>>>>> -    if (unlikely(r != 0)) {
>>>>> -        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
>>>>> +    driver_region = (DMAMap) {
>>>>> +        .translated_addr = svq_addr.desc_user_addr,
>>>>> +        .size = driver_size - 1,
>>>> Any reason for the "-1" here? I see several places do things like that;
>>>> it's probably a hint of a wrong API somewhere.
>>>>
>>> The "problem" is the API mismatch between _end and _last: whether to
>>> include the last member in the size or not.
>>>
>>> The IOVA tree needs to use _end so we can allocate the last page in case
>>> of an available range ending in (uint64_t)-1 [1]. But if we change
>>> vhost_svq_{device,driver}_area_size to make it inclusive,
>>
>> These functions look sane since they don't return a range. It's up to
>> the caller to decide how to use the size.
>>
> Ok, I think I didn't get your comment the first time, so there is a bug
> here. But I'm not sure if we are on the same page regarding the iova
> tree.
>
> Regarding the alignment, it's up to the caller how to use the size.
> But if you introduce a mapping of (iova_1, translated_addr_1, size_1),
> the iova address iova_1+size_1 belongs to that mapping.

This seems to contradict the definition of size_1? E.g. if we get an iova
range that starts from 0 and its size is 1, then 1 is not included in
that mapping.

> If you want to
> introduce a new mapping (iova_2 = iova_1 + size_1, translated_addr_2,
> size_2), it will be rejected, since it overlaps with the first one.
> That part is not up to the caller.
>
> At this moment, vhost_svq_driver_area_size and
> vhost_svq_device_area_size return in the same terms as sizeof(x). In
> other words, size is not inclusive, as memset() or VHOST_IOTLB_UPDATE
> expect, for example. We could move the -1 inside of these functions,
> and then we would need to adapt the qemu_memalign calls in
> vhost_svq_start and the vhost_vdpa dma_map/unmap.
>
>>> we need to
>>> use "+1" in calls like qemu_memalign and memset at vhost_svq_start.
>>> Probably in more places too.
>>
>> I'm not sure I get it here. Maybe you can show which code may suffer if
>> we don't decrease it by one here.
>>
> Less than I expected, I have to say:
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index 497237dcbb..b42ba5a3c0 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -479,7 +479,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
>  {
>      size_t used_size = offsetof(vring_used_t, ring) +
>                         sizeof(vring_used_elem_t) * svq->vring.num;
> -    return ROUND_UP(used_size, qemu_real_host_page_size);
> +    return ROUND_UP(used_size, qemu_real_host_page_size) - 1;
>  }
>
>  /**
> @@ -532,8 +532,8 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
>      svq->vq = vq;
>
>      svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
> -    driver_size = vhost_svq_driver_area_size(svq);
> -    device_size = vhost_svq_device_area_size(svq);
> +    driver_size = vhost_svq_driver_area_size(svq) + 1;
> +    device_size = vhost_svq_device_area_size(svq) + 1;
>      svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
>      desc_size = sizeof(vring_desc_t) * svq->vring.num;
>      svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 5eefc5911a..2bf648de4a 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -918,7 +918,7 @@ static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
>          return false;
>      }
>
> -    r = vhost_vdpa_dma_map(v, needle->iova, needle->size,
> +    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
>                             (void *)needle->translated_addr,
>                             needle->perm == IOMMU_RO);
>      if (unlikely(r != 0)) {
> @@ -955,7 +955,7 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
>
>      driver_region = (DMAMap) {
>          .translated_addr = svq_addr.desc_user_addr,
> -        .size = driver_size - 1,
> +        .size = driver_size,
>          .perm = IOMMU_RO,
>      };
>      ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
> @@ -969,7 +969,7 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
>
>      device_region = (DMAMap) {
>          .translated_addr = svq_addr.used_user_addr,
> -        .size = device_size - 1,
> +        .size = device_size,
>          .perm = IOMMU_RW,
>      };
>      ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
> ---

Sorry, I still don't get why the -1/+1 is required. Maybe you can show me
what happens if we don't do these.

Thanks

>> But current code may end up passing qemu_real_host_page_size - 1 to
>> vhost-VDPA, which seems wrong?
>>
> That part needs fixing, but the right solution is not to skip the -1,
> but to increment what we pass to vhost_vdpa_dma_map. Doing otherwise
> would bring problems with how the iova-tree works. It will be included
> in the next series.
>
> Thanks!
>
>> Thanks
>>
>>
>>> QEMU's emulated Intel iommu code solves it using the address mask as
>>> the size, something that does not fit 100% with vhost devices, since
>>> they can allocate an arbitrary address of arbitrary size when using a
>>> vIOMMU. It's not a problem for vhost-vdpa at this moment, since we make
>>> sure we expose unaligned and whole pages with vrings, but I feel it
>>> would only move the problem somewhere else.
>>>
>>> Thanks!
>>>
>>> [1] There are alternatives: to use Int128_t, etc. But I think it's
>>> better not to change that in this patch series.
>>>
>>>> Thanks
>>>>
>>>>
>>>>> +        .perm = IOMMU_RO,
>>>>> +    };
>>>>> +    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
>>>>> +    if (unlikely(!ok)) {
>>>>> +        error_prepend(errp, "Cannot create vq driver region: ");
>>>>>          return false;
>>>>>      }
>>>>> +    addr->desc_user_addr = driver_region.iova;
>>>>> +    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
>>>>> +    addr->avail_user_addr = driver_region.iova + avail_offset;
>>>>>
>>>>> -    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
>>>>> -                           (void *)addr->used_user_addr, false);
>>>>> -    if (unlikely(r != 0)) {
>>>>> -        error_setg_errno(errp, -r, "Cannot create vq device region: ");
>>>>> +    device_region = (DMAMap) {
>>>>> +        .translated_addr = svq_addr.used_user_addr,
>>>>> +        .size = device_size - 1,
>>>>> +        .perm = IOMMU_RW,
>>>>> +    };
>>>>> +    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
>>>>> +    if (unlikely(!ok)) {
>>>>> +        error_prepend(errp, "Cannot create vq device region: ");
>>>>> +        vhost_vdpa_svq_unmap_ring(v, &driver_region);
>>>>>      }
>>>>> +    addr->used_user_addr = device_region.iova;
>>>>>
>>>>> -    return r == 0;
>>>>> +    return ok;
>>>>>  }
>>>>>
>>>>>  static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
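To make the -1/+1 bookkeeping in the exchange above concrete, here is a
minimal worked example of the inclusive-size convention that the IOVA
tree uses; plain integers only, not the QEMU DMAMap API:

#include <assert.h>
#include <stdint.h>

/*
 * Inclusive convention: a mapping occupies [iova, iova + size], so a
 * region of N bytes is stored with size = N - 1. This is what lets a
 * mapping end exactly at UINT64_MAX: its last byte stays representable,
 * whereas an exclusive end (iova + N) would overflow to 0.
 */
int main(void)
{
    uint64_t page = 4096;

    /* One page at IOVA 0, stored inclusively. */
    uint64_t iova = 0, size = page - 1;      /* covers [0, 0xfff] */

    /* The next free IOVA is last + 1, not iova + size. */
    uint64_t next = iova + size + 1;
    assert(next == page);

    /* A mapping ending at the top of the address space still fits. */
    uint64_t top_iova = UINT64_MAX - (page - 1);
    uint64_t top_last = top_iova + (page - 1);
    assert(top_last == UINT64_MAX);          /* no overflow */

    /*
     * Callers that think in sizeof()-style byte counts (memset(),
     * VHOST_IOTLB_UPDATE, vhost_vdpa_dma_map()) must convert back:
     * bytes = size + 1, hence the +1 in the follow-up diff.
     */
    return 0;
}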