On 2022/1/22 4:27 PM, Eugenio Pérez wrote:
> Use translations added in VhostIOVATree in SVQ.
>
> Only introduce usage here, not allocation and deallocation. As with
> previous patches, we use the dead code paths of shadow_vqs_enabled to
> avoid committing too many changes at once. These are impossible to take
> at the moment.
>
> Signed-off-by: Eugenio Pérez <eperezma at redhat.com>
> ---
> hw/virtio/vhost-shadow-virtqueue.h | 3 +-
> include/hw/virtio/vhost-vdpa.h | 3 +
> hw/virtio/vhost-shadow-virtqueue.c | 111 ++++++++++++++++----
> hw/virtio/vhost-vdpa.c | 161 +++++++++++++++++++++++++----
> 4 files changed, 238 insertions(+), 40 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.h
b/hw/virtio/vhost-shadow-virtqueue.h
> index 19c934af49..c6f67d6f76 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.h
> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> @@ -12,6 +12,7 @@
>
> #include "hw/virtio/vhost.h"
> #include "qemu/event_notifier.h"
> +#include "hw/virtio/vhost-iova-tree.h"
>
> typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>
> @@ -37,7 +38,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq,
VirtIODevice *vdev,
> VirtQueue *vq);
> void vhost_svq_stop(VhostShadowVirtqueue *svq);
>
> -VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize, VhostIOVATree
*iova_map);
>
> void vhost_svq_free(VhostShadowVirtqueue *vq);
>
> diff --git a/include/hw/virtio/vhost-vdpa.h
b/include/hw/virtio/vhost-vdpa.h
> index 009a9f3b6b..cd2388b3be 100644
> --- a/include/hw/virtio/vhost-vdpa.h
> +++ b/include/hw/virtio/vhost-vdpa.h
> @@ -14,6 +14,7 @@
>
> #include <gmodule.h>
>
> +#include "hw/virtio/vhost-iova-tree.h"
> #include "hw/virtio/virtio.h"
> #include "standard-headers/linux/vhost_types.h"
>
> @@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
> MemoryListener listener;
> struct vhost_vdpa_iova_range iova_range;
> bool shadow_vqs_enabled;
> + /* IOVA mapping used by Shadow Virtqueue */
> + VhostIOVATree *iova_tree;
> GPtrArray *shadow_vqs;
> struct vhost_dev *dev;
> VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c
b/hw/virtio/vhost-shadow-virtqueue.c
> index a1a404f68f..c7888eb8cf 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -11,6 +11,7 @@
> #include "hw/virtio/vhost-shadow-virtqueue.h"
> #include "hw/virtio/vhost.h"
> #include "hw/virtio/virtio-access.h"
> +#include "hw/virtio/vhost-iova-tree.h"
> #include "standard-headers/linux/vhost_types.h"
>
> #include "qemu/error-report.h"
> @@ -45,6 +46,9 @@ typedef struct VhostShadowVirtqueue {
> /* Virtio device */
> VirtIODevice *vdev;
>
> + /* IOVA mapping */
> + VhostIOVATree *iova_tree;
> +
> /* Map for returning guest's descriptors */
> VirtQueueElement **ring_id_maps;
>
> @@ -97,13 +101,7 @@ bool vhost_svq_valid_device_features(uint64_t
*dev_features)
> continue;
>
> case VIRTIO_F_ACCESS_PLATFORM:
> - /* SVQ does not know how to translate addresses */
> - if (*dev_features & BIT_ULL(b)) {
> - clear_bit(b, dev_features);
> - r = false;
> - }
> - break;
> -
> + /* SVQ trust in host's IOMMU to translate addresses */
> case VIRTIO_F_VERSION_1:
> /* SVQ trust that guest vring is little endian */
> if (!(*dev_features & BIT_ULL(b))) {
> @@ -205,7 +203,55 @@ static void
vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> }
> }
>
> +/**
> + * Translate addresses between qemu's virtual address and SVQ IOVA
> + *
> + * @svq Shadow VirtQueue
> + * @vaddr Translated IOVA addresses
> + * @iovec Source qemu's VA addresses
> + * @num Length of iovec and minimum length of vaddr
> + */
> +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> + void **addrs, const struct iovec
*iovec,
> + size_t num)
> +{
> + size_t i;
> +
> + if (num == 0) {
> + return true;
> + }
> +
> + for (i = 0; i < num; ++i) {
> + DMAMap needle = {
> + .translated_addr = (hwaddr)iovec[i].iov_base,
> + .size = iovec[i].iov_len,
> + };
> + size_t off;
> +
> + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree,
&needle);
> + /*
> + * Map cannot be NULL since iova map contains all guest space and
> + * qemu already has a physical address mapped
> + */
> + if (unlikely(!map)) {
> + error_report("Invalid address 0x%"HWADDR_PRIx" given by guest",
> + needle.translated_addr);
This can be triggered by the guest, so we need to report it only once or use
log_guest_error() etc.
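Something like the following, for instance (just a sketch; error_report_once()
from "qemu/error-report.h" reports only once per call site, and
qemu_log_mask(LOG_GUEST_ERROR, ...) from "qemu/log.h" only logs when
guest-error logging is enabled):

            if (unlikely(!map)) {
                /*
                 * Guest-triggerable path: report it at most once so a buggy
                 * or malicious guest cannot flood the log.
                 */
                error_report_once("Invalid address 0x%"HWADDR_PRIx
                                  " given by guest", needle.translated_addr);
                return false;
            }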
> + return false;
> + }
> +
> + /*
> + * Map->iova chunk size is ignored. What to do if descriptor
> + * (addr, size) does not fit is delegated to the device.
> + */
I think we need to at least check the size and fail if it doesn't match here.
Or is it possible to have a buffer that crosses two memory regions?
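Even if we keep delegating the "does not fit" handling to the device, a
minimal bound check could look like this (only a sketch, assuming a descriptor
must be fully covered by a single map entry and that DMAMap.size is inclusive,
as in the rest of the series):

            /* Last byte of the descriptor vs. last byte covered by the map */
            if (unlikely(needle.translated_addr + iovec[i].iov_len - 1 >
                         map->translated_addr + map->size)) {
                return false;
            }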
> + off = needle.translated_addr - map->translated_addr;
> + addrs[i] = (void *)(map->iova + off);
> + }
> +
> + return true;
> +}
> +
> static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> + void * const *vaddr_sg,
> const struct iovec *iovec,
> size_t num, bool more_descs, bool
write)
> {
> @@ -224,7 +270,7 @@ static void
vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> } else {
> descs[i].flags = flags;
> }
> - descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> + descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
> descs[i].len = cpu_to_le32(iovec[n].iov_len);
>
> last = i;
> @@ -234,42 +280,60 @@ static void
vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> svq->free_head = le16_to_cpu(descs[last].next);
> }
>
> -static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> - VirtQueueElement *elem)
> +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> + VirtQueueElement *elem,
> + unsigned *head)
I'd suggest making it return bool starting from the patch that introduces this
function.
> {
> - int head;
> unsigned avail_idx;
> vring_avail_t *avail = svq->vring.avail;
> + bool ok;
> + g_autofree void **sgs = g_new(void *, MAX(elem->out_num,
elem->in_num));
>
> - head = svq->free_head;
> + *head = svq->free_head;
>
> /* We need some descriptors here */
> assert(elem->out_num || elem->in_num);
>
> - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg,
elem->out_num);
> + if (unlikely(!ok)) {
> + return false;
> + }
> + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
> elem->in_num > 0, false);
> - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false,
true);
> +
> +
> + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg,
elem->in_num);
> + if (unlikely(!ok)) {
> + return false;
> + }
> +
> + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num,
false, true);
>
> /*
> * Put entry in available array (but don't update avail->idx
until they
> * do sync).
> */
> avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> - avail->ring[avail_idx] = cpu_to_le16(head);
> + avail->ring[avail_idx] = cpu_to_le16(*head);
> svq->avail_idx_shadow++;
>
> /* Update avail index after the descriptor is wrote */
> smp_wmb();
> avail->idx = cpu_to_le16(svq->avail_idx_shadow);
>
> - return head;
> + return true;
> }
>
> -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement
*elem)
> +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement
*elem)
> {
> - unsigned qemu_head = vhost_svq_add_split(svq, elem);
> + unsigned qemu_head;
> + bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
> + if (unlikely(!ok)) {
> + return false;
> + }
>
> svq->ring_id_maps[qemu_head] = elem;
> + return true;
> }
>
> static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> @@ -309,6 +373,7 @@ static void
vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
>
> while (true) {
> VirtQueueElement *elem;
> + bool ok;
>
> if (svq->next_guest_avail_elem) {
> elem =
g_steal_pointer(&svq->next_guest_avail_elem);
> @@ -337,7 +402,11 @@ static void
vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
> return;
> }
>
> - vhost_svq_add(svq, elem);
> + ok = vhost_svq_add(svq, elem);
> + if (unlikely(!ok)) {
> + /* VQ is broken, just return and ignore any other kicks */
> + return;
> + }
> vhost_svq_kick(svq);
> }
>
> @@ -619,12 +688,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> * methods and file descriptors.
> *
> * @qsize Shadow VirtQueue size
> + * @iova_tree Tree to perform descriptors translations
> *
> * Returns the new virtqueue or NULL.
> *
> * In case of error, reason is reported through error_report.
> */
> -VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize, VhostIOVATree
*iova_tree)
> {
> size_t desc_size = sizeof(vring_desc_t) * qsize;
> size_t device_size, driver_size;
> @@ -656,6 +726,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> memset(svq->vring.desc, 0, driver_size);
> svq->vring.used = qemu_memalign(qemu_real_host_page_size,
device_size);
> memset(svq->vring.used, 0, device_size);
> + svq->iova_tree = iova_tree;
> svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
> event_notifier_set_handler(&svq->hdev_call,
vhost_svq_handle_call);
> return g_steal_pointer(&svq);
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 0e5c00ed7e..276a559649 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -209,6 +209,18 @@ static void
vhost_vdpa_listener_region_add(MemoryListener *listener,
> vaddr, section->readonly);
>
> llsize = int128_sub(llend, int128_make64(iova));
> + if (v->shadow_vqs_enabled) {
> + DMAMap mem_region = {
> + .translated_addr = (hwaddr)vaddr,
> + .size = int128_get64(llsize) - 1,
> + .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> + };
> +
> + int r = vhost_iova_tree_map_alloc(v->iova_tree,
&mem_region);
> + assert(r == IOVA_OK);
It's better to fail or warn here.
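For instance, something along these lines (a sketch; whether the listener
should simply skip the region or disable SVQ entirely on allocation failure is
a separate question):

            int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
            if (unlikely(r != IOVA_OK)) {
                /* Don't abort QEMU; report and skip this region instead */
                error_report("Cannot allocate IOVA range of 0x%"HWADDR_PRIx
                             " bytes for %p", mem_region.size + 1, vaddr);
                return;
            }

            iova = mem_region.iova;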
> +
> + iova = mem_region.iova;
> + }
>
> vhost_vdpa_iotlb_batch_begin_once(v);
> ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> @@ -261,6 +273,20 @@ static void
vhost_vdpa_listener_region_del(MemoryListener *listener,
>
> llsize = int128_sub(llend, int128_make64(iova));
>
> + if (v->shadow_vqs_enabled) {
> + const DMAMap *result;
> + const void *vaddr = memory_region_get_ram_ptr(section->mr) +
> + section->offset_within_region +
> + (iova - section->offset_within_address_space);
> + DMAMap mem_region = {
> + .translated_addr = (hwaddr)vaddr,
> + .size = int128_get64(llsize) - 1,
> + };
> +
> + result = vhost_iova_tree_find_iova(v->iova_tree,
&mem_region);
> + iova = result->iova;
> + vhost_iova_tree_remove(v->iova_tree, &mem_region);
> + }
> vhost_vdpa_iotlb_batch_begin_once(v);
> ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
> if (ret) {
> @@ -783,33 +809,70 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev
*dev,
> /**
> * Unmap SVQ area in the device
> */
> -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> - hwaddr size)
> +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
> + const DMAMap *needle)
> {
> + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree,
needle);
> + hwaddr size;
> int r;
>
> - size = ROUND_UP(size, qemu_real_host_page_size);
> - r = vhost_vdpa_dma_unmap(v, iova, size);
> + if (unlikely(!result)) {
> + error_report("Unable to find SVQ address to unmap");
> + return false;
> + }
> +
> + size = ROUND_UP(result->size, qemu_real_host_page_size);
> + r = vhost_vdpa_dma_unmap(v, result->iova, size);
> return r == 0;
> }
>
> static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> const VhostShadowVirtqueue *svq)
> {
> + DMAMap needle;
> struct vhost_vdpa *v = dev->opaque;
> struct vhost_vring_addr svq_addr;
> - size_t device_size = vhost_svq_device_area_size(svq);
> - size_t driver_size = vhost_svq_driver_area_size(svq);
> bool ok;
>
> vhost_svq_get_vring_addr(svq, &svq_addr);
>
> - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr,
driver_size);
> + needle = (DMAMap) {
> + .translated_addr = svq_addr.desc_user_addr,
> + };
> + ok = vhost_vdpa_svq_unmap_ring(v, &needle);
> if (unlikely(!ok)) {
> return false;
> }
>
> - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr,
device_size);
> + needle = (DMAMap) {
> + .translated_addr = svq_addr.used_user_addr,
> + };
> + return vhost_vdpa_svq_unmap_ring(v, &needle);
> +}
> +
> +/**
> + * Map SVQ area in the device
> + *
> + * @v Vhost-vdpa device
> + * @needle The area to search iova
> + * @readonly Permissions of the area
> + */
> +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, const DMAMap
*needle,
> + bool readonly)
> +{
> + hwaddr off;
> + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree,
needle);
> + int r;
> +
> + if (unlikely(!result)) {
> + error_report("Can't locate SVQ ring");
> + return false;
> + }
> +
> + off = needle->translated_addr - result->translated_addr;
> + r = vhost_vdpa_dma_map(v, result->iova + off, needle->size,
> + (void *)needle->translated_addr, readonly);
> + return r == 0;
> }
>
> /**
> @@ -821,23 +884,29 @@ static bool vhost_vdpa_svq_unmap_rings(struct
vhost_dev *dev,
> static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
> const VhostShadowVirtqueue *svq)
> {
> + DMAMap needle;
> struct vhost_vdpa *v = dev->opaque;
> struct vhost_vring_addr svq_addr;
> size_t device_size = vhost_svq_device_area_size(svq);
> size_t driver_size = vhost_svq_driver_area_size(svq);
> - int r;
> + bool ok;
>
> vhost_svq_get_vring_addr(svq, &svq_addr);
>
> - r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
> - (void *)svq_addr.desc_user_addr, true);
> - if (unlikely(r != 0)) {
> + needle = (DMAMap) {
> + .translated_addr = svq_addr.desc_user_addr,
> + .size = driver_size,
> + };
> + ok = vhost_vdpa_svq_map_ring(v, &needle, true);
> + if (unlikely(!ok)) {
> return false;
> }
>
> - r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
> - (void *)svq_addr.used_user_addr, false);
> - return r == 0;
> + needle = (DMAMap) {
> + .translated_addr = svq_addr.used_user_addr,
> + .size = device_size,
> + };
> + return vhost_vdpa_svq_map_ring(v, &needle, false);
> }
>
> static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> @@ -1006,6 +1075,23 @@ static int vhost_vdpa_set_owner(struct vhost_dev
*dev)
> return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
> }
>
> +static bool vhost_vdpa_svq_get_vq_region(struct vhost_vdpa *v,
> + unsigned long long addr,
> + uint64_t *iova_addr)
> +{
> + const DMAMap needle = {
> + .translated_addr = addr,
> + };
> + const DMAMap *translation = vhost_iova_tree_find_iova(v->iova_tree,
> + &needle);
> + if (!translation) {
> + return false;
> + }
> +
> + *iova_addr = translation->iova + (addr -
translation->translated_addr);
> + return true;
> +}
> +
> static void vhost_vdpa_vq_get_guest_addr(struct vhost_vring_addr *addr,
> struct vhost_virtqueue *vq)
> {
> @@ -1023,10 +1109,23 @@ static int vhost_vdpa_vq_get_addr(struct vhost_dev
*dev,
> assert(dev->vhost_ops->backend_type ==
VHOST_BACKEND_TYPE_VDPA);
>
> if (v->shadow_vqs_enabled) {
> + struct vhost_vring_addr svq_addr;
> int idx = vhost_vdpa_get_vq_index(dev, addr->index);
> VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
idx);
>
> - vhost_svq_get_vring_addr(svq, addr);
> + vhost_svq_get_vring_addr(svq, &svq_addr);
> + if (!vhost_vdpa_svq_get_vq_region(v, svq_addr.desc_user_addr,
> + &addr->desc_user_addr)) {
> + return -1;
> + }
> + if (!vhost_vdpa_svq_get_vq_region(v, svq_addr.avail_user_addr,
> + &addr->avail_user_addr))
{
> + return -1;
> + }
> + if (!vhost_vdpa_svq_get_vq_region(v, svq_addr.used_user_addr,
> + &addr->used_user_addr)) {
> + return -1;
> + }
> } else {
> vhost_vdpa_vq_get_guest_addr(addr, vq);
> }
> @@ -1095,13 +1194,37 @@ static int vhost_vdpa_init_svq(struct vhost_dev
*hdev, struct vhost_vdpa *v,
>
> shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_psvq_free);
> for (unsigned n = 0; n < hdev->nvqs; ++n) {
> - VhostShadowVirtqueue *svq = vhost_svq_new(qsize);
> -
> + DMAMap device_region, driver_region;
> + struct vhost_vring_addr addr;
> + VhostShadowVirtqueue *svq = vhost_svq_new(qsize, v->iova_tree);
> if (unlikely(!svq)) {
> error_setg(errp, "Cannot create svq %u", n);
> return -1;
> }
> - g_ptr_array_add(v->shadow_vqs, svq);
> +
> + vhost_svq_get_vring_addr(svq, &addr);
> + driver_region = (DMAMap) {
> + .translated_addr = (hwaddr)addr.desc_user_addr,
> +
> + /*
> + * DMAMap.size includes the last byte of the range, while
> + * sizeof marks one past it. Subtract one byte to make them match.
> + */
> + .size = vhost_svq_driver_area_size(svq) - 1,
> + .perm = VHOST_ACCESS_RO,
> + };
> + device_region = (DMAMap) {
> + .translated_addr = (hwaddr)addr.used_user_addr,
> + .size = vhost_svq_device_area_size(svq) - 1,
> + .perm = VHOST_ACCESS_RW,
> + };
> +
> + r = vhost_iova_tree_map_alloc(v->iova_tree,
&driver_region);
> + assert(r == IOVA_OK);
Let's fail instead of assert here.
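Something like this could reuse the errp / -1 error path that the svq
allocation above already uses (a sketch, reusing the r and n already in this
function):

            r = vhost_iova_tree_map_alloc(v->iova_tree, &driver_region);
            if (unlikely(r != IOVA_OK)) {
                error_setg(errp, "Cannot allocate IOVA for SVQ %u driver area", n);
                return -1;
            }

            r = vhost_iova_tree_map_alloc(v->iova_tree, &device_region);
            if (unlikely(r != IOVA_OK)) {
                error_setg(errp, "Cannot allocate IOVA for SVQ %u device area", n);
                return -1;
            }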
Thanks
> + r = vhost_iova_tree_map_alloc(v->iova_tree,
&device_region);
> + assert(r == IOVA_OK);
> +
> + g_ptr_array_add(shadow_vqs, svq);
> }
>
> out: