Stefano Garzarella
2022-Dec-14 16:30 UTC
[RFC PATCH 0/6] vdpa_sim: add support for user VA
This series adds support for the use of user virtual addresses in the vDPA simulator devices. The main reason for this change is to lift the pinning of all guest memory, which is especially useful with virtio devices implemented in software. The next step would be to generalize the code in vdpa-sim to allow the implementation of in-kernel software devices. This would be similar to vhost, but using vDPA so we can reuse the same software stack (e.g. in QEMU) for both HW and SW devices. For example, we have never merged vhost-blk, and lately there has been interest. So it would be nice to do it directly with vDPA to reuse the same code in the VMM for both HW and SW vDPA block devices. The main problem (addressed by this series) was due to the pinning of all guest memory, which thus prevented the overcommit of guest memory. There are still some TODOs to be fixed, but I would like to have your feedback on this RFC. Thanks, Stefano Note: this series is based on Linux v6.1 + a couple of fixes (that I needed to run libblkio tests) already posted but not yet merged. Tree available here: https://gitlab.com/sgarzarella/linux/-/tree/vdpa-sim-use-va Stefano Garzarella (6): vdpa: add bind_mm callback vhost-vdpa: use bind_mm device callback vringh: support VA with iotlb vdpa_sim: make devices agnostic for work management vdpa_sim: use kthread worker vdpa_sim: add support for user VA drivers/vdpa/vdpa_sim/vdpa_sim.h | 7 +- include/linux/vdpa.h | 8 + include/linux/vringh.h | 5 +- drivers/vdpa/mlx5/core/resources.c | 3 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 132 +++++++++++++- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 6 +- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 6 +- drivers/vhost/vdpa.c | 22 +++ drivers/vhost/vringh.c | 250 +++++++++++++++++++++------ 10 files changed, 370 insertions(+), 71 deletions(-) -- 2.38.1
This new optional callback is used to bind the device to a specific address space so the vDPA framework can use VA when this callback is implemented. Suggested-by: Jason Wang <jasowang at redhat.com> Signed-off-by: Stefano Garzarella <sgarzare at redhat.com> --- include/linux/vdpa.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 6d0f5e4e82c2..34388e21ef3f 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -282,6 +282,12 @@ struct vdpa_map_file { * @iova: iova to be unmapped * @size: size of the area * Returns integer: success (0) or error (< 0) + * @bind_mm: Bind the device to a specific address space + * so the vDPA framework can use VA when this + * callback is implemented. (optional) + * @vdev: vdpa device + * @mm: address space to bind + * @owner: process that owns the address space * @free: Free resources that belongs to vDPA (optional) * @vdev: vdpa device */ @@ -341,6 +347,8 @@ struct vdpa_config_ops { u64 iova, u64 size); int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group, unsigned int asid); + int (*bind_mm)(struct vdpa_device *vdev, struct mm_struct *mm, + struct task_struct *owner); /* Free device resources */ void (*free)(struct vdpa_device *vdev); -- 2.38.1
Stefano Garzarella
2022-Dec-14 16:30 UTC
[RFC PATCH 2/6] vhost-vdpa: use bind_mm device callback
When the user calls the VHOST_SET_OWNER ioctl and the vDPA device has `use_va` set to true, let's call the bind_mm callback. In this way we can bind the device to the user address space and directly use the user VA. Signed-off-by: Stefano Garzarella <sgarzare at redhat.com> --- drivers/vhost/vdpa.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b08e07fc7d1f..a775d1a52c77 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -219,6 +219,17 @@ static int vhost_vdpa_reset(struct vhost_vdpa *v) return vdpa_reset(vdpa); } +static long vhost_vdpa_bind_mm(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (!vdpa->use_va || !ops->bind_mm) + return 0; + + return ops->bind_mm(vdpa, v->vdev.mm, current); +} + static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp) { struct vdpa_device *vdpa = v->vdpa; @@ -276,6 +287,10 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) ret = vdpa_reset(vdpa); if (ret) return ret; + + ret = vhost_vdpa_bind_mm(v); + if (ret) + return ret; } else vdpa_set_status(vdpa, status); @@ -679,6 +694,13 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, break; default: r = vhost_dev_ioctl(&v->vdev, cmd, argp); + if (!r && cmd == VHOST_SET_OWNER) { + r = vhost_vdpa_bind_mm(v); + if (r) { + vhost_dev_reset_owner(&v->vdev, NULL); + break; + } + } if (r == -ENOIOCTLCMD) r = vhost_vdpa_vring_ioctl(v, cmd, argp); break; -- 2.38.1
vDPA supports the possibility to use user VA in the iotlb messages. So, let's add support for user VA in vringh to use it in the vDPA simulators. Signed-off-by: Stefano Garzarella <sgarzare at redhat.com> --- include/linux/vringh.h | 5 +- drivers/vdpa/mlx5/core/resources.c | 3 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 +- drivers/vhost/vringh.c | 250 +++++++++++++++++++++++------ 5 files changed, 207 insertions(+), 57 deletions(-) diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 212892cf9822..c70962f16b1f 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -32,6 +32,9 @@ struct vringh { /* Can we get away with weak barriers? */ bool weak_barriers; + /* Use user's VA */ + bool use_va; + /* Last available index we saw (ie. where we're up to). */ u16 last_avail_idx; @@ -279,7 +282,7 @@ void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb, spinlock_t *iotlb_lock); int vringh_init_iotlb(struct vringh *vrh, u64 features, - unsigned int num, bool weak_barriers, + unsigned int num, bool weak_barriers, bool use_va, struct vring_desc *desc, struct vring_avail *avail, struct vring_used *used); diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c index 9800f9bec225..e0bab3458b40 100644 --- a/drivers/vdpa/mlx5/core/resources.c +++ b/drivers/vdpa/mlx5/core/resources.c @@ -233,7 +233,8 @@ static int init_ctrl_vq(struct mlx5_vdpa_dev *mvdev) if (!mvdev->cvq.iotlb) return -ENOMEM; - vringh_set_iotlb(&mvdev->cvq.vring, mvdev->cvq.iotlb, &mvdev->cvq.iommu_lock); + vringh_set_iotlb(&mvdev->cvq.vring, mvdev->cvq.iotlb, + &mvdev->cvq.iommu_lock); return 0; } diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 90913365def4..81ba0867e2c8 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2504,7 +2504,7 @@ static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev) if (mvdev->actual_features & 
BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features, - MLX5_CVQ_MAX_ENT, false, + MLX5_CVQ_MAX_ENT, false, false, (struct vring_desc *)(uintptr_t)cvq->desc_addr, (struct vring_avail *)(uintptr_t)cvq->driver_addr, (struct vring_used *)(uintptr_t)cvq->device_addr); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index b20689f8fe89..2e0ee7280aa8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -67,7 +67,7 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; - vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, false, + vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, false, false, (struct vring_desc *)(uintptr_t)vq->desc_addr, (struct vring_avail *) (uintptr_t)vq->driver_addr, @@ -87,7 +87,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim, vq->cb = NULL; vq->private = NULL; vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, - VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL); + VDPASIM_QUEUE_MAX, false, false, NULL, NULL, NULL); vq->vring.notify = NULL; } diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 11f59dd06a74..c1f77dc93482 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -1094,15 +1094,99 @@ EXPORT_SYMBOL(vringh_need_notify_kern); #if IS_REACHABLE(CONFIG_VHOST_IOTLB) -static int iotlb_translate(const struct vringh *vrh, - u64 addr, u64 len, u64 *translated, - struct bio_vec iov[], - int iov_size, u32 perm) +static int iotlb_translate_va(const struct vringh *vrh, + u64 addr, u64 len, u64 *translated, + struct iovec iov[], + int iov_size, u32 perm) { struct vhost_iotlb_map *map; struct vhost_iotlb *iotlb = vrh->iotlb; + u64 s = 0, last = addr + len - 1; + int ret = 0; + + spin_lock(vrh->iotlb_lock); + + while (len > s) { + u64 size; + + if (unlikely(ret >= iov_size)) { + ret = -ENOBUFS; + break; + } + + map 
= vhost_iotlb_itree_first(iotlb, addr, last); + if (!map || map->start > addr) { + ret = -EINVAL; + break; + } else if (!(map->perm & perm)) { + ret = -EPERM; + break; + } + + size = map->size - addr + map->start; + iov[ret].iov_len = min(len - s, size); + iov[ret].iov_base = (void __user *)(unsigned long) + (map->addr + addr - map->start); + s += size; + addr += size; + ++ret; + } + + spin_unlock(vrh->iotlb_lock); + + if (translated) + *translated = min(len, s); + + return ret; +} + +static inline int copy_from_va(const struct vringh *vrh, void *dst, void *src, + u64 len, u64 *translated) +{ + struct iovec iov[16]; + struct iov_iter iter; + int ret; + + ret = iotlb_translate_va(vrh, (u64)(uintptr_t)src, len, translated, iov, + ARRAY_SIZE(iov), VHOST_MAP_RO); + if (ret == -ENOBUFS) + ret = ARRAY_SIZE(iov); + else if (ret < 0) + return ret; + + iov_iter_init(&iter, READ, iov, ret, *translated); + + return copy_from_iter(dst, *translated, &iter); +} + +static inline int copy_to_va(const struct vringh *vrh, void *dst, void *src, + u64 len, u64 *translated) +{ + struct iovec iov[16]; + struct iov_iter iter; + int ret; + + ret = iotlb_translate_va(vrh, (u64)(uintptr_t)dst, len, translated, iov, + ARRAY_SIZE(iov), VHOST_MAP_WO); + if (ret == -ENOBUFS) + ret = ARRAY_SIZE(iov); + else if (ret < 0) + return ret; + + iov_iter_init(&iter, WRITE, iov, ret, *translated); + + return copy_to_iter(src, *translated, &iter); +} + +static int iotlb_translate_pa(const struct vringh *vrh, + u64 addr, u64 len, u64 *translated, + struct bio_vec iov[], + int iov_size, u32 perm) +{ + struct vhost_iotlb_map *map; + struct vhost_iotlb *iotlb = vrh->iotlb; + u64 s = 0, last = addr + len - 1; int ret = 0; - u64 s = 0; spin_lock(vrh->iotlb_lock); @@ -1114,8 +1198,7 @@ static int iotlb_translate(const struct vringh *vrh, break; } - map = vhost_iotlb_itree_first(iotlb, addr, - addr + len - 1); + map = vhost_iotlb_itree_first(iotlb, addr, last); if (!map || map->start > addr) { ret = -EINVAL; 
break; @@ -1143,28 +1226,61 @@ static int iotlb_translate(const struct vringh *vrh, return ret; } +static inline int copy_from_pa(const struct vringh *vrh, void *dst, void *src, + u64 len, u64 *translated) +{ + struct bio_vec iov[16]; + struct iov_iter iter; + int ret; + + ret = iotlb_translate_pa(vrh, (u64)(uintptr_t)src, len, translated, iov, + ARRAY_SIZE(iov), VHOST_MAP_RO); + if (ret == -ENOBUFS) + ret = ARRAY_SIZE(iov); + else if (ret < 0) + return ret; + + iov_iter_bvec(&iter, READ, iov, ret, *translated); + + return copy_from_iter(dst, *translated, &iter); +} + +static inline int copy_to_pa(const struct vringh *vrh, void *dst, void *src, + u64 len, u64 *translated) +{ + struct bio_vec iov[16]; + struct iov_iter iter; + int ret; + + ret = iotlb_translate_pa(vrh, (u64)(uintptr_t)dst, len, translated, iov, + ARRAY_SIZE(iov), VHOST_MAP_WO); + if (ret == -ENOBUFS) + ret = ARRAY_SIZE(iov); + else if (ret < 0) + return ret; + + iov_iter_bvec(&iter, WRITE, iov, ret, *translated); + + return copy_to_iter(src, *translated, &iter); +} + static inline int copy_from_iotlb(const struct vringh *vrh, void *dst, void *src, size_t len) { u64 total_translated = 0; while (total_translated < len) { - struct bio_vec iov[16]; - struct iov_iter iter; u64 translated; int ret; - ret = iotlb_translate(vrh, (u64)(uintptr_t)src, - len - total_translated, &translated, - iov, ARRAY_SIZE(iov), VHOST_MAP_RO); - if (ret == -ENOBUFS) - ret = ARRAY_SIZE(iov); - else if (ret < 0) - return ret; - - iov_iter_bvec(&iter, READ, iov, ret, translated); + if (vrh->use_va) { + ret = copy_from_va(vrh, dst, src, + len - total_translated, &translated); + } else { + ret = copy_from_pa(vrh, dst, src, + len - total_translated, &translated); + } - ret = copy_from_iter(dst, translated, &iter); if (ret < 0) return ret; @@ -1182,22 +1298,17 @@ static inline int copy_to_iotlb(const struct vringh *vrh, void *dst, u64 total_translated = 0; while (total_translated < len) { - struct bio_vec iov[16]; - struct iov_iter 
iter; u64 translated; int ret; - ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, - len - total_translated, &translated, - iov, ARRAY_SIZE(iov), VHOST_MAP_WO); - if (ret == -ENOBUFS) - ret = ARRAY_SIZE(iov); - else if (ret < 0) - return ret; - - iov_iter_bvec(&iter, WRITE, iov, ret, translated); + if (vrh->use_va) { + ret = copy_to_va(vrh, dst, src, + len - total_translated, &translated); + } else { + ret = copy_to_pa(vrh, dst, src, + len - total_translated, &translated); + } - ret = copy_to_iter(src, translated, &iter); if (ret < 0) return ret; @@ -1212,20 +1323,36 @@ static inline int copy_to_iotlb(const struct vringh *vrh, void *dst, static inline int getu16_iotlb(const struct vringh *vrh, u16 *val, const __virtio16 *p) { - struct bio_vec iov; - void *kaddr, *from; int ret; /* Atomic read is needed for getu16 */ - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, - &iov, 1, VHOST_MAP_RO); - if (ret < 0) - return ret; + if (vrh->use_va) { + struct iovec iov; + + ret = iotlb_translate_va(vrh, (u64)(uintptr_t)p, sizeof(*p), + NULL, &iov, 1, VHOST_MAP_RO); + if (ret < 0) + return ret; - kaddr = kmap_atomic(iov.bv_page); - from = kaddr + iov.bv_offset; - *val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from)); - kunmap_atomic(kaddr); + ret = __get_user(*val, (__virtio16 *)iov.iov_base); + if (ret) + return ret; + + *val = vringh16_to_cpu(vrh, *val); + } else { + struct bio_vec iov; + void *kaddr, *from; + + ret = iotlb_translate_pa(vrh, (u64)(uintptr_t)p, sizeof(*p), + NULL, &iov, 1, VHOST_MAP_RO); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(iov.bv_page); + from = kaddr + iov.bv_offset; + *val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from)); + kunmap_atomic(kaddr); + } return 0; } @@ -1233,20 +1360,36 @@ static inline int getu16_iotlb(const struct vringh *vrh, static inline int putu16_iotlb(const struct vringh *vrh, __virtio16 *p, u16 val) { - struct bio_vec iov; - void *kaddr, *to; int ret; /* Atomic write is needed for putu16 */ - 
ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, - &iov, 1, VHOST_MAP_WO); - if (ret < 0) - return ret; + if (vrh->use_va) { + struct iovec iov; - kaddr = kmap_atomic(iov.bv_page); - to = kaddr + iov.bv_offset; - WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val)); - kunmap_atomic(kaddr); + ret = iotlb_translate_va(vrh, (u64)(uintptr_t)p, sizeof(*p), + NULL, &iov, 1, VHOST_MAP_RO); + if (ret < 0) + return ret; + + val = cpu_to_vringh16(vrh, val); + + ret = __put_user(val, (__virtio16 *)iov.iov_base); + if (ret) + return ret; + } else { + struct bio_vec iov; + void *kaddr, *to; + + ret = iotlb_translate_pa(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, + &iov, 1, VHOST_MAP_WO); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(iov.bv_page); + to = kaddr + iov.bv_offset; + WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val)); + kunmap_atomic(kaddr); + } return 0; } @@ -1308,6 +1451,7 @@ static inline int putused_iotlb(const struct vringh *vrh, * @features: the feature bits for this ring. * @num: the number of elements. * @weak_barriers: true if we only need memory barriers, not I/O. + * @use_va: true if IOTLB contains user VA * @desc: the userpace descriptor pointer. * @avail: the userpace avail pointer. * @used: the userpace used pointer. @@ -1315,11 +1459,13 @@ static inline int putused_iotlb(const struct vringh *vrh, * Returns an error if num is invalid. */ int vringh_init_iotlb(struct vringh *vrh, u64 features, - unsigned int num, bool weak_barriers, + unsigned int num, bool weak_barriers, bool use_va, struct vring_desc *desc, struct vring_avail *avail, struct vring_used *used) { + vrh->use_va = use_va; + return vringh_init_kern(vrh, features, num, weak_barriers, desc, avail, used); } -- 2.38.1
Stefano Garzarella
2022-Dec-14 16:30 UTC
[RFC PATCH 4/6] vdpa_sim: make devices agnostic for work management
Let's move work management inside the vdpa_sim core. This way we can easily change how we manage the works, without having to change the devices each time. Signed-off-by: Stefano Garzarella <sgarzare at redhat.com> --- drivers/vdpa/vdpa_sim/vdpa_sim.h | 3 ++- drivers/vdpa/vdpa_sim/vdpa_sim.c | 17 +++++++++++++++-- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 6 ++---- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 6 ++---- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 0e78737dcc16..7e6dd366856f 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -44,7 +44,7 @@ struct vdpasim_dev_attr { u32 ngroups; u32 nas; - work_func_t work_fn; + void (*work_fn)(struct vdpasim *vdpasim); void (*get_config)(struct vdpasim *vdpasim, void *config); void (*set_config)(struct vdpasim *vdpasim, const void *config); }; @@ -73,6 +73,7 @@ struct vdpasim { struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr, const struct vdpa_dev_set_config *config); +void vdpasim_schedule_work(struct vdpasim *vdpasim); /* TODO: cross-endian support */ static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 2e0ee7280aa8..9bde33e38e27 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -245,6 +245,13 @@ static const struct dma_map_ops vdpasim_dma_ops = { static const struct vdpa_config_ops vdpasim_config_ops; static const struct vdpa_config_ops vdpasim_batch_config_ops; +static void vdpasim_work_fn(struct work_struct *work) +{ + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + + vdpasim->dev_attr.work_fn(vdpasim); +} + struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, const struct vdpa_dev_set_config *config) { @@ -275,7 +282,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, } vdpasim->dev_attr = 
*dev_attr; - INIT_WORK(&vdpasim->work, dev_attr->work_fn); + INIT_WORK(&vdpasim->work, vdpasim_work_fn); spin_lock_init(&vdpasim->lock); spin_lock_init(&vdpasim->iommu_lock); @@ -329,6 +336,12 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, } EXPORT_SYMBOL_GPL(vdpasim_create); +void vdpasim_schedule_work(struct vdpasim *vdpasim) +{ + schedule_work(&vdpasim->work); +} +EXPORT_SYMBOL_GPL(vdpasim_schedule_work); + static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx, u64 desc_area, u64 driver_area, u64 device_area) @@ -357,7 +370,7 @@ static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx) struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; if (vq->ready) - schedule_work(&vdpasim->work); + vdpasim_schedule_work(vdpasim); } static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index c6db1a1baf76..ae2309411acd 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/blkdev.h> #include <linux/vringh.h> #include <linux/vdpa.h> @@ -286,9 +285,8 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, return handled; } -static void vdpasim_blk_work(struct work_struct *work) +static void vdpasim_blk_work(struct vdpasim *vdpasim) { - struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); bool reschedule = false; int i; @@ -326,7 +324,7 @@ static void vdpasim_blk_work(struct work_struct *work) spin_unlock(&vdpasim->lock); if (reschedule) - schedule_work(&vdpasim->work); + vdpasim_schedule_work(vdpasim); } static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index c3cb225ea469..a209df365158 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c 
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/etherdevice.h> #include <linux/vringh.h> #include <linux/vdpa.h> @@ -143,9 +142,8 @@ static void vdpasim_handle_cvq(struct vdpasim *vdpasim) } } -static void vdpasim_net_work(struct work_struct *work) +static void vdpasim_net_work(struct vdpasim *vdpasim) { - struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; ssize_t read, write; @@ -196,7 +194,7 @@ static void vdpasim_net_work(struct work_struct *work) vdpasim_net_complete(rxq, write); if (++pkts > 4) { - schedule_work(&vdpasim->work); + vdpasim_schedule_work(vdpasim); goto out; } } -- 2.38.1
Let's use our own kthread to run device jobs. This allows us more flexibility, especially we can attach the kthread to the user address space when vDPA uses user's VA. Signed-off-by: Stefano Garzarella <sgarzare at redhat.com> --- drivers/vdpa/vdpa_sim/vdpa_sim.h | 3 ++- drivers/vdpa/vdpa_sim/vdpa_sim.c | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 7e6dd366856f..07ef53ea375e 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -53,7 +53,8 @@ struct vdpasim_dev_attr { struct vdpasim { struct vdpa_device vdpa; struct vdpasim_virtqueue *vqs; - struct work_struct work; + struct kthread_worker *worker; + struct kthread_work work; struct vdpasim_dev_attr dev_attr; /* spinlock to synchronize virtqueue state */ spinlock_t lock; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 9bde33e38e27..36a1d2e0a6ba 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -11,8 +11,8 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> +#include <linux/kthread.h> #include <linux/slab.h> -#include <linux/sched.h> #include <linux/dma-map-ops.h> #include <linux/vringh.h> #include <linux/vdpa.h> @@ -245,7 +245,7 @@ static const struct dma_map_ops vdpasim_dma_ops = { static const struct vdpa_config_ops vdpasim_config_ops; static const struct vdpa_config_ops vdpasim_batch_config_ops; -static void vdpasim_work_fn(struct work_struct *work) +static void vdpasim_work_fn(struct kthread_work *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); @@ -282,7 +282,13 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, } vdpasim->dev_attr = *dev_attr; - INIT_WORK(&vdpasim->work, vdpasim_work_fn); + + kthread_init_work(&vdpasim->work, vdpasim_work_fn); + vdpasim->worker = kthread_create_worker(0, "vDPA sim worker: %s", + 
dev_attr->name); + if (IS_ERR(vdpasim->worker)) + goto err_iommu; + spin_lock_init(&vdpasim->lock); spin_lock_init(&vdpasim->iommu_lock); @@ -338,7 +344,7 @@ EXPORT_SYMBOL_GPL(vdpasim_create); void vdpasim_schedule_work(struct vdpasim *vdpasim) { - schedule_work(&vdpasim->work); + kthread_queue_work(vdpasim->worker, &vdpasim->work); } EXPORT_SYMBOL_GPL(vdpasim_schedule_work); @@ -689,7 +695,8 @@ static void vdpasim_free(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); int i; - cancel_work_sync(&vdpasim->work); + kthread_cancel_work_sync(&vdpasim->work); + kthread_destroy_worker(vdpasim->worker); for (i = 0; i < vdpasim->dev_attr.nvqs; i++) { vringh_kiov_cleanup(&vdpasim->vqs[i].out_iov); -- 2.38.1
Stefano Garzarella
2022-Dec-14 16:30 UTC
[RFC PATCH 6/6] vdpa_sim: add support for user VA
The new "use_va" module parameter (default: false) is used in vdpa_alloc_device() to inform the vDPA framework that the device supports VA. vringh is initialized to use VA only when "use_va" is true and the user's mm has been bound. So, only when the bus supports user VA (e.g. vhost-vdpa). vdpasim_mm_work_fn work is used to attach the kthread to the user address space when the .bind_mm callback is invoked, and to detach it when the device is reset. Signed-off-by: Stefano Garzarella <sgarzare at redhat.com> --- drivers/vdpa/vdpa_sim/vdpa_sim.h | 1 + drivers/vdpa/vdpa_sim/vdpa_sim.c | 104 ++++++++++++++++++++++++++++++- 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 07ef53ea375e..1b010e5c0445 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -55,6 +55,7 @@ struct vdpasim { struct vdpasim_virtqueue *vqs; struct kthread_worker *worker; struct kthread_work work; + struct mm_struct *mm_bound; struct vdpasim_dev_attr dev_attr; /* spinlock to synchronize virtqueue state */ spinlock_t lock; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 36a1d2e0a6ba..6e07cedef30c 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -36,10 +36,90 @@ module_param(max_iotlb_entries, int, 0444); MODULE_PARM_DESC(max_iotlb_entries, "Maximum number of iotlb entries for each address space. 0 means unlimited. 
(default: 2048)"); +static bool use_va; +module_param(use_va, bool, 0444); +MODULE_PARM_DESC(use_va, "Enable the device's ability to use VA"); + #define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_MAX 256 #define VDPASIM_VENDOR_ID 0 +struct vdpasim_mm_work { + struct kthread_work work; + struct task_struct *owner; + struct mm_struct *mm; + bool bind; + int ret; +}; + +static void vdpasim_mm_work_fn(struct kthread_work *work) +{ + struct vdpasim_mm_work *mm_work + container_of(work, struct vdpasim_mm_work, work); + + mm_work->ret = 0; + + if (mm_work->bind) { + kthread_use_mm(mm_work->mm); +#if 0 + if (mm_work->owner) + mm_work->ret = cgroup_attach_task_all(mm_work->owner, + current); +#endif + } else { +#if 0 + //TODO: check it + cgroup_release(current); +#endif + kthread_unuse_mm(mm_work->mm); + } +} + +static void vdpasim_worker_queue_mm(struct vdpasim *vdpasim, + struct vdpasim_mm_work *mm_work) +{ + struct kthread_work *work = &mm_work->work; + + kthread_init_work(work, vdpasim_mm_work_fn); + kthread_queue_work(vdpasim->worker, work); + + spin_unlock(&vdpasim->lock); + kthread_flush_work(work); + spin_lock(&vdpasim->lock); +} + +static int vdpasim_worker_bind_mm(struct vdpasim *vdpasim, + struct mm_struct *new_mm, + struct task_struct *owner) +{ + struct vdpasim_mm_work mm_work; + + mm_work.owner = owner; + mm_work.mm = new_mm; + mm_work.bind = true; + + vdpasim_worker_queue_mm(vdpasim, &mm_work); + + if (!mm_work.ret) + vdpasim->mm_bound = new_mm; + + return mm_work.ret; +} + +static void vdpasim_worker_unbind_mm(struct vdpasim *vdpasim) +{ + struct vdpasim_mm_work mm_work; + + if (!vdpasim->mm_bound) + return; + + mm_work.mm = vdpasim->mm_bound; + mm_work.bind = false; + + vdpasim_worker_queue_mm(vdpasim, &mm_work); + + vdpasim->mm_bound = NULL; +} static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa) { return container_of(vdpa, struct vdpasim, vdpa); @@ -66,8 +146,10 @@ static void vdpasim_vq_notify(struct vringh *vring) static void 
vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; + bool va_enabled = use_va && vdpasim->mm_bound; - vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, false, false, + vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, false, + va_enabled, (struct vring_desc *)(uintptr_t)vq->desc_addr, (struct vring_avail *) (uintptr_t)vq->driver_addr, @@ -96,6 +178,9 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim) { int i; + //TODO: should we cancel the works? + vdpasim_worker_unbind_mm(vdpasim); + spin_lock(&vdpasim->iommu_lock); for (i = 0; i < vdpasim->dev_attr.nvqs; i++) { @@ -275,7 +360,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, dev_attr->ngroups, dev_attr->nas, - dev_attr->name, false); + dev_attr->name, use_va); if (IS_ERR(vdpasim)) { ret = PTR_ERR(vdpasim); goto err_alloc; @@ -657,6 +742,19 @@ static int vdpasim_set_map(struct vdpa_device *vdpa, unsigned int asid, return ret; } +static int vdpasim_bind_mm(struct vdpa_device *vdpa, struct mm_struct *mm, + struct task_struct *owner) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + int ret; + + spin_lock(&vdpasim->lock); + ret = vdpasim_worker_bind_mm(vdpasim, mm, owner); + spin_unlock(&vdpasim->lock); + + return ret; +} + static int vdpasim_dma_map(struct vdpa_device *vdpa, unsigned int asid, u64 iova, u64 size, u64 pa, u32 perm, void *opaque) @@ -744,6 +842,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .set_group_asid = vdpasim_set_group_asid, .dma_map = vdpasim_dma_map, .dma_unmap = vdpasim_dma_unmap, + .bind_mm = vdpasim_bind_mm, .free = vdpasim_free, }; @@ -776,6 +875,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_iova_range = vdpasim_get_iova_range, .set_group_asid = vdpasim_set_group_asid, .set_map = vdpasim_set_map, + .bind_mm = vdpasim_bind_mm, .free = vdpasim_free, }; -- 2.38.1