Michael S. Tsirkin
2022-Jun-29 09:54 UTC
[PATCH 5/6] vduse: Support registering userspace memory for IOTLB
On Wed, Jun 29, 2022 at 05:26:04PM +0800, Yongji Xie wrote:
> On Wed, Jun 29, 2022 at 4:43 PM Michael S. Tsirkin <mst at redhat.com> wrote:
> >
> > On Wed, Jun 29, 2022 at 04:25:40PM +0800, Xie Yongji wrote:
> > > Introduce two ioctls: VDUSE_IOTLB_REG_UMEM and
> > > VDUSE_IOTLB_DEREG_UMEM to support registering
> > > and de-registering userspace memory for IOTLB
> > > in virtio-vdpa case.
> > >
> > > Now it only supports registering userspace memory
> > > for IOTLB as bounce buffer.
> > >
> > > Signed-off-by: Xie Yongji <xieyongji at bytedance.com>
> > > ---
> > >  drivers/vdpa/vdpa_user/vduse_dev.c | 138 +++++++++++++++++++++++++++++
> > >  include/uapi/linux/vduse.h         |  28 ++++++
> > >  2 files changed, 166 insertions(+)
> > >
> > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > index c47a5d9765cf..7b2ea7612da9 100644
> > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > @@ -21,6 +21,7 @@
> > >  #include <linux/uio.h>
> > >  #include <linux/vdpa.h>
> > >  #include <linux/nospec.h>
> > > +#include <linux/sched/mm.h>
> > >  #include <uapi/linux/vduse.h>
> > >  #include <uapi/linux/vdpa.h>
> > >  #include <uapi/linux/virtio_config.h>
> > > @@ -64,6 +65,13 @@ struct vduse_vdpa {
> > >  	struct vduse_dev *dev;
> > >  };
> > >
> > > +struct vduse_iotlb_mem {
> > > +	unsigned long iova;
> > > +	unsigned long npages;
> > > +	struct page **pages;
> > > +	struct mm_struct *mm;
> > > +};
> > > +
> > >  struct vduse_dev {
> > >  	struct vduse_vdpa *vdev;
> > >  	struct device *dev;
> > > @@ -95,6 +103,8 @@ struct vduse_dev {
> > >  	u8 status;
> > >  	u32 vq_num;
> > >  	u32 vq_align;
> > > +	struct vduse_iotlb_mem *iotlb_mem;
> > > +	struct mutex mem_lock;
> > >  };
> > >
> > >  struct vduse_dev_msg {
> > > @@ -917,6 +927,100 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> > >  	return ret;
> > >  }
> > >
> > > +static int vduse_dev_dereg_iotlb_mem(struct vduse_dev *dev,
> > > +				     u64 iova, u64 size)
> > > +{
> > > +	int ret;
> > > +
> > > +	mutex_lock(&dev->mem_lock);
> > > +	ret = -ENOENT;
> > > +	if (!dev->iotlb_mem)
> > > +		goto unlock;
> > > +
> > > +	ret = -EINVAL;
> > > +	if (dev->iotlb_mem->iova != iova || size != dev->domain->bounce_size)
> > > +		goto unlock;
> > > +
> > > +	vduse_domain_remove_user_bounce_pages(dev->domain);
> > > +	unpin_user_pages(dev->iotlb_mem->pages, dev->iotlb_mem->npages);
> >
> > I notice you don't mark the pages dirty. This is going to be a problem.
> >
>
> Thanks for pointing out this, I will use unpin_user_pages_dirty_lock() instead.
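For reference, a rough and untested sketch of that teardown path with the
dirty variant, reusing the names from this patch:

	vduse_domain_remove_user_bounce_pages(dev->domain);
	unpin_user_pages_dirty_lock(dev->iotlb_mem->pages,
				    dev->iotlb_mem->npages, true);

With make_dirty set, unpin_user_pages_dirty_lock() marks each page dirty
before dropping the pin, so writes done through the bounce buffer are not
lost if the pages are later written back or reclaimed.
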
>
> > > +	atomic64_sub(dev->iotlb_mem->npages, &dev->iotlb_mem->mm->pinned_vm);
> > > +	mmdrop(dev->iotlb_mem->mm);
> > > +	vfree(dev->iotlb_mem->pages);
> > > +	kfree(dev->iotlb_mem);
> > > +	dev->iotlb_mem = NULL;
> > > +	ret = 0;
> > > +unlock:
> > > +	mutex_unlock(&dev->mem_lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static int vduse_dev_reg_iotlb_mem(struct vduse_dev *dev,
> > > +				   u64 iova, u64 uaddr, u64 size)
> > > +{
> > > +	struct page **page_list = NULL;
> > > +	struct vduse_iotlb_mem *mem = NULL;
> > > +	long pinned = 0;
> > > +	unsigned long npages, lock_limit;
> > > +	int ret;
> > > +
> > > +	if (size != dev->domain->bounce_size ||
> > > +	    iova != 0 || uaddr & ~PAGE_MASK)
> > > +		return -EINVAL;
> > > +
> > > +	mutex_lock(&dev->mem_lock);
> > > +	ret = -EEXIST;
> > > +	if (dev->iotlb_mem)
> > > +		goto unlock;
> > > +
> > > +	ret = -ENOMEM;
> > > +	npages = size >> PAGE_SHIFT;
> > > +	page_list = vmalloc(array_size(npages,
> > > +				       sizeof(struct page *)));
> >
> > Is this basically trying to do a vmalloc with userspace-controlled size?
> > That's an easy DOS vector.
> >
>
> We already checked the size before. The size must equal to (64MB >>
> PAGE_SHIFT) now.

That's not a small amount. Can this be accounted e.g. through cgroups at least?

> > > +	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> > > +	if (!page_list || !mem)
> > > +		goto unlock;
> > > +
> > > +	mmap_read_lock(current->mm);
> > > +
> > > +	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
> > > +	if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
> > > +		goto out;
> > > +
> > > +	pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
> > > +				page_list, NULL);
> > > +	if (pinned != npages) {
> > > +		ret = pinned < 0 ? pinned : -ENOMEM;
> > > +		goto out;
> > > +	}
> >
> > This is a popular approach but it's problematic if multiple
> > devices try to pin the same page.
>
> Do you mean the data would be corrupted if multiple devices use the
> same page as bounce buffer? This is indeed a problem.

No, I mean you decrement the lock twice. The question is: can two bounce
buffers share a page?

> > Can this happen here?
> >
>
> I think we can't prevent this case now. I will do it in v2.
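Re the cgroup accounting question above: one partial, untested option is to
allocate the page_list array with __GFP_ACCOUNT so that at least that
allocation gets charged to the caller's memory cgroup, e.g.:

	page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
			      GFP_KERNEL_ACCOUNT);

The pinned user memory itself should already have been charged to the
process's memcg when userspace faulted it in; what this does not address is
the pinned_vm double accounting if two devices ever pin the same pages.
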
>
> > > +
> > > +	ret = vduse_domain_add_user_bounce_pages(dev->domain,
> > > +						 page_list, pinned);
> > > +	if (ret)
> > > +		goto out;
> > > +
> > > +	atomic64_add(npages, &current->mm->pinned_vm);
> > > +
> > > +	mem->pages = page_list;
> > > +	mem->npages = pinned;
> > > +	mem->iova = iova;
> > > +	mem->mm = current->mm;
> > > +	mmgrab(current->mm);
> > > +
> > > +	dev->iotlb_mem = mem;
> > > +out:
> > > +	if (ret && pinned > 0)
> > > +		unpin_user_pages(page_list, pinned);
> > > +
> > > +	mmap_read_unlock(current->mm);
> > > +unlock:
> > > +	if (ret) {
> > > +		vfree(page_list);
> > > +		kfree(mem);
> > > +	}
> > > +	mutex_unlock(&dev->mem_lock);
> > > +	return ret;
> > > +}
> > > +
> > >  static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> > >  			    unsigned long arg)
> > >  {
> > > @@ -943,6 +1047,16 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> > >  		if (entry.start > entry.last)
> > >  			break;
> > >
> > > +		if (domain->bounce_map && dev->iotlb_mem) {
> > > +			ret = -EEXIST;
> > > +			if (entry.start >= 0 &&
> > > +			    entry.last < domain->bounce_size)
> > > +				break;
> > > +
> > > +			if (entry.start < domain->bounce_size)
> > > +				entry.start = domain->bounce_size;
> > > +		}
> > > +
> > >  		spin_lock(&domain->iotlb_lock);
> > >  		map = vhost_iotlb_itree_first(domain->iotlb,
> > >  					      entry.start, entry.last);
> > > @@ -1102,6 +1216,28 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> > >  		ret = 0;
> > >  		break;
> > >  	}
> > > +	case VDUSE_IOTLB_REG_UMEM: {
> > > +		struct vduse_iotlb_umem umem;
> > > +
> > > +		ret = -EFAULT;
> > > +		if (copy_from_user(&umem, argp, sizeof(umem)))
> > > +			break;
> > > +
> > > +		ret = vduse_dev_reg_iotlb_mem(dev, umem.iova,
> > > +					      umem.uaddr, umem.size);
> > > +		break;
> > > +	}
> > > +	case VDUSE_IOTLB_DEREG_UMEM: {
> > > +		struct vduse_iotlb_umem umem;
> > > +
> > > +		ret = -EFAULT;
> > > +		if (copy_from_user(&umem, argp, sizeof(umem)))
> > > +			break;
> > > +
> > > +		ret = vduse_dev_dereg_iotlb_mem(dev, umem.iova,
> > > +						umem.size);
> > > +		break;
> > > +	}
> > >  	default:
> > >  		ret = -ENOIOCTLCMD;
> > >  		break;
> > > @@ -1114,6 +1250,7 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
> > >  {
> > >  	struct vduse_dev *dev = file->private_data;
> > >
> > > +	vduse_dev_dereg_iotlb_mem(dev, 0, dev->domain->bounce_size);
> > >  	spin_lock(&dev->msg_lock);
> > >  	/* Make sure the inflight messages can processed after reconncection */
> > >  	list_splice_init(&dev->recv_list, &dev->send_list);
> > > @@ -1176,6 +1313,7 @@ static struct vduse_dev *vduse_dev_create(void)
> > >  		return NULL;
> > >
> > >  	mutex_init(&dev->lock);
> > > +	mutex_init(&dev->mem_lock);
> > >  	spin_lock_init(&dev->msg_lock);
> > >  	INIT_LIST_HEAD(&dev->send_list);
> > >  	INIT_LIST_HEAD(&dev->recv_list);
> > > diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
> > > index c201b7a77c2c..1b17391e228f 100644
> > > --- a/include/uapi/linux/vduse.h
> > > +++ b/include/uapi/linux/vduse.h
> > > @@ -227,6 +227,34 @@ struct vduse_iotlb_info {
> > >  /* Get IOTLB information, e.g. bounce buffer size */
> > >  #define VDUSE_IOTLB_GET_INFO	_IOR(VDUSE_BASE, 0x18, struct vduse_iotlb_info)
> > >
> > > +/**
> > > + * struct vduse_iotlb_umem - userspace memory configuration
> > > + * @uaddr: start address of userspace memory, it must be aligned to page size
> > > + * @iova: IOVA of userspace memory, it must be equal to bounce iova returned
> > > + *	  by VDUSE_IOTLB_GET_INFO now
> > > + * @size: size of userspace memory, it must be equal to bounce size returned
> > > + *	  by VDUSE_IOTLB_GET_INFO now
> > > + * @reserved: for future use, needs to be initialized to zero
> >
> > You should check that it's 0 in that case, otherwise userspace
> > will conveniently forget.
> >
>
> OK, I will fix it.
>
> Thanks,
> Yongji
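For reference, a minimal form of that check (untested, and assuming the
reserved field in struct vduse_iotlb_umem is a fixed-size array) could go
right after the copy_from_user() in both UMEM ioctl handlers:

	case VDUSE_IOTLB_REG_UMEM: {
		struct vduse_iotlb_umem umem;

		ret = -EFAULT;
		if (copy_from_user(&umem, argp, sizeof(umem)))
			break;

		ret = -EINVAL;
		/* Reject requests that leave the reserved field non-zero */
		if (memchr_inv(umem.reserved, 0, sizeof(umem.reserved)))
			break;

		ret = vduse_dev_reg_iotlb_mem(dev, umem.iova,
					      umem.uaddr, umem.size);
		break;
	}

That way the field can be given a meaning later without breaking existing
userspace.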