Jason Wang
2021-Jun-22 07:49 UTC
[PATCH v8 09/10] vduse: Introduce VDUSE - vDPA Device in Userspace
On 2021/6/22 3:22 PM, Yongji Xie wrote:>> We need to find a way to propagate the error to the userspace. >> >> E.g if we want to stop the device, we will delay the status reset until >> we get a response from the userspace? >> > I didn't get how to delay the status reset. And should it be a DoS > that we want to fix if the userspace doesn't give a response forever?You're right. So let's make set_status() able to fail first, then propagate its failure via VHOST_VDPA_SET_STATUS.> >>>>> + } >>>>> +} >>>>> + >>>>> +static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa) >>>>> +{ >>>>> + struct vduse_dev *dev = vdpa_to_vduse(vdpa); >>>>> + >>>>> + return dev->config_size; >>>>> +} >>>>> + >>>>> +static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset, >>>>> + void *buf, unsigned int len) >>>>> +{ >>>>> + struct vduse_dev *dev = vdpa_to_vduse(vdpa); >>>>> + >>>>> + memcpy(buf, dev->config + offset, len); >>>>> +} >>>>> + >>>>> +static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset, >>>>> + const void *buf, unsigned int len) >>>>> +{ >>>>> + /* Now we only support read-only configuration space */ >>>>> +} >>>>> + >>>>> +static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) >>>>> +{ >>>>> + struct vduse_dev *dev = vdpa_to_vduse(vdpa); >>>>> + >>>>> + return dev->generation; >>>>> +} >>>>> + >>>>> +static int vduse_vdpa_set_map(struct vdpa_device *vdpa, >>>>> + struct vhost_iotlb *iotlb) >>>>> +{ >>>>> + struct vduse_dev *dev = vdpa_to_vduse(vdpa); >>>>> + int ret; >>>>> + >>>>> + ret = vduse_domain_set_map(dev->domain, iotlb); >>>>> + if (ret) >>>>> + return ret; >>>>> + >>>>> + ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX); >>>>> + if (ret) { >>>>> + vduse_domain_clear_map(dev->domain, iotlb); >>>>> + return ret; >>>>> + } >>>>> + >>>>> + return 0; >>>>> +} >>>>> + >>>>> +static void vduse_vdpa_free(struct vdpa_device *vdpa) >>>>> +{ >>>>> + struct vduse_dev *dev = vdpa_to_vduse(vdpa); >>>>> + >>>>> + dev->vdev = 
NULL; >>>>> +} >>>>> + >>>>> +static const struct vdpa_config_ops vduse_vdpa_config_ops = { >>>>> + .set_vq_address = vduse_vdpa_set_vq_address, >>>>> + .kick_vq = vduse_vdpa_kick_vq, >>>>> + .set_vq_cb = vduse_vdpa_set_vq_cb, >>>>> + .set_vq_num = vduse_vdpa_set_vq_num, >>>>> + .set_vq_ready = vduse_vdpa_set_vq_ready, >>>>> + .get_vq_ready = vduse_vdpa_get_vq_ready, >>>>> + .set_vq_state = vduse_vdpa_set_vq_state, >>>>> + .get_vq_state = vduse_vdpa_get_vq_state, >>>>> + .get_vq_align = vduse_vdpa_get_vq_align, >>>>> + .get_features = vduse_vdpa_get_features, >>>>> + .set_features = vduse_vdpa_set_features, >>>>> + .set_config_cb = vduse_vdpa_set_config_cb, >>>>> + .get_vq_num_max = vduse_vdpa_get_vq_num_max, >>>>> + .get_device_id = vduse_vdpa_get_device_id, >>>>> + .get_vendor_id = vduse_vdpa_get_vendor_id, >>>>> + .get_status = vduse_vdpa_get_status, >>>>> + .set_status = vduse_vdpa_set_status, >>>>> + .get_config_size = vduse_vdpa_get_config_size, >>>>> + .get_config = vduse_vdpa_get_config, >>>>> + .set_config = vduse_vdpa_set_config, >>>>> + .get_generation = vduse_vdpa_get_generation, >>>>> + .set_map = vduse_vdpa_set_map, >>>>> + .free = vduse_vdpa_free, >>>>> +}; >>>>> + >>>>> +static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page, >>>>> + unsigned long offset, size_t size, >>>>> + enum dma_data_direction dir, >>>>> + unsigned long attrs) >>>>> +{ >>>>> + struct vduse_dev *vdev = dev_to_vduse(dev); >>>>> + struct vduse_iova_domain *domain = vdev->domain; >>>>> + >>>>> + return vduse_domain_map_page(domain, page, offset, size, dir, attrs); >>>>> +} >>>>> + >>>>> +static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr, >>>>> + size_t size, enum dma_data_direction dir, >>>>> + unsigned long attrs) >>>>> +{ >>>>> + struct vduse_dev *vdev = dev_to_vduse(dev); >>>>> + struct vduse_iova_domain *domain = vdev->domain; >>>>> + >>>>> + return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); >>>>> +} >>>>> + >>>>> 
+static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, >>>>> + dma_addr_t *dma_addr, gfp_t flag, >>>>> + unsigned long attrs) >>>>> +{ >>>>> + struct vduse_dev *vdev = dev_to_vduse(dev); >>>>> + struct vduse_iova_domain *domain = vdev->domain; >>>>> + unsigned long iova; >>>>> + void *addr; >>>>> + >>>>> + *dma_addr = DMA_MAPPING_ERROR; >>>>> + addr = vduse_domain_alloc_coherent(domain, size, >>>>> + (dma_addr_t *)&iova, flag, attrs); >>>>> + if (!addr) >>>>> + return NULL; >>>>> + >>>>> + *dma_addr = (dma_addr_t)iova; >>>>> + >>>>> + return addr; >>>>> +} >>>>> + >>>>> +static void vduse_dev_free_coherent(struct device *dev, size_t size, >>>>> + void *vaddr, dma_addr_t dma_addr, >>>>> + unsigned long attrs) >>>>> +{ >>>>> + struct vduse_dev *vdev = dev_to_vduse(dev); >>>>> + struct vduse_iova_domain *domain = vdev->domain; >>>>> + >>>>> + vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); >>>>> +} >>>>> + >>>>> +static size_t vduse_dev_max_mapping_size(struct device *dev) >>>>> +{ >>>>> + struct vduse_dev *vdev = dev_to_vduse(dev); >>>>> + struct vduse_iova_domain *domain = vdev->domain; >>>>> + >>>>> + return domain->bounce_size; >>>>> +} >>>>> + >>>>> +static const struct dma_map_ops vduse_dev_dma_ops = { >>>>> + .map_page = vduse_dev_map_page, >>>>> + .unmap_page = vduse_dev_unmap_page, >>>>> + .alloc = vduse_dev_alloc_coherent, >>>>> + .free = vduse_dev_free_coherent, >>>>> + .max_mapping_size = vduse_dev_max_mapping_size, >>>>> +}; >>>>> + >>>>> +static unsigned int perm_to_file_flags(u8 perm) >>>>> +{ >>>>> + unsigned int flags = 0; >>>>> + >>>>> + switch (perm) { >>>>> + case VDUSE_ACCESS_WO: >>>>> + flags |= O_WRONLY; >>>>> + break; >>>>> + case VDUSE_ACCESS_RO: >>>>> + flags |= O_RDONLY; >>>>> + break; >>>>> + case VDUSE_ACCESS_RW: >>>>> + flags |= O_RDWR; >>>>> + break; >>>>> + default: >>>>> + WARN(1, "invalidate vhost IOTLB permission\n"); >>>>> + break; >>>>> + } >>>>> + >>>>> + return flags; >>>>> +} >>>>> + >>>>> 
+static int vduse_kickfd_setup(struct vduse_dev *dev, >>>>> + struct vduse_vq_eventfd *eventfd) >>>>> +{ >>>>> + struct eventfd_ctx *ctx = NULL; >>>>> + struct vduse_virtqueue *vq; >>>>> + u32 index; >>>>> + >>>>> + if (eventfd->index >= dev->vq_num) >>>>> + return -EINVAL; >>>>> + >>>>> + index = array_index_nospec(eventfd->index, dev->vq_num); >>>>> + vq = &dev->vqs[index]; >>>>> + if (eventfd->fd >= 0) { >>>>> + ctx = eventfd_ctx_fdget(eventfd->fd); >>>>> + if (IS_ERR(ctx)) >>>>> + return PTR_ERR(ctx); >>>>> + } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN) >>>>> + return 0; >>>>> + >>>>> + spin_lock(&vq->kick_lock); >>>>> + if (vq->kickfd) >>>>> + eventfd_ctx_put(vq->kickfd); >>>>> + vq->kickfd = ctx; >>>>> + if (vq->ready && vq->kicked && vq->kickfd) { >>>>> + eventfd_signal(vq->kickfd, 1); >>>>> + vq->kicked = false; >>>>> + } >>>>> + spin_unlock(&vq->kick_lock); >>>>> + >>>>> + return 0; >>>>> +} >>>>> + >>>>> +static void vduse_dev_irq_inject(struct work_struct *work) >>>>> +{ >>>>> + struct vduse_dev *dev = container_of(work, struct vduse_dev, inject); >>>>> + >>>>> + spin_lock_irq(&dev->irq_lock); >>>>> + if (dev->config_cb.callback) >>>>> + dev->config_cb.callback(dev->config_cb.private); >>>>> + spin_unlock_irq(&dev->irq_lock); >>>>> +} >>>>> + >>>>> +static void vduse_vq_irq_inject(struct work_struct *work) >>>>> +{ >>>>> + struct vduse_virtqueue *vq = container_of(work, >>>>> + struct vduse_virtqueue, inject); >>>>> + >>>>> + spin_lock_irq(&vq->irq_lock); >>>>> + if (vq->ready && vq->cb.callback) >>>>> + vq->cb.callback(vq->cb.private); >>>>> + spin_unlock_irq(&vq->irq_lock); >>>>> +} >>>>> + >>>>> +static long vduse_dev_ioctl(struct file *file, unsigned int cmd, >>>>> + unsigned long arg) >>>>> +{ >>>>> + struct vduse_dev *dev = file->private_data; >>>>> + void __user *argp = (void __user *)arg; >>>>> + int ret; >>>>> + >>>>> + switch (cmd) { >>>>> + case VDUSE_IOTLB_GET_FD: { >>>>> + struct vduse_iotlb_entry entry; >>>>> + struct vhost_iotlb_map 
*map; >>>>> + struct vdpa_map_file *map_file; >>>>> + struct vduse_iova_domain *domain = dev->domain; >>>>> + struct file *f = NULL; >>>>> + >>>>> + ret = -EFAULT; >>>>> + if (copy_from_user(&entry, argp, sizeof(entry))) >>>>> + break; >>>>> + >>>>> + ret = -EINVAL; >>>>> + if (entry.start > entry.last) >>>>> + break; >>>>> + >>>>> + spin_lock(&domain->iotlb_lock); >>>>> + map = vhost_iotlb_itree_first(domain->iotlb, >>>>> + entry.start, entry.last); >>>>> + if (map) { >>>>> + map_file = (struct vdpa_map_file *)map->opaque; >>>>> + f = get_file(map_file->file); >>>>> + entry.offset = map_file->offset; >>>>> + entry.start = map->start; >>>>> + entry.last = map->last; >>>>> + entry.perm = map->perm; >>>>> + } >>>>> + spin_unlock(&domain->iotlb_lock); >>>>> + ret = -EINVAL; >>>>> + if (!f) >>>>> + break; >>>>> + >>>>> + ret = -EFAULT; >>>>> + if (copy_to_user(argp, &entry, sizeof(entry))) { >>>>> + fput(f); >>>>> + break; >>>>> + } >>>>> + ret = receive_fd(f, perm_to_file_flags(entry.perm)); >>>>> + fput(f); >>>>> + break; >>>>> + } >>>>> + case VDUSE_DEV_GET_FEATURES: >>>>> + ret = put_user(dev->features, (u64 __user *)argp); >>>>> + break; >>>>> + case VDUSE_DEV_UPDATE_CONFIG: { >>>>> + struct vduse_config_update config; >>>>> + unsigned long size = offsetof(struct vduse_config_update, >>>>> + buffer); >>>>> + >>>>> + ret = -EFAULT; >>>>> + if (copy_from_user(&config, argp, size)) >>>>> + break; >>>>> + >>>>> + ret = -EINVAL; >>>>> + if (config.length == 0 || >>>>> + config.length > dev->config_size - config.offset) >>>>> + break; >>>>> + >>>>> + ret = -EFAULT; >>>>> + if (copy_from_user(dev->config + config.offset, argp + size, >>>>> + config.length)) >>>>> + break; >>>>> + >>>>> + ret = 0; >>>>> + queue_work(vduse_irq_wq, &dev->inject); >>>> I wonder if it's better to separate config interrupt out of config >>>> update or we need document this. >>>> >>> I have documented it in the docs. 
Looks like a config update should be >>> always followed by a config interrupt. I didn't find a case that uses >>> them separately. >> The uAPI doesn't prevent us from the following scenario: >> >> update_config(mac[0], ..); >> update_config(mac[1], ..); >> >> So it looks to me it's better to separate the config interrupt from the >> config updating. >> > Fine. > >>>>> + break; >>>>> + } >>>>> + case VDUSE_VQ_GET_INFO: { >>>> Do we need to limit this only when DRIVER_OK is set? >>>> >>> Any reason to add this limitation? >> Otherwise the vq is not fully initialized, e.g the desc_addr might not >> be correct. >> > The vq_info->ready can be used to tell userspace whether the vq is > initialized or not.Yes, this will work as well. Thanks> > Thanks, > Yongji >