Mike Christie
2023-Apr-28 16:31 UTC
[PATCH 13/14] vhost: allow userspace to create workers
For vhost-scsi with 3 vqs or more and a workload that tries to use them in parallel like: fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \ --ioengine=libaio --iodepth=128 --numjobs=3 the single vhost worker thread will become a bottlneck and we are stuck at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are used. To better utilize virtqueues and available CPUs, this patch allows userspace to create workers and bind them to vqs. You can have N workers per dev and also share N workers with M vqs on that dev. This patch adds the interface related code and the next patch will hook vhost-scsi into it. The patches do not try to hook net and vsock into the interface because: 1. multiple workers don't seem to help vsock. The problem is that with only 2 virtqueues we never fully use the existing worker when doing bidirectional tests. This seems to match vhost-scsi where we don't see the worker as a bottleneck until 3 virtqueues are used. 2. net already has a way to use multiple workers. Signed-off-by: Mike Christie <michael.christie at oracle.com> --- drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++++++- drivers/vhost/vhost.h | 3 + include/uapi/linux/vhost.h | 33 +++++++ include/uapi/linux/vhost_types.h | 16 ++++ 4 files changed, 196 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4b0b82292379..e8f829f35814 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) return NULL; } +/* Caller must have device mutex */ +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, + struct vhost_worker *worker) +{ + if (vq->worker) + vq->worker->attachment_cnt--; + worker->attachment_cnt++; + vq->worker = worker; +} + +/** + * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command + * @vq: the virtqueue we will set the worker for + * @info: the worker userspace has requested us to use + * + * We only allow userspace to set a virtqueue's worker if it's not active and + * polling is not enabled. We also assume drivers supporting this will not be + * internally queueing works directly or via calls like vhost_dev_flush at + * this time. + * + * Caller must have device and virtqueue mutex. + */ +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq, + struct vhost_vring_worker *info) +{ + unsigned long index = info->worker_id; + struct vhost_dev *dev = vq->dev; + struct vhost_worker *worker; + + if (!dev->use_worker) + return -EINVAL; + + if (vhost_vq_get_backend(vq) || vq->kick) + return -EBUSY; + + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); + if (!worker || worker->id != info->worker_id) + return -ENODEV; + + __vhost_vq_attach_worker(vq, worker); + return 0; +} + +/* Caller must have device mutex */ +static int vhost_new_worker(struct vhost_dev *dev, + struct vhost_worker_state *info) +{ + struct vhost_worker *worker; + + worker = vhost_worker_create(dev); + if (!worker) + return -ENOMEM; + + info->worker_id = worker->id; + return 0; +} + +static int vhost_free_worker(struct vhost_dev *dev, + struct vhost_worker_state *info) +{ + unsigned long index = info->worker_id; + struct vhost_worker *worker; + + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); + if (!worker || worker->id != info->worker_id) + return -ENODEV; + + if (worker->attachment_cnt) + return -EBUSY; + + vhost_worker_destroy(dev, worker); + return 0; +} + static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp, struct vhost_virtqueue **vq, u32 *id) { @@ -651,6 +725,75 @@ static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp, return 0; } +/* Caller must have device mutex */ +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, + void __user *argp) +{ + struct vhost_vring_worker ring_worker; + struct vhost_worker_state state; + struct vhost_virtqueue *vq; + long ret; + u32 idx; + + if (!dev->use_worker) + return -EINVAL; + + if (!vhost_dev_has_owner(dev)) + return -EINVAL; + + switch (ioctl) { + /* dev worker ioctls */ + case VHOST_NEW_WORKER: + ret = vhost_new_worker(dev, &state); + if (!ret && copy_to_user(argp, &state, sizeof(state))) + ret = -EFAULT; + return ret; + case VHOST_FREE_WORKER: + if (copy_from_user(&state, argp, sizeof(state))) + return -EFAULT; + return vhost_free_worker(dev, &state); + /* vring worker ioctls */ + case VHOST_ATTACH_VRING_WORKER: + case VHOST_GET_VRING_WORKER: + break; + default: + return -ENOIOCTLCMD; + } + + ret = vhost_get_vq_from_user(dev, argp, &vq, &idx); + if (ret) + return ret; + + mutex_lock(&vq->mutex); + switch (ioctl) { + case VHOST_ATTACH_VRING_WORKER: + if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) { + ret = -EFAULT; + break; + } + + ret = vhost_vq_attach_worker(vq, &ring_worker); + if (!ret && copy_to_user(argp, &ring_worker, + sizeof(ring_worker))) + ret = -EFAULT; + break; + case VHOST_GET_VRING_WORKER: + ring_worker.index = idx; + ring_worker.worker_id = vq->worker->id; + + if (copy_to_user(argp, &ring_worker, sizeof(ring_worker))) + ret = -EFAULT; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + mutex_unlock(&vq->mutex); + return ret; +} +EXPORT_SYMBOL_GPL(vhost_worker_ioctl); + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { @@ -671,7 +814,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) goto err_worker; for (i = 0; i < dev->nvqs; i++) - dev->vqs[i]->worker = worker; + __vhost_vq_attach_worker(dev->vqs[i], worker); } err = vhost_dev_alloc_iovecs(dev); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 2eea20d54134..bcb33a2f64f0 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -31,6 +31,7 @@ struct vhost_worker { struct llist_head work_list; u64 kcov_handle; u32 id; + int attachment_cnt; }; /* Poll a file (eventfd or socket) */ @@ -187,6 +188,8 @@ void vhost_dev_cleanup(struct vhost_dev *); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, + void __user *argp); bool vhost_vq_access_ok(struct vhost_virtqueue *vq); bool vhost_log_access_ok(struct vhost_dev *); void vhost_clear_msg(struct vhost_dev *dev); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 92e1b700b51c..155710585979 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -45,6 +45,25 @@ #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) /* Specify an eventfd file descriptor to signal on log write. */ #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) +/* By default, a device gets one vhost_worker that its virtqueues share. This + * command allows the owner of the device to create an additional vhost_worker + * for the device. It can later be bound to 1 or more of its virtqueues using + * the VHOST_ATTACH_VRING_WORKER command. + * + * This must be called after VHOST_SET_OWNER and the caller must be the owner + * of the device. The new thread will inherit caller's cgroups and namespaces, + * and will share the caller's memory space. The new thread will also be + * counted against the caller's RLIMIT_NPROC value. + * + * The worker's ID used in other commands will be returned in + * vhost_worker_state. + */ +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state) +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any + * virtqueue. If userspace is not able to call this for workers its created, + * the kernel will free all the device's workers when the device is closed. + */ +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state) /* Ring setup. */ /* Set number of descriptors in ring. This parameter can not @@ -70,6 +89,20 @@ #define VHOST_VRING_BIG_ENDIAN 1 #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's + * virtqueues. This must be done before VHOST_SET_VRING_KICK and the driver + * specific ioctl to activate the virtqueue (VHOST_SCSI_SET_ENDPOINT, + * VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING) has been run. + * + * This will replace the virtqueue's existing worker. If the replaced worker + * is no longer attached to any virtqueues, it can be freed with + * VHOST_FREE_WORKER. + */ +#define VHOST_ATTACH_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x15, \ + struct vhost_vring_worker) +/* Return the vring worker's ID */ +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16, \ + struct vhost_vring_worker) /* The following ioctls use eventfd file descriptors to signal and poll * for events. */ diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index c5690a8992d8..d3aad12ad1fa 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -47,6 +47,22 @@ struct vhost_vring_addr { __u64 log_guest_addr; }; +struct vhost_worker_state { + /* + * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id. + * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker + * to free. + */ + unsigned int worker_id; +}; + +struct vhost_vring_worker { + /* vring index */ + unsigned int index; + /* The id of the vhost_worker returned from VHOST_NEW_WORKER */ + unsigned int worker_id; +}; + /* no alignment requirement */ struct vhost_iotlb_msg { __u64 iova; -- 2.25.1
On Sat, Apr 29, 2023 at 12:32?AM Mike Christie <michael.christie at oracle.com> wrote:> > For vhost-scsi with 3 vqs or more and a workload that tries to use > them in parallel like: > > fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \ > --ioengine=libaio --iodepth=128 --numjobs=3 > > the single vhost worker thread will become a bottlneck and we are stuck > at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are > used. > > To better utilize virtqueues and available CPUs, this patch allows > userspace to create workers and bind them to vqs. You can have N workers > per dev and also share N workers with M vqs on that dev. > > This patch adds the interface related code and the next patch will hook > vhost-scsi into it. The patches do not try to hook net and vsock into > the interface because: > > 1. multiple workers don't seem to help vsock. The problem is that with > only 2 virtqueues we never fully use the existing worker when doing > bidirectional tests. This seems to match vhost-scsi where we don't see > the worker as a bottleneck until 3 virtqueues are used. > > 2. net already has a way to use multiple workers. > > Signed-off-by: Mike Christie <michael.christie at oracle.com> > --- > drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++++++- > drivers/vhost/vhost.h | 3 + > include/uapi/linux/vhost.h | 33 +++++++ > include/uapi/linux/vhost_types.h | 16 ++++ > 4 files changed, 196 insertions(+), 1 deletion(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 4b0b82292379..e8f829f35814 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) > return NULL; > } > > +/* Caller must have device mutex */ > +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, > + struct vhost_worker *worker) > +{ > + if (vq->worker) > + vq->worker->attachment_cnt--; > + worker->attachment_cnt++; > + vq->worker = worker; > +} > + > +/** > + * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command > + * @vq: the virtqueue we will set the worker for > + * @info: the worker userspace has requested us to use > + * > + * We only allow userspace to set a virtqueue's worker if it's not active and > + * polling is not enabled.I wonder if we can mandate this in the code like check the vq backend in vhost_vq_work_queue(). We also assume drivers supporting this will not be> + * internally queueing works directly or via calls like vhost_dev_flush at > + * this time. > + * > + * Caller must have device and virtqueue mutex. > + */ > +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq, > + struct vhost_vring_worker *info) > +{ > + unsigned long index = info->worker_id; > + struct vhost_dev *dev = vq->dev; > + struct vhost_worker *worker; > + > + if (!dev->use_worker) > + return -EINVAL; > + > + if (vhost_vq_get_backend(vq) || vq->kick)It might be worthwhile to have a comment to explain why we need to check vq->kick here. This also means the device should not queue work when the backend is NULL. But I found it is probably not the case for vsock, it calls vhost_poll_queue() in vhost_transport_cancel_pkt() but vhost_vsock_stop() doesn't wait before doing vhost_vq_set_backend(vq, NULL); Net seems to be fine since it waits for ubufs to be completed in vhost_net_set_backend(). Can we make things easier by migrating the work_list? I also worry if there are other corner cases which makes me think how hard it is if we can just support those ioctls after the backend is set?> + return -EBUSY; > + > + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); > + if (!worker || worker->id != info->worker_id) > + return -ENODEV; > + > + __vhost_vq_attach_worker(vq, worker); > + return 0; > +} > + > +/* Caller must have device mutex */ > +static int vhost_new_worker(struct vhost_dev *dev, > + struct vhost_worker_state *info) > +{ > + struct vhost_worker *worker; > + > + worker = vhost_worker_create(dev); > + if (!worker) > + return -ENOMEM; > + > + info->worker_id = worker->id; > + return 0; > +} > + > +static int vhost_free_worker(struct vhost_dev *dev, > + struct vhost_worker_state *info) > +{ > + unsigned long index = info->worker_id; > + struct vhost_worker *worker; > + > + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); > + if (!worker || worker->id != info->worker_id) > + return -ENODEV; > + > + if (worker->attachment_cnt) > + return -EBUSY; > + > + vhost_worker_destroy(dev, worker); > + return 0; > +} > + > static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp, > struct vhost_virtqueue **vq, u32 *id) > { > @@ -651,6 +725,75 @@ static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp, > return 0; > } > > +/* Caller must have device mutex */ > +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, > + void __user *argp) > +{ > + struct vhost_vring_worker ring_worker; > + struct vhost_worker_state state; > + struct vhost_virtqueue *vq; > + long ret; > + u32 idx; > + > + if (!dev->use_worker) > + return -EINVAL; > + > + if (!vhost_dev_has_owner(dev)) > + return -EINVAL; > + > + switch (ioctl) { > + /* dev worker ioctls */ > + case VHOST_NEW_WORKER: > + ret = vhost_new_worker(dev, &state); > + if (!ret && copy_to_user(argp, &state, sizeof(state))) > + ret = -EFAULT; > + return ret; > + case VHOST_FREE_WORKER: > + if (copy_from_user(&state, argp, sizeof(state))) > + return -EFAULT; > + return vhost_free_worker(dev, &state); > + /* vring worker ioctls */ > + case VHOST_ATTACH_VRING_WORKER: > + case VHOST_GET_VRING_WORKER: > + break; > + default: > + return -ENOIOCTLCMD; > + } > + > + ret = vhost_get_vq_from_user(dev, argp, &vq, &idx); > + if (ret) > + return ret; > + > + mutex_lock(&vq->mutex); > + switch (ioctl) { > + case VHOST_ATTACH_VRING_WORKER: > + if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) { > + ret = -EFAULT; > + break; > + } > + > + ret = vhost_vq_attach_worker(vq, &ring_worker); > + if (!ret && copy_to_user(argp, &ring_worker, > + sizeof(ring_worker))) > + ret = -EFAULT; > + break; > + case VHOST_GET_VRING_WORKER: > + ring_worker.index = idx; > + ring_worker.worker_id = vq->worker->id; > + > + if (copy_to_user(argp, &ring_worker, sizeof(ring_worker))) > + ret = -EFAULT; > + break; > + default: > + ret = -ENOIOCTLCMD; > + break; > + } > + > + mutex_unlock(&vq->mutex); > + return ret; > +} > +EXPORT_SYMBOL_GPL(vhost_worker_ioctl); > + > /* Caller should have device mutex */ > long vhost_dev_set_owner(struct vhost_dev *dev) > { > @@ -671,7 +814,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) > goto err_worker; > > for (i = 0; i < dev->nvqs; i++) > - dev->vqs[i]->worker = worker; > + __vhost_vq_attach_worker(dev->vqs[i], worker); > } > > err = vhost_dev_alloc_iovecs(dev); > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index 2eea20d54134..bcb33a2f64f0 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -31,6 +31,7 @@ struct vhost_worker { > struct llist_head work_list; > u64 kcov_handle; > u32 id; > + int attachment_cnt; > }; > > /* Poll a file (eventfd or socket) */ > @@ -187,6 +188,8 @@ void vhost_dev_cleanup(struct vhost_dev *); > void vhost_dev_stop(struct vhost_dev *); > long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); > long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); > +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, > + void __user *argp); > bool vhost_vq_access_ok(struct vhost_virtqueue *vq); > bool vhost_log_access_ok(struct vhost_dev *); > void vhost_clear_msg(struct vhost_dev *dev); > diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h > index 92e1b700b51c..155710585979 100644 > --- a/include/uapi/linux/vhost.h > +++ b/include/uapi/linux/vhost.h > @@ -45,6 +45,25 @@ > #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) > /* Specify an eventfd file descriptor to signal on log write. */ > #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) > +/* By default, a device gets one vhost_worker that its virtqueues share. This > + * command allows the owner of the device to create an additional vhost_worker > + * for the device. It can later be bound to 1 or more of its virtqueues using > + * the VHOST_ATTACH_VRING_WORKER command. > + * > + * This must be called after VHOST_SET_OWNER and the caller must be the owner > + * of the device. The new thread will inherit caller's cgroups and namespaces, > + * and will share the caller's memory space. The new thread will also be > + * counted against the caller's RLIMIT_NPROC value.This makes me think if we should destroy and re-create those after VHOST_RESET_OWNER? Thanks> + * > + * The worker's ID used in other commands will be returned in > + * vhost_worker_state. > + */ > +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state) > +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any > + * virtqueue. If userspace is not able to call this for workers its created, > + * the kernel will free all the device's workers when the device is closed. > + */ > +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state) > > /* Ring setup. */ > /* Set number of descriptors in ring. This parameter can not > @@ -70,6 +89,20 @@ > #define VHOST_VRING_BIG_ENDIAN 1 > #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) > #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) > +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's > + * virtqueues. This must be done before VHOST_SET_VRING_KICK and the driver > + * specific ioctl to activate the virtqueue (VHOST_SCSI_SET_ENDPOINT, > + * VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING) has been run. > + * > + * This will replace the virtqueue's existing worker. If the replaced worker > + * is no longer attached to any virtqueues, it can be freed with > + * VHOST_FREE_WORKER. > + */ > +#define VHOST_ATTACH_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x15, \ > + struct vhost_vring_worker) > +/* Return the vring worker's ID */ > +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16, \ > + struct vhost_vring_worker) > > /* The following ioctls use eventfd file descriptors to signal and poll > * for events. */ > diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h > index c5690a8992d8..d3aad12ad1fa 100644 > --- a/include/uapi/linux/vhost_types.h > +++ b/include/uapi/linux/vhost_types.h > @@ -47,6 +47,22 @@ struct vhost_vring_addr { > __u64 log_guest_addr; > }; > > +struct vhost_worker_state { > + /* > + * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id. > + * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker > + * to free. > + */ > + unsigned int worker_id; > +}; > + > +struct vhost_vring_worker { > + /* vring index */ > + unsigned int index; > + /* The id of the vhost_worker returned from VHOST_NEW_WORKER */ > + unsigned int worker_id; > +}; > + > /* no alignment requirement */ > struct vhost_iotlb_msg { > __u64 iova; > -- > 2.25.1 >