Michael S. Tsirkin
2023-Jul-20 13:06 UTC
[PATCH v11 8/8] vhost: use vhost_tasks for worker threads
On Thu, Feb 02, 2023 at 05:25:17PM -0600, Mike Christie wrote:
> For vhost workers we use the kthread API which inherits its values from
> and checks against the kthreadd thread. This results in the wrong RLIMITs
> being checked, so while tools like libvirt try to control the number of
> threads based on the nproc rlimit setting we can end up creating more
> threads than the user wanted.
>
> This patch has us use the vhost_task helpers, which will inherit their
> values/checks from the thread that owns the device, similar to if we did
> a clone in userspace. The vhost threads will now be counted in the nproc
> rlimits. And we get features like cgroups and mm sharing automatically,
> so we can remove those calls.
>
> Signed-off-by: Mike Christie <michael.christie at oracle.com>
> Acked-by: Michael S. Tsirkin <mst at redhat.com>

Hi Mike,
So this seems to have caused a measurable regression in networking
performance (about 30%). Take a look here, and there's a zip file
with detailed measurements attached:

https://bugzilla.redhat.com/show_bug.cgi?id=2222603

Could you take a look please?
You can also ask reporter questions there, assuming you
have or can create a (free) account.

> ---
>  drivers/vhost/vhost.c | 58 ++++++++-----------------------------------
>  drivers/vhost/vhost.h |  4 +--
>  2 files changed, 13 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 74378d241f8d..d3c7c37b69a7 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -22,11 +22,11 @@
>  #include <linux/slab.h>
>  #include <linux/vmalloc.h>
>  #include <linux/kthread.h>
> -#include <linux/cgroup.h>
>  #include <linux/module.h>
>  #include <linux/sort.h>
>  #include <linux/sched/mm.h>
>  #include <linux/sched/signal.h>
> +#include <linux/sched/vhost_task.h>
>  #include <linux/interval_tree_generic.h>
>  #include <linux/nospec.h>
>  #include <linux/kcov.h>
> @@ -256,7 +256,7 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
>                  * test_and_set_bit() implies a memory barrier.
>                  */
>                 llist_add(&work->node, &dev->worker->work_list);
> -               wake_up_process(dev->worker->task);
> +               wake_up_process(dev->worker->vtsk->task);
>         }
>  }
>  EXPORT_SYMBOL_GPL(vhost_work_queue);
> @@ -336,17 +336,14 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  static int vhost_worker(void *data)
>  {
>         struct vhost_worker *worker = data;
> -       struct vhost_dev *dev = worker->dev;
>         struct vhost_work *work, *work_next;
>         struct llist_node *node;
>
> -       kthread_use_mm(dev->mm);
> -
>         for (;;) {
>                 /* mb paired w/ kthread_stop */
>                 set_current_state(TASK_INTERRUPTIBLE);
>
> -               if (kthread_should_stop()) {
> +               if (vhost_task_should_stop(worker->vtsk)) {
>                         __set_current_state(TASK_RUNNING);
>                         break;
>                 }
> @@ -368,7 +365,7 @@ static int vhost_worker(void *data)
>                         schedule();
>                 }
>         }
> -       kthread_unuse_mm(dev->mm);
> +
>         return 0;
>  }
>
> @@ -509,31 +506,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
>
> -struct vhost_attach_cgroups_struct {
> -       struct vhost_work work;
> -       struct task_struct *owner;
> -       int ret;
> -};
> -
> -static void vhost_attach_cgroups_work(struct vhost_work *work)
> -{
> -       struct vhost_attach_cgroups_struct *s;
> -
> -       s = container_of(work, struct vhost_attach_cgroups_struct, work);
> -       s->ret = cgroup_attach_task_all(s->owner, current);
> -}
> -
> -static int vhost_attach_cgroups(struct vhost_dev *dev)
> -{
> -       struct vhost_attach_cgroups_struct attach;
> -
> -       attach.owner = current;
> -       vhost_work_init(&attach.work, vhost_attach_cgroups_work);
> -       vhost_work_queue(dev, &attach.work);
> -       vhost_dev_flush(dev);
> -       return attach.ret;
> -}
> -
>  /* Caller should have device mutex */
>  bool vhost_dev_has_owner(struct vhost_dev *dev)
>  {
> @@ -580,14 +552,14 @@ static void vhost_worker_free(struct vhost_dev *dev)
>
>         dev->worker = NULL;
>         WARN_ON(!llist_empty(&worker->work_list));
> -       kthread_stop(worker->task);
> +       vhost_task_stop(worker->vtsk);
>         kfree(worker);
>  }
>
>  static int vhost_worker_create(struct vhost_dev *dev)
>  {
>         struct vhost_worker *worker;
> -       struct task_struct *task;
> +       struct vhost_task *vtsk;
>         int ret;
>
>         worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
> @@ -595,27 +567,19 @@ static int vhost_worker_create(struct vhost_dev *dev)
>                 return -ENOMEM;
>
>         dev->worker = worker;
> -       worker->dev = dev;
>         worker->kcov_handle = kcov_common_handle();
>         init_llist_head(&worker->work_list);
>
> -       task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid);
> -       if (IS_ERR(task)) {
> -               ret = PTR_ERR(task);
> +       vtsk = vhost_task_create(vhost_worker, worker, NUMA_NO_NODE);
> +       if (!vtsk) {
> +               ret = -ENOMEM;
>                 goto free_worker;
>         }
>
> -       worker->task = task;
> -       wake_up_process(task); /* avoid contributing to loadavg */
> -
> -       ret = vhost_attach_cgroups(dev);
> -       if (ret)
> -               goto stop_worker;
> -
> +       worker->vtsk = vtsk;
> +       vhost_task_start(vtsk, "vhost-%d", current->pid);
>         return 0;
>
> -stop_worker:
> -       kthread_stop(worker->task);
>  free_worker:
>         kfree(worker);
>         dev->worker = NULL;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 2f6beab93784..3af59c65025e 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -16,6 +16,7 @@
>  #include <linux/irqbypass.h>
>
>  struct vhost_work;
> +struct vhost_task;
>  typedef void (*vhost_work_fn_t)(struct vhost_work *work);
>
>  #define VHOST_WORK_QUEUED      1
> @@ -26,9 +27,8 @@ struct vhost_work {
>  };
>
>  struct vhost_worker {
> -       struct task_struct      *task;
> +       struct vhost_task       *vtsk;
>         struct llist_head       work_list;
> -       struct vhost_dev        *dev;
>         u64                     kcov_handle;
>  };
>
> --
> 2.25.1
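For readers skimming the thread, the hunks quoted above reduce to the worker
lifecycle below. This is a condensed sketch based only on the diff as posted
(the function names and signatures follow this version of the patch, not
necessarily the final mainline API); the work-list draining and error paths
are elided, and the "sketch_" names are illustrative, not real driver symbols.

        #include <linux/sched.h>
        #include <linux/sched/vhost_task.h>
        #include "vhost.h"

        static int sketch_vhost_worker(void *data)
        {
                struct vhost_worker *worker = data;

                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);

                        /* vhost_task_should_stop() takes over from kthread_should_stop(). */
                        if (vhost_task_should_stop(worker->vtsk)) {
                                __set_current_state(TASK_RUNNING);
                                break;
                        }
                        /* ... drain worker->work_list here, otherwise ... */
                        schedule();
                }
                return 0;
        }

        static int sketch_worker_create(struct vhost_worker *worker)
        {
                struct vhost_task *vtsk;

                /*
                 * The vhost_task is cloned from current (the device owner), so the
                 * mm, cgroups and nproc rlimit are inherited automatically; no
                 * kthread_use_mm() or cgroup-attach work item is needed.
                 */
                vtsk = vhost_task_create(sketch_vhost_worker, worker, NUMA_NO_NODE);
                if (!vtsk)
                        return -ENOMEM;

                worker->vtsk = vtsk;
                vhost_task_start(vtsk, "vhost-%d", current->pid);
                return 0;
        }

        static void sketch_worker_destroy(struct vhost_worker *worker)
        {
                vhost_task_stop(worker->vtsk);  /* replaces kthread_stop() */
        }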
michael.christie at oracle.com
2023-Jul-23 04:03 UTC
[PATCH v11 8/8] vhost: use vhost_tasks for worker threads
On 7/20/23 8:06 AM, Michael S. Tsirkin wrote:
> On Thu, Feb 02, 2023 at 05:25:17PM -0600, Mike Christie wrote:
>> For vhost workers we use the kthread API which inherits its values from
>> and checks against the kthreadd thread. This results in the wrong RLIMITs
>> being checked, so while tools like libvirt try to control the number of
>> threads based on the nproc rlimit setting we can end up creating more
>> threads than the user wanted.
>>
>> This patch has us use the vhost_task helpers, which will inherit their
>> values/checks from the thread that owns the device, similar to if we did
>> a clone in userspace. The vhost threads will now be counted in the nproc
>> rlimits. And we get features like cgroups and mm sharing automatically,
>> so we can remove those calls.
>>
>> Signed-off-by: Mike Christie <michael.christie at oracle.com>
>> Acked-by: Michael S. Tsirkin <mst at redhat.com>
>
>
> Hi Mike,
> So this seems to have caused a measurable regression in networking
> performance (about 30%). Take a look here, and there's a zip file
> with detailed measurements attached:
>
> https://bugzilla.redhat.com/show_bug.cgi?id=2222603
>
>
> Could you take a look please?
> You can also ask reporter questions there, assuming you
> have or can create a (free) account.
>

Sorry for the late reply. I just got home from vacation.

The account creation link seems to be down. I keep getting an
"unable to establish SMTP connection to bz-exim-prod port 25" error.
Can you give me Quan's email?

I think I can replicate the problem. I just need some extra info from Quan:

1. Just double check that they are using RHEL 9 on the host running the VMs.

2. The kernel config.

3. Any tuning that was done. Is tuned running in the guest and/or on the
   host running the VMs, and what profile is being used in each?

4. The number of vCPUs and virtqueues being used.

5. Can they dump the contents of /sys/kernel/debug/sched and the output of
   "sysctl -a" on the host running the VMs?

6. With the 6.4 kernel, can they also run a quick test where they set the
   vhost threads' scheduler to batch? List the threads with:

   ps -T -o comm,pid,tid $QEMU_THREAD

   then for each vhost thread do:

   chrt -b -p 0 $VHOST_THREAD

   Does that end up increasing perf? When I do this I see throughput go up
   by around 50% vs 6.3 when the number of sessions was 16 or more (16 was
   the number of vCPUs and virtqueues per net device in the VM). Note that
   I'm not saying that is a fix. It's just a difference I noticed when
   running some other tests.
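In case it helps with reproducing step 6, here is one way the chrt loop could
be scripted on the host. It is only a sketch: the qemu-kvm process name and
the vhost-<pid> thread naming are assumptions about the reporter's setup
(e.g. the binary may be qemu-system-x86_64 elsewhere), so adjust as needed.

        #!/bin/sh
        # QEMU process of the VM under test (name is an assumption).
        QEMU_PID=$(pgrep -o qemu-kvm)

        # Switch every vhost worker thread of that process to SCHED_BATCH, prio 0.
        for tid in $(ps -T -p "$QEMU_PID" -o comm,tid | awk '/^vhost-/ {print $2}'); do
                chrt -b -p 0 "$tid"
        done

        # Confirm the new policy before re-running the benchmark.
        for tid in $(ps -T -p "$QEMU_PID" -o comm,tid | awk '/^vhost-/ {print $2}'); do
                chrt -p "$tid"
        done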