Hi all: This series tries to add basic busy polling for vhost net. The idea is simple: at the end of tx/rx processing, busy poll for newly added tx descriptors and the rx receive socket for a while. The maximum amount of time (in us) that can be spent on busy polling is specified through an ioctl. Test A was done through: - 50 us as busy loop timeout - Netperf 2.6 - Two machines with back to back connected ixgbe - Guest with 1 vcpu and 1 queue Results: - For stream workload, ioexits were reduced dramatically in medium size (1024-2048) of tx (at most -43%) and almost all rx (at most -84%) as a result of polling. This compensates for the possible wasted cpu cycles more or less. That is probably why we can still see some increase in the normalized throughput in some cases. - Throughput of tx was increased (at most 50%) except for the huge write (16384). And we can send more packets in the case (+tpkts were increased). - Very minor rx regression in some cases. - Improvement on TCP_RR (at most 17%). Guest TX: size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/ 64/ 1/ +18%/ -10%/ +7%/ +11%/ 0% 64/ 2/ +14%/ -13%/ +7%/ +10%/ 0% 64/ 4/ +8%/ -17%/ +7%/ +9%/ 0% 64/ 8/ +11%/ -15%/ +7%/ +10%/ 0% 256/ 1/ +35%/ +9%/ +21%/ +12%/ -11% 256/ 2/ +26%/ +2%/ +20%/ +9%/ -10% 256/ 4/ +23%/ 0%/ +21%/ +10%/ -9% 256/ 8/ +23%/ 0%/ +21%/ +9%/ -9% 512/ 1/ +31%/ +9%/ +23%/ +18%/ -12% 512/ 2/ +30%/ +8%/ +24%/ +15%/ -10% 512/ 4/ +26%/ +5%/ +24%/ +14%/ -11% 512/ 8/ +32%/ +9%/ +23%/ +15%/ -11% 1024/ 1/ +39%/ +16%/ +29%/ +22%/ -26% 1024/ 2/ +35%/ +14%/ +30%/ +21%/ -22% 1024/ 4/ +34%/ +13%/ +32%/ +21%/ -25% 1024/ 8/ +36%/ +14%/ +32%/ +19%/ -26% 2048/ 1/ +50%/ +27%/ +34%/ +26%/ -42% 2048/ 2/ +43%/ +21%/ +36%/ +25%/ -43% 2048/ 4/ +41%/ +20%/ +37%/ +27%/ -43% 2048/ 8/ +40%/ +18%/ +35%/ +25%/ -42% 16384/ 1/ 0%/ -12%/ -1%/ +8%/ +15% 16384/ 2/ 0%/ -10%/ +1%/ +4%/ +5% 16384/ 4/ 0%/ -11%/ -3%/ 0%/ +3% 16384/ 8/ 0%/ -10%/ -4%/ 0%/ +1% Guest RX: size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/ 64/ 1/ -2%/ -21%/ 
+1%/ +2%/ -75% 64/ 2/ +1%/ -9%/ +12%/ 0%/ -55% 64/ 4/ 0%/ -6%/ +5%/ -1%/ -44% 64/ 8/ -5%/ -5%/ +7%/ -23%/ -50% 256/ 1/ -8%/ -18%/ +16%/ +15%/ -63% 256/ 2/ 0%/ -8%/ +9%/ -2%/ -26% 256/ 4/ 0%/ -7%/ -8%/ +20%/ -41% 256/ 8/ -8%/ -11%/ -9%/ -24%/ -78% 512/ 1/ -6%/ -19%/ +20%/ +18%/ -29% 512/ 2/ 0%/ -10%/ -14%/ -8%/ -31% 512/ 4/ -1%/ -5%/ -11%/ -9%/ -38% 512/ 8/ -7%/ -9%/ -17%/ -22%/ -81% 1024/ 1/ 0%/ -16%/ +12%/ +9%/ -11% 1024/ 2/ 0%/ -11%/ 0%/ +3%/ -30% 1024/ 4/ 0%/ -4%/ +2%/ +6%/ -15% 1024/ 8/ -3%/ -4%/ -8%/ -8%/ -70% 2048/ 1/ -8%/ -23%/ +36%/ +22%/ -11% 2048/ 2/ 0%/ -12%/ +1%/ +3%/ -29% 2048/ 4/ 0%/ -3%/ -17%/ -15%/ -84% 2048/ 8/ 0%/ -3%/ +1%/ -3%/ +10% 16384/ 1/ 0%/ -11%/ +4%/ +7%/ -22% 16384/ 2/ 0%/ -7%/ +4%/ +4%/ -33% 16384/ 4/ 0%/ -2%/ -2%/ -4%/ -23% 16384/ 8/ -1%/ -2%/ +1%/ -22%/ -40% TCP_RR: size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/ 1/ 1/ +11%/ -26%/ +11%/ +11%/ +10% 1/ 25/ +11%/ -15%/ +11%/ +11%/ 0% 1/ 50/ +9%/ -16%/ +10%/ +10%/ 0% 1/ 100/ +9%/ -15%/ +9%/ +9%/ 0% 64/ 1/ +11%/ -31%/ +11%/ +11%/ +11% 64/ 25/ +12%/ -14%/ +12%/ +12%/ 0% 64/ 50/ +11%/ -14%/ +12%/ +12%/ 0% 64/ 100/ +11%/ -15%/ +11%/ +11%/ 0% 256/ 1/ +11%/ -27%/ +11%/ +11%/ +10% 256/ 25/ +17%/ -11%/ +16%/ +16%/ -1% 256/ 50/ +16%/ -11%/ +17%/ +17%/ +1% 256/ 100/ +17%/ -11%/ +18%/ +18%/ +1% Test B was done through: - 50us as busy loop timeout - Netperf 2.6 - Two machines with back to back connected ixgbe - Two guests each with 1 vcpu and 1 queue - pin two vhost threads to the same cpu on host to simulate cpu contention Results: - In this radical case, we can still get at most 14% improvement on TCP_RR. - For guest tx stream, minor improvement with at most 5% regression in one byte case. For guest rx stream, at most 5% regression was seen. 
Guest TX: size /-+% / 1 /-5.55%/ 64 /+1.11%/ 256 /+2.33%/ 512 /-0.03%/ 1024 /+1.14%/ 4096 /+0.00%/ 16384/+0.00%/ Guest RX: size /-+% / 1 /-5.11%/ 64 /-0.55%/ 256 /-2.35%/ 512 /-3.39%/ 1024 /+6.8% / 4096 /-0.01%/ 16384/+0.00%/ TCP_RR: size /-+% / 1 /+9.79% / 64 /+4.51% / 256 /+6.47% / 512 /-3.37% / 1024 /+6.15% / 4096 /+14.88%/ 16384/-2.23% / Changes from V1: - Remove the buggy vq_error() in vhost_vq_more_avail(). - Leave vhost_enable_notify() untouched. Changes from RFC V3: - small tweak on the code to avoid multiple duplicate conditions in critical path when busy loop is not enabled. - Add the test result of multiple VMs Changes from RFC V2: - poll also at the end of rx handling - factor out the polling logic and optimize the code a little bit - add two ioctls to get and set the busy poll timeout - test on ixgbe (which can give more stable and reproducible numbers) instead of mlx4. Changes from RFC V1: - Add a comment for vhost_has_work() to explain why it could be lockless - Add param description for busyloop_timeout - Split out the busy polling logic into a new helper - Check and exit the loop when there's a pending signal - Disable preemption during busy looping to make sure local_clock() was correctly used. Jason Wang (3): vhost: introduce vhost_has_work() vhost: introduce vhost_vq_more_avail() vhost_net: basic polling support drivers/vhost/net.c | 72 ++++++++++++++++++++++++++++++++++++++++++---- drivers/vhost/vhost.c | 35 ++++++++++++++++++++++ drivers/vhost/vhost.h | 3 ++ include/uapi/linux/vhost.h | 11 +++++++ 4 files changed, 116 insertions(+), 5 deletions(-) -- 2.5.0
This patch introduces a helper which can give a hint for whether or not there's work queued in the work list. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 7 +++++++ drivers/vhost/vhost.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index eec2f11..163b365 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -245,6 +245,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) } EXPORT_SYMBOL_GPL(vhost_work_queue); +/* A lockless hint for busy polling code to exit the loop */ +bool vhost_has_work(struct vhost_dev *dev) +{ + return !list_empty(&dev->work_list); +} +EXPORT_SYMBOL_GPL(vhost_has_work); + void vhost_poll_queue(struct vhost_poll *poll) { vhost_work_queue(poll->dev, &poll->work); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index d3f7674..43284ad 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -37,6 +37,7 @@ struct vhost_poll { void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work); +bool vhost_has_work(struct vhost_dev *dev); void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, unsigned long mask, struct vhost_dev *dev); -- 2.5.0
Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 13 +++++++++++++ drivers/vhost/vhost.h | 1 + 2 files changed, 14 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 163b365..4f45a03 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1633,6 +1633,19 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev, } EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); +bool vhost_vq_more_avail(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + __virtio16 avail_idx; + int r; + + r = __get_user(avail_idx, &vq->avail->idx); + if (r) + return false; + + return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx; +} +EXPORT_SYMBOL_GPL(vhost_vq_more_avail); + /* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 43284ad..2f3c57c 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -159,6 +159,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *, struct vring_used_elem *heads, unsigned count); void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *); +bool vhost_vq_more_avail(struct vhost_dev *, struct vhost_virtqueue *); bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *); int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, -- 2.5.0
This patch tries to poll for new added tx buffer or socket receive queue for a while at the end of tx/rx processing. The maximum time spent on polling were specified through a new kind of vring ioctl. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/net.c | 72 ++++++++++++++++++++++++++++++++++++++++++---- drivers/vhost/vhost.c | 15 ++++++++++ drivers/vhost/vhost.h | 1 + include/uapi/linux/vhost.h | 11 +++++++ 4 files changed, 94 insertions(+), 5 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9eda69e..ce6da77 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -287,6 +287,41 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) rcu_read_unlock_bh(); } +static inline unsigned long busy_clock(void) +{ + return local_clock() >> 10; +} + +static bool vhost_can_busy_poll(struct vhost_dev *dev, + unsigned long endtime) +{ + return likely(!need_resched()) && + likely(!time_after(busy_clock(), endtime)) && + likely(!signal_pending(current)) && + !vhost_has_work(dev) && + single_task_running(); +} + +static int vhost_net_tx_get_vq_desc(struct vhost_net *net, + struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num) +{ + unsigned long uninitialized_var(endtime); + + if (vq->busyloop_timeout) { + preempt_disable(); + endtime = busy_clock() + vq->busyloop_timeout; + while (vhost_can_busy_poll(vq->dev, endtime) && + !vhost_vq_more_avail(vq->dev, vq)) + cpu_relax(); + preempt_enable(); + } + + return vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + out_num, in_num, NULL, NULL); +} + /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. 
*/ static void handle_tx(struct vhost_net *net) @@ -331,10 +366,9 @@ static void handle_tx(struct vhost_net *net) % UIO_MAXIOV == nvq->done_idx)) break; - head = vhost_get_vq_desc(vq, vq->iov, - ARRAY_SIZE(vq->iov), - &out, &in, - NULL, NULL); + head = vhost_net_tx_get_vq_desc(net, vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; @@ -435,6 +469,34 @@ static int peek_head_len(struct sock *sk) return len; } +static int vhost_net_peek_head_len(struct vhost_net *net, struct sock *sk) +{ + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; + struct vhost_virtqueue *vq = &nvq->vq; + unsigned long uninitialized_var(endtime); + + if (vq->busyloop_timeout) { + mutex_lock(&vq->mutex); + vhost_disable_notify(&net->dev, vq); + + preempt_disable(); + endtime = busy_clock() + vq->busyloop_timeout; + + while (vhost_can_busy_poll(&net->dev, endtime) && + skb_queue_empty(&sk->sk_receive_queue) && + !vhost_vq_more_avail(&net->dev, vq)) + cpu_relax(); + + preempt_enable(); + + if (vhost_enable_notify(&net->dev, vq)) + vhost_poll_queue(&vq->poll); + mutex_unlock(&vq->mutex); + } + + return peek_head_len(sk); +} + /* This is a multi-buffer version of vhost_get_desc, that works if * vq has read descriptors only. 
* @vq - the relevant virtqueue @@ -553,7 +615,7 @@ static void handle_rx(struct vhost_net *net) vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); - while ((sock_len = peek_head_len(sock->sk))) { + while ((sock_len = vhost_net_peek_head_len(net, sock->sk))) { sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(vq, vq->heads, vhost_len, diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4f45a03..b8ca873 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -285,6 +285,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->memory = NULL; vq->is_le = virtio_legacy_is_little_endian(); vhost_vq_reset_user_be(vq); + vq->busyloop_timeout = 0; } static int vhost_worker(void *data) @@ -747,6 +748,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) struct vhost_vring_state s; struct vhost_vring_file f; struct vhost_vring_addr a; + struct vhost_vring_busyloop_timeout t; u32 idx; long r; @@ -919,6 +921,19 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) case VHOST_GET_VRING_ENDIAN: r = vhost_get_vring_endian(vq, idx, argp); break; + case VHOST_SET_VRING_BUSYLOOP_TIMEOUT: + if (copy_from_user(&t, argp, sizeof(t))) { + r = -EFAULT; + break; + } + vq->busyloop_timeout = t.timeout; + break; + case VHOST_GET_VRING_BUSYLOOP_TIMEOUT: + t.index = idx; + t.timeout = vq->busyloop_timeout; + if (copy_to_user(argp, &t, sizeof(t))) + r = -EFAULT; + break; default: r = -ENOIOCTLCMD; } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 2f3c57c..4b7d4fa 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -115,6 +115,7 @@ struct vhost_virtqueue { /* Ring endianness requested by userspace for cross-endian support. 
*/ bool user_be; #endif + u32 busyloop_timeout; }; struct vhost_dev { diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index ab373191..eaf6c33 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -27,6 +27,11 @@ struct vhost_vring_file { }; +struct vhost_vring_busyloop_timeout { + unsigned int index; + unsigned int timeout; +}; + struct vhost_vring_addr { unsigned int index; /* Option flags. */ @@ -126,6 +131,12 @@ struct vhost_memory { #define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) /* Set eventfd to signal an error */ #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +/* Set busy loop timeout */ +#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \ + struct vhost_vring_busyloop_timeout) +/* Get busy loop timeout */ +#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \ + struct vhost_vring_busyloop_timeout) /* VHOST_NET specific defines */ -- 2.5.0
Michael S. Tsirkin
2016-Jan-20 14:09 UTC
[PATCH V2 2/3] vhost: introduce vhost_vq_more_avail()
On Tue, Dec 01, 2015 at 02:39:44PM +0800, Jason Wang wrote:> Signed-off-by: Jason Wang <jasowang at redhat.com>Wow new API with no comments anywhere, and no commit log to say what it's good for. Want to know what it does and whether it's correct? You have to read the next patch. So what is the point of splitting it out? It's confusing, and in fact it made you miss a bug.> --- > drivers/vhost/vhost.c | 13 +++++++++++++ > drivers/vhost/vhost.h | 1 + > 2 files changed, 14 insertions(+) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 163b365..4f45a03 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -1633,6 +1633,19 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev, > } > EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); > > +bool vhost_vq_more_avail(struct vhost_dev *dev, struct vhost_virtqueue *vq) > +{ > + __virtio16 avail_idx; > + int r; > + > + r = __get_user(avail_idx, &vq->avail->idx); > + if (r) > + return false;So the result is that if the page is not present, you return false (empty ring) and the caller will busy wait with preempt disabled. Nasty. So it should return something that breaks the loop, and this means it should have a different name for the return code to make sense. Maybe reverse the polarity: vhost_vq_avail_empty? And add a comment saying we can't be sure ring is empty so return false.> + > + return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx; > +} > +EXPORT_SYMBOL_GPL(vhost_vq_more_avail); > + > /* OK, now we need to know about added descriptors. 
*/ > bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) > { > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index 43284ad..2f3c57c 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -159,6 +159,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *, > struct vring_used_elem *heads, unsigned count); > void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); > void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *); > +bool vhost_vq_more_avail(struct vhost_dev *, struct vhost_virtqueue *); > bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *); > > int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, > -- > 2.5.0
On Tue, Dec 01, 2015 at 02:39:45PM +0800, Jason Wang wrote:> This patch tries to poll for new added tx buffer or socket receive > queue for a while at the end of tx/rx processing. The maximum time > spent on polling were specified through a new kind of vring ioctl. > > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/net.c | 72 ++++++++++++++++++++++++++++++++++++++++++---- > drivers/vhost/vhost.c | 15 ++++++++++ > drivers/vhost/vhost.h | 1 + > include/uapi/linux/vhost.h | 11 +++++++ > 4 files changed, 94 insertions(+), 5 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 9eda69e..ce6da77 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -287,6 +287,41 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) > rcu_read_unlock_bh(); > } > > +static inline unsigned long busy_clock(void) > +{ > + return local_clock() >> 10; > +} > + > +static bool vhost_can_busy_poll(struct vhost_dev *dev, > + unsigned long endtime) > +{ > + return likely(!need_resched()) && > + likely(!time_after(busy_clock(), endtime)) && > + likely(!signal_pending(current)) && > + !vhost_has_work(dev) && > + single_task_running(); > +} > + > +static int vhost_net_tx_get_vq_desc(struct vhost_net *net, > + struct vhost_virtqueue *vq, > + struct iovec iov[], unsigned int iov_size, > + unsigned int *out_num, unsigned int *in_num) > +{ > + unsigned long uninitialized_var(endtime); > + > + if (vq->busyloop_timeout) { > + preempt_disable(); > + endtime = busy_clock() + vq->busyloop_timeout; > + while (vhost_can_busy_poll(vq->dev, endtime) && > + !vhost_vq_more_avail(vq->dev, vq)) > + cpu_relax(); > + preempt_enable(); > + }Isn't there a way to call all this after vhost_get_vq_desc? First, this will reduce the good path overhead as you won't have to play with timers and preemption. 
Second, this will reduce the chance of a pagefault on avail ring read.> + > + return vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), > + out_num, in_num, NULL, NULL); > +} > + > /* Expects to be always run from workqueue - which acts as > * read-size critical section for our kind of RCU. */ > static void handle_tx(struct vhost_net *net) > @@ -331,10 +366,9 @@ static void handle_tx(struct vhost_net *net) > % UIO_MAXIOV == nvq->done_idx)) > break; > > - head = vhost_get_vq_desc(vq, vq->iov, > - ARRAY_SIZE(vq->iov), > - &out, &in, > - NULL, NULL); > + head = vhost_net_tx_get_vq_desc(net, vq, vq->iov, > + ARRAY_SIZE(vq->iov), > + &out, &in); > /* On error, stop handling until the next kick. */ > if (unlikely(head < 0)) > break; > @@ -435,6 +469,34 @@ static int peek_head_len(struct sock *sk) > return len; > } > > +static int vhost_net_peek_head_len(struct vhost_net *net, struct sock *sk)Need a hint that it's rx related in the name.> +{ > + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; > + struct vhost_virtqueue *vq = &nvq->vq; > + unsigned long uninitialized_var(endtime); > + > + if (vq->busyloop_timeout) { > + mutex_lock(&vq->mutex);This appears to be called under vq mutex in handle_rx. So how does this work then?> + vhost_disable_notify(&net->dev, vq);This appears to be called after disable notify in handle_rx - so why disable here again?> + > + preempt_disable(); > + endtime = busy_clock() + vq->busyloop_timeout; > + > + while (vhost_can_busy_poll(&net->dev, endtime) && > + skb_queue_empty(&sk->sk_receive_queue) && > + !vhost_vq_more_avail(&net->dev, vq)) > + cpu_relax();This seems to mix in several items. RX queue is normally not empty. I don't think we need to poll for that. So IMHO we only need to poll for sk_receive_queue really.> + > + preempt_enable(); > + > + if (vhost_enable_notify(&net->dev, vq)) > + vhost_poll_queue(&vq->poll);But vhost_enable_notify returns true on queue not empty. 
So in fact this will requeue on good path - does not make sense to me.> + mutex_unlock(&vq->mutex); > + } > +Same comment as for get vq desc here: don't slow down the good path.> + return peek_head_len(sk); > +} > + > /* This is a multi-buffer version of vhost_get_desc, that works if > * vq has read descriptors only. > * @vq - the relevant virtqueue > @@ -553,7 +615,7 @@ static void handle_rx(struct vhost_net *net) > vq->log : NULL; > mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); > > - while ((sock_len = peek_head_len(sock->sk))) { > + while ((sock_len = vhost_net_peek_head_len(net, sock->sk))) { > sock_len += sock_hlen; > vhost_len = sock_len + vhost_hlen; > headcount = get_rx_bufs(vq, vq->heads, vhost_len, > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 4f45a03..b8ca873 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -285,6 +285,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, > vq->memory = NULL; > vq->is_le = virtio_legacy_is_little_endian(); > vhost_vq_reset_user_be(vq); > + vq->busyloop_timeout = 0; > } > > static int vhost_worker(void *data) > @@ -747,6 +748,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) > struct vhost_vring_state s; > struct vhost_vring_file f; > struct vhost_vring_addr a; > + struct vhost_vring_busyloop_timeout t; > u32 idx; > long r; > > @@ -919,6 +921,19 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) > case VHOST_GET_VRING_ENDIAN: > r = vhost_get_vring_endian(vq, idx, argp); > break; > + case VHOST_SET_VRING_BUSYLOOP_TIMEOUT: > + if (copy_from_user(&t, argp, sizeof(t))) { > + r = -EFAULT; > + break; > + } > + vq->busyloop_timeout = t.timeout; > + break; > + case VHOST_GET_VRING_BUSYLOOP_TIMEOUT: > + t.index = idx; > + t.timeout = vq->busyloop_timeout; > + if (copy_to_user(argp, &t, sizeof(t))) > + r = -EFAULT; > + break; > default: > r = -ENOIOCTLCMD; > } > diff --git a/drivers/vhost/vhost.h 
b/drivers/vhost/vhost.h > index 2f3c57c..4b7d4fa 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -115,6 +115,7 @@ struct vhost_virtqueue { > /* Ring endianness requested by userspace for cross-endian support. */ > bool user_be; > #endif > + u32 busyloop_timeout; > }; > > struct vhost_dev { > diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h > index ab373191..eaf6c33 100644 > --- a/include/uapi/linux/vhost.h > +++ b/include/uapi/linux/vhost.h > @@ -27,6 +27,11 @@ struct vhost_vring_file { > > }; > > +struct vhost_vring_busyloop_timeout { > + unsigned int index; > + unsigned int timeout; > +}; > +So why not reuse vhost_vring_state then?> struct vhost_vring_addr { > unsigned int index; > /* Option flags. */ > @@ -126,6 +131,12 @@ struct vhost_memory { > #define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) > /* Set eventfd to signal an error */ > #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) > +/* Set busy loop timeout */Units?> +#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \ > + struct vhost_vring_busyloop_timeout) > +/* Get busy loop timeout */ > +#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \ > + struct vhost_vring_busyloop_timeout) > > /* VHOST_NET specific defines */ > > -- > 2.5.0