Hi all: This series tries to unify and simplify vhost codes especially for zerocopy. Plase review. Thanks Jason Wang (6): vhost_net: make vhost_zerocopy_signal_used() returns void vhost_net: use vhost_add_used_and_signal_n() in vhost_zerocopy_signal_used() vhost: switch to use vhost_add_used_n() vhost_net: determine whether or not to use zerocopy at one time vhost_net: poll vhost queue after marking DMA is done vhost_net: remove the max pending check drivers/vhost/net.c | 86 +++++++++++++++++++----------------------------- drivers/vhost/vhost.c | 43 +----------------------- 2 files changed, 36 insertions(+), 93 deletions(-)
Jason Wang
2013-Aug-16 05:16 UTC
[PATCH 1/6] vhost_net: make vhost_zerocopy_signal_used() returns void
None of its caller use its return value, so let it return void. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/net.c | 5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 969a859..280ee66 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -276,8 +276,8 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, * of used idx. Once lower device DMA done contiguously, we will signal KVM * guest used idx. */ -static int vhost_zerocopy_signal_used(struct vhost_net *net, - struct vhost_virtqueue *vq) +static void vhost_zerocopy_signal_used(struct vhost_net *net, + struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq container_of(vq, struct vhost_net_virtqueue, vq); @@ -297,7 +297,6 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net, } if (j) nvq->done_idx = i; - return j; } static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) -- 1.7.1
Jason Wang
2013-Aug-16 05:16 UTC
[PATCH 2/6] vhost_net: use vhost_add_used_and_signal_n() in vhost_zerocopy_signal_used()
Switch to use vhost_add_used_and_signal_n() to avoid multiple calls to vhost_add_used_and_signal(). With the patch we will call at most 2 times (consider done_idx warp around) compared to N times w/o this patch. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/net.c | 13 ++++++++----- 1 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 280ee66..8a6dd0d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -281,7 +281,7 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, { struct vhost_net_virtqueue *nvq container_of(vq, struct vhost_net_virtqueue, vq); - int i; + int i, add; int j = 0; for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { @@ -289,14 +289,17 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { vq->heads[i].len = VHOST_DMA_CLEAR_LEN; - vhost_add_used_and_signal(vq->dev, vq, - vq->heads[i].id, 0); ++j; } else break; } - if (j) - nvq->done_idx = i; + while (j) { + add = min(UIO_MAXIOV - nvq->done_idx, j); + vhost_add_used_and_signal_n(vq->dev, vq, + &vq->heads[nvq->done_idx], add); + nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; + j -= add; + } } static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) -- 1.7.1
Let vhost_add_used() to use vhost_add_used_n() to reduce the code duplication. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/vhost.c | 43 ++----------------------------------------- 1 files changed, 2 insertions(+), 41 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e58cf00..c479452 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1332,48 +1332,9 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc); * want to notify the guest, using eventfd. */ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) { - struct vring_used_elem __user *used; + struct vring_used_elem heads = { head, len }; - /* The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. */ - used = &vq->used->ring[vq->last_used_idx % vq->num]; - if (__put_user(head, &used->id)) { - vq_err(vq, "Failed to write used id"); - return -EFAULT; - } - if (__put_user(len, &used->len)) { - vq_err(vq, "Failed to write used len"); - return -EFAULT; - } - /* Make sure buffer is written before we update index. */ - smp_wmb(); - if (__put_user(vq->last_used_idx + 1, &vq->used->idx)) { - vq_err(vq, "Failed to increment used idx"); - return -EFAULT; - } - if (unlikely(vq->log_used)) { - /* Make sure data is seen before log. */ - smp_wmb(); - /* Log used ring entry write. */ - log_write(vq->log_base, - vq->log_addr + - ((void __user *)used - (void __user *)vq->used), - sizeof *used); - /* Log used index update. */ - log_write(vq->log_base, - vq->log_addr + offsetof(struct vring_used, idx), - sizeof vq->used->idx); - if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); - } - vq->last_used_idx++; - /* If the driver never bothers to signal in a very long while, - * used index might wrap around. If that happens, invalidate - * signalled_used index we stored. TODO: make sure driver - * signals at least once in 2^16 and remove this. */ - if (unlikely(vq->last_used_idx == vq->signalled_used)) - vq->signalled_used_valid = false; - return 0; + return vhost_add_used_n(vq, &heads, 1); } EXPORT_SYMBOL_GPL(vhost_add_used); -- 1.7.1
Jason Wang
2013-Aug-16 05:16 UTC
[PATCH 4/6] vhost_net: determine whether or not to use zerocopy at one time
Currently, even if the packet length is smaller than VHOST_GOODCOPY_LEN, if upend_idx != done_idx we still set zcopy_used to true and rollback this choice later. This could be avoided by determine zerocopy once by checking all conditions at one time before. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/net.c | 46 +++++++++++++++++++--------------------------- 1 files changed, 19 insertions(+), 27 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 8a6dd0d..70cab75 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -404,43 +404,35 @@ static void handle_tx(struct vhost_net *net) iov_length(nvq->hdr, s), hdr_size); break; } - zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN || - nvq->upend_idx != nvq->done_idx); + + zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN + && nvq->upend_idx != nvq->done_idx + && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { + struct ubuf_info *ubuf; + ubuf = nvq->ubuf_info + nvq->upend_idx; + vq->heads[nvq->upend_idx].id = head; - if (!vhost_net_tx_select_zcopy(net) || - len < VHOST_GOODCOPY_LEN) { - /* copy don't need to wait for DMA done */ - vq->heads[nvq->upend_idx].len - VHOST_DMA_DONE_LEN; - msg.msg_control = NULL; - msg.msg_controllen = 0; - ubufs = NULL; - } else { - struct ubuf_info *ubuf; - ubuf = nvq->ubuf_info + nvq->upend_idx; - - vq->heads[nvq->upend_idx].len - VHOST_DMA_IN_PROGRESS; - ubuf->callback = vhost_zerocopy_callback; - ubuf->ctx = nvq->ubufs; - ubuf->desc = nvq->upend_idx; - msg.msg_control = ubuf; - msg.msg_controllen = sizeof(ubuf); - ubufs = nvq->ubufs; - kref_get(&ubufs->kref); - } + vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; + ubuf->callback = vhost_zerocopy_callback; + ubuf->ctx = nvq->ubufs; + ubuf->desc = nvq->upend_idx; + msg.msg_control = ubuf; + msg.msg_controllen = sizeof(ubuf); + ubufs = nvq->ubufs; + kref_get(&ubufs->kref); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; - } else + } else { msg.msg_control = NULL; + ubufs = NULL; + } /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(NULL, sock, &msg, len); if (unlikely(err < 0)) { if (zcopy_used) { - if (ubufs) - vhost_net_ubuf_put(ubufs); + vhost_net_ubuf_put(ubufs); nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; } -- 1.7.1
Jason Wang
2013-Aug-16 05:16 UTC
[PATCH 5/6] vhost_net: poll vhost queue after marking DMA is done
We used to poll vhost queue before making DMA is done, this is racy if vhost thread were waked up before marking DMA is done which can result the signal to be missed. Fix this by always poll the vhost thread before DMA is done. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/net.c | 9 +++++---- 1 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 70cab75..a035a89 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -308,6 +308,11 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) struct vhost_virtqueue *vq = ubufs->vq; int cnt = atomic_read(&ubufs->kref.refcount); + /* set len to mark this desc buffers done DMA */ + vq->heads[ubuf->desc].len = success ? + VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; + vhost_net_ubuf_put(ubufs); + /* * Trigger polling thread if guest stopped submitting new buffers: * in this case, the refcount after decrement will eventually reach 1 @@ -318,10 +323,6 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) */ if (cnt <= 2 || !(cnt % 16)) vhost_poll_queue(&vq->poll); - /* set len to mark this desc buffers done DMA */ - vq->heads[ubuf->desc].len = success ? - VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; - vhost_net_ubuf_put(ubufs); } /* Expects to be always run from workqueue - which acts as -- 1.7.1
We used to limit the max pending DMAs to prevent guest from pinning too many pages. But this could be removed since: - We have the sk_wmem_alloc check in both tun/macvtap to do the same work - This max pending check were almost useless since it was one done when there's no new buffers coming from guest. Guest can easily exceeds the limitation. - We've already check upend_idx != done_idx and switch to non zerocopy then. So even if all vq->heads were used, we can still does the packet transmission. So remove this check completely. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/vhost/net.c | 13 ------------- 1 files changed, 0 insertions(+), 13 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index a035a89..ed3f165 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -38,8 +38,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 -/* MAX number of TX used buffers for outstanding zerocopy */ -#define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 /* @@ -372,17 +370,6 @@ static void handle_tx(struct vhost_net *net) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { - int num_pends; - - /* If more outstanding DMAs, queue the work. - * Handle upend_idx wrap around - */ - num_pends = likely(nvq->upend_idx >= nvq->done_idx) ? - (nvq->upend_idx - nvq->done_idx) : - (nvq->upend_idx + UIO_MAXIOV - - nvq->done_idx); - if (unlikely(num_pends > VHOST_MAX_PEND)) - break; if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; -- 1.7.1
Michael S. Tsirkin
2013-Aug-16 09:54 UTC
[PATCH 2/6] vhost_net: use vhost_add_used_and_signal_n() in vhost_zerocopy_signal_used()
On Fri, Aug 16, 2013 at 01:16:26PM +0800, Jason Wang wrote:> Switch to use vhost_add_used_and_signal_n() to avoid multiple calls to > vhost_add_used_and_signal(). With the patch we will call at most 2 times > (consider done_idx warp around) compared to N times w/o this patch. > > Signed-off-by: Jason Wang <jasowang at redhat.com>So? Does this help performance then?> --- > drivers/vhost/net.c | 13 ++++++++----- > 1 files changed, 8 insertions(+), 5 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 280ee66..8a6dd0d 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -281,7 +281,7 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, > { > struct vhost_net_virtqueue *nvq > container_of(vq, struct vhost_net_virtqueue, vq); > - int i; > + int i, add; > int j = 0; > > for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { > @@ -289,14 +289,17 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, > vhost_net_tx_err(net); > if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { > vq->heads[i].len = VHOST_DMA_CLEAR_LEN; > - vhost_add_used_and_signal(vq->dev, vq, > - vq->heads[i].id, 0); > ++j; > } else > break; > } > - if (j) > - nvq->done_idx = i; > + while (j) { > + add = min(UIO_MAXIOV - nvq->done_idx, j); > + vhost_add_used_and_signal_n(vq->dev, vq, > + &vq->heads[nvq->done_idx], add); > + nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; > + j -= add; > + } > } > > static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) > -- > 1.7.1
Michael S. Tsirkin
2013-Aug-16 09:56 UTC
[PATCH 3/6] vhost: switch to use vhost_add_used_n()
On Fri, Aug 16, 2013 at 01:16:27PM +0800, Jason Wang wrote:> Let vhost_add_used() to use vhost_add_used_n() to reduce the code duplication. > > Signed-off-by: Jason Wang <jasowang at redhat.com>Does compiler inline it then? Reason I ask, last time I checked put_user inside vhost_add_used was much cheaper than copy_to_user inside vhost_add_used_n, so I wouldn't be surprised if this hurt performance. Did you check?> --- > drivers/vhost/vhost.c | 43 ++----------------------------------------- > 1 files changed, 2 insertions(+), 41 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index e58cf00..c479452 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -1332,48 +1332,9 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc); > * want to notify the guest, using eventfd. */ > int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) > { > - struct vring_used_elem __user *used; > + struct vring_used_elem heads = { head, len }; > > - /* The virtqueue contains a ring of used buffers. Get a pointer to the > - * next entry in that used ring. */ > - used = &vq->used->ring[vq->last_used_idx % vq->num]; > - if (__put_user(head, &used->id)) { > - vq_err(vq, "Failed to write used id"); > - return -EFAULT; > - } > - if (__put_user(len, &used->len)) { > - vq_err(vq, "Failed to write used len"); > - return -EFAULT; > - } > - /* Make sure buffer is written before we update index. */ > - smp_wmb(); > - if (__put_user(vq->last_used_idx + 1, &vq->used->idx)) { > - vq_err(vq, "Failed to increment used idx"); > - return -EFAULT; > - } > - if (unlikely(vq->log_used)) { > - /* Make sure data is seen before log. */ > - smp_wmb(); > - /* Log used ring entry write. */ > - log_write(vq->log_base, > - vq->log_addr + > - ((void __user *)used - (void __user *)vq->used), > - sizeof *used); > - /* Log used index update. */ > - log_write(vq->log_base, > - vq->log_addr + offsetof(struct vring_used, idx), > - sizeof vq->used->idx); > - if (vq->log_ctx) > - eventfd_signal(vq->log_ctx, 1); > - } > - vq->last_used_idx++; > - /* If the driver never bothers to signal in a very long while, > - * used index might wrap around. If that happens, invalidate > - * signalled_used index we stored. TODO: make sure driver > - * signals at least once in 2^16 and remove this. */ > - if (unlikely(vq->last_used_idx == vq->signalled_used)) > - vq->signalled_used_valid = false; > - return 0; > + return vhost_add_used_n(vq, &heads, 1); > } > EXPORT_SYMBOL_GPL(vhost_add_used); > > -- > 1.7.1
Michael S. Tsirkin
2013-Aug-16 10:00 UTC
[PATCH 5/6] vhost_net: poll vhost queue after marking DMA is done
On Fri, Aug 16, 2013 at 01:16:29PM +0800, Jason Wang wrote:> We used to poll vhost queue before making DMA is done, this is racy if vhost > thread were waked up before marking DMA is done which can result the signal to > be missed. Fix this by always poll the vhost thread before DMA is done. > > Signed-off-by: Jason Wang <jasowang at redhat.com>Indeed, but vhost_net_ubuf_put should be the last thing we do: it can cause the device to go away and we'll get a user after free.> --- > drivers/vhost/net.c | 9 +++++---- > 1 files changed, 5 insertions(+), 4 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 70cab75..a035a89 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -308,6 +308,11 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) > struct vhost_virtqueue *vq = ubufs->vq; > int cnt = atomic_read(&ubufs->kref.refcount); > > + /* set len to mark this desc buffers done DMA */ > + vq->heads[ubuf->desc].len = success ? > + VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; > + vhost_net_ubuf_put(ubufs); > + > /* > * Trigger polling thread if guest stopped submitting new buffers: > * in this case, the refcount after decrement will eventually reach 1 > @@ -318,10 +323,6 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) > */ > if (cnt <= 2 || !(cnt % 16)) > vhost_poll_queue(&vq->poll); > - /* set len to mark this desc buffers done DMA */ > - vq->heads[ubuf->desc].len = success ? > - VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; > - vhost_net_ubuf_put(ubufs); > } > > /* Expects to be always run from workqueue - which acts as > -- > 1.7.1
Michael S. Tsirkin
2013-Aug-16 10:02 UTC
[PATCH 6/6] vhost_net: remove the max pending check
On Fri, Aug 16, 2013 at 01:16:30PM +0800, Jason Wang wrote:> We used to limit the max pending DMAs to prevent guest from pinning too many > pages. But this could be removed since: > > - We have the sk_wmem_alloc check in both tun/macvtap to do the same work > - This max pending check were almost useless since it was one done when there's > no new buffers coming from guest. Guest can easily exceeds the limitation. > - We've already check upend_idx != done_idx and switch to non zerocopy then. So > even if all vq->heads were used, we can still does the packet transmission.We can but performance will suffer.> > So remove this check completely. > > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/net.c | 13 ------------- > 1 files changed, 0 insertions(+), 13 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index a035a89..ed3f165 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -38,8 +38,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" > * Using this limit prevents one virtqueue from starving others. */ > #define VHOST_NET_WEIGHT 0x80000 > > -/* MAX number of TX used buffers for outstanding zerocopy */ > -#define VHOST_MAX_PEND 128 > #define VHOST_GOODCOPY_LEN 256 > > /* > @@ -372,17 +370,6 @@ static void handle_tx(struct vhost_net *net) > break; > /* Nothing new? Wait for eventfd to tell us they refilled. */ > if (head == vq->num) { > - int num_pends; > - > - /* If more outstanding DMAs, queue the work. > - * Handle upend_idx wrap around > - */ > - num_pends = likely(nvq->upend_idx >= nvq->done_idx) ? > - (nvq->upend_idx - nvq->done_idx) : > - (nvq->upend_idx + UIO_MAXIOV - > - nvq->done_idx); > - if (unlikely(num_pends > VHOST_MAX_PEND)) > - break; > if (unlikely(vhost_enable_notify(&net->dev, vq))) { > vhost_disable_notify(&net->dev, vq); > continue; > -- > 1.7.1
Possibly Parallel Threads
- [PATCH 0/6] vhost code cleanup and minor enhancement
- [PATCH V2 0/6] vhost code cleanup and minor enhancement
- [PATCH V2 0/6] vhost code cleanup and minor enhancement
- [PATCH V3 0/6] vhost code cleanup and minor enhancement
- [PATCH V3 0/6] vhost code cleanup and minor enhancement