On Fri, Jan 06, 2017 at 10:13:17AM +0800, Jason Wang wrote:
> We can only process 1 packet at one time during sendmsg(). This often
> leads to bad cache utilization under heavy load. So this patch tries to do
> some batching during rx before submitting packets to the host network
> stack. This is done by accepting MSG_MORE as a hint from the
> sendmsg() caller: if it is set, batch the packet temporarily in a
> linked list and submit them all once MSG_MORE is cleared.
>
> Tests were done by pktgen (burst=128) in guest over mlx4(noqueue) on host:
>
> Mpps -+%
> rx-frames = 0 0.91 +0%
> rx-frames = 4 1.00 +9.8%
> rx-frames = 8 1.00 +9.8%
> rx-frames = 16 1.01 +10.9%
> rx-frames = 32 1.07 +17.5%
> rx-frames = 48 1.07 +17.5%
> rx-frames = 64 1.08 +18.6%
> rx-frames = 64 (no MSG_MORE) 0.91 +0%
>
> Users are allowed to change the per-device number of batched packets through
> ethtool -C rx-frames. NAPI_POLL_WEIGHT is used as the upper limit
> to prevent bh from being disabled for too long.
>
> Signed-off-by: Jason Wang <jasowang at redhat.com>
> ---
> drivers/net/tun.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 70 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index cd8e02c..6c93926 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -218,6 +218,7 @@ struct tun_struct {
> struct list_head disabled;
> void *security;
> u32 flow_count;
> + u32 rx_batched;
> struct tun_pcpu_stats __percpu *pcpu_stats;
> };
>
> @@ -522,6 +523,7 @@ static void tun_queue_purge(struct tun_file *tfile)
> while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
> kfree_skb(skb);
>
> + skb_queue_purge(&tfile->sk.sk_write_queue);
> skb_queue_purge(&tfile->sk.sk_error_queue);
> }
>
> @@ -1140,10 +1142,45 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
> return skb;
> }
>
> +static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
> + struct sk_buff *skb, int more)
> +{
> + struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
> + struct sk_buff_head process_queue;
> + u32 rx_batched = tun->rx_batched;
> + bool rcv = false;
> +
> + if (!rx_batched || (!more && skb_queue_empty(queue))) {
> + local_bh_disable();
> + netif_receive_skb(skb);
> + local_bh_enable();
> + return;
> + }
> +
> + spin_lock(&queue->lock);
> + if (!more || skb_queue_len(queue) == rx_batched) {
> + __skb_queue_head_init(&process_queue);
> + skb_queue_splice_tail_init(queue, &process_queue);
> + rcv = true;
> + } else {
> + __skb_queue_tail(queue, skb);
> + }
> + spin_unlock(&queue->lock);
> +
> + if (rcv) {
> + struct sk_buff *nskb;
> + local_bh_disable();
> + while ((nskb = __skb_dequeue(&process_queue)))
> + netif_receive_skb(nskb);
> + netif_receive_skb(skb);
> + local_bh_enable();
> + }
> +}
> +
> /* Get packet from user space buffer */
> static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
> void *msg_control, struct iov_iter *from,
> - int noblock)
> + int noblock, bool more)
> {
> struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
> struct sk_buff *skb;
> @@ -1283,10 +1320,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
> skb_probe_transport_header(skb, 0);
>
> rxhash = skb_get_hash(skb);
> +
> #ifndef CONFIG_4KSTACKS
> - local_bh_disable();
> - netif_receive_skb(skb);
> - local_bh_enable();
> + tun_rx_batched(tun, tfile, skb, more);
> #else
> netif_rx_ni(skb);
> #endif
> @@ -1312,7 +1348,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
> if (!tun)
> return -EBADFD;
>
> - result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK);
> + result = tun_get_user(tun, tfile, NULL, from,
> + file->f_flags & O_NONBLOCK, false);
>
> tun_put(tun);
> return result;
> @@ -1570,7 +1607,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> return -EBADFD;
>
> ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
> - m->msg_flags & MSG_DONTWAIT);
> + m->msg_flags & MSG_DONTWAIT,
> + m->msg_flags & MSG_MORE);
> tun_put(tun);
> return ret;
> }
> @@ -1771,6 +1809,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> tun->align = NET_SKB_PAD;
> tun->filter_attached = false;
> tun->sndbuf = tfile->socket.sk->sk_sndbuf;
> + tun->rx_batched = 0;
>
> tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
> if (!tun->pcpu_stats) {
> @@ -2439,6 +2478,29 @@ static void tun_set_msglevel(struct net_device *dev, u32 value)
> #endif
> }
>
> +static int tun_get_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec)
> +{
> + struct tun_struct *tun = netdev_priv(dev);
> +
> + ec->rx_max_coalesced_frames = tun->rx_batched;
> +
> + return 0;
> +}
> +
> +static int tun_set_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec)
> +{
> + struct tun_struct *tun = netdev_priv(dev);
> +
> + if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
> + return -EINVAL;
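So what should userspace do when this fails? Keep trying smaller values until
one succeeds? I think it's better to just clamp an out-of-range value to
NAPI_POLL_WEIGHT instead of returning -EINVAL, and DTRT (do the right thing) here.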
> +
> + tun->rx_batched = ec->rx_max_coalesced_frames;
> +
> + return 0;
> +}
> +
> static const struct ethtool_ops tun_ethtool_ops = {
> .get_settings = tun_get_settings,
> .get_drvinfo = tun_get_drvinfo,
> @@ -2446,6 +2508,8 @@ static const struct ethtool_ops tun_ethtool_ops = {
> .set_msglevel = tun_set_msglevel,
> .get_link = ethtool_op_get_link,
> .get_ts_info = ethtool_op_get_ts_info,
> + .get_coalesce = tun_get_coalesce,
> + .set_coalesce = tun_set_coalesce,
> };
>
> static int tun_queue_resize(struct tun_struct *tun)
> --
> 2.7.4