Hi all:
This series tries to batch submitting packets to underlayer socket
through msg_control during sendmsg(). This is done by:
1) Doing userspace copy inside vhost_net
2) Build XDP buff
3) Batch at most 64 (VHOST_NET_BATCH) XDP buffs and submit them once
   through msg_control during sendmsg().
4) Underlayer sockets can use XDP buffs directly when XDP is enalbed,
   or build skb based on XDP buff.
For the packet that can not be built easily with XDP or for the case
that batch submission is hard (e.g sndbuf is limited). We will go for
the previous slow path, passing iov iterator to underlayer socket
through sendmsg() once per packet.
This can help to improve cache utilization and avoid lots of indirect
calls with sendmsg(). It can also co-operate with the batching support
of the underlayer sockets (e.g the case of XDP redirection through
maps).
Testpmd(txonly) in guest shows obvious improvements:
Test                /+pps%
XDP_DROP on TAP     /+44.8%
XDP_REDIRECT on TAP /+29%
macvtap (skb)       /+26%
Netperf TCP_STREAM TX from guest shows obvious improvements on small
packet:
    size/session/+thu%/+normalize%
       64/     1/   +2%/    0%
       64/     2/   +3%/   +1%
       64/     4/   +7%/   +5%
       64/     8/   +8%/   +6%
      256/     1/   +3%/    0%
      256/     2/  +10%/   +7%
      256/     4/  +26%/  +22%
      256/     8/  +27%/  +23%
      512/     1/   +3%/   +2%
      512/     2/  +19%/  +14%
      512/     4/  +43%/  +40%
      512/     8/  +45%/  +41%
     1024/     1/   +4%/    0%
     1024/     2/  +27%/  +21%
     1024/     4/  +38%/  +73%
     1024/     8/  +15%/  +24%
     2048/     1/  +10%/   +7%
     2048/     2/  +16%/  +12%
     2048/     4/    0%/   +2%
     2048/     8/    0%/   +2%
     4096/     1/  +36%/  +60%
     4096/     2/  -11%/  -26%
     4096/     4/    0%/  +14%
     4096/     8/    0%/   +4%
    16384/     1/   -1%/   +5%
    16384/     2/    0%/   +2%
    16384/     4/    0%/   -3%
    16384/     8/    0%/   +4%
    65535/     1/    0%/  +10%
    65535/     2/    0%/   +8%
    65535/     4/    0%/   +1%
    65535/     8/    0%/   +3%
Please review.
Thanks
Jason Wang (11):
  net: sock: introduce SOCK_XDP
  tuntap: switch to use XDP_PACKET_HEADROOM
  tuntap: enable bh early during processing XDP
  tuntap: simplify error handling in tun_build_skb()
  tuntap: tweak on the path of non-xdp case in tun_build_skb()
  tuntap: split out XDP logic
  tuntap: move XDP flushing out of tun_do_xdp()
  tun: switch to new type of msg_control
  tuntap: accept an array of XDP buffs through sendmsg()
  tap: accept an array of XDP buffs through sendmsg()
  vhost_net: batch submitting XDP buffers to underlayer sockets
 drivers/net/tap.c      |  87 +++++++++++++-
 drivers/net/tun.c      | 251 +++++++++++++++++++++++++++++++----------
 drivers/vhost/net.c    | 171 +++++++++++++++++++++++++---
 include/linux/if_tun.h |   7 ++
 include/net/sock.h     |   1 +
 5 files changed, 437 insertions(+), 80 deletions(-)
-- 
2.17.1
This patch introduces a new sock flag - SOCK_XDP. This will be used
for notifying the upper layer that XDP program is attached on the
lower socket, and requires for extra headroom.
TUN will be the first user.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tun.c  | 19 +++++++++++++++++++
 include/net/sock.h |  1 +
 2 files changed, 20 insertions(+)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ebd07ad82431..2c548bd20393 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -869,6 +869,9 @@ static int tun_attach(struct tun_struct *tun, struct file
*file,
 		tun_napi_init(tun, tfile, napi);
 	}
 
+	if (rtnl_dereference(tun->xdp_prog))
+		sock_set_flag(&tfile->sk, SOCK_XDP);
+
 	tun_set_real_num_queues(tun);
 
 	/* device is allowed to go away first, so no need to hold extra
@@ -1241,13 +1244,29 @@ static int tun_xdp_set(struct net_device *dev, struct
bpf_prog *prog,
 		       struct netlink_ext_ack *extack)
 {
 	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
 	struct bpf_prog *old_prog;
+	int i;
 
 	old_prog = rtnl_dereference(tun->xdp_prog);
 	rcu_assign_pointer(tun->xdp_prog, prog);
 	if (old_prog)
 		bpf_prog_put(old_prog);
 
+	for (i = 0; i < tun->numqueues; i++) {
+		tfile = rtnl_dereference(tun->tfiles[i]);
+		if (prog)
+			sock_set_flag(&tfile->sk, SOCK_XDP);
+		else
+			sock_reset_flag(&tfile->sk, SOCK_XDP);
+	}
+	list_for_each_entry(tfile, &tun->disabled, next) {
+		if (prog)
+			sock_set_flag(&tfile->sk, SOCK_XDP);
+		else
+			sock_reset_flag(&tfile->sk, SOCK_XDP);
+	}
+
 	return 0;
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 433f45fc2d68..38cae35f6e16 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -800,6 +800,7 @@ enum sock_flags {
 	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
 	SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
 	SOCK_TXTIME,
+	SOCK_XDP, /* XDP is attached */
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL <<
SOCK_TIMESTAMPING_RX_SOFTWARE))
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 02/11] tuntap: switch to use XDP_PACKET_HEADROOM
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tun.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2c548bd20393..d3677a544b56 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -113,7 +113,6 @@ do {								\
 } while (0)
 #endif
 
-#define TUN_HEADROOM 256
 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
 /* TUN device flags */
@@ -1654,7 +1653,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(tun->xdp_prog);
 	if (xdp_prog)
-		pad += TUN_HEADROOM;
+		pad += XDP_PACKET_HEADROOM;
 	buflen += SKB_DATA_ALIGN(len + pad);
 	rcu_read_unlock();
 
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 03/11] tuntap: enable bh early during processing XDP
This patch move the bh enabling a little bit earlier, this will be
used for factoring out the core XDP logic of tuntap.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tun.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d3677a544b56..372caf7d67d9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1726,22 +1726,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 			goto err_xdp;
 		}
 	}
+	rcu_read_unlock();
+	local_bh_enable();
 
 	skb = build_skb(buf, buflen);
-	if (!skb) {
-		rcu_read_unlock();
-		local_bh_enable();
+	if (!skb)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	skb_reserve(skb, pad - delta);
 	skb_put(skb, len);
 	get_page(alloc_frag->page);
 	alloc_frag->offset += buflen;
 
-	rcu_read_unlock();
-	local_bh_enable();
-
 	return skb;
 
 err_redirect:
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 04/11] tuntap: simplify error handling in tun_build_skb()
There's no need to duplicate page get logic in each action. So this
patch tries to get page and calculate the offset before processing XDP
actions, and undo them when meet errors (we don't care the performance
on errors). This will be used for factoring out XDP logic.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tun.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 372caf7d67d9..f8cdcfa392c3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1642,7 +1642,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 				     int len, int *skb_xdp)
 {
 	struct page_frag *alloc_frag = ¤t->task_frag;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	unsigned int delta = 0;
@@ -1668,6 +1668,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 	if (copied != len)
 		return ERR_PTR(-EFAULT);
 
+	get_page(alloc_frag->page);
+	alloc_frag->offset += buflen;
+
 	/* There's a small window that XDP may be set after the check
 	 * of xdp_prog above, this should be rare and for simplicity
 	 * we do XDP on skb in case the headroom is not enough.
@@ -1695,23 +1698,15 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 
 		switch (act) {
 		case XDP_REDIRECT:
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;
 			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
 			xdp_do_flush_map();
 			if (err)
-				goto err_redirect;
-			rcu_read_unlock();
-			local_bh_enable();
-			return NULL;
+				goto err_xdp;
+			goto out;
 		case XDP_TX:
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;
 			if (tun_xdp_tx(tun->dev, &xdp) < 0)
-				goto err_redirect;
-			rcu_read_unlock();
-			local_bh_enable();
-			return NULL;
+				goto err_xdp;
+			goto out;
 		case XDP_PASS:
 			delta = orig_data - xdp.data;
 			len = xdp.data_end - xdp.data;
@@ -1730,23 +1725,23 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 	local_bh_enable();
 
 	skb = build_skb(buf, buflen);
-	if (!skb)
-		return ERR_PTR(-ENOMEM);
+	if (!skb) {
+		skb = ERR_PTR(-ENOMEM);
+		goto out;
+	}
 
 	skb_reserve(skb, pad - delta);
 	skb_put(skb, len);
-	get_page(alloc_frag->page);
-	alloc_frag->offset += buflen;
 
 	return skb;
 
-err_redirect:
-	put_page(alloc_frag->page);
 err_xdp:
+	alloc_frag->offset -= buflen;
+	put_page(alloc_frag->page);
+out:
 	rcu_read_unlock();
 	local_bh_enable();
-	this_cpu_inc(tun->pcpu_stats->rx_dropped);
-	return NULL;
+	return skb;
 }
 
 /* Get packet from user space buffer */
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 05/11] tuntap: tweak on the path of non-xdp case in tun_build_skb()
If we're sure not to go native XDP, there's no need for several things
like bh and rcu stuffs.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tun.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f8cdcfa392c3..389aa0727cc6 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1675,10 +1675,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 	 * of xdp_prog above, this should be rare and for simplicity
 	 * we do XDP on skb in case the headroom is not enough.
 	 */
-	if (hdr->gso_type || !xdp_prog)
+	if (hdr->gso_type || !xdp_prog) {
 		*skb_xdp = 1;
-	else
-		*skb_xdp = 0;
+		goto build;
+	}
+
+	*skb_xdp = 0;
 
 	local_bh_disable();
 	rcu_read_lock();
@@ -1724,6 +1726,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 	rcu_read_unlock();
 	local_bh_enable();
 
+build:
 	skb = build_skb(buf, buflen);
 	if (!skb) {
 		skb = ERR_PTR(-ENOMEM);
-- 
2.17.1
This patch split out XDP logic into a single function. This make it to
be reused by XDP batching path in the following patch.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tun.c | 84 ++++++++++++++++++++++++++++-------------------
 1 file changed, 51 insertions(+), 33 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 389aa0727cc6..21b125020b3b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1635,6 +1635,44 @@ static bool tun_can_build_skb(struct tun_struct *tun,
struct tun_file *tfile,
 	return true;
 }
 
+static u32 tun_do_xdp(struct tun_struct *tun,
+		      struct tun_file *tfile,
+		      struct bpf_prog *xdp_prog,
+		      struct xdp_buff *xdp,
+		      int *err)
+{
+	u32 act = bpf_prog_run_xdp(xdp_prog, xdp);
+
+	switch (act) {
+	case XDP_REDIRECT:
+		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+		xdp_do_flush_map();
+		if (*err)
+			break;
+		goto out;
+	case XDP_TX:
+		*err = tun_xdp_tx(tun->dev, xdp);
+		if (*err < 0)
+			break;
+		*err = 0;
+		goto out;
+	case XDP_PASS:
+		goto out;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		/* fall through */
+	case XDP_ABORTED:
+		trace_xdp_exception(tun->dev, xdp_prog, act);
+		/* fall through */
+	case XDP_DROP:
+		break;
+	}
+
+	put_page(virt_to_head_page(xdp->data_hard_start));
+out:
+	return act;
+}
+
 static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 				     struct tun_file *tfile,
 				     struct iov_iter *from,
@@ -1645,10 +1683,10 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 	struct sk_buff *skb = NULL;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	unsigned int delta = 0;
 	char *buf;
 	size_t copied;
-	int err, pad = TUN_RX_PAD;
+	int pad = TUN_RX_PAD;
+	int err = 0;
 
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(tun->xdp_prog);
@@ -1685,9 +1723,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 	local_bh_disable();
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(tun->xdp_prog);
-	if (xdp_prog && !*skb_xdp) {
+	if (xdp_prog) {
 		struct xdp_buff xdp;
-		void *orig_data;
 		u32 act;
 
 		xdp.data_hard_start = buf;
@@ -1695,33 +1732,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		xdp.rxq = &tfile->xdp_rxq;
-		orig_data = xdp.data;
-		act = bpf_prog_run_xdp(xdp_prog, &xdp);
-
-		switch (act) {
-		case XDP_REDIRECT:
-			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
-			xdp_do_flush_map();
-			if (err)
-				goto err_xdp;
-			goto out;
-		case XDP_TX:
-			if (tun_xdp_tx(tun->dev, &xdp) < 0)
-				goto err_xdp;
-			goto out;
-		case XDP_PASS:
-			delta = orig_data - xdp.data;
-			len = xdp.data_end - xdp.data;
-			break;
-		default:
-			bpf_warn_invalid_xdp_action(act);
-			/* fall through */
-		case XDP_ABORTED:
-			trace_xdp_exception(tun->dev, xdp_prog, act);
-			/* fall through */
-		case XDP_DROP:
+		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
+		if (err)
 			goto err_xdp;
-		}
+		if (act != XDP_PASS)
+			goto out;
+
+		pad = xdp.data - xdp.data_hard_start;
+		len = xdp.data_end - xdp.data;
 	}
 	rcu_read_unlock();
 	local_bh_enable();
@@ -1729,18 +1747,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 build:
 	skb = build_skb(buf, buflen);
 	if (!skb) {
+		put_page(alloc_frag->page);
 		skb = ERR_PTR(-ENOMEM);
 		goto out;
 	}
 
-	skb_reserve(skb, pad - delta);
+	skb_reserve(skb, pad);
 	skb_put(skb, len);
 
 	return skb;
 
 err_xdp:
-	alloc_frag->offset -= buflen;
-	put_page(alloc_frag->page);
+	this_cpu_inc(tun->pcpu_stats->rx_dropped);
 out:
 	rcu_read_unlock();
 	local_bh_enable();
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 07/11] tuntap: move XDP flushing out of tun_do_xdp()
This will allow adding batch flushing on top.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tun.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 21b125020b3b..ff1cbf3ebd50 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1646,7 +1646,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
 	switch (act) {
 	case XDP_REDIRECT:
 		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
-		xdp_do_flush_map();
 		if (*err)
 			break;
 		goto out;
@@ -1735,6 +1734,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct
*tun,
 		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
 		if (err)
 			goto err_xdp;
+
+		if (act == XDP_REDIRECT)
+			xdp_do_flush_map();
 		if (act != XDP_PASS)
 			goto out;
 
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 08/11] tun: switch to new type of msg_control
This patch introduces to a new tun/tap specific msg_control:
#define TUN_MSG_UBUF 1
#define TUN_MSG_PTR  2
struct tun_msg_ctl {
       int type;
       void *ptr;
};
This allows us to pass different kinds of msg_control through
sendmsg(). The first supported type is ubuf (TUN_MSG_UBUF) which will
be used by the existed vhost_net zerocopy code. The second is XDP
buff, which allows vhost_net to pass XDP buff to TUN. This could be
used to implement accepting an array of XDP buffs from vhost_net in
the following patches.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tap.c      | 18 ++++++++++++------
 drivers/net/tun.c      |  6 +++++-
 drivers/vhost/net.c    |  7 +++++--
 include/linux/if_tun.h |  7 +++++++
 4 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index f0f7cd977667..7996ed7cbf18 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -619,7 +619,7 @@ static inline struct sk_buff *tap_alloc_skb(struct sock *sk,
size_t prepad,
 #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
 
 /* Get packet from user space buffer */
-static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
+static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
 			    struct iov_iter *from, int noblock)
 {
 	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
@@ -663,7 +663,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct
msghdr *m,
 	if (unlikely(len < ETH_HLEN))
 		goto err;
 
-	if (m && m->msg_control && sock_flag(&q->sk,
SOCK_ZEROCOPY)) {
+	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
 		struct iov_iter i;
 
 		copylen = vnet_hdr.hdr_len ?
@@ -724,11 +724,11 @@ static ssize_t tap_get_user(struct tap_queue *q, struct
msghdr *m,
 	tap = rcu_dereference(q->tap);
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
-		skb_shinfo(skb)->destructor_arg = m->msg_control;
+		skb_shinfo(skb)->destructor_arg = msg_control;
 		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-	} else if (m && m->msg_control) {
-		struct ubuf_info *uarg = m->msg_control;
+	} else if (msg_control) {
+		struct ubuf_info *uarg = msg_control;
 		uarg->callback(uarg, false);
 	}
 
@@ -1150,7 +1150,13 @@ static int tap_sendmsg(struct socket *sock, struct msghdr
*m,
 		       size_t total_len)
 {
 	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
-	return tap_get_user(q, m, &m->msg_iter, m->msg_flags &
MSG_DONTWAIT);
+	struct tun_msg_ctl *ctl = m->msg_control;
+
+	if (ctl && ctl->type != TUN_MSG_UBUF)
+		return -EINVAL;
+
+	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
+			    m->msg_flags & MSG_DONTWAIT);
 }
 
 static int tap_recvmsg(struct socket *sock, struct msghdr *m,
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ff1cbf3ebd50..c839a4bdcbd9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2429,11 +2429,15 @@ static int tun_sendmsg(struct socket *sock, struct
msghdr *m, size_t total_len)
 	int ret;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = tun_get(tfile);
+	struct tun_msg_ctl *ctl = m->msg_control;
 
 	if (!tun)
 		return -EBADFD;
 
-	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+	if (ctl && ctl->type != TUN_MSG_UBUF)
+		return -EINVAL;
+
+	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT,
 			   m->msg_flags & MSG_MORE);
 	tun_put(tun);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4e656f89cb22..fb01ce6d981c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -620,6 +620,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct
socket *sock)
 		.msg_controllen = 0,
 		.msg_flags = MSG_DONTWAIT,
 	};
+	struct tun_msg_ctl ctl;
 	size_t len, total_len = 0;
 	int err;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -664,8 +665,10 @@ static void handle_tx_zerocopy(struct vhost_net *net,
struct socket *sock)
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
 			refcount_set(&ubuf->refcnt, 1);
-			msg.msg_control = ubuf;
-			msg.msg_controllen = sizeof(ubuf);
+			msg.msg_control = &ctl;
+			ctl.type = TUN_MSG_UBUF;
+			ctl.ptr = ubuf;
+			msg.msg_controllen = sizeof(ctl);
 			ubufs = nvq->ubufs;
 			atomic_inc(&ubufs->refcount);
 			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 3d2996dc7d85..ba46dced1f38 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,13 @@
 
 #define TUN_XDP_FLAG 0x1UL
 
+#define TUN_MSG_UBUF 1
+#define TUN_MSG_PTR  2
+struct tun_msg_ctl {
+	int type;
+	void *ptr;
+};
+
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 09/11] tuntap: accept an array of XDP buffs through sendmsg()
This patch implement TUN_MSG_PTR msg_control type. This type allows
the caller to pass an array of XDP buffs to tuntap through ptr field
of the tun_msg_control. If an XDP program is attached, tuntap can run
XDP program directly. If not, tuntap will build skb and do a fast
receiving since part of the work has been done by vhost_net.
This will avoid lots of indirect calls thus improves the icache
utilization and allows to do XDP batched flushing when doing XDP
redirection.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tun.c | 103 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c839a4bdcbd9..069db2e5dd08 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2424,22 +2424,119 @@ static void tun_sock_write_space(struct sock *sk)
 	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
 }
 
+static int tun_xdp_one(struct tun_struct *tun,
+		       struct tun_file *tfile,
+		       struct xdp_buff *xdp, int *flush)
+{
+	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
+	struct tun_pcpu_stats *stats;
+	struct bpf_prog *xdp_prog;
+	struct sk_buff *skb = NULL;
+	u32 rxhash = 0, act;
+	int buflen = *(int *)xdp->data_hard_start;
+	int err = 0;
+	bool skb_xdp = false;
+
+	xdp_prog = rcu_dereference(tun->xdp_prog);
+	if (xdp_prog) {
+		if (gso->gso_type) {
+			skb_xdp = true;
+			goto build;
+		}
+		xdp_set_data_meta_invalid(xdp);
+		xdp->rxq = &tfile->xdp_rxq;
+		act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
+		if (err)
+			goto out;
+		if (act == XDP_REDIRECT)
+			*flush = true;
+		if (act != XDP_PASS)
+			goto out;
+	}
+
+build:
+	skb = build_skb(xdp->data_hard_start, buflen);
+	if (!skb) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	skb_put(skb, xdp->data_end - xdp->data);
+
+	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+		kfree_skb(skb);
+		err = -EINVAL;
+		goto out;
+	}
+
+	skb->protocol = eth_type_trans(skb, tun->dev);
+	skb_reset_network_header(skb);
+	skb_probe_transport_header(skb, 0);
+
+	if (skb_xdp) {
+		err = do_xdp_generic(xdp_prog, skb);
+		if (err != XDP_PASS)
+			goto out;
+	}
+
+	if (!rcu_dereference(tun->steering_prog))
+		rxhash = __skb_get_hash_symmetric(skb);
+
+	netif_receive_skb(skb);
+
+	stats = get_cpu_ptr(tun->pcpu_stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+	put_cpu_ptr(stats);
+
+	if (rxhash)
+		tun_flow_update(tun, rxhash, tfile);
+
+out:
+	return err;
+}
+
 static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
-	int ret;
+	int ret, i;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = tun_get(tfile);
 	struct tun_msg_ctl *ctl = m->msg_control;
+	struct xdp_buff *xdp;
 
 	if (!tun)
 		return -EBADFD;
 
-	if (ctl && ctl->type != TUN_MSG_UBUF)
-		return -EINVAL;
+	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
+		int n = ctl->type >> 16;
+		int flush = 0;
+
+		local_bh_disable();
+		rcu_read_lock();
+
+		for (i = 0; i < n; i++) {
+			xdp = &((struct xdp_buff *)ctl->ptr)[i];
+			tun_xdp_one(tun, tfile, xdp, &flush);
+		}
+
+		if (flush)
+			xdp_do_flush_map();
+
+		rcu_read_unlock();
+		local_bh_enable();
+
+		ret = total_len;
+		goto out;
+	}
 
 	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT,
 			   m->msg_flags & MSG_MORE);
+out:
 	tun_put(tun);
 	return ret;
 }
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 10/11] tap: accept an array of XDP buffs through sendmsg()
This patch implement TUN_MSG_PTR msg_control type. This type allows
the caller to pass an array of XDP buffs to tuntap through ptr field
of the tun_msg_control. Tap will build skb through those XDP buffers.
This will avoid lots of indirect calls thus improves the icache
utilization and allows to do XDP batched flushing when doing XDP
redirection.
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/net/tap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 7996ed7cbf18..50eb7bf22225 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1146,14 +1146,83 @@ static const struct file_operations tap_fops = {
 #endif
 };
 
+static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
+{
+	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
+	int buflen = *(int *)xdp->data_hard_start;
+	int vnet_hdr_len = 0;
+	struct tap_dev *tap;
+	struct sk_buff *skb;
+	int err, depth;
+
+	if (q->flags & IFF_VNET_HDR)
+		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
+
+	skb = build_skb(xdp->data_hard_start, buflen);
+	if (!skb) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	skb_put(skb, xdp->data_end - xdp->data);
+
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb->protocol = eth_hdr(skb)->h_proto;
+
+	if (vnet_hdr_len) {
+		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
+		if (err)
+			goto err_kfree;
+	}
+
+	skb_probe_transport_header(skb, ETH_HLEN);
+
+	/* Move network header to the right position for VLAN tagged packets */
+	if ((skb->protocol == htons(ETH_P_8021Q) ||
+	     skb->protocol == htons(ETH_P_8021AD)) &&
+	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
+		skb_set_network_header(skb, depth);
+
+	rcu_read_lock();
+	tap = rcu_dereference(q->tap);
+	if (tap) {
+		skb->dev = tap->dev;
+		dev_queue_xmit(skb);
+	} else {
+		kfree_skb(skb);
+	}
+	rcu_read_unlock();
+
+	return 0;
+
+err_kfree:
+	kfree_skb(skb);
+err:
+	rcu_read_lock();
+		tap = rcu_dereference(q->tap);
+	if (tap && tap->count_tx_dropped)
+		tap->count_tx_dropped(tap);
+	rcu_read_unlock();
+	return err;
+}
+
 static int tap_sendmsg(struct socket *sock, struct msghdr *m,
 		       size_t total_len)
 {
 	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
 	struct tun_msg_ctl *ctl = m->msg_control;
+	struct xdp_buff *xdp;
+	int i;
 
-	if (ctl && ctl->type != TUN_MSG_UBUF)
-		return -EINVAL;
+	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
+		for (i = 0; i < ctl->type >> 16; i++) {
+			xdp = &((struct xdp_buff *)ctl->ptr)[i];
+			tap_get_user_xdp(q, xdp);
+		}
+		return 0;
+	}
 
 	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			    m->msg_flags & MSG_DONTWAIT);
-- 
2.17.1
Jason Wang
2018-Sep-06  04:05 UTC
[PATCH net-next 11/11] vhost_net: batch submitting XDP buffers to underlayer sockets
This patch implements XDP batching for vhost_net. The idea is first to
try to do userspace copy and build XDP buff directly in vhost. Instead
of submitting the packet immediately, vhost_net will batch them in an
array and submit every 64 (VHOST_NET_BATCH) packets to the under layer
sockets through msg_control of sendmsg().
When XDP is enabled on the TUN/TAP, TUN/TAP can process XDP inside a
loop without caring GUP thus it can do batch map flushing. When XDP is
not enabled or not supported, the underlayer socket need to build skb
and pass it to network core. The batched packet submission allows us
to do batching like netif_receive_skb_list() in the future.
This saves lots of indirect calls for better cache utilization. For
the case that we can't so batching e.g when sndbuf is limited or
packet size is too large, we will go for usual one packet per
sendmsg() way.
Doing testpmd on various setups gives us:
Test                /+pps%
XDP_DROP on TAP     /+44.8%
XDP_REDIRECT on TAP /+29%
macvtap (skb)       /+26%
Netperf tests shows obvious improvements for small packet transmission:
size/session/+thu%/+normalize%
   64/     1/   +2%/    0%
   64/     2/   +3%/   +1%
   64/     4/   +7%/   +5%
   64/     8/   +8%/   +6%
  256/     1/   +3%/    0%
  256/     2/  +10%/   +7%
  256/     4/  +26%/  +22%
  256/     8/  +27%/  +23%
  512/     1/   +3%/   +2%
  512/     2/  +19%/  +14%
  512/     4/  +43%/  +40%
  512/     8/  +45%/  +41%
 1024/     1/   +4%/    0%
 1024/     2/  +27%/  +21%
 1024/     4/  +38%/  +73%
 1024/     8/  +15%/  +24%
 2048/     1/  +10%/   +7%
 2048/     2/  +16%/  +12%
 2048/     4/    0%/   +2%
 2048/     8/    0%/   +2%
 4096/     1/  +36%/  +60%
 4096/     2/  -11%/  -26%
 4096/     4/    0%/  +14%
 4096/     8/    0%/   +4%
16384/     1/   -1%/   +5%
16384/     2/    0%/   +2%
16384/     4/    0%/   -3%
16384/     8/    0%/   +4%
65535/     1/    0%/  +10%
65535/     2/    0%/   +8%
65535/     4/    0%/   +1%
65535/     8/    0%/   +3%
Signed-off-by: Jason Wang <jasowang at redhat.com>
---
 drivers/vhost/net.c | 164 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 151 insertions(+), 13 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index fb01ce6d981c..1dd4239cbff8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -116,6 +116,7 @@ struct vhost_net_virtqueue {
 	 * For RX, number of batched heads
 	 */
 	int done_idx;
+	int batched_xdp;
 	/* an array of userspace buffers info */
 	struct ubuf_info *ubuf_info;
 	/* Reference counting for outstanding ubufs.
@@ -123,6 +124,7 @@ struct vhost_net_virtqueue {
 	struct vhost_net_ubuf_ref *ubufs;
 	struct ptr_ring *rx_ring;
 	struct vhost_net_buf rxq;
+	struct xdp_buff xdp[VHOST_NET_BATCH];
 };
 
 struct vhost_net {
@@ -338,6 +340,11 @@ static bool vhost_sock_zcopy(struct socket *sock)
 		sock_flag(sock->sk, SOCK_ZEROCOPY);
 }
 
+static bool vhost_sock_xdp(struct socket *sock)
+{
+	return sock_flag(sock->sk, SOCK_XDP);
+}
+
 /* In case of DMA done not in order in lower device driver for some reason.
  * upend_idx is used to track end of used idx, done_idx is used to track head
  * of used idx. Once lower device DMA done contiguously, we will signal KVM
@@ -444,10 +451,36 @@ static void vhost_net_signal_used(struct
vhost_net_virtqueue *nvq)
 	nvq->done_idx = 0;
 }
 
+static void vhost_tx_batch(struct vhost_net *net,
+			   struct vhost_net_virtqueue *nvq,
+			   struct socket *sock,
+			   struct msghdr *msghdr)
+{
+	struct tun_msg_ctl ctl = {
+		.type = nvq->batched_xdp << 16 | TUN_MSG_PTR,
+		.ptr = nvq->xdp,
+	};
+	int err;
+
+	if (nvq->batched_xdp == 0)
+		goto signal_used;
+
+	msghdr->msg_control = &ctl;
+	err = sock->ops->sendmsg(sock, msghdr, 0);
+	if (unlikely(err < 0)) {
+		vq_err(&nvq->vq, "Fail to batch sending packets\n");
+		return;
+	}
+
+signal_used:
+	vhost_net_signal_used(nvq);
+	nvq->batched_xdp = 0;
+}
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_net_virtqueue *nvq,
 				    unsigned int *out_num, unsigned int *in_num,
-				    bool *busyloop_intr)
+				    struct msghdr *msghdr, bool *busyloop_intr)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned long uninitialized_var(endtime);
@@ -455,8 +488,9 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				  out_num, in_num, NULL, NULL);
 
 	if (r == vq->num && vq->busyloop_timeout) {
+		/* Flush batched packets first */
 		if (!vhost_sock_zcopy(vq->private_data))
-			vhost_net_signal_used(nvq);
+			vhost_tx_batch(net, nvq, vq->private_data, msghdr);
 		preempt_disable();
 		endtime = busy_clock() + vq->busyloop_timeout;
 		while (vhost_can_busy_poll(endtime)) {
@@ -512,7 +546,7 @@ static int get_tx_bufs(struct vhost_net *net,
 	struct vhost_virtqueue *vq = &nvq->vq;
 	int ret;
 
-	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
+	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
 
 	if (ret < 0 || ret == vq->num)
 		return ret;
@@ -540,6 +574,83 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t
total_len)
 	       !vhost_vq_avail_empty(vq->dev, vq);
 }
 
+#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
+static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
+			       struct iov_iter *from)
+{
+	struct vhost_virtqueue *vq = &nvq->vq;
+	struct socket *sock = vq->private_data;
+	struct page_frag *alloc_frag = ¤t->task_frag;
+	struct virtio_net_hdr *gso;
+	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
+	size_t len = iov_iter_count(from);
+	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
+	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
+	int sock_hlen = nvq->sock_hlen;
+	void *buf;
+	int copied;
+
+	if (unlikely(len < nvq->sock_hlen))
+		return -EFAULT;
+
+	if (SKB_DATA_ALIGN(len + pad) +
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+		return -ENOSPC;
+
+	buflen += SKB_DATA_ALIGN(len + pad);
+	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+		return -ENOMEM;
+
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+
+	/* We store two kinds of metadata in the header which will be
+	 * used for XDP_PASS to do build_skb():
+	 * offset 0: buflen
+	 * offset sizeof(int): vnet header
+	 */
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset + sizeof(int),
+				     sock_hlen, from);
+	if (copied != sock_hlen)
+		return -EFAULT;
+
+	gso = (struct virtio_net_hdr *)(buf + sizeof(int));
+
+	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+	    vhost16_to_cpu(vq, gso->csum_start) +
+	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
+	    vhost16_to_cpu(vq, gso->hdr_len)) {
+		gso->hdr_len = cpu_to_vhost16(vq,
+			       vhost16_to_cpu(vq, gso->csum_start) +
+			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
+
+		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
+			return -EINVAL;
+	}
+
+	len -= sock_hlen;
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset + pad,
+				     len, from);
+	if (copied != len)
+		return -EFAULT;
+
+	xdp->data_hard_start = buf;
+	xdp->data = buf + pad;
+	xdp->data_end = xdp->data + len;
+	*(int *)(xdp->data_hard_start) = buflen;
+
+	get_page(alloc_frag->page);
+	alloc_frag->offset += buflen;
+
+	++nvq->batched_xdp;
+
+	return 0;
+}
+
 static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -556,10 +667,14 @@ static void handle_tx_copy(struct vhost_net *net, struct
socket *sock)
 	size_t len, total_len = 0;
 	int err;
 	int sent_pkts = 0;
+	bool bulking = (sock->sk->sk_sndbuf == INT_MAX);
 
 	for (;;) {
 		bool busyloop_intr = false;
 
+		if (nvq->done_idx == VHOST_NET_BATCH)
+			vhost_tx_batch(net, nvq, sock, &msg);
+
 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
 				   &busyloop_intr);
 		/* On error, stop handling until the next kick. */
@@ -577,14 +692,34 @@ static void handle_tx_copy(struct vhost_net *net, struct
socket *sock)
 			break;
 		}
 
-		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
-		vq->heads[nvq->done_idx].len = 0;
-
 		total_len += len;
-		if (tx_can_batch(vq, total_len))
-			msg.msg_flags |= MSG_MORE;
-		else
-			msg.msg_flags &= ~MSG_MORE;
+
+		/* For simplicity, TX batching is only enabled if
+		 * sndbuf is unlimited.
+		 */
+		if (bulking) {
+			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
+			if (!err) {
+				goto done;
+			} else if (unlikely(err != -ENOSPC)) {
+				vhost_tx_batch(net, nvq, sock, &msg);
+				vhost_discard_vq_desc(vq, 1);
+				vhost_net_enable_vq(net, vq);
+				break;
+			}
+
+			/* We can't build XDP buff, go for single
+			 * packet path but let's flush batched
+			 * packets.
+			 */
+			vhost_tx_batch(net, nvq, sock, &msg);
+			msg.msg_control = NULL;
+		} else {
+			if (tx_can_batch(vq, total_len))
+				msg.msg_flags |= MSG_MORE;
+			else
+				msg.msg_flags &= ~MSG_MORE;
+		}
 
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(sock, &msg, len);
@@ -596,15 +731,17 @@ static void handle_tx_copy(struct vhost_net *net, struct
socket *sock)
 		if (err != len)
 			pr_debug("Truncated TX packet: len %d != %zd\n",
 				 err, len);
-		if (++nvq->done_idx >= VHOST_NET_BATCH)
-			vhost_net_signal_used(nvq);
+done:
+		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+		vq->heads[nvq->done_idx].len = 0;
+		++nvq->done_idx;
 		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
 	}
 
-	vhost_net_signal_used(nvq);
+	vhost_tx_batch(net, nvq, sock, &msg);
 }
 
 static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
@@ -1111,6 +1248,7 @@ static int vhost_net_open(struct inode *inode, struct file
*f)
 		n->vqs[i].ubuf_info = NULL;
 		n->vqs[i].upend_idx = 0;
 		n->vqs[i].done_idx = 0;
+		n->vqs[i].batched_xdp = 0;
 		n->vqs[i].vhost_hlen = 0;
 		n->vqs[i].sock_hlen = 0;
 		n->vqs[i].rx_ring = NULL;
-- 
2.17.1
Michael S. Tsirkin
2018-Sep-06  16:46 UTC
[PATCH net-next 11/11] vhost_net: batch submitting XDP buffers to underlayer sockets
On Thu, Sep 06, 2018 at 12:05:26PM +0800, Jason Wang wrote:> This patch implements XDP batching for vhost_net. The idea is first to > try to do userspace copy and build XDP buff directly in vhost. Instead > of submitting the packet immediately, vhost_net will batch them in an > array and submit every 64 (VHOST_NET_BATCH) packets to the under layer > sockets through msg_control of sendmsg(). > > When XDP is enabled on the TUN/TAP, TUN/TAP can process XDP inside a > loop without caring GUP thus it can do batch map flushing. When XDP is > not enabled or not supported, the underlayer socket need to build skb > and pass it to network core. The batched packet submission allows us > to do batching like netif_receive_skb_list() in the future. > > This saves lots of indirect calls for better cache utilization. For > the case that we can't so batching e.g when sndbuf is limited or > packet size is too large, we will go for usual one packet per > sendmsg() way. > > Doing testpmd on various setups gives us: > > Test /+pps% > XDP_DROP on TAP /+44.8% > XDP_REDIRECT on TAP /+29% > macvtap (skb) /+26% > > Netperf tests shows obvious improvements for small packet transmission: > > size/session/+thu%/+normalize% > 64/ 1/ +2%/ 0% > 64/ 2/ +3%/ +1% > 64/ 4/ +7%/ +5% > 64/ 8/ +8%/ +6% > 256/ 1/ +3%/ 0% > 256/ 2/ +10%/ +7% > 256/ 4/ +26%/ +22% > 256/ 8/ +27%/ +23% > 512/ 1/ +3%/ +2% > 512/ 2/ +19%/ +14% > 512/ 4/ +43%/ +40% > 512/ 8/ +45%/ +41% > 1024/ 1/ +4%/ 0% > 1024/ 2/ +27%/ +21% > 1024/ 4/ +38%/ +73% > 1024/ 8/ +15%/ +24% > 2048/ 1/ +10%/ +7% > 2048/ 2/ +16%/ +12% > 2048/ 4/ 0%/ +2% > 2048/ 8/ 0%/ +2% > 4096/ 1/ +36%/ +60% > 4096/ 2/ -11%/ -26% > 4096/ 4/ 0%/ +14% > 4096/ 8/ 0%/ +4% > 16384/ 1/ -1%/ +5% > 16384/ 2/ 0%/ +2% > 16384/ 4/ 0%/ -3% > 16384/ 8/ 0%/ +4% > 65535/ 1/ 0%/ +10% > 65535/ 2/ 0%/ +8% > 65535/ 4/ 0%/ +1% > 65535/ 8/ 0%/ +3% > > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/vhost/net.c | 164 ++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 151 insertions(+), 13 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index fb01ce6d981c..1dd4239cbff8 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -116,6 +116,7 @@ struct vhost_net_virtqueue { > * For RX, number of batched heads > */ > int done_idx; > + int batched_xdp;Pls add a comment documenting what does this new field do.> /* an array of userspace buffers info */ > struct ubuf_info *ubuf_info; > /* Reference counting for outstanding ubufs. > @@ -123,6 +124,7 @@ struct vhost_net_virtqueue { > struct vhost_net_ubuf_ref *ubufs; > struct ptr_ring *rx_ring; > struct vhost_net_buf rxq; > + struct xdp_buff xdp[VHOST_NET_BATCH];This is a bit too much to have inline. 64 entries 48 bytes each, and we have 2 of these structures, that's already >4K.> }; > > struct vhost_net { > @@ -338,6 +340,11 @@ static bool vhost_sock_zcopy(struct socket *sock) > sock_flag(sock->sk, SOCK_ZEROCOPY); > } > > +static bool vhost_sock_xdp(struct socket *sock) > +{ > + return sock_flag(sock->sk, SOCK_XDP);what if an xdp program is attached while this processing is going on? Flag value will be wrong - is this safe and why? Pls add a comment.> +} > + > /* In case of DMA done not in order in lower device driver for some reason. > * upend_idx is used to track end of used idx, done_idx is used to track head > * of used idx. Once lower device DMA done contiguously, we will signal KVM > @@ -444,10 +451,36 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) > nvq->done_idx = 0; > } > > +static void vhost_tx_batch(struct vhost_net *net, > + struct vhost_net_virtqueue *nvq, > + struct socket *sock, > + struct msghdr *msghdr) > +{ > + struct tun_msg_ctl ctl = { > + .type = nvq->batched_xdp << 16 | TUN_MSG_PTR, > + .ptr = nvq->xdp, > + }; > + int err; > + > + if (nvq->batched_xdp == 0) > + goto signal_used; > + > + msghdr->msg_control = &ctl; > + err = sock->ops->sendmsg(sock, msghdr, 0); > + if (unlikely(err < 0)) { > + vq_err(&nvq->vq, "Fail to batch sending packets\n"); > + return; > + } > + > +signal_used: > + vhost_net_signal_used(nvq); > + nvq->batched_xdp = 0; > +} > + > static int vhost_net_tx_get_vq_desc(struct vhost_net *net, > struct vhost_net_virtqueue *nvq, > unsigned int *out_num, unsigned int *in_num, > - bool *busyloop_intr) > + struct msghdr *msghdr, bool *busyloop_intr) > { > struct vhost_virtqueue *vq = &nvq->vq; > unsigned long uninitialized_var(endtime); > @@ -455,8 +488,9 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, > out_num, in_num, NULL, NULL); > > if (r == vq->num && vq->busyloop_timeout) { > + /* Flush batched packets first */ > if (!vhost_sock_zcopy(vq->private_data)) > - vhost_net_signal_used(nvq); > + vhost_tx_batch(net, nvq, vq->private_data, msghdr); > preempt_disable(); > endtime = busy_clock() + vq->busyloop_timeout; > while (vhost_can_busy_poll(endtime)) { > @@ -512,7 +546,7 @@ static int get_tx_bufs(struct vhost_net *net, > struct vhost_virtqueue *vq = &nvq->vq; > int ret; > > - ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr); > + ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); > > if (ret < 0 || ret == vq->num) > return ret; > @@ -540,6 +574,83 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) > !vhost_vq_avail_empty(vq->dev, vq); > } > > +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)I wonder whether NET_IP_ALIGN make sense for XDP.> + > +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, > + struct iov_iter *from) > +{ > + struct vhost_virtqueue *vq = &nvq->vq; > + struct socket *sock = vq->private_data; > + struct page_frag *alloc_frag = ¤t->task_frag; > + struct virtio_net_hdr *gso; > + struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; > + size_t len = iov_iter_count(from); > + int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0; > + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > + int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); > + int sock_hlen = nvq->sock_hlen; > + void *buf; > + int copied; > + > + if (unlikely(len < nvq->sock_hlen)) > + return -EFAULT; > + > + if (SKB_DATA_ALIGN(len + pad) + > + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) > + return -ENOSPC; > + > + buflen += SKB_DATA_ALIGN(len + pad); > + alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); > + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) > + return -ENOMEM; > + > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > + > + /* We store two kinds of metadata in the header which will be > + * used for XDP_PASS to do build_skb(): > + * offset 0: buflen > + * offset sizeof(int): vnet headerDefine a struct for the metadata then?> + */ > + copied = copy_page_from_iter(alloc_frag->page, > + alloc_frag->offset + sizeof(int), > + sock_hlen, from); > + if (copied != sock_hlen) > + return -EFAULT; > + > + gso = (struct virtio_net_hdr *)(buf + sizeof(int)); > + > + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && > + vhost16_to_cpu(vq, gso->csum_start) + > + vhost16_to_cpu(vq, gso->csum_offset) + 2 > > + vhost16_to_cpu(vq, gso->hdr_len)) { > + gso->hdr_len = cpu_to_vhost16(vq, > + vhost16_to_cpu(vq, gso->csum_start) + > + vhost16_to_cpu(vq, gso->csum_offset) + 2); > + > + if (vhost16_to_cpu(vq, gso->hdr_len) > len) > + return -EINVAL; > + } > + > + len -= sock_hlen; > + copied = copy_page_from_iter(alloc_frag->page, > + alloc_frag->offset + pad, > + len, from); > + if (copied != len) > + return -EFAULT; > + > + xdp->data_hard_start = buf; > + xdp->data = buf + pad; > + xdp->data_end = xdp->data + len; > + *(int *)(xdp->data_hard_start) = buflen; > + > + get_page(alloc_frag->page); > + alloc_frag->offset += buflen; > + > + ++nvq->batched_xdp; > + > + return 0; > +} > + > static void handle_tx_copy(struct vhost_net *net, struct socket *sock) > { > struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; > @@ -556,10 +667,14 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) > size_t len, total_len = 0; > int err; > int sent_pkts = 0; > + bool bulking = (sock->sk->sk_sndbuf == INT_MAX);What does bulking mean?> > for (;;) { > bool busyloop_intr = false; > > + if (nvq->done_idx == VHOST_NET_BATCH) > + vhost_tx_batch(net, nvq, sock, &msg); > + > head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, > &busyloop_intr); > /* On error, stop handling until the next kick. */ > @@ -577,14 +692,34 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) > break; > } > > - vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); > - vq->heads[nvq->done_idx].len = 0; > - > total_len += len; > - if (tx_can_batch(vq, total_len)) > - msg.msg_flags |= MSG_MORE; > - else > - msg.msg_flags &= ~MSG_MORE; > + > + /* For simplicity, TX batching is only enabled if > + * sndbuf is unlimited.What if sndbuf changes while this processing is going on?> + */ > + if (bulking) { > + err = vhost_net_build_xdp(nvq, &msg.msg_iter); > + if (!err) { > + goto done; > + } else if (unlikely(err != -ENOSPC)) { > + vhost_tx_batch(net, nvq, sock, &msg); > + vhost_discard_vq_desc(vq, 1); > + vhost_net_enable_vq(net, vq); > + break; > + } > + > + /* We can't build XDP buff, go for single > + * packet path but let's flush batched > + * packets. > + */ > + vhost_tx_batch(net, nvq, sock, &msg); > + msg.msg_control = NULL; > + } else { > + if (tx_can_batch(vq, total_len)) > + msg.msg_flags |= MSG_MORE; > + else > + msg.msg_flags &= ~MSG_MORE; > + } > > /* TODO: Check specific error and bomb out unless ENOBUFS? */ > err = sock->ops->sendmsg(sock, &msg, len); > @@ -596,15 +731,17 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) > if (err != len) > pr_debug("Truncated TX packet: len %d != %zd\n", > err, len); > - if (++nvq->done_idx >= VHOST_NET_BATCH) > - vhost_net_signal_used(nvq); > +done: > + vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); > + vq->heads[nvq->done_idx].len = 0; > + ++nvq->done_idx; > if (vhost_exceeds_weight(++sent_pkts, total_len)) { > vhost_poll_queue(&vq->poll); > break; > } > } > > - vhost_net_signal_used(nvq); > + vhost_tx_batch(net, nvq, sock, &msg); > } > > static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) > @@ -1111,6 +1248,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) > n->vqs[i].ubuf_info = NULL; > n->vqs[i].upend_idx = 0; > n->vqs[i].done_idx = 0; > + n->vqs[i].batched_xdp = 0; > n->vqs[i].vhost_hlen = 0; > n->vqs[i].sock_hlen = 0; > n->vqs[i].rx_ring = NULL; > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  16:54 UTC
[PATCH net-next 08/11] tun: switch to new type of msg_control
On Thu, Sep 06, 2018 at 12:05:23PM +0800, Jason Wang wrote:> This patch introduces to a new tun/tap specific msg_control: > > #define TUN_MSG_UBUF 1 > #define TUN_MSG_PTR 2 > struct tun_msg_ctl { > int type; > void *ptr; > }; > > This allows us to pass different kinds of msg_control through > sendmsg(). The first supported type is ubuf (TUN_MSG_UBUF) which will > be used by the existed vhost_net zerocopy code. The second is XDP > buff, which allows vhost_net to pass XDP buff to TUN. This could be > used to implement accepting an array of XDP buffs from vhost_net in > the following patches. > > Signed-off-by: Jason Wang <jasowang at redhat.com>At this point, do we want to just add a new sock opt for tap's benefit? Seems cleaner than (ab)using sendmsg.> --- > drivers/net/tap.c | 18 ++++++++++++------ > drivers/net/tun.c | 6 +++++- > drivers/vhost/net.c | 7 +++++-- > include/linux/if_tun.h | 7 +++++++ > 4 files changed, 29 insertions(+), 9 deletions(-) > > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index f0f7cd977667..7996ed7cbf18 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -619,7 +619,7 @@ static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad, > #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN) > > /* Get packet from user space buffer */ > -static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m, > +static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, > struct iov_iter *from, int noblock) > { > int good_linear = SKB_MAX_HEAD(TAP_RESERVE); > @@ -663,7 +663,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m, > if (unlikely(len < ETH_HLEN)) > goto err; > > - if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { > + if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { > struct iov_iter i; > > copylen = vnet_hdr.hdr_len ? > @@ -724,11 +724,11 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m, > tap = rcu_dereference(q->tap); > /* copy skb_ubuf_info for callback when skb has no error */ > if (zerocopy) { > - skb_shinfo(skb)->destructor_arg = m->msg_control; > + skb_shinfo(skb)->destructor_arg = msg_control; > skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; > skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; > - } else if (m && m->msg_control) { > - struct ubuf_info *uarg = m->msg_control; > + } else if (msg_control) { > + struct ubuf_info *uarg = msg_control; > uarg->callback(uarg, false); > } > > @@ -1150,7 +1150,13 @@ static int tap_sendmsg(struct socket *sock, struct msghdr *m, > size_t total_len) > { > struct tap_queue *q = container_of(sock, struct tap_queue, sock); > - return tap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); > + struct tun_msg_ctl *ctl = m->msg_control; > + > + if (ctl && ctl->type != TUN_MSG_UBUF) > + return -EINVAL; > + > + return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, > + m->msg_flags & MSG_DONTWAIT); > } > > static int tap_recvmsg(struct socket *sock, struct msghdr *m, > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index ff1cbf3ebd50..c839a4bdcbd9 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -2429,11 +2429,15 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) > int ret; > struct tun_file *tfile = container_of(sock, struct tun_file, socket); > struct tun_struct *tun = tun_get(tfile); > + struct tun_msg_ctl *ctl = m->msg_control; > > if (!tun) > return -EBADFD; > > - ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, > + if (ctl && ctl->type != TUN_MSG_UBUF) > + return -EINVAL; > + > + ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, > m->msg_flags & MSG_DONTWAIT, > m->msg_flags & MSG_MORE); > tun_put(tun); > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 4e656f89cb22..fb01ce6d981c 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -620,6 +620,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) > .msg_controllen = 0, > .msg_flags = MSG_DONTWAIT, > }; > + struct tun_msg_ctl ctl; > size_t len, total_len = 0; > int err; > struct vhost_net_ubuf_ref *uninitialized_var(ubufs); > @@ -664,8 +665,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) > ubuf->ctx = nvq->ubufs; > ubuf->desc = nvq->upend_idx; > refcount_set(&ubuf->refcnt, 1); > - msg.msg_control = ubuf; > - msg.msg_controllen = sizeof(ubuf); > + msg.msg_control = &ctl; > + ctl.type = TUN_MSG_UBUF; > + ctl.ptr = ubuf; > + msg.msg_controllen = sizeof(ctl); > ubufs = nvq->ubufs; > atomic_inc(&ubufs->refcount); > nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; > diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h > index 3d2996dc7d85..ba46dced1f38 100644 > --- a/include/linux/if_tun.h > +++ b/include/linux/if_tun.h > @@ -19,6 +19,13 @@ > > #define TUN_XDP_FLAG 0x1UL > > +#define TUN_MSG_UBUF 1 > +#define TUN_MSG_PTR 2Looks like TUN_MSG_PTR should be pushed out to a follow-up patch?> +struct tun_msg_ctl { > + int type; > + void *ptr; > +}; > +type actually includes a size. Why not two short fields then?> #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) > struct socket *tun_get_socket(struct file *); > struct ptr_ring *tun_get_tx_ring(struct file *file); > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  16:56 UTC
[PATCH net-next 01/11] net: sock: introduce SOCK_XDP
On Thu, Sep 06, 2018 at 12:05:16PM +0800, Jason Wang wrote:> This patch introduces a new sock flag - SOCK_XDP. This will be used > for notifying the upper layer that XDP program is attached on the > lower socket, and requires for extra headroom. > > TUN will be the first user. > > Signed-off-by: Jason Wang <jasowang at redhat.com>In fact vhost is the 1st user, right? So this can be pushed out to become patch 10/11.> --- > drivers/net/tun.c | 19 +++++++++++++++++++ > include/net/sock.h | 1 + > 2 files changed, 20 insertions(+) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index ebd07ad82431..2c548bd20393 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -869,6 +869,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file, > tun_napi_init(tun, tfile, napi); > } > > + if (rtnl_dereference(tun->xdp_prog)) > + sock_set_flag(&tfile->sk, SOCK_XDP); > + > tun_set_real_num_queues(tun); > > /* device is allowed to go away first, so no need to hold extra > @@ -1241,13 +1244,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, > struct netlink_ext_ack *extack) > { > struct tun_struct *tun = netdev_priv(dev); > + struct tun_file *tfile; > struct bpf_prog *old_prog; > + int i; > > old_prog = rtnl_dereference(tun->xdp_prog); > rcu_assign_pointer(tun->xdp_prog, prog); > if (old_prog) > bpf_prog_put(old_prog); > > + for (i = 0; i < tun->numqueues; i++) { > + tfile = rtnl_dereference(tun->tfiles[i]); > + if (prog) > + sock_set_flag(&tfile->sk, SOCK_XDP); > + else > + sock_reset_flag(&tfile->sk, SOCK_XDP); > + } > + list_for_each_entry(tfile, &tun->disabled, next) { > + if (prog) > + sock_set_flag(&tfile->sk, SOCK_XDP); > + else > + sock_reset_flag(&tfile->sk, SOCK_XDP); > + } > + > return 0; > } > > diff --git a/include/net/sock.h b/include/net/sock.h > index 433f45fc2d68..38cae35f6e16 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -800,6 +800,7 @@ enum sock_flags { > SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ > SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ > SOCK_TXTIME, > + SOCK_XDP, /* XDP is attached */ > }; > > #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  16:57 UTC
[PATCH net-next 02/11] tuntap: switch to use XDP_PACKET_HEADROOM
On Thu, Sep 06, 2018 at 12:05:17PM +0800, Jason Wang wrote:> Signed-off-by: Jason Wang <jasowang at redhat.com>Acked-by: Michael S. Tsirkin <mst at redhat.com> any idea why didn't we do this straight away?> --- > drivers/net/tun.c | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 2c548bd20393..d3677a544b56 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -113,7 +113,6 @@ do { \ > } while (0) > #endif > > -#define TUN_HEADROOM 256 > #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) > > /* TUN device flags */ > @@ -1654,7 +1653,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > rcu_read_lock(); > xdp_prog = rcu_dereference(tun->xdp_prog); > if (xdp_prog) > - pad += TUN_HEADROOM; > + pad += XDP_PACKET_HEADROOM; > buflen += SKB_DATA_ALIGN(len + pad); > rcu_read_unlock(); > > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  17:02 UTC
[PATCH net-next 03/11] tuntap: enable bh early during processing XDP
On Thu, Sep 06, 2018 at 12:05:18PM +0800, Jason Wang wrote:> This patch move the bh enabling a little bit earlier, this will be > used for factoring out the core XDP logic of tuntap. > > Signed-off-by: Jason Wang <jasowang at redhat.com>Acked-by: Michael S. Tsirkin <mst at redhat.com> no reason to disable bh for non-xdp things.> --- > drivers/net/tun.c | 10 +++------- > 1 file changed, 3 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index d3677a544b56..372caf7d67d9 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -1726,22 +1726,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > goto err_xdp; > } > } > + rcu_read_unlock(); > + local_bh_enable(); > > skb = build_skb(buf, buflen); > - if (!skb) { > - rcu_read_unlock(); > - local_bh_enable(); > + if (!skb) > return ERR_PTR(-ENOMEM); > - } > > skb_reserve(skb, pad - delta); > skb_put(skb, len); > get_page(alloc_frag->page); > alloc_frag->offset += buflen; > > - rcu_read_unlock(); > - local_bh_enable(); > - > return skb; > > err_redirect: > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  17:14 UTC
[PATCH net-next 04/11] tuntap: simplify error handling in tun_build_skb()
On Thu, Sep 06, 2018 at 12:05:19PM +0800, Jason Wang wrote:> There's no need to duplicate page get logic in each action. So this > patch tries to get page and calculate the offset before processing XDP > actions, and undo them when meet errors (we don't care the performance > on errors). This will be used for factoring out XDP logic. > > Signed-off-by: Jason Wang <jasowang at redhat.com>I see some issues with this one.> --- > drivers/net/tun.c | 37 ++++++++++++++++--------------------- > 1 file changed, 16 insertions(+), 21 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 372caf7d67d9..f8cdcfa392c3 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -1642,7 +1642,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > int len, int *skb_xdp) > { > struct page_frag *alloc_frag = ¤t->task_frag; > - struct sk_buff *skb; > + struct sk_buff *skb = NULL; > struct bpf_prog *xdp_prog; > int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > unsigned int delta = 0; > @@ -1668,6 +1668,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > if (copied != len) > return ERR_PTR(-EFAULT); > > + get_page(alloc_frag->page); > + alloc_frag->offset += buflen; > +This adds an atomic op on XDP_DROP which is a data path operation for some workloads.> /* There's a small window that XDP may be set after the check > * of xdp_prog above, this should be rare and for simplicity > * we do XDP on skb in case the headroom is not enough. > @@ -1695,23 +1698,15 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > > switch (act) { > case XDP_REDIRECT: > - get_page(alloc_frag->page); > - alloc_frag->offset += buflen; > err = xdp_do_redirect(tun->dev, &xdp, xdp_prog); > xdp_do_flush_map(); > if (err) > - goto err_redirect; > - rcu_read_unlock(); > - local_bh_enable(); > - return NULL; > + goto err_xdp; > + goto out; > case XDP_TX: > - get_page(alloc_frag->page); > - alloc_frag->offset += buflen; > if (tun_xdp_tx(tun->dev, &xdp) < 0) > - goto err_redirect; > - rcu_read_unlock(); > - local_bh_enable(); > - return NULL; > + goto err_xdp; > + goto out; > case XDP_PASS: > delta = orig_data - xdp.data; > len = xdp.data_end - xdp.data; > @@ -1730,23 +1725,23 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > local_bh_enable(); > > skb = build_skb(buf, buflen); > - if (!skb) > - return ERR_PTR(-ENOMEM); > + if (!skb) { > + skb = ERR_PTR(-ENOMEM); > + goto out;So goto out will skip put_page, and we did do get_page above. Seems wrong. You should goto err_skb or something like this.> + } > > skb_reserve(skb, pad - delta); > skb_put(skb, len); > - get_page(alloc_frag->page); > - alloc_frag->offset += buflen; > > return skb; > > -err_redirect: > - put_page(alloc_frag->page); > err_xdp: > + alloc_frag->offset -= buflen; > + put_page(alloc_frag->page); > +out:Out here isn't an error at all, is it? You should not mix return and error handling IMHO.> rcu_read_unlock(); > local_bh_enable(); > - this_cpu_inc(tun->pcpu_stats->rx_dropped);Doesn't this break rx_dropped accounting?> - return NULL; > + return skb; > } > > /* Get packet from user space buffer */ > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  17:16 UTC
[PATCH net-next 05/11] tuntap: tweak on the path of non-xdp case in tun_build_skb()
On Thu, Sep 06, 2018 at 12:05:20PM +0800, Jason Wang wrote:> If we're sure not to go native XDP, there's no need for several things > like bh and rcu stuffs. > > Signed-off-by: Jason Wang <jasowang at redhat.com>True...> --- > drivers/net/tun.c | 9 ++++++--- > 1 file changed, 6 insertions(+), 3 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index f8cdcfa392c3..389aa0727cc6 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -1675,10 +1675,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > * of xdp_prog above, this should be rare and for simplicity > * we do XDP on skb in case the headroom is not enough. > */ > - if (hdr->gso_type || !xdp_prog) > + if (hdr->gso_type || !xdp_prog) { > *skb_xdp = 1; > - else > - *skb_xdp = 0; > + goto build; > + } > + > + *skb_xdp = 0; > > local_bh_disable(); > rcu_read_lock(); > @@ -1724,6 +1726,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > rcu_read_unlock(); > local_bh_enable(); > > +build:But this is spaghetti code. Please just put common code into functions and call them, don't goto.> skb = build_skb(buf, buflen); > if (!skb) { > skb = ERR_PTR(-ENOMEM); > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  17:21 UTC
[PATCH net-next 06/11] tuntap: split out XDP logic
On Thu, Sep 06, 2018 at 12:05:21PM +0800, Jason Wang wrote:> This patch split out XDP logic into a single function. This make it to > be reused by XDP batching path in the following patch. > > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/net/tun.c | 84 ++++++++++++++++++++++++++++------------------- > 1 file changed, 51 insertions(+), 33 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 389aa0727cc6..21b125020b3b 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -1635,6 +1635,44 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, > return true; > } > > +static u32 tun_do_xdp(struct tun_struct *tun, > + struct tun_file *tfile, > + struct bpf_prog *xdp_prog, > + struct xdp_buff *xdp, > + int *err) > +{ > + u32 act = bpf_prog_run_xdp(xdp_prog, xdp); > + > + switch (act) { > + case XDP_REDIRECT: > + *err = xdp_do_redirect(tun->dev, xdp, xdp_prog); > + xdp_do_flush_map(); > + if (*err) > + break; > + goto out; > + case XDP_TX: > + *err = tun_xdp_tx(tun->dev, xdp); > + if (*err < 0) > + break; > + *err = 0; > + goto out; > + case XDP_PASS: > + goto out;Do we need goto? why not just return?> + default: > + bpf_warn_invalid_xdp_action(act); > + /* fall through */ > + case XDP_ABORTED: > + trace_xdp_exception(tun->dev, xdp_prog, act); > + /* fall through */ > + case XDP_DROP: > + break; > + } > + > + put_page(virt_to_head_page(xdp->data_hard_start));put here because caller does get_page :( Not pretty. I'd move this out to the caller.> +out: > + return act;How about combining err and act? err is < 0 XDP_PASS is > 0. No need for pointers then.> +} > + > static struct sk_buff *tun_build_skb(struct tun_struct *tun, > struct tun_file *tfile, > struct iov_iter *from, > @@ -1645,10 +1683,10 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > struct sk_buff *skb = NULL; > struct bpf_prog *xdp_prog; > int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > - unsigned int delta = 0; > char *buf; > size_t copied; > - int err, pad = TUN_RX_PAD; > + int pad = TUN_RX_PAD; > + int err = 0; > > rcu_read_lock(); > xdp_prog = rcu_dereference(tun->xdp_prog); > @@ -1685,9 +1723,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > local_bh_disable(); > rcu_read_lock(); > xdp_prog = rcu_dereference(tun->xdp_prog); > - if (xdp_prog && !*skb_xdp) { > + if (xdp_prog) { > struct xdp_buff xdp; > - void *orig_data; > u32 act; > > xdp.data_hard_start = buf; > @@ -1695,33 +1732,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > xdp_set_data_meta_invalid(&xdp); > xdp.data_end = xdp.data + len; > xdp.rxq = &tfile->xdp_rxq; > - orig_data = xdp.data; > - act = bpf_prog_run_xdp(xdp_prog, &xdp); > - > - switch (act) { > - case XDP_REDIRECT: > - err = xdp_do_redirect(tun->dev, &xdp, xdp_prog); > - xdp_do_flush_map(); > - if (err) > - goto err_xdp; > - goto out; > - case XDP_TX: > - if (tun_xdp_tx(tun->dev, &xdp) < 0) > - goto err_xdp; > - goto out; > - case XDP_PASS: > - delta = orig_data - xdp.data; > - len = xdp.data_end - xdp.data; > - break; > - default: > - bpf_warn_invalid_xdp_action(act); > - /* fall through */ > - case XDP_ABORTED: > - trace_xdp_exception(tun->dev, xdp_prog, act); > - /* fall through */ > - case XDP_DROP: > + act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err); > + if (err) > goto err_xdp; > - } > + if (act != XDP_PASS) > + goto out;likely?> + > + pad = xdp.data - xdp.data_hard_start; > + len = xdp.data_end - xdp.data; > } > rcu_read_unlock(); > local_bh_enable(); > @@ -1729,18 +1747,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > build: > skb = build_skb(buf, buflen); > if (!skb) { > + put_page(alloc_frag->page); > skb = ERR_PTR(-ENOMEM); > goto out; > } > > - skb_reserve(skb, pad - delta); > + skb_reserve(skb, pad); > skb_put(skb, len); > > return skb; > > err_xdp: > - alloc_frag->offset -= buflen; > - put_page(alloc_frag->page); > + this_cpu_inc(tun->pcpu_stats->rx_dropped);This fixes bug in previous patch which dropped it. OK :)> out: > rcu_read_unlock(); > local_bh_enable(); > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  17:48 UTC
[PATCH net-next 07/11] tuntap: move XDP flushing out of tun_do_xdp()
On Thu, Sep 06, 2018 at 12:05:22PM +0800, Jason Wang wrote:> This will allow adding batch flushing on top. > > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/net/tun.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 21b125020b3b..ff1cbf3ebd50 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -1646,7 +1646,6 @@ static u32 tun_do_xdp(struct tun_struct *tun, > switch (act) { > case XDP_REDIRECT: > *err = xdp_do_redirect(tun->dev, xdp, xdp_prog); > - xdp_do_flush_map(); > if (*err) > break; > goto out; > @@ -1735,6 +1734,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, > act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err); > if (err) > goto err_xdp; > + > + if (act == XDP_REDIRECT) > + xdp_do_flush_map(); > if (act != XDP_PASS) > goto out;At this point the switch statement which used to contain all XDP things seems to be gone completely. Just rewrite with a bunch of if statements and all xdp handling spread out to where it makes sense?> -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  17:51 UTC
[PATCH net-next 09/11] tuntap: accept an array of XDP buffs through sendmsg()
On Thu, Sep 06, 2018 at 12:05:24PM +0800, Jason Wang wrote:> This patch implement TUN_MSG_PTR msg_control type. This type allows > the caller to pass an array of XDP buffs to tuntap through ptr field > of the tun_msg_control. If an XDP program is attached, tuntap can run > XDP program directly. If not, tuntap will build skb and do a fast > receiving since part of the work has been done by vhost_net. > > This will avoid lots of indirect calls thus improves the icache > utilization and allows to do XDP batched flushing when doing XDP > redirection. > > Signed-off-by: Jason Wang <jasowang at redhat.com>Is most of the benefit in batched flushing or skipping indirect calls? Because if it's flushing we can gain most of it easily by adding an analog of xmit_more.> --- > drivers/net/tun.c | 103 ++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 100 insertions(+), 3 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index c839a4bdcbd9..069db2e5dd08 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -2424,22 +2424,119 @@ static void tun_sock_write_space(struct sock *sk) > kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); > } > > +static int tun_xdp_one(struct tun_struct *tun, > + struct tun_file *tfile, > + struct xdp_buff *xdp, int *flush) > +{ > + struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int); > + struct tun_pcpu_stats *stats; > + struct bpf_prog *xdp_prog; > + struct sk_buff *skb = NULL; > + u32 rxhash = 0, act; > + int buflen = *(int *)xdp->data_hard_start; > + int err = 0; > + bool skb_xdp = false; > + > + xdp_prog = rcu_dereference(tun->xdp_prog); > + if (xdp_prog) { > + if (gso->gso_type) { > + skb_xdp = true; > + goto build; > + } > + xdp_set_data_meta_invalid(xdp); > + xdp->rxq = &tfile->xdp_rxq; > + act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err); > + if (err) > + goto out; > + if (act == XDP_REDIRECT) > + *flush = true; > + if (act != XDP_PASS) > + goto out; > + } > + > +build: > + skb = build_skb(xdp->data_hard_start, buflen); > + if (!skb) { > + err = -ENOMEM; > + goto out; > + } > + > + skb_reserve(skb, xdp->data - xdp->data_hard_start); > + skb_put(skb, xdp->data_end - xdp->data); > + > + if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { > + this_cpu_inc(tun->pcpu_stats->rx_frame_errors); > + kfree_skb(skb); > + err = -EINVAL; > + goto out; > + } > + > + skb->protocol = eth_type_trans(skb, tun->dev); > + skb_reset_network_header(skb); > + skb_probe_transport_header(skb, 0); > + > + if (skb_xdp) { > + err = do_xdp_generic(xdp_prog, skb); > + if (err != XDP_PASS) > + goto out; > + } > + > + if (!rcu_dereference(tun->steering_prog)) > + rxhash = __skb_get_hash_symmetric(skb); > + > + netif_receive_skb(skb); > + > + stats = get_cpu_ptr(tun->pcpu_stats); > + u64_stats_update_begin(&stats->syncp); > + stats->rx_packets++; > + stats->rx_bytes += skb->len; > + u64_stats_update_end(&stats->syncp); > + put_cpu_ptr(stats); > + > + if (rxhash) > + tun_flow_update(tun, rxhash, tfile); > + > +out: > + return err; > +} > + > static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) > { > - int ret; > + int ret, i; > struct tun_file *tfile = container_of(sock, struct tun_file, socket); > struct tun_struct *tun = tun_get(tfile); > struct tun_msg_ctl *ctl = m->msg_control; > + struct xdp_buff *xdp; > > if (!tun) > return -EBADFD; > > - if (ctl && ctl->type != TUN_MSG_UBUF) > - return -EINVAL; > + if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) { > + int n = ctl->type >> 16; > + int flush = 0; > + > + local_bh_disable(); > + rcu_read_lock(); > + > + for (i = 0; i < n; i++) { > + xdp = &((struct xdp_buff *)ctl->ptr)[i]; > + tun_xdp_one(tun, tfile, xdp, &flush); > + } > + > + if (flush) > + xdp_do_flush_map(); > + > + rcu_read_unlock(); > + local_bh_enable(); > + > + ret = total_len; > + goto out; > + } > > ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, > m->msg_flags & MSG_DONTWAIT, > m->msg_flags & MSG_MORE); > +out: > tun_put(tun); > return ret; > } > -- > 2.17.1
Michael S. Tsirkin
2018-Sep-06  18:00 UTC
[PATCH net-next 10/11] tap: accept an array of XDP buffs through sendmsg()
On Thu, Sep 06, 2018 at 12:05:25PM +0800, Jason Wang wrote:> This patch implement TUN_MSG_PTR msg_control type. This type allows > the caller to pass an array of XDP buffs to tuntap through ptr field > of the tun_msg_control. Tap will build skb through those XDP buffers. > > This will avoid lots of indirect calls thus improves the icache > utilization and allows to do XDP batched flushing when doing XDP > redirection. > > Signed-off-by: Jason Wang <jasowang at redhat.com> > --- > drivers/net/tap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 71 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index 7996ed7cbf18..50eb7bf22225 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -1146,14 +1146,83 @@ static const struct file_operations tap_fops = { > #endif > }; > > +static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) > +{ > + struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int); > + int buflen = *(int *)xdp->data_hard_start; > + int vnet_hdr_len = 0; > + struct tap_dev *tap; > + struct sk_buff *skb; > + int err, depth; > + > + if (q->flags & IFF_VNET_HDR) > + vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); > + > + skb = build_skb(xdp->data_hard_start, buflen); > + if (!skb) { > + err = -ENOMEM; > + goto err; > + }So fundamentally why is it called XDP? We just build and skb, don't we?> + > + skb_reserve(skb, xdp->data - xdp->data_hard_start); > + skb_put(skb, xdp->data_end - xdp->data); > + > + skb_set_network_header(skb, ETH_HLEN); > + skb_reset_mac_header(skb); > + skb->protocol = eth_hdr(skb)->h_proto; > + > + if (vnet_hdr_len) { > + err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q)); > + if (err) > + goto err_kfree; > + } > + > + skb_probe_transport_header(skb, ETH_HLEN); > + > + /* Move network header to the right position for VLAN tagged packets */ > + if ((skb->protocol == htons(ETH_P_8021Q) || > + skb->protocol == htons(ETH_P_8021AD)) && > + __vlan_get_protocol(skb, skb->protocol, &depth) != 0) > + skb_set_network_header(skb, depth); > + > + rcu_read_lock(); > + tap = rcu_dereference(q->tap); > + if (tap) { > + skb->dev = tap->dev; > + dev_queue_xmit(skb); > + } else { > + kfree_skb(skb); > + } > + rcu_read_unlock(); > + > + return 0; > + > +err_kfree: > + kfree_skb(skb); > +err: > + rcu_read_lock(); > + tap = rcu_dereference(q->tap); > + if (tap && tap->count_tx_dropped) > + tap->count_tx_dropped(tap); > + rcu_read_unlock(); > + return err; > +} > + > static int tap_sendmsg(struct socket *sock, struct msghdr *m, > size_t total_len) > { > struct tap_queue *q = container_of(sock, struct tap_queue, sock); > struct tun_msg_ctl *ctl = m->msg_control; > + struct xdp_buff *xdp; > + int i; > > - if (ctl && ctl->type != TUN_MSG_UBUF) > - return -EINVAL; > + if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) { > + for (i = 0; i < ctl->type >> 16; i++) { > + xdp = &((struct xdp_buff *)ctl->ptr)[i]; > + tap_get_user_xdp(q, xdp); > + } > + return 0; > + } > > return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, > m->msg_flags & MSG_DONTWAIT); > -- > 2.17.1