1) Turn GSO support in virtio net into an all-or-nothing feature (keeping
checksumming separate). Having multiple feature bits is a pain: if you
can't support something you should handle it in software, which is still
a performance win.
2) Make VIRTIO_NET_HDR_GSO_ECN a flag in the header, so it can apply to
IPv6 as well as v4 (see the sketch after this list).
3) Rename VIRTIO_NET_F_NO_CSUM to VIRTIO_NET_F_CSUM (i.e. set means we do
checksumming).
4) Add csum and gso module parameters to virtio_net to allow more testing.
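For readers skimming the diff, here is a minimal, self-contained sketch
(illustration only, not part of the patch) of how a gso_type byte is
composed and decomposed once ECN becomes a header flag rather than a
separate type. The constant values are copied from the new virtio_net.h
hunk below.

/* Illustration only: ECN as a flag on top of the base GSO type. */
#include <stdint.h>
#include <stdio.h>

#define VIRTIO_NET_HDR_GSO_NONE   0
#define VIRTIO_NET_HDR_GSO_TCPV4  1
#define VIRTIO_NET_HDR_GSO_UDP    3
#define VIRTIO_NET_HDR_GSO_TCPV6  4
#define VIRTIO_NET_HDR_GSO_ECN    0x80

int main(void)
{
    /* Transmit side: pick the base type, then OR in ECN if needed. */
    uint8_t gso_type = VIRTIO_NET_HDR_GSO_TCPV6 | VIRTIO_NET_HDR_GSO_ECN;

    /* Receive side: mask the flag off before switching on the type. */
    switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
    case VIRTIO_NET_HDR_GSO_TCPV4:
    case VIRTIO_NET_HDR_GSO_TCPV6:
        printf("TCP segmentation%s\n",
               (gso_type & VIRTIO_NET_HDR_GSO_ECN) ? " with ECN" : "");
        break;
    case VIRTIO_NET_HDR_GSO_UDP:
        printf("UDP fragmentation\n");
        break;
    default:
        printf("not a GSO frame\n");
    }
    return 0;
}

The point is that ECN no longer needs a case of its own: any base type
can carry the flag, which is exactly what the receive_skb() and
start_xmit() hunks below do.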
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/net/virtio_net.c | 32 ++++++++++++++++----------------
include/linux/virtio_net.h | 12 ++++--------
2 files changed, 20 insertions(+), 24 deletions(-)
diff -r 4fb788b18cf8 drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c Wed Jan 23 13:07:59 2008 +1100
+++ b/drivers/net/virtio_net.c Wed Jan 23 18:46:05 2008 +1100
@@ -26,6 +26,10 @@
static int napi_weight = 128;
module_param(napi_weight, int, 0444);
+
+static int csum = 1, gso = 1;
+module_param(csum, int, 0444);
+module_param(gso, int, 0444);
MODULE_LICENSE("GPL");
@@ -95,12 +99,9 @@ static void receive_skb(struct net_devic
if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
pr_debug("GSO!\n");
- switch (hdr->gso_type) {
+ switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
case VIRTIO_NET_HDR_GSO_TCPV4:
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
- break;
- case VIRTIO_NET_HDR_GSO_TCPV4_ECN:
- skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN;
break;
case VIRTIO_NET_HDR_GSO_UDP:
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
@@ -114,6 +115,9 @@ static void receive_skb(struct net_devic
dev->name, hdr->gso_type);
goto frame_err;
}
+
+ if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
skb_shinfo(skb)->gso_size = hdr->gso_size;
if (skb_shinfo(skb)->gso_size == 0) {
@@ -249,9 +253,7 @@ static int start_xmit(struct sk_buff *sk
if (skb_is_gso(skb)) {
hdr->hdr_len = skb_transport_header(skb) - skb->data;
hdr->gso_size = skb_shinfo(skb)->gso_size;
- if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
- hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN;
- else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
@@ -259,6 +261,8 @@ static int start_xmit(struct sk_buff *sk
hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
else
BUG();
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
+ hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
} else {
hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
hdr->gso_size = hdr->hdr_len = 0;
@@ -354,17 +358,13 @@ static int virtnet_probe(struct virtio_d
SET_NETDEV_DEV(dev, &vdev->dev);
/* Do we support "hardware" checksums? */
- if (vdev->config->feature(vdev, VIRTIO_NET_F_NO_CSUM)) {
+ if (csum && vdev->config->feature(vdev, VIRTIO_NET_F_CSUM)) {
/* This opens up the world of extra features. */
dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
- if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4))
- dev->features |= NETIF_F_TSO;
- if (vdev->config->feature(vdev, VIRTIO_NET_F_UFO))
- dev->features |= NETIF_F_UFO;
- if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4_ECN))
- dev->features |= NETIF_F_TSO_ECN;
- if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO6))
- dev->features |= NETIF_F_TSO6;
+ if (gso && vdev->config->feature(vdev, VIRTIO_NET_F_GSO)) {
+ dev->features |= NETIF_F_TSO | NETIF_F_UFO
+ | NETIF_F_TSO_ECN | NETIF_F_TSO6;
+ }
}
/* Configuration may specify what MAC to use. Otherwise random. */
diff -r 4fb788b18cf8 include/linux/virtio_net.h
--- a/include/linux/virtio_net.h Wed Jan 23 13:07:59 2008 +1100
+++ b/include/linux/virtio_net.h Wed Jan 23 18:46:05 2008 +1100
@@ -6,12 +6,9 @@
#define VIRTIO_ID_NET 1
/* The feature bitmap for virtio net */
-#define VIRTIO_NET_F_NO_CSUM 0
-#define VIRTIO_NET_F_TSO4 1
-#define VIRTIO_NET_F_UFO 2
-#define VIRTIO_NET_F_TSO4_ECN 3
-#define VIRTIO_NET_F_TSO6 4
-#define VIRTIO_NET_F_MAC 5
+#define VIRTIO_NET_F_CSUM 0 /* Can handle pkts w/ partial csum */
+#define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */
+#define VIRTIO_NET_F_GSO 6 /* Can handle pkts w/ any GSO type */
struct virtio_net_config
{
@@ -27,10 +24,9 @@ struct virtio_net_hdr
__u8 flags;
#define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame
#define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO)
-/* FIXME: Do we need this? If they said they can handle ECN, do they care? */
-#define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN
#define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO)
#define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP
+#define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set
__u8 gso_type;
__u16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */
__u16 gso_size; /* Bytes to append to gso_hdr_len per frame */
Rusty Russell
2008-Jan-23 06:23 UTC
[PATCH 2/3] partial checksum and GSO support for tun/tap.
(Changes since last time: we now have explicit IFF_RECV_CSUM and
IFF_RECV_GSO bits, and some renaming of the virtio_net header.)
We use the virtio_net_hdr: it is an ABI already and designed to
encapsulate such metadata as GSO and partial checksums.
IFF_VIRTIO_HDR means you will write and read a 'struct virtio_net_hdr'
at the start of each packet. You can always write packets with
partial checksums and GSO to the tap device using this header.
IFF_RECV_CSUM means you can handle reading packets with partial
checksums. If IFF_RECV_GSO is also set, it means you can handle
reading (all types of) GSO packets.
Note that there is no easy way to detect if these flags are supported:
see next patch.
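To make the userspace side concrete, here is a minimal sketch (not part
of the patch) of how a process might open a tap device with these flags
and read the per-packet header. It assumes kernel and headers carrying
this patch: IFF_VIRTIO_HDR, IFF_RECV_CSUM and IFF_RECV_GSO do not exist
in a stock if_tun.h, and error handling is trimmed.

/*
 * Illustration only: open a tap device with the flags added by this
 * patch and read one packet together with its virtio_net_hdr.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <linux/virtio_net.h>

int open_vnet_tap(const char *name)
{
    struct ifreq ifr;
    int fd = open("/dev/net/tun", O_RDWR);

    if (fd < 0)
        return -1;

    memset(&ifr, 0, sizeof(ifr));
    /* Tap device, no protocol info struct, but a virtio_net_hdr on
     * every packet; we promise we can take partial csums and GSO. */
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI
            | IFF_VIRTIO_HDR | IFF_RECV_CSUM | IFF_RECV_GSO;
    strncpy(ifr.ifr_name, name, IFNAMSIZ);

    if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}

ssize_t read_packet(int fd, struct virtio_net_hdr *hdr,
                    void *buf, size_t buflen)
{
    struct iovec iov[2] = {
        { .iov_base = hdr, .iov_len = sizeof(*hdr) },
        { .iov_base = buf, .iov_len = buflen },
    };

    /* The header always comes first; packet data follows it. */
    return readv(fd, iov, 2);
}

Writes work the same way in reverse: fill in a virtio_net_hdr describing
the packet's checksum/GSO state and hand it to writev() ahead of the
packet data, as tun_get_user() below expects.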
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/net/tun.c | 259 +++++++++++++++++++++++++++++++++++++++++++------
include/linux/if_tun.h | 6 +
2 files changed, 238 insertions(+), 27 deletions(-)
diff -r cb85fb035378 drivers/net/tun.c
--- a/drivers/net/tun.c Wed Jan 23 20:06:56 2008 +1100
+++ b/drivers/net/tun.c Wed Jan 23 20:12:51 2008 +1100
@@ -62,6 +62,7 @@
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/crc32.h>
+#include <linux/virtio_net.h>
#include <net/net_namespace.h>
#include <asm/system.h>
@@ -238,35 +239,188 @@ static unsigned int tun_chr_poll(struct
return mask;
}
+static struct sk_buff *copy_user_skb(size_t align, struct iovec *iv, size_t len)
+{
+ struct sk_buff *skb;
+
+ if (!(skb = alloc_skb(len + align, GFP_KERNEL)))
+ return ERR_PTR(-ENOMEM);
+
+ if (align)
+ skb_reserve(skb, align);
+
+ if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
+ kfree_skb(skb);
+ return ERR_PTR(-EFAULT);
+ }
+ return skb;
+}
+
+/* This will fail if they give us a crazy iovec, but that's their own fault. */
+static int get_user_skb_frags(const struct iovec *iv, size_t count,
+ struct skb_frag_struct *f)
+{
+ unsigned int i, j, num_pg = 0;
+ int err;
+ struct page *pages[MAX_SKB_FRAGS];
+
+ down_read(&current->mm->mmap_sem);
+ for (i = 0; i < count; i++) {
+ int n, npages;
+ unsigned long base, len;
+ base = (unsigned long)iv[i].iov_base;
+ len = (unsigned long)iv[i].iov_len;
+
+ if (len == 0)
+ continue;
+
+ /* How many pages will this take? */
+ npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
+ if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ n = get_user_pages(current, current->mm, base, npages,
+ 0, 0, pages, NULL);
+ if (unlikely(n < 0)) {
+ err = n;
+ goto fail;
+ }
+
+ /* Transfer pages to the frag array */
+ for (j = 0; j < n; j++) {
+ f[num_pg].page = pages[j];
+ if (j == 0) {
+ f[num_pg].page_offset = offset_in_page(base);
+ f[num_pg].size = min(len, PAGE_SIZE -
+ f[num_pg].page_offset);
+ } else {
+ f[num_pg].page_offset = 0;
+ f[num_pg].size = min(len, PAGE_SIZE);
+ }
+ len -= f[num_pg].size;
+ base += f[num_pg].size;
+ num_pg++;
+ }
+
+ if (unlikely(n != npages)) {
+ err = -EFAULT;
+ goto fail;
+ }
+ }
+ up_read(&current->mm->mmap_sem);
+ return num_pg;
+
+fail:
+ for (i = 0; i < num_pg; i++)
+ put_page(f[i].page);
+ up_read(&current->mm->mmap_sem);
+ return err;
+}
+
+
+static struct sk_buff *map_user_skb(const struct virtio_net_hdr *gso,
+ size_t align, struct iovec *iv,
+ size_t count, size_t len)
+{
+ struct sk_buff *skb;
+ struct skb_shared_info *sinfo;
+ int err;
+
+ if (!(skb = alloc_skb(gso->hdr_len + align, GFP_KERNEL)))
+ return ERR_PTR(-ENOMEM);
+
+ if (align)
+ skb_reserve(skb, align);
+
+ sinfo = skb_shinfo(skb);
+ sinfo->gso_size = gso->gso_size;
+ sinfo->gso_type = SKB_GSO_DODGY;
+ switch (gso->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ sinfo->gso_type |= SKB_GSO_TCPV4;
+ break;
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ sinfo->gso_type |= SKB_GSO_TCPV6;
+ break;
+ case VIRTIO_NET_HDR_GSO_UDP:
+ sinfo->gso_type |= SKB_GSO_UDP;
+ break;
+ default:
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (gso->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+ /* Copy in the header. */
+ if (memcpy_fromiovec(skb_put(skb, gso->hdr_len), iv, gso->hdr_len)) {
+ err = -EFAULT;
+ goto fail;
+ }
+
+ err = get_user_skb_frags(iv, count, sinfo->frags);
+ if (err < 0)
+ goto fail;
+
+ sinfo->nr_frags = err;
+ skb->len += len;
+ skb->data_len += len;
+
+ return skb;
+
+fail:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+
+static inline size_t iov_total(const struct iovec *iv, unsigned long count)
+{
+ unsigned long i;
+ size_t len;
+
+ for (i = 0, len = 0; i < count; i++)
+ len += iv[i].iov_len;
+
+ return len;
+}
+
/* Get packet from user space buffer */
-static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
+static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t num)
{
struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
+ struct virtio_net_hdr gso = { 0, VIRTIO_NET_HDR_GSO_NONE };
struct sk_buff *skb;
- size_t len = count, align = 0;
+ size_t tot_len = iov_total(iv, num);
+ size_t len = tot_len, align = 0;
if (!(tun->flags & TUN_NO_PI)) {
- if ((len -= sizeof(pi)) > count)
+ if ((len -= sizeof(pi)) > tot_len)
return -EINVAL;
if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
+ return -EFAULT;
+ }
+ if (tun->flags & TUN_VIRTIO_HDR) {
+ if ((len -= sizeof(gso)) > tot_len)
+ return -EINVAL;
+
+ if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso)))
return -EFAULT;
}
if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV)
align = NET_IP_ALIGN;
- if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
+ if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE)
+ skb = map_user_skb(&gso, align, iv, num, len);
+ else
+ skb = copy_user_skb(align, iv, len);
+
+ if (IS_ERR(skb)) {
tun->dev->stats.rx_dropped++;
- return -ENOMEM;
- }
-
- if (align)
- skb_reserve(skb, align);
- if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
- tun->dev->stats.rx_dropped++;
- kfree_skb(skb);
- return -EFAULT;
+ return PTR_ERR(skb);
}
switch (tun->flags & TUN_TYPE_MASK) {
@@ -280,7 +434,13 @@ static __inline__ ssize_t tun_get_user(s
break;
};
- if (tun->flags & TUN_NOCHECKSUM)
+ if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ if (!skb_partial_csum_set(skb,gso.csum_start,gso.csum_offset)) {
+ tun->dev->stats.rx_dropped++;
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ } else if (tun->flags & TUN_NOCHECKSUM)
skb->ip_summed = CHECKSUM_UNNECESSARY;
netif_rx_ni(skb);
@@ -289,18 +449,7 @@ static __inline__ ssize_t tun_get_user(s
tun->dev->stats.rx_packets++;
tun->dev->stats.rx_bytes += len;
- return count;
-}
-
-static inline size_t iov_total(const struct iovec *iv, unsigned long count)
-{
- unsigned long i;
- size_t len;
-
- for (i = 0, len = 0; i < count; i++)
- len += iv[i].iov_len;
-
- return len;
+ return tot_len;
}
static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -313,7 +462,7 @@ static ssize_t tun_chr_aio_write(struct
DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
- return tun_get_user(tun, (struct iovec *) iv, iov_total(iv, count));
+ return tun_get_user(tun, (struct iovec *) iv, count);
}
/* Put packet to the user space buffer */
@@ -336,6 +485,42 @@ static __inline__ ssize_t tun_put_user(s
if (memcpy_toiovec(iv, (void *) &pi, sizeof(pi)))
return -EFAULT;
total += sizeof(pi);
+ }
+ if (tun->flags & TUN_VIRTIO_HDR) {
+ struct virtio_net_hdr gso;
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+ if (skb_is_gso(skb)) {
+ gso.hdr_len = skb_transport_header(skb) - skb->data;
+ gso.gso_size = sinfo->gso_size;
+ if (sinfo->gso_type & SKB_GSO_TCPV4)
+ gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ else if (sinfo->gso_type & SKB_GSO_TCPV6)
+ gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ else if (sinfo->gso_type & SKB_GSO_UDP)
+ gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+ else
+ BUG();
+ if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+ gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+ } else
+ gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ gso.csum_start = skb->csum_start - skb_headroom(skb);
+ gso.csum_offset = skb->csum_offset;
+ } else {
+ gso.flags = 0;
+ gso.csum_offset = gso.csum_start = 0;
+ }
+
+ if ((len -= sizeof(gso)) < 0)
+ return -EINVAL;
+
+ if (memcpy_toiovec(iv, (void *)&gso, sizeof(gso)))
+ return -EFAULT;
+ total += sizeof(gso);
}
len = min_t(int, skb->len, len);
@@ -523,6 +708,17 @@ static int tun_set_iff(struct file *file
tun_net_init(dev);
+ /* Virtio header means we can handle csum & gso. */
+ if ((ifr->ifr_flags & (IFF_VIRTIO_HDR|IFF_RECV_CSUM)) == (IFF_VIRTIO_HDR|IFF_RECV_CSUM)) {
+ dev->features = NETIF_F_SG | NETIF_F_HW_CSUM |
+ NETIF_F_HIGHDMA | NETIF_F_FRAGLIST;
+
+ if (ifr->ifr_flags & IFF_RECV_GSO)
+ dev->features |= NETIF_F_TSO | NETIF_F_UFO |
+ NETIF_F_TSO_ECN | NETIF_F_TSO6;
+ }
+
if (strchr(dev->name, '%')) {
err = dev_alloc_name(dev, dev->name);
if (err < 0)
@@ -543,6 +739,15 @@ static int tun_set_iff(struct file *file
if (ifr->ifr_flags & IFF_ONE_QUEUE)
tun->flags |= TUN_ONE_QUEUE;
+
+ if (ifr->ifr_flags & IFF_VIRTIO_HDR)
+ tun->flags |= TUN_VIRTIO_HDR;
+
+ if (ifr->ifr_flags & IFF_RECV_CSUM)
+ tun->flags |= TUN_RECV_CSUM;
+
+ if (ifr->ifr_flags & IFF_RECV_GSO)
+ tun->flags |= TUN_RECV_GSO;
file->private_data = tun;
tun->attached = 1;
diff -r cb85fb035378 include/linux/if_tun.h
--- a/include/linux/if_tun.h Wed Jan 23 20:06:56 2008 +1100
+++ b/include/linux/if_tun.h Wed Jan 23 20:12:51 2008 +1100
@@ -70,6 +70,9 @@ struct tun_struct {
#define TUN_NO_PI 0x0040
#define TUN_ONE_QUEUE 0x0080
#define TUN_PERSIST 0x0100
+#define TUN_VIRTIO_HDR 0x0200
+#define TUN_RECV_CSUM 0x0400
+#define TUN_RECV_GSO 0x0800
/* Ioctl defines */
#define TUNSETNOCSUM _IOW('T', 200, int)
@@ -85,6 +88,9 @@ struct tun_struct {
#define IFF_TAP 0x0002
#define IFF_NO_PI 0x1000
#define IFF_ONE_QUEUE 0x2000
+#define IFF_VIRTIO_HDR 0x4000
+#define IFF_RECV_CSUM 0x8000
+#define IFF_RECV_GSO 0x0800
struct tun_pi {
unsigned short flags;