1) Turn GSO on virtio net into an all-or-nothing (keep checksumming
   separate).  Having multiple bits is a pain: if you can't support
   something you should handle it in software, which is still a
   performance win.

2) Make VIRTIO_NET_HDR_GSO_ECN a flag in the header, so it can apply
   to IPv6 or v4.

3) Rename VIRTIO_NET_F_NO_CSUM to VIRTIO_NET_F_CSUM (ie. means we do
   checksumming).

4) Add csum and gso params to virtio_net to allow more testing.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/virtio_net.c   |   32 ++++++++++++++++----------------
 include/linux/virtio_net.h |   12 ++++--------
 2 files changed, 20 insertions(+), 24 deletions(-)

diff -r 4fb788b18cf8 drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c	Wed Jan 23 13:07:59 2008 +1100
+++ b/drivers/net/virtio_net.c	Wed Jan 23 18:46:05 2008 +1100
@@ -26,6 +26,10 @@
 static int napi_weight = 128;
 module_param(napi_weight, int, 0444);
+
+static int csum = 1, gso = 1;
+module_param(csum, int, 0444);
+module_param(gso, int, 0444);
 
 MODULE_LICENSE("GPL");
 
@@ -95,12 +99,9 @@ static void receive_skb(struct net_devic
 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
 		pr_debug("GSO!\n");
-		switch (hdr->gso_type) {
+		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
 		case VIRTIO_NET_HDR_GSO_TCPV4:
 			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
-			break;
-		case VIRTIO_NET_HDR_GSO_TCPV4_ECN:
-			skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN;
 			break;
 		case VIRTIO_NET_HDR_GSO_UDP:
 			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
@@ -114,6 +115,9 @@ static void receive_skb(struct net_devic
 				    dev->name, hdr->gso_type);
 			goto frame_err;
 		}
+
+		if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 
 		skb_shinfo(skb)->gso_size = hdr->gso_size;
 		if (skb_shinfo(skb)->gso_size == 0) {
@@ -249,9 +253,7 @@ static int start_xmit(struct sk_buff *sk
 	if (skb_is_gso(skb)) {
 		hdr->hdr_len = skb_transport_header(skb) - skb->data;
 		hdr->gso_size = skb_shinfo(skb)->gso_size;
-		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
-			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN;
-		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
 			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
 		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
 			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
@@ -259,6 +261,8 @@ static int start_xmit(struct sk_buff *sk
 			hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
 		else
 			BUG();
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
+			hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
 	} else {
 		hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
 		hdr->gso_size = hdr->hdr_len = 0;
@@ -354,17 +358,13 @@ static int virtnet_probe(struct virtio_d
 	SET_NETDEV_DEV(dev, &vdev->dev);
 
 	/* Do we support "hardware" checksums? */
-	if (vdev->config->feature(vdev, VIRTIO_NET_F_NO_CSUM)) {
+	if (csum && vdev->config->feature(vdev, VIRTIO_NET_F_CSUM)) {
 		/* This opens up the world of extra features. */
 		dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
-		if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4))
-			dev->features |= NETIF_F_TSO;
-		if (vdev->config->feature(vdev, VIRTIO_NET_F_UFO))
-			dev->features |= NETIF_F_UFO;
-		if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4_ECN))
-			dev->features |= NETIF_F_TSO_ECN;
-		if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO6))
-			dev->features |= NETIF_F_TSO6;
+		if (gso && vdev->config->feature(vdev, VIRTIO_NET_F_GSO)) {
+			dev->features |= NETIF_F_TSO | NETIF_F_UFO
+				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
+		}
 	}
 
 	/* Configuration may specify what MAC to use.  Otherwise random. */

diff -r 4fb788b18cf8 include/linux/virtio_net.h
--- a/include/linux/virtio_net.h	Wed Jan 23 13:07:59 2008 +1100
+++ b/include/linux/virtio_net.h	Wed Jan 23 18:46:05 2008 +1100
@@ -6,12 +6,9 @@
 #define VIRTIO_ID_NET	1
 
 /* The feature bitmap for virtio net */
-#define VIRTIO_NET_F_NO_CSUM	0
-#define VIRTIO_NET_F_TSO4	1
-#define VIRTIO_NET_F_UFO	2
-#define VIRTIO_NET_F_TSO4_ECN	3
-#define VIRTIO_NET_F_TSO6	4
-#define VIRTIO_NET_F_MAC	5
+#define VIRTIO_NET_F_CSUM	0	/* Can handle pkts w/ partial csum */
+#define VIRTIO_NET_F_MAC	5	/* Host has given MAC address. */
+#define VIRTIO_NET_F_GSO	6	/* Can handle pkts w/ any GSO type */
 
 struct virtio_net_config
 {
@@ -27,10 +24,9 @@ struct virtio_net_hdr
 	__u8 flags;
 #define VIRTIO_NET_HDR_GSO_NONE		0	// Not a GSO frame
 #define VIRTIO_NET_HDR_GSO_TCPV4	1	// GSO frame, IPv4 TCP (TSO)
-/* FIXME: Do we need this?  If they said they can handle ECN, do they care? */
-#define VIRTIO_NET_HDR_GSO_TCPV4_ECN	2	// GSO frame, IPv4 TCP w/ ECN
 #define VIRTIO_NET_HDR_GSO_UDP		3	// GSO frame, IPv4 UDP (UFO)
 #define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
+#define VIRTIO_NET_HDR_GSO_ECN		0x80	// TCP has ECN set
 	__u8 gso_type;
 	__u16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
 	__u16 gso_size;		/* Bytes to append to gso_hdr_len per frame */
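
For reference, here is a minimal sketch (not part of the patch) of the sending side under the reworked header: the base GSO type goes in gso_type and ECN is OR-ed in as a flag rather than being its own type, so the same bit works for IPv4 and IPv6. The constants and fields are the ones defined in the include/linux/virtio_net.h hunk above; the helper name is made up.

#include <linux/types.h>
#include <linux/virtio_net.h>

/* Hypothetical helper: describe an outgoing TSO packet with the new ABI. */
static void fill_tso_hdr(struct virtio_net_hdr *hdr, __u16 hdr_len,
			 __u16 gso_size, int ecn)
{
	hdr->flags = 0;			/* checksum offload is negotiated separately */
	hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
	if (ecn)
		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; /* ECN is now a flag */
	hdr->hdr_len = hdr_len;		/* Ethernet + IP + TCP headers */
	hdr->gso_size = gso_size;	/* payload bytes per resulting segment */
}

The receive path simply masks the flag off before switching on the type (as receive_skb now does with ~VIRTIO_NET_HDR_GSO_ECN) and sets SKB_GSO_TCP_ECN if it was present.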
From: Rusty Russell <rusty@rustcorp.com.au>
Date: 2008-Jan-23 06:23 UTC
Subject: [PATCH 2/3] partial checksum and GSO support for tun/tap.
(Changes since last time: we now have explicit IFF_RECV_CSUM and
IFF_RECV_GSO bits, and some renaming of the virtio_net hdr)

We use the virtio_net_hdr: it is an ABI already and designed to
encapsulate such metadata as GSO and partial checksums.

IFF_VIRTIO_HDR means you will write and read a 'struct virtio_net_hdr'
at the start of each packet.  You can always write packets with partial
checksum and GSO to the tap device using this header.

IFF_RECV_CSUM means you can handle reading packets with partial
checksums.  If IFF_RECV_GSO is also set, it means you can handle
reading (all types of) GSO packets.

Note that there is no easy way to detect if these flags are supported:
see next patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/tun.c      |  259 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/if_tun.h |    6 +
 2 files changed, 238 insertions(+), 27 deletions(-)

diff -r cb85fb035378 drivers/net/tun.c
--- a/drivers/net/tun.c	Wed Jan 23 20:06:56 2008 +1100
+++ b/drivers/net/tun.c	Wed Jan 23 20:12:51 2008 +1100
@@ -62,6 +62,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_tun.h>
 #include <linux/crc32.h>
+#include <linux/virtio_net.h>
 #include <net/net_namespace.h>
 
 #include <asm/system.h>
@@ -238,35 +239,188 @@ static unsigned int tun_chr_poll(struct
 	return mask;
 }
 
+static struct sk_buff *copy_user_skb(size_t align, struct iovec *iv, size_t len)
+{
+	struct sk_buff *skb;
+
+	if (!(skb = alloc_skb(len + align, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	if (align)
+		skb_reserve(skb, align);
+
+	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
+		kfree_skb(skb);
+		return ERR_PTR(-EFAULT);
+	}
+	return skb;
+}
+
+/* This will fail if they give us a crazy iovec, but that's their own fault. */
+static int get_user_skb_frags(const struct iovec *iv, size_t count,
+			      struct skb_frag_struct *f)
+{
+	unsigned int i, j, num_pg = 0;
+	int err;
+	struct page *pages[MAX_SKB_FRAGS];
+
+	down_read(&current->mm->mmap_sem);
+	for (i = 0; i < count; i++) {
+		int n, npages;
+		unsigned long base, len;
+		base = (unsigned long)iv[i].iov_base;
+		len = (unsigned long)iv[i].iov_len;
+
+		if (len == 0)
+			continue;
+
+		/* How many pages will this take? */
+		npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
+		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		n = get_user_pages(current, current->mm, base, npages,
+				   0, 0, pages, NULL);
+		if (unlikely(n < 0)) {
+			err = n;
+			goto fail;
+		}
+
+		/* Transfer pages to the frag array */
+		for (j = 0; j < n; j++) {
+			f[num_pg].page = pages[j];
+			if (j == 0) {
+				f[num_pg].page_offset = offset_in_page(base);
+				f[num_pg].size = min(len, PAGE_SIZE -
+						     f[num_pg].page_offset);
+			} else {
+				f[num_pg].page_offset = 0;
+				f[num_pg].size = min(len, PAGE_SIZE);
+			}
+			len -= f[num_pg].size;
+			base += f[num_pg].size;
+			num_pg++;
+		}
+
+		if (unlikely(n != npages)) {
+			err = -EFAULT;
+			goto fail;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
+	return num_pg;
+
+fail:
+	for (i = 0; i < num_pg; i++)
+		put_page(f[i].page);
+	up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+static struct sk_buff *map_user_skb(const struct virtio_net_hdr *gso,
+				    size_t align, struct iovec *iv,
+				    size_t count, size_t len)
+{
+	struct sk_buff *skb;
+	struct skb_shared_info *sinfo;
+	int err;
+
+	if (!(skb = alloc_skb(gso->hdr_len + align, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	if (align)
+		skb_reserve(skb, align);
+
+	sinfo = skb_shinfo(skb);
+	sinfo->gso_size = gso->gso_size;
+	sinfo->gso_type = SKB_GSO_DODGY;
+	switch (gso->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+	case VIRTIO_NET_HDR_GSO_TCPV4:
+		sinfo->gso_type |= SKB_GSO_TCPV4;
+		break;
+	case VIRTIO_NET_HDR_GSO_TCPV6:
+		sinfo->gso_type |= SKB_GSO_TCPV6;
+		break;
+	case VIRTIO_NET_HDR_GSO_UDP:
+		sinfo->gso_type |= SKB_GSO_UDP;
+		break;
+	default:
+		err = -EINVAL;
+		goto fail;
+	}
+
+	if (gso->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+	/* Copy in the header. */
+	if (memcpy_fromiovec(skb_put(skb, gso->hdr_len), iv, gso->hdr_len)) {
+		err = -EFAULT;
+		goto fail;
+	}
+
+	err = get_user_skb_frags(iv, count, sinfo->frags);
+	if (err < 0)
+		goto fail;
+
+	sinfo->nr_frags = err;
+	skb->len += len;
+	skb->data_len += len;
+
+	return skb;
+
+fail:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
+static inline size_t iov_total(const struct iovec *iv, unsigned long count)
+{
+	unsigned long i;
+	size_t len;
+
+	for (i = 0, len = 0; i < count; i++)
+		len += iv[i].iov_len;
+
+	return len;
+}
+
 /* Get packet from user space buffer */
-static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
+static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t num)
 {
 	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
+	struct virtio_net_hdr gso = { 0, VIRTIO_NET_HDR_GSO_NONE };
 	struct sk_buff *skb;
-	size_t len = count, align = 0;
+	size_t tot_len = iov_total(iv, num);
+	size_t len = tot_len, align = 0;
 
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) > count)
+		if ((len -= sizeof(pi)) > tot_len)
 			return -EINVAL;
 
 		if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
+			return -EFAULT;
+	}
+
+	if (tun->flags & TUN_VIRTIO_HDR) {
+		if ((len -= sizeof(gso)) > tot_len)
+			return -EINVAL;
+
+		if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso)))
 			return -EFAULT;
 	}
 
 	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV)
 		align = NET_IP_ALIGN;
 
-	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
+	if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE)
+		skb = map_user_skb(&gso, align, iv, num, len);
+	else
+		skb = copy_user_skb(align, iv, len);
+
+	if (IS_ERR(skb)) {
 		tun->dev->stats.rx_dropped++;
-		return -ENOMEM;
-	}
-
-	if (align)
-		skb_reserve(skb, align);
-	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
-		tun->dev->stats.rx_dropped++;
-		kfree_skb(skb);
-		return -EFAULT;
+		return PTR_ERR(skb);
 	}
 
 	switch (tun->flags & TUN_TYPE_MASK) {
@@ -280,7 +434,13 @@ static __inline__ ssize_t tun_get_user(s
 		break;
 	};
 
-	if (tun->flags & TUN_NOCHECKSUM)
+	if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		if (!skb_partial_csum_set(skb, gso.csum_start, gso.csum_offset)) {
+			tun->dev->stats.rx_dropped++;
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+	} else if (tun->flags & TUN_NOCHECKSUM)
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 	netif_rx_ni(skb);
@@ -289,18 +449,7 @@ static __inline__ ssize_t tun_get_user(s
 	tun->dev->stats.rx_packets++;
 	tun->dev->stats.rx_bytes += len;
 
-	return count;
-}
-
-static inline size_t iov_total(const struct iovec *iv, unsigned long count)
-{
-	unsigned long i;
-	size_t len;
-
-	for (i = 0, len = 0; i < count; i++)
-		len += iv[i].iov_len;
-
-	return len;
+	return tot_len;
 }
 
 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -313,7 +462,7 @@ static ssize_t tun_chr_aio_write(struct
 
 	DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
 
-	return tun_get_user(tun, (struct iovec *) iv, iov_total(iv, count));
+	return tun_get_user(tun, (struct iovec *) iv, count);
 }
 
 /* Put packet to the user space buffer */
@@ -336,6 +485,42 @@ static __inline__ ssize_t tun_put_user(s
 		if (memcpy_toiovec(iv, (void *) &pi, sizeof(pi)))
 			return -EFAULT;
 		total += sizeof(pi);
+	}
+
+	if (tun->flags & TUN_VIRTIO_HDR) {
+		struct virtio_net_hdr gso;
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+		if (skb_is_gso(skb)) {
+			gso.hdr_len = skb_transport_header(skb) - skb->data;
+			gso.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			gso.csum_start = skb->csum_start - skb_headroom(skb);
+			gso.csum_offset = skb->csum_offset;
+		} else {
+			gso.flags = 0;
+			gso.csum_offset = gso.csum_start = 0;
+		}
+
+		if ((len -= sizeof(gso)) < 0)
+			return -EINVAL;
+
+		if (memcpy_toiovec(iv, (void *)&gso, sizeof(gso)))
+			return -EFAULT;
+		total += sizeof(gso);
 	}
 
 	len = min_t(int, skb->len, len);
@@ -523,6 +708,17 @@ static int tun_set_iff(struct file *file
 
 		tun_net_init(dev);
 
+		/* Virtio header means we can handle csum & gso. */
+		if ((ifr->ifr_flags & (IFF_VIRTIO_HDR|IFF_RECV_CSUM)) ==
+		    (IFF_VIRTIO_HDR|IFF_RECV_CSUM)) {
+			dev->features = NETIF_F_SG | NETIF_F_HW_CSUM |
+				NETIF_F_HIGHDMA | NETIF_F_FRAGLIST;
+
+			if (ifr->ifr_flags & IFF_RECV_GSO)
+				dev->features |= NETIF_F_TSO | NETIF_F_UFO |
+					NETIF_F_TSO_ECN | NETIF_F_TSO6;
+		}
+
 		if (strchr(dev->name, '%')) {
 			err = dev_alloc_name(dev, dev->name);
 			if (err < 0)
@@ -543,6 +739,15 @@ static int tun_set_iff(struct file *file
 
 	if (ifr->ifr_flags & IFF_ONE_QUEUE)
 		tun->flags |= TUN_ONE_QUEUE;
+
+	if (ifr->ifr_flags & IFF_VIRTIO_HDR)
+		tun->flags |= TUN_VIRTIO_HDR;
+
+	if (ifr->ifr_flags & IFF_RECV_CSUM)
+		tun->flags |= TUN_RECV_CSUM;
+
+	if (ifr->ifr_flags & IFF_RECV_GSO)
+		tun->flags |= TUN_RECV_GSO;
 
 	file->private_data = tun;
 	tun->attached = 1;

diff -r cb85fb035378 include/linux/if_tun.h
--- a/include/linux/if_tun.h	Wed Jan 23 20:06:56 2008 +1100
+++ b/include/linux/if_tun.h	Wed Jan 23 20:12:51 2008 +1100
@@ -70,6 +70,9 @@ struct tun_struct {
 #define TUN_NO_PI	0x0040
 #define TUN_ONE_QUEUE	0x0080
 #define TUN_PERSIST	0x0100
+#define TUN_VIRTIO_HDR	0x0200
+#define TUN_RECV_CSUM	0x0400
+#define TUN_RECV_GSO	0x0800
 
 /* Ioctl defines */
 #define TUNSETNOCSUM  _IOW('T', 200, int)
@@ -85,6 +88,9 @@ struct tun_struct {
 #define IFF_TAP		0x0002
 #define IFF_NO_PI	0x1000
 #define IFF_ONE_QUEUE	0x2000
+#define IFF_VIRTIO_HDR	0x4000
+#define IFF_RECV_CSUM	0x8000
+#define IFF_RECV_GSO	0x0800
 
 struct tun_pi {
 	unsigned short flags;
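
To make the userspace side concrete, here is a rough sketch (not part of the patch, error handling pared down) of how a user such as kvm or lguest would drive the new interface: pass IFF_VIRTIO_HDR plus whichever IFF_RECV_* bits it can cope with at TUNSETIFF time, then read and write a struct virtio_net_hdr in front of every packet. As noted above, this patch gives no way to probe whether the kernel honoured the new flags; that is what the next patch is for.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <linux/virtio_net.h>

/* Hypothetical user of the new flags; not part of the patch. */
static int open_vnet_tap(const char *name)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return fd;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, name, IFNAMSIZ);
	/* Tap, no proto info; we speak virtio_net_hdr and can cope with
	 * partial-checksum and GSO packets on read. */
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VIRTIO_HDR
			| IFF_RECV_CSUM | IFF_RECV_GSO;
	if (ioctl(fd, TUNSETIFF, &ifr) != 0) {
		close(fd);
		return -1;
	}
	return fd;
}

static void read_one_frame(int fd)
{
	char buf[sizeof(struct virtio_net_hdr) + 65536];
	struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)buf;
	ssize_t n = read(fd, buf, sizeof(buf));

	if (n < (ssize_t)sizeof(*hdr))
		return;
	/* hdr->gso_type/gso_size describe any GSO frame we were handed;
	 * hdr->flags, csum_start and csum_offset say whether we still
	 * have to finish the checksum before passing the frame on. */
}

Writes go the same way in reverse: prepend a virtio_net_hdr describing the packet's GSO type and partial checksum, which tun_get_user() then turns into the skb metadata.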