Multi-queue NICs are now common in high-end servers, and the current single-queue tap cannot scale guest network performance as the number of vcpus increases. The following series therefore implements multiple queue support in tun/tap.

To take advantage of this, a multi-queue capable guest driver and qemu are also needed. To simplify testing, I have rebased the latest version of Krishna's multi-queue virtio-net driver into this series. For multiqueue-capable qemu, refer to the patches I posted at http://www.spinics.net/lists/kvm/msg52808.html. Vhost is also a must for high performance, and its code can be used for multi-queue without modification. Alternatively, this series can also be used with Krishna's M:N implementation of multiqueue, but I did not test that.

The idea is simple: each socket is abstracted as one queue of the tun/tap device, and userspace may open as many files as required and then attach them to the device. To keep ABI compatibility, device creation is still done through TUNSETIFF, and two new ioctls, TUNATTACHQUEUE and TUNDETACHQUEUE, were added so userspace can manipulate the number of queues of a tun/tap device.

I've done some basic performance testing of multi-queue tap. For tun, I only tested it through vpnc.

Notes:
- Tests show improvement when receiving packets from a local/external host in the guest, and when sending big packets from the guest to a local/external host.
- The current multiqueue virtio-net/tap introduces a regression when sending small packets (512 bytes) from the guest to a local/external host. I suspect the queue selection in both the guest driver and tap; I will continue to investigate.
- I will post the performance numbers as a reply to this mail.

TODO:
- Solve the small-packet transmission regression.
- Address the comments on the virtio-net driver.
- Performance tuning.

Please review and comment, thanks.

---

Jason Wang (5):
      tuntap: move socket/sock related structures to tun_file
      tuntap: categorize ioctl
      tuntap: introduce multiqueue related flags
      tuntap: multiqueue support
      tuntap: add ioctls to attach or detach a file from tap device

Krishna Kumar (2):
      Change virtqueue structure
      virtio-net changes

 drivers/net/tun.c           |  738 ++++++++++++++++++++++++++-----------------
 drivers/net/virtio_net.c    |  578 ++++++++++++++++++++++++----------
 drivers/virtio/virtio_pci.c |   10 -
 include/linux/if_tun.h      |    5
 include/linux/virtio.h      |    1
 include/linux/virtio_net.h  |    3
 6 files changed, 867 insertions(+), 468 deletions(-)

-- Jason Wang
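For illustration, below is a minimal, untested userspace sketch of the ABI proposed above: TUNSETIFF on the tun character device (/dev/net/tun) with the new IFF_MULTI_QUEUE flag creates the device and makes the creating fd the first queue; additional fds are attached with TUNATTACHQUEUE and can later be detached with TUNDETACHQUEUE. The IFF_MULTI_QUEUE, TUNATTACHQUEUE and TUNDETACHQUEUE values are taken from the patched include/linux/if_tun.h in this series and are not in stock headers, so they are re-defined here.

/* Sketch (untested) of the multiqueue tap ABI proposed by this series.
 * The three defines below mirror values added by these patches to
 * include/linux/if_tun.h; they are not part of stock kernel headers. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/if_tun.h>

#ifndef IFF_MULTI_QUEUE
#define IFF_MULTI_QUEUE  0x0100
#endif
#ifndef TUNATTACHQUEUE
#define TUNATTACHQUEUE   _IOW('T', 217, int)
#define TUNDETACHQUEUE   _IOW('T', 218, int)
#endif

static int open_queue(const char *name, int create)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;

	if (create) {
		/* First fd: create the device and become its first queue. */
		ifr.ifr_flags |= IFF_MULTI_QUEUE;
		if (ioctl(fd, TUNSETIFF, &ifr) < 0)
			goto fail;
	} else {
		/* Additional fds: attach to the existing device as extra queues. */
		if (ioctl(fd, TUNATTACHQUEUE, &ifr) < 0)
			goto fail;
	}
	return fd;

fail:
	close(fd);
	return -1;
}

int main(void)
{
	int q0 = open_queue("mqtap0", 1);
	int q1 = open_queue("mqtap0", 0);

	if (q0 < 0 || q1 < 0) {
		perror("open_queue");
		return 1;
	}

	/* A queue can be dropped at runtime and re-attached later. */
	ioctl(q1, TUNDETACHQUEUE, NULL);
	return 0;
}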
Jason Wang
2011-Aug-12 01:54 UTC
[net-next RFC PATCH 1/7] tuntap: move socket/sock related structures to tun_file
In order to let tap can transmit packets to multiple sockets, the first step is to move all socket/sock related structures to tun_file. The reference between tap device and socket was setup during TUNSETIFF as usual. After this we can move towards the multi-queue support by allowing multiple files to be attached to a single tap device. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/net/tun.c | 349 +++++++++++++++++++++++++++-------------------------- 1 files changed, 180 insertions(+), 169 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 71f3d1a..2739887 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -109,9 +109,16 @@ struct tap_filter { }; struct tun_file { + struct sock sk; + struct socket socket; + struct socket_wq wq; + int vnet_hdr_sz; + struct tap_filter txflt; atomic_t count; struct tun_struct *tun; struct net *net; + struct fasync_struct *fasync; + unsigned int flags; }; struct tun_sock; @@ -126,29 +133,12 @@ struct tun_struct { u32 set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6|NETIF_F_UFO) - struct fasync_struct *fasync; - - struct tap_filter txflt; - struct socket socket; - struct socket_wq wq; - - int vnet_hdr_sz; #ifdef TUN_DEBUG int debug; #endif }; -struct tun_sock { - struct sock sk; - struct tun_struct *tun; -}; - -static inline struct tun_sock *tun_sk(struct sock *sk) -{ - return container_of(sk, struct tun_sock, sk); -} - static int tun_attach(struct tun_struct *tun, struct file *file) { struct tun_file *tfile = file->private_data; @@ -169,10 +159,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file) err = 0; tfile->tun = tun; tun->tfile = tfile; - tun->socket.file = file; netif_carrier_on(tun->dev); dev_hold(tun->dev); - sock_hold(tun->socket.sk); + sock_hold(&tfile->sk); atomic_inc(&tfile->count); out: @@ -182,15 +171,15 @@ out: static void __tun_detach(struct tun_struct *tun) { + struct tun_file *tfile = tun->tfile; /* Detach from net device */ netif_tx_lock_bh(tun->dev); netif_carrier_off(tun->dev); tun->tfile = NULL; - tun->socket.file = NULL; netif_tx_unlock_bh(tun->dev); /* Drop read queue */ - skb_queue_purge(&tun->socket.sk->sk_receive_queue); + skb_queue_purge(&tfile->socket.sk->sk_receive_queue); /* Drop the extra count on the net device */ dev_put(tun->dev); @@ -349,19 +338,12 @@ static void tun_net_uninit(struct net_device *dev) /* Inform the methods they need to stop using the dev. */ if (tfile) { - wake_up_all(&tun->wq.wait); + wake_up_all(&tfile->wq.wait); if (atomic_dec_and_test(&tfile->count)) __tun_detach(tun); } } -static void tun_free_netdev(struct net_device *dev) -{ - struct tun_struct *tun = netdev_priv(dev); - - sock_put(tun->socket.sk); -} - /* Net device open. */ static int tun_net_open(struct net_device *dev) { @@ -380,24 +362,25 @@ static int tun_net_close(struct net_device *dev) static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); + struct tun_file *tfile = tun->tfile; tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); /* Drop packet if interface is not attached */ - if (!tun->tfile) + if (!tfile) goto drop; /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. 
*/ - if (!check_filter(&tun->txflt, skb)) + if (!check_filter(&tfile->txflt, skb)) goto drop; - if (tun->socket.sk->sk_filter && - sk_filter(tun->socket.sk, skb)) + if (tfile->socket.sk->sk_filter && + sk_filter(tfile->socket.sk, skb)) goto drop; - if (skb_queue_len(&tun->socket.sk->sk_receive_queue) >= dev->tx_queue_len) { + if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) >= dev->tx_queue_len) { if (!(tun->flags & TUN_ONE_QUEUE)) { /* Normal queueing mode. */ /* Packet scheduler handles dropping of further packets. */ @@ -418,12 +401,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) skb_orphan(skb); /* Enqueue packet */ - skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb); + skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); /* Notify and wake up reader process */ - if (tun->flags & TUN_FASYNC) - kill_fasync(&tun->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tun->wq.wait, POLLIN | + if (tfile->flags & TUN_FASYNC) + kill_fasync(&tfile->fasync, SIGIO, POLL_IN); + wake_up_interruptible_poll(&tfile->wq.wait, POLLIN | POLLRDNORM | POLLRDBAND); return NETDEV_TX_OK; @@ -550,11 +533,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait) if (!tun) return POLLERR; - sk = tun->socket.sk; + sk = tfile->socket.sk; tun_debug(KERN_INFO, tun, "tun_chr_poll\n"); - poll_wait(file, &tun->wq.wait, wait); + poll_wait(file, &tfile->wq.wait, wait); if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; @@ -573,11 +556,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait) /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ -static struct sk_buff *tun_alloc_skb(struct tun_struct *tun, +static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { - struct sock *sk = tun->socket.sk; + struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; @@ -601,7 +584,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun, } /* Get packet from user space buffer */ -static ssize_t tun_get_user(struct tun_struct *tun, +static ssize_t tun_get_user(struct tun_file *tfile, const struct iovec *iv, size_t count, int noblock) { @@ -610,8 +593,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, size_t len = count, align = NET_SKB_PAD; struct virtio_net_hdr gso = { 0 }; int offset = 0; + struct tun_struct *tun = NULL; + bool drop = false, error = false; - if (!(tun->flags & TUN_NO_PI)) { + if (!(tfile->flags & TUN_NO_PI)) { if ((len -= sizeof(pi)) > count) return -EINVAL; @@ -620,8 +605,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, offset += sizeof(pi); } - if (tun->flags & TUN_VNET_HDR) { - if ((len -= tun->vnet_hdr_sz) > count) + if (tfile->flags & TUN_VNET_HDR) { + if ((len -= tfile->vnet_hdr_sz) > count) return -EINVAL; if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) @@ -633,41 +618,43 @@ static ssize_t tun_get_user(struct tun_struct *tun, if (gso.hdr_len > len) return -EINVAL; - offset += tun->vnet_hdr_sz; + offset += tfile->vnet_hdr_sz; } - if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) { + if ((tfile->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (gso.hdr_len && gso.hdr_len < ETH_HLEN))) return -EINVAL; } - skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock); + skb = tun_alloc_skb(tfile, align, len, gso.hdr_len, noblock); + if (IS_ERR(skb)) { if (PTR_ERR(skb) != 
-EAGAIN) - tun->dev->stats.rx_dropped++; - return PTR_ERR(skb); + drop = true; + count = PTR_ERR(skb); + goto err; } if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) { - tun->dev->stats.rx_dropped++; + drop = true; kfree_skb(skb); - return -EFAULT; + count = -EFAULT; + goto err; } if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { if (!skb_partial_csum_set(skb, gso.csum_start, gso.csum_offset)) { - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; + error = true; + goto err_free; } } - switch (tun->flags & TUN_TYPE_MASK) { + switch (tfile->flags & TUN_TYPE_MASK) { case TUN_TUN_DEV: - if (tun->flags & TUN_NO_PI) { + if (tfile->flags & TUN_NO_PI) { switch (skb->data[0] & 0xf0) { case 0x40: pi.proto = htons(ETH_P_IP); @@ -676,18 +663,15 @@ static ssize_t tun_get_user(struct tun_struct *tun, pi.proto = htons(ETH_P_IPV6); break; default: - tun->dev->stats.rx_dropped++; - kfree_skb(skb); - return -EINVAL; + drop = true; + goto err_free; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; - skb->dev = tun->dev; break; case TUN_TAP_DEV: - skb->protocol = eth_type_trans(skb, tun->dev); break; } @@ -704,9 +688,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, skb_shinfo(skb)->gso_type = SKB_GSO_UDP; break; default: - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; + error = true; + goto err_free; } if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN) @@ -714,9 +697,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, skb_shinfo(skb)->gso_size = gso.gso_size; if (skb_shinfo(skb)->gso_size == 0) { - tun->dev->stats.rx_frame_errors++; - kfree_skb(skb); - return -EINVAL; + error = true; + goto err_free; } /* Header must be checked, and gso_segs computed. */ @@ -724,42 +706,68 @@ static ssize_t tun_get_user(struct tun_struct *tun, skb_shinfo(skb)->gso_segs = 0; } - netif_rx_ni(skb); + tun = __tun_get(tfile); + if (!tun) { + return -EBADFD; + } + + switch (tfile->flags & TUN_TYPE_MASK) { + case TUN_TUN_DEV: + skb->dev = tun->dev; + break; + case TUN_TAP_DEV: + skb->protocol = eth_type_trans(skb, tun->dev); + break; + } tun->dev->stats.rx_packets++; tun->dev->stats.rx_bytes += len; + tun_put(tun); + + netif_rx_ni(skb); return count; + +err_free: + count = -EINVAL; + kfree_skb(skb); +err: + tun = __tun_get(tfile); + if (!tun) { + return -EBADFD; + } + + if (drop) + tun->dev->stats.rx_dropped++; + if (error) + tun->dev->stats.rx_frame_errors++; + tun_put(tun); + return count; } static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, unsigned long count, loff_t pos) { struct file *file = iocb->ki_filp; - struct tun_struct *tun = tun_get(file); + struct tun_file *tfile = file->private_data; ssize_t result; - if (!tun) - return -EBADFD; - - tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); - - result = tun_get_user(tun, iv, iov_length(iv, count), + result = tun_get_user(tfile, iv, iov_length(iv, count), file->f_flags & O_NONBLOCK); - tun_put(tun); return result; } /* Put packet to the user space buffer */ -static ssize_t tun_put_user(struct tun_struct *tun, +static ssize_t tun_put_user(struct tun_file *tfile, struct sk_buff *skb, const struct iovec *iv, int len) { + struct tun_struct *tun = NULL; struct tun_pi pi = { 0, skb->protocol }; ssize_t total = 0; - if (!(tun->flags & TUN_NO_PI)) { + if (!(tfile->flags & TUN_NO_PI)) { if ((len -= sizeof(pi)) < 0) return -EINVAL; @@ -773,9 +781,9 @@ static ssize_t tun_put_user(struct tun_struct *tun, total += sizeof(pi); } - if (tun->flags & TUN_VNET_HDR) { + if (tfile->flags & 
TUN_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; /* no info leak */ - if ((len -= tun->vnet_hdr_sz) < 0) + if ((len -= tfile->vnet_hdr_sz) < 0) return -EINVAL; if (skb_is_gso(skb)) { @@ -818,7 +826,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, sizeof(gso)))) return -EFAULT; - total += tun->vnet_hdr_sz; + total += tfile->vnet_hdr_sz; } len = min_t(int, skb->len, len); @@ -826,29 +834,32 @@ static ssize_t tun_put_user(struct tun_struct *tun, skb_copy_datagram_const_iovec(skb, 0, iv, total, len); total += skb->len; - tun->dev->stats.tx_packets++; - tun->dev->stats.tx_bytes += len; + tun = __tun_get(tfile); + if (tun) { + tun->dev->stats.tx_packets++; + tun->dev->stats.tx_bytes += len; + tun_put(tun); + } return total; } -static ssize_t tun_do_read(struct tun_struct *tun, +static ssize_t tun_do_read(struct tun_file *tfile, struct kiocb *iocb, const struct iovec *iv, ssize_t len, int noblock) { DECLARE_WAITQUEUE(wait, current); struct sk_buff *skb; ssize_t ret = 0; - - tun_debug(KERN_INFO, tun, "tun_chr_read\n"); + struct tun_struct *tun = NULL; if (unlikely(!noblock)) - add_wait_queue(&tun->wq.wait, &wait); + add_wait_queue(&tfile->wq.wait, &wait); while (len) { current->state = TASK_INTERRUPTIBLE; /* Read frames from the queue */ - if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) { + if (!(skb=skb_dequeue(&tfile->socket.sk->sk_receive_queue))) { if (noblock) { ret = -EAGAIN; break; @@ -857,25 +868,38 @@ static ssize_t tun_do_read(struct tun_struct *tun, ret = -ERESTARTSYS; break; } + + tun = __tun_get(tfile); + if (!tun) { + ret = -EIO; + break; + } if (tun->dev->reg_state != NETREG_REGISTERED) { ret = -EIO; + tun_put(tun); break; } + tun_put(tun); /* Nothing to read, let's sleep */ schedule(); continue; } - netif_wake_queue(tun->dev); - ret = tun_put_user(tun, skb, iv, len); + tun = __tun_get(tfile); + if (tun) { + netif_wake_queue(tun->dev); + tun_put(tun); + } + + ret = tun_put_user(tfile, skb, iv, len); kfree_skb(skb); break; } current->state = TASK_RUNNING; if (unlikely(!noblock)) - remove_wait_queue(&tun->wq.wait, &wait); + remove_wait_queue(&tfile->wq.wait, &wait); return ret; } @@ -885,21 +909,17 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; - struct tun_struct *tun = __tun_get(tfile); ssize_t len, ret; - if (!tun) - return -EBADFD; len = iov_length(iv, count); if (len < 0) { ret = -EINVAL; goto out; } - ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK); + ret = tun_do_read(tfile, iocb, iv, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); out: - tun_put(tun); return ret; } @@ -911,7 +931,7 @@ static void tun_setup(struct net_device *dev) tun->group = -1; dev->ethtool_ops = &tun_ethtool_ops; - dev->destructor = tun_free_netdev; + dev->destructor = free_netdev; } /* Trivial set of netlink ops to allow deleting tun or tap @@ -931,7 +951,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = { static void tun_sock_write_space(struct sock *sk) { - struct tun_struct *tun; + struct tun_file *tfile = NULL; wait_queue_head_t *wqueue; if (!sock_writeable(sk)) @@ -945,37 +965,38 @@ static void tun_sock_write_space(struct sock *sk) wake_up_interruptible_sync_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND); - tun = tun_sk(sk)->tun; - kill_fasync(&tun->fasync, SIGIO, POLL_OUT); -} - -static void tun_sock_destruct(struct sock *sk) -{ - free_netdev(tun_sk(sk)->tun->dev); 
+ tfile = container_of(sk, struct tun_file, sk); + kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len) { - struct tun_struct *tun = container_of(sock, struct tun_struct, socket); - return tun_get_user(tun, m->msg_iov, total_len, - m->msg_flags & MSG_DONTWAIT); + struct tun_file *tfile = container_of(sock, struct tun_file, socket); + ssize_t result; + + result= tun_get_user(tfile, m->msg_iov, total_len, + m->msg_flags & MSG_DONTWAIT); + return result; } static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len, int flags) { - struct tun_struct *tun = container_of(sock, struct tun_struct, socket); + struct tun_file *tfile = container_of(sock, struct tun_file, socket); int ret; + if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) return -EINVAL; - ret = tun_do_read(tun, iocb, m->msg_iov, total_len, + + ret = tun_do_read(tfile, iocb, m->msg_iov, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } + return ret; } @@ -988,7 +1009,7 @@ static const struct proto_ops tun_socket_ops = { static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, - .obj_size = sizeof(struct tun_sock), + .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) @@ -1039,8 +1060,8 @@ static DEVICE_ATTR(group, 0444, tun_show_group, NULL); static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { - struct sock *sk; struct tun_struct *tun; + struct tun_file *tfile = file->private_data; struct net_device *dev; int err; @@ -1061,7 +1082,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) (tun->group != -1 && !in_egroup_p(tun->group))) && !capable(CAP_NET_ADMIN)) return -EPERM; - err = security_tun_dev_attach(tun->socket.sk); + err = security_tun_dev_attach(tfile->socket.sk); if (err < 0) return err; @@ -1105,24 +1126,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun = netdev_priv(dev); tun->dev = dev; tun->flags = flags; - tun->txflt.count = 0; - tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); - err = -ENOMEM; - sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto); - if (!sk) - goto err_free_dev; - - tun->socket.wq = &tun->wq; - init_waitqueue_head(&tun->wq.wait); - tun->socket.ops = &tun_socket_ops; - sock_init_data(&tun->socket, sk); - sk->sk_write_space = tun_sock_write_space; - sk->sk_sndbuf = INT_MAX; - - tun_sk(sk)->tun = tun; - - security_tun_dev_post_create(sk); + security_tun_dev_post_create(&tfile->sk); tun_net_init(dev); @@ -1132,15 +1137,13 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) err = register_netdevice(tun->dev); if (err < 0) - goto err_free_sk; + goto err_free_dev; if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) || device_create_file(&tun->dev->dev, &dev_attr_owner) || device_create_file(&tun->dev->dev, &dev_attr_group)) pr_err("Failed to create tun sysfs files\n"); - sk->sk_destruct = tun_sock_destruct; - err = tun_attach(tun, file); if (err < 0) goto failed; @@ -1163,6 +1166,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) else tun->flags &= ~TUN_VNET_HDR; + /* Cache flags from tun device */ + tfile->flags = tun->flags; /* Make sure persistent devices do not get stuck in * xoff state. 
*/ @@ -1172,11 +1177,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) strcpy(ifr->ifr_name, tun->dev->name); return 0; - err_free_sk: - sock_put(sk); - err_free_dev: +err_free_dev: free_netdev(dev); - failed: +failed: return err; } @@ -1348,9 +1351,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) break; - ret = update_filter(&tun->txflt, (void __user *)arg); + ret = update_filter(&tfile->txflt, (void __user *)arg); break; case SIOCGIFHWADDR: @@ -1370,7 +1373,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; case TUNGETSNDBUF: - sndbuf = tun->socket.sk->sk_sndbuf; + sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) ret = -EFAULT; break; @@ -1381,11 +1384,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } - tun->socket.sk->sk_sndbuf = sndbuf; + tfile->socket.sk->sk_sndbuf = sndbuf; break; case TUNGETVNETHDRSZ: - vnet_hdr_sz = tun->vnet_hdr_sz; + vnet_hdr_sz = tfile->vnet_hdr_sz; if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) ret = -EFAULT; break; @@ -1400,27 +1403,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } - tun->vnet_hdr_sz = vnet_hdr_sz; + tfile->vnet_hdr_sz = vnet_hdr_sz; break; case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) break; ret = -EFAULT; if (copy_from_user(&fprog, argp, sizeof(fprog))) break; - ret = sk_attach_filter(&fprog, tun->socket.sk); + ret = sk_attach_filter(&fprog, tfile->socket.sk); break; case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; - if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) break; - ret = sk_detach_filter(tun->socket.sk); + ret = sk_detach_filter(tfile->socket.sk); break; default: @@ -1472,43 +1475,50 @@ static long tun_chr_compat_ioctl(struct file *file, static int tun_chr_fasync(int fd, struct file *file, int on) { - struct tun_struct *tun = tun_get(file); + struct tun_file *tfile = file->private_data; int ret; - if (!tun) - return -EBADFD; - - tun_debug(KERN_INFO, tun, "tun_chr_fasync %d\n", on); - - if ((ret = fasync_helper(fd, file, on, &tun->fasync)) < 0) + if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; if (on) { ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0); if (ret) goto out; - tun->flags |= TUN_FASYNC; + tfile->flags |= TUN_FASYNC; } else - tun->flags &= ~TUN_FASYNC; + tfile->flags &= ~TUN_FASYNC; ret = 0; out: - tun_put(tun); return ret; } static int tun_chr_open(struct inode *inode, struct file * file) { + struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; DBG1(KERN_INFO, "tunX: tun_chr_open\n"); - tfile = kmalloc(sizeof(*tfile), GFP_KERNEL); + tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, + &tun_proto); if (!tfile) return -ENOMEM; - atomic_set(&tfile->count, 0); + tfile->tun = NULL; - tfile->net = get_net(current->nsproxy->net_ns); + tfile->net = net; + tfile->txflt.count = 0; + tfile->vnet_hdr_sz = sizeof(struct virtio_net_hdr); + tfile->socket.wq = &tfile->wq; + init_waitqueue_head(&tfile->wq.wait); + tfile->socket.file = file; + tfile->socket.ops = &tun_socket_ops; + sock_init_data(&tfile->socket, &tfile->sk); + + 
tfile->sk.sk_write_space = tun_sock_write_space; + tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; + return 0; } @@ -1532,14 +1542,14 @@ static int tun_chr_close(struct inode *inode, struct file *file) unregister_netdevice(dev); rtnl_unlock(); } - } - tun = tfile->tun; - if (tun) - sock_put(tun->socket.sk); + /* drop the reference that netdevice holds */ + sock_put(&tfile->sk); - put_net(tfile->net); - kfree(tfile); + } + + /* drop the reference that file holds */ + sock_put(&tfile->sk); return 0; } @@ -1668,13 +1678,14 @@ static void tun_cleanup(void) struct socket *tun_get_socket(struct file *file) { struct tun_struct *tun; + struct tun_file *tfile = file->private_data; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tun = tun_get(file); if (!tun) return ERR_PTR(-EBADFD); tun_put(tun); - return &tun->socket; + return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket);
As we've moved socket related structure to file->private_data, we can separate system calls that only touch tfile from others as they don't need hold rtnl lock. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/net/tun.c | 52 ++++++++++++++++++++++++++++++++++------------------ 1 files changed, 34 insertions(+), 18 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2739887..4cd292a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1240,10 +1240,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, struct tun_file *tfile = file->private_data; struct tun_struct *tun; void __user* argp = (void __user*)arg; - struct sock_fprog fprog; struct ifreq ifr; - int sndbuf; - int vnet_hdr_sz; int ret; if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) @@ -1348,14 +1345,6 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = set_offload(tun, arg); break; - case TUNSETTXFILTER: - /* Can be set only for TAPs */ - ret = -EINVAL; - if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) - break; - ret = update_filter(&tfile->txflt, (void __user *)arg); - break; - case SIOCGIFHWADDR: /* Get hw address */ memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN); @@ -1372,6 +1361,37 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr); break; + default: + ret = -EINVAL; + break; + } + +unlock: + rtnl_unlock(); + if (tun) + tun_put(tun); + return ret; +} + +static long __tun_socket_ioctl(struct file *file, unsigned int cmd, + unsigned long arg, int ifreq_len) +{ + struct tun_file *tfile = file->private_data; + void __user* argp = (void __user*)arg; + struct sock_fprog fprog; + int sndbuf; + int vnet_hdr_sz; + int ret = 0; + + switch (cmd) { + case TUNSETTXFILTER: + /* Can be set only for TAPs */ + ret = -EINVAL; + if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + break; + ret = update_filter(&tfile->txflt, (void __user *)arg); + break; + case TUNGETSNDBUF: sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) @@ -1427,21 +1447,17 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; default: - ret = -EINVAL; + ret = __tun_chr_ioctl(file, cmd, arg, ifreq_len); break; } -unlock: - rtnl_unlock(); - if (tun) - tun_put(tun); return ret; } static long tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq)); + return __tun_socket_ioctl(file, cmd, arg, sizeof (struct ifreq)); } #ifdef CONFIG_COMPAT @@ -1469,7 +1485,7 @@ static long tun_chr_compat_ioctl(struct file *file, * driver are compatible though, we don't need to convert the * contents. */ - return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); + return __tun_socket_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); } #endif /* CONFIG_COMPAT */
Jason Wang
2011-Aug-12 01:55 UTC
[net-next RFC PATCH 3/7] tuntap: introduce multiqueue related flags
Signed-off-by: Jason Wang <jasowang at redhat.com> --- include/linux/if_tun.h | 2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 06b1829..c92a291 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -34,6 +34,7 @@ #define TUN_ONE_QUEUE 0x0080 #define TUN_PERSIST 0x0100 #define TUN_VNET_HDR 0x0200 +#define TUN_TAP_MQ 0x0400 /* Ioctl defines */ #define TUNSETNOCSUM _IOW('T', 200, int) @@ -61,6 +62,7 @@ #define IFF_ONE_QUEUE 0x2000 #define IFF_VNET_HDR 0x4000 #define IFF_TUN_EXCL 0x8000 +#define IFF_MULTI_QUEUE 0x0100 /* Features for GSO (TUNSETOFFLOAD). */ #define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */
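As a side note, once the whole series is applied (patch 5 adds IFF_MULTI_QUEUE to the TUNGETFEATURES mask), userspace could probe for multiqueue tap support with the existing TUNGETFEATURES ioctl. A hedged sketch; the IFF_MULTI_QUEUE value comes from this patch and is not in stock headers:

/* Sketch (untested): detect multiqueue tap support via TUNGETFEATURES. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if_tun.h>

#ifndef IFF_MULTI_QUEUE
#define IFF_MULTI_QUEUE 0x0100
#endif

static int tap_has_multiqueue(void)
{
	unsigned int features = 0;
	int fd = open("/dev/net/tun", O_RDWR);
	int ret;

	if (fd < 0)
		return 0;
	ret = ioctl(fd, TUNGETFEATURES, &features);
	close(fd);
	return ret == 0 && (features & IFF_MULTI_QUEUE);
}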
With the abstraction that each socket were a backend of a queue for userspace, this patch adds multiqueue support for tap device by allowing multiple sockets to be attached to a tap device. Then we could parallize the transmission by put them into different socket. As queue related information were stored in private_data of file new, we could simply implement the multiqueue support by add an array of pointers to sockets were stored in the tap device. Then ioctls may be added to manipulate those pointers for adding or removing queues. In order to let tx path lockless, NETIF_F_LLTX were used for multiqueue tap device. And RCU is used for doing synchronization between packet handling and system calls such as removing queues. Currently, multiqueue support is limited for tap , but it's easy also enable it for tun if we find it was also helpful. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/net/tun.c | 376 ++++++++++++++++++++++++++++++++++------------------- 1 files changed, 243 insertions(+), 133 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 4cd292a..8bc6dff 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -108,6 +108,8 @@ struct tap_filter { unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; +#define MAX_TAP_QUEUES (NR_CPUS < 16 ? NR_CPUS : 16) + struct tun_file { struct sock sk; struct socket socket; @@ -115,7 +117,7 @@ struct tun_file { int vnet_hdr_sz; struct tap_filter txflt; atomic_t count; - struct tun_struct *tun; + struct tun_struct __rcu *tun; struct net *net; struct fasync_struct *fasync; unsigned int flags; @@ -124,7 +126,8 @@ struct tun_file { struct tun_sock; struct tun_struct { - struct tun_file *tfile; + struct tun_file *tfiles[MAX_TAP_QUEUES]; + unsigned int numqueues; unsigned int flags; uid_t owner; gid_t group; @@ -139,80 +142,183 @@ struct tun_struct { #endif }; -static int tun_attach(struct tun_struct *tun, struct file *file) +static DEFINE_SPINLOCK(tun_lock); + +/* + * get_slot: return a [unused/occupied] slot in tun->tfiles[]: + * - if 'f' is NULL, return the first empty slot; + * - otherwise, return the slot this pointer occupies. 
+ */ +static int tun_get_slot(struct tun_struct *tun, struct tun_file *tfile) { - struct tun_file *tfile = file->private_data; - int err; + int i; - ASSERT_RTNL(); + for (i = 0; i < MAX_TAP_QUEUES; i++) { + if (rcu_dereference(tun->tfiles[i]) == tfile) + return i; + } - netif_tx_lock_bh(tun->dev); + /* Should never happen */ + BUG_ON(1); +} - err = -EINVAL; - if (tfile->tun) - goto out; +/* + * tun_get_queue(): calculate the queue index + * - if skbs comes from mq nics, we can just borrow + * - if not, calculate from the hash + */ +static struct tun_file *tun_get_queue(struct net_device *dev, + struct sk_buff *skb) +{ + struct tun_struct *tun = netdev_priv(dev); + struct tun_file *tfile = NULL; + int numqueues = tun->numqueues; + __u32 rxq; - err = -EBUSY; - if (tun->tfile) + BUG_ON(!rcu_read_lock_held()); + + if (!numqueues) goto out; - err = 0; - tfile->tun = tun; - tun->tfile = tfile; - netif_carrier_on(tun->dev); - dev_hold(tun->dev); - sock_hold(&tfile->sk); - atomic_inc(&tfile->count); + if (likely(skb_rx_queue_recorded(skb))) { + rxq = skb_get_rx_queue(skb); + + while (unlikely(rxq >= numqueues)) + rxq -= numqueues; + + tfile = rcu_dereference(tun->tfiles[rxq]); + if (tfile) + goto out; + } + + /* Check if we can use flow to select a queue */ + rxq = skb_get_rxhash(skb); + if (rxq) { + tfile = rcu_dereference(tun->tfiles[rxq % numqueues]); + if (tfile) + goto out; + } + + /* Everything failed - find first available queue */ + for (rxq = 0; rxq < MAX_TAP_QUEUES; rxq++) { + tfile = rcu_dereference(tun->tfiles[rxq]); + if (tfile) + break; + } out: - netif_tx_unlock_bh(tun->dev); - return err; + return tfile; } -static void __tun_detach(struct tun_struct *tun) +static int tun_detach(struct tun_file *tfile, bool clean) { - struct tun_file *tfile = tun->tfile; - /* Detach from net device */ - netif_tx_lock_bh(tun->dev); - netif_carrier_off(tun->dev); - tun->tfile = NULL; - netif_tx_unlock_bh(tun->dev); - - /* Drop read queue */ - skb_queue_purge(&tfile->socket.sk->sk_receive_queue); - - /* Drop the extra count on the net device */ - dev_put(tun->dev); -} + struct tun_struct *tun; + struct net_device *dev = NULL; + bool destroy = false; -static void tun_detach(struct tun_struct *tun) -{ - rtnl_lock(); - __tun_detach(tun); - rtnl_unlock(); -} + spin_lock(&tun_lock); -static struct tun_struct *__tun_get(struct tun_file *tfile) -{ - struct tun_struct *tun = NULL; + tun = rcu_dereference_protected(tfile->tun, + lockdep_is_held(&tun_lock)); + if (tun) { + int index = tun_get_slot(tun, tfile); + if (index == -1) { + spin_unlock(&tun_lock); + return -EINVAL; + } + dev = tun->dev; + + rcu_assign_pointer(tun->tfiles[index], NULL); + rcu_assign_pointer(tfile->tun, NULL); + --tun->numqueues; + sock_put(&tfile->sk); - if (atomic_inc_not_zero(&tfile->count)) - tun = tfile->tun; + if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST)) + destroy = true; + } + + spin_unlock(&tun_lock); + + synchronize_rcu(); + if (clean) + sock_put(&tfile->sk); - return tun; + if (destroy) { + rtnl_lock(); + if (dev->reg_state == NETREG_REGISTERED) + unregister_netdevice(dev); + rtnl_unlock(); + } + + return 0; } -static struct tun_struct *tun_get(struct file *file) +static void tun_detach_all(struct net_device *dev) { - return __tun_get(file->private_data); + struct tun_struct *tun = netdev_priv(dev); + struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES]; + int i, j = 0; + + spin_lock(&tun_lock); + + for (i = 0; i < MAX_TAP_QUEUES && tun->numqueues; i++) { + tfile = rcu_dereference_protected(tun->tfiles[i], + 
lockdep_is_held(&tun_lock)); + if (tfile) { + wake_up_all(&tfile->wq.wait); + tfile_list[i++] = tfile; + rcu_assign_pointer(tun->tfiles[i], NULL); + rcu_assign_pointer(tfile->tun, NULL); + --tun->numqueues; + } + } + BUG_ON(tun->numqueues != 0); + spin_unlock(&tun_lock); + + synchronize_rcu(); + for(--j; j >= 0; j--) + sock_put(&tfile_list[j]->sk); } -static void tun_put(struct tun_struct *tun) +static int tun_attach(struct tun_struct *tun, struct file *file) { - struct tun_file *tfile = tun->tfile; + struct tun_file *tfile = file->private_data; + int err, index; + + ASSERT_RTNL(); + + spin_lock(&tun_lock); - if (atomic_dec_and_test(&tfile->count)) - tun_detach(tfile->tun); + err = -EINVAL; + if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock))) + goto out; + + err = -EBUSY; + if (!(tun->flags & TUN_TAP_MQ) && + rcu_dereference_protected(tun->tfiles[0], + lockdep_is_held(&tun_lock))) { + /* Multiqueue is only for TAP */ + goto out; + } + + if (tun->numqueues == MAX_TAP_QUEUES) + goto out; + + err = 0; + index = tun_get_slot(tun, NULL); + BUG_ON(index == -1); + rcu_assign_pointer(tfile->tun, tun); + rcu_assign_pointer(tun->tfiles[index], tfile); + sock_hold(&tfile->sk); + tun->numqueues++; + + if (tun->numqueues == 1) + netif_carrier_on(tun->dev); + + /* device is allowed to go away first, so no need to hold extra refcnt. */ +out: + spin_unlock(&tun_lock); + return err; } /* TAP filtering */ @@ -332,16 +438,7 @@ static const struct ethtool_ops tun_ethtool_ops; /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { - struct tun_struct *tun = netdev_priv(dev); - struct tun_file *tfile = tun->tfile; - - /* Inform the methods they need to stop using the dev. - */ - if (tfile) { - wake_up_all(&tfile->wq.wait); - if (atomic_dec_and_test(&tfile->count)) - __tun_detach(tun); - } + tun_detach_all(dev); } /* Net device open. */ @@ -361,10 +458,10 @@ static int tun_net_close(struct net_device *dev) /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { - struct tun_struct *tun = netdev_priv(dev); - struct tun_file *tfile = tun->tfile; + struct tun_file *tfile = NULL; - tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); + rcu_read_lock(); + tfile = tun_get_queue(dev, skb); /* Drop packet if interface is not attached */ if (!tfile) @@ -381,7 +478,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) goto drop; if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) >= dev->tx_queue_len) { - if (!(tun->flags & TUN_ONE_QUEUE)) { + if (!(tfile->flags & TUN_ONE_QUEUE) && !(tfile->flags && TUN_TAP_MQ)) { /* Normal queueing mode. */ /* Packet scheduler handles dropping of further packets. */ netif_stop_queue(dev); @@ -390,7 +487,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) * error is more appropriate. */ dev->stats.tx_fifo_errors++; } else { - /* Single queue mode. + /* Single queue mode or multi queue mode. * Driver handles dropping of all packets itself. 
*/ goto drop; } @@ -408,9 +505,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); wake_up_interruptible_poll(&tfile->wq.wait, POLLIN | POLLRDNORM | POLLRDBAND); + rcu_read_unlock(); return NETDEV_TX_OK; drop: + rcu_read_unlock(); dev->stats.tx_dropped++; kfree_skb(skb); return NETDEV_TX_OK; @@ -526,16 +625,22 @@ static void tun_net_init(struct net_device *dev) static unsigned int tun_chr_poll(struct file *file, poll_table * wait) { struct tun_file *tfile = file->private_data; - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = NULL; struct sock *sk; unsigned int mask = 0; - if (!tun) + if (!tfile) return POLLERR; - sk = tfile->socket.sk; + rcu_read_lock(); + tun = rcu_dereference(tfile->tun); + if (!tun) { + rcu_read_unlock(); + return POLLERR; + } + rcu_read_unlock(); - tun_debug(KERN_INFO, tun, "tun_chr_poll\n"); + sk = &tfile->sk; poll_wait(file, &tfile->wq.wait, wait); @@ -547,10 +652,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait) sock_writeable(sk))) mask |= POLLOUT | POLLWRNORM; - if (tun->dev->reg_state != NETREG_REGISTERED) + rcu_read_lock(); + tun = rcu_dereference(tfile->tun); + if (!tun || tun->dev->reg_state != NETREG_REGISTERED) mask = POLLERR; + rcu_read_unlock(); - tun_put(tun); return mask; } @@ -706,8 +813,10 @@ static ssize_t tun_get_user(struct tun_file *tfile, skb_shinfo(skb)->gso_segs = 0; } - tun = __tun_get(tfile); + rcu_read_lock(); + tun = rcu_dereference(tfile->tun); if (!tun) { + rcu_read_unlock(); return -EBADFD; } @@ -722,7 +831,7 @@ static ssize_t tun_get_user(struct tun_file *tfile, tun->dev->stats.rx_packets++; tun->dev->stats.rx_bytes += len; - tun_put(tun); + rcu_read_unlock(); netif_rx_ni(skb); @@ -732,16 +841,17 @@ err_free: count = -EINVAL; kfree_skb(skb); err: - tun = __tun_get(tfile); + rcu_read_lock(); + tun = rcu_dereference(tfile->tun); if (!tun) { + rcu_read_unlock(); return -EBADFD; } - if (drop) tun->dev->stats.rx_dropped++; if (error) tun->dev->stats.rx_frame_errors++; - tun_put(tun); + rcu_read_unlock(); return count; } @@ -834,12 +944,13 @@ static ssize_t tun_put_user(struct tun_file *tfile, skb_copy_datagram_const_iovec(skb, 0, iv, total, len); total += skb->len; - tun = __tun_get(tfile); + rcu_read_lock(); + tun = rcu_dereference(tfile->tun); if (tun) { tun->dev->stats.tx_packets++; tun->dev->stats.tx_bytes += len; - tun_put(tun); } + rcu_read_unlock(); return total; } @@ -869,28 +980,31 @@ static ssize_t tun_do_read(struct tun_file *tfile, break; } - tun = __tun_get(tfile); + rcu_read_lock(); + tun = rcu_dereference(tfile->tun); if (!tun) { - ret = -EIO; + ret = -EBADFD; + rcu_read_unlock(); break; } if (tun->dev->reg_state != NETREG_REGISTERED) { ret = -EIO; - tun_put(tun); + rcu_read_unlock(); break; } - tun_put(tun); + rcu_read_unlock(); /* Nothing to read, let's sleep */ schedule(); continue; } - tun = __tun_get(tfile); + rcu_read_lock(); + tun = rcu_dereference(tfile->tun); if (tun) { netif_wake_queue(tun->dev); - tun_put(tun); } + rcu_read_unlock(); ret = tun_put_user(tfile, skb, iv, len); kfree_skb(skb); @@ -1030,6 +1144,9 @@ static int tun_flags(struct tun_struct *tun) if (tun->flags & TUN_VNET_HDR) flags |= IFF_VNET_HDR; + if (tun->flags & TUN_TAP_MQ) + flags |= IFF_MULTI_QUEUE; + return flags; } @@ -1109,6 +1226,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) /* TAP device */ flags |= TUN_TAP_DEV; name = "tap%d"; + if (ifr->ifr_flags & IFF_MULTI_QUEUE) { + flags |= 
TUN_TAP_MQ; + name = "mqtap%d"; + } } else return -EINVAL; @@ -1134,6 +1255,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES; dev->features = dev->hw_features; + if (ifr->ifr_flags & IFF_MULTI_QUEUE) + dev->features |= NETIF_F_LLTX; err = register_netdevice(tun->dev); if (err < 0) @@ -1166,6 +1289,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) else tun->flags &= ~TUN_VNET_HDR; + if (ifr->ifr_flags & IFF_MULTI_QUEUE) + tun->flags |= TUN_TAP_MQ; + else + tun->flags &= ~TUN_TAP_MQ; + /* Cache flags from tun device */ tfile->flags = tun->flags; /* Make sure persistent devices do not get stuck in @@ -1256,38 +1384,39 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, (unsigned int __user*)argp); } - rtnl_lock(); - - tun = __tun_get(tfile); - if (cmd == TUNSETIFF && !tun) { + ret = 0; + if (cmd == TUNSETIFF) { + rtnl_lock(); ifr.ifr_name[IFNAMSIZ-1] = '\0'; - ret = tun_set_iff(tfile->net, file, &ifr); - + rtnl_unlock(); if (ret) - goto unlock; - + return ret; if (copy_to_user(argp, &ifr, ifreq_len)) - ret = -EFAULT; - goto unlock; + return -EFAULT; + return ret; } + rtnl_lock(); + + rcu_read_lock(); + ret = -EBADFD; + tun = rcu_dereference(tfile->tun); if (!tun) goto unlock; - tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd); - ret = 0; - switch (cmd) { + switch(cmd) { case TUNGETIFF: ret = tun_get_iff(current->nsproxy->net_ns, tun, &ifr); + rcu_read_unlock(); if (ret) - break; + goto out; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; - break; + goto out; case TUNSETNOCSUM: /* Disable/Enable checksum */ @@ -1349,9 +1478,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, /* Get hw address */ memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN); ifr.ifr_hwaddr.sa_family = tun->dev->type; + rcu_read_unlock(); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; - break; + goto out; case SIOCSIFHWADDR: /* Set hw address */ @@ -1367,9 +1497,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, } unlock: + rcu_read_unlock(); +out: rtnl_unlock(); - if (tun) - tun_put(tun); return ret; } @@ -1541,31 +1671,8 @@ static int tun_chr_open(struct inode *inode, struct file * file) static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; - struct tun_struct *tun; - - tun = __tun_get(tfile); - if (tun) { - struct net_device *dev = tun->dev; - - tun_debug(KERN_INFO, tun, "tun_chr_close\n"); - - __tun_detach(tun); - - /* If desirable, unregister the netdevice. */ - if (!(tun->flags & TUN_PERSIST)) { - rtnl_lock(); - if (dev->reg_state == NETREG_REGISTERED) - unregister_netdevice(dev); - rtnl_unlock(); - } - - /* drop the reference that netdevice holds */ - sock_put(&tfile->sk); - - } - /* drop the reference that file holds */ - sock_put(&tfile->sk); + tun_detach(tfile, true); return 0; } @@ -1693,14 +1800,17 @@ static void tun_cleanup(void) * holding a reference to the file for as long as the socket is in use. 
*/ struct socket *tun_get_socket(struct file *file) { - struct tun_struct *tun; + struct tun_struct *tun = NULL; struct tun_file *tfile = file->private_data; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); - tun = tun_get(file); - if (!tun) + rcu_read_lock(); + tun = rcu_dereference(tfile->tun); + if (!tun) { + rcu_read_unlock(); return ERR_PTR(-EBADFD); - tun_put(tun); + } + rcu_read_unlock(); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket);
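To make the queue abstraction concrete: once several fds are attached, each one can be serviced independently (for example, one thread per queue), since a read() on a queue fd only drains that queue's socket receive queue, while a write() on any attached fd injects a frame into the device. A rough, untested sketch, assuming the fds were set up as in the usage sketch after the cover letter (with IFF_VNET_HDR, each frame read is preceded by a struct virtio_net_hdr):

/* Sketch (untested): one reader thread per attached queue fd. */
#include <pthread.h>
#include <unistd.h>

#define FRAME_MAX 65536

static void *queue_worker(void *arg)
{
	int fd = *(int *)arg;
	unsigned char frame[FRAME_MAX];
	ssize_t n;

	for (;;) {
		/* One packet per read(); includes a virtio_net_hdr when
		 * IFF_VNET_HDR is set on this queue. */
		n = read(fd, frame, sizeof(frame));
		if (n <= 0)
			break;
		/* ... hand the frame to this worker's transmit path ... */
	}
	return NULL;
}

static int start_workers(int *queue_fds, int nqueues, pthread_t *threads)
{
	int i;

	for (i = 0; i < nqueues; i++)
		if (pthread_create(&threads[i], NULL, queue_worker,
				   &queue_fds[i]))
			return -1;
	return 0;
}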
Jason Wang
2011-Aug-12 01:55 UTC
[net-next RFC PATCH 5/7] tuntap: add ioctls to attach or detach a file from tap device
This patch adds userspace interface for multi-queue based tap device. Two new ioctls were added. The first is TUNATTACHQUEUE which is used to attach an opened file descriptor to an existed tap device. Another is TUNDETACHQUEUE which is used to detach an file from an existed tap device, and this file could be re-attach to the tap device as a queue again. After those ioctls were added, userspace can create a multiqueue tap device by open /dev/net/tap and call TUNSETIFF, then it could easily control the number of queues through TUNATTACHQUEUE and TUNDETACHQUEUE. Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/net/tun.c | 29 ++++++++++++++++++++++++----- include/linux/if_tun.h | 3 +++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8bc6dff..3bc9dca 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -158,8 +158,8 @@ static int tun_get_slot(struct tun_struct *tun, struct tun_file *tfile) return i; } - /* Should never happen */ - BUG_ON(1); + /* This is possible when call TUNDETACHQUEUE with wrong ifname */ + return -1; } /* @@ -1367,11 +1367,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, { struct tun_file *tfile = file->private_data; struct tun_struct *tun; + struct net_device *dev = NULL; void __user* argp = (void __user*)arg; struct ifreq ifr; int ret; - if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) + if (cmd == TUNSETIFF || cmd == TUNATTACHQUEUE || _IOC_TYPE(cmd) == 0x89) if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; @@ -1380,7 +1381,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | - IFF_VNET_HDR, + IFF_VNET_HDR | IFF_MULTI_QUEUE, (unsigned int __user*)argp); } @@ -1396,6 +1397,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, return -EFAULT; return ret; } + if (cmd == TUNDETACHQUEUE) { + return tun_detach(tfile, false); + } rtnl_lock(); @@ -1403,7 +1407,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EBADFD; tun = rcu_dereference(tfile->tun); - if (!tun) + if (!tun && cmd != TUNATTACHQUEUE) goto unlock; @@ -1418,6 +1422,21 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; goto out; + case TUNATTACHQUEUE: + dev = __dev_get_by_name(tfile->net, ifr.ifr_name); + if (!dev || dev->netdev_ops != &tap_netdev_ops) { + ret = -EINVAL; + } else if (ifr.ifr_flags & + ~(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR)) { + /* ignore illegal flag */ + ret = -EINVAL; + } else { + tfile->flags = TUN_TAP_DEV | TUN_NO_PI | TUN_VNET_HDR; + tun = netdev_priv(dev); + ret = tun_attach(tun, file); + } + break; + case TUNSETNOCSUM: /* Disable/Enable checksum */ diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index c92a291..d3f24d8 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -54,6 +54,9 @@ #define TUNDETACHFILTER _IOW('T', 214, struct sock_fprog) #define TUNGETVNETHDRSZ _IOR('T', 215, int) #define TUNSETVNETHDRSZ _IOW('T', 216, int) +#define TUNATTACHQUEUE _IOW('T', 217, int) +#define TUNDETACHQUEUE _IOW('T', 218, int) + /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001
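Since a detached fd keeps its reference and can later be re-attached, userspace can scale the number of active queues up and down at runtime. A minimal, untested sketch of such a helper; the two ioctl values come from this patch and are not in stock headers, and 'qfd' is assumed to be an fd previously attached to the tap device 'name' (as in the usage sketch after the cover letter):

/* Sketch (untested): activate or deactivate one queue fd at runtime. */
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/if_tun.h>

#ifndef TUNATTACHQUEUE
#define TUNATTACHQUEUE _IOW('T', 217, int)
#define TUNDETACHQUEUE _IOW('T', 218, int)
#endif

static int queue_set_active(int qfd, const char *name, int active)
{
	struct ifreq ifr;

	if (!active)
		/* The argument is ignored for TUNDETACHQUEUE. */
		return ioctl(qfd, TUNDETACHQUEUE, NULL);

	/* Re-attach the fd as a queue of the named tap device. */
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
	return ioctl(qfd, TUNATTACHQUEUE, &ifr);
}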
From: Krishna Kumar <krkumar2 at in.ibm.com> Move queue_index from virtio_pci_vq_info to virtqueue. This allows callback handlers to figure out the queue number for the vq that needs attention. Signed-off-by: Krishna Kumar <krkumar2 at in.ibm.com> --- drivers/virtio/virtio_pci.c | 10 +++------- include/linux/virtio.h | 1 + 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 4bcc8b8..395af63 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -75,9 +75,6 @@ struct virtio_pci_vq_info /* the number of entries in the queue */ int num; - /* the index of the queue */ - int queue_index; - /* the virtual address of the ring queue */ void *queue; @@ -180,11 +177,10 @@ static void vp_reset(struct virtio_device *vdev) static void vp_notify(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); - struct virtio_pci_vq_info *info = vq->priv; /* we write the queue's selector into the notification register to * signal the other end */ - iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); + iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); } /* Handle a configuration change: Tell driver if it wants to know. */ @@ -380,7 +376,6 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index, if (!info) return ERR_PTR(-ENOMEM); - info->queue_index = index; info->num = num; info->msix_vector = msix_vec; @@ -403,6 +398,7 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index, goto out_activate_queue; } + vq->queue_index = index; vq->priv = info; info->vq = vq; @@ -441,7 +437,7 @@ static void vp_del_vq(struct virtqueue *vq) list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); - iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); if (vp_dev->msix_enabled) { iowrite16(VIRTIO_MSI_NO_VECTOR, diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 7108857..ddfbce9 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -22,6 +22,7 @@ struct virtqueue { void (*callback)(struct virtqueue *vq); const char *name; struct virtio_device *vdev; + int queue_index; /* the index of the queue */ void *priv; };
From: Krishna Kumar <krkumar2 at in.ibm.com> Implement mq virtio-net driver. Though struct virtio_net_config changes, it works with the old qemu since the last element is not accessed unless qemu sets VIRTIO_NET_F_MULTIQUEUE. Signed-off-by: Krishna Kumar <krkumar2 at in.ibm.com> Signed-off-by: Jason Wang <jasowang at redhat.com> --- drivers/net/virtio_net.c | 578 +++++++++++++++++++++++++++++++------------- include/linux/virtio_net.h | 3 2 files changed, 411 insertions(+), 170 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 0c7321c..03a199d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -49,16 +49,48 @@ struct virtnet_stats { u64 rx_packets; }; -struct virtnet_info { - struct virtio_device *vdev; - struct virtqueue *rvq, *svq, *cvq; - struct net_device *dev; +/* Internal representation of a send virtqueue */ +struct send_queue { + /* Virtqueue associated with this send _queue */ + struct virtqueue *svq; + + /* TX: fragments + linear part + virtio header */ + struct scatterlist tx_sg[MAX_SKB_FRAGS + 2]; +}; + +/* Internal representation of a receive virtqueue */ +struct receive_queue { + /* Virtqueue associated with this receive_queue */ + struct virtqueue *rvq; + + /* Back pointer to the virtnet_info */ + struct virtnet_info *vi; + struct napi_struct napi; - unsigned int status; /* Number of input buffers, and max we've ever had. */ unsigned int num, max; + /* Work struct for refilling if we run low on memory. */ + struct delayed_work refill; + + /* Chain pages by the private ptr. */ + struct page *pages; + + /* RX: fragments + linear part + virtio header */ + struct scatterlist rx_sg[MAX_SKB_FRAGS + 2]; +}; + +struct virtnet_info { + struct send_queue **sq; + struct receive_queue **rq; + + int numtxqs; /* # of rxqs/txqs */ + struct virtio_device *vdev; + struct virtqueue *cvq; + struct net_device *dev; + unsigned int status; + /* I like... big packets and I cannot lie! */ bool big_packets; @@ -67,16 +99,6 @@ struct virtnet_info { /* Active statistics */ struct virtnet_stats __percpu *stats; - - /* Work struct for refilling if we run low on memory. */ - struct delayed_work refill; - - /* Chain pages by the private ptr. */ - struct page *pages; - - /* fragments + linear part + virtio header */ - struct scatterlist rx_sg[MAX_SKB_FRAGS + 2]; - struct scatterlist tx_sg[MAX_SKB_FRAGS + 2]; }; struct skb_vnet_hdr { @@ -106,22 +128,22 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb) * private is used to chain pages for big packets, put the whole * most recent used list in the beginning for reuse */ -static void give_pages(struct virtnet_info *vi, struct page *page) +static void give_pages(struct receive_queue *rq, struct page *page) { struct page *end; /* Find end of list, sew whole thing into vi->pages. 
*/ for (end = page; end->private; end = (struct page *)end->private); - end->private = (unsigned long)vi->pages; - vi->pages = page; + end->private = (unsigned long)rq->pages; + rq->pages = page; } -static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask) +static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask) { - struct page *p = vi->pages; + struct page *p = rq->pages; if (p) { - vi->pages = (struct page *)p->private; + rq->pages = (struct page *)p->private; /* clear private here, it is used to chain pages */ p->private = 0; } else @@ -132,12 +154,13 @@ static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask) static void skb_xmit_done(struct virtqueue *svq) { struct virtnet_info *vi = svq->vdev->priv; + int qnum = svq->queue_index / 2; /* RX/TX vqs are allocated in pairs */ /* Suppress further interrupts. */ virtqueue_disable_cb(svq); /* We were probably waiting for more output buffers. */ - netif_wake_queue(vi->dev); + netif_wake_subqueue(vi->dev, qnum); } static void set_skb_frag(struct sk_buff *skb, struct page *page, @@ -157,9 +180,10 @@ static void set_skb_frag(struct sk_buff *skb, struct page *page, *len -= f->size; } -static struct sk_buff *page_to_skb(struct virtnet_info *vi, +static struct sk_buff *page_to_skb(struct receive_queue *rq, struct page *page, unsigned int len) { + struct virtnet_info *vi = rq->vi; struct sk_buff *skb; struct skb_vnet_hdr *hdr; unsigned int copy, hdr_len, offset; @@ -202,12 +226,12 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, } if (page) - give_pages(vi, page); + give_pages(rq, page); return skb; } -static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb) +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb) { struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb); struct page *page; @@ -221,7 +245,8 @@ static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb) skb->dev->stats.rx_length_errors++; return -EINVAL; } - page = virtqueue_get_buf(vi->rvq, &len); + + page = virtqueue_get_buf(rq->rvq, &len); if (!page) { pr_debug("%s: rx error: %d buffers missing\n", skb->dev->name, hdr->mhdr.num_buffers); @@ -234,13 +259,14 @@ static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb) set_skb_frag(skb, page, 0, &len); - --vi->num; + --rq->num; } return 0; } -static void receive_buf(struct net_device *dev, void *buf, unsigned int len) +static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) { + struct net_device *dev = rq->vi->dev; struct virtnet_info *vi = netdev_priv(dev); struct virtnet_stats __percpu *stats = this_cpu_ptr(vi->stats); struct sk_buff *skb; @@ -251,7 +277,7 @@ static void receive_buf(struct net_device *dev, void *buf, unsigned int len) pr_debug("%s: short packet %i\n", dev->name, len); dev->stats.rx_length_errors++; if (vi->mergeable_rx_bufs || vi->big_packets) - give_pages(vi, buf); + give_pages(rq, buf); else dev_kfree_skb(buf); return; @@ -263,14 +289,14 @@ static void receive_buf(struct net_device *dev, void *buf, unsigned int len) skb_trim(skb, len); } else { page = buf; - skb = page_to_skb(vi, page, len); + skb = page_to_skb(rq, page, len); if (unlikely(!skb)) { dev->stats.rx_dropped++; - give_pages(vi, page); + give_pages(rq, page); return; } if (vi->mergeable_rx_bufs) - if (receive_mergeable(vi, skb)) { + if (receive_mergeable(rq, skb)) { dev_kfree_skb(skb); return; } @@ -341,184 +367,200 @@ frame_err: dev_kfree_skb(skb); } -static int add_recvbuf_small(struct virtnet_info *vi, gfp_t 
gfp) +static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp) { struct sk_buff *skb; struct skb_vnet_hdr *hdr; int err; - skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN); + skb = netdev_alloc_skb_ip_align(rq->vi->dev, MAX_PACKET_LEN); if (unlikely(!skb)) return -ENOMEM; skb_put(skb, MAX_PACKET_LEN); hdr = skb_vnet_hdr(skb); - sg_set_buf(vi->rx_sg, &hdr->hdr, sizeof hdr->hdr); + sg_set_buf(rq->rx_sg, &hdr->hdr, sizeof hdr->hdr); - skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len); + skb_to_sgvec(skb, rq->rx_sg + 1, 0, skb->len); - err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, 2, skb, gfp); + err = virtqueue_add_buf_gfp(rq->rvq, rq->rx_sg, 0, 2, skb, gfp); if (err < 0) dev_kfree_skb(skb); return err; } -static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp) +static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp) { struct page *first, *list = NULL; char *p; int i, err, offset; - /* page in vi->rx_sg[MAX_SKB_FRAGS + 1] is list tail */ + /* page in rq->rx_sg[MAX_SKB_FRAGS + 1] is list tail */ for (i = MAX_SKB_FRAGS + 1; i > 1; --i) { - first = get_a_page(vi, gfp); + first = get_a_page(rq, gfp); if (!first) { if (list) - give_pages(vi, list); + give_pages(rq, list); return -ENOMEM; } - sg_set_buf(&vi->rx_sg[i], page_address(first), PAGE_SIZE); + sg_set_buf(&rq->rx_sg[i], page_address(first), PAGE_SIZE); /* chain new page in list head to match sg */ first->private = (unsigned long)list; list = first; } - first = get_a_page(vi, gfp); + first = get_a_page(rq, gfp); if (!first) { - give_pages(vi, list); + give_pages(rq, list); return -ENOMEM; } p = page_address(first); - /* vi->rx_sg[0], vi->rx_sg[1] share the same page */ - /* a separated vi->rx_sg[0] for virtio_net_hdr only due to QEMU bug */ - sg_set_buf(&vi->rx_sg[0], p, sizeof(struct virtio_net_hdr)); + /* rq->rx_sg[0], rq->rx_sg[1] share the same page */ + /* a separated rq->rx_sg[0] for virtio_net_hdr only due to QEMU bug */ + sg_set_buf(&rq->rx_sg[0], p, sizeof(struct virtio_net_hdr)); - /* vi->rx_sg[1] for data packet, from offset */ + /* rq->rx_sg[1] for data packet, from offset */ offset = sizeof(struct padded_vnet_hdr); - sg_set_buf(&vi->rx_sg[1], p + offset, PAGE_SIZE - offset); + sg_set_buf(&rq->rx_sg[1], p + offset, PAGE_SIZE - offset); /* chain first in list head */ first->private = (unsigned long)list; - err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2, + err = virtqueue_add_buf_gfp(rq->rvq, rq->rx_sg, 0, MAX_SKB_FRAGS + 2, first, gfp); if (err < 0) - give_pages(vi, first); + give_pages(rq, first); return err; } -static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp) +static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp) { struct page *page; int err; - page = get_a_page(vi, gfp); + page = get_a_page(rq, gfp); if (!page) return -ENOMEM; - sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE); + sg_init_one(rq->rx_sg, page_address(page), PAGE_SIZE); - err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, 1, page, gfp); + err = virtqueue_add_buf_gfp(rq->rvq, rq->rx_sg, 0, 1, page, gfp); if (err < 0) - give_pages(vi, page); + give_pages(rq, page); return err; } /* Returns false if we couldn't fill entirely (OOM). 
*/ -static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp) +static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp) { + struct virtnet_info *vi = rq->vi; int err; bool oom; do { if (vi->mergeable_rx_bufs) - err = add_recvbuf_mergeable(vi, gfp); + err = add_recvbuf_mergeable(rq, gfp); else if (vi->big_packets) - err = add_recvbuf_big(vi, gfp); + err = add_recvbuf_big(rq, gfp); else - err = add_recvbuf_small(vi, gfp); + err = add_recvbuf_small(rq, gfp); oom = err == -ENOMEM; if (err < 0) break; - ++vi->num; + ++rq->num; } while (err > 0); - if (unlikely(vi->num > vi->max)) - vi->max = vi->num; - virtqueue_kick(vi->rvq); + if (unlikely(rq->num > rq->max)) + rq->max = rq->num; + virtqueue_kick(rq->rvq); return !oom; } static void skb_recv_done(struct virtqueue *rvq) { + int qnum = rvq->queue_index / 2; /* RX/TX vqs are allocated in pairs */ struct virtnet_info *vi = rvq->vdev->priv; + struct napi_struct *napi = &vi->rq[qnum]->napi; + /* Schedule NAPI, Suppress further interrupts if successful. */ - if (napi_schedule_prep(&vi->napi)) { + if (napi_schedule_prep(napi)) { virtqueue_disable_cb(rvq); - __napi_schedule(&vi->napi); + __napi_schedule(napi); } } -static void virtnet_napi_enable(struct virtnet_info *vi) +static void virtnet_napi_enable(struct receive_queue *rq) { - napi_enable(&vi->napi); + napi_enable(&rq->napi); /* If all buffers were filled by other side before we napi_enabled, we * won't get another interrupt, so process any outstanding packets * now. virtnet_poll wants re-enable the queue, so we disable here. * We synchronize against interrupts via NAPI_STATE_SCHED */ - if (napi_schedule_prep(&vi->napi)) { - virtqueue_disable_cb(vi->rvq); - __napi_schedule(&vi->napi); + if (napi_schedule_prep(&rq->napi)) { + virtqueue_disable_cb(rq->rvq); + __napi_schedule(&rq->napi); } } +static void virtnet_napi_enable_all_queues(struct virtnet_info *vi) +{ + int i; + + for (i = 0; i < vi->numtxqs; i++) + virtnet_napi_enable(vi->rq[i]); +} + static void refill_work(struct work_struct *work) { - struct virtnet_info *vi; + struct napi_struct *napi; + struct receive_queue *rq; bool still_empty; - vi = container_of(work, struct virtnet_info, refill.work); - napi_disable(&vi->napi); - still_empty = !try_fill_recv(vi, GFP_KERNEL); - virtnet_napi_enable(vi); + rq = container_of(work, struct receive_queue, refill.work); + napi = &rq->napi; + + napi_disable(napi); + still_empty = !try_fill_recv(rq, GFP_KERNEL); + virtnet_napi_enable(rq); /* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again. */ if (still_empty) - schedule_delayed_work(&vi->refill, HZ/2); + schedule_delayed_work(&rq->refill, HZ/2); } static int virtnet_poll(struct napi_struct *napi, int budget) { - struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi); + struct receive_queue *rq = container_of(napi, struct receive_queue, + napi); void *buf; unsigned int len, received = 0; again: while (received < budget && - (buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) { - receive_buf(vi->dev, buf, len); - --vi->num; + (buf = virtqueue_get_buf(rq->rvq, &len)) != NULL) { + receive_buf(rq, buf, len); + --rq->num; received++; } - if (vi->num < vi->max / 2) { - if (!try_fill_recv(vi, GFP_ATOMIC)) - schedule_delayed_work(&vi->refill, 0); + if (rq->num < rq->max / 2) { + if (!try_fill_recv(rq, GFP_ATOMIC)) + schedule_delayed_work(&rq->refill, 0); } /* Out of packets? 
*/ if (received < budget) { napi_complete(napi); - if (unlikely(!virtqueue_enable_cb(vi->rvq)) && + if (unlikely(!virtqueue_enable_cb(rq->rvq)) && napi_schedule_prep(napi)) { - virtqueue_disable_cb(vi->rvq); + virtqueue_disable_cb(rq->rvq); __napi_schedule(napi); goto again; } @@ -527,13 +569,14 @@ again: return received; } -static unsigned int free_old_xmit_skbs(struct virtnet_info *vi) +static unsigned int free_old_xmit_skbs(struct virtnet_info *vi, + struct virtqueue *svq) { struct sk_buff *skb; unsigned int len, tot_sgs = 0; struct virtnet_stats __percpu *stats = this_cpu_ptr(vi->stats); - while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) { + while ((skb = virtqueue_get_buf(svq, &len)) != NULL) { pr_debug("Sent skb %p\n", skb); u64_stats_update_begin(&stats->syncp); @@ -547,7 +590,8 @@ static unsigned int free_old_xmit_skbs(struct virtnet_info *vi) return tot_sgs; } -static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) +static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb, + struct virtqueue *svq, struct scatterlist *tx_sg) { struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb); const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; @@ -585,12 +629,12 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) /* Encode metadata header at front. */ if (vi->mergeable_rx_bufs) - sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr); + sg_set_buf(tx_sg, &hdr->mhdr, sizeof hdr->mhdr); else - sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr); + sg_set_buf(tx_sg, &hdr->hdr, sizeof hdr->hdr); - hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1; - return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg, + hdr->num_sg = skb_to_sgvec(skb, tx_sg + 1, 0, skb->len) + 1; + return virtqueue_add_buf(svq, tx_sg, hdr->num_sg, 0, skb); } @@ -598,31 +642,34 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int capacity; + int qnum = skb_get_queue_mapping(skb); + struct virtqueue *svq = vi->sq[qnum]->svq; /* Free up any pending old buffers before queueing new ones. */ - free_old_xmit_skbs(vi); + free_old_xmit_skbs(vi, svq); /* Try to transmit */ - capacity = xmit_skb(vi, skb); + capacity = xmit_skb(vi, skb, svq, vi->sq[qnum]->tx_sg); /* This can happen with OOM and indirect buffers. */ if (unlikely(capacity < 0)) { if (net_ratelimit()) { if (likely(capacity == -ENOMEM)) { dev_warn(&dev->dev, - "TX queue failure: out of memory\n"); + "TXQ (%d) failure: out of memory\n", + qnum); } else { dev->stats.tx_fifo_errors++; dev_warn(&dev->dev, - "Unexpected TX queue failure: %d\n", - capacity); + "Unexpected TXQ (%d) failure: %d\n", + qnum, capacity); } } dev->stats.tx_dropped++; kfree_skb(skb); return NETDEV_TX_OK; } - virtqueue_kick(vi->svq); + virtqueue_kick(svq); /* Don't wait up for transmitted skbs to be freed. */ skb_orphan(skb); @@ -631,13 +678,13 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) /* Apparently nice girls don't return TX_BUSY; stop the queue * before it gets out of hand. Naturally, this wastes entries. */ if (capacity < 2+MAX_SKB_FRAGS) { - netif_stop_queue(dev); - if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) { + netif_stop_subqueue(dev, qnum); + if (unlikely(!virtqueue_enable_cb_delayed(svq))) { /* More just got used, free them then recheck. 
*/ - capacity += free_old_xmit_skbs(vi); + capacity += free_old_xmit_skbs(vi, svq); if (capacity >= 2+MAX_SKB_FRAGS) { - netif_start_queue(dev); - virtqueue_disable_cb(vi->svq); + netif_start_subqueue(dev, qnum); + virtqueue_disable_cb(svq); } } } @@ -700,8 +747,10 @@ static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev, static void virtnet_netpoll(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); + int i; - napi_schedule(&vi->napi); + for (i = 0; i < vi->numtxqs; i++) + napi_schedule(&vi->rq[i]->napi); } #endif @@ -709,7 +758,7 @@ static int virtnet_open(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); - virtnet_napi_enable(vi); + virtnet_napi_enable_all_queues(vi); return 0; } @@ -761,8 +810,10 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, static int virtnet_close(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); + int i; - napi_disable(&vi->napi); + for (i = 0; i < vi->numtxqs; i++) + napi_disable(&vi->rq[i]->napi); return 0; } @@ -919,10 +970,10 @@ static void virtnet_update_status(struct virtnet_info *vi) if (vi->status & VIRTIO_NET_S_LINK_UP) { netif_carrier_on(vi->dev); - netif_wake_queue(vi->dev); + netif_tx_wake_all_queues(vi->dev); } else { netif_carrier_off(vi->dev); - netif_stop_queue(vi->dev); + netif_tx_stop_all_queues(vi->dev); } } @@ -933,18 +984,222 @@ static void virtnet_config_changed(struct virtio_device *vdev) virtnet_update_status(vi); } +static void free_receive_bufs(struct virtnet_info *vi) +{ + int i; + + for (i = 0; i < vi->numtxqs; i++) { + BUG_ON(vi->rq[i] == NULL); + while (vi->rq[i]->pages) + __free_pages(get_a_page(vi->rq[i], GFP_KERNEL), 0); + } +} + +/* Free memory allocated for send and receive queues */ +static void free_rq_sq(struct virtnet_info *vi) +{ + int i; + + if (vi->rq) { + for (i = 0; i < vi->numtxqs; i++) + kfree(vi->rq[i]); + kfree(vi->rq); + } + + if (vi->sq) { + for (i = 0; i < vi->numtxqs; i++) + kfree(vi->sq[i]); + kfree(vi->sq); + } +} + +static void free_unused_bufs(struct virtnet_info *vi) +{ + void *buf; + int i; + + for (i = 0; i < vi->numtxqs; i++) { + struct virtqueue *svq = vi->sq[i]->svq; + + while (1) { + buf = virtqueue_detach_unused_buf(svq); + if (!buf) + break; + dev_kfree_skb(buf); + } + } + + for (i = 0; i < vi->numtxqs; i++) { + struct virtqueue *rvq = vi->rq[i]->rvq; + + while (1) { + buf = virtqueue_detach_unused_buf(rvq); + if (!buf) + break; + if (vi->mergeable_rx_bufs || vi->big_packets) + give_pages(vi->rq[i], buf); + else + dev_kfree_skb(buf); + --vi->rq[i]->num; + } + BUG_ON(vi->rq[i]->num != 0); + } +} + +#define MAX_DEVICE_NAME 16 +static int initialize_vqs(struct virtnet_info *vi, int numtxqs) +{ + vq_callback_t **callbacks; + struct virtqueue **vqs; + int i, err = -ENOMEM; + int totalvqs; + char **names; + + /* Allocate receive queues */ + vi->rq = kcalloc(numtxqs, sizeof(*vi->rq), GFP_KERNEL); + if (!vi->rq) + goto out; + for (i = 0; i < numtxqs; i++) { + vi->rq[i] = kzalloc(sizeof(*vi->rq[i]), GFP_KERNEL); + if (!vi->rq[i]) + goto out; + } + + /* Allocate send queues */ + vi->sq = kcalloc(numtxqs, sizeof(*vi->sq), GFP_KERNEL); + if (!vi->sq) + goto out; + for (i = 0; i < numtxqs; i++) { + vi->sq[i] = kzalloc(sizeof(*vi->sq[i]), GFP_KERNEL); + if (!vi->sq[i]) + goto out; + } + + /* setup initial receive and send queue parameters */ + for (i = 0; i < numtxqs; i++) { + vi->rq[i]->vi = vi; + vi->rq[i]->pages = NULL; + INIT_DELAYED_WORK(&vi->rq[i]->refill, refill_work); + netif_napi_add(vi->dev, 
&vi->rq[i]->napi, virtnet_poll, + napi_weight); + + sg_init_table(vi->rq[i]->rx_sg, ARRAY_SIZE(vi->rq[i]->rx_sg)); + sg_init_table(vi->sq[i]->tx_sg, ARRAY_SIZE(vi->sq[i]->tx_sg)); + } + + /* + * We expect 1 RX virtqueue followed by 1 TX virtqueues, followed + * by the same 'numtxqs-1' times, and optionally one control virtqueue. + */ + totalvqs = numtxqs * 2 + + virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ); + + /* Allocate space for find_vqs parameters */ + vqs = kmalloc(totalvqs * sizeof(*vqs), GFP_KERNEL); + callbacks = kmalloc(totalvqs * sizeof(*callbacks), GFP_KERNEL); + names = kzalloc(totalvqs * sizeof(*names), GFP_KERNEL); + if (!vqs || !callbacks || !names) + goto free_params; + +#if 1 + /* Allocate/initialize parameters for recv/send virtqueues */ + for (i = 0; i < numtxqs * 2; i++) { + names[i] = kmalloc(MAX_DEVICE_NAME * sizeof(*names[i]), + GFP_KERNEL); + if (!names[i]) + goto free_params; + + if (!(i & 1)) { /* RX */ + callbacks[i] = skb_recv_done; + sprintf(names[i], "input.%d", i / 2); + } else { + callbacks[i] = skb_xmit_done; + sprintf(names[i], "output.%d", i / 2); + } + } + + /* Parameters for control virtqueue, if any */ + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { + callbacks[i] = NULL; + names[i] = "control"; + } +#else + /* Allocate/initialize parameters for recv virtqueues */ + for (i = 0; i < numtxqs * 2; i += 2) { + callbacks[i] = skb_recv_done; + names[i] = kmalloc(MAX_DEVICE_NAME * sizeof(*names[i]), + GFP_KERNEL); + if (!names[i]) + goto free_params; + sprintf(names[i], "input.%d", i / 2); + } + + /* Allocate/initialize parameters for send virtqueues */ + for (i = 1; i < numtxqs * 2; i += 2) { + callbacks[i] = skb_xmit_done; + names[i] = kmalloc(MAX_DEVICE_NAME * sizeof(*names[i]), + GFP_KERNEL); + if (!names[i]) + goto free_params; + sprintf(names[i], "output.%d", i / 2); + } + + /* Parameters for control virtqueue, if any */ + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { + callbacks[i - 1] = NULL; + names[i - 1] = "control"; + } +#endif + + err = vi->vdev->config->find_vqs(vi->vdev, totalvqs, vqs, callbacks, + (const char **)names); + if (err) + goto free_params; + + /* Assign the allocated vqs alternatively for RX/TX */ + for (i = 0; i < numtxqs * 2; i += 2) { + vi->rq[i/2]->rvq = vqs[i]; + vi->sq[i/2]->svq = vqs[i + 1]; + } + + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) + vi->cvq = vqs[i]; + +free_params: + if (names) { + for (i = 0; i < numtxqs * 2; i++) + kfree(names[i]); + kfree(names); + } + + kfree(callbacks); + kfree(vqs); + +out: + if (err) + free_rq_sq(vi); + + return err; +} + static int virtnet_probe(struct virtio_device *vdev) { - int err; + int i, err; + u16 numtxqs; + u16 num_queue_pairs = 2; struct net_device *dev; struct virtnet_info *vi; - struct virtqueue *vqs[3]; - vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL}; - const char *names[] = { "input", "output", "control" }; - int nvqs; + + /* Find if host supports MULTIQUEUE */ + err = virtio_config_val(vdev, VIRTIO_NET_F_MULTIQUEUE, + offsetof(struct virtio_net_config, + num_queue_pairs), &num_queue_pairs); + numtxqs = num_queue_pairs / 2; + if (!numtxqs) + numtxqs = 1; /* Allocate ourselves a network device with room for our info */ - dev = alloc_etherdev(sizeof(struct virtnet_info)); + dev = alloc_etherdev_mq(sizeof(struct virtnet_info), numtxqs); if (!dev) return -ENOMEM; @@ -991,19 +1246,14 @@ static int virtnet_probe(struct virtio_device *vdev) /* Set up our device-specific information */ vi = netdev_priv(dev); - 
netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight); vi->dev = dev; vi->vdev = vdev; vdev->priv = vi; - vi->pages = NULL; vi->stats = alloc_percpu(struct virtnet_stats); err = -ENOMEM; if (vi->stats == NULL) goto free; - - INIT_DELAYED_WORK(&vi->refill, refill_work); - sg_init_table(vi->rx_sg, ARRAY_SIZE(vi->rx_sg)); - sg_init_table(vi->tx_sg, ARRAY_SIZE(vi->tx_sg)); + vi->numtxqs = numtxqs; /* If we can receive ANY GSO packets, we must allocate large ones. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || @@ -1014,23 +1264,14 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; - /* We expect two virtqueues, receive then send, - * and optionally control. */ - nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2; - - err = vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names); + /* Initialize our rx/tx queue parameters, and invoke find_vqs */ + err = initialize_vqs(vi, numtxqs); if (err) goto free_stats; - vi->rvq = vqs[0]; - vi->svq = vqs[1]; - - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { - vi->cvq = vqs[2]; - - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) - dev->features |= NETIF_F_HW_VLAN_FILTER; - } + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) && + virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) + dev->features |= NETIF_F_HW_VLAN_FILTER; err = register_netdev(dev); if (err) { @@ -1039,14 +1280,21 @@ static int virtnet_probe(struct virtio_device *vdev) } /* Last of all, set up some receive buffers. */ - try_fill_recv(vi, GFP_KERNEL); - - /* If we didn't even get one input buffer, we're useless. */ - if (vi->num == 0) { - err = -ENOMEM; - goto unregister; + for (i = 0; i < numtxqs; i++) { + try_fill_recv(vi->rq[i], GFP_KERNEL); + + /* If we didn't even get one input buffer, we're useless. */ + if (vi->rq[i]->num == 0) { + if (i) + free_unused_bufs(vi); + err = -ENOMEM; + goto free_recv_bufs; + } } + dev_info(&dev->dev, "(virtio-net) Allocated %d RX & TX vq's\n", + numtxqs); + /* Assume link up if device can't report link status, otherwise get link status from config. */ if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { @@ -1057,61 +1305,51 @@ static int virtnet_probe(struct virtio_device *vdev) netif_carrier_on(dev); } - pr_debug("virtnet: registered device %s\n", dev->name); + pr_debug("virtnet: registered device %s with %d RX and TX vq's\n", + dev->name, numtxqs); return 0; -unregister: +free_recv_bufs: + free_receive_bufs(vi); unregister_netdev(dev); - cancel_delayed_work_sync(&vi->refill); + free_vqs: + for (i = 0; i < numtxqs; i++) + cancel_delayed_work_sync(&vi->rq[i]->refill); vdev->config->del_vqs(vdev); + free_rq_sq(vi); + free_stats: free_percpu(vi->stats); + free: free_netdev(dev); return err; } -static void free_unused_bufs(struct virtnet_info *vi) -{ - void *buf; - while (1) { - buf = virtqueue_detach_unused_buf(vi->svq); - if (!buf) - break; - dev_kfree_skb(buf); - } - while (1) { - buf = virtqueue_detach_unused_buf(vi->rvq); - if (!buf) - break; - if (vi->mergeable_rx_bufs || vi->big_packets) - give_pages(vi, buf); - else - dev_kfree_skb(buf); - --vi->num; - } - BUG_ON(vi->num != 0); -} - static void __devexit virtnet_remove(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; + int i; /* Stop all the virtqueues. 
*/ vdev->config->reset(vdev); unregister_netdev(vi->dev); - cancel_delayed_work_sync(&vi->refill); + + for (i = 0; i < vi->numtxqs; i++) + cancel_delayed_work_sync(&vi->rq[i]->refill); /* Free unused buffers in both send and recv, if any. */ free_unused_bufs(vi); vdev->config->del_vqs(vi->vdev); - while (vi->pages) - __free_pages(get_a_page(vi, GFP_KERNEL), 0); + free_receive_bufs(vi); + + /* Free memory for send and receive queues */ + free_rq_sq(vi); free_percpu(vi->stats); free_netdev(vi->dev); @@ -1129,7 +1367,7 @@ static unsigned int features[] = { VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, - VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, + VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_MULTIQUEUE, }; static struct virtio_driver virtio_net_driver = { diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 970d5a2..fa85ac3 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -49,6 +49,7 @@ #define VIRTIO_NET_F_CTRL_RX 18 /* Control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */ #define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */ +#define VIRTIO_NET_F_MULTIQUEUE 21 /* Device supports multiple TXQ/RXQ */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ @@ -57,6 +58,8 @@ struct virtio_net_config { __u8 mac[6]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ __u16 status; + /* total number of RX/TX queues */ + __u16 num_queue_pairs; } __attribute__((packed)); /* This is the first element of the scatter-gather list. If you don't
Jason Wang writes:
> [full cover letter quoted - snipped]

Here are some performance results for multiqueue tap.

For multiqueue, the test used qemu-kvm + mq patches and net-next-2.6 + tap mq
patches + mq driver. For single queue, the test used qemu-kvm and net-next-2.6.
RFS was also enabled in the guest during the test.

All tests were done with netperf between two i7 (Intel(R) Xeon(R) CPU E5620
2.40GHz) hosts with directly connected 82599 cards.

Quick notes on the results:
- Regression when sending 512 byte packets from guest to external/local host.
- For external host to guest, throughput scales with queues, or is at least on
  par with the single queue implementation.

1 Guest to External Host TCP, 512 byte

Multiqueue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   2054.11   23.43   87
2   2037.32   22.64   89
4   2007.53   22.87   87
8   1993.41   23.82   83
== smp=2 queue=2 ==
1   1960.58   24.30   80
2   9250.41   32.19   287
4   3897.49   49.31   79
8   4088.44   46.85   87
== smp=4 queue=4 ==
1   1986.87   23.17   85
2   4431.79   44.64   99
4   8705.83   51.89   167
8   9420.63   45.96   204
== smp=8 queue=8 ==
1   1820.38   20.17   90
2   3707.64   42.19   87
4   8930.71   63.65   140
8   9391.13   51.90   180

Single Queue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   2032.64   22.96   88
2   2058.76   23.22   88
4   2028.97   22.84   88
8   1989.41   23.89   83
== smp=2 queue=1 ==
1   2444.50   25.00   97
2   9298.64   30.76   302
4   8788.58   30.82   285
8   9158.28   30.45   300
== smp=4 queue=1 ==
1   2359.50   25.10   94
2   9325.88   29.83   312
4   9198.29   32.96   279
8   8980.73   32.25   278
== smp=8 queue=1 ==
1   2170.15   23.77   91
2   8329.73   28.79   289
4   8152.25   36.11   225
8   9121.11   40.08   227

2 Guest to external host TCP with default size

Multiqueue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   7767.87   18.43   421
2   9399.18   21.48   437
4   8373.23   21.37   391
8   9310.84   21.91   424
== smp=2 queue=2 ==
1   9358.75   20.27   461
2   9405.25   30.67   306
4   9407.63   26.24   358
8   9412.77   28.75   327
== smp=4 queue=4 ==
1   9358.39   22.11   423
2   9401.27   27.29   344
4   9414.98   28.75   327
8   9420.93   31.09   303
== smp=8 queue=8 ==
1   9057.52   20.09   450
2   8486.72   28.18   301
4   9330.96   40.13   232
8   9377.99   59.41   157

Single Queue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   8192.58   19.30   424
2   9400.31   22.55   416
4   8771.94   21.75   403
8   8922.61   22.50   396
== smp=2 queue=1 ==
1   9387.28   23.13   405
2   8322.94   24.58   338
4   9404.86   26.22   358
8   9145.79   26.57   344
== smp=4 queue=1 ==
1   2377.83   9.86    241
2   9403.32   26.96   348
4   8822.57   27.23   324
8   9380.85   26.90   348
== smp=8 queue=1 ==
1   7275.95   21.47   338
2   9407.34   27.39   343
4   8365.05   25.99   321
8   9150.65   27.78   329

3 External Host to guest TCP, default packet size

Multiqueue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   8944.69   25.59   349
2   8503.67   24.95   340
4   7910.54   25.88   305
8   7455.13   26.35   282
== smp=2 queue=2 ==
1   9370.11   23.70   395
2   9365.97   31.91   293
4   9389.83   34.99   268
8   9405.52   34.83   270
== smp=4 queue=4 ==
1   9061.71   23.45   386
2   9373.92   22.38   418
4   9399.83   40.89   229
8   9412.92   48.99   192
== smp=8 queue=8 ==
1   8203.61   24.64   332
2   9286.28   32.68   284
4   9403.61   49.33   190
8   9411.42   64.38   146

Single Queue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   8999.39   26.24   342
2   8921.23   25.00   356
4   7918.52   26.60   297
8   6901.77   25.92   266
== smp=2 queue=1 ==
1   9016.77   25.82   349
2   8572.92   33.19   258
4   7962.34   28.88   275
8   6959.10   32.77   212
== smp=4 queue=1 ==
1   8951.43   25.76   347
2   8411.78   35.51   236
4   7874.05   35.99   218
8   6869.55   36.80   186
== smp=8 queue=1 ==
1   9332.84   25.95   359
2   9103.57   30.37   299
4   7907.03   33.94   232
8   6919.99   38.82   178

4 External Host to guest TCP with 512 byte packet size

Multiqueue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   3354.22   15.75   212
2   6419.73   22.59   284
4   7545.04   25.06   301
8   7550.39   26.32   286
== smp=2 queue=2 ==
1   3146.17   14.08   223
2   6414.55   21.01   305
4   9389.08   37.86   247
8   9402.39   40.24   233
== smp=4 queue=4 ==
1   3247.65   14.91   217
2   6528.78   29.89   218
4   9402.89   37.79   248
8   9404.06   47.87   196
== smp=8 queue=8 ==
1   4367.90   14.16   308
2   6962.76   27.99   248
4   9404.83   41.26   227
8   9412.09   57.74   163

Single Queue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   3253.88   14.53   223
2   6385.90   20.83   306
4   7581.40   26.07   290
8   7025.62   26.54   264
== smp=2 queue=1 ==
1   3257.61   13.85   235
2   6385.06   20.66   309
4   7465.50   32.27   231
8   7021.31   31.42   223
== smp=4 queue=1 ==
1   3186.60   15.88   200
2   6298.92   27.40   229
4   7474.69   32.53   229
8   6985.72   33.36   209
== smp=8 queue=1 ==
1   3279.81   17.63   186
2   6513.77   29.78   218
4   7413.30   35.44   209
8   6936.96   32.68   212

5 Guest to Local host TCP with 512 byte packet size

Multiqueue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   1961.31   35.43   55
2   1974.04   34.76   56
4   1906.74   34.04   56
8   1907.94   34.75   54
== smp=2 queue=2 ==
1   1971.22   31.95   61
2   2484.96   58.75   42
4   3290.77   53.18   61
8   3031.99   54.11   56
== smp=4 queue=4 ==
1   1107.56   31.22   35
2   2811.83   59.57   47
4   10276.05  79.79   128
8   12760.93  96.93   131
== smp=8 queue=8 ==
1   1888.28   32.15   58
2   2335.03   56.72   41
4   9785.72   82.22   119
8   11274.42  95.60   117

Single Queue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   1981.08   31.89   62
2   1970.74   32.57   60
4   1944.63   32.02   60
8   1943.50   31.45   61
== smp=2 queue=1 ==
1   2118.23   34.80   60
2   7221.95   45.63   158
4   7924.92   47.06   168
8   8651.28   47.40   182
== smp=4 queue=1 ==
1   2110.70   33.18   63
2   6602.25   42.86   154
4   9715.38   47.38   205
8   20131.98  61.94   325
== smp=8 queue=1 ==
1   1881.33   40.69   46
2   7631.25   48.56   157
4   13366.28  59.47   224
8   19949.45  68.85   289

6 Guest to Local host with default packet size

Multiqueue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   8674.81   34.86   248
2   8576.14   34.72   247
4   8503.87   34.62   245
8   8247.43   33.77   244
== smp=2 queue=2 ==
1   7785.02   32.25   241
2   14696.71  58.14   252
4   12339.64  51.43   239
8   12997.55  52.53   247
== smp=4 queue=4 ==
1   8557.25   32.38   264
2   12164.88  58.56   207
4   18144.19  73.69   246
8   29756.33  96.15   309
== smp=8 queue=8 ==
1   6808.67   36.55   186
2   11590.04  61.14   189
4   23667.67  81.50   290
8   25501.89  92.44   275

Single Queue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   8053.49   36.35   221
2   8493.95   35.21   241
4   8367.26   34.61   241
8   8435.64   35.45   237
== smp=2 queue=1 ==
1   9259.56   35.24   262
2   17153.83  44.07   389
4   16901.67  45.88   368
8   18180.81  42.34   429
== smp=4 queue=1 ==
1   8928.11   31.22   285
2   16835.27  47.79   352
4   16923.83  47.78   354
8   18050.62  45.86   393
== smp=8 queue=1 ==
1   2978.88   25.75   115
2   15422.18  41.97   367
4   16137.10  45.90   351
8   16628.30  48.99   339

7 Local host to Guest with 512 byte packet size

Multiqueue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   3665.90   31.88   114
2   5709.15   38.16   149
4   8803.25   42.92   205
8   10530.33  45.21   232
== smp=2 queue=2 ==
1   3390.07   31.28   108
2   7502.21   62.42   120
4   14247.63  67.23   211
8   16766.93  69.66   240
== smp=4 queue=4 ==
1   3580.96   31.90   112
2   4353.46   62.85   69
4   8264.18   77.94   106
8   16014.00  80.11   199
== smp=8 queue=8 ==
1   1745.36   41.84   41
2   4472.03   73.50   60
4   12646.92  79.86   158
8   18212.21  89.79   202

Single Queue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   4220.96   31.88   132
2   5732.38   37.12   154
4   7006.81   41.60   168
8   10529.09  45.92   229
== smp=2 queue=1 ==
1   2665.41   40.53   65
2   9864.49   59.44   165
4   11678.42  60.20   193
8   16042.60  57.85   277
== smp=4 queue=1 ==
1   2609.10   42.67   61
2   5496.83   68.52   80
4   16848.24  60.49   278
8   14829.66  60.54   244
== smp=8 queue=1 ==
1   2567.15   44.54   57
2   5902.02   59.32   99
4   13265.99  68.48   193
8   15301.16  63.95   239

8 Local host to Guest with default packet size

Multiqueue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   12531.65  29.95   418
2   12495.93  30.05   415
4   12487.40  31.28   399
8   11501.68  33.51   343
== smp=2 queue=2 ==
1   12566.08  28.86   435
2   21756.15  54.33   400
4   19899.84  56.37   353
8   19326.62  61.57   313
== smp=4 queue=4 ==
1   12383.42  28.69   431
2   19714.34  57.62   342
4   20609.45  64.13   321
8   18935.57  95.05   199
== smp=8 queue=8 ==
1   13736.90  31.95   429
2   26157.13  71.77   364
4   22874.41  78.54   291
8   19960.91  96.08   207

Single Queue Result (sessions | throughput | cpu | normalized):
== smp=1 queue=1 ==
1   12501.11  30.01   416
2   12497.01  28.51   438
4   12429.25  31.09   399
8   12152.53  28.20   430
== smp=2 queue=1 ==
1   13632.87  35.32   385
2   19900.82  46.28   430
4   17510.87  42.21   414
8   14443.78  35.48   407
== smp=4 queue=1 ==
1   14584.61  37.70   386
2   12646.50  31.39   402
4   16248.16  49.22   330
8   14131.34  47.48   297
== smp=8 queue=1 ==
1   16279.89  39.51   412
2   16958.02  53.87   314
4   16906.03  50.35   335
8   14686.25  47.30   310

--
Jason Wang
On Friday 12 August 2011 at 09:55 +0800, Jason Wang wrote:

> +	rxq = skb_get_rxhash(skb);
> +	if (rxq) {
> +		tfile = rcu_dereference(tun->tfiles[rxq % numqueues]);
> +		if (tfile)
> +			goto out;
> +	}

You can avoid an expensive divide with the following trick:

	u32 idx = ((u64)rxq * numqueues) >> 32;

> -static struct tun_struct *tun_get(struct file *file)
> +static void tun_detach_all(struct net_device *dev)
>  {
> -	return __tun_get(file->private_data);
> +	struct tun_struct *tun = netdev_priv(dev);
> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
> +	int i, j = 0;
> +
> +	spin_lock(&tun_lock);
> +
> +	for (i = 0; i < MAX_TAP_QUEUES && tun->numqueues; i++) {
> +		tfile = rcu_dereference_protected(tun->tfiles[i],
> +						  lockdep_is_held(&tun_lock));
> +		if (tfile) {
> +			wake_up_all(&tfile->wq.wait);
> +			tfile_list[i++] = tfile;

Typo here, you want tfile_list[j++] = tfile;

> +			rcu_assign_pointer(tun->tfiles[i], NULL);
> +			rcu_assign_pointer(tfile->tun, NULL);
> +			--tun->numqueues;
> +		}
> +	}
> +	BUG_ON(tun->numqueues != 0);
> +	spin_unlock(&tun_lock);
> +
> +	synchronize_rcu();
> +	for(--j; j >= 0; j--)
> +		sock_put(&tfile_list[j]->sk);
>  }
>

Could you take a look at net/packet/af_packet.c to check how David did the
whole fanout thing? See __fanout_unlink().

The trick is to not leave NULL entries in the tun->tfiles[] array; it makes
things easier in the hot path.
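To make the suggestion concrete, here is a small standalone sketch (not the
tun code itself) contrasting the modulo mapping in the quoted hunk with the
multiply-and-shift mapping: treating the 32-bit hash as a fraction of 2^32
and scaling it by numqueues keeps the result in [0, numqueues) with a single
64-bit multiply instead of a divide.

#include <stdint.h>
#include <stdio.h>

/* Modulo mapping, as in the quoted hunk: costs an integer divide. */
static uint32_t pick_queue_mod(uint32_t rxhash, uint32_t numqueues)
{
	return rxhash % numqueues;
}

/* Multiply-and-shift mapping: scale the hash by numqueues and keep the
 * top 32 bits, so the index is always in [0, numqueues). */
static uint32_t pick_queue_mulshift(uint32_t rxhash, uint32_t numqueues)
{
	return ((uint64_t)rxhash * numqueues) >> 32;
}

int main(void)
{
	uint32_t hashes[] = { 0x00000000, 0x12345678, 0x9abcdef0, 0xffffffff };
	uint32_t numqueues = 4;
	unsigned int i;

	for (i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++)
		printf("hash=%08x mod=%u mulshift=%u\n",
		       (unsigned)hashes[i],
		       (unsigned)pick_queue_mod(hashes[i], numqueues),
		       (unsigned)pick_queue_mulshift(hashes[i], numqueues));
	return 0;
}

Note that the two mappings generally pick different queues for the same hash;
what matters for tap is only that the spread is uniform and that a given flow
hash always lands on the same queue.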
Sridhar Samudrala
2011-Aug-13 00:46 UTC
[net-next RFC PATCH 0/7] multiqueue support for tun/tap
On Fri, 2011-08-12 at 09:54 +0800, Jason Wang wrote:
> [full cover letter quoted - snipped]

Is it possible to have tap create these queues automatically when TUNSETIFF
is called, instead of having userspace do the new ioctls? I am just wondering
if it is possible to get multi-queue enabled without any changes to qemu. I
guess the number of queues could be based on the number of vhost threads /
guest virtio-net queues.

Also, is it possible to enable multi-queue on the host alone, without any
guest virtio-net changes?

Have you done any multiple TCP_RR/UDP_RR testing with small packet sizes?
256 byte request/response with 50-100 instances?
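For reference, the per-queue userspace flow being contrasted with automatic
creation would look roughly like the sketch below. The argument type and the
ioctl number used for TUNATTACHQUEUE are not spelled out in this excerpt, so
both are assumptions made purely for illustration (an ifreq naming the
device, as with TUNSETIFF); this is not the ABI from the patches.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/if_tun.h>

#ifndef TUNATTACHQUEUE
/* Placeholder definition for illustration only; the real value would come
 * from the patched linux/if_tun.h. */
#define TUNATTACHQUEUE _IOW('T', 216, int)
#endif

/* Open one fd per queue: the first TUNSETIFF creates the device as before,
 * the remaining fds are attached as extra queues. Error handling and fd
 * cleanup are omitted for brevity. */
static int open_tap_queues(const char *name, int fds[], int nqueues)
{
	struct ifreq ifr;
	int i;

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;	/* plus the multiqueue flag in the real series */
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

	for (i = 0; i < nqueues; i++) {
		fds[i] = open("/dev/net/tun", O_RDWR);
		if (fds[i] < 0)
			return -1;
		if (ioctl(fds[i], i == 0 ? TUNSETIFF : TUNATTACHQUEUE, &ifr) < 0)
			return -1;
	}
	return 0;
}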
----- Original Message -----
> On Friday 12 August 2011 at 09:55 +0800, Jason Wang wrote:
>
> > +	rxq = skb_get_rxhash(skb);
> > +	if (rxq) {
> > +		tfile = rcu_dereference(tun->tfiles[rxq % numqueues]);
> > +		if (tfile)
> > +			goto out;
> > +	}
>
> You can avoid an expensive divide with the following trick:
>
>	u32 idx = ((u64)rxq * numqueues) >> 32;
>

Sure.

> > -static struct tun_struct *tun_get(struct file *file)
> > +static void tun_detach_all(struct net_device *dev)
> > [...]
> > +		if (tfile) {
> > +			wake_up_all(&tfile->wq.wait);
> > +			tfile_list[i++] = tfile;
>
> Typo here, you want tfile_list[j++] = tfile;

Yes, thanks for catching this.

> > [...]
> > +	synchronize_rcu();
> > +	for(--j; j >= 0; j--)
> > +		sock_put(&tfile_list[j]->sk);
> >  }
>
> Could you take a look at net/packet/af_packet.c to check how David did the
> whole fanout thing? See __fanout_unlink().
>
> The trick is to not leave NULL entries in the tun->tfiles[] array; it makes
> things easier in the hot path.

Sure, I will take a look at this.
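To illustrate the packed-array approach __fanout_unlink() takes (which the
reply above agrees to look at): on detach, the last element is moved into the
vacated slot, so the hash-to-queue lookup never has to skip NULL entries. The
sketch below only shows the array bookkeeping in plain C; in the kernel the
pointer updates would go through rcu_assign_pointer() under tun_lock, be
followed by synchronize_rcu(), and the moved file's own queue index would be
updated as well.

#define MAX_TAP_QUEUES 8

struct queue_file;			/* stand-in for struct tun_file */

struct mq_dev {
	struct queue_file *files[MAX_TAP_QUEUES];
	unsigned int numqueues;		/* files[0..numqueues-1] are always valid */
};

/* Attach appends at the end, keeping the array dense. */
static int mq_attach(struct mq_dev *dev, struct queue_file *qf)
{
	if (dev->numqueues >= MAX_TAP_QUEUES)
		return -1;
	dev->files[dev->numqueues++] = qf;
	return 0;
}

/* Detach moves the last entry into the hole instead of leaving NULL, so the
 * rx-hash to queue mapping (modulo or multiply-shift) always hits a live
 * queue without extra checks in the hot path. */
static void mq_detach(struct mq_dev *dev, unsigned int index)
{
	unsigned int last;

	if (!dev->numqueues || index >= dev->numqueues)
		return;
	last = dev->numqueues - 1;
	if (index != last)
		dev->files[index] = dev->files[last];
	dev->files[last] = NULL;
	dev->numqueues--;
}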