This is the third version of the macvtap device driver, following another major restructuring and a lot of bug fixes: * Change macvtap to be based around a struct sock * macvtap: fix initialization * return 0 to netlink * don't use rcu for q->file and q->vlan pointers * macvtap: checkpatch.pl fixes * macvtap: fix tun IFF flags * Use a struct socket to make tx flow control work * disable BH processing during transmit * only add an ethernet header for receive not forward * allocate the SKB using GFP_NOWAIT since we're in rcu_read_lock * use atomic allocation for socket * fix blocking on send * do not destroy netdev twice in error path There are still known problems, but unless there are fundamental concerns, I'd like this to go into net-next as an experimental driver, fixing up the remaining problems by 2.6.34-rc1. Arnd
Arnd Bergmann
2010-Jan-27 10:05 UTC
[Bridge] [PATCH 1/3] net: maintain namespace isolation between vlan and real device
In the vlan and macvlan drivers, the start_xmit function forwards data to the dev_queue_xmit function for another device, which may potentially belong to a different namespace. To make sure that classification stays within a single namespace, this resets the potentially critical fields. Signed-off-by: Arnd Bergmann <arnd at arndb.de> --- drivers/net/macvlan.c | 2 +- include/linux/netdevice.h | 9 +++++++++ net/8021q/vlan_dev.c | 2 +- net/core/dev.c | 35 +++++++++++++++++++++++++++++++---- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index bad1303..e0436fd 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -269,7 +269,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) } xmit_world: - skb->dev = vlan->lowerdev; + skb_set_dev(skb, vlan->lowerdev); return dev_queue_xmit(skb); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93a32a5..622ba5a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1004,6 +1004,15 @@ static inline bool netdev_uses_dsa_tags(struct net_device *dev) return 0; } +#ifndef CONFIG_NET_NS +static inline void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb->dev = dev; +} +#else /* CONFIG_NET_NS */ +void skb_set_dev(struct sk_buff *skb, struct net_device *dev); +#endif + static inline bool netdev_uses_trailer_tags(struct net_device *dev) { #ifdef CONFIG_NET_DSA_TAG_TRAILER diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 77a49ff..95034a8 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -322,7 +322,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, } - skb->dev = vlan_dev_info(dev)->real_dev; + skb_set_dev(skb, vlan_dev_info(dev)->real_dev); len = skb->len; ret = dev_queue_xmit(skb); diff --git a/net/core/dev.c b/net/core/dev.c index 2cba5c5..e80403a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1448,13 +1448,10 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) if (skb->len > (dev->mtu + dev->hard_header_len)) return NET_RX_DROP; - skb_dst_drop(skb); + skb_set_dev(skb, dev); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, dev); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1614,6 +1611,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) return false; } +/** + * skb_dev_set -- assign a buffer to a new device + * @skb: buffer for the new device + * @dev: network device + * + * If an skb is owned by a device already, we have to reset + * all data private to the namespace a device belongs to + * before assigning it a new device. + */ +#ifdef CONFIG_NET_NS +void skb_set_dev(struct sk_buff *skb, struct net_device *dev) +{ + skb_dst_drop(skb); + if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) { + secpath_reset(skb); + nf_reset(skb); + skb_init_secmark(skb); + skb->mark = 0; + skb->priority = 0; + skb->nf_trace = 0; + skb->ipvs_property = 0; +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#endif + } + skb->dev = dev; +} +EXPORT_SYMBOL(skb_set_dev); +#endif /* CONFIG_NET_NS */ + /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. -- 1.6.3.3
Arnd Bergmann
2010-Jan-27 10:06 UTC
[Bridge] [PATCH 2/3] net/macvlan: allow multiple driver backends
This makes it possible to hook into the macvlan driver from another kernel module. In particular, the goal is to extend it with the macvtap backend that provides a tun/tap compatible interface directly on the macvlan device. Signed-off-by: Arnd Bergmann <arnd at arndb.de> --- drivers/net/macvlan.c | 113 +++++++++++++++++++------------------------- include/linux/if_macvlan.h | 70 +++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 64 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index e0436fd..1517537 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -39,31 +39,6 @@ struct macvlan_port { struct list_head vlans; }; -/** - * struct macvlan_rx_stats - MACVLAN percpu rx stats - * @rx_packets: number of received packets - * @rx_bytes: number of received bytes - * @multicast: number of received multicast packets - * @rx_errors: number of errors - */ -struct macvlan_rx_stats { - unsigned long rx_packets; - unsigned long rx_bytes; - unsigned long multicast; - unsigned long rx_errors; -}; - -struct macvlan_dev { - struct net_device *dev; - struct list_head list; - struct hlist_node hlist; - struct macvlan_port *port; - struct net_device *lowerdev; - struct macvlan_rx_stats *rx_stats; - enum macvlan_mode mode; -}; - - static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { @@ -118,31 +93,17 @@ static int macvlan_addr_busy(const struct macvlan_port *port, return 0; } -static inline void macvlan_count_rx(const struct macvlan_dev *vlan, - unsigned int len, bool success, - bool multicast) -{ - struct macvlan_rx_stats *rx_stats; - - rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); - if (likely(success)) { - rx_stats->rx_packets++;; - rx_stats->rx_bytes += len; - if (multicast) - rx_stats->multicast++; - } else { - rx_stats->rx_errors++; - } -} -static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, +static int macvlan_broadcast_one(struct sk_buff *skb, + const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { + struct net_device *dev = vlan->dev; if (!skb) return NET_RX_DROP; if (local) - return dev_forward_skb(dev, skb); + return vlan->forward(dev, skb); skb->dev = dev; if (!compare_ether_addr_64bits(eth->h_dest, @@ -151,7 +112,7 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, else skb->pkt_type = PACKET_MULTICAST; - return netif_receive_skb(skb); + return vlan->receive(skb); } static void macvlan_broadcast(struct sk_buff *skb, @@ -175,7 +136,7 @@ static void macvlan_broadcast(struct sk_buff *skb, continue; nskb = skb_clone(skb, GFP_ATOMIC); - err = macvlan_broadcast_one(nskb, vlan->dev, eth, + err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, 1); @@ -238,7 +199,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) skb->dev = dev; skb->pkt_type = PACKET_HOST; - netif_receive_skb(skb); + vlan->receive(skb); return NULL; } @@ -260,7 +221,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { unsigned int length = skb->len + ETH_HLEN; - int ret = dev_forward_skb(dest->dev, skb); + int ret = dest->forward(dest->dev, skb); macvlan_count_rx(dest, length, ret == NET_RX_SUCCESS, 0); @@ -273,8 +234,8 @@ xmit_world: return dev_queue_xmit(skb); } -static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, - struct net_device *dev) +netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, + struct net_device *dev) { int i = skb_get_queue_mapping(skb); struct netdev_queue *txq = netdev_get_tx_queue(dev, i); @@ -290,6 +251,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, return ret; } +EXPORT_SYMBOL_GPL(macvlan_start_xmit); static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, @@ -623,8 +585,11 @@ static int macvlan_get_tx_queues(struct net *net, return 0; } -static int macvlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +int macvlan_common_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + int (*receive)(struct sk_buff *skb), + int (*forward)(struct net_device *dev, + struct sk_buff *skb)) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; @@ -664,6 +629,8 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; + vlan->receive = receive; + vlan->forward = forward; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) @@ -677,8 +644,17 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, netif_stacked_transfer_operstate(lowerdev, dev); return 0; } +EXPORT_SYMBOL_GPL(macvlan_common_newlink); -static void macvlan_dellink(struct net_device *dev, struct list_head *head) +static int macvlan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + return macvlan_common_newlink(src_net, dev, tb, data, + netif_receive_skb, + dev_forward_skb); +} + +void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; @@ -689,6 +665,7 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head) if (list_empty(&port->vlans)) macvlan_port_destroy(port->dev); } +EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) @@ -720,19 +697,27 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, }; -static struct rtnl_link_ops macvlan_link_ops __read_mostly = { +int macvlan_link_register(struct rtnl_link_ops *ops) +{ + /* common fields */ + ops->priv_size = sizeof(struct macvlan_dev); + ops->get_tx_queues = macvlan_get_tx_queues; + ops->setup = macvlan_setup; + ops->validate = macvlan_validate; + ops->maxtype = IFLA_MACVLAN_MAX; + ops->policy = macvlan_policy; + ops->changelink = macvlan_changelink; + ops->get_size = macvlan_get_size; + ops->fill_info = macvlan_fill_info; + + return rtnl_link_register(ops); +}; +EXPORT_SYMBOL_GPL(macvlan_link_register); + +static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", - .priv_size = sizeof(struct macvlan_dev), - .get_tx_queues = macvlan_get_tx_queues, - .setup = macvlan_setup, - .validate = macvlan_validate, .newlink = macvlan_newlink, .dellink = macvlan_dellink, - .maxtype = IFLA_MACVLAN_MAX, - .policy = macvlan_policy, - .changelink = macvlan_changelink, - .get_size = macvlan_get_size, - .fill_info = macvlan_fill_info, }; static int macvlan_device_event(struct notifier_block *unused, @@ -761,7 +746,7 @@ static int macvlan_device_event(struct notifier_block *unused, break; case NETDEV_UNREGISTER: list_for_each_entry_safe(vlan, next, &port->vlans, list) - macvlan_dellink(vlan->dev, NULL); + vlan->dev->rtnl_link_ops->dellink(vlan->dev, NULL); break; } return NOTIFY_DONE; @@ -778,7 +763,7 @@ static int __init macvlan_init_module(void) register_netdevice_notifier(&macvlan_notifier_block); macvlan_handle_frame_hook = macvlan_handle_frame; - err = rtnl_link_register(&macvlan_link_ops); + err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 5f200ba..9a11544 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -1,6 +1,76 @@ #ifndef _LINUX_IF_MACVLAN_H #define _LINUX_IF_MACVLAN_H +#include <linux/if_link.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <net/netlink.h> + +struct macvlan_port; +struct macvtap_queue; + +/** + * struct macvlan_rx_stats - MACVLAN percpu rx stats + * @rx_packets: number of received packets + * @rx_bytes: number of received bytes + * @multicast: number of received multicast packets + * @rx_errors: number of errors + */ +struct macvlan_rx_stats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long multicast; + unsigned long rx_errors; +}; + +struct macvlan_dev { + struct net_device *dev; + struct list_head list; + struct hlist_node hlist; + struct macvlan_port *port; + struct net_device *lowerdev; + struct macvlan_rx_stats *rx_stats; + enum macvlan_mode mode; + int (*receive)(struct sk_buff *skb); + int (*forward)(struct net_device *dev, struct sk_buff *skb); +}; + +static inline void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast) +{ + struct macvlan_rx_stats *rx_stats; + + rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); + if (likely(success)) { + rx_stats->rx_packets++;; + rx_stats->rx_bytes += len; + if (multicast) + rx_stats->multicast++; + } else { + rx_stats->rx_errors++; + } +} + +extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + int (*receive)(struct sk_buff *skb), + int (*forward)(struct net_device *dev, + struct sk_buff *skb)); + +extern void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast); + +extern void macvlan_dellink(struct net_device *dev, struct list_head *head); + +extern int macvlan_link_register(struct rtnl_link_ops *ops); + +extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, + struct net_device *dev); + + extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *); #endif /* _LINUX_IF_MACVLAN_H */ -- 1.6.3.3
In order to use macvlan with qemu and other tools that require a tap file descriptor, the macvtap driver adds a small backend with a character device with the same interface as the tun driver, with a minimum set of features. Macvtap interfaces are created in the same way as macvlan interfaces using ip link, but the netif is just used as a handle for configuration and accounting, while the data goes through the chardev. Each macvtap interface has its own character device, simplifying permission management significantly over the generic tun/tap driver. Cc: Patrick McHardy <kaber at trash.net> Cc: Stephen Hemminger <shemminger at linux-foundation.org> Cc: David S. Miller" <davem at davemloft.net> Cc: "Michael S. Tsirkin" <mst at redhat.com> Cc: Herbert Xu <herbert at gondor.apana.org.au> Cc: Or Gerlitz <ogerlitz at voltaire.com> Cc: netdev at vger.kernel.org Cc: bridge at lists.linux-foundation.org Cc: linux-kernel at vger.kernel.org Signed-off-by: Arnd Bergmann <arnd at arndb.de> --- drivers/net/Kconfig | 12 + drivers/net/Makefile | 1 + drivers/net/macvtap.c | 572 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/if_macvlan.h | 1 + 4 files changed, 586 insertions(+), 0 deletions(-) create mode 100644 drivers/net/macvtap.c diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index cb0e534..411e207 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -90,6 +90,18 @@ config MACVLAN To compile this driver as a module, choose M here: the module will be called macvlan. +config MACVTAP + tristate "MAC-VLAN based tap driver (EXPERIMENTAL)" + depends on MACVLAN + help + This adds a specialized tap character device driver that is based + on the MAC-VLAN network interface, called macvtap. A macvtap device + can be added in the same way as a macvlan device, using 'type + macvlan', and then be accessed through the tap user space interface. + + To compile this driver as a module, choose M here: the module + will be called macvtap. + config EQUALIZER tristate "EQL (serial line load balancing) support" ---help--- diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 0b763cb..9595803 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_IFB) += ifb.o obj-$(CONFIG_MACVLAN) += macvlan.o +obj-$(CONFIG_MACVTAP) += macvtap.o obj-$(CONFIG_DE600) += de600.o obj-$(CONFIG_DE620) += de620.o obj-$(CONFIG_LANCE) += lance.o diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c new file mode 100644 index 0000000..2916202 --- /dev/null +++ b/drivers/net/macvtap.c @@ -0,0 +1,572 @@ +#include <linux/etherdevice.h> +#include <linux/if_macvlan.h> +#include <linux/interrupt.h> +#include <linux/nsproxy.h> +#include <linux/compat.h> +#include <linux/if_tun.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/cache.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/cdev.h> +#include <linux/fs.h> + +#include <net/net_namespace.h> +#include <net/rtnetlink.h> +#include <net/sock.h> + +/* + * A macvtap queue is the central object of this driver, it connects + * an open character device to a macvlan interface. There can be + * multiple queues on one interface, which map back to queues + * implemented in hardware on the underlying device. + * + * macvtap_proto is used to allocate queues through the sock allocation + * mechanism. + * + * TODO: multiqueue support is currently not implemented, even though + * macvtap is basically prepared for that. We will need to add this + * here as well as in virtio-net and qemu to get line rate on 10gbit + * adapters from a guest. + */ +struct macvtap_queue { + struct sock sk; + struct socket sock; + struct macvlan_dev *vlan; + struct file *file; +}; + +static struct proto macvtap_proto = { + .name = "macvtap", + .owner = THIS_MODULE, + .obj_size = sizeof (struct macvtap_queue), +}; + +/* + * Minor number matches netdev->ifindex, so need a potentially + * large value. This also makes it possible to split the + * tap functionality out again in the future by offering it + * from other drivers besides macvtap. As long as every device + * only has one tap, the interface numbers assure that the + * device nodes are unique. + */ +static unsigned int macvtap_major; +#define MACVTAP_NUM_DEVS 65536 +static struct class *macvtap_class; +static struct cdev macvtap_cdev; + +/* + * RCU usage: + * The macvtap_queue is referenced both from the chardev struct file + * and from the struct macvlan_dev using rcu_read_lock. + * + * We never actually update the contents of a macvtap_queue atomically + * with RCU but it is used for race-free destruction of a queue when + * either the file or the macvlan_dev goes away. Pointers back to + * the dev and the file are implicitly valid as long as the queue + * exists. + * + * The callbacks from macvlan are always done with rcu_read_lock held + * already, while in the file_operations, we get it ourselves. + * + * When destroying a queue, we remove the pointers from the file and + * from the dev and then synchronize_rcu to make sure no thread is + * still using the queue. There may still be references to the struct + * sock inside of the queue from outbound SKBs, but these never + * reference back to the file or the dev. The data structure is freed + * through __sk_free when both our references and any pending SKBs + * are gone. + * + * macvtap_lock is only used to prevent multiple concurrent open() + * calls to assign a new vlan->tap pointer. It could be moved into + * the macvlan_dev itself but is extremely rarely used. + */ +static DEFINE_SPINLOCK(macvtap_lock); + +/* + * Choose the next free queue, for now there is only one + */ +static int macvtap_set_queue(struct net_device *dev, struct file *file, + struct macvtap_queue *q) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + int err = -EBUSY; + + spin_lock(&macvtap_lock); + if (rcu_dereference(vlan->tap)) + goto out; + + err = 0; + q->vlan = vlan; + rcu_assign_pointer(vlan->tap, q); + + q->file = file; + rcu_assign_pointer(file->private_data, q); + +out: + spin_unlock(&macvtap_lock); + return err; +} + +/* + * We must destroy each queue exactly once, when either + * the netdev or the file go away. + * + * Using the spinlock makes sure that we don't get + * to the queue again after destroying it. + * + * synchronize_rcu serializes with the packet flow + * that uses rcu_read_lock. + */ +static void macvtap_del_queue(struct macvtap_queue **qp) +{ + struct macvtap_queue *q; + + spin_lock(&macvtap_lock); + q = rcu_dereference(*qp); + if (!q) { + spin_unlock(&macvtap_lock); + return; + } + + rcu_assign_pointer(q->vlan->tap, NULL); + rcu_assign_pointer(q->file->private_data, NULL); + spin_unlock(&macvtap_lock); + + synchronize_rcu(); + sock_put(&q->sk); +} + +/* + * Since we only support one queue, just dereference the pointer. + */ +static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, + struct sk_buff *skb) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + + return rcu_dereference(vlan->tap); +} + +static void macvtap_del_queues(struct net_device *dev) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + macvtap_del_queue(&vlan->tap); +} + +static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file) +{ + rcu_read_lock_bh(); + return rcu_dereference(file->private_data); +} + +static inline void macvtap_file_put_queue(void) +{ + rcu_read_unlock_bh(); +} + +/* + * Forward happens for data that gets sent from one macvlan + * endpoint to another one in bridge mode. We just take + * the skb and put it into the receive queue. + */ +static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) +{ + struct macvtap_queue *q = macvtap_get_queue(dev, skb); + if (!q) + return -ENOLINK; + + skb_queue_tail(&q->sk.sk_receive_queue, skb); + wake_up(q->sk.sk_sleep); + return 0; +} + +/* + * Receive is for data from the external interface (lowerdev), + * in case of macvtap, we can treat that the same way as + * forward, which macvlan cannot. + */ +static int macvtap_receive(struct sk_buff *skb) +{ + skb_push(skb, ETH_HLEN); + return macvtap_forward(skb->dev, skb); +} + +static int macvtap_newlink(struct net *src_net, + struct net_device *dev, + struct nlattr *tb[], + struct nlattr *data[]) +{ + struct device *classdev; + dev_t devt; + int err; + + err = macvlan_common_newlink(src_net, dev, tb, data, + macvtap_receive, macvtap_forward); + if (err) + goto out; + + devt = MKDEV(MAJOR(macvtap_major), dev->ifindex); + + classdev = device_create(macvtap_class, &dev->dev, devt, + dev, "tap%d", dev->ifindex); + if (IS_ERR(classdev)) { + err = PTR_ERR(classdev); + macvtap_del_queues(dev); + } + +out: + return err; +} + +static void macvtap_dellink(struct net_device *dev, + struct list_head *head) +{ + device_destroy(macvtap_class, + MKDEV(MAJOR(macvtap_major), dev->ifindex)); + + macvtap_del_queues(dev); + macvlan_dellink(dev, head); +} + +static struct rtnl_link_ops macvtap_link_ops __read_mostly = { + .kind = "macvtap", + .newlink = macvtap_newlink, + .dellink = macvtap_dellink, +}; + + +static void macvtap_sock_write_space(struct sock *sk) +{ + if (!sock_writeable(sk) || + !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + return; + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_sync(sk->sk_sleep); +} + +static int macvtap_open(struct inode *inode, struct file *file) +{ + struct net *net = current->nsproxy->net_ns; + struct net_device *dev = dev_get_by_index(net, iminor(inode)); + struct macvtap_queue *q; + int err; + + err = -ENODEV; + if (!dev) + goto out; + + /* check if this is a macvtap device */ + err = -EINVAL; + if (dev->rtnl_link_ops != &macvtap_link_ops) + goto out; + + err = -ENOMEM; + q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, + &macvtap_proto); + if (!q) + goto out; + + init_waitqueue_head(&q->sock.wait); + q->sock.type = SOCK_RAW; + q->sock.state = SS_CONNECTED; + sock_init_data(&q->sock, &q->sk); + q->sk.sk_allocation = GFP_ATOMIC; /* for now */ + q->sk.sk_write_space = macvtap_sock_write_space; + + err = macvtap_set_queue(dev, file, q); + if (err) + sock_put(&q->sk); + +out: + return err; +} + +static int macvtap_release(struct inode *inode, struct file *file) +{ + macvtap_del_queue((struct macvtap_queue **)&file->private_data); + return 0; +} + +static unsigned int macvtap_poll(struct file *file, poll_table * wait) +{ + struct macvtap_queue *q = macvtap_file_get_queue(file); + unsigned int mask = POLLERR; + + if (!q) + goto out; + + mask = 0; + poll_wait(file, &q->sock.wait, wait); + + if (!skb_queue_empty(&q->sk.sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + if (sock_writeable(&q->sk) || + (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && + sock_writeable(&q->sk))) + mask |= POLLOUT | POLLWRNORM; + +out: + macvtap_file_put_queue(); + return mask; +} + +/* Get packet from user space buffer */ +static ssize_t macvtap_get_user(struct macvtap_queue *q, + const struct iovec *iv, size_t count, + int noblock) +{ + struct sk_buff *skb; + size_t len = count; + int err; + + if (unlikely(len < ETH_HLEN)) + return -EINVAL; + + skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); + + if (!skb) { + macvlan_count_rx(q->vlan, 0, false, false); + return err; + } + + skb_reserve(skb, NET_IP_ALIGN); + skb_put(skb, count); + + if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) { + macvlan_count_rx(q->vlan, 0, false, false); + kfree_skb(skb); + return -EFAULT; + } + + skb_set_network_header(skb, ETH_HLEN); + + macvlan_start_xmit(skb, q->vlan->dev); + + return count; +} + +static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t result = -ENOLINK; + struct macvtap_queue *q = macvtap_file_get_queue(file); + + if (!q) + goto out; + + result = macvtap_get_user(q, iv, iov_length(iv, count), + file->f_flags & O_NONBLOCK); +out: + macvtap_file_put_queue(); + return result; +} + +/* Put packet to the user space buffer */ +static ssize_t macvtap_put_user(struct macvtap_queue *q, + const struct sk_buff *skb, + const struct iovec *iv, int len) +{ + struct macvlan_dev *vlan = q->vlan; + int ret; + + len = min_t(int, skb->len, len); + + ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); + + macvlan_count_rx(vlan, len, ret == 0, 0); + + return ret ? ret : len; +} + +static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct macvtap_queue *q = macvtap_file_get_queue(file); + + DECLARE_WAITQUEUE(wait, current); + struct sk_buff *skb; + ssize_t len, ret = 0; + + if (!q) { + ret = -ENOLINK; + goto out; + } + + len = iov_length(iv, count); + if (len < 0) { + ret = -EINVAL; + goto out; + } + + add_wait_queue(q->sk.sk_sleep, &wait); + while (len) { + current->state = TASK_INTERRUPTIBLE; + + /* Read frames from the queue */ + skb = skb_dequeue(&q->sk.sk_receive_queue); + if (!skb) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + /* Nothing to read, let's sleep */ + schedule(); + continue; + } + ret = macvtap_put_user(q, skb, iv, len); + kfree_skb(skb); + break; + } + + current->state = TASK_RUNNING; + remove_wait_queue(q->sk.sk_sleep, &wait); + +out: + macvtap_file_put_queue(); + return ret; +} + +/* + * provide compatibility with generic tun/tap interface + */ +static long macvtap_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct macvtap_queue *q; + void __user *argp = (void __user *)arg; + struct ifreq __user *ifr = argp; + unsigned int __user *up = argp; + unsigned int u; + int err; + + switch (cmd) { + case TUNSETIFF: + /* ignore the name, just look at flags */ + if (get_user(u, &ifr->ifr_flags)) + return -EFAULT; + if (u != (IFF_TAP | IFF_NO_PI)) + return -EINVAL; + return 0; + + case TUNGETIFF: + q = macvtap_file_get_queue(file); + if (!q) + return -ENOLINK; + err = 0; + if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || + put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) + err = -EFAULT; + macvtap_file_put_queue(); + return err; + + case TUNGETFEATURES: + if (put_user((IFF_TAP | IFF_NO_PI), up)) + return -EFAULT; + return 0; + + case TUNSETSNDBUF: + /* ignore */ + return 0; + + case TUNSETOFFLOAD: + /* let the user check for future flags */ + if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + /* TODO: add support for these, so far we don't + support any offload */ + if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + return 0; + + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +static long macvtap_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations macvtap_fops = { + .owner = THIS_MODULE, + .open = macvtap_open, + .release = macvtap_release, + .aio_read = macvtap_aio_read, + .aio_write = macvtap_aio_write, + .poll = macvtap_poll, + .llseek = no_llseek, + .unlocked_ioctl = macvtap_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = macvtap_compat_ioctl, +#endif +}; + +static int macvtap_init(void) +{ + int err; + + err = alloc_chrdev_region(&macvtap_major, 0, + MACVTAP_NUM_DEVS, "macvtap"); + if (err) + goto out1; + + cdev_init(&macvtap_cdev, &macvtap_fops); + err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS); + if (err) + goto out2; + + macvtap_class = class_create(THIS_MODULE, "macvtap"); + if (IS_ERR(macvtap_class)) { + err = PTR_ERR(macvtap_class); + goto out3; + } + + err = macvlan_link_register(&macvtap_link_ops); + if (err) + goto out4; + + return 0; + +out4: + class_unregister(macvtap_class); +out3: + cdev_del(&macvtap_cdev); +out2: + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +out1: + return err; +} +module_init(macvtap_init); + +static void macvtap_exit(void) +{ + rtnl_link_unregister(&macvtap_link_ops); + class_unregister(macvtap_class); + cdev_del(&macvtap_cdev); + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +} +module_exit(macvtap_exit); + +MODULE_ALIAS_RTNL_LINK("macvtap"); +MODULE_AUTHOR("Arnd Bergmann <arnd at arndb.de>"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 9a11544..51f1512 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -34,6 +34,7 @@ struct macvlan_dev { enum macvlan_mode mode; int (*receive)(struct sk_buff *skb); int (*forward)(struct net_device *dev, struct sk_buff *skb); + struct macvtap_queue *tap; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, -- 1.6.3.3
On Wednesday 27 January 2010, Arnd Bergmann wrote:> There are still known problems, but unless there > are fundamental concerns, I'd like this to go > into net-next as an experimental driver, > fixing up the remaining problems by 2.6.34-rc1.I should have been more specific here. The one really annoying problem is a reference counting problem I introduced in one of the last changes that prevents you from destroying a device after it has been used. Unfortunately, I'm still traveling after LCA, and haven't had a chance to look into this before sending out the patches as I had originally planned. I've also seen crashes that are not fully reproducible, any bug reports on those are appreciated. Arnd
David Miller
2010-Jan-29 05:33 UTC
[Bridge] [PATCH 1/3] net: maintain namespace isolation between vlan and real device
From: Arnd Bergmann <arnd at arndb.de> Date: Wed, 27 Jan 2010 11:05:15 +0100> + * skb_dev_set -- assign a buffer to a new deviceMy english is terrible, but I think this should be "assign a new device to a buffer". If you agree, please fix this up when you resubmit patches #1 and #2 along with the fix you already plan to make to patch #3. Thanks!
Arnd Bergmann
2010-Jan-29 10:12 UTC
[Bridge] [PATCH 1/3] net: maintain namespace isolation between vlan and real device
On Friday 29 January 2010, David Miller wrote:> From: Arnd Bergmann <arnd at arndb.de> > Date: Wed, 27 Jan 2010 11:05:15 +0100 > > > + * skb_dev_set -- assign a buffer to a new device > > My english is terrible, but I think this should be > "assign a new device to a buffer".Right, that seems clearer.> If you agree, please fix this up when you resubmit patches #1 and #2 > along with the fix you already plan to make to patch #3.Ok, will do. Thanks, Arnd
This is the fourth version of the macvtap driver, based on the comments I got for the last version I got a few days ago. Very few changes: * release netdev in chardev open function so we can destroy it properly. * Implement TUNSETSNDBUF * fix sleeping call in rcu_read_lock * Fix comment in namespace isolation patch * Fix small context difference to make it apply to net-next I can't really test here while travelling, so please give it a go if you're interested in this driver. Arnd
Arnd Bergmann
2010-Jan-30 22:23 UTC
[Bridge] [PATCH 2/3] macvlan: allow multiple driver backends
This makes it possible to hook into the macvlan driver from another kernel module. In particular, the goal is to extend it with the macvtap backend that provides a tun/tap compatible interface directly on the macvlan device. Signed-off-by: Arnd Bergmann <arnd at arndb.de> --- drivers/net/macvlan.c | 113 +++++++++++++++++++------------------------- include/linux/if_macvlan.h | 70 +++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 64 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d32e0bd..40faa36 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -39,31 +39,6 @@ struct macvlan_port { struct list_head vlans; }; -/** - * struct macvlan_rx_stats - MACVLAN percpu rx stats - * @rx_packets: number of received packets - * @rx_bytes: number of received bytes - * @multicast: number of received multicast packets - * @rx_errors: number of errors - */ -struct macvlan_rx_stats { - unsigned long rx_packets; - unsigned long rx_bytes; - unsigned long multicast; - unsigned long rx_errors; -}; - -struct macvlan_dev { - struct net_device *dev; - struct list_head list; - struct hlist_node hlist; - struct macvlan_port *port; - struct net_device *lowerdev; - struct macvlan_rx_stats *rx_stats; - enum macvlan_mode mode; -}; - - static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { @@ -118,31 +93,17 @@ static int macvlan_addr_busy(const struct macvlan_port *port, return 0; } -static inline void macvlan_count_rx(const struct macvlan_dev *vlan, - unsigned int len, bool success, - bool multicast) -{ - struct macvlan_rx_stats *rx_stats; - - rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); - if (likely(success)) { - rx_stats->rx_packets++;; - rx_stats->rx_bytes += len; - if (multicast) - rx_stats->multicast++; - } else { - rx_stats->rx_errors++; - } -} -static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, +static int macvlan_broadcast_one(struct sk_buff *skb, + const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { + struct net_device *dev = vlan->dev; if (!skb) return NET_RX_DROP; if (local) - return dev_forward_skb(dev, skb); + return vlan->forward(dev, skb); skb->dev = dev; if (!compare_ether_addr_64bits(eth->h_dest, @@ -151,7 +112,7 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, else skb->pkt_type = PACKET_MULTICAST; - return netif_rx(skb); + return vlan->receive(skb); } static void macvlan_broadcast(struct sk_buff *skb, @@ -175,7 +136,7 @@ static void macvlan_broadcast(struct sk_buff *skb, continue; nskb = skb_clone(skb, GFP_ATOMIC); - err = macvlan_broadcast_one(nskb, vlan->dev, eth, + err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, 1); @@ -238,7 +199,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) skb->dev = dev; skb->pkt_type = PACKET_HOST; - netif_rx(skb); + vlan->receive(skb); return NULL; } @@ -260,7 +221,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { unsigned int length = skb->len + ETH_HLEN; - int ret = dev_forward_skb(dest->dev, skb); + int ret = dest->forward(dest->dev, skb); macvlan_count_rx(dest, length, ret == NET_RX_SUCCESS, 0); @@ -273,8 +234,8 @@ xmit_world: return dev_queue_xmit(skb); } -static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, - struct net_device *dev) +netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, + struct net_device *dev) { int i = skb_get_queue_mapping(skb); struct netdev_queue *txq = netdev_get_tx_queue(dev, i); @@ -290,6 +251,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, return ret; } +EXPORT_SYMBOL_GPL(macvlan_start_xmit); static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, @@ -623,8 +585,11 @@ static int macvlan_get_tx_queues(struct net *net, return 0; } -static int macvlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +int macvlan_common_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + int (*receive)(struct sk_buff *skb), + int (*forward)(struct net_device *dev, + struct sk_buff *skb)) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; @@ -664,6 +629,8 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; + vlan->receive = receive; + vlan->forward = forward; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) @@ -677,8 +644,17 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, netif_stacked_transfer_operstate(lowerdev, dev); return 0; } +EXPORT_SYMBOL_GPL(macvlan_common_newlink); -static void macvlan_dellink(struct net_device *dev, struct list_head *head) +static int macvlan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + return macvlan_common_newlink(src_net, dev, tb, data, + netif_rx, + dev_forward_skb); +} + +void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; @@ -689,6 +665,7 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head) if (list_empty(&port->vlans)) macvlan_port_destroy(port->dev); } +EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) @@ -720,19 +697,27 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, }; -static struct rtnl_link_ops macvlan_link_ops __read_mostly = { +int macvlan_link_register(struct rtnl_link_ops *ops) +{ + /* common fields */ + ops->priv_size = sizeof(struct macvlan_dev); + ops->get_tx_queues = macvlan_get_tx_queues; + ops->setup = macvlan_setup; + ops->validate = macvlan_validate; + ops->maxtype = IFLA_MACVLAN_MAX; + ops->policy = macvlan_policy; + ops->changelink = macvlan_changelink; + ops->get_size = macvlan_get_size; + ops->fill_info = macvlan_fill_info; + + return rtnl_link_register(ops); +}; +EXPORT_SYMBOL_GPL(macvlan_link_register); + +static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", - .priv_size = sizeof(struct macvlan_dev), - .get_tx_queues = macvlan_get_tx_queues, - .setup = macvlan_setup, - .validate = macvlan_validate, .newlink = macvlan_newlink, .dellink = macvlan_dellink, - .maxtype = IFLA_MACVLAN_MAX, - .policy = macvlan_policy, - .changelink = macvlan_changelink, - .get_size = macvlan_get_size, - .fill_info = macvlan_fill_info, }; static int macvlan_device_event(struct notifier_block *unused, @@ -761,7 +746,7 @@ static int macvlan_device_event(struct notifier_block *unused, break; case NETDEV_UNREGISTER: list_for_each_entry_safe(vlan, next, &port->vlans, list) - macvlan_dellink(vlan->dev, NULL); + vlan->dev->rtnl_link_ops->dellink(vlan->dev, NULL); break; } return NOTIFY_DONE; @@ -778,7 +763,7 @@ static int __init macvlan_init_module(void) register_netdevice_notifier(&macvlan_notifier_block); macvlan_handle_frame_hook = macvlan_handle_frame; - err = rtnl_link_register(&macvlan_link_ops); + err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 5f200ba..9a11544 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -1,6 +1,76 @@ #ifndef _LINUX_IF_MACVLAN_H #define _LINUX_IF_MACVLAN_H +#include <linux/if_link.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <net/netlink.h> + +struct macvlan_port; +struct macvtap_queue; + +/** + * struct macvlan_rx_stats - MACVLAN percpu rx stats + * @rx_packets: number of received packets + * @rx_bytes: number of received bytes + * @multicast: number of received multicast packets + * @rx_errors: number of errors + */ +struct macvlan_rx_stats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long multicast; + unsigned long rx_errors; +}; + +struct macvlan_dev { + struct net_device *dev; + struct list_head list; + struct hlist_node hlist; + struct macvlan_port *port; + struct net_device *lowerdev; + struct macvlan_rx_stats *rx_stats; + enum macvlan_mode mode; + int (*receive)(struct sk_buff *skb); + int (*forward)(struct net_device *dev, struct sk_buff *skb); +}; + +static inline void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast) +{ + struct macvlan_rx_stats *rx_stats; + + rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); + if (likely(success)) { + rx_stats->rx_packets++;; + rx_stats->rx_bytes += len; + if (multicast) + rx_stats->multicast++; + } else { + rx_stats->rx_errors++; + } +} + +extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + int (*receive)(struct sk_buff *skb), + int (*forward)(struct net_device *dev, + struct sk_buff *skb)); + +extern void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast); + +extern void macvlan_dellink(struct net_device *dev, struct list_head *head); + +extern int macvlan_link_register(struct rtnl_link_ops *ops); + +extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, + struct net_device *dev); + + extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *); #endif /* _LINUX_IF_MACVLAN_H */ -- 1.6.3.3
In order to use macvlan with qemu and other tools that require a tap file descriptor, the macvtap driver adds a small backend with a character device with the same interface as the tun driver, with a minimum set of features. Macvtap interfaces are created in the same way as macvlan interfaces using ip link, but the netif is just used as a handle for configuration and accounting, while the data goes through the chardev. Each macvtap interface has its own character device, simplifying permission management significantly over the generic tun/tap driver. Cc: Patrick McHardy <kaber at trash.net> Cc: Stephen Hemminger <shemminger at linux-foundation.org> Cc: David S. Miller" <davem at davemloft.net> Cc: "Michael S. Tsirkin" <mst at redhat.com> Cc: Herbert Xu <herbert at gondor.apana.org.au> Cc: Or Gerlitz <ogerlitz at voltaire.com> Cc: netdev at vger.kernel.org Cc: bridge at lists.linux-foundation.org Cc: linux-kernel at vger.kernel.org Signed-off-by: Arnd Bergmann <arnd at arndb.de> --- drivers/net/Kconfig | 12 + drivers/net/Makefile | 1 + drivers/net/macvtap.c | 581 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/if_macvlan.h | 1 + 4 files changed, 595 insertions(+), 0 deletions(-) create mode 100644 drivers/net/macvtap.c diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index cb0e534..411e207 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -90,6 +90,18 @@ config MACVLAN To compile this driver as a module, choose M here: the module will be called macvlan. +config MACVTAP + tristate "MAC-VLAN based tap driver (EXPERIMENTAL)" + depends on MACVLAN + help + This adds a specialized tap character device driver that is based + on the MAC-VLAN network interface, called macvtap. A macvtap device + can be added in the same way as a macvlan device, using 'type + macvlan', and then be accessed through the tap user space interface. + + To compile this driver as a module, choose M here: the module + will be called macvtap. + config EQUALIZER tristate "EQL (serial line load balancing) support" ---help--- diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 0b763cb..9595803 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_IFB) += ifb.o obj-$(CONFIG_MACVLAN) += macvlan.o +obj-$(CONFIG_MACVTAP) += macvtap.o obj-$(CONFIG_DE600) += de600.o obj-$(CONFIG_DE620) += de620.o obj-$(CONFIG_LANCE) += lance.o diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c new file mode 100644 index 0000000..ad1f6ef --- /dev/null +++ b/drivers/net/macvtap.c @@ -0,0 +1,581 @@ +#include <linux/etherdevice.h> +#include <linux/if_macvlan.h> +#include <linux/interrupt.h> +#include <linux/nsproxy.h> +#include <linux/compat.h> +#include <linux/if_tun.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/cache.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/cdev.h> +#include <linux/fs.h> + +#include <net/net_namespace.h> +#include <net/rtnetlink.h> +#include <net/sock.h> + +/* + * A macvtap queue is the central object of this driver, it connects + * an open character device to a macvlan interface. There can be + * multiple queues on one interface, which map back to queues + * implemented in hardware on the underlying device. + * + * macvtap_proto is used to allocate queues through the sock allocation + * mechanism. + * + * TODO: multiqueue support is currently not implemented, even though + * macvtap is basically prepared for that. We will need to add this + * here as well as in virtio-net and qemu to get line rate on 10gbit + * adapters from a guest. + */ +struct macvtap_queue { + struct sock sk; + struct socket sock; + struct macvlan_dev *vlan; + struct file *file; +}; + +static struct proto macvtap_proto = { + .name = "macvtap", + .owner = THIS_MODULE, + .obj_size = sizeof (struct macvtap_queue), +}; + +/* + * Minor number matches netdev->ifindex, so need a potentially + * large value. This also makes it possible to split the + * tap functionality out again in the future by offering it + * from other drivers besides macvtap. As long as every device + * only has one tap, the interface numbers assure that the + * device nodes are unique. + */ +static unsigned int macvtap_major; +#define MACVTAP_NUM_DEVS 65536 +static struct class *macvtap_class; +static struct cdev macvtap_cdev; + +/* + * RCU usage: + * The macvtap_queue is referenced both from the chardev struct file + * and from the struct macvlan_dev using rcu_read_lock. + * + * We never actually update the contents of a macvtap_queue atomically + * with RCU but it is used for race-free destruction of a queue when + * either the file or the macvlan_dev goes away. Pointers back to + * the dev and the file are implicitly valid as long as the queue + * exists. + * + * The callbacks from macvlan are always done with rcu_read_lock held + * already, while in the file_operations, we get it ourselves. + * + * When destroying a queue, we remove the pointers from the file and + * from the dev and then synchronize_rcu to make sure no thread is + * still using the queue. There may still be references to the struct + * sock inside of the queue from outbound SKBs, but these never + * reference back to the file or the dev. The data structure is freed + * through __sk_free when both our references and any pending SKBs + * are gone. + * + * macvtap_lock is only used to prevent multiple concurrent open() + * calls to assign a new vlan->tap pointer. It could be moved into + * the macvlan_dev itself but is extremely rarely used. + */ +static DEFINE_SPINLOCK(macvtap_lock); + +/* + * Choose the next free queue, for now there is only one + */ +static int macvtap_set_queue(struct net_device *dev, struct file *file, + struct macvtap_queue *q) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + int err = -EBUSY; + + spin_lock(&macvtap_lock); + if (rcu_dereference(vlan->tap)) + goto out; + + err = 0; + q->vlan = vlan; + rcu_assign_pointer(vlan->tap, q); + + q->file = file; + rcu_assign_pointer(file->private_data, q); + +out: + spin_unlock(&macvtap_lock); + return err; +} + +/* + * We must destroy each queue exactly once, when either + * the netdev or the file go away. + * + * Using the spinlock makes sure that we don't get + * to the queue again after destroying it. + * + * synchronize_rcu serializes with the packet flow + * that uses rcu_read_lock. + */ +static void macvtap_del_queue(struct macvtap_queue **qp) +{ + struct macvtap_queue *q; + + spin_lock(&macvtap_lock); + q = rcu_dereference(*qp); + if (!q) { + spin_unlock(&macvtap_lock); + return; + } + + rcu_assign_pointer(q->vlan->tap, NULL); + rcu_assign_pointer(q->file->private_data, NULL); + spin_unlock(&macvtap_lock); + + synchronize_rcu(); + sock_put(&q->sk); +} + +/* + * Since we only support one queue, just dereference the pointer. + */ +static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, + struct sk_buff *skb) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + + return rcu_dereference(vlan->tap); +} + +static void macvtap_del_queues(struct net_device *dev) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + macvtap_del_queue(&vlan->tap); +} + +static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file) +{ + rcu_read_lock_bh(); + return rcu_dereference(file->private_data); +} + +static inline void macvtap_file_put_queue(void) +{ + rcu_read_unlock_bh(); +} + +/* + * Forward happens for data that gets sent from one macvlan + * endpoint to another one in bridge mode. We just take + * the skb and put it into the receive queue. + */ +static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) +{ + struct macvtap_queue *q = macvtap_get_queue(dev, skb); + if (!q) + return -ENOLINK; + + skb_queue_tail(&q->sk.sk_receive_queue, skb); + wake_up(q->sk.sk_sleep); + return 0; +} + +/* + * Receive is for data from the external interface (lowerdev), + * in case of macvtap, we can treat that the same way as + * forward, which macvlan cannot. + */ +static int macvtap_receive(struct sk_buff *skb) +{ + skb_push(skb, ETH_HLEN); + return macvtap_forward(skb->dev, skb); +} + +static int macvtap_newlink(struct net *src_net, + struct net_device *dev, + struct nlattr *tb[], + struct nlattr *data[]) +{ + struct device *classdev; + dev_t devt; + int err; + + err = macvlan_common_newlink(src_net, dev, tb, data, + macvtap_receive, macvtap_forward); + if (err) + goto out; + + devt = MKDEV(MAJOR(macvtap_major), dev->ifindex); + + classdev = device_create(macvtap_class, &dev->dev, devt, + dev, "tap%d", dev->ifindex); + if (IS_ERR(classdev)) { + err = PTR_ERR(classdev); + macvtap_del_queues(dev); + } + +out: + return err; +} + +static void macvtap_dellink(struct net_device *dev, + struct list_head *head) +{ + device_destroy(macvtap_class, + MKDEV(MAJOR(macvtap_major), dev->ifindex)); + + macvtap_del_queues(dev); + macvlan_dellink(dev, head); +} + +static struct rtnl_link_ops macvtap_link_ops __read_mostly = { + .kind = "macvtap", + .newlink = macvtap_newlink, + .dellink = macvtap_dellink, +}; + + +static void macvtap_sock_write_space(struct sock *sk) +{ + if (!sock_writeable(sk) || + !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + return; + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_sync(sk->sk_sleep); +} + +static int macvtap_open(struct inode *inode, struct file *file) +{ + struct net *net = current->nsproxy->net_ns; + struct net_device *dev = dev_get_by_index(net, iminor(inode)); + struct macvtap_queue *q; + int err; + + err = -ENODEV; + if (!dev) + goto out; + + /* check if this is a macvtap device */ + err = -EINVAL; + if (dev->rtnl_link_ops != &macvtap_link_ops) + goto out; + + err = -ENOMEM; + q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, + &macvtap_proto); + if (!q) + goto out; + + init_waitqueue_head(&q->sock.wait); + q->sock.type = SOCK_RAW; + q->sock.state = SS_CONNECTED; + sock_init_data(&q->sock, &q->sk); + q->sk.sk_allocation = GFP_ATOMIC; /* for now */ + q->sk.sk_write_space = macvtap_sock_write_space; + + err = macvtap_set_queue(dev, file, q); + if (err) + sock_put(&q->sk); + +out: + if (dev) + dev_put(dev); + + return err; +} + +static int macvtap_release(struct inode *inode, struct file *file) +{ + macvtap_del_queue((struct macvtap_queue **)&file->private_data); + return 0; +} + +static unsigned int macvtap_poll(struct file *file, poll_table * wait) +{ + struct macvtap_queue *q = macvtap_file_get_queue(file); + unsigned int mask = POLLERR; + + if (!q) + goto out; + + mask = 0; + poll_wait(file, &q->sock.wait, wait); + + if (!skb_queue_empty(&q->sk.sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + if (sock_writeable(&q->sk) || + (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && + sock_writeable(&q->sk))) + mask |= POLLOUT | POLLWRNORM; + +out: + macvtap_file_put_queue(); + return mask; +} + +/* Get packet from user space buffer */ +static ssize_t macvtap_get_user(struct macvtap_queue *q, + const struct iovec *iv, size_t count, + int noblock) +{ + struct sk_buff *skb; + size_t len = count; + int err; + + if (unlikely(len < ETH_HLEN)) + return -EINVAL; + + skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); + + if (!skb) { + macvlan_count_rx(q->vlan, 0, false, false); + return err; + } + + skb_reserve(skb, NET_IP_ALIGN); + skb_put(skb, count); + + if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) { + macvlan_count_rx(q->vlan, 0, false, false); + kfree_skb(skb); + return -EFAULT; + } + + skb_set_network_header(skb, ETH_HLEN); + + macvlan_start_xmit(skb, q->vlan->dev); + + return count; +} + +static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t result = -ENOLINK; + struct macvtap_queue *q = macvtap_file_get_queue(file); + + if (!q) + goto out; + + result = macvtap_get_user(q, iv, iov_length(iv, count), + file->f_flags & O_NONBLOCK); +out: + macvtap_file_put_queue(); + return result; +} + +/* Put packet to the user space buffer */ +static ssize_t macvtap_put_user(struct macvtap_queue *q, + const struct sk_buff *skb, + const struct iovec *iv, int len) +{ + struct macvlan_dev *vlan = q->vlan; + int ret; + + len = min_t(int, skb->len, len); + + ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); + + macvlan_count_rx(vlan, len, ret == 0, 0); + + return ret ? ret : len; +} + +static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct macvtap_queue *q = macvtap_file_get_queue(file); + + DECLARE_WAITQUEUE(wait, current); + struct sk_buff *skb; + ssize_t len, ret = 0; + + if (!q) { + ret = -ENOLINK; + goto out; + } + + len = iov_length(iv, count); + if (len < 0) { + ret = -EINVAL; + goto out; + } + + add_wait_queue(q->sk.sk_sleep, &wait); + while (len) { + current->state = TASK_INTERRUPTIBLE; + + /* Read frames from the queue */ + skb = skb_dequeue(&q->sk.sk_receive_queue); + if (!skb) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + /* Nothing to read, let's sleep */ + schedule(); + continue; + } + ret = macvtap_put_user(q, skb, iv, len); + kfree_skb(skb); + break; + } + + current->state = TASK_RUNNING; + remove_wait_queue(q->sk.sk_sleep, &wait); + +out: + macvtap_file_put_queue(); + return ret; +} + +/* + * provide compatibility with generic tun/tap interface + */ +static long macvtap_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct macvtap_queue *q; + void __user *argp = (void __user *)arg; + struct ifreq __user *ifr = argp; + unsigned int __user *up = argp; + unsigned int u; + char devname[IFNAMSIZ]; + + switch (cmd) { + case TUNSETIFF: + /* ignore the name, just look at flags */ + if (get_user(u, &ifr->ifr_flags)) + return -EFAULT; + if (u != (IFF_TAP | IFF_NO_PI)) + return -EINVAL; + return 0; + + case TUNGETIFF: + q = macvtap_file_get_queue(file); + if (!q) + return -ENOLINK; + memcpy(devname, q->vlan->dev->name, sizeof(devname)); + macvtap_file_put_queue(); + + if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || + put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) + return -EFAULT; + return 0; + + case TUNGETFEATURES: + if (put_user((IFF_TAP | IFF_NO_PI), up)) + return -EFAULT; + return 0; + + case TUNSETSNDBUF: + if (get_user(u, up)) + return -EFAULT; + + q = macvtap_file_get_queue(file); + q->sk.sk_sndbuf = u; + macvtap_file_put_queue(); + return 0; + + case TUNSETOFFLOAD: + /* let the user check for future flags */ + if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + /* TODO: add support for these, so far we don't + support any offload */ + if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | + TUN_F_TSO_ECN | TUN_F_UFO)) + return -EINVAL; + + return 0; + + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +static long macvtap_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations macvtap_fops = { + .owner = THIS_MODULE, + .open = macvtap_open, + .release = macvtap_release, + .aio_read = macvtap_aio_read, + .aio_write = macvtap_aio_write, + .poll = macvtap_poll, + .llseek = no_llseek, + .unlocked_ioctl = macvtap_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = macvtap_compat_ioctl, +#endif +}; + +static int macvtap_init(void) +{ + int err; + + err = alloc_chrdev_region(&macvtap_major, 0, + MACVTAP_NUM_DEVS, "macvtap"); + if (err) + goto out1; + + cdev_init(&macvtap_cdev, &macvtap_fops); + err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS); + if (err) + goto out2; + + macvtap_class = class_create(THIS_MODULE, "macvtap"); + if (IS_ERR(macvtap_class)) { + err = PTR_ERR(macvtap_class); + goto out3; + } + + err = macvlan_link_register(&macvtap_link_ops); + if (err) + goto out4; + + return 0; + +out4: + class_unregister(macvtap_class); +out3: + cdev_del(&macvtap_cdev); +out2: + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +out1: + return err; +} +module_init(macvtap_init); + +static void macvtap_exit(void) +{ + rtnl_link_unregister(&macvtap_link_ops); + class_unregister(macvtap_class); + cdev_del(&macvtap_cdev); + unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); +} +module_exit(macvtap_exit); + +MODULE_ALIAS_RTNL_LINK("macvtap"); +MODULE_AUTHOR("Arnd Bergmann <arnd at arndb.de>"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 9a11544..51f1512 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -34,6 +34,7 @@ struct macvlan_dev { enum macvlan_mode mode; int (*receive)(struct sk_buff *skb); int (*forward)(struct net_device *dev, struct sk_buff *skb); + struct macvtap_queue *tap; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, -- 1.6.3.3