This attempts to implement a "virtual I/O" layer which should allow common drivers to be efficiently used across most virtual I/O mechanisms. It will no-doubt need further enhancement. The details of probing the device are left to hypervisor-specific code: it simply constructs the "struct virtio_device" and hands it to the probe function (eg. virtnet_probe() or virtblk_probe()). The virtio drivers add and detach input and output buffers; as the buffers are used up their associated "used" pointers are filled in. I have written two virtio device drivers (net and block) and two virtio implementations (for lguest): a read-write socket-style implementation, and a more efficient descriptor-based implementation. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- include/linux/virtio.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff -r 2db2135723b0 include/linux/virtio.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/virtio.h Thu May 31 17:52:19 2007 +1000 @@ -0,0 +1,66 @@ +#ifndef _LINUX_VIRTIO_H +#define _LINUX_VIRTIO_H +#include <linux/types.h> +#include <linux/scatterlist.h> + +/** + * virtio_device - description and routines to drive a virtual device. + * @dev: the underlying struct device. + * @ops: the operations for this virtual device. + */ +struct virtio_device { + struct device *dev; + struct virtio_ops *ops; +}; + +/** + * virtio_ops - virtio abstraction layer + * @add_outbuf: prepare to send data to the other end: + * vdev: the virtio_device + * sg: the description of the buffer(s). + * num: the size of the sg array. + * used: the length sent (set once sending is done). + * Returns an identifier or an error. + * @add_inbuf: prepare to receive data from the other end: + * vdev: the virtio_device + * sg: the description of the buffer(s). + * num: the size of the sg array. + * used: the length sent (set once data received). + * Returns an identifier or an error (eg. -ENOSPC). 
+ * @sync: update after add_inbuf/add_outbuf + * vdev: the virtio_device we're talking about. + * Use the virtio_sync wrapper, to avoid unnecessary calls. + * @detach_outbuf: make sure sent sg can no longer be read. + * vdev: the virtio_device we're talking about. + * id: the identifier returned from add_outbuf. + * @detach_inbuf: make sure sent sg can no longer be written to. + * vdev: the virtio_device we're talking about. + * id: the identifier returned from add_inbuf. + */ +struct virtio_ops { + unsigned long (*add_outbuf)(struct virtio_device *vdev, + const struct scatterlist sg[], + unsigned int num, + unsigned long *used); + + unsigned long (*add_inbuf)(struct virtio_device *vdev, + struct scatterlist sg[], + unsigned int num, + unsigned long *used); + + void (*sync)(struct virtio_device *vdev); + + void (*detach_outbuf)(struct virtio_device *vdev, unsigned long id); + void (*detach_inbuf)(struct virtio_device *vdev, unsigned long id); +}; + +/** + * virtio_sync - start sending/receiving data from the other end. + * @vdev: the virtio_device we're talking about. + */ +static inline void virtio_sync(struct virtio_device *vdev) +{ + if (vdev->ops->sync) + vdev->ops->sync(vdev); +} +#endif /* _LINUX_VIRTIO_H */
Rusty Russell
2007-May-31 05:21 UTC
[PATCH RFC 2/3] virtio infrastructure: example net driver
Example net driver using virtio TODO: 1) Locking (see #2). 2) NAPI. 3) GSO. 4) Checksum options. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- drivers/net/Makefile | 2 drivers/net/virtio_net.c | 237 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/virtio_net.h | 13 ++ 3 files changed, 251 insertions(+), 1 deletion(-) diff -r 96df1769fce9 drivers/net/Makefile --- a/drivers/net/Makefile Thu May 31 17:52:38 2007 +1000 +++ b/drivers/net/Makefile Thu May 31 17:52:40 2007 +1000 @@ -37,7 +37,7 @@ obj-$(CONFIG_CASSINI) += cassini.o obj-$(CONFIG_MACE) += mace.o obj-$(CONFIG_BMAC) += bmac.o - +obj-y += virtio_net.o obj-$(CONFIG_DGRS) += dgrs.o obj-$(CONFIG_VORTEX) += 3c59x.o obj-$(CONFIG_TYPHOON) += typhoon.o diff -r 96df1769fce9 drivers/net/virtio_net.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/net/virtio_net.c Thu May 31 17:53:54 2007 +1000 @@ -0,0 +1,236 @@ +/* A simple network driver using virtio. + * + * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +//#define DEBUG +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/module.h> +#include <linux/virtio.h> +#include <linux/scatterlist.h> + +#define NET_BUFS 128 +/* FIXME: Make dynamic */ +#define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN) + +struct virtnet_info +{ + struct virtio_device *vdev; + struct net_device *ndev; + + /* Receive queue. */ + struct sk_buff *in[NET_BUFS]; + /* Send queue. */ + struct sk_buff *out[NET_BUFS]; + + /* Lengths for input buffers as they are used. */ + unsigned long in_used[NET_BUFS]; + + /* Lengths for output buffers as they are used. */ + unsigned long out_used[NET_BUFS]; + + /* IDs for buffers. */ + long in_ids[NET_BUFS]; + long out_ids[NET_BUFS]; +}; + +static int start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + unsigned int n, i; + /* FIXME: What *is* the max here? 
*/ + struct scatterlist sg[MAX_SKB_FRAGS + 2]; + const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; + + pr_debug("%s: xmit %02x:%02x:%02x:%02x:%02x:%02x\n", + dev->name, dest[0],dest[1],dest[2],dest[3],dest[4],dest[5]); + + /* Go through and find a new output slot, and free any on the way */ + for (i = 0; i < NET_BUFS; i++) { + if (!vi->out[i]) + break; + + if (!vi->out_used[i]) + continue; + + vi->vdev->ops->detach_outbuf(vi->vdev, vi->out_ids[i]); + dev->stats.tx_bytes += vi->out_used[i]; + dev->stats.tx_packets++; + dev_kfree_skb(vi->out[i]); + vi->out[i] = NULL; + break; + } + if (unlikely(i == NET_BUFS)) { + pr_debug("%s: ring full\n", dev->name); + goto stop; + } + + n = skb_to_sgvec(skb, sg, 0, skb->len); + vi->out_used[i] = 0; + vi->out_ids[i] = vi->vdev->ops->add_outbuf(vi->vdev, sg, n, + &vi->out_used[i]); + if (IS_ERR_VALUE(vi->out_ids[i])) { + pr_debug("%s: virtio not prepared to send\n", dev->name); + goto stop; + } + vi->out[i] = skb; + virtio_sync(vi->vdev); + return 0; +stop: + netif_stop_queue(dev); + return NETDEV_TX_BUSY; +} + +static void receive_skb(struct net_device *dev, struct sk_buff *skb, + unsigned len) +{ + if (unlikely(len < ETH_HLEN)) { + pr_debug("%s: short packet %i\n", dev->name, len); + dev->stats.rx_length_errors++; + dev_kfree_skb(skb); + return; + } + BUG_ON(len > MAX_PACKET_LEN); + + skb_trim(skb, len); + skb->protocol = eth_type_trans(skb, dev); + pr_debug("Receiving skb proto 0x%04x len %i type %i\n", + ntohs(skb->protocol), skb->len, skb->pkt_type); + dev->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + netif_rx(skb); +} + +static void try_fill_recv(struct virtnet_info *vi, unsigned int i) +{ + /* FIXME: What *is* the max here? 
*/ + struct scatterlist sg[MAX_SKB_FRAGS + 1]; + unsigned int n; + + vi->in[i] = netdev_alloc_skb(vi->ndev, MAX_PACKET_LEN); + if (unlikely(!vi->in[i])) + return; + + skb_put(vi->in[i], MAX_PACKET_LEN); + n = skb_to_sgvec(vi->in[i], sg, 0, vi->in[i]->len); + vi->in_used[i] = 0; + vi->in_ids[i] = vi->vdev->ops->add_inbuf(vi->vdev, sg, n, + &vi->in_used[i]); + if (IS_ERR_VALUE(vi->in_ids[i])) { + kfree_skb(vi->in[i]); + vi->in[i] = NULL; + } +} + +static int virtnet_open(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + unsigned int i; + + for (i = 0; i < NET_BUFS; i++) + try_fill_recv(vi, i); + + virtio_sync(vi->vdev); + return 0; +} + +static int virtnet_close(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + unsigned int i; + + for (i = 0; i < NET_BUFS; i++) { + if (vi->in[i]) { + vi->vdev->ops->detach_inbuf(vi->vdev, vi->in_ids[i]); + kfree_skb(vi->in[i]); + vi->in[i] = NULL; + } + if (vi->out[i]) { + vi->vdev->ops->detach_outbuf(vi->vdev, vi->out_ids[i]); + kfree_skb(vi->out[i]); + vi->out[i] = NULL; + } + } + return 0; +} + +struct net_device *virtnet_probe(struct virtio_device *vdev, + const u8 mac[ETH_ALEN]) +{ + int err; + struct net_device *dev; + struct virtnet_info *vi; + + dev = alloc_etherdev(sizeof(struct virtnet_info)); + if (!dev) + return ERR_PTR(-ENOMEM); + + SET_MODULE_OWNER(dev); + + ether_setup(dev); + memcpy(dev->dev_addr, mac, ETH_ALEN); + dev->open = virtnet_open; + dev->stop = virtnet_close; + dev->hard_start_xmit = start_xmit; + SET_NETDEV_DEV(dev, vdev->dev); + + vi = netdev_priv(dev); + vi->vdev = vdev; + vi->ndev = dev; + + err = register_netdev(dev); + if (err) { + pr_debug("virtio_net: registering device failed\n"); + goto free; + } + pr_debug("virtnet: registered device %s\n", dev->name); + return dev; + +free: + free_netdev(dev); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(virtnet_probe); + +/* We get this when the other side sends buffers, or consumes them. 
*/ +int virtnet_interrupt(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + unsigned int i; + + for (i = 0; i < NET_BUFS; i++) { + if (vi->in[i] && vi->in_used[i]) { + vi->vdev->ops->detach_inbuf(vi->vdev, vi->in_ids[i]); + receive_skb(dev, vi->in[i], vi->in_used[i]); + try_fill_recv(vi, i); + } + } + + netif_wake_queue(dev); + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(virtnet_interrupt); + +void virtnet_remove(struct net_device *dev) +{ + unregister_netdev(dev); + free_netdev(dev); +} +EXPORT_SYMBOL_GPL(virtnet_remove); + +MODULE_DESCRIPTION("Virtio network driver"); +MODULE_LICENSE("GPL"); diff -r 96df1769fce9 include/linux/virtio_net.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/virtio_net.h Thu May 31 17:52:40 2007 +1000 @@ -0,0 +1,13 @@ +#ifndef _LINUX_VIRTIO_NET_H +#define _LINUX_VIRTIO_NET_H +#include <linux/types.h> +#include <linux/etherdevice.h> +struct net_device; +struct virtio_device; + +struct net_device *virtnet_probe(struct virtio_device *vdev, + const u8 mac[ETH_ALEN]); +int virtnet_interrupt(struct net_device *dev); +void virtnet_remove(struct net_device *dev); + +#endif /* _LINUX_VIRTIO_NET_H */
Rusty Russell wrote: > This attempts to implement a "virtual I/O" layer which should allow > common drivers to be efficiently used across most virtual I/O > mechanisms. It will no-doubt need further enhancement. > > The details of probing the device are left to hypervisor-specific > code: it simple constructs the "struct virtio_device" and hands it to > the probe function (eg. virtnet_probe() or virtblk_probe()). > > The virtio drivers add and detach input and output buffers; as the > buffers are used up their associated "used" pointers are filled in. > > I have written two virtio device drivers (net and block) and two > virtio implementations (for lguest): a read-write socket-style > implementation, and a more efficient descriptor-based implementation). These should work for s390 afaics. They seem to fit the requirements of network IO.
>This attempts to implement a "virtual I/O" layer which should allow >common drivers to be efficiently used across most virtual I/O >mechanisms. It will no-doubt need further enhancement. > >The details of probing the device are left to hypervisor-specific >code: it simple constructs the "struct virtio_device" and hands it to >the probe function (eg. virtnet_probe() or virtblk_probe()). > >The virtio drivers add and detach input and output buffers; as the >buffers are used up their associated "used" pointers are filled in. > >I have written two virtio device drivers (net and block) and two >virtio implementations (for lguest): a read-write socket-style >implementation, and a more efficient descriptor-based implementation). > >Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Those are exactly the things I was planning to add to KVM/Linux. All virtual I/O devices should have a common interface and share the core functionality. Since the Xen PV drivers are already performance-optimized and feature-rich, we were planning to generalize the hypervisor-specific backend in order to reuse them. This is a good step toward such sharing. Cheers, Dor.
Santos, Jose Renato G
2007-Jun-01 16:37 UTC
[Xen-devel] [PATCH RFC 1/3] virtio infrastructure
> -----Original Message----- > From: xen-devel-bounces@lists.xensource.com > [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of > Rusty Russell > Sent: Thursday, May 31, 2007 5:19 AM > To: kvm-devel; Xen Mailing List; virtualization > Cc: Jimi Xenidis; Stephen Rothwell; jmk@plan9.bell-labs.com; > Herbert Xu; Christian Borntraeger; Suzanne McIntosh; Anthony > Liguori; Martin Schwidefsky > Subject: [Xen-devel] [PATCH RFC 1/3] virtio infrastructure > > This attempts to implement a "virtual I/O" layer which should > allow common drivers to be efficiently used across most > virtual I/O mechanisms. It will no-doubt need further enhancement. > > Rusty, could you please clarify what the purpose of this "virtual I/O" layer is? At least for networking, why isn't the current Linux net device abstraction sufficient for hiding the details of different virtual device implementations? What am I missing? Thanks, Renato
Rusty Russell wrote: > This attempts to implement a "virtual I/O" layer which should allow > common drivers to be efficiently used across most virtual I/O > mechanisms. It will no-doubt need further enhancement. > > The details of probing the device are left to hypervisor-specific > code: it simple constructs the "struct virtio_device" and hands it to > the probe function (eg. virtnet_probe() or virtblk_probe()). > > The virtio drivers add and detach input and output buffers; as the > buffers are used up their associated "used" pointers are filled in. > > Good stuff. > +/** > + * virtio_ops - virtio abstraction layer > + * @add_outbuf: prepare to send data to the other end: > + * vdev: the virtio_device > + * sg: the description of the buffer(s). > + * num: the size of the sg array. > + * used: the length sent (set once sending is done). > + * Returns an identifier or an error. > + * @add_inbuf: prepare to receive data from the other end: > + * vdev: the virtio_device > + * sg: the description of the buffer(s). > + * num: the size of the sg array. > + * used: the length sent (set once data received). > + * Returns an identifier or an error (eg. -ENOSPC). > Instead of 'used', how about a completion callback (with associated data pointer)? A new helper, virtio_complete(), would call the callback for all completed requests. It would eliminate all the tedious scanning used to match the identifier. It would also be nice to support a bit of non-buffer data, like a set of bitflags. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic.