For virtualization, we've developed virtio_ring for efficient communication. This would also work well for userspace-kernel communication, particularly for things like the tun device. By using the same ABI, we can join guests to the host kernel trivially. These patches are fairly alpha; I've seen some network stalls I have to track down and there are some fixmes. Comments welcome! Rusty. diff -r 99132ad16999 Documentation/test_vring.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Documentation/test_vring.c Sat Apr 05 21:31:40 2008 +1100 @@ -0,0 +1,47 @@ +#include <unistd.h> +#include <linux/virtio_ring.h> +#include <stdio.h> +#include <stdint.h> +#include <err.h> +#include <poll.h> + +#ifndef __NR_vringfd +#define __NR_vringfd 327 +#endif + +int main() +{ + int fd, r; + struct vring vr; + uint16_t used = 0; + struct pollfd pfd; + void *buf = calloc(vring_size(256, getpagesize()), 0); + + vring_init(&vr, 256, buf, getpagesize()); + + fd = syscall(__NR_vringfd, buf, 256, &used); + if (fd < 0) + err(1, "vringfd gave %i", fd); + + pfd.fd = fd; + pfd.events = POLLIN; + r = poll(&pfd, 1, 0); + + if (r != 0) + err(1, "poll gave %i", r); + + vr.used->idx++; + r = poll(&pfd, 1, 0); + + if (r != 1) + err(1, "poll after buf used gave %i", r); + + used++; + r = poll(&pfd, 1, 0); + + if (r != 0) + err(1, "poll after used incremented gave %i", r); + + close(fd); + return 0; +} diff -r 99132ad16999 arch/x86/kernel/syscall_table_32.S --- a/arch/x86/kernel/syscall_table_32.S Sat Apr 05 21:20:32 2008 +1100 +++ b/arch/x86/kernel/syscall_table_32.S Sat Apr 05 21:31:40 2008 +1100 @@ -326,3 +326,4 @@ ENTRY(sys_call_table) .long sys_fallocate .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime + .long sys_vringfd diff -r 99132ad16999 fs/Kconfig --- a/fs/Kconfig Sat Apr 05 21:20:32 2008 +1100 +++ b/fs/Kconfig Sat Apr 05 21:31:40 2008 +1100 @@ -2135,4 +2135,14 @@ source "fs/nls/Kconfig" source "fs/nls/Kconfig" source "fs/dlm/Kconfig" +config VRINGFD + bool "vring fd support (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + vring is a ringbuffer implementation for efficient I/O. It is + currently used by virtualization hosts (lguest, kvm) for efficient + networking using the tun driver. + + If unsure, say N. + endmenu diff -r 99132ad16999 fs/Makefile --- a/fs/Makefile Sat Apr 05 21:20:32 2008 +1100 +++ b/fs/Makefile Sat Apr 05 21:31:40 2008 +1100 @@ -119,3 +119,4 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_GFS2_FS) += gfs2/ +obj-$(CONFIG_VRINGFD) += vring.o diff -r 99132ad16999 fs/vring.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fs/vring.c Sat Apr 05 21:31:40 2008 +1100 @@ -0,0 +1,376 @@ +/* Ring-buffer file descriptor implementation. + * + * Copyright 2008 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/virtio_ring.h> +#include <linux/vring.h> +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/wait.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/highmem.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/magic.h> +#include <linux/module.h> + +static struct vfsmount *vring_mnt; +static DEFINE_MUTEX(vring_lock); + +struct vring_info +{ + struct vring ring; + u16 mask; + u16 __user *last_used; + u16 last_avail; + + const struct vring_ops *ops; + void *ops_data; + + /* Waitqueue for poll() */ + wait_queue_head_t poll_wait; + + /* The mapped used ring. */ + struct vring_used *used; + struct page *used_page; +}; + +static unsigned int vring_poll(struct file *filp, + struct poll_table_struct *poll) +{ + struct vring_info *vr = filp->private_data; + int err; + unsigned int mask; + u16 used, last_used; + + /* Some uses of vrings require updating in user context. This + * is best done close to the caller, ie. here. */ + if (vr->ops && vr->ops->pull) { + err = vr->ops->pull(vr->ops_data); + if (unlikely(err < 0)) + return err; + + if (err > 0) { + /* Buffers have been used, no need to check indices */ + mask = POLLIN | POLLRDNORM; + goto poll_wait; + } + } + + err = get_user(used, &vr->ring.used->idx); + if (unlikely(err)) + return err; + + err = get_user(last_used, vr->last_used); + if (unlikely(err)) + return err; + + /* More buffers have been used? It's 'readable'. */ + if (used != last_used) + mask = POLLIN | POLLRDNORM; + else + mask = 0; + +poll_wait: + poll_wait(filp, &vr->poll_wait, poll); + + return mask; +} + +static ssize_t vring_write(struct file *filp, const char __user *buf, + size_t size, loff_t *off) +{ + struct vring_info *vr = filp->private_data; + + if (vr->ops && vr->ops->push) + return vr->ops->push(vr->ops_data); + + return -EINVAL; +} + +static int vring_release(struct inode *inode, struct file *filp) +{ + struct vring_info *vr = filp->private_data; + + /* Callback for other end. */ + if (vr->ops && vr->ops->destroy) + vr->ops->destroy(vr->ops_data); + + if (vr->used) { + kunmap(vr->used_page); + put_page(vr->used_page); + } + + kfree(vr); + return 0; +} + +static const struct file_operations vring_fops = { + .release = vring_release, + .write = vring_write, + .poll = vring_poll, +}; + +asmlinkage long sys_vringfd(void __user *addr, + unsigned num_descs, + u16 __user *last_used) +{ + int fd, err; + struct file *filp; + struct vring_info *vr; + + /* Must be a power of two, and representable by u16 */ + if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536) { + err = -EINVAL; + goto out; + } + + fd = get_unused_fd(); + if (fd < 0) { + err = fd; + goto out; + } + + filp = alloc_file(vring_mnt, dget(vring_mnt->mnt_root), FMODE_WRITE, + &vring_fops); + if (!filp) { + err = -ENFILE; + goto put_fd; + } + + filp->private_data = vr = kmalloc(sizeof(*vr), GFP_KERNEL); + if (!vr) { + err = -ENOMEM; + goto put_filp; + } + + /* Set up pointers into ring. */ + vring_init(&vr->ring, num_descs, addr, PAGE_SIZE); + init_waitqueue_head(&vr->poll_wait); + vr->last_used = last_used; + vr->mask = num_descs - 1; + vr->ops = NULL; + vr->used = NULL; + + err = get_user(vr->last_avail, &vr->ring.avail->idx); + if (err) + goto free_vr; + + fd_install(fd, filp); + return fd; + +free_vr: + kfree(vr); +put_filp: + put_filp(filp); +put_fd: + put_unused_fd(fd); +out: + return err; +} + +/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */ +int vring_get_buffer(struct vring_info *vr, + struct iovec *in_iov, + unsigned int *num_in, unsigned long *in_len, + struct iovec *out_iov, + unsigned int *num_out, unsigned long *out_len) +{ + unsigned int i, in = 0, out = 0; + unsigned long dummy; + u16 head; + struct vring_desc d; + + if (unlikely(get_user(head, &vr->ring.avail->idx) != 0)) + return -EFAULT; + + if (vr->last_avail == head) + return 0; + + if (!in_len) + in_len = &dummy; + if (!out_len) + out_len = &dummy; + + *in_len = *out_len = 0; + + if (unlikely(get_user(head, &vr->ring.avail->ring[head]) != 0)) + return -EFAULT; + + i = head; + do { + if (unlikely(i >= vr->ring.num)) { + pr_debug("vring: bad index: %u\n", i); + return -EINVAL; + } + + if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0) + return -EFAULT; + + if (d.flags & VRING_DESC_F_WRITE) { + /* Check for length and iovec overflows */ + if (!num_in) + return -EINVAL; + if (in == *num_in || *in_len + d.len < *in_len) + return -E2BIG; + in_iov[in].iov_len = d.len; + *in_len += d.len; + in_iov[in].iov_base = (void __user*)(long)d.addr; + in++; + } else { + if (!num_out) + return -EINVAL; + if (out == *num_out || *out_len + d.len < *out_len) + return -E2BIG; + out_iov[out].iov_len = d.len; + *out_len += d.len; + out_iov[out].iov_base = (void __user*)(long)d.addr; + out++; + } + + i = d.next; + } while (d.flags & VRING_DESC_F_NEXT); + + if (num_in) + *num_in = in; + if (num_out) + *num_out = out; + + /* 0 is a valid head, so add one. */ + vr->last_avail++; + return head + 1; +} +EXPORT_SYMBOL_GPL(vring_get_buffer); + +void vring_used_buffer(struct vring_info *vr, int id, u32 len) +{ + struct vring_used_elem used; + u16 used_idx; + + BUG_ON(id <= 0 || id > vr->ring.num); + + used.id = id - 1; + used.len = len; + if (get_user(used_idx, &vr->ring.used->idx) != 0) + return; + + copy_to_user(&vr->ring.used->ring[used_idx & vr->mask], &used, + sizeof(used)); + wmb(); + used_idx++; + put_user(used_idx, &vr->ring.used->idx); +} +EXPORT_SYMBOL_GPL(vring_used_buffer); + +void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len) +{ + struct vring_used_elem *used; + + BUG_ON(id <= 0 || id > vr->ring.num); + BUG_ON(!vr->used); + + used = &vr->used->ring[vr->used->idx & vr->mask]; + used->id = id - 1; + used->len = len; + /* Make sure buffer is written before we update index. */ + wmb(); + vr->used->idx++; +} +EXPORT_SYMBOL_GPL(vring_used_buffer_atomic); + +void vring_wake(struct vring_info *vr) +{ + wake_up(&vr->poll_wait); +} +EXPORT_SYMBOL_GPL(vring_wake); + +struct vring_info *vring_attach(int fd, const struct vring_ops *ops, + void *data, bool atomic_use) +{ + struct file *filp; + struct vring_info *vr; + + /* Must be a valid fd, and must be one of ours. */ + filp = fget(fd); + if (!filp) { + vr = ERR_PTR(-EBADF); + goto out; + } + + if (filp->f_op != &vring_fops) { + vr = ERR_PTR(-EBADF); + goto fput; + } + + /* Mutex simply protects against parallel vring_attach. */ + mutex_lock(&vring_lock); + vr = filp->private_data; + if (vr->ops) { + vr = ERR_PTR(-EBUSY); + goto unlock; + } + + /* If they want to use atomically, we have to map the page. */ + if (atomic_use) { + if (get_user_pages(current, current->mm, + (unsigned long)vr->ring.used, 1, 1, 1, + &vr->used_page, NULL) != 1) { + vr = ERR_PTR(-EFAULT); + goto unlock; + } + vr->used = kmap(vr->used_page); + if (!vr->used) { + put_page(vr->used_page); + vr = ERR_PTR(-ENOMEM); + goto unlock; + } + } + + vr->ops = ops; + vr->ops_data = data; + +unlock: + mutex_unlock(&vring_lock); +fput: + fput(filp); +out: + return vr; +} +EXPORT_SYMBOL_GPL(vring_attach); + +static int vringfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "vring", NULL, VRINGFS_SUPER_MAGIC, mnt); +} + +static struct file_system_type vring_fs_type = { + .name = "vringfs", + .get_sb = vringfs_get_sb, + .kill_sb = kill_anon_super, +}; + +static int init(void) +{ + register_filesystem(&vring_fs_type); + vring_mnt = kern_mount(&vring_fs_type); + return 0; +} + +module_init(init); diff -r 99132ad16999 include/asm-x86/unistd_32.h --- a/include/asm-x86/unistd_32.h Sat Apr 05 21:20:32 2008 +1100 +++ b/include/asm-x86/unistd_32.h Sat Apr 05 21:31:40 2008 +1100 @@ -332,6 +332,7 @@ #define __NR_fallocate 324 #define __NR_timerfd_settime 325 #define __NR_timerfd_gettime 326 +#define __NR_vringfd 327 #ifdef __KERNEL__ diff -r 99132ad16999 include/linux/magic.h --- a/include/linux/magic.h Sat Apr 05 21:20:32 2008 +1100 +++ b/include/linux/magic.h Sat Apr 05 21:31:40 2008 +1100 @@ -41,5 +41,6 @@ #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA +#define VRINGFS_SUPER_MAGIC 0xB1BBAD #endif /* __LINUX_MAGIC_H__ */ diff -r 99132ad16999 include/linux/syscalls.h --- a/include/linux/syscalls.h Sat Apr 05 21:20:32 2008 +1100 +++ b/include/linux/syscalls.h Sat Apr 05 21:31:40 2008 +1100 @@ -614,6 +614,7 @@ asmlinkage long sys_timerfd_gettime(int asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_vringfd(void __user *, unsigned num, u16 __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); diff -r 99132ad16999 include/linux/vring.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/vring.h Sat Apr 05 21:31:40 2008 +1100 @@ -0,0 +1,54 @@ +/* Ring-buffer file descriptor implementation. + * + * Copyright 2008 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef _LINUX_VRING_H +#define _LINUX_VRING_H + +/* All members are optional */ +struct vring_ops +{ + /* Cleanup */ + void (*destroy)(void *); + + /* Returns number of used buffers, or negative errno. */ + int (*pull)(void *); + + /* Returns 0 or negative errno. */ + int (*push)(void *); +}; + +/* If they want to call vring_used_buffer_atomic(), set atomic_use. + * This currently means that the userspace used buffer must fit in a page. */ +struct vring_info *vring_attach(int fd, const struct vring_ops *ops, + void *data, bool atomic_use); + +struct iovec; + +/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */ +int vring_get_buffer(struct vring_info *vr, + struct iovec *in_iov, + unsigned int *num_in, unsigned long *in_len, + struct iovec *out_iov, + unsigned int *num_out, unsigned long *out_len); + +void vring_used_buffer(struct vring_info *vr, int id, u32 len); + +void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len); + +void vring_wake(struct vring_info *vr); +#endif /* _LINUX_VRING_H */ diff -r 99132ad16999 kernel/sys_ni.c --- a/kernel/sys_ni.c Sat Apr 05 21:20:32 2008 +1100 +++ b/kernel/sys_ni.c Sat Apr 05 21:31:40 2008 +1100 @@ -161,3 +161,4 @@ cond_syscall(compat_sys_timerfd_settime) cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); +cond_syscall(sys_vringfd);
It turns out the lguest (and possibly kvm) want the addresses in the ring buffer to only cover a certain part of memory, and be offset. It makes sense that this be an ioctl. Signed-off-by: Rusty Russell <rusty at rustcorp.com.au> diff -r 08fb00b8acab Documentation/ioctl-number.txt --- a/Documentation/ioctl-number.txt Sat Apr 05 21:31:40 2008 +1100 +++ b/Documentation/ioctl-number.txt Sat Apr 05 22:00:10 2008 +1100 @@ -183,6 +183,7 @@ 0xAC 00-1F linux/raw.h 0xAC 00-1F linux/raw.h 0xAD 00 Netfilter device in development: <mailto:rusty at rustcorp.com.au> +0xAE 00-01 linux/vring.h 0xB0 all RATIO devices in development: <mailto:vgo at ratio.de> 0xB1 00-1F PPPoX <mailto:mostrows at styx.uwaterloo.ca> diff -r 08fb00b8acab fs/vring.c --- a/fs/vring.c Sat Apr 05 21:31:40 2008 +1100 +++ b/fs/vring.c Sat Apr 05 22:00:10 2008 +1100 @@ -38,6 +38,8 @@ struct vring_info u16 mask; u16 __user *last_used; u16 last_avail; + + unsigned long base, limit; const struct vring_ops *ops; void *ops_data; @@ -120,10 +122,30 @@ static int vring_release(struct inode *i return 0; } +static int vring_ioctl(struct inode *in, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct vring_info *vr = filp->private_data; + + switch (cmd) { + case VRINGSETBASE: + vr->base = arg; + break; + case VRINGSETLIMIT: + vr->limit = arg; + break; + default: + return -ENOTTY; + } + + return 0; +} + static const struct file_operations vring_fops = { .release = vring_release, .write = vring_write, .poll = vring_poll, + .ioctl = vring_ioctl, }; asmlinkage long sys_vringfd(void __user *addr, @@ -166,6 +188,8 @@ asmlinkage long sys_vringfd(void __user vr->mask = num_descs - 1; vr->ops = NULL; vr->used = NULL; + vr->limit = -1UL; + vr->base = 0; err = get_user(vr->last_avail, &vr->ring.avail->idx); if (err) @@ -208,12 +232,15 @@ int vring_get_buffer(struct vring_info * out_len = &dummy; *in_len = *out_len = 0; - - if (unlikely(get_user(head, &vr->ring.avail->ring[head]) != 0)) + + if (unlikely(get_user(head, &vr->ring.avail->ring[vr->last_avail + % vr->ring.num]))) return -EFAULT; i = head; do { + void __user *base; + if (unlikely(i >= vr->ring.num)) { pr_debug("vring: bad index: %u\n", i); return -EINVAL; @@ -222,24 +249,38 @@ int vring_get_buffer(struct vring_info * if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0) return -EFAULT; + if (d.addr + d.len > vr->limit || (d.addr + d.len < d.addr)) { + pr_debug("vring: bad addr/len: %u@%p\n", + d.len, (void *)(unsigned long)d.addr); + return -EINVAL; + } + + base = (void __user *)(unsigned long)d.addr + vr->base; + if (d.flags & VRING_DESC_F_WRITE) { /* Check for length and iovec overflows */ - if (!num_in) + if (!num_in) { + pr_debug("vring: writable desc %u in ring %p\n", + i, vr->ring.desc); return -EINVAL; + } if (in == *num_in || *in_len + d.len < *in_len) return -E2BIG; in_iov[in].iov_len = d.len; *in_len += d.len; - in_iov[in].iov_base = (void __user*)(long)d.addr; + in_iov[in].iov_base = base; in++; } else { - if (!num_out) + if (!num_out) { + pr_debug("vring: readable desc %u in ring %p\n", + i, vr->ring.desc); return -EINVAL; + } if (out == *num_out || *out_len + d.len < *out_len) return -E2BIG; out_iov[out].iov_len = d.len; *out_len += d.len; - out_iov[out].iov_base = (void __user*)(long)d.addr; + out_iov[out].iov_base = base; out++; } diff -r 08fb00b8acab include/linux/vring.h --- a/include/linux/vring.h Sat Apr 05 21:31:40 2008 +1100 +++ b/include/linux/vring.h Sat Apr 05 22:00:10 2008 +1100 @@ -18,7 +18,13 @@ */ #ifndef _LINUX_VRING_H #define _LINUX_VRING_H +#include <linux/types.h> +/* Ioctl defines, as in "ioctls are AEgly". */ +#define VRINGSETBASE _IO(0xAE, 0) +#define VRINGSETLIMIT _IO(0xAE, 1) + +#ifdef __KERNEL__ /* All members are optional */ struct vring_ops { @@ -51,4 +57,6 @@ void vring_used_buffer_atomic(struct vri void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len); void vring_wake(struct vring_info *vr); +#endif /* __KERNEL__ */ + #endif /* _LINUX_VRING_H */
Rusty Russell wrote:> It turns out the lguest (and possibly kvm) want the addresses in the > ring buffer to only cover a certain part of memory, and be offset. > > <mailto:rusty at rustcorp.com.au> > +0xAE 00-01 linux/vring.h >Another virtualization subsystem already uses this range. -- Any sufficiently difficult bug is indistinguishable from a feature.
Rusty Russell wrote:> For virtualization, we've developed virtio_ring for efficient communication. > This would also work well for userspace-kernel communication, particularly > for things like the tun device. By using the same ABI, we can join guests > to the host kernel trivially. > > These patches are fairly alpha; I've seen some network stalls I have to > track down and there are some fixmes. > > Comments welcome! > Rusty. > > diff -r 99132ad16999 Documentation/test_vring.c > --- /dev/null Thu Jan 01 00:00:00 1970 +0000 > +++ b/Documentation/test_vring.c Sat Apr 05 21:31:40 2008 +1100 > @@ -0,0 +1,47 @@ > +#include <unistd.h> > +#include <linux/virtio_ring.h> > +#include <stdio.h> > +#include <stdint.h> > +#include <err.h> > +#include <poll.h> > + > +#ifndef __NR_vringfd > +#define __NR_vringfd 327 > +#endif > + > +int main() > +{ > + int fd, r; > + struct vring vr; > + uint16_t used = 0; > + struct pollfd pfd; > + void *buf = calloc(vring_size(256, getpagesize()), 0);Shouldn't this be calloc(1, vring_size(256, getpagesize()));?> + vring_init(&vr, 256, buf, getpagesize()); > + > + fd = syscall(__NR_vringfd, buf, 256, &used); > + if (fd < 0) > + err(1, "vringfd gave %i", fd); > + > + pfd.fd = fd; > + pfd.events = POLLIN; > + r = poll(&pfd, 1, 0); > + > + if (r != 0) > + err(1, "poll gave %i", r); > + > + vr.used->idx++; > + r = poll(&pfd, 1, 0); > + > + if (r != 1) > + err(1, "poll after buf used gave %i", r); > + > + used++; > + r = poll(&pfd, 1, 0); > + > + if (r != 0) > + err(1, "poll after used incremented gave %i", r);I have a tough time seeing what you're demonstrating here. Perhaps some comments?> + close(fd); > + return 0; > +} > diff -r 99132ad16999 arch/x86/kernel/syscall_table_32.S > --- a/arch/x86/kernel/syscall_table_32.S Sat Apr 05 21:20:32 2008 +1100 > +++ b/arch/x86/kernel/syscall_table_32.S Sat Apr 05 21:31:40 2008 +1100 > @@ -326,3 +326,4 @@ ENTRY(sys_call_table) > .long sys_fallocate > .long sys_timerfd_settime /* 325 */ > .long sys_timerfd_gettime > + .long sys_vringfd > diff -r 99132ad16999 fs/Kconfig > --- a/fs/Kconfig Sat Apr 05 21:20:32 2008 +1100 > +++ b/fs/Kconfig Sat Apr 05 21:31:40 2008 +1100 > @@ -2135,4 +2135,14 @@ source "fs/nls/Kconfig" > source "fs/nls/Kconfig" > source "fs/dlm/Kconfig" > > +config VRINGFD > + bool "vring fd support (EXPERIMENTAL)" > + depends on EXPERIMENTAL > + help > + vring is a ringbuffer implementation for efficient I/O. It is > + currently used by virtualization hosts (lguest, kvm) for efficient > + networking using the tun driver. > + > + If unsure, say N. > +Should select VIRTIO && VIRTIO_RING <snip>> +/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */ > +int vring_get_buffer(struct vring_info *vr, > + struct iovec *in_iov, > + unsigned int *num_in, unsigned long *in_len, > + struct iovec *out_iov, > + unsigned int *num_out, unsigned long *out_len) > +{It seems unlikely that a caller could place both in_iov/out_iov on the stack since to do it safely, you would have to use vring.num which is determined by userspace. Since you have to kmalloc() the buffers anyway, why not just allocate a single data structure within this function and return it.> +void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len) > +{ > + struct vring_used_elem *used; > + > + BUG_ON(id <= 0 || id > vr->ring.num); > + BUG_ON(!vr->used); > + > + used = &vr->used->ring[vr->used->idx & vr->mask]; > + used->id = id - 1; > + used->len = len; > + /* Make sure buffer is written before we update index. */ > + wmb(); > + vr->used->idx++; > +} > +EXPORT_SYMBOL_GPL(vring_used_buffer_atomic);When is this actually safe to use?> + > +void vring_wake(struct vring_info *vr) > +{ > + wake_up(&vr->poll_wait); > +} > +EXPORT_SYMBOL_GPL(vring_wake); > + > +struct vring_info *vring_attach(int fd, const struct vring_ops *ops, > + void *data, bool atomic_use) > +{ > + struct file *filp; > + struct vring_info *vr; > + > + /* Must be a valid fd, and must be one of ours. */ > + filp = fget(fd); > + if (!filp) { > + vr = ERR_PTR(-EBADF); > + goto out; > + } > + > + if (filp->f_op != &vring_fops) { > + vr = ERR_PTR(-EBADF); > + goto fput; > + } > + > + /* Mutex simply protects against parallel vring_attach. */ > + mutex_lock(&vring_lock); > + vr = filp->private_data; > + if (vr->ops) { > + vr = ERR_PTR(-EBUSY); > + goto unlock; > + } > + > + /* If they want to use atomically, we have to map the page. */ > + if (atomic_use) { > + if (get_user_pages(current, current->mm, > + (unsigned long)vr->ring.used, 1, 1, 1, > + &vr->used_page, NULL) != 1) { > + vr = ERR_PTR(-EFAULT); > + goto unlock; > + }Oh, this is when it's safe to use. You don't seem to be acquiring current->mm->mmap_sem here. Also, this assumes the used ring fits within a single page which isn't true with a ring > 512 elements. A consequence of this is that devices that interact with a ring queue atomically now have an additional capability: pinning an arbitrary amount of physical memory. I think this means that it's no longer safe to give a tap fd to an unprivileged process regardless of how you're routing the traffic. Regards, Anthony Liguori
Rusty Russell wrote:> It turns out the lguest (and possibly kvm) want the addresses in the > ring buffer to only cover a certain part of memory, and be offset. > > It makes sense that this be an ioctl. > > Signed-off-by: Rusty Russell <rusty at rustcorp.com.au> ><snip>> @@ -208,12 +232,15 @@ int vring_get_buffer(struct vring_info * > out_len = &dummy; > > *in_len = *out_len = 0; > - > - if (unlikely(get_user(head, &vr->ring.avail->ring[head]) != 0)) > + > + if (unlikely(get_user(head, &vr->ring.avail->ring[vr->last_avail > + % vr->ring.num])))Why not & with vr->mask for the sake of consistency with the rest of the code. Regards, Anthony Liguori
Rusty Russell wrote:> This patch modifies tun to allow a vringfd to specify the receive > buffer. Because we can't copy to userspace in bh context, we queue > like normal then use the "pull" hook to actually do the copy. > > More thought needs to be put into the possible races with ring > registration and a simultaneous close, for example (see FIXME). > > We use struct virtio_net_hdr prepended to packets in the ring to allow > userspace to receive GSO packets in future (at the moment, the tun > driver doesn't tell the stack it can handle them, so these cases are > never taken). > > Signed-off-by: Rusty Russell <rusty at rustcorp.com.au><snip>> + > +#ifdef CONFIG_VRINGFDI think the rest of the code needs these for it to actually work without VRINGFD.> +static void unset_recv(void *_tun) > +{ > + struct tun_struct *tun = _tun; > + > + tun->inring = NULL; > +} > + > +/* Returns number of used buffers, or negative errno. */ > +static int pull_recv_skbs(void *_tun) > +{ > + struct tun_struct *tun = _tun; > + int err = 0, num_copied = 0; > + struct sk_buff *skb; > + > + while ((skb = skb_dequeue(&tun->readq)) != NULL) { > + struct iovec iov[1+MAX_SKB_FRAGS]; > + struct virtio_net_hdr gso = { 0 }; /* no info leak */ > + unsigned int iovnum = ARRAY_SIZE(iov); > + unsigned long len; > + int id; > + > + id = vring_get_buffer(tun->inring, iov, &iovnum, &len, > + NULL, NULL, NULL); > + if (id <= 0) { > + err = id; > + break; > + }Ah, I see now why you're passing something from the stack. Regards, Anthony Liguori
On Sunday 06 April 2008 03:18:59 Anthony Liguori wrote:> Rusty Russell wrote: > > - if (unlikely(get_user(head, &vr->ring.avail->ring[head]) != 0)) > > + > > + if (unlikely(get_user(head, &vr->ring.avail->ring[vr->last_avail > > + % vr->ring.num]))) > > Why not & with vr->mask for the sake of consistency with the rest of the > code.Thanks, fixed. Rusty.
Hey, Rusty,> For virtualization, we've developed virtio_ring for efficient communication. > This would also work well for userspace-kernel communication, particularly > for things like the tun device. By using the same ABI, we can join guests > to the host kernel trivially.I'm *sure* you meant to document that somewhat non-trivial proposed new kernel API as soon as you got a moment. But, since I went to the trouble of reverse engineering it, I decided not to wait and to make a little unreliable guide of my own. For those who are curious about how vringfd() is meant to work (or about how badly I misunderstood it), the info is at: http://lwn.net/SubscriberLink/276856/8c927025c53ad7ce/ Hopefully it will be helpful, jon
On Tuesday 08 April 2008 03:54:34 Jonathan Corbet wrote:> Hey, Rusty, > > > For virtualization, we've developed virtio_ring for efficient > > communication. This would also work well for userspace-kernel > > communication, particularly for things like the tun device. By using the > > same ABI, we can join guests to the host kernel trivially. > > I'm *sure* you meant to document that somewhat non-trivial proposed new > kernel API as soon as you got a moment.Actually, yes. But I wanted to get it out there before I start the treck across to the virtualization summit. A few points: 'The page alignment for the used array is important - that array might be mapped separately into kernel space.' Well, the used array is written by one side only, so it's possible to split the ring here and make each part r/o to the other side. More importantly, a page boundary is almost certainly a cacheline boundary, and we already have a userspace interface for it. 'Note that the flags fields in the vring_avail and vring_used structures appear to be unused.' virtio uses these for wakeup/interrupt suppression. It's a cheap way to avoid hypercalls, and we can use them the same way to avoid system calls (you set the suppression bit while you're actually looking at the ring). The need for the kmap (and hence the atomic horror) has now been alleviated: I changed the shinfo destructor code to allow the destructor to hold onto the skb data so it can queue it and free it later. BTW, the only place currently where both output and input buffers are used is the virtio_blk driver doing a read, where the header describes the operation, and the other buffers are overwritten with the data. Thanks! Rusty.
On Saturday 05 April 2008, Rusty Russell wrote:> +asmlinkage long sys_vringfd(void __user *addr, > +??????????????????????? ? ?unsigned num_descs, > +??????????????????????? ? ?u16 __user *last_used) > +{ > +???????int fd, err; > +???????struct file *filp; > +???????struct vring_info *vr; > + > +???????/* Must be a power of two, and representable by u16 */ > +???????if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536) { > +???????????????err = -EINVAL; > +???????????????goto out; > +???????} > + > +???????fd = get_unused_fd(); > +???????if (fd < 0) { > +???????????????err = fd; > +???????????????goto out; > +???????} > + > +???????filp = alloc_file(vring_mnt, dget(vring_mnt->mnt_root), FMODE_WRITE, > +??????????????????????? ?&vring_fops); > +???????if (!filp) { > +???????????????err = -ENFILE; > +???????????????goto put_fd; > +???????}This looks like a candidate for anon_inode_getfd(), which would let you get rid of the code for registering your own file system. Arnd <><
Rusty Russell wrote:> +asmlinkage long sys_vringfd(void __user *addr, > + unsigned num_descs, > + u16 __user *last_used) > +{ > + int fd, err; > + struct file *filp; > + struct vring_info *vr; > + > + /* Must be a power of two, and representable by u16 */ > + if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536) { >65536 doesn't fit in u16. J
Hi Rusty, A couple comments below. On Sat, Apr 05, 2008 at 10:02:08PM +1000, Rusty Russell wrote:> +static unsigned int vring_poll(struct file *filp, > + struct poll_table_struct *poll) > +{ > + struct vring_info *vr = filp->private_data; > + int err; > + unsigned int mask; > + u16 used, last_used; > + > + /* Some uses of vrings require updating in user context. This > + * is best done close to the caller, ie. here. */ > + if (vr->ops && vr->ops->pull) { > + err = vr->ops->pull(vr->ops_data); > + if (unlikely(err < 0)) > + return err; > + > + if (err > 0) { > + /* Buffers have been used, no need to check indices */ > + mask = POLLIN | POLLRDNORM; > + goto poll_wait; > + } > + } > + > + err = get_user(used, &vr->ring.used->idx); > + if (unlikely(err)) > + return err; > + > + err = get_user(last_used, vr->last_used); > + if (unlikely(err)) > + return err; > + > + /* More buffers have been used? It's 'readable'. */ > + if (used != last_used) > + mask = POLLIN | POLLRDNORM; > + else > + mask = 0; > + > +poll_wait: > + poll_wait(filp, &vr->poll_wait, poll); > + > + return mask; > +}I suppose you are doing data copy in ->poll instead of ->read to save a system call? This is weird, not conformant to what the interface is supposed to do. This way select/poll syscalls might block in userspace datacopy. The application might have a higher priority fd in the fdset to be informed of, for example. So why not change this to the common arrangement, with vring_poll adding the waitqueue with poll_wait() and vring_read doing the actual data copy ?> +struct vring_info *vring_attach(int fd, const struct vring_ops *ops, > + void *data, bool atomic_use) > +{ > + struct file *filp; > + struct vring_info *vr; > + > + /* Must be a valid fd, and must be one of ours. */ > + filp = fget(fd); > + if (!filp) { > + vr = ERR_PTR(-EBADF); > + goto out; > + } > + > + if (filp->f_op != &vring_fops) { > + vr = ERR_PTR(-EBADF); > + goto fput; > + } > + > + /* Mutex simply protects against parallel vring_attach. */ > + mutex_lock(&vring_lock); > + vr = filp->private_data; > + if (vr->ops) { > + vr = ERR_PTR(-EBUSY); > + goto unlock; > + } > + > + /* If they want to use atomically, we have to map the page. */ > + if (atomic_use) { > + if (get_user_pages(current, current->mm, > + (unsigned long)vr->ring.used, 1, 1, 1, > + &vr->used_page, NULL) != 1) {Can't the same be achieved by the app mlocking the vring pages, which then goes through standard rlimit checking ?
Possibly Parallel Threads
- [PATCH RFC 1/5] vringfd syscall
- [PATCH 2/2] vdpa: implement config interrupt in IFCVF
- [RFC v2] vhost: introduce mdev based hardware vhost backend
- [RFC v2] vhost: introduce mdev based hardware vhost backend
- [RFC] vhost: introduce mdev based hardware vhost backend