vhost-blk is an in-kernel virito-blk device accelerator. Due to lack of proper in-kernel AIO interface, this version converts guest's I/O request to bio and use submit_bio() to submit I/O directly. So this version any supports raw block device as guest's disk image, e.g. /dev/sda, /dev/ram0. We can add file based image support to vhost-blk once we have in-kernel AIO interface. There are some work in progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown: http://marc.info/?l=linux-fsdevel&m=133312234313122 Performance evaluation: ----------------------------- 1) LKVM Fio with libaio ioengine on Fusion IO device using kvm tool IOPS Before After Improvement seq-read 107 121 +13.0% seq-write 130 179 +37.6% rnd-read 102 122 +19.6% rnd-write 125 159 +27.0% 2) QEMU Fio with libaio ioengine on Fusion IO device using QEMU IOPS Before After Improvement seq-read 76 123 +61.8% seq-write 139 173 +24.4% rnd-read 73 120 +64.3% rnd-write 75 156 +108.0% Userspace bits: ----------------------------- 1) LKVM The latest vhost-blk userspace bits for kvm tool can be found here: git at github.com:asias/linux-kvm.git blk.vhost-blk 2) QEMU The latest vhost-blk userspace prototype for QEMU can be found here: git at github.com:asias/qemu.git blk.vhost-blk Changes in v4: - Mark req->status as userspace pointer - Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status() - Add if (need_resched()) schedule() in blk thread - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop() - Use vq_err() instead of pr_warn() - Fail un Unsupported request - Add flush in vhost_blk_set_features() Changes in v3: - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph! - Check file passed by user is a raw block device file Signed-off-by: Asias He <asias at redhat.com> --- drivers/vhost/Kconfig | 1 + drivers/vhost/Kconfig.blk | 10 + drivers/vhost/Makefile | 2 + drivers/vhost/blk.c | 677 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/vhost/blk.h | 8 + 5 files changed, 698 insertions(+) create mode 100644 drivers/vhost/Kconfig.blk create mode 100644 drivers/vhost/blk.c create mode 100644 drivers/vhost/blk.h diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 202bba6..acd8038 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -11,4 +11,5 @@ config VHOST_NET if STAGING source "drivers/vhost/Kconfig.tcm" +source "drivers/vhost/Kconfig.blk" endif diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk new file mode 100644 index 0000000..ff8ab76 --- /dev/null +++ b/drivers/vhost/Kconfig.blk @@ -0,0 +1,10 @@ +config VHOST_BLK + tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL && m + ---help--- + This kernel module can be loaded in host kernel to accelerate + guest block with virtio_blk. Not to be confused with virtio_blk + module itself which needs to be loaded in guest kernel. + + To compile this driver as a module, choose M here: the module will + be called vhost_blk. diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index a27b053..1a8a4a5 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o +vhost_blk-y := blk.o diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c new file mode 100644 index 0000000..5c2b790 --- /dev/null +++ b/drivers/vhost/blk.c @@ -0,0 +1,677 @@ +/* + * Copyright (C) 2011 Taobao, Inc. + * Author: Liu Yuan <tailai.ly at taobao.com> + * + * Copyright (C) 2012 Red Hat, Inc. + * Author: Asias He <asias at redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-blk server in host kernel. + */ + +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/vhost.h> +#include <linux/virtio_blk.h> +#include <linux/mutex.h> +#include <linux/file.h> +#include <linux/kthread.h> +#include <linux/blkdev.h> + +#include "vhost.c" +#include "vhost.h" +#include "blk.h" + +/* The block header is in the first and separate buffer. */ +#define BLK_HDR 0 + +static DEFINE_IDA(vhost_blk_index_ida); + +enum { + VHOST_BLK_VQ_REQ = 0, + VHOST_BLK_VQ_MAX = 1, +}; + +struct req_page_list { + struct page **pages; + int pages_nr; +}; + +struct vhost_blk_req { + struct llist_node llnode; + struct req_page_list *pl; + struct vhost_blk *blk; + + struct iovec *iov; + int iov_nr; + + struct bio **bio; + atomic_t bio_nr; + + sector_t sector; + int write; + u16 head; + long len; + + u8 __user *status; +}; + +struct vhost_blk { + struct task_struct *host_kick; + struct vhost_blk_req *reqs; + struct vhost_virtqueue vq; + struct llist_head llhead; + struct vhost_dev dev; + u16 reqs_nr; + int index; +}; + +static inline int iov_num_pages(struct iovec *iov) +{ + return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - + ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT; +} + +static int vhost_blk_setup(struct vhost_blk *blk) +{ + blk->reqs_nr = blk->vq.num; + + blk->reqs = kmalloc(sizeof(struct vhost_blk_req) * blk->reqs_nr, + GFP_KERNEL); + if (!blk->reqs) + return -ENOMEM; + + return 0; +} + +static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status) +{ + struct vhost_blk *blk = req->blk; + + if (__copy_to_user(req->status, &status, sizeof(status))) { + vq_err(&blk->vq, "Failed to write status\n"); + return -EFAULT; + } + + return 0; +} + +static void vhost_blk_enable_vq(struct vhost_blk *blk, + struct vhost_virtqueue *vq) +{ + wake_up_process(blk->host_kick); +} + +static void vhost_blk_req_done(struct bio *bio, int err) +{ + struct vhost_blk_req *req = bio->bi_private; + struct vhost_blk *blk = req->blk; + + if (err) + req->len = err; + + if (atomic_dec_and_test(&req->bio_nr)) { + llist_add(&req->llnode, &blk->llhead); + wake_up_process(blk->host_kick); + } + + bio_put(bio); +} + +static void vhost_blk_req_umap(struct vhost_blk_req *req) +{ + struct req_page_list *pl; + int i, j; + + + if (!req->pl) { + kfree(req->bio); + return; + } + + for (i = 0; i < req->iov_nr; i++) { + pl = &req->pl[i]; + for (j = 0; j < pl->pages_nr; j++) { + if (!req->write) + set_page_dirty_lock(pl->pages[j]); + page_cache_release(pl->pages[j]); + } + } + + kfree(req->pl); +} + +static int vhost_blk_bio_make(struct vhost_blk_req *req, + struct block_device *bdev) +{ + int pages_nr_total, i, j, ret; + struct iovec *iov = req->iov; + int iov_nr = req->iov_nr; + struct page **pages, *page; + struct bio *bio = NULL; + int bio_nr = 0; + + req->len = 0; + pages_nr_total = 0; + for (i = 0; i < iov_nr; i++) { + req->len += iov[i].iov_len; + pages_nr_total += iov_num_pages(&iov[i]); + } + + if (unlikely(req->write == WRITE_FLUSH)) { + req->pl = NULL; + req->bio = kmalloc(sizeof(struct bio *), GFP_KERNEL); + bio = bio_alloc(GFP_KERNEL, 1); + if (!bio) { + kfree(req->bio); + return -ENOMEM; + } + bio->bi_sector = req->sector; + bio->bi_bdev = bdev; + bio->bi_private = req; + bio->bi_end_io = vhost_blk_req_done; + req->bio[bio_nr++] = bio; + + goto out; + } + + req->pl = kmalloc((iov_nr * sizeof(struct req_page_list)) + + (pages_nr_total * sizeof(struct page *)) + + (pages_nr_total * sizeof(struct bio *)), + GFP_KERNEL); + if (!req->pl) + return -ENOMEM; + pages = (struct page **)&req->pl[iov_nr]; + req->bio = (struct bio **)&pages[pages_nr_total]; + + req->iov_nr = 0; + for (i = 0; i < iov_nr; i++) { + int pages_nr = iov_num_pages(&iov[i]); + unsigned long iov_base, iov_len; + struct req_page_list *pl; + + iov_base = (unsigned long)iov[i].iov_base; + iov_len = (unsigned long)iov[i].iov_len; + + ret = get_user_pages_fast(iov_base, pages_nr, + !req->write, pages); + if (ret != pages_nr) + goto fail; + + req->iov_nr++; + pl = &req->pl[i]; + pl->pages_nr = pages_nr; + pl->pages = pages; + + for (j = 0; j < pages_nr; j++) { + unsigned int off, len; + page = pages[j]; + off = iov_base & ~PAGE_MASK; + len = PAGE_SIZE - off; + if (len > iov_len) + len = iov_len; + + while (!bio || bio_add_page(bio, page, len, off) <= 0) { + bio = bio_alloc(GFP_KERNEL, pages_nr); + if (!bio) + goto fail; + bio->bi_sector = req->sector; + bio->bi_bdev = bdev; + bio->bi_private = req; + bio->bi_end_io = vhost_blk_req_done; + req->bio[bio_nr++] = bio; + } + req->sector += len >> 9; + iov_base += len; + iov_len -= len; + } + + pages += pages_nr; + } +out: + atomic_set(&req->bio_nr, bio_nr); + return 0; + +fail: + for (i = 0; i < bio_nr; i++) + bio_put(req->bio[i]); + vhost_blk_req_umap(req); + return -ENOMEM; +} + +static inline void vhost_blk_bio_send(struct vhost_blk_req *req) +{ + struct blk_plug plug; + int i, bio_nr; + + bio_nr = atomic_read(&req->bio_nr); + blk_start_plug(&plug); + for (i = 0; i < bio_nr; i++) + submit_bio(req->write, req->bio[i]); + blk_finish_plug(&plug); +} + +static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file) +{ + + struct inode *inode = file->f_mapping->host; + struct block_device *bdev = inode->i_bdev; + int ret; + + ret = vhost_blk_bio_make(req, bdev); + if (ret < 0) + return ret; + + vhost_blk_bio_send(req); + + return ret; +} + +static int vhost_blk_req_done_thread(void *data) +{ + mm_segment_t oldfs = get_fs(); + struct vhost_blk *blk = data; + struct vhost_virtqueue *vq; + struct llist_node *llnode; + struct vhost_blk_req *req; + bool added; + u8 status; + int ret; + + vq = &blk->vq; + set_fs(USER_DS); + use_mm(blk->dev.mm); + for (;;) { + llnode = llist_del_all(&blk->llhead); + if (!llnode) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + if (unlikely(kthread_should_stop())) + break; + continue; + } + + if (need_resched()) + schedule(); + + added = false; + while (llnode) { + req = llist_entry(llnode, struct vhost_blk_req, llnode); + llnode = llist_next(llnode); + + vhost_blk_req_umap(req); + + status = req->len > 0 ? + VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; + ret = vhost_blk_set_status(req, status); + if (unlikely(ret)) + continue; + vhost_add_used(&blk->vq, req->head, req->len); + added = true; + } + if (likely(added)) + vhost_signal(&blk->dev, &blk->vq); + } + unuse_mm(blk->dev.mm); + set_fs(oldfs); + return 0; +} + +static void vhost_blk_flush(struct vhost_blk *blk) +{ + vhost_poll_flush(&blk->vq.poll); +} + +static inline void vhost_blk_stop(struct vhost_blk *blk, struct file **file) +{ + struct vhost_virtqueue *vq = &blk->vq; + struct file *f; + + mutex_lock(&vq->mutex); + f = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); + rcu_assign_pointer(vq->private_data, NULL); + mutex_unlock(&vq->mutex); + + *file = f; +} + +/* Handle guest request */ +static int vhost_blk_req_handle(struct vhost_virtqueue *vq, + struct virtio_blk_outhdr *hdr, + u16 head, u16 out, u16 in, + struct file *file) +{ + struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev); + struct vhost_blk_req *req; + int iov_nr, ret; + u8 status; + + if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID) + iov_nr = in - 1; + else + iov_nr = out - 1; + + req = &blk->reqs[head]; + req->head = head; + req->blk = blk; + /* The block data buffer follows block header buffer */ + req->iov = &vq->iov[BLK_HDR + 1]; + req->iov_nr = iov_nr; + /* The block status buffer follows block data buffer */ + req->status = vq->iov[iov_nr + 1].iov_base; + req->sector = hdr->sector; + + switch (hdr->type) { + case VIRTIO_BLK_T_OUT: + req->write = WRITE; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_IN: + req->write = READ; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_FLUSH: + req->write = WRITE_FLUSH; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_GET_ID: { + char id[VIRTIO_BLK_ID_BYTES]; + int len; + + ret = snprintf(id, VIRTIO_BLK_ID_BYTES, + "vhost-blk%d", blk->index); + if (ret < 0) + break; + len = ret; + ret = __copy_to_user(req->iov[0].iov_base, id, len); + status = ret < 0 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + ret = vhost_blk_set_status(req, status); + if (ret) + break; + vhost_add_used_and_signal(&blk->dev, vq, head, len); + break; + } + default: + vq_err(vq, "Unsupported request type %d\n", hdr->type); + status = VIRTIO_BLK_S_UNSUPP; + ret = vhost_blk_set_status(req, status); + if (ret) + break; + vhost_add_used_and_signal(&blk->dev, vq, head, 0); + } + + return ret; +} + +/* Guest kick us for IO request */ +static void vhost_blk_handle_guest_kick(struct vhost_work *work) +{ + struct virtio_blk_outhdr hdr; + struct vhost_virtqueue *vq; + struct vhost_blk *blk; + struct blk_plug plug; + struct file *f; + int in, out; + u16 head; + + vq = container_of(work, struct vhost_virtqueue, poll.work); + blk = container_of(vq->dev, struct vhost_blk, dev); + + /* TODO: check that we are running from vhost_worker? */ + f = rcu_dereference_check(vq->private_data, 1); + if (!f) + return; + + vhost_disable_notify(&blk->dev, vq); + blk_start_plug(&plug); + for (;;) { + head = vhost_get_vq_desc(&blk->dev, vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (unlikely(head < 0)) + break; + + if (unlikely(head == vq->num)) { + if (unlikely(vhost_enable_notify(&blk->dev, vq))) { + vhost_disable_notify(&blk->dev, vq); + continue; + } + break; + } + + if (unlikely(copy_from_user(&hdr, vq->iov[BLK_HDR].iov_base, + sizeof(hdr)))) { + vq_err(vq, "Failed to get block header!\n"); + vhost_discard_vq_desc(vq, 1); + break; + } + + if (vhost_blk_req_handle(vq, &hdr, head, out, in, f) < 0) + break; + } + blk_finish_plug(&plug); +} + +static int vhost_blk_open(struct inode *inode, struct file *file) +{ + struct vhost_blk *blk; + int ret; + + blk = kzalloc(sizeof(*blk), GFP_KERNEL); + if (!blk) { + ret = -ENOMEM; + goto out; + } + + ret = ida_simple_get(&vhost_blk_index_ida, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out_dev; + blk->index = ret; + + blk->vq.handle_kick = vhost_blk_handle_guest_kick; + + ret = vhost_dev_init(&blk->dev, &blk->vq, VHOST_BLK_VQ_MAX); + if (ret < 0) + goto out_dev; + file->private_data = blk; + + blk->host_kick = kthread_create(vhost_blk_req_done_thread, + blk, "vhost-blk-%d", current->pid); + if (IS_ERR(blk->host_kick)) { + ret = PTR_ERR(blk->host_kick); + goto out_dev; + } + + return ret; +out_dev: + kfree(blk); +out: + return ret; +} + +static int vhost_blk_release(struct inode *inode, struct file *f) +{ + struct vhost_blk *blk = f->private_data; + struct file *file; + + ida_simple_remove(&vhost_blk_index_ida, blk->index); + vhost_blk_stop(blk, &file); + vhost_blk_flush(blk); + vhost_dev_cleanup(&blk->dev, false); + if (file) + fput(file); + kthread_stop(blk->host_kick); + kfree(blk->reqs); + kfree(blk); + + return 0; +} + +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features) +{ + mutex_lock(&blk->dev.mutex); + blk->dev.acked_features = features; + vhost_blk_flush(blk); + mutex_unlock(&blk->dev.mutex); + + return 0; +} + +static long vhost_blk_set_backend(struct vhost_blk *blk, unsigned index, int fd) +{ + struct vhost_virtqueue *vq = &blk->vq; + struct file *file, *oldfile; + struct inode *inode; + int ret; + + mutex_lock(&blk->dev.mutex); + ret = vhost_dev_check_owner(&blk->dev); + if (ret) + goto out_dev; + + if (index >= VHOST_BLK_VQ_MAX) { + ret = -ENOBUFS; + goto out_dev; + } + + mutex_lock(&vq->mutex); + + if (!vhost_vq_access_ok(vq)) { + ret = -EFAULT; + goto out_vq; + } + + file = fget(fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto out_vq; + } + + /* Only raw block device is supported for now */ + inode = file->f_mapping->host; + if (!S_ISBLK(inode->i_mode)) { + ret = -EFAULT; + goto out_file; + } + + oldfile = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); + if (file != oldfile) { + rcu_assign_pointer(vq->private_data, file); + vhost_blk_enable_vq(blk, vq); + + ret = vhost_init_used(vq); + if (ret) + goto out_file; + } + + mutex_unlock(&vq->mutex); + + if (oldfile) { + vhost_blk_flush(blk); + fput(oldfile); + } + + mutex_unlock(&blk->dev.mutex); + return 0; + +out_file: + fput(file); +out_vq: + mutex_unlock(&vq->mutex); +out_dev: + mutex_unlock(&blk->dev.mutex); + return ret; +} + +static long vhost_blk_reset_owner(struct vhost_blk *blk) +{ + struct file *file = NULL; + int err; + + mutex_lock(&blk->dev.mutex); + err = vhost_dev_check_owner(&blk->dev); + if (err) + goto done; + vhost_blk_stop(blk, &file); + vhost_blk_flush(blk); + err = vhost_dev_reset_owner(&blk->dev); +done: + mutex_unlock(&blk->dev.mutex); + if (file) + fput(file); + return err; +} + +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_blk *blk = f->private_data; + void __user *argp = (void __user *)arg; + struct vhost_vring_file backend; + u64 __user *featurep = argp; + u64 features; + int ret; + + switch (ioctl) { + case VHOST_BLK_SET_BACKEND: + if (copy_from_user(&backend, argp, sizeof(backend))) + return -EFAULT; + return vhost_blk_set_backend(blk, backend.index, backend.fd); + case VHOST_GET_FEATURES: + features = VHOST_BLK_FEATURES; + if (copy_to_user(featurep, &features, sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES: + if (copy_from_user(&features, featurep, sizeof(features))) + return -EFAULT; + if (features & ~VHOST_BLK_FEATURES) + return -EOPNOTSUPP; + return vhost_blk_set_features(blk, features); + case VHOST_RESET_OWNER: + return vhost_blk_reset_owner(blk); + default: + mutex_lock(&blk->dev.mutex); + ret = vhost_dev_ioctl(&blk->dev, ioctl, arg); + if (!ret && ioctl == VHOST_SET_VRING_NUM) + ret = vhost_blk_setup(blk); + vhost_blk_flush(blk); + mutex_unlock(&blk->dev.mutex); + return ret; + } +} + +static const struct file_operations vhost_blk_fops = { + .owner = THIS_MODULE, + .open = vhost_blk_open, + .release = vhost_blk_release, + .llseek = noop_llseek, + .unlocked_ioctl = vhost_blk_ioctl, +}; + +static struct miscdevice vhost_blk_misc = { + MISC_DYNAMIC_MINOR, + "vhost-blk", + &vhost_blk_fops, +}; + +static int vhost_blk_init(void) +{ + return misc_register(&vhost_blk_misc); +} + +static void vhost_blk_exit(void) +{ + misc_deregister(&vhost_blk_misc); +} + +module_init(vhost_blk_init); +module_exit(vhost_blk_exit); + +MODULE_VERSION("0.0.3"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio_blk"); diff --git a/drivers/vhost/blk.h b/drivers/vhost/blk.h new file mode 100644 index 0000000..2f674f0 --- /dev/null +++ b/drivers/vhost/blk.h @@ -0,0 +1,8 @@ +#include <linux/vhost.h> + +enum { + VHOST_BLK_FEATURES = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX), +}; +/* VHOST_BLK specific defines */ +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, struct vhost_vring_file) -- 1.7.11.7
On Mon, Oct 15, 2012 at 05:57:03PM +0800, Asias He wrote:> vhost-blk is an in-kernel virito-blk device accelerator. > > Due to lack of proper in-kernel AIO interface, this version converts > guest's I/O request to bio and use submit_bio() to submit I/O directly. > So this version any supports raw block device as guest's disk image, > e.g. /dev/sda, /dev/ram0. We can add file based image support to > vhost-blk once we have in-kernel AIO interface. There are some work in > progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown: > > http://marc.info/?l=linux-fsdevel&m=133312234313122 > > Performance evaluation: > ----------------------------- > 1) LKVM > Fio with libaio ioengine on Fusion IO device using kvm tool > IOPS Before After Improvement > seq-read 107 121 +13.0% > seq-write 130 179 +37.6% > rnd-read 102 122 +19.6% > rnd-write 125 159 +27.0% > > 2) QEMU > Fio with libaio ioengine on Fusion IO device using QEMU > IOPS Before After Improvement > seq-read 76 123 +61.8% > seq-write 139 173 +24.4% > rnd-read 73 120 +64.3% > rnd-write 75 156 +108.0% > > Userspace bits: > ----------------------------- > 1) LKVM > The latest vhost-blk userspace bits for kvm tool can be found here: > git at github.com:asias/linux-kvm.git blk.vhost-blk > > 2) QEMU > The latest vhost-blk userspace prototype for QEMU can be found here: > git at github.com:asias/qemu.git blk.vhost-blk > > Changes in v4: > - Mark req->status as userspace pointer > - Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status() > - Add if (need_resched()) schedule() in blk thread > - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop() > - Use vq_err() instead of pr_warn() > - Fail un Unsupported request > - Add flush in vhost_blk_set_features() > > Changes in v3: > - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph! > - Check file passed by user is a raw block device file > > Signed-off-by: Asias He <asias at redhat.com>Cc linux-kernel at vger.kernel.org, Jens, hch, davem. I wonder what do others think. Christoph - any comments? Also, what's the best way to merge this? To avoid conflicts when vhost APIs change (as that will be for 3.9) I can take this on the vhost tree which is normally merged by Dave Miller. Is this OK with everyone? Any acks?> --- > drivers/vhost/Kconfig | 1 + > drivers/vhost/Kconfig.blk | 10 + > drivers/vhost/Makefile | 2 + > drivers/vhost/blk.c | 677 ++++++++++++++++++++++++++++++++++++++++++++++ > drivers/vhost/blk.h | 8 + > 5 files changed, 698 insertions(+) > create mode 100644 drivers/vhost/Kconfig.blk > create mode 100644 drivers/vhost/blk.c > create mode 100644 drivers/vhost/blk.h > > diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig > index 202bba6..acd8038 100644 > --- a/drivers/vhost/Kconfig > +++ b/drivers/vhost/Kconfig > @@ -11,4 +11,5 @@ config VHOST_NET > > if STAGING > source "drivers/vhost/Kconfig.tcm" > +source "drivers/vhost/Kconfig.blk" > endif > diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk > new file mode 100644 > index 0000000..ff8ab76 > --- /dev/null > +++ b/drivers/vhost/Kconfig.blk > @@ -0,0 +1,10 @@ > +config VHOST_BLK > + tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)" > + depends on BLOCK && EXPERIMENTAL && m > + ---help--- > + This kernel module can be loaded in host kernel to accelerate > + guest block with virtio_blk. Not to be confused with virtio_blk > + module itself which needs to be loaded in guest kernel. > + > + To compile this driver as a module, choose M here: the module will > + be called vhost_blk. > diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile > index a27b053..1a8a4a5 100644 > --- a/drivers/vhost/Makefile > +++ b/drivers/vhost/Makefile > @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o > vhost_net-y := vhost.o net.o > > obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o > +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o > +vhost_blk-y := blk.o > diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c > new file mode 100644 > index 0000000..5c2b790 > --- /dev/null > +++ b/drivers/vhost/blk.c > @@ -0,0 +1,677 @@ > +/* > + * Copyright (C) 2011 Taobao, Inc. > + * Author: Liu Yuan <tailai.ly at taobao.com> > + * > + * Copyright (C) 2012 Red Hat, Inc. > + * Author: Asias He <asias at redhat.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. > + * > + * virtio-blk server in host kernel. > + */ > + > +#include <linux/miscdevice.h> > +#include <linux/module.h> > +#include <linux/vhost.h> > +#include <linux/virtio_blk.h> > +#include <linux/mutex.h> > +#include <linux/file.h> > +#include <linux/kthread.h> > +#include <linux/blkdev.h> > + > +#include "vhost.c" > +#include "vhost.h" > +#include "blk.h" > + > +/* The block header is in the first and separate buffer. */ > +#define BLK_HDR 0 > + > +static DEFINE_IDA(vhost_blk_index_ida); > + > +enum { > + VHOST_BLK_VQ_REQ = 0, > + VHOST_BLK_VQ_MAX = 1, > +}; > + > +struct req_page_list { > + struct page **pages; > + int pages_nr; > +}; > + > +struct vhost_blk_req { > + struct llist_node llnode; > + struct req_page_list *pl; > + struct vhost_blk *blk; > + > + struct iovec *iov; > + int iov_nr; > + > + struct bio **bio; > + atomic_t bio_nr; > + > + sector_t sector; > + int write; > + u16 head; > + long len; > + > + u8 __user *status; > +}; > + > +struct vhost_blk { > + struct task_struct *host_kick; > + struct vhost_blk_req *reqs; > + struct vhost_virtqueue vq; > + struct llist_head llhead; > + struct vhost_dev dev; > + u16 reqs_nr; > + int index; > +}; > + > +static inline int iov_num_pages(struct iovec *iov) > +{ > + return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - > + ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT; > +} > + > +static int vhost_blk_setup(struct vhost_blk *blk) > +{ > + blk->reqs_nr = blk->vq.num; > + > + blk->reqs = kmalloc(sizeof(struct vhost_blk_req) * blk->reqs_nr, > + GFP_KERNEL); > + if (!blk->reqs) > + return -ENOMEM; > + > + return 0; > +} > + > +static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status) > +{ > + struct vhost_blk *blk = req->blk; > + > + if (__copy_to_user(req->status, &status, sizeof(status))) { > + vq_err(&blk->vq, "Failed to write status\n"); > + return -EFAULT; > + } > + > + return 0; > +} > + > +static void vhost_blk_enable_vq(struct vhost_blk *blk, > + struct vhost_virtqueue *vq) > +{ > + wake_up_process(blk->host_kick); > +} > + > +static void vhost_blk_req_done(struct bio *bio, int err) > +{ > + struct vhost_blk_req *req = bio->bi_private; > + struct vhost_blk *blk = req->blk; > + > + if (err) > + req->len = err; > + > + if (atomic_dec_and_test(&req->bio_nr)) { > + llist_add(&req->llnode, &blk->llhead); > + wake_up_process(blk->host_kick); > + } > + > + bio_put(bio); > +} > + > +static void vhost_blk_req_umap(struct vhost_blk_req *req) > +{ > + struct req_page_list *pl; > + int i, j; > + > + > + if (!req->pl) { > + kfree(req->bio); > + return; > + } > + > + for (i = 0; i < req->iov_nr; i++) { > + pl = &req->pl[i]; > + for (j = 0; j < pl->pages_nr; j++) { > + if (!req->write) > + set_page_dirty_lock(pl->pages[j]); > + page_cache_release(pl->pages[j]); > + } > + } > + > + kfree(req->pl); > +} > + > +static int vhost_blk_bio_make(struct vhost_blk_req *req, > + struct block_device *bdev) > +{ > + int pages_nr_total, i, j, ret; > + struct iovec *iov = req->iov; > + int iov_nr = req->iov_nr; > + struct page **pages, *page; > + struct bio *bio = NULL; > + int bio_nr = 0; > + > + req->len = 0; > + pages_nr_total = 0; > + for (i = 0; i < iov_nr; i++) { > + req->len += iov[i].iov_len; > + pages_nr_total += iov_num_pages(&iov[i]); > + } > + > + if (unlikely(req->write == WRITE_FLUSH)) { > + req->pl = NULL; > + req->bio = kmalloc(sizeof(struct bio *), GFP_KERNEL); > + bio = bio_alloc(GFP_KERNEL, 1); > + if (!bio) { > + kfree(req->bio); > + return -ENOMEM; > + } > + bio->bi_sector = req->sector; > + bio->bi_bdev = bdev; > + bio->bi_private = req; > + bio->bi_end_io = vhost_blk_req_done; > + req->bio[bio_nr++] = bio; > + > + goto out; > + } > + > + req->pl = kmalloc((iov_nr * sizeof(struct req_page_list)) + > + (pages_nr_total * sizeof(struct page *)) + > + (pages_nr_total * sizeof(struct bio *)), > + GFP_KERNEL); > + if (!req->pl) > + return -ENOMEM; > + pages = (struct page **)&req->pl[iov_nr]; > + req->bio = (struct bio **)&pages[pages_nr_total]; > + > + req->iov_nr = 0; > + for (i = 0; i < iov_nr; i++) { > + int pages_nr = iov_num_pages(&iov[i]); > + unsigned long iov_base, iov_len; > + struct req_page_list *pl; > + > + iov_base = (unsigned long)iov[i].iov_base; > + iov_len = (unsigned long)iov[i].iov_len; > + > + ret = get_user_pages_fast(iov_base, pages_nr, > + !req->write, pages); > + if (ret != pages_nr) > + goto fail; > + > + req->iov_nr++; > + pl = &req->pl[i]; > + pl->pages_nr = pages_nr; > + pl->pages = pages; > + > + for (j = 0; j < pages_nr; j++) { > + unsigned int off, len; > + page = pages[j]; > + off = iov_base & ~PAGE_MASK; > + len = PAGE_SIZE - off; > + if (len > iov_len) > + len = iov_len; > + > + while (!bio || bio_add_page(bio, page, len, off) <= 0) { > + bio = bio_alloc(GFP_KERNEL, pages_nr); > + if (!bio) > + goto fail; > + bio->bi_sector = req->sector; > + bio->bi_bdev = bdev; > + bio->bi_private = req; > + bio->bi_end_io = vhost_blk_req_done; > + req->bio[bio_nr++] = bio; > + } > + req->sector += len >> 9; > + iov_base += len; > + iov_len -= len; > + } > + > + pages += pages_nr; > + } > +out: > + atomic_set(&req->bio_nr, bio_nr); > + return 0; > + > +fail: > + for (i = 0; i < bio_nr; i++) > + bio_put(req->bio[i]); > + vhost_blk_req_umap(req); > + return -ENOMEM; > +} > + > +static inline void vhost_blk_bio_send(struct vhost_blk_req *req) > +{ > + struct blk_plug plug; > + int i, bio_nr; > + > + bio_nr = atomic_read(&req->bio_nr); > + blk_start_plug(&plug); > + for (i = 0; i < bio_nr; i++) > + submit_bio(req->write, req->bio[i]); > + blk_finish_plug(&plug); > +} > + > +static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file) > +{ > + > + struct inode *inode = file->f_mapping->host; > + struct block_device *bdev = inode->i_bdev; > + int ret; > + > + ret = vhost_blk_bio_make(req, bdev); > + if (ret < 0) > + return ret; > + > + vhost_blk_bio_send(req); > + > + return ret; > +} > + > +static int vhost_blk_req_done_thread(void *data) > +{ > + mm_segment_t oldfs = get_fs(); > + struct vhost_blk *blk = data; > + struct vhost_virtqueue *vq; > + struct llist_node *llnode; > + struct vhost_blk_req *req; > + bool added; > + u8 status; > + int ret; > + > + vq = &blk->vq; > + set_fs(USER_DS); > + use_mm(blk->dev.mm); > + for (;;) { > + llnode = llist_del_all(&blk->llhead); > + if (!llnode) { > + set_current_state(TASK_INTERRUPTIBLE); > + schedule(); > + if (unlikely(kthread_should_stop())) > + break; > + continue; > + } > + > + if (need_resched()) > + schedule(); > + > + added = false; > + while (llnode) { > + req = llist_entry(llnode, struct vhost_blk_req, llnode); > + llnode = llist_next(llnode); > + > + vhost_blk_req_umap(req); > + > + status = req->len > 0 ? > + VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; > + ret = vhost_blk_set_status(req, status); > + if (unlikely(ret)) > + continue; > + vhost_add_used(&blk->vq, req->head, req->len); > + added = true; > + } > + if (likely(added)) > + vhost_signal(&blk->dev, &blk->vq); > + } > + unuse_mm(blk->dev.mm); > + set_fs(oldfs); > + return 0; > +} > + > +static void vhost_blk_flush(struct vhost_blk *blk) > +{ > + vhost_poll_flush(&blk->vq.poll); > +} > + > +static inline void vhost_blk_stop(struct vhost_blk *blk, struct file **file) > +{ > + struct vhost_virtqueue *vq = &blk->vq; > + struct file *f; > + > + mutex_lock(&vq->mutex); > + f = rcu_dereference_protected(vq->private_data, > + lockdep_is_held(&vq->mutex)); > + rcu_assign_pointer(vq->private_data, NULL); > + mutex_unlock(&vq->mutex); > + > + *file = f; > +} > + > +/* Handle guest request */ > +static int vhost_blk_req_handle(struct vhost_virtqueue *vq, > + struct virtio_blk_outhdr *hdr, > + u16 head, u16 out, u16 in, > + struct file *file) > +{ > + struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev); > + struct vhost_blk_req *req; > + int iov_nr, ret; > + u8 status; > + > + if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID) > + iov_nr = in - 1; > + else > + iov_nr = out - 1; > + > + req = &blk->reqs[head]; > + req->head = head; > + req->blk = blk; > + /* The block data buffer follows block header buffer */ > + req->iov = &vq->iov[BLK_HDR + 1]; > + req->iov_nr = iov_nr; > + /* The block status buffer follows block data buffer */ > + req->status = vq->iov[iov_nr + 1].iov_base; > + req->sector = hdr->sector; > + > + switch (hdr->type) { > + case VIRTIO_BLK_T_OUT: > + req->write = WRITE; > + ret = vhost_blk_req_submit(req, file); > + break; > + case VIRTIO_BLK_T_IN: > + req->write = READ; > + ret = vhost_blk_req_submit(req, file); > + break; > + case VIRTIO_BLK_T_FLUSH: > + req->write = WRITE_FLUSH; > + ret = vhost_blk_req_submit(req, file); > + break; > + case VIRTIO_BLK_T_GET_ID: { > + char id[VIRTIO_BLK_ID_BYTES]; > + int len; > + > + ret = snprintf(id, VIRTIO_BLK_ID_BYTES, > + "vhost-blk%d", blk->index); > + if (ret < 0) > + break; > + len = ret; > + ret = __copy_to_user(req->iov[0].iov_base, id, len); > + status = ret < 0 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; > + ret = vhost_blk_set_status(req, status); > + if (ret) > + break; > + vhost_add_used_and_signal(&blk->dev, vq, head, len); > + break; > + } > + default: > + vq_err(vq, "Unsupported request type %d\n", hdr->type); > + status = VIRTIO_BLK_S_UNSUPP; > + ret = vhost_blk_set_status(req, status); > + if (ret) > + break; > + vhost_add_used_and_signal(&blk->dev, vq, head, 0); > + } > + > + return ret; > +} > + > +/* Guest kick us for IO request */ > +static void vhost_blk_handle_guest_kick(struct vhost_work *work) > +{ > + struct virtio_blk_outhdr hdr; > + struct vhost_virtqueue *vq; > + struct vhost_blk *blk; > + struct blk_plug plug; > + struct file *f; > + int in, out; > + u16 head; > + > + vq = container_of(work, struct vhost_virtqueue, poll.work); > + blk = container_of(vq->dev, struct vhost_blk, dev); > + > + /* TODO: check that we are running from vhost_worker? */ > + f = rcu_dereference_check(vq->private_data, 1); > + if (!f) > + return; > + > + vhost_disable_notify(&blk->dev, vq); > + blk_start_plug(&plug); > + for (;;) { > + head = vhost_get_vq_desc(&blk->dev, vq, vq->iov, > + ARRAY_SIZE(vq->iov), > + &out, &in, NULL, NULL); > + if (unlikely(head < 0)) > + break; > + > + if (unlikely(head == vq->num)) { > + if (unlikely(vhost_enable_notify(&blk->dev, vq))) { > + vhost_disable_notify(&blk->dev, vq); > + continue; > + } > + break; > + } > + > + if (unlikely(copy_from_user(&hdr, vq->iov[BLK_HDR].iov_base, > + sizeof(hdr)))) { > + vq_err(vq, "Failed to get block header!\n"); > + vhost_discard_vq_desc(vq, 1); > + break; > + } > + > + if (vhost_blk_req_handle(vq, &hdr, head, out, in, f) < 0) > + break; > + } > + blk_finish_plug(&plug); > +} > + > +static int vhost_blk_open(struct inode *inode, struct file *file) > +{ > + struct vhost_blk *blk; > + int ret; > + > + blk = kzalloc(sizeof(*blk), GFP_KERNEL); > + if (!blk) { > + ret = -ENOMEM; > + goto out; > + } > + > + ret = ida_simple_get(&vhost_blk_index_ida, 0, 0, GFP_KERNEL); > + if (ret < 0) > + goto out_dev; > + blk->index = ret; > + > + blk->vq.handle_kick = vhost_blk_handle_guest_kick; > + > + ret = vhost_dev_init(&blk->dev, &blk->vq, VHOST_BLK_VQ_MAX); > + if (ret < 0) > + goto out_dev; > + file->private_data = blk; > + > + blk->host_kick = kthread_create(vhost_blk_req_done_thread, > + blk, "vhost-blk-%d", current->pid); > + if (IS_ERR(blk->host_kick)) { > + ret = PTR_ERR(blk->host_kick); > + goto out_dev; > + } > + > + return ret; > +out_dev: > + kfree(blk); > +out: > + return ret; > +} > + > +static int vhost_blk_release(struct inode *inode, struct file *f) > +{ > + struct vhost_blk *blk = f->private_data; > + struct file *file; > + > + ida_simple_remove(&vhost_blk_index_ida, blk->index); > + vhost_blk_stop(blk, &file); > + vhost_blk_flush(blk); > + vhost_dev_cleanup(&blk->dev, false); > + if (file) > + fput(file); > + kthread_stop(blk->host_kick); > + kfree(blk->reqs); > + kfree(blk); > + > + return 0; > +} > + > +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features) > +{ > + mutex_lock(&blk->dev.mutex); > + blk->dev.acked_features = features; > + vhost_blk_flush(blk); > + mutex_unlock(&blk->dev.mutex); > + > + return 0; > +} > + > +static long vhost_blk_set_backend(struct vhost_blk *blk, unsigned index, int fd) > +{ > + struct vhost_virtqueue *vq = &blk->vq; > + struct file *file, *oldfile; > + struct inode *inode; > + int ret; > + > + mutex_lock(&blk->dev.mutex); > + ret = vhost_dev_check_owner(&blk->dev); > + if (ret) > + goto out_dev; > + > + if (index >= VHOST_BLK_VQ_MAX) { > + ret = -ENOBUFS; > + goto out_dev; > + } > + > + mutex_lock(&vq->mutex); > + > + if (!vhost_vq_access_ok(vq)) { > + ret = -EFAULT; > + goto out_vq; > + } > + > + file = fget(fd); > + if (IS_ERR(file)) { > + ret = PTR_ERR(file); > + goto out_vq; > + } > + > + /* Only raw block device is supported for now */ > + inode = file->f_mapping->host; > + if (!S_ISBLK(inode->i_mode)) { > + ret = -EFAULT; > + goto out_file; > + } > + > + oldfile = rcu_dereference_protected(vq->private_data, > + lockdep_is_held(&vq->mutex)); > + if (file != oldfile) { > + rcu_assign_pointer(vq->private_data, file); > + vhost_blk_enable_vq(blk, vq); > + > + ret = vhost_init_used(vq); > + if (ret) > + goto out_file; > + } > + > + mutex_unlock(&vq->mutex); > + > + if (oldfile) { > + vhost_blk_flush(blk); > + fput(oldfile); > + } > + > + mutex_unlock(&blk->dev.mutex); > + return 0; > + > +out_file: > + fput(file); > +out_vq: > + mutex_unlock(&vq->mutex); > +out_dev: > + mutex_unlock(&blk->dev.mutex); > + return ret; > +} > + > +static long vhost_blk_reset_owner(struct vhost_blk *blk) > +{ > + struct file *file = NULL; > + int err; > + > + mutex_lock(&blk->dev.mutex); > + err = vhost_dev_check_owner(&blk->dev); > + if (err) > + goto done; > + vhost_blk_stop(blk, &file); > + vhost_blk_flush(blk); > + err = vhost_dev_reset_owner(&blk->dev); > +done: > + mutex_unlock(&blk->dev.mutex); > + if (file) > + fput(file); > + return err; > +} > + > +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl, > + unsigned long arg) > +{ > + struct vhost_blk *blk = f->private_data; > + void __user *argp = (void __user *)arg; > + struct vhost_vring_file backend; > + u64 __user *featurep = argp; > + u64 features; > + int ret; > + > + switch (ioctl) { > + case VHOST_BLK_SET_BACKEND: > + if (copy_from_user(&backend, argp, sizeof(backend))) > + return -EFAULT; > + return vhost_blk_set_backend(blk, backend.index, backend.fd); > + case VHOST_GET_FEATURES: > + features = VHOST_BLK_FEATURES; > + if (copy_to_user(featurep, &features, sizeof(features))) > + return -EFAULT; > + return 0; > + case VHOST_SET_FEATURES: > + if (copy_from_user(&features, featurep, sizeof(features))) > + return -EFAULT; > + if (features & ~VHOST_BLK_FEATURES) > + return -EOPNOTSUPP; > + return vhost_blk_set_features(blk, features); > + case VHOST_RESET_OWNER: > + return vhost_blk_reset_owner(blk); > + default: > + mutex_lock(&blk->dev.mutex); > + ret = vhost_dev_ioctl(&blk->dev, ioctl, arg); > + if (!ret && ioctl == VHOST_SET_VRING_NUM) > + ret = vhost_blk_setup(blk); > + vhost_blk_flush(blk); > + mutex_unlock(&blk->dev.mutex); > + return ret; > + } > +} > + > +static const struct file_operations vhost_blk_fops = { > + .owner = THIS_MODULE, > + .open = vhost_blk_open, > + .release = vhost_blk_release, > + .llseek = noop_llseek, > + .unlocked_ioctl = vhost_blk_ioctl, > +}; > + > +static struct miscdevice vhost_blk_misc = { > + MISC_DYNAMIC_MINOR, > + "vhost-blk", > + &vhost_blk_fops, > +}; > + > +static int vhost_blk_init(void) > +{ > + return misc_register(&vhost_blk_misc); > +} > + > +static void vhost_blk_exit(void) > +{ > + misc_deregister(&vhost_blk_misc); > +} > + > +module_init(vhost_blk_init); > +module_exit(vhost_blk_exit); > + > +MODULE_VERSION("0.0.3"); > +MODULE_LICENSE("GPL v2"); > +MODULE_AUTHOR("Asias He"); > +MODULE_DESCRIPTION("Host kernel accelerator for virtio_blk"); > diff --git a/drivers/vhost/blk.h b/drivers/vhost/blk.h > new file mode 100644 > index 0000000..2f674f0 > --- /dev/null > +++ b/drivers/vhost/blk.h > @@ -0,0 +1,8 @@ > +#include <linux/vhost.h> > + > +enum { > + VHOST_BLK_FEATURES = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | > + (1ULL << VIRTIO_RING_F_EVENT_IDX), > +}; > +/* VHOST_BLK specific defines */ > +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, struct vhost_vring_file) > -- > 1.7.11.7
Asias He <asias at redhat.com> writes:> vhost-blk is an in-kernel virito-blk device accelerator. > > Due to lack of proper in-kernel AIO interface, this version converts > guest's I/O request to bio and use submit_bio() to submit I/O directly. > So this version any supports raw block device as guest's disk image, > e.g. /dev/sda, /dev/ram0. We can add file based image support to > vhost-blk once we have in-kernel AIO interface. There are some work in > progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown: > > http://marc.info/?l=linux-fsdevel&m=133312234313122OK, this generally looks quite neat. There is one significant bug, however:> +/* The block header is in the first and separate buffer. */ > +#define BLK_HDR 0You need to do a proper pull off the iovec; you can't simply assume this. I'm working on fixing qemu, too. linux/drivers/vhost/net.c simply skips the header, you want something which actually copies it from userspace: /* Returns 0, -EFAULT or -EINVAL (too short) */ int copy_from_iovec_user(void *dst, size_t len, struct iovec *iov, int iov_nr); int copy_to_iovec_user(struct iovec *iov, int iov_nr, const void *src, size_t len); These consume the iov in place. You could pass struct iovec **iov and int * if you wanted to be really efficient (otherwise you have zero-length iov entries at the front after you've pulled things off). This goes away:> + if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID) > + iov_nr = in - 1; > + else > + iov_nr = out - 1;This becomes a simple assignment:> + /* The block data buffer follows block header buffer */ > + req->iov = &vq->iov[BLK_HDR + 1];This one actually requires iteration, since you should handle the case where the last iov is zero length:> + /* The block status buffer follows block data buffer */ > + req->status = vq->iov[iov_nr + 1].iov_base;This becomes copy_to_iovec_user:> + case VIRTIO_BLK_T_GET_ID: { > + char id[VIRTIO_BLK_ID_BYTES]; > + int len; > + > + ret = snprintf(id, VIRTIO_BLK_ID_BYTES, > + "vhost-blk%d", blk->index); > + if (ret < 0) > + break; > + len = ret; > + ret = __copy_to_user(req->iov[0].iov_base, id, len);This becomes copy_from_iovec_user:> + if (unlikely(copy_from_user(&hdr, vq->iov[BLK_HDR].iov_base, > + sizeof(hdr)))) { > + vq_err(vq, "Failed to get block header!\n"); > + vhost_discard_vq_desc(vq, 1); > + break; > + }The rest looks OK, at a glance. Thanks, Rusty.