On Fri, Nov 02, 2018 at 06:21:23PM +0000, Vitaly Mayatskikh
wrote:> This driver accelerates host side of virtio-blk.
>
> Signed-off-by: Vitaly Mayatskikh <v.mayatskih at gmail.com>
> ---
> drivers/vhost/Kconfig | 13 ++
> drivers/vhost/Makefile | 3 +
> drivers/vhost/blk.c | 510 +++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 526 insertions(+)
> create mode 100644 drivers/vhost/blk.c
>
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index b580885243f7..c4980d6af0ea 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
> adds some overhead, it is disabled by default.
>
> If unsure, say "N".
> +
> +config VHOST_BLK
> + tristate "Host kernel accelerator for virtio blk
(EXPERIMENTAL)"
> + depends on BLOCK && EVENTFD
> + select VHOST
> + default n
> + help
> + This kernel module can be loaded in host kernel to accelerate
> + guest block with virtio_blk. Not to be confused with virtio_blk
> + module itself which needs to be loaded in guest kernel.
> +
> + To compile this driver as a module, choose M here: the module will
> + be called vhost_blk.
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6c6df24f770c..c8be36cd9214 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
> obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
> vhost_vsock-y := vsock.o
>
> +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> +vhost_blk-y := blk.o
> +
> obj-$(CONFIG_VHOST_RING) += vringh.o
>
> obj-$(CONFIG_VHOST) += vhost.o
> diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> new file mode 100644
> index 000000000000..aefb9a61fa0f
> --- /dev/null
> +++ b/drivers/vhost/blk.c
> @@ -0,0 +1,510 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018 IBM Corporation
> + * Author: Vitaly Mayatskikh <v.mayatskih at gmail.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + *
> + * virtio-blk server in host kernel.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/miscdevice.h>
> +#include <linux/virtio_blk.h>
> +#include <linux/vhost.h>
> +#include <linux/fs.h>
> +#include "vhost.h"
> +
> +enum {
> + VHOST_BLK_FEATURES > + VHOST_FEATURES |
> + (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> + (1ULL << VIRTIO_RING_F_EVENT_IDX) |
> + (1ULL << VIRTIO_BLK_F_MQ)
> +};
> +
> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
> +
> +enum {
> + VHOST_BLK_VQ_MAX = 16,
> + VHOST_BLK_VQ_MAX_REQS = 128,
> +};
> +
> +struct vhost_blk_req {
> + struct llist_node list;
> + int index;
> + struct vhost_blk_queue *q;
> + struct virtio_blk_outhdr hdr;
> + struct iovec *out_iov;
> + struct iovec *in_iov;
> + u8 out_num;
> + u8 in_num;
> + long len;
> + struct kiocb iocb;
> + struct iov_iter i;
> + int res;
> + void __user *status;
> +};
> +
> +struct vhost_blk_queue {
> + int index;
> + struct vhost_blk *blk;
> + struct vhost_virtqueue vq;
> + struct vhost_work w;
> + struct llist_head wl;
> + struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
> +};
> +
> +struct vhost_blk {
> + struct vhost_dev dev;
> + struct file *backend;
> + int num_queues;
> + struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
> + struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
> +};
> +
> +static void vhost_blk_flush(struct vhost_blk *blk)
> +{
> + int i;
> +
> + for (i = 0; i < blk->num_queues; i++)
> + vhost_poll_flush(&blk->queue[i].vq.poll);
> +}
> +
> +
> +static void vhost_blk_stop(struct vhost_blk *blk)
> +{
> + struct vhost_virtqueue *vq;
> + int i;
> +
> + for (i = 0; i < blk->num_queues; i++) {
> + vq = &blk->queue[i].vq;
> + mutex_lock(&vq->mutex);
> + rcu_assign_pointer(vq->private_data, NULL);
> + mutex_unlock(&vq->mutex);
> + }
> +}
> +
> +static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char
status)
> +{
> + int ret;
> + int len = req->len;
> +
> + pr_debug("%s vq[%d] req->index %d status %d len %d\n",
__func__,
> + req->q->index, req->index, status, len);
> + ret = put_user(status, (unsigned char __user *)req->status);
I'd make it u8 and not unsigned char. Also why don't you change
req->status type so you don't need a cast?
> +
> + WARN(ret, "%s: vq[%d] req->index %d failed to write
status\n", __func__,
> + req->q->index, req->index);
kernel warnings and debug messages that are guest-triggerable lead to
disk full errors on the host. applies elsewhere too. you want traces
instead.
> +
> + vhost_add_used(&req->q->vq, req->index, len);
this can fail too.
> +
> + return ret;
> +}
> +
> +static void vhost_blk_io_done_work(struct vhost_work *w)
> +{
> + struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
> + struct llist_node *node;
> + struct vhost_blk_req *req, *tmp;
> +
> + node = llist_del_all(&q->wl);
> + llist_for_each_entry_safe(req, tmp, node, list) {
> + vhost_blk_req_done(req, req->res);
> + }
> + vhost_signal(&q->blk->dev, &q->vq);
> +}
> +
> +static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long
ret2)
> +{
> + struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
> + iocb);
> +
> + pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n",
__func__,
> + req->q->index, req->index, ret, ret2);
> +
> + req->res = (ret == req->len) ? VIRTIO_BLK_S_OK :
VIRTIO_BLK_S_IOERR;
> + llist_add(&req->list, &req->q->wl);
> + vhost_vq_work_queue(&req->q->vq, &req->q->w);
> +}
> +
> +static int vhost_blk_req_handle(struct vhost_blk_req *req)
> +{
> + struct vhost_blk *blk = req->q->blk;
> + struct vhost_virtqueue *vq = &req->q->vq;
> + int type = le32_to_cpu(req->hdr.type);
> + int ret;
> + u8 status;
> +
> + if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
> + bool write = (type == VIRTIO_BLK_T_OUT);
> + int nr_seg = (write ? req->out_num : req->in_num) - 1;
> + unsigned long sector = le64_to_cpu(req->hdr.sector);
> + ssize_t len, rem_len;
> +
> + if (!req->q->blk->backend) {
> + vq_err(vq, "blk %p no backend!\n", req->q->blk);
> + ret = -EINVAL;
> + goto out_err;
> + }
> +
> + len = iov_length(&vq->iov[1], nr_seg);
> + pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
> + __func__, current->pid, current->comm,
> + write ? "WRITE" : "READ", req->hdr.sector,
len);
> +
> + req->len = len;
> + rem_len = len;
> + iov_iter_init(&req->i, (write ? WRITE : READ),
> + write ? &req->out_iov[0] : &req->in_iov[0],
> + nr_seg, len);
> +
> + req->iocb.ki_pos = sector << 9;
> + req->iocb.ki_filp = blk->backend;
> + req->iocb.ki_complete = vhost_blk_iocb_complete;
> + req->iocb.ki_flags = IOCB_DIRECT;
> +
> + if (write)
> + ret = call_write_iter(blk->backend, &req->iocb,
> + &req->i);
> + else
> + ret = call_read_iter(blk->backend, &req->iocb,
> + &req->i);
> +
> + if (ret != -EIOCBQUEUED)
> + vhost_blk_iocb_complete(&req->iocb, ret, 0);
> +
> + ret = 0;
> + goto out;
> + }
> +
> + if (type == VIRTIO_BLK_T_GET_ID) {
> + char s[] = "vhost_blk";
Isn't this supposed to return the serial #?
> + size_t len = min_t(size_t, req->in_iov[0].iov_len,
> + strlen(s));
> +
> + ret = copy_to_user(req->in_iov[0].iov_base, s, len);
I don't think we should assume there's no scatter list here.
> + status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> + if (put_user(status, (unsigned char __user *)req->status)) {
> + ret = -EFAULT;
> + goto out_err;
> + }
> + vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
> + ret = 0;
> + goto out;
> + } else {
> + pr_warn("Unsupported request type %d\n", type);
> + vhost_discard_vq_desc(vq, 1);
> + ret = -EINVAL;
> + return ret;
> + }
> +out_err:
> + vhost_discard_vq_desc(vq, 1);
> +out:
> + return ret;
> +}
> +
> +static void vhost_blk_handle_guest_kick(struct vhost_work *work)
> +{
> + struct vhost_virtqueue *vq;
> + struct vhost_blk_queue *q;
> + struct vhost_blk *blk;
> + struct vhost_blk_req *req;
> + int in, out;
> + int head;
> +
> + vq = container_of(work, struct vhost_virtqueue, poll.work);
> + q = container_of(vq, struct vhost_blk_queue, vq);
> + blk = container_of(vq->dev, struct vhost_blk, dev);
> +
> + vhost_disable_notify(&blk->dev, vq);
> + for (;;) {
> + in = out = -1;
> +
> + head = vhost_get_vq_desc(vq, vq->iov,
> + ARRAY_SIZE(vq->iov),
> + &out, &in, NULL, NULL);
> +
> + if (head < 0)
> + break;
> +
> + if (head == vq->num) {
> + if (vhost_enable_notify(&blk->dev, vq)) {
> + vhost_disable_notify(&blk->dev, vq);
> + continue;
> + }
> + break;
> + }
> +
> + req = &q->req[head];
> + req->index = head;
> + req->out_num = out;
> + req->in_num = in;
> + req->out_iov = &vq->iov[1];
> + req->in_iov = &vq->iov[out];
> + req->status = vq->iov[out + in - 1].iov_base;
Shouldn't we validate that there's actually an in?
> +
> + if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
> + sizeof(req->hdr))) {
> + vq_err(vq, "Failed to get block header!\n");
> + vhost_discard_vq_desc(vq, 1);
> + continue;
> + }
It's better to avoid assuming that header is in a single iov entry,
use an iterator.
> + if (vhost_blk_req_handle(req) < 0)
> + break;
> + }
> +}
> +
> +static int vhost_blk_open(struct inode *inode, struct file *file)
> +{
> + struct vhost_blk *blk;
> + struct vhost_blk_queue *q;
> + int i, j;
> +
> + blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
> + if (!blk)
> + return -ENOMEM;
> +
> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
> + q = &blk->queue[i];
> + q->index = i;
> + q->blk = blk;
> + q->vq.handle_kick = vhost_blk_handle_guest_kick;
> + vhost_work_init(&q->w, vhost_blk_io_done_work);
> + blk->vqs[i] = &q->vq;
> + for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
> + q->req[j].index = j;
> + q->req[j].q = q;
> + }
> + }
> + vhost_dev_init(&blk->dev, (struct vhost_virtqueue
**)&blk->vqs,
> + VHOST_BLK_VQ_MAX);
> + file->private_data = blk;
> +
> + return 0;
> +}
> +
> +static int vhost_blk_release(struct inode *inode, struct file *f)
> +{
> + struct vhost_blk *blk = f->private_data;
> +
> + vhost_blk_stop(blk);
> + mutex_lock(&blk->dev.mutex);
> + vhost_blk_flush(blk);
> + vhost_dev_stop(&blk->dev);
> + vhost_dev_cleanup(&blk->dev);
> + vhost_blk_flush(blk);
> +
> + if (blk->backend) {
> + fput(blk->backend);
> + blk->backend = NULL;
> + }
> +
> + mutex_unlock(&blk->dev.mutex);
> + kvfree(blk);
> +
> + return 0;
> +}
> +
> +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
> +{
> + int i;
> + int ret = -EFAULT;
> +
> + mutex_lock(&blk->dev.mutex);
> + if ((features & (1 << VHOST_F_LOG_ALL)) &&
> + !vhost_log_access_ok(&blk->dev))
> + goto out_unlock;
> +
> + if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
> + if (vhost_init_device_iotlb(&blk->dev, true))
> + goto out_unlock;
> + }
> +
> + for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
> + struct vhost_virtqueue *vq = blk->vqs[i];
> +
> + mutex_lock(&vq->mutex);
> + vq->acked_features = features & VHOST_BLK_FEATURES;
> + mutex_unlock(&vq->mutex);
> + }
> + ret = 0;
> +out_unlock:
> + mutex_unlock(&blk->dev.mutex);
> +
> + return ret;
> +}
> +
> +static long vhost_blk_reset_owner(struct vhost_blk *blk)
> +{
> + long err;
> + struct vhost_umem *umem;
> +
> + mutex_lock(&blk->dev.mutex);
> + err = vhost_dev_check_owner(&blk->dev);
> + if (err)
> + goto done;
> + umem = vhost_dev_reset_owner_prepare();
> + if (!umem) {
> + err = -ENOMEM;
> + goto done;
> + }
> + vhost_blk_stop(blk);
> + vhost_blk_flush(blk);
> + vhost_dev_reset_owner(&blk->dev, umem);
> +done:
> + mutex_unlock(&blk->dev.mutex);
> + return err;
> +}
> +
> +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
> +{
> + struct file *backend;
> + int ret, i;
> + struct vhost_virtqueue *vq;
> +
> + mutex_lock(&blk->dev.mutex);
> + ret = vhost_dev_check_owner(&blk->dev);
> + if (ret)
> + goto out_dev;
> +
> + backend = fget(fd);
> + if (IS_ERR(backend)) {
> + ret = PTR_ERR(backend);
> + goto out_dev;
> + }
> +
> + if (backend == blk->backend) {
> + ret = 0;
> + goto out_file;
> + }
> +
> + if (blk->backend)
> + fput(blk->backend);
> + blk->backend = backend;
> + for (i = 0; i < blk->num_queues; i++) {
> + vq = &blk->queue[i].vq;
> + if (!vhost_vq_access_ok(vq)) {
> + ret = -EFAULT;
> + goto out_file;
> + }
> + mutex_lock(&vq->mutex);
> + rcu_assign_pointer(vq->private_data, backend);
> + ret = vhost_vq_init_access(vq);
> + mutex_unlock(&vq->mutex);
> + if (ret) {
> + pr_err("vhost_vq_init_access failed: %d\n", ret);
> + goto out_file;
> + }
> +
> + }
> + ret = 0;
> + goto out_dev;
> +out_file:
> + fput(backend);
> + blk->backend = NULL;
> +out_dev:
> + mutex_unlock(&blk->dev.mutex);
> + vhost_blk_flush(blk);
> + return ret;
> +}
> +
> +static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int
ioctl,
> + void __user *argp)
> +{
> + long ret;
> +
> + mutex_lock(&blk->dev.mutex);
> + ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
> + if (ret == -ENOIOCTLCMD)
> + ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
> + else
> + vhost_blk_flush(blk);
> + mutex_unlock(&blk->dev.mutex);
> + return ret;
> +}
> +
> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
> + unsigned long arg)
> +{
> + struct vhost_blk *blk = f->private_data;
> + void __user *argp = (void __user *)arg;
> + int fd;
> + u64 __user *featurep = argp;
> + u64 features;
> + long ret;
> + struct vhost_vring_state s;
> +
> + switch (ioctl) {
> + case VHOST_SET_MEM_TABLE:
> + vhost_blk_stop(blk);
> + ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> + break;
> + case VHOST_SET_VRING_NUM:
> + if (copy_from_user(&s, argp, sizeof(s)))
> + return -EFAULT;
> + ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> + if (!ret)
> + blk->num_queues = s.index + 1;
> + break;
> + case VHOST_BLK_SET_BACKEND:
> + if (copy_from_user(&fd, argp, sizeof(fd)))
> + return -EFAULT;
> + ret = vhost_blk_set_backend(blk, fd);
> + break;
> + case VHOST_GET_FEATURES:
> + features = VHOST_BLK_FEATURES;
> + if (copy_to_user(featurep, &features, sizeof(features)))
> + return -EFAULT;
> + ret = 0;
> + break;
> + case VHOST_SET_FEATURES:
> + if (copy_from_user(&features, featurep, sizeof(features)))
> + return -EFAULT;
> + if (features & ~VHOST_BLK_FEATURES)
> + return -EOPNOTSUPP;
> + ret = vhost_blk_set_features(blk, features);
> + break;
> + case VHOST_RESET_OWNER:
> + ret = vhost_blk_reset_owner(blk);
> + break;
> + default:
> + ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
> + break;
> + }
> + return ret;
> +}
> +
> +static const struct file_operations vhost_blk_fops = {
> + .owner = THIS_MODULE,
> + .open = vhost_blk_open,
> + .release = vhost_blk_release,
> + .llseek = noop_llseek,
> + .unlocked_ioctl = vhost_blk_ioctl,
> +};
> +
> +static struct miscdevice vhost_blk_misc = {
> + MISC_DYNAMIC_MINOR,
> + "vhost-blk",
> + &vhost_blk_fops,
> +};
> +
> +static int vhost_blk_init(void)
> +{
> + return misc_register(&vhost_blk_misc);
> +}
> +module_init(vhost_blk_init);
> +
> +static void vhost_blk_exit(void)
> +{
> + misc_deregister(&vhost_blk_misc);
> +}
> +
> +module_exit(vhost_blk_exit);
> +
> +MODULE_VERSION("1.0");
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Vitaly Mayatskikh");
> +MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
> +MODULE_ALIAS("devname:vhost-blk");
> --
> 2.17.1