Ming Lin
2015-Nov-20 00:20 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
Hi,

This is the first attempt to add a new qemu nvme backend using the
in-kernel nvme target.

Most of the code is ported from qemu-nvme, and some is borrowed from
Hannes Reinecke's rts-megasas.

It's similar to vhost-scsi, but doesn't use virtio. The advantage is
that the guest can run an unmodified NVMe driver, so the guest can be
any OS that has an NVMe driver.

The goal is to get performance as good as vhost-scsi, but for now
performance is poor; MMIO is the bottleneck.

One improvement could be to use google's NVMe vendor extension that I
sent in another thread, also here:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-google-ext

Qemu side:
http://www.minggr.net/cgit/cgit.cgi/qemu/log/?h=vhost-nvme.0

Kernel side also here:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=vhost-nvme.0

Thanks for any comment,
Ming
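For reference, a rough userspace sketch of how a VMM could drive the
vhost-nvme character device this series adds. The ioctl numbers and
structures mirror the uapi additions in the later patches; the device
path, the subsystem name and the register write are illustrative
assumptions, not part of the patches.

/*
 * Hypothetical userspace sketch (not from the series).  The ioctls and
 * structs below mirror the include/uapi/linux/vhost.h additions later
 * in this series; /dev/vhost-nvme, "testsubsys" and the CC write are
 * illustrative only.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>		/* VHOST_VIRTIO */

struct vhost_nvme_target {
	char vhost_wwpn[224];
};

struct nvmet_vhost_eventfd {
	int num;
	int fd;
	int *irq_enabled;
	int *vector;
};

struct nvmet_vhost_bar {
	int type;			/* 0 = read, 1 = write */
	uint64_t offset;
	unsigned int size;
	uint64_t val;
};

#define VHOST_NVME_SET_ENDPOINT	_IOW(VHOST_VIRTIO, 0x45, struct vhost_nvme_target)
#define VHOST_NVME_SET_EVENTFD	_IOW(VHOST_VIRTIO, 0x47, struct nvmet_vhost_eventfd)
#define VHOST_NVME_BAR		_IOW(VHOST_VIRTIO, 0x48, struct nvmet_vhost_bar)

int main(void)
{
	struct vhost_nvme_target tgt;
	struct nvmet_vhost_eventfd efd;
	struct nvmet_vhost_bar bar;
	int irq_enabled = 1, vector = 0;
	int fd = open("/dev/vhost-nvme", O_RDWR);

	if (fd < 0)
		return 1;

	/* bind to an nvmet subsystem that was configured beforehand */
	memset(&tgt, 0, sizeof(tgt));
	strncpy(tgt.vhost_wwpn, "testsubsys", sizeof(tgt.vhost_wwpn) - 1);
	if (ioctl(fd, VHOST_NVME_SET_ENDPOINT, &tgt) < 0)
		return 1;

	/* one eventfd per completion queue, used as the "interrupt" line */
	memset(&efd, 0, sizeof(efd));
	efd.num = 0;
	efd.fd = eventfd(0, 0);
	efd.irq_enabled = &irq_enabled;
	efd.vector = &vector;
	if (ioctl(fd, VHOST_NVME_SET_EVENTFD, &efd) < 0)
		return 1;

	/* forward a guest BAR write, e.g. CC (offset 0x14) with CC.EN set */
	memset(&bar, 0, sizeof(bar));
	bar.type = 1;
	bar.offset = 0x14;
	bar.size = 4;
	bar.val = 0x1;
	if (ioctl(fd, VHOST_NVME_BAR, &bar) < 0)
		return 1;

	return 0;
}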
From: Ming Lin <ming.l at ssi.samsung.com>

Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/Kconfig  | 11 +++++++++++
 drivers/nvme/target/Makefile |  2 ++
 drivers/nvme/target/vhost.c  | 16 ++++++++++++++++
 3 files changed, 29 insertions(+)
 create mode 100644 drivers/nvme/target/vhost.c

diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 1bf92db..22760f5 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -12,3 +12,14 @@ config NVME_TARGET_LOOP
 	  to test NVMe host and target side features.
 
 	  If unsure, say N.
+
+config NVME_TARGET_VHOST
+	tristate "NVMe vhost support"
+	depends on BLK_DEV_NVME
+	select NVME_TARGET
+	select VHOST
+	select VHOST_RING
+	help
+	  This enabled the NVMe vhost support.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 21e9134..1d8d523 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -1,6 +1,8 @@
 obj-$(CONFIG_NVME_TARGET)	+= nvmet.o
 obj-$(CONFIG_NVME_TARGET_LOOP)	+= nvme-loop.o
+obj-$(CONFIG_NVME_TARGET_VHOST)	+= nvme-vhost.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o io-cmd.o
 nvme-loop-y	+= loop.o
+nvme-vhost-y	+= vhost.o
 
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
new file mode 100644
index 0000000..623af00
--- /dev/null
+++ b/drivers/nvme/target/vhost.c
@@ -0,0 +1,16 @@
+#include <linux/module.h>
+
+static int __init nvmet_vhost_init(void)
+{
+	return 0;
+}
+module_init(nvmet_vhost_init);
+
+static void nvmet_vhost_exit(void)
+{
+}
+module_exit(nvmet_vhost_exit);
+
+MODULE_AUTHOR("Ming Lin <ming.l at ssi.samsung.com>");
+MODULE_LICENSE("GPL v2");
+
-- 
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com> Signed-off-by: Ming Lin <ming.l at ssi.samsung.com> --- drivers/nvme/target/core.c | 1 + drivers/nvme/target/vhost.c | 264 +++++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/vhost.h | 15 +++ 3 files changed, 279 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 5c770bf..1bfef66 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -378,6 +378,7 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) { kref_put(&ctrl->ref, nvmet_ctrl_free); } +EXPORT_SYMBOL_GPL(nvmet_ctrl_put); struct nvmet_subsys *nvmet_find_subsys(char *subsys_name) { diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c index 623af00..fa2e668 100644 --- a/drivers/nvme/target/vhost.c +++ b/drivers/nvme/target/vhost.c @@ -1,13 +1,275 @@ #include <linux/module.h> +#include <linux/compat.h> +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/file.h> +#include "../../vhost/vhost.h" +#include "nvmet.h" -static int __init nvmet_vhost_init(void) +struct nvmet_vhost_ctrl_eventfd { + struct file *call; + struct eventfd_ctx *call_ctx; + int __user *irq_enabled; + int __user *vector; +}; + +struct nvmet_vhost_cq { + struct nvmet_cq cq; + + struct eventfd_ctx *eventfd; +}; + +struct nvmet_vhost_sq { + struct nvmet_sq sq; +}; + +struct nvmet_vhost_ctrl { + struct vhost_dev dev; + struct nvmet_vhost_ctrl_eventfd *eventfd; + + u16 cntlid; + struct nvmet_ctrl *ctrl; + u32 num_queues; + + struct nvmet_vhost_cq **cqs; + struct nvmet_vhost_sq **sqs; +}; + +static int +nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n, + struct vhost_nvme_target *c) { + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + int num_queues; + int ret = 0; + + subsys = nvmet_find_subsys(c->vhost_wwpn); + if (!subsys) { + pr_warn("connect request for invalid subsystem!\n"); + return -EINVAL; + } + + mutex_lock(&subsys->lock); + ctrl = nvmet_alloc_ctrl(subsys, c->vhost_wwpn); + if (IS_ERR(ctrl)) { + ret = -EINVAL; + goto out_unlock; + } + n->cntlid = ctrl->cntlid; + n->ctrl = ctrl; + n->num_queues = subsys->max_qid + 1; + + num_queues = ctrl->subsys->max_qid + 1; + n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL); + if (!n->cqs) { + ret = -ENOMEM; + goto out_ctrl_put; + } + n->sqs = kzalloc(sizeof(*n->sqs) * num_queues, GFP_KERNEL); + if (!n->sqs) { + ret = -ENOMEM; + goto free_cqs; + } + + n->eventfd = kmalloc(sizeof(struct nvmet_vhost_ctrl_eventfd) + * num_queues, GFP_KERNEL); + if (!n->eventfd) { + ret = -ENOMEM; + goto free_sqs; + } + + mutex_unlock(&subsys->lock); return 0; + +free_sqs: + kfree(n->sqs); + +free_cqs: + kfree(n->cqs); + +out_ctrl_put: + nvmet_ctrl_put(ctrl); + +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +static int nvmet_vhost_set_eventfd(struct nvmet_vhost_ctrl *n, void __user *argp) +{ + struct nvmet_vhost_eventfd eventfd; + int num; + int ret; + + ret = copy_from_user(&eventfd, argp, sizeof(struct nvmet_vhost_eventfd)); + if (unlikely(ret)) + return ret; + + num = eventfd.num; + if (num > n->ctrl->subsys->max_qid) + return -EINVAL; + + n->eventfd[num].call = eventfd_fget(eventfd.fd); + if (IS_ERR(n->eventfd[num].call)) + return -EBADF; + n->eventfd[num].call_ctx = eventfd_ctx_fileget(n->eventfd[num].call); + if (IS_ERR(n->eventfd[num].call_ctx)) { + fput(n->eventfd[num].call); + return -EBADF; + } + + n->eventfd[num].irq_enabled = eventfd.irq_enabled; + n->eventfd[num].vector = eventfd.vector; 
+ + return 0; +} + +static int nvmet_vhost_open(struct inode *inode, struct file *f) +{ + struct nvmet_vhost_ctrl *n = kzalloc(sizeof(*n), GFP_KERNEL); + + if (!n) + return -ENOMEM; + + /* We don't use virtqueue */ + vhost_dev_init(&n->dev, NULL, 0); + f->private_data = n; + + return 0; +} + +static void nvme_free_sq(struct nvmet_vhost_sq *sq, + struct nvmet_vhost_ctrl *n) +{ + n->sqs[sq->sq.qid] = NULL; + if (sq->sq.qid) + kfree(sq); +} + +static void nvme_free_cq(struct nvmet_vhost_cq *cq, + struct nvmet_vhost_ctrl *n) +{ + n->cqs[cq->cq.qid] = NULL; + if (cq->cq.qid) + kfree(cq); +} + +static void nvmet_vhost_clear_ctrl(struct nvmet_vhost_ctrl *n) +{ + int i; + + for (i = 0; i < n->num_queues; i++) { + if (n->sqs[i] != NULL) + nvme_free_sq(n->sqs[i], n); + } + for (i = 0; i < n->num_queues; i++) { + if (n->cqs[i] != NULL) + nvme_free_cq(n->cqs[i], n); + } + + kfree(n->eventfd); + kfree(n->cqs); + kfree(n->sqs); + nvmet_ctrl_put(n->ctrl); +} + +static void nvmet_vhost_clear_eventfd(struct nvmet_vhost_ctrl *n) +{ + int i; + + for (i = 0; i < n->num_queues; i++) { + if (n->eventfd[i].call_ctx) { + eventfd_ctx_put(n->eventfd[i].call_ctx); + fput(n->eventfd[i].call); + } + } +} + +static int nvmet_vhost_release(struct inode *inode, struct file *f) +{ + struct nvmet_vhost_ctrl *n = f->private_data; + + nvmet_vhost_clear_eventfd(n); + nvmet_vhost_clear_ctrl(n); + + vhost_dev_stop(&n->dev); + vhost_dev_cleanup(&n->dev, false); + + kfree(n); + return 0; +} + +static long nvmet_vhost_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct nvmet_vhost_ctrl *n = f->private_data; + void __user *argp = (void __user *)arg; + u64 __user *featurep = argp; + u64 features; + int r; + + switch (ioctl) { + case VHOST_NVME_SET_ENDPOINT: + { + struct vhost_nvme_target conf; + if (copy_from_user(&conf, argp, sizeof(conf))) + return -EFAULT; + + return nvmet_vhost_set_endpoint(n, &conf); + } + case VHOST_NVME_SET_EVENTFD: + r = nvmet_vhost_set_eventfd(n, argp); + return r; + case VHOST_GET_FEATURES: + features = VHOST_FEATURES; + if (copy_to_user(featurep, &features, sizeof(features))) + return -EFAULT; + return 0; + default: + mutex_lock(&n->dev.mutex); + r = vhost_dev_ioctl(&n->dev, ioctl, argp); + mutex_unlock(&n->dev.mutex); + return r; + } +} + +#ifdef CONFIG_COMPAT +static long nvmet_vhost_compat_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + return nvmet_vhost_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations nvmet_vhost_fops = { + .owner = THIS_MODULE, + .release = nvmet_vhost_release, + .unlocked_ioctl = nvmet_vhost_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nvmet_vhost_compat_ioctl, +#endif + .open = nvmet_vhost_open, + .llseek = noop_llseek, +}; + +static struct miscdevice nvmet_vhost_misc = { + MISC_DYNAMIC_MINOR, + "vhost-nvme", + &nvmet_vhost_fops, +}; + +static int __init nvmet_vhost_init(void) +{ + return misc_register(&nvmet_vhost_misc); } module_init(nvmet_vhost_init); static void nvmet_vhost_exit(void) { + misc_deregister(&nvmet_vhost_misc); } module_exit(nvmet_vhost_exit); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index ab373191..ae4b619 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -169,4 +169,19 @@ struct vhost_scsi_target { #define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) #define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) +struct vhost_nvme_target { + char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */ +}; + 
+struct nvmet_vhost_eventfd { + int num; + int fd; + int *irq_enabled; + int *vector; +}; + +#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x47, struct vhost_nvme_target) +#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x48, struct vhost_nvme_target) +#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x45, struct nvmet_vhost_eventfd) + #endif -- 1.9.1
From: Ming Lin <ming.l at ssi.samsung.com> Signed-off-by: Ming Lin <ming.l at ssi.samsung.com> --- drivers/nvme/target/vhost.c | 102 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 17 ++++++-- 2 files changed, 116 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c index fa2e668..01c44b8 100644 --- a/drivers/nvme/target/vhost.c +++ b/drivers/nvme/target/vhost.c @@ -8,6 +8,8 @@ #include "../../vhost/vhost.h" #include "nvmet.h" +#define NVMET_VHOST_AQ_DEPTH 256 + struct nvmet_vhost_ctrl_eventfd { struct file *call; struct eventfd_ctx *call_ctx; @@ -35,6 +37,10 @@ struct nvmet_vhost_ctrl { struct nvmet_vhost_cq **cqs; struct nvmet_vhost_sq **sqs; + + u32 aqa; + u64 asq; + u64 acq; }; static int @@ -127,6 +133,100 @@ static int nvmet_vhost_set_eventfd(struct nvmet_vhost_ctrl *n, void __user *argp return 0; } +static int nvmet_vhost_bar_read(struct nvmet_ctrl *ctrl, int offset, u64 *val) +{ + int status = NVME_SC_SUCCESS; + + switch(offset) { + case NVME_REG_CAP: + *val = ctrl->cap; + break; + case NVME_REG_CAP+4: + *val = ctrl->cap >> 32; + case NVME_REG_VS: + *val = ctrl->subsys->ver; + break; + case NVME_REG_CC: + *val = ctrl->cc; + break; + case NVME_REG_CSTS: + *val = ctrl->csts; + break; + case NVME_REG_AQA: + *val = (NVMET_VHOST_AQ_DEPTH - 1) | + (((NVMET_VHOST_AQ_DEPTH - 1) << 16)); + break; + default: + printk("Unknown offset: 0x%x\n", offset); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + return status; +} + +static int nvmet_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val) +{ + struct nvmet_ctrl *ctrl = n->ctrl; + int status = NVME_SC_SUCCESS; + + switch(offset) { + case NVME_REG_CC: + nvmet_update_cc(ctrl, val); + break; + case NVME_REG_AQA: + n->aqa = val & 0xffffffff; + break; + case NVME_REG_ASQ: + n->asq = val; + break; + case NVME_REG_ASQ + 4: + n->asq |= val << 32; + break; + case NVME_REG_ACQ: + n->acq = val; + break; + case NVME_REG_ACQ + 4: + n->acq |= val << 32; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + return status; +} + +static int nvmet_vhost_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val) +{ + if (offset < 0x1000) + return nvmet_bar_write(n, offset, val); + + return -1; +} + +static int nvmet_vhost_ioc_bar(struct nvmet_vhost_ctrl *n, void __user *argp) +{ + struct nvmet_vhost_bar bar; + struct nvmet_vhost_bar __user *user_bar = argp; + int ret = -EINVAL; + + ret = copy_from_user(&bar, argp, sizeof(bar)); + if (unlikely(ret)) + return ret; + + if (bar.type == VHOST_NVME_BAR_READ) { + u64 val; + ret = nvmet_vhost_bar_read(n->ctrl, bar.offset, &val); + if (ret != NVME_SC_SUCCESS) + return ret; + ret = copy_to_user(&user_bar->val, &val, sizeof(u64)); + } else if (bar.type == VHOST_NVME_BAR_WRITE) + ret = nvmet_vhost_bar_write(n, bar.offset, bar.val); + + return ret; +} + static int nvmet_vhost_open(struct inode *inode, struct file *f) { struct nvmet_vhost_ctrl *n = kzalloc(sizeof(*n), GFP_KERNEL); @@ -223,6 +323,8 @@ static long nvmet_vhost_ioctl(struct file *f, unsigned int ioctl, case VHOST_NVME_SET_EVENTFD: r = nvmet_vhost_set_eventfd(n, argp); return r; + case VHOST_NVME_BAR: + return nvmet_vhost_ioc_bar(n, argp); case VHOST_GET_FEATURES: features = VHOST_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index ae4b619..a0cefcc 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -180,8 +180,19 @@ struct 
nvmet_vhost_eventfd { int *vector; }; -#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x47, struct vhost_nvme_target) -#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x48, struct vhost_nvme_target) -#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x45, struct nvmet_vhost_eventfd) +#define VHOST_NVME_BAR_READ 0 +#define VHOST_NVME_BAR_WRITE 1 + +struct nvmet_vhost_bar { + int type; /* read/write */ + u64 offset; + unsigned size; + u64 val; +}; + +#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x45, struct vhost_nvme_target) +#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x46, struct vhost_nvme_target) +#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x47, struct nvmet_vhost_eventfd) +#define VHOST_NVME_BAR _IOW(VHOST_VIRTIO, 0x48, struct nvmet_vhost_bar) #endif -- 1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>

This is used to execute controller specific start code

Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/core.c  | 3 +++
 drivers/nvme/target/nvmet.h | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 1bfef66..0a0fc48 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -251,6 +251,9 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 	}
 
 	ctrl->csts = NVME_CSTS_RDY;
+
+	if (ctrl->start)
+		ctrl->start(ctrl->opaque);
 }
 
 static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 9335584..eac008b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -59,6 +59,9 @@ struct nvmet_ctrl {
 	struct kref ref;
 #define NVMET_SUBSYS_NAME_LEN 256
 	char subsys_name[NVMET_SUBSYS_NAME_LEN];
+
+	void *opaque;
+	void (*start)(void *);
 };
 
 struct nvmet_subsys {
-- 
1.9.1
Ming Lin
2015-Nov-20 00:21 UTC
[RFC PATCH 5/9] nvme-vhost: add controller "start" callback
From: Ming Lin <ming.l at ssi.samsung.com> Signed-off-by: Ming Lin <ming.l at ssi.samsung.com> --- drivers/nvme/target/vhost.c | 106 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c index 01c44b8..4a147d6 100644 --- a/drivers/nvme/target/vhost.c +++ b/drivers/nvme/target/vhost.c @@ -10,6 +10,35 @@ #define NVMET_VHOST_AQ_DEPTH 256 +enum NvmeCcShift { + CC_MPS_SHIFT = 7, + CC_IOSQES_SHIFT = 16, + CC_IOCQES_SHIFT = 20, +}; + +enum NvmeCcMask { + CC_MPS_MASK = 0xf, + CC_IOSQES_MASK = 0xf, + CC_IOCQES_MASK = 0xf, +}; + +#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK) +#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) +#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) + +enum NvmeAqaShift { + AQA_ASQS_SHIFT = 0, + AQA_ACQS_SHIFT = 16, +}; + +enum NvmeAqaMask { + AQA_ASQS_MASK = 0xfff, + AQA_ACQS_MASK = 0xfff, +}; + +#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK) +#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK) + struct nvmet_vhost_ctrl_eventfd { struct file *call; struct eventfd_ctx *call_ctx; @@ -19,12 +48,23 @@ struct nvmet_vhost_ctrl_eventfd { struct nvmet_vhost_cq { struct nvmet_cq cq; + struct nvmet_vhost_ctrl *ctrl; + u32 head; + u32 tail; + u8 phase; + u64 dma_addr; struct eventfd_ctx *eventfd; }; struct nvmet_vhost_sq { struct nvmet_sq sq; + struct nvmet_vhost_ctrl *ctrl; + + u32 head; + u32 tail; + u64 dma_addr; + u16 cqid; }; struct nvmet_vhost_ctrl { @@ -37,12 +77,76 @@ struct nvmet_vhost_ctrl { struct nvmet_vhost_cq **cqs; struct nvmet_vhost_sq **sqs; + struct nvmet_vhost_cq admin_cq; + struct nvmet_vhost_sq admin_sq; u32 aqa; u64 asq; u64 acq; + u16 cqe_size; + u16 sqe_size; + u16 max_prp_ents; + u16 page_bits; + u32 page_size; }; +static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq, + struct nvmet_vhost_ctrl *n, u64 dma_addr, + u16 cqid, u16 size, struct eventfd_ctx *eventfd, + u16 vector, u16 irq_enabled) +{ + cq->ctrl = n; + cq->dma_addr = dma_addr; + cq->phase = 1; + cq->head = cq->tail = 0; + cq->eventfd = eventfd; + n->cqs[cqid] = cq; + + nvmet_cq_init(n->ctrl, &cq->cq, cqid, size); + + return 0; +} + +static int nvmet_vhost_init_sq(struct nvmet_vhost_sq *sq, + struct nvmet_vhost_ctrl *n, u64 dma_addr, + u16 sqid, u16 cqid, u16 size) +{ + sq->ctrl = n; + sq->dma_addr = dma_addr; + sq->cqid = cqid; + sq->head = sq->tail = 0; + n->sqs[sqid] = sq; + + nvmet_sq_init(n->ctrl, &sq->sq, sqid, size); + + return 0; +} + +static void nvmet_vhost_start_ctrl(void *opaque) +{ + struct nvmet_vhost_ctrl *n = opaque; + u32 page_bits = NVME_CC_MPS(n->ctrl->cc) + 12; + u32 page_size = 1 << page_bits; + int ret; + + n->page_bits = page_bits; + n->page_size = page_size; + n->max_prp_ents = n->page_size / sizeof(uint64_t); + n->cqe_size = 1 << NVME_CC_IOCQES(n->ctrl->cc); + n->sqe_size = 1 << NVME_CC_IOSQES(n->ctrl->cc); + + nvmet_vhost_init_cq(&n->admin_cq, n, n->acq, 0, + NVME_AQA_ACQS(n->aqa) + 1, n->eventfd[0].call_ctx, + 0, 1); + + ret = nvmet_vhost_init_sq(&n->admin_sq, n, n->asq, 0, 0, + NVME_AQA_ASQS(n->aqa) + 1); + if (ret) { + pr_warn("nvmet_vhost_init_sq failed!!!\n"); + BUG_ON(1); + } +} + static int nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n, struct vhost_nvme_target *c) @@ -67,6 +171,8 @@ nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n, n->cntlid = ctrl->cntlid; n->ctrl = ctrl; n->num_queues = subsys->max_qid + 1; + ctrl->opaque = n; + ctrl->start = nvmet_vhost_start_ctrl; 
num_queues = ctrl->subsys->max_qid + 1; n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL); -- 1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>

This is used to execute controller specific cmd parse code

Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/admin-cmd.c | 7 +++++++
 drivers/nvme/target/nvmet.h     | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index d9db0d4..f009c77 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -346,6 +346,13 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
 		req->data = 0;
 		return 0;
 #endif
+	default:
+		if (req->sq->ctrl->parse_extra_admin_cmd) {
+			int ret = req->sq->ctrl->parse_extra_admin_cmd(req);
+
+			if (!ret)
+				return 0;
+		}
 	}
 
 	pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index eac008b..ef79813 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -44,6 +44,8 @@ struct nvmet_sq {
 	u16 size;
 };
 
+struct nvmet_req;
+
 struct nvmet_ctrl {
 	struct nvmet_subsys *subsys;
 	struct nvmet_cq **cqs;
@@ -62,6 +64,7 @@ struct nvmet_ctrl {
 
 	void *opaque;
 	void (*start)(void *);
+	int (*parse_extra_admin_cmd)(struct nvmet_req *);
 };
 
 struct nvmet_subsys {
-- 
1.9.1
Ming Lin
2015-Nov-20 00:21 UTC
[RFC PATCH 7/9] nvme-vhost: add "parse_extra_admin_cmd" callback
From: Ming Lin <ming.l at ssi.samsung.com> Signed-off-by: Ming Lin <ming.l at ssi.samsung.com> --- drivers/nvme/target/vhost.c | 153 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c index 4a147d6..04ed0bc 100644 --- a/drivers/nvme/target/vhost.c +++ b/drivers/nvme/target/vhost.c @@ -39,6 +39,11 @@ enum NvmeAqaMask { #define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK) #define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK) +#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) +#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) + +#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1) + struct nvmet_vhost_ctrl_eventfd { struct file *call; struct eventfd_ctx *call_ctx; @@ -90,6 +95,19 @@ struct nvmet_vhost_ctrl { u32 page_size; }; +#define sq_to_vsq(sq) container_of(sq, struct nvmet_vhost_sq, sq) +#define cq_to_vcq(cq) container_of(cq, struct nvmet_vhost_cq, cq) + +static int nvmet_vhost_check_sqid(struct nvmet_ctrl *n, u16 sqid) +{ + return sqid <= n->subsys->max_qid && n->sqs[sqid] != NULL ? 0 : -1; +} + +static int nvmet_vhost_check_cqid(struct nvmet_ctrl *n, u16 cqid) +{ + return cqid <= n->subsys->max_qid && n->cqs[cqid] != NULL ? 0 : -1; +} + static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq, struct nvmet_vhost_ctrl *n, u64 dma_addr, u16 cqid, u16 size, struct eventfd_ctx *eventfd, @@ -147,6 +165,140 @@ static void nvmet_vhost_start_ctrl(void *opaque) } } +static void nvmet_vhost_create_cq(struct nvmet_req *req) +{ + struct nvmet_cq *cq; + struct nvmet_vhost_cq *vcq; + struct nvmet_vhost_ctrl *n; + struct nvme_create_cq *c; + u16 cqid; + u16 vector; + u16 qsize; + u16 qflags; + u64 prp1; + int status; + int ret; + + cq = req->cq; + vcq = cq_to_vcq(cq); + n = vcq->ctrl; + c = &req->cmd->create_cq; + cqid = le16_to_cpu(c->cqid); + vector = le16_to_cpu(c->irq_vector); + qsize = le16_to_cpu(c->qsize); + qflags = le16_to_cpu(c->cq_flags); + prp1 = le64_to_cpu(c->prp1); + status = NVME_SC_SUCCESS; + + if (!cqid || (cqid && !nvmet_vhost_check_cqid(n->ctrl, cqid))) { + status = NVME_SC_QID_INVALID | NVME_SC_DNR; + goto out; + } + if (!qsize || qsize > NVME_CAP_MQES(n->ctrl->cap)) { + status = NVME_SC_QUEUE_SIZE | NVME_SC_DNR; + goto out; + } + if (!prp1) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + if (vector > n->num_queues) { + status = NVME_SC_INVALID_VECTOR | NVME_SC_DNR; + goto out; + } + if (!(NVME_CQ_FLAGS_PC(qflags))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + + vcq = kmalloc(sizeof(*vcq), GFP_KERNEL); + if (!vcq) { + status = NVME_SC_INTERNAL | NVME_SC_DNR; + goto out; + } + + ret = nvmet_vhost_init_cq(vcq, n, prp1, cqid, qsize+1, + n->eventfd[cqid].call_ctx, vector, + NVME_CQ_FLAGS_IEN(qflags)); + if (ret) + status = NVME_SC_INTERNAL | NVME_SC_DNR; + +out: + nvmet_req_complete(req, status); +} + +static void nvmet_vhost_create_sq(struct nvmet_req *req) +{ + struct nvme_create_sq *c = &req->cmd->create_sq; + u16 cqid = le16_to_cpu(c->cqid); + u16 sqid = le16_to_cpu(c->sqid); + u16 qsize = le16_to_cpu(c->qsize); + u16 qflags = le16_to_cpu(c->sq_flags); + u64 prp1 = le64_to_cpu(c->prp1); + + struct nvmet_sq *sq = req->sq; + struct nvmet_vhost_sq *vsq; + struct nvmet_vhost_ctrl *n; + int status; + int ret; + + status = NVME_SC_SUCCESS; + vsq = sq_to_vsq(sq); + n = vsq->ctrl; + + if (!cqid || nvmet_vhost_check_cqid(n->ctrl, cqid)) { + status = NVME_SC_CQ_INVALID | NVME_SC_DNR; + goto out; + } + if 
(!sqid || (sqid && !nvmet_vhost_check_sqid(n->ctrl, sqid))) { + status = NVME_SC_QID_INVALID | NVME_SC_DNR; + goto out; + } + if (!qsize || qsize > NVME_CAP_MQES(n->ctrl->cap)) { + status = NVME_SC_QUEUE_SIZE | NVME_SC_DNR; + goto out; + } + if (!prp1 || prp1 & (n->page_size - 1)) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + if (!(NVME_SQ_FLAGS_PC(qflags))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + + vsq = kmalloc(sizeof(*vsq), GFP_KERNEL); + if (!sq) { + status = NVME_SC_INTERNAL | NVME_SC_DNR; + goto out; + } + + ret = nvmet_vhost_init_sq(vsq, n, prp1, sqid, cqid, qsize + 1); + if (ret) + status = NVME_SC_INTERNAL | NVME_SC_DNR; + +out: + nvmet_req_complete(req, status); +} + +static int nvmet_vhost_parse_admin_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + switch (cmd->common.opcode) { + case nvme_admin_create_cq: + req->execute = nvmet_vhost_create_cq; + req->data_len = 0; + return 0; + case nvme_admin_create_sq: + req->execute = nvmet_vhost_create_sq; + req->data_len = 0; + return 0; + } + + return -1; +} + static int nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n, struct vhost_nvme_target *c) @@ -173,6 +325,7 @@ nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n, n->num_queues = subsys->max_qid + 1; ctrl->opaque = n; ctrl->start = nvmet_vhost_start_ctrl; + ctrl->parse_extra_admin_cmd = nvmet_vhost_parse_admin_cmd; num_queues = ctrl->subsys->max_qid + 1; n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL); -- 1.9.1
From: Ming Lin <ming.l at ssi.samsung.com> This borrows code from Hannes Reinecke's rts-megasas. Cc: Hannes Reinecke <hare at suse.de> Signed-off-by: Ming Lin <ming.l at ssi.samsung.com> --- drivers/nvme/target/vhost.c | 108 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c index 04ed0bc..6847c86 100644 --- a/drivers/nvme/target/vhost.c +++ b/drivers/nvme/target/vhost.c @@ -5,6 +5,7 @@ #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/highmem.h> #include "../../vhost/vhost.h" #include "nvmet.h" @@ -95,6 +96,113 @@ struct nvmet_vhost_ctrl { u32 page_size; }; +const struct vhost_memory_region * +find_region(struct vhost_dev *hba, __u64 addr, __u32 len) +{ + struct vhost_memory *mem; + struct vhost_memory_region *reg; + int i; + + if (!hba->memory) + return NULL; + + mem = hba->memory; + /* linear search is not brilliant, but we really have on the order of 6 + * regions in practice */ + for (i = 0; i < mem->nregions; ++i) { + reg = mem->regions + i; + if (reg->guest_phys_addr <= addr && + reg->guest_phys_addr + reg->memory_size - 1 >= addr) + return reg; + } + return NULL; +} + +static bool check_region_boundary(const struct vhost_memory_region *reg, + uint64_t addr, size_t len) +{ + unsigned long max_size; + + max_size = reg->memory_size - addr + reg->guest_phys_addr; + return (max_size < len); +} + +static void __user *map_to_region(const struct vhost_memory_region *reg, + uint64_t addr) +{ + return (void __user *)(unsigned long) + (reg->userspace_addr + addr - reg->guest_phys_addr); +} + +static void __user *map_guest_to_host(struct vhost_dev *dev, + uint64_t addr, int size) +{ + const struct vhost_memory_region *reg = NULL; + + reg = find_region(dev, addr, size); + if (unlikely(!reg)) + return ERR_PTR(-EPERM); + + if (unlikely(check_region_boundary(reg, addr, size))) + return ERR_PTR(-EFAULT); + + return map_to_region(reg, addr); +} + +static int nvmet_vhost_rw(struct vhost_dev *dev, u64 guest_pa, + void *buf, uint32_t size, int write) +{ + void __user *host_user_va; + void *host_kernel_va; + struct page *page; + uintptr_t offset; + int ret; + + host_user_va = map_guest_to_host(dev, guest_pa, size); + if (unlikely(!host_user_va)) { + pr_warn("cannot map guest addr %p, error %ld\n", + (void *)guest_pa, PTR_ERR(host_user_va)); + return -EINVAL; + } + + ret = get_user_pages(current, dev->mm, + (unsigned long)host_user_va, 1, + false, 0, &page, NULL); + if (unlikely(ret != 1)) { + pr_warn("get_user_pages fail!!!\n"); + return -EINVAL; + } + + host_kernel_va = kmap(page); + if (unlikely(!host_kernel_va)) { + pr_warn("kmap fail!!!\n"); + put_page(page); + return -EINVAL; + } + + offset = (uintptr_t)host_user_va & ~PAGE_MASK; + if (write) + memcpy(host_kernel_va + offset, buf, size); + else + memcpy(buf, host_kernel_va + offset, size); + kunmap(host_kernel_va); + put_page(page); + + return 0; +} + +int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa, + void *buf, uint32_t size) +{ + return nvmet_vhost_rw(dev, guest_pa, buf, size, 0); +} + +int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa, + void *buf, uint32_t size) +{ + return nvmet_vhost_rw(dev, guest_pa, buf, size, 1); +} + #define sq_to_vsq(sq) container_of(sq, struct nvmet_vhost_sq, sq) #define cq_to_vcq(cq) container_of(cq, struct nvmet_vhost_cq, cq) -- 1.9.1
From: Ming Lin <ming.l at ssi.samsung.com> This adds nvme submission/completion queue handlers, which are ported from qemu-nvme. And hooks into nvme-target to do the real job. Cc: Keith Busch <keith.busch at intel.com> Signed-off-by: Ming Lin <ming.l at ssi.samsung.com> --- drivers/nvme/target/vhost.c | 420 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 416 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c index 6847c86..3ce1348 100644 --- a/drivers/nvme/target/vhost.c +++ b/drivers/nvme/target/vhost.c @@ -6,10 +6,12 @@ #include <linux/mutex.h> #include <linux/file.h> #include <linux/highmem.h> +#include <linux/kthread.h> #include "../../vhost/vhost.h" #include "nvmet.h" #define NVMET_VHOST_AQ_DEPTH 256 +#define NVMET_VHOST_MAX_SEGMENTS 32 enum NvmeCcShift { CC_MPS_SHIFT = 7, @@ -52,6 +54,15 @@ struct nvmet_vhost_ctrl_eventfd { int __user *vector; }; +struct nvmet_vhost_iod { + struct nvmet_vhost_sq *sq; + struct scatterlist sg[NVMET_VHOST_MAX_SEGMENTS]; + struct nvme_command cmd; + struct nvme_completion rsp; + struct nvmet_req req; + struct list_head entry; +}; + struct nvmet_vhost_cq { struct nvmet_cq cq; struct nvmet_vhost_ctrl *ctrl; @@ -61,6 +72,12 @@ struct nvmet_vhost_cq { u8 phase; u64 dma_addr; struct eventfd_ctx *eventfd; + + struct list_head sq_list; + struct list_head req_list; + spinlock_t lock; + struct task_struct *thread; + int scheduled; }; struct nvmet_vhost_sq { @@ -71,6 +88,13 @@ struct nvmet_vhost_sq { u32 tail; u64 dma_addr; u16 cqid; + + struct nvmet_vhost_iod *io_req; + struct list_head req_list; + struct list_head entry; + struct mutex lock; + struct task_struct *thread; + int scheduled; }; struct nvmet_vhost_ctrl { @@ -191,13 +215,13 @@ static int nvmet_vhost_rw(struct vhost_dev *dev, u64 guest_pa, return 0; } -int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa, +static int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa, void *buf, uint32_t size) { return nvmet_vhost_rw(dev, guest_pa, buf, size, 0); } -int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa, +static int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa, void *buf, uint32_t size) { return nvmet_vhost_rw(dev, guest_pa, buf, size, 1); @@ -216,6 +240,289 @@ static int nvmet_vhost_check_cqid(struct nvmet_ctrl *n, u16 cqid) return cqid <= n->subsys->max_qid && n->cqs[cqid] != NULL ? 
0 : -1; } +static void nvmet_vhost_inc_cq_tail(struct nvmet_vhost_cq *cq) +{ + cq->tail++; + if (cq->tail >= cq->cq.size) { + cq->tail = 0; + cq->phase = !cq->phase; + } +} + +static void nvmet_vhost_inc_sq_head(struct nvmet_vhost_sq *sq) +{ + sq->head = (sq->head + 1) % sq->sq.size; +} + +static uint8_t nvmet_vhost_cq_full(struct nvmet_vhost_cq *cq) +{ + return (cq->tail + 1) % cq->cq.size == cq->head; +} + +static uint8_t nvmet_vhost_sq_empty(struct nvmet_vhost_sq *sq) +{ + return sq->head == sq->tail; +} + +static void nvmet_vhost_post_cqes(struct nvmet_vhost_cq *cq) +{ + struct nvmet_vhost_ctrl *n = cq->ctrl; + struct nvmet_vhost_iod *req; + struct list_head *p, *tmp; + int signal = 0; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + list_for_each_safe(p, tmp, &cq->req_list) { + struct nvmet_vhost_sq *sq; + u64 addr; + + if (nvmet_vhost_cq_full(cq)) + goto unlock; + + req = list_entry(p, struct nvmet_vhost_iod, entry); + list_del(p); + + sq = req->sq; + req->rsp.status |= cq->phase; + req->rsp.sq_id = cpu_to_le16(sq->sq.qid); + req->rsp.sq_head = cpu_to_le16(sq->head); + addr = cq->dma_addr + cq->tail * n->cqe_size; + nvmet_vhost_inc_cq_tail(cq); + spin_unlock_irqrestore(&cq->lock, flags); + + nvmet_vhost_write(&n->dev, addr, (void *)&req->rsp, + sizeof(req->rsp)); + + mutex_lock(&sq->lock); + list_add_tail(p, &sq->req_list); + mutex_unlock(&sq->lock); + + signal = 1; + + spin_lock_irqsave(&cq->lock, flags); + } + + if (signal) + eventfd_signal(cq->eventfd, 1); + +unlock: + cq->scheduled = 0; + spin_unlock_irqrestore(&cq->lock, flags); +} + +static int nvmet_vhost_cq_thread(void *arg) +{ + struct nvmet_vhost_cq *sq = arg; + + while (1) { + if (kthread_should_stop()) + break; + + nvmet_vhost_post_cqes(sq); + + schedule(); + } + + return 0; +} + +static void nvmet_vhost_enqueue_req_completion( + struct nvmet_vhost_cq *cq, struct nvmet_vhost_iod *iod) +{ + unsigned long flags; + + BUG_ON(cq->cq.qid != iod->sq->sq.qid); + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&iod->entry, &cq->req_list); + if (!cq->scheduled) { + wake_up_process(cq->thread); + cq->scheduled = 1; + } + spin_unlock_irqrestore(&cq->lock, flags); +} + +static void nvmet_vhost_queue_response(struct nvmet_req *req) +{ + struct nvmet_vhost_iod *iod + container_of(req, struct nvmet_vhost_iod, req); + struct nvmet_vhost_sq *sq = iod->sq; + struct nvmet_vhost_ctrl *n = sq->ctrl; + struct nvmet_vhost_cq *cq = n->cqs[sq->sq.qid]; + + nvmet_vhost_enqueue_req_completion(cq, iod); +} + +static int nvmet_vhost_sglist_add(struct nvmet_vhost_ctrl *n, struct scatterlist *sg, + u64 guest_addr, int len, int is_write) +{ + void __user *host_addr; + struct page *page; + unsigned int offset, nbytes; + int ret; + + host_addr = map_guest_to_host(&n->dev, guest_addr, len); + if (unlikely(!host_addr)) { + pr_warn("cannot map guest addr %p, error %ld\n", + (void *)guest_addr, PTR_ERR(host_addr)); + return PTR_ERR(host_addr); + } + + ret = get_user_pages(current, n->dev.mm, (unsigned long)host_addr, 1, + is_write, 0, &page, NULL); + BUG_ON(ret == 0); /* we should either get our page or fail */ + if (ret < 0) { + pr_warn("get_user_pages faild: host_addr %p, %d\n", + host_addr, ret); + return ret; + } + + offset = (uintptr_t)host_addr & ~PAGE_MASK; + nbytes = min_t(unsigned int, PAGE_SIZE - offset, len); + sg_set_page(sg, page, nbytes, offset); + + return 0; +} + +static int nvmet_vhost_map_prp(struct nvmet_vhost_ctrl *n, struct scatterlist *sgl, + u64 prp1, u64 prp2, unsigned int len) +{ + unsigned int trans_len = 
n->page_size - (prp1 % n->page_size); + int num_prps = (len >> n->page_bits) + 1; + //FIXME + int is_write = 1; + + trans_len = min(len, trans_len); + if (!prp1) + return -1; + + sg_init_table(sgl, num_prps); + + nvmet_vhost_sglist_add(n, sgl, prp1, trans_len, is_write); + + len -= trans_len; + if (len) { + if (!prp2) + goto error; + if (len > n->page_size) { + u64 prp_list[n->max_prp_ents]; + u16 nents, prp_trans; + int i = 0; + + nents = (len + n->page_size - 1) >> n->page_bits; + prp_trans = min(n->max_prp_ents, nents) * sizeof(u64); + nvmet_vhost_read(&n->dev, prp2, (void *)prp_list, prp_trans); + + while (len != 0) { + u64 prp_ent = le64_to_cpu(prp_list[i]); + + if (i == n->max_prp_ents - 1 && len > n->page_size) { + if (!prp_ent || prp_ent & (n->page_size - 1)) + goto error; + i = 0; + nents = (len + n->page_size - 1) >> n->page_bits; + prp_trans = min(n->max_prp_ents, nents) * sizeof(u64); + nvmet_vhost_read(&n->dev, prp_ent, (void *)prp_list, prp_trans); + prp_ent = le64_to_cpu(prp_list[i]); + } + + if (!prp_ent || prp_ent & (n->page_size - 1)) + goto error; + + trans_len = min(len, n->page_size); + nvmet_vhost_sglist_add(n, sgl, prp_ent, trans_len, is_write); + sgl++; + len -= trans_len; + i++; + } + } else { + if (prp2 & (n->page_size - 1)) + goto error; + nvmet_vhost_sglist_add(n, sgl, prp2, trans_len, is_write); + } + } + + return num_prps; + +error: + return -1; +} + +static void nvmet_vhost_process_sq(struct nvmet_vhost_sq *sq) +{ + struct nvmet_vhost_ctrl *n = sq->ctrl; + struct nvmet_vhost_cq *cq = n->cqs[sq->sq.qid]; + struct nvmet_vhost_iod *iod; + struct nvme_command *cmd; + int ret; + + mutex_lock(&sq->lock); + + while (!(nvmet_vhost_sq_empty(sq) || list_empty(&sq->req_list))) { + u64 addr = sq->dma_addr + sq->head * n->sqe_size;; + + nvmet_vhost_inc_sq_head(sq); + iod = list_first_entry(&sq->req_list, + struct nvmet_vhost_iod, entry); + list_del(&iod->entry); + mutex_unlock(&sq->lock); + + cmd = &iod->cmd; + ret = nvmet_vhost_read(&n->dev, addr, + (void *)cmd, sizeof(*cmd)); + if (ret) { + pr_warn("nvmet_vhost_read fail\n"); + goto out; + } + + ret = nvmet_req_init(&iod->req, &cq->cq, &sq->sq, + nvmet_vhost_queue_response); + if (ret) { + pr_warn("nvmet_req_init error: ret 0x%x, qid %d\n", ret, sq->sq.qid); + goto out; + } + if (iod->req.data_len) { + ret = nvmet_vhost_map_prp(n, iod->sg, cmd->common.prp1, + cmd->common.prp2, iod->req.data_len); + if (ret > 0) { + iod->req.sg = iod->sg; + iod->req.sg_cnt = ret; + } else { + pr_warn("map prp error\n"); + goto out; + } + } + iod->req.execute(&iod->req); + mutex_lock(&sq->lock); + } + +unlock: + sq->scheduled = 0; + mutex_unlock(&sq->lock); + return; + +out: + mutex_lock(&sq->lock); + list_add_tail(&iod->entry, &sq->req_list); + goto unlock; +} + +static int nvmet_vhost_sq_thread(void *opaque) +{ + struct nvmet_vhost_sq *sq = opaque; + + while (1) { + if (kthread_should_stop()) + break; + + nvmet_vhost_process_sq(sq); + + schedule(); + } + + return 0; +} + static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq, struct nvmet_vhost_ctrl *n, u64 dma_addr, u16 cqid, u16 size, struct eventfd_ctx *eventfd, @@ -228,6 +535,12 @@ static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq, cq->eventfd = eventfd; n->cqs[cqid] = cq; + spin_lock_init(&cq->lock); + INIT_LIST_HEAD(&cq->req_list); + INIT_LIST_HEAD(&cq->sq_list); + cq->scheduled = 0; + cq->thread = kthread_create(nvmet_vhost_cq_thread, cq, "nvmet_vhost_cq"); + nvmet_cq_init(n->ctrl, &cq->cq, cqid, size); return 0; @@ -237,12 +550,36 @@ static int 
nvmet_vhost_init_sq(struct nvmet_vhost_sq *sq, struct nvmet_vhost_ctrl *n, u64 dma_addr, u16 sqid, u16 cqid, u16 size) { + struct nvmet_vhost_cq *cq; + struct nvmet_vhost_iod *iod; + int i; + sq->ctrl = n; sq->dma_addr = dma_addr; sq->cqid = cqid; sq->head = sq->tail = 0; n->sqs[sqid] = sq; + mutex_init(&sq->lock); + INIT_LIST_HEAD(&sq->req_list); + sq->io_req = kmalloc(sizeof(struct nvmet_vhost_iod) * size, GFP_KERNEL); + if (!sq->io_req) + return -ENOMEM; + for (i = 0; i < size; i++) { + iod = &sq->io_req[i]; + + iod->req.cmd = &iod->cmd; + iod->req.rsp = &iod->rsp; + iod->sq = sq; + list_add_tail(&iod->entry, &sq->req_list); + } + sq->scheduled = 0; + sq->thread = kthread_create(nvmet_vhost_sq_thread, sq, "nvmet_vhost_sq"); + + cq = n->cqs[cqid]; + list_add_tail(&sq->entry, &cq->sq_list); + n->sqs[sqid] = sq; + nvmet_sq_init(n->ctrl, &sq->sq, sqid, size); return 0; @@ -564,12 +901,84 @@ static int nvmet_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val) return status; } +static int nvmet_vhost_process_db(struct nvmet_ctrl *ctrl, int offset, u64 val) +{ + u16 qid; + + if (offset & ((1 << 2) - 1)) + return -EINVAL; + + if (((offset - 0x1000) >> 2) & 1) { + u16 new_head = val & 0xffff; + int start_sqs; + struct nvmet_vhost_cq *vcq; + struct nvmet_cq *cq; + unsigned long flags; + + qid = (offset - (0x1000 + (1 << 2))) >> 3; + if (nvmet_vhost_check_cqid(ctrl, qid)) + return -EINVAL; + + cq = ctrl->cqs[qid]; + if (new_head >= cq->size) + return -EINVAL; + + vcq = cq_to_vcq(cq); + spin_lock_irqsave(&vcq->lock, flags); + start_sqs = nvmet_vhost_cq_full(vcq) ? 1 : 0; + vcq->head = new_head; + spin_unlock_irqrestore(&vcq->lock, flags); + if (start_sqs) { + struct nvmet_vhost_sq *sq; + struct list_head *p; + + list_for_each(p, &vcq->sq_list) { + sq = list_entry(p, struct nvmet_vhost_sq, entry); + if (!sq->scheduled) { + sq->scheduled = 1; + wake_up_process(sq->thread); + } + } + if (!vcq->scheduled) { + vcq->scheduled = 1; + wake_up_process(vcq->thread); + } + } + + if (vcq->tail != vcq->head) + eventfd_signal(vcq->eventfd, 1); + } else { + struct nvmet_vhost_sq *vsq; + struct nvmet_sq *sq; + u16 new_tail = val & 0xffff; + + qid = (offset - 0x1000) >> 3; + if (nvmet_vhost_check_sqid(ctrl, qid)) + return -EINVAL; + + sq = ctrl->sqs[qid]; + if (new_tail >= sq->size) + return -ENOSPC; + + vsq = sq_to_vsq(sq); + mutex_lock(&vsq->lock); + vsq->tail = new_tail; + if (!vsq->scheduled) { + vsq->scheduled = 1; + wake_up_process(vsq->thread); + } + mutex_unlock(&vsq->lock); + } + + return 0; +} + static int nvmet_vhost_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val) { if (offset < 0x1000) return nvmet_bar_write(n, offset, val); - - return -1; + else + return nvmet_vhost_process_db(n->ctrl, offset, val); } static int nvmet_vhost_ioc_bar(struct nvmet_vhost_ctrl *n, void __user *argp) @@ -612,6 +1021,8 @@ static void nvme_free_sq(struct nvmet_vhost_sq *sq, struct nvmet_vhost_ctrl *n) { n->sqs[sq->sq.qid] = NULL; + kthread_stop(sq->thread); + kfree(sq->io_req); if (sq->sq.qid) kfree(sq); } @@ -620,6 +1031,7 @@ static void nvme_free_cq(struct nvmet_vhost_cq *cq, struct nvmet_vhost_ctrl *n) { n->cqs[cq->cq.qid] = NULL; + kthread_stop(cq->thread); if (cq->cq.qid) kfree(cq); } -- 1.9.1
Christoph Hellwig
2015-Nov-20 05:13 UTC
[RFC PATCH 4/9] nvmet: add a controller "start" hook
On Thu, Nov 19, 2015 at 04:21:03PM -0800, Ming Lin wrote:
> #define NVMET_SUBSYS_NAME_LEN 256
> 	char subsys_name[NVMET_SUBSYS_NAME_LEN];
> +
> +	void *opaque;
> +	void (*start)(void *);
> };

Why can't vhost use container_of to get at the containing structure,
similar to what the loop driver does?

In addition I think we'll eventually need an ops structure here, but I
can take care of that later.
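For illustration, a minimal sketch of the alternative Christoph is
suggesting, assuming the vhost transport embedded the generic
controller the way nvme-loop does; everything here other than
struct nvmet_ctrl and container_of() is made up for the example.

/*
 * Hypothetical sketch only: replace the opaque pointer + callback with
 * an embedded struct nvmet_ctrl and container_of().  Assumes the core
 * would let the transport own the containing allocation, as nvme-loop
 * does; the surrounding fields are illustrative.
 */
struct nvmet_vhost_ctrl {
	struct vhost_dev	dev;
	struct nvmet_ctrl	ctrl;	/* embedded, not a pointer */
	/* ... transport-private state ... */
};

static void nvmet_vhost_start_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_vhost_ctrl *n =
		container_of(ctrl, struct nvmet_vhost_ctrl, ctrl);

	/* controller-specific start code, using n directly */
}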
Christoph Hellwig
2015-Nov-20 05:16 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
Thanks Ming,

from a first quick view this looks great. I'll look over it in a bit
more detail once I get a bit more time.
Ming Lin
2015-Nov-20 05:33 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On Fri, 2015-11-20 at 06:16 +0100, Christoph Hellwig wrote:
> Thanks Ming,
>
> from a first quick view this looks great. I'll look over it in a bit
> more detail once I get a bit more time.

Thanks to CC Nic :-)

But funny, I double-checked bash history. I actually CCed Nic.
Don't know why it's lost.

mlin at ssi:~$ history |grep "nab"
 1861  git send-email --from "Ming Lin <mlin at kernel.org>" --to "linux-nvme at lists.infradead.org" --cc "qemu-devel at nongnu.org" --cc "virtualization at lists.linux-foundation.org" --cc "Christoph Hellwig <hch at lst.de>" --cc "Nicholas A. Bellinger <nab at linux-iscsi.org>" --compose ~/patches/*.patch
Paolo Bonzini
2015-Nov-21 13:11 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On 20/11/2015 01:20, Ming Lin wrote:
> One improvement could be to use google's NVMe vendor extension that
> I sent in another thread, also here:
> https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-google-ext
>
> Qemu side:
> http://www.minggr.net/cgit/cgit.cgi/qemu/log/?h=vhost-nvme.0
> Kernel side also here:
> https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=vhost-nvme.0

How much do you get with vhost-nvme plus vendor extension, compared to
190 MB/s for QEMU?

Note that in all likelihood QEMU can actually do better than 190 MB/s,
and gain more parallelism too, by moving the processing of the
ioeventfds to a separate thread. This is similar to
hw/block/dataplane/virtio-blk.c.

It's actually pretty easy to do. Even though
hw/block/dataplane/virtio-blk.c is still using some old APIs, all
memory access in QEMU is now thread-safe. I have pending patches for
2.6 that cut that file down to a mere 200 lines of code; NVMe would
probably be about the same.

Paolo
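For illustration, a generic plain-C sketch (not QEMU code) of the idea
Paolo describes: the doorbell/MMIO handler only signals an eventfd, and
a dedicated I/O thread drains it and does the queue processing, keeping
the vCPU exit short. process_queues() is a placeholder name.

#include <pthread.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

static void process_queues(void)
{
	/* placeholder for the real submission/completion queue work */
}

static void *iothread(void *arg)
{
	int efd = *(int *)arg;
	uint64_t cnt;

	/* read() blocks until the eventfd is signalled, then resets it */
	while (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		process_queues();
	return NULL;
}

int main(void)
{
	int efd = eventfd(0, 0);
	uint64_t one = 1;
	pthread_t tid;

	pthread_create(&tid, NULL, iothread, &efd);

	/* a doorbell-write handler (or a KVM ioeventfd) just does this: */
	write(efd, &one, sizeof(one));

	pthread_join(tid, NULL);	/* toy loop never exits on its own */
	return 0;
}

With KVM's ioeventfd the write side is performed by the kernel on the
guest's MMIO access, so the vCPU never has to return to userspace just
to ring the doorbell.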
Paolo Bonzini
2015-Dec-01 16:02 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On 01/12/2015 00:20, Ming Lin wrote:
> qemu-nvme: 148MB/s
> vhost-nvme + google-ext: 230MB/s
> qemu-nvme + google-ext + eventfd: 294MB/s
> virtio-scsi: 296MB/s
> virtio-blk: 344MB/s
>
> "vhost-nvme + google-ext" didn't get good enough performance.

I'd expect it to be on par with qemu-nvme with ioeventfd, but the
question is: why should it be better? For vhost-net, the answer is that
more zerocopy can be done if you put the data path in the kernel.

But qemu-nvme is already using io_submit for the data path, perhaps
there's not much to gain from vhost-nvme...

Paolo

> Still tuning.
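For context, a minimal libaio sketch of the io_submit() data path Paolo
refers to; the file path and transfer size are illustrative (build with
-laio).

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd;

	if (io_setup(128, &ctx) < 0)
		return 1;

	fd = open("/tmp/test.img", O_RDWR | O_DIRECT);
	if (fd < 0)
		return 1;

	/* O_DIRECT needs an aligned buffer */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;

	io_prep_pread(&cb, fd, buf, 4096, 0);	/* queue a 4k read at offset 0 */
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;

	io_getevents(ctx, 1, 1, &ev, NULL);	/* wait for the completion */
	printf("read returned %ld\n", (long)ev.res);

	io_destroy(ctx);
	return 0;
}

The point is that qemu-nvme already bypasses buffered I/O this way, so
moving the data path into the kernel mainly saves MMIO exits rather
than data copies.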
Ming Lin
2015-Dec-01 16:26 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On Tue, 2015-12-01 at 17:02 +0100, Paolo Bonzini wrote:
>
> On 01/12/2015 00:20, Ming Lin wrote:
> > qemu-nvme: 148MB/s
> > vhost-nvme + google-ext: 230MB/s
> > qemu-nvme + google-ext + eventfd: 294MB/s
> > virtio-scsi: 296MB/s
> > virtio-blk: 344MB/s
> >
> > "vhost-nvme + google-ext" didn't get good enough performance.
>
> I'd expect it to be on par with qemu-nvme with ioeventfd, but the
> question is: why should it be better? For vhost-net, the answer is
> that more zerocopy can be done if you put the data path in the kernel.
>
> But qemu-nvme is already using io_submit for the data path, perhaps
> there's not much to gain from vhost-nvme...

What do you think about virtio-nvme + vhost-nvme?

I also have a patch for virtio-nvme:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-split/virtio

Just need to change vhost-nvme to work with it.

>
> Paolo
>
> > Still tuning.