Ming Lin
2015-Nov-20 00:20 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
Hi,

This is the first attempt to add a new qemu nvme backend using the in-kernel nvme target.

Most of the code is ported from qemu-nvme, and some code is borrowed from Hannes Reinecke's rts-megasas. It's similar to vhost-scsi, but doesn't use virtio. The advantage is that the guest can run an unmodified NVMe driver, so the guest can be any OS that has an NVMe driver.

The goal is to get performance as good as vhost-scsi, but for now performance is poor; MMIO is the bottleneck. One improvement could be to use Google's NVMe vendor extension that I sent in another thread, also here:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-google-ext

Qemu side:
http://www.minggr.net/cgit/cgit.cgi/qemu/log/?h=vhost-nvme.0

Kernel side also here:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=vhost-nvme.0

Thanks for any comment,
Ming
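As a rough illustration of the intended flow (a sketch only, not code from the QEMU branch above; the subsystem name is a placeholder and error handling is omitted), the VMM opens the vhost-nvme misc device and configures it with the ioctls added later in this series:

#include <fcntl.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int main(void)
{
	struct vhost_nvme_target target;
	struct nvmet_vhost_eventfd efd;
	int vhost_fd, call_fd;

	vhost_fd = open("/dev/vhost-nvme", O_RDWR);
	ioctl(vhost_fd, VHOST_SET_OWNER, NULL);

	/* Bind this device to an nvmet subsystem configured on the host. */
	memset(&target, 0, sizeof(target));
	strncpy(target.vhost_wwpn, "nqn.2015-11.org.example:testnqn",
		sizeof(target.vhost_wwpn) - 1);
	ioctl(vhost_fd, VHOST_NVME_SET_ENDPOINT, &target);

	/* One eventfd per completion queue, used as the interrupt line. */
	call_fd = eventfd(0, 0);
	memset(&efd, 0, sizeof(efd));
	efd.num = 0;		/* admin queue */
	efd.fd = call_fd;
	ioctl(vhost_fd, VHOST_NVME_SET_EVENTFD, &efd);

	/* Guest BAR accesses are then forwarded with VHOST_NVME_BAR,
	 * added later in the series. */
	return 0;
}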
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/Kconfig | 11 +++++++++++
drivers/nvme/target/Makefile | 2 ++
drivers/nvme/target/vhost.c | 16 ++++++++++++++++
3 files changed, 29 insertions(+)
create mode 100644 drivers/nvme/target/vhost.c
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 1bf92db..22760f5 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -12,3 +12,14 @@ config NVME_TARGET_LOOP
to test NVMe host and target side features.
If unsure, say N.
+
+config NVME_TARGET_VHOST
+ tristate "NVMe vhost support"
+ depends on BLK_DEV_NVME
+ select NVME_TARGET
+ select VHOST
+ select VHOST_RING
+ help
+ This enables NVMe vhost support.
+
+ If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 21e9134..1d8d523 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -1,6 +1,8 @@
obj-$(CONFIG_NVME_TARGET) += nvmet.o
obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o
+obj-$(CONFIG_NVME_TARGET_VHOST) += nvme-vhost.o
nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o
nvme-loop-y += loop.o
+nvme-vhost-y += vhost.o
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
new file mode 100644
index 0000000..623af00
--- /dev/null
+++ b/drivers/nvme/target/vhost.c
@@ -0,0 +1,16 @@
+#include <linux/module.h>
+
+static int __init nvmet_vhost_init(void)
+{
+ return 0;
+}
+module_init(nvmet_vhost_init);
+
+static void nvmet_vhost_exit(void)
+{
+}
+module_exit(nvmet_vhost_exit);
+
+MODULE_AUTHOR("Ming Lin <ming.l at ssi.samsung.com>");
+MODULE_LICENSE("GPL v2");
+
--
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/core.c | 1 +
drivers/nvme/target/vhost.c | 264 +++++++++++++++++++++++++++++++++++++++++++-
include/uapi/linux/vhost.h | 15 +++
3 files changed, 279 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 5c770bf..1bfef66 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -378,6 +378,7 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
{
kref_put(&ctrl->ref, nvmet_ctrl_free);
}
+EXPORT_SYMBOL_GPL(nvmet_ctrl_put);
struct nvmet_subsys *nvmet_find_subsys(char *subsys_name)
{
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 623af00..fa2e668 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -1,13 +1,275 @@
#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include "../../vhost/vhost.h"
+#include "nvmet.h"
-static int __init nvmet_vhost_init(void)
+struct nvmet_vhost_ctrl_eventfd {
+ struct file *call;
+ struct eventfd_ctx *call_ctx;
+ int __user *irq_enabled;
+ int __user *vector;
+};
+
+struct nvmet_vhost_cq {
+ struct nvmet_cq cq;
+
+ struct eventfd_ctx *eventfd;
+};
+
+struct nvmet_vhost_sq {
+ struct nvmet_sq sq;
+};
+
+struct nvmet_vhost_ctrl {
+ struct vhost_dev dev;
+ struct nvmet_vhost_ctrl_eventfd *eventfd;
+
+ u16 cntlid;
+ struct nvmet_ctrl *ctrl;
+ u32 num_queues;
+
+ struct nvmet_vhost_cq **cqs;
+ struct nvmet_vhost_sq **sqs;
+};
+
+static int
+nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
+ struct vhost_nvme_target *c)
{
+ struct nvmet_subsys *subsys;
+ struct nvmet_ctrl *ctrl;
+ int num_queues;
+ int ret = 0;
+
+ subsys = nvmet_find_subsys(c->vhost_wwpn);
+ if (!subsys) {
+ pr_warn("connect request for invalid subsystem!\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&subsys->lock);
+ ctrl = nvmet_alloc_ctrl(subsys, c->vhost_wwpn);
+ if (IS_ERR(ctrl)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ n->cntlid = ctrl->cntlid;
+ n->ctrl = ctrl;
+ n->num_queues = subsys->max_qid + 1;
+
+ num_queues = ctrl->subsys->max_qid + 1;
+ n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL);
+ if (!n->cqs) {
+ ret = -ENOMEM;
+ goto out_ctrl_put;
+ }
+ n->sqs = kzalloc(sizeof(*n->sqs) * num_queues, GFP_KERNEL);
+ if (!n->sqs) {
+ ret = -ENOMEM;
+ goto free_cqs;
+ }
+
+ n->eventfd = kmalloc(sizeof(struct nvmet_vhost_ctrl_eventfd)
+ * num_queues, GFP_KERNEL);
+ if (!n->eventfd) {
+ ret = -ENOMEM;
+ goto free_sqs;
+ }
+
+ mutex_unlock(&subsys->lock);
return 0;
+
+free_sqs:
+ kfree(n->sqs);
+
+free_cqs:
+ kfree(n->cqs);
+
+out_ctrl_put:
+ nvmet_ctrl_put(ctrl);
+
+out_unlock:
+ mutex_unlock(&subsys->lock);
+ return ret;
+}
+
+static int nvmet_vhost_set_eventfd(struct nvmet_vhost_ctrl *n, void __user *argp)
+{
+ struct nvmet_vhost_eventfd eventfd;
+ int num;
+ int ret;
+
+ ret = copy_from_user(&eventfd, argp, sizeof(struct nvmet_vhost_eventfd));
+ if (unlikely(ret))
+ return ret;
+
+ num = eventfd.num;
+ if (num > n->ctrl->subsys->max_qid)
+ return -EINVAL;
+
+ n->eventfd[num].call = eventfd_fget(eventfd.fd);
+ if (IS_ERR(n->eventfd[num].call))
+ return -EBADF;
+ n->eventfd[num].call_ctx = eventfd_ctx_fileget(n->eventfd[num].call);
+ if (IS_ERR(n->eventfd[num].call_ctx)) {
+ fput(n->eventfd[num].call);
+ return -EBADF;
+ }
+
+ n->eventfd[num].irq_enabled = eventfd.irq_enabled;
+ n->eventfd[num].vector = eventfd.vector;
+
+ return 0;
+}
+
+static int nvmet_vhost_open(struct inode *inode, struct file *f)
+{
+ struct nvmet_vhost_ctrl *n = kzalloc(sizeof(*n), GFP_KERNEL);
+
+ if (!n)
+ return -ENOMEM;
+
+ /* We don't use virtqueue */
+ vhost_dev_init(&n->dev, NULL, 0);
+ f->private_data = n;
+
+ return 0;
+}
+
+static void nvme_free_sq(struct nvmet_vhost_sq *sq,
+ struct nvmet_vhost_ctrl *n)
+{
+ n->sqs[sq->sq.qid] = NULL;
+ if (sq->sq.qid)
+ kfree(sq);
+}
+
+static void nvme_free_cq(struct nvmet_vhost_cq *cq,
+ struct nvmet_vhost_ctrl *n)
+{
+ n->cqs[cq->cq.qid] = NULL;
+ if (cq->cq.qid)
+ kfree(cq);
+}
+
+static void nvmet_vhost_clear_ctrl(struct nvmet_vhost_ctrl *n)
+{
+ int i;
+
+ for (i = 0; i < n->num_queues; i++) {
+ if (n->sqs[i] != NULL)
+ nvme_free_sq(n->sqs[i], n);
+ }
+ for (i = 0; i < n->num_queues; i++) {
+ if (n->cqs[i] != NULL)
+ nvme_free_cq(n->cqs[i], n);
+ }
+
+ kfree(n->eventfd);
+ kfree(n->cqs);
+ kfree(n->sqs);
+ nvmet_ctrl_put(n->ctrl);
+}
+
+static void nvmet_vhost_clear_eventfd(struct nvmet_vhost_ctrl *n)
+{
+ int i;
+
+ for (i = 0; i < n->num_queues; i++) {
+ if (n->eventfd[i].call_ctx) {
+ eventfd_ctx_put(n->eventfd[i].call_ctx);
+ fput(n->eventfd[i].call);
+ }
+ }
+}
+
+static int nvmet_vhost_release(struct inode *inode, struct file *f)
+{
+ struct nvmet_vhost_ctrl *n = f->private_data;
+
+ nvmet_vhost_clear_eventfd(n);
+ nvmet_vhost_clear_ctrl(n);
+
+ vhost_dev_stop(&n->dev);
+ vhost_dev_cleanup(&n->dev, false);
+
+ kfree(n);
+ return 0;
+}
+
+static long nvmet_vhost_ioctl(struct file *f, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct nvmet_vhost_ctrl *n = f->private_data;
+ void __user *argp = (void __user *)arg;
+ u64 __user *featurep = argp;
+ u64 features;
+ int r;
+
+ switch (ioctl) {
+ case VHOST_NVME_SET_ENDPOINT:
+ {
+ struct vhost_nvme_target conf;
+ if (copy_from_user(&conf, argp, sizeof(conf)))
+ return -EFAULT;
+
+ return nvmet_vhost_set_endpoint(n, &conf);
+ }
+ case VHOST_NVME_SET_EVENTFD:
+ r = nvmet_vhost_set_eventfd(n, argp);
+ return r;
+ case VHOST_GET_FEATURES:
+ features = VHOST_FEATURES;
+ if (copy_to_user(featurep, &features, sizeof(features)))
+ return -EFAULT;
+ return 0;
+ default:
+ mutex_lock(&n->dev.mutex);
+ r = vhost_dev_ioctl(&n->dev, ioctl, argp);
+ mutex_unlock(&n->dev.mutex);
+ return r;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long nvmet_vhost_compat_ioctl(struct file *f, unsigned int ioctl,
+ unsigned long arg)
+{
+ return nvmet_vhost_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations nvmet_vhost_fops = {
+ .owner = THIS_MODULE,
+ .release = nvmet_vhost_release,
+ .unlocked_ioctl = nvmet_vhost_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = nvmet_vhost_compat_ioctl,
+#endif
+ .open = nvmet_vhost_open,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice nvmet_vhost_misc = {
+ MISC_DYNAMIC_MINOR,
+ "vhost-nvme",
+ &nvmet_vhost_fops,
+};
+
+static int __init nvmet_vhost_init(void)
+{
+ return misc_register(&nvmet_vhost_misc);
}
module_init(nvmet_vhost_init);
static void nvmet_vhost_exit(void)
{
+ misc_deregister(&nvmet_vhost_misc);
}
module_exit(nvmet_vhost_exit);
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index ab373191..ae4b619 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -169,4 +169,19 @@ struct vhost_scsi_target {
#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
+struct vhost_nvme_target {
+ char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */
+};
+
+struct nvmet_vhost_eventfd {
+ int num;
+ int fd;
+ int *irq_enabled;
+ int *vector;
+};
+
+#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x47, struct vhost_nvme_target)
+#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x48, struct vhost_nvme_target)
+#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x45, struct nvmet_vhost_eventfd)
+
#endif
--
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/vhost.c | 102 ++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/vhost.h | 17 ++++++--
2 files changed, 116 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index fa2e668..01c44b8 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -8,6 +8,8 @@
#include "../../vhost/vhost.h"
#include "nvmet.h"
+#define NVMET_VHOST_AQ_DEPTH 256
+
struct nvmet_vhost_ctrl_eventfd {
struct file *call;
struct eventfd_ctx *call_ctx;
@@ -35,6 +37,10 @@ struct nvmet_vhost_ctrl {
struct nvmet_vhost_cq **cqs;
struct nvmet_vhost_sq **sqs;
+
+ u32 aqa;
+ u64 asq;
+ u64 acq;
};
static int
@@ -127,6 +133,100 @@ static int nvmet_vhost_set_eventfd(struct nvmet_vhost_ctrl *n, void __user *argp)
return 0;
}
+static int nvmet_vhost_bar_read(struct nvmet_ctrl *ctrl, int offset, u64 *val)
+{
+ int status = NVME_SC_SUCCESS;
+
+ switch(offset) {
+ case NVME_REG_CAP:
+ *val = ctrl->cap;
+ break;
+ case NVME_REG_CAP+4:
+ *val = ctrl->cap >> 32;
+ break;
+ case NVME_REG_VS:
+ *val = ctrl->subsys->ver;
+ break;
+ case NVME_REG_CC:
+ *val = ctrl->cc;
+ break;
+ case NVME_REG_CSTS:
+ *val = ctrl->csts;
+ break;
+ case NVME_REG_AQA:
+ *val = (NVMET_VHOST_AQ_DEPTH - 1) |
+ (((NVMET_VHOST_AQ_DEPTH - 1) << 16));
+ break;
+ default:
+ printk("Unknown offset: 0x%x\n", offset);
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ break;
+ }
+
+ return status;
+}
+
+static int nvmet_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val)
+{
+ struct nvmet_ctrl *ctrl = n->ctrl;
+ int status = NVME_SC_SUCCESS;
+
+ switch(offset) {
+ case NVME_REG_CC:
+ nvmet_update_cc(ctrl, val);
+ break;
+ case NVME_REG_AQA:
+ n->aqa = val & 0xffffffff;
+ break;
+ case NVME_REG_ASQ:
+ n->asq = val;
+ break;
+ case NVME_REG_ASQ + 4:
+ n->asq |= val << 32;
+ break;
+ case NVME_REG_ACQ:
+ n->acq = val;
+ break;
+ case NVME_REG_ACQ + 4:
+ n->acq |= val << 32;
+ break;
+ default:
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ break;
+ }
+
+ return status;
+}
+
+static int nvmet_vhost_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val)
+{
+ if (offset < 0x1000)
+ return nvmet_bar_write(n, offset, val);
+
+ return -1;
+}
+
+static int nvmet_vhost_ioc_bar(struct nvmet_vhost_ctrl *n, void __user *argp)
+{
+ struct nvmet_vhost_bar bar;
+ struct nvmet_vhost_bar __user *user_bar = argp;
+ int ret = -EINVAL;
+
+ ret = copy_from_user(&bar, argp, sizeof(bar));
+ if (unlikely(ret))
+ return ret;
+
+ if (bar.type == VHOST_NVME_BAR_READ) {
+ u64 val;
+ ret = nvmet_vhost_bar_read(n->ctrl, bar.offset, &val);
+ if (ret != NVME_SC_SUCCESS)
+ return ret;
+ ret = copy_to_user(&user_bar->val, &val, sizeof(u64));
+ } else if (bar.type == VHOST_NVME_BAR_WRITE)
+ ret = nvmet_vhost_bar_write(n, bar.offset, bar.val);
+
+ return ret;
+}
+
static int nvmet_vhost_open(struct inode *inode, struct file *f)
{
struct nvmet_vhost_ctrl *n = kzalloc(sizeof(*n), GFP_KERNEL);
@@ -223,6 +323,8 @@ static long nvmet_vhost_ioctl(struct file *f, unsigned int ioctl,
case VHOST_NVME_SET_EVENTFD:
r = nvmet_vhost_set_eventfd(n, argp);
return r;
+ case VHOST_NVME_BAR:
+ return nvmet_vhost_ioc_bar(n, argp);
case VHOST_GET_FEATURES:
features = VHOST_FEATURES;
if (copy_to_user(featurep, &features, sizeof(features)))
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index ae4b619..a0cefcc 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -180,8 +180,19 @@ struct nvmet_vhost_eventfd {
int *vector;
};
-#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x47, struct vhost_nvme_target)
-#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x48, struct vhost_nvme_target)
-#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x45, struct nvmet_vhost_eventfd)
+#define VHOST_NVME_BAR_READ 0
+#define VHOST_NVME_BAR_WRITE 1
+
+struct nvmet_vhost_bar {
+ int type; /* read/write */
+ u64 offset;
+ unsigned size;
+ u64 val;
+};
+
+#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x45, struct vhost_nvme_target)
+#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x46, struct vhost_nvme_target)
+#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x47, struct nvmet_vhost_eventfd)
+#define VHOST_NVME_BAR _IOW(VHOST_VIRTIO, 0x48, struct nvmet_vhost_bar)
#endif
--
1.9.1
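For illustration, a hypothetical userspace helper (not the actual QEMU code) that relays a trapped guest MMIO access to the kernel target through the new VHOST_NVME_BAR ioctl could look like this; vhost_fd is the open /dev/vhost-nvme descriptor:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

static int nvme_bar_write(int vhost_fd, uint64_t offset, unsigned size,
			  uint64_t val)
{
	struct nvmet_vhost_bar bar = {
		.type	= VHOST_NVME_BAR_WRITE,
		.offset	= offset,
		.size	= size,
		.val	= val,
	};

	return ioctl(vhost_fd, VHOST_NVME_BAR, &bar);
}

static int nvme_bar_read(int vhost_fd, uint64_t offset, unsigned size,
			 uint64_t *val)
{
	struct nvmet_vhost_bar bar = {
		.type	= VHOST_NVME_BAR_READ,
		.offset	= offset,
		.size	= size,
	};
	int ret = ioctl(vhost_fd, VHOST_NVME_BAR, &bar);

	/* On a read, the kernel copies the register value back into ->val. */
	if (!ret)
		*val = bar.val;
	return ret;
}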
From: Ming Lin <ming.l at ssi.samsung.com>
This is used to execute controller-specific start code.
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/core.c | 3 +++
drivers/nvme/target/nvmet.h | 3 +++
2 files changed, 6 insertions(+)
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 1bfef66..0a0fc48 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -251,6 +251,9 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
}
ctrl->csts = NVME_CSTS_RDY;
+
+ if (ctrl->start)
+ ctrl->start(ctrl->opaque);
}
static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 9335584..eac008b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -59,6 +59,9 @@ struct nvmet_ctrl {
struct kref ref;
#define NVMET_SUBSYS_NAME_LEN 256
char subsys_name[NVMET_SUBSYS_NAME_LEN];
+
+ void *opaque;
+ void (*start)(void *);
};
struct nvmet_subsys {
--
1.9.1
Ming Lin
2015-Nov-20 00:21 UTC
[RFC PATCH 5/9] nvme-vhost: add controller "start" callback
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/vhost.c | 106 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 01c44b8..4a147d6 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -10,6 +10,35 @@
#define NVMET_VHOST_AQ_DEPTH 256
+enum NvmeCcShift {
+ CC_MPS_SHIFT = 7,
+ CC_IOSQES_SHIFT = 16,
+ CC_IOCQES_SHIFT = 20,
+};
+
+enum NvmeCcMask {
+ CC_MPS_MASK = 0xf,
+ CC_IOSQES_MASK = 0xf,
+ CC_IOCQES_MASK = 0xf,
+};
+
+#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK)
+#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK)
+#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK)
+
+enum NvmeAqaShift {
+ AQA_ASQS_SHIFT = 0,
+ AQA_ACQS_SHIFT = 16,
+};
+
+enum NvmeAqaMask {
+ AQA_ASQS_MASK = 0xfff,
+ AQA_ACQS_MASK = 0xfff,
+};
+
+#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
+#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
+
struct nvmet_vhost_ctrl_eventfd {
struct file *call;
struct eventfd_ctx *call_ctx;
@@ -19,12 +48,23 @@ struct nvmet_vhost_ctrl_eventfd {
struct nvmet_vhost_cq {
struct nvmet_cq cq;
+ struct nvmet_vhost_ctrl *ctrl;
+ u32 head;
+ u32 tail;
+ u8 phase;
+ u64 dma_addr;
struct eventfd_ctx *eventfd;
};
struct nvmet_vhost_sq {
struct nvmet_sq sq;
+ struct nvmet_vhost_ctrl *ctrl;
+
+ u32 head;
+ u32 tail;
+ u64 dma_addr;
+ u16 cqid;
};
struct nvmet_vhost_ctrl {
@@ -37,12 +77,76 @@ struct nvmet_vhost_ctrl {
struct nvmet_vhost_cq **cqs;
struct nvmet_vhost_sq **sqs;
+ struct nvmet_vhost_cq admin_cq;
+ struct nvmet_vhost_sq admin_sq;
u32 aqa;
u64 asq;
u64 acq;
+ u16 cqe_size;
+ u16 sqe_size;
+ u16 max_prp_ents;
+ u16 page_bits;
+ u32 page_size;
};
+static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq,
+ struct nvmet_vhost_ctrl *n, u64 dma_addr,
+ u16 cqid, u16 size, struct eventfd_ctx *eventfd,
+ u16 vector, u16 irq_enabled)
+{
+ cq->ctrl = n;
+ cq->dma_addr = dma_addr;
+ cq->phase = 1;
+ cq->head = cq->tail = 0;
+ cq->eventfd = eventfd;
+ n->cqs[cqid] = cq;
+
+ nvmet_cq_init(n->ctrl, &cq->cq, cqid, size);
+
+ return 0;
+}
+
+static int nvmet_vhost_init_sq(struct nvmet_vhost_sq *sq,
+ struct nvmet_vhost_ctrl *n, u64 dma_addr,
+ u16 sqid, u16 cqid, u16 size)
+{
+ sq->ctrl = n;
+ sq->dma_addr = dma_addr;
+ sq->cqid = cqid;
+ sq->head = sq->tail = 0;
+ n->sqs[sqid] = sq;
+
+ nvmet_sq_init(n->ctrl, &sq->sq, sqid, size);
+
+ return 0;
+}
+
+static void nvmet_vhost_start_ctrl(void *opaque)
+{
+ struct nvmet_vhost_ctrl *n = opaque;
+ u32 page_bits = NVME_CC_MPS(n->ctrl->cc) + 12;
+ u32 page_size = 1 << page_bits;
+ int ret;
+
+ n->page_bits = page_bits;
+ n->page_size = page_size;
+ n->max_prp_ents = n->page_size / sizeof(uint64_t);
+ n->cqe_size = 1 << NVME_CC_IOCQES(n->ctrl->cc);
+ n->sqe_size = 1 << NVME_CC_IOSQES(n->ctrl->cc);
+
+ nvmet_vhost_init_cq(&n->admin_cq, n, n->acq, 0,
+ NVME_AQA_ACQS(n->aqa) + 1, n->eventfd[0].call_ctx,
+ 0, 1);
+
+ ret = nvmet_vhost_init_sq(&n->admin_sq, n, n->asq, 0, 0,
+ NVME_AQA_ASQS(n->aqa) + 1);
+ if (ret) {
+ pr_warn("nvmet_vhost_init_sq failed!!!\n");
+ BUG_ON(1);
+ }
+}
+
static int
nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
struct vhost_nvme_target *c)
@@ -67,6 +171,8 @@ nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
n->cntlid = ctrl->cntlid;
n->ctrl = ctrl;
n->num_queues = subsys->max_qid + 1;
+ ctrl->opaque = n;
+ ctrl->start = nvmet_vhost_start_ctrl;
num_queues = ctrl->subsys->max_qid + 1;
n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL);
--
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
This is used to execute controller-specific command parse code.
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/admin-cmd.c | 7 +++++++
drivers/nvme/target/nvmet.h | 3 +++
2 files changed, 10 insertions(+)
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index d9db0d4..f009c77 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -346,6 +346,13 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
req->data = 0;
return 0;
#endif
+ default:
+ if (req->sq->ctrl->parse_extra_admin_cmd) {
+ int ret = req->sq->ctrl->parse_extra_admin_cmd(req);
+
+ if (!ret)
+ return 0;
+ }
}
pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index eac008b..ef79813 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -44,6 +44,8 @@ struct nvmet_sq {
u16 size;
};
+struct nvmet_req;
+
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_cq **cqs;
@@ -62,6 +64,7 @@ struct nvmet_ctrl {
void *opaque;
void (*start)(void *);
+ int (*parse_extra_admin_cmd)(struct nvmet_req *);
};
struct nvmet_subsys {
--
1.9.1
Ming Lin
2015-Nov-20 00:21 UTC
[RFC PATCH 7/9] nvme-vhost: add "parse_extra_admin_cmd" callback
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/vhost.c | 153 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 153 insertions(+)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 4a147d6..04ed0bc 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -39,6 +39,11 @@ enum NvmeAqaMask {
#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
+#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1)
+#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
+
+#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1)
+
struct nvmet_vhost_ctrl_eventfd {
struct file *call;
struct eventfd_ctx *call_ctx;
@@ -90,6 +95,19 @@ struct nvmet_vhost_ctrl {
u32 page_size;
};
+#define sq_to_vsq(sq) container_of(sq, struct nvmet_vhost_sq, sq)
+#define cq_to_vcq(cq) container_of(cq, struct nvmet_vhost_cq, cq)
+
+static int nvmet_vhost_check_sqid(struct nvmet_ctrl *n, u16 sqid)
+{
+ return sqid <= n->subsys->max_qid && n->sqs[sqid] != NULL ? 0 : -1;
+}
+
+static int nvmet_vhost_check_cqid(struct nvmet_ctrl *n, u16 cqid)
+{
+ return cqid <= n->subsys->max_qid && n->cqs[cqid] != NULL ? 0 : -1;
+}
+
static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq,
struct nvmet_vhost_ctrl *n, u64 dma_addr,
u16 cqid, u16 size, struct eventfd_ctx *eventfd,
@@ -147,6 +165,140 @@ static void nvmet_vhost_start_ctrl(void *opaque)
}
}
+static void nvmet_vhost_create_cq(struct nvmet_req *req)
+{
+ struct nvmet_cq *cq;
+ struct nvmet_vhost_cq *vcq;
+ struct nvmet_vhost_ctrl *n;
+ struct nvme_create_cq *c;
+ u16 cqid;
+ u16 vector;
+ u16 qsize;
+ u16 qflags;
+ u64 prp1;
+ int status;
+ int ret;
+
+ cq = req->cq;
+ vcq = cq_to_vcq(cq);
+ n = vcq->ctrl;
+ c = &req->cmd->create_cq;
+ cqid = le16_to_cpu(c->cqid);
+ vector = le16_to_cpu(c->irq_vector);
+ qsize = le16_to_cpu(c->qsize);
+ qflags = le16_to_cpu(c->cq_flags);
+ prp1 = le64_to_cpu(c->prp1);
+ status = NVME_SC_SUCCESS;
+
+ if (!cqid || (cqid && !nvmet_vhost_check_cqid(n->ctrl, cqid))) {
+ status = NVME_SC_QID_INVALID | NVME_SC_DNR;
+ goto out;
+ }
+ if (!qsize || qsize > NVME_CAP_MQES(n->ctrl->cap)) {
+ status = NVME_SC_QUEUE_SIZE | NVME_SC_DNR;
+ goto out;
+ }
+ if (!prp1) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ goto out;
+ }
+ if (vector > n->num_queues) {
+ status = NVME_SC_INVALID_VECTOR | NVME_SC_DNR;
+ goto out;
+ }
+ if (!(NVME_CQ_FLAGS_PC(qflags))) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ goto out;
+ }
+
+ vcq = kmalloc(sizeof(*vcq), GFP_KERNEL);
+ if (!vcq) {
+ status = NVME_SC_INTERNAL | NVME_SC_DNR;
+ goto out;
+ }
+
+ ret = nvmet_vhost_init_cq(vcq, n, prp1, cqid, qsize+1,
+ n->eventfd[cqid].call_ctx, vector,
+ NVME_CQ_FLAGS_IEN(qflags));
+ if (ret)
+ status = NVME_SC_INTERNAL | NVME_SC_DNR;
+
+out:
+ nvmet_req_complete(req, status);
+}
+
+static void nvmet_vhost_create_sq(struct nvmet_req *req)
+{
+ struct nvme_create_sq *c = &req->cmd->create_sq;
+ u16 cqid = le16_to_cpu(c->cqid);
+ u16 sqid = le16_to_cpu(c->sqid);
+ u16 qsize = le16_to_cpu(c->qsize);
+ u16 qflags = le16_to_cpu(c->sq_flags);
+ u64 prp1 = le64_to_cpu(c->prp1);
+
+ struct nvmet_sq *sq = req->sq;
+ struct nvmet_vhost_sq *vsq;
+ struct nvmet_vhost_ctrl *n;
+ int status;
+ int ret;
+
+ status = NVME_SC_SUCCESS;
+ vsq = sq_to_vsq(sq);
+ n = vsq->ctrl;
+
+ if (!cqid || nvmet_vhost_check_cqid(n->ctrl, cqid)) {
+ status = NVME_SC_CQ_INVALID | NVME_SC_DNR;
+ goto out;
+ }
+ if (!sqid || (sqid && !nvmet_vhost_check_sqid(n->ctrl, sqid))) {
+ status = NVME_SC_QID_INVALID | NVME_SC_DNR;
+ goto out;
+ }
+ if (!qsize || qsize > NVME_CAP_MQES(n->ctrl->cap)) {
+ status = NVME_SC_QUEUE_SIZE | NVME_SC_DNR;
+ goto out;
+ }
+ if (!prp1 || prp1 & (n->page_size - 1)) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ goto out;
+ }
+ if (!(NVME_SQ_FLAGS_PC(qflags))) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ goto out;
+ }
+
+ vsq = kmalloc(sizeof(*vsq), GFP_KERNEL);
+ if (!vsq) {
+ status = NVME_SC_INTERNAL | NVME_SC_DNR;
+ goto out;
+ }
+
+ ret = nvmet_vhost_init_sq(vsq, n, prp1, sqid, cqid, qsize + 1);
+ if (ret)
+ status = NVME_SC_INTERNAL | NVME_SC_DNR;
+
+out:
+ nvmet_req_complete(req, status);
+}
+
+static int nvmet_vhost_parse_admin_cmd(struct nvmet_req *req)
+{
+ struct nvme_command *cmd = req->cmd;
+
+ switch (cmd->common.opcode) {
+ case nvme_admin_create_cq:
+ req->execute = nvmet_vhost_create_cq;
+ req->data_len = 0;
+ return 0;
+ case nvme_admin_create_sq:
+ req->execute = nvmet_vhost_create_sq;
+ req->data_len = 0;
+ return 0;
+ }
+
+ return -1;
+}
+
static int
nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
struct vhost_nvme_target *c)
@@ -173,6 +325,7 @@ nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
n->num_queues = subsys->max_qid + 1;
ctrl->opaque = n;
ctrl->start = nvmet_vhost_start_ctrl;
+ ctrl->parse_extra_admin_cmd = nvmet_vhost_parse_admin_cmd;
num_queues = ctrl->subsys->max_qid + 1;
n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL);
--
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
This borrows code from Hannes Reinecke's rts-megasas.
Cc: Hannes Reinecke <hare at suse.de>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/vhost.c | 108 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 108 insertions(+)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 04ed0bc..6847c86 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -5,6 +5,7 @@
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/file.h>
+#include <linux/highmem.h>
#include "../../vhost/vhost.h"
#include "nvmet.h"
@@ -95,6 +96,113 @@ struct nvmet_vhost_ctrl {
u32 page_size;
};
+const struct vhost_memory_region *
+find_region(struct vhost_dev *hba, __u64 addr, __u32 len)
+{
+ struct vhost_memory *mem;
+ struct vhost_memory_region *reg;
+ int i;
+
+ if (!hba->memory)
+ return NULL;
+
+ mem = hba->memory;
+ /* linear search is not brilliant, but we really have on the order of 6
+ * regions in practice */
+ for (i = 0; i < mem->nregions; ++i) {
+ reg = mem->regions + i;
+ if (reg->guest_phys_addr <= addr &&
+ reg->guest_phys_addr + reg->memory_size - 1 >= addr)
+ return reg;
+ }
+ return NULL;
+}
+
+static bool check_region_boundary(const struct vhost_memory_region *reg,
+ uint64_t addr, size_t len)
+{
+ unsigned long max_size;
+
+ max_size = reg->memory_size - addr + reg->guest_phys_addr;
+ return (max_size < len);
+}
+
+static void __user *map_to_region(const struct vhost_memory_region *reg,
+ uint64_t addr)
+{
+ return (void __user *)(unsigned long)
+ (reg->userspace_addr + addr - reg->guest_phys_addr);
+}
+
+static void __user *map_guest_to_host(struct vhost_dev *dev,
+ uint64_t addr, int size)
+{
+ const struct vhost_memory_region *reg = NULL;
+
+ reg = find_region(dev, addr, size);
+ if (unlikely(!reg))
+ return ERR_PTR(-EPERM);
+
+ if (unlikely(check_region_boundary(reg, addr, size)))
+ return ERR_PTR(-EFAULT);
+
+ return map_to_region(reg, addr);
+}
+
+static int nvmet_vhost_rw(struct vhost_dev *dev, u64 guest_pa,
+ void *buf, uint32_t size, int write)
+{
+ void __user *host_user_va;
+ void *host_kernel_va;
+ struct page *page;
+ uintptr_t offset;
+ int ret;
+
+ host_user_va = map_guest_to_host(dev, guest_pa, size);
+ if (unlikely(IS_ERR(host_user_va))) {
+ pr_warn("cannot map guest addr %p, error %ld\n",
+ (void *)guest_pa, PTR_ERR(host_user_va));
+ return -EINVAL;
+ }
+
+ ret = get_user_pages(current, dev->mm,
+ (unsigned long)host_user_va, 1,
+ false, 0, &page, NULL);
+ if (unlikely(ret != 1)) {
+ pr_warn("get_user_pages fail!!!\n");
+ return -EINVAL;
+ }
+
+ host_kernel_va = kmap(page);
+ if (unlikely(!host_kernel_va)) {
+ pr_warn("kmap fail!!!\n");
+ put_page(page);
+ return -EINVAL;
+ }
+
+ offset = (uintptr_t)host_user_va & ~PAGE_MASK;
+ if (write)
+ memcpy(host_kernel_va + offset, buf, size);
+ else
+ memcpy(buf, host_kernel_va + offset, size);
+ kunmap(host_kernel_va);
+ put_page(page);
+
+ return 0;
+}
+
+int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa,
+ void *buf, uint32_t size)
+{
+ return nvmet_vhost_rw(dev, guest_pa, buf, size, 0);
+}
+
+int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa,
+ void *buf, uint32_t size)
+{
+ return nvmet_vhost_rw(dev, guest_pa, buf, size, 1);
+}
+
#define sq_to_vsq(sq) container_of(sq, struct nvmet_vhost_sq, sq)
#define cq_to_vcq(cq) container_of(cq, struct nvmet_vhost_cq, cq)
--
1.9.1
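For illustration (all numbers below are made up), this is how map_guest_to_host() resolves a guest physical address against one of the memory regions userspace registered with VHOST_SET_MEM_TABLE:

/*
 * Example region, as registered by the VMM:
 *   reg.guest_phys_addr = 0x100000000    (guest PA where the region starts)
 *   reg.memory_size     = 0x40000000     (1 GB)
 *   reg.userspace_addr  = 0x7f2a00000000 (VMM virtual address of the region)
 *
 * A guest PA of 0x100200000 falls inside that region, so:
 *   hva = userspace_addr + (gpa - guest_phys_addr)
 *       = 0x7f2a00000000 + 0x200000
 *       = 0x7f2a00200000
 *
 * nvmet_vhost_rw() then pins the page behind that user address with
 * get_user_pages(), kmap()s it and memcpy()s to or from the offset
 * within the page.
 */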
From: Ming Lin <ming.l at ssi.samsung.com>
This adds NVMe submission/completion queue handlers, which are ported from qemu-nvme, and hooks into nvme-target to do the real work.
Cc: Keith Busch <keith.busch at intel.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/target/vhost.c | 420 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 416 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 6847c86..3ce1348 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -6,10 +6,12 @@
#include <linux/mutex.h>
#include <linux/file.h>
#include <linux/highmem.h>
+#include <linux/kthread.h>
#include "../../vhost/vhost.h"
#include "nvmet.h"
#define NVMET_VHOST_AQ_DEPTH 256
+#define NVMET_VHOST_MAX_SEGMENTS 32
enum NvmeCcShift {
CC_MPS_SHIFT = 7,
@@ -52,6 +54,15 @@ struct nvmet_vhost_ctrl_eventfd {
int __user *vector;
};
+struct nvmet_vhost_iod {
+ struct nvmet_vhost_sq *sq;
+ struct scatterlist sg[NVMET_VHOST_MAX_SEGMENTS];
+ struct nvme_command cmd;
+ struct nvme_completion rsp;
+ struct nvmet_req req;
+ struct list_head entry;
+};
+
struct nvmet_vhost_cq {
struct nvmet_cq cq;
struct nvmet_vhost_ctrl *ctrl;
@@ -61,6 +72,12 @@ struct nvmet_vhost_cq {
u8 phase;
u64 dma_addr;
struct eventfd_ctx *eventfd;
+
+ struct list_head sq_list;
+ struct list_head req_list;
+ spinlock_t lock;
+ struct task_struct *thread;
+ int scheduled;
};
struct nvmet_vhost_sq {
@@ -71,6 +88,13 @@ struct nvmet_vhost_sq {
u32 tail;
u64 dma_addr;
u16 cqid;
+
+ struct nvmet_vhost_iod *io_req;
+ struct list_head req_list;
+ struct list_head entry;
+ struct mutex lock;
+ struct task_struct *thread;
+ int scheduled;
};
struct nvmet_vhost_ctrl {
@@ -191,13 +215,13 @@ static int nvmet_vhost_rw(struct vhost_dev *dev, u64 guest_pa,
return 0;
}
-int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa,
+static int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa,
void *buf, uint32_t size)
{
return nvmet_vhost_rw(dev, guest_pa, buf, size, 0);
}
-int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa,
+static int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa,
void *buf, uint32_t size)
{
return nvmet_vhost_rw(dev, guest_pa, buf, size, 1);
@@ -216,6 +240,289 @@ static int nvmet_vhost_check_cqid(struct nvmet_ctrl *n, u16 cqid)
return cqid <= n->subsys->max_qid && n->cqs[cqid] != NULL ? 0 : -1;
}
+static void nvmet_vhost_inc_cq_tail(struct nvmet_vhost_cq *cq)
+{
+ cq->tail++;
+ if (cq->tail >= cq->cq.size) {
+ cq->tail = 0;
+ cq->phase = !cq->phase;
+ }
+}
+
+static void nvmet_vhost_inc_sq_head(struct nvmet_vhost_sq *sq)
+{
+ sq->head = (sq->head + 1) % sq->sq.size;
+}
+
+static uint8_t nvmet_vhost_cq_full(struct nvmet_vhost_cq *cq)
+{
+ return (cq->tail + 1) % cq->cq.size == cq->head;
+}
+
+static uint8_t nvmet_vhost_sq_empty(struct nvmet_vhost_sq *sq)
+{
+ return sq->head == sq->tail;
+}
+
+static void nvmet_vhost_post_cqes(struct nvmet_vhost_cq *cq)
+{
+ struct nvmet_vhost_ctrl *n = cq->ctrl;
+ struct nvmet_vhost_iod *req;
+ struct list_head *p, *tmp;
+ int signal = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ list_for_each_safe(p, tmp, &cq->req_list) {
+ struct nvmet_vhost_sq *sq;
+ u64 addr;
+
+ if (nvmet_vhost_cq_full(cq))
+ goto unlock;
+
+ req = list_entry(p, struct nvmet_vhost_iod, entry);
+ list_del(p);
+
+ sq = req->sq;
+ req->rsp.status |= cq->phase;
+ req->rsp.sq_id = cpu_to_le16(sq->sq.qid);
+ req->rsp.sq_head = cpu_to_le16(sq->head);
+ addr = cq->dma_addr + cq->tail * n->cqe_size;
+ nvmet_vhost_inc_cq_tail(cq);
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ nvmet_vhost_write(&n->dev, addr, (void *)&req->rsp,
+ sizeof(req->rsp));
+
+ mutex_lock(&sq->lock);
+ list_add_tail(p, &sq->req_list);
+ mutex_unlock(&sq->lock);
+
+ signal = 1;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ }
+
+ if (signal)
+ eventfd_signal(cq->eventfd, 1);
+
+unlock:
+ cq->scheduled = 0;
+ spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+static int nvmet_vhost_cq_thread(void *arg)
+{
+ struct nvmet_vhost_cq *sq = arg;
+
+ while (1) {
+ if (kthread_should_stop())
+ break;
+
+ nvmet_vhost_post_cqes(sq);
+
+ schedule();
+ }
+
+ return 0;
+}
+
+static void nvmet_vhost_enqueue_req_completion(
+ struct nvmet_vhost_cq *cq, struct nvmet_vhost_iod *iod)
+{
+ unsigned long flags;
+
+ BUG_ON(cq->cq.qid != iod->sq->sq.qid);
+ spin_lock_irqsave(&cq->lock, flags);
+ list_add_tail(&iod->entry, &cq->req_list);
+ if (!cq->scheduled) {
+ wake_up_process(cq->thread);
+ cq->scheduled = 1;
+ }
+ spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+static void nvmet_vhost_queue_response(struct nvmet_req *req)
+{
+ struct nvmet_vhost_iod *iod = container_of(req, struct nvmet_vhost_iod, req);
+ struct nvmet_vhost_sq *sq = iod->sq;
+ struct nvmet_vhost_ctrl *n = sq->ctrl;
+ struct nvmet_vhost_cq *cq = n->cqs[sq->sq.qid];
+
+ nvmet_vhost_enqueue_req_completion(cq, iod);
+}
+
+static int nvmet_vhost_sglist_add(struct nvmet_vhost_ctrl *n, struct scatterlist *sg,
+ u64 guest_addr, int len, int is_write)
+{
+ void __user *host_addr;
+ struct page *page;
+ unsigned int offset, nbytes;
+ int ret;
+
+ host_addr = map_guest_to_host(&n->dev, guest_addr, len);
+ if (unlikely(IS_ERR(host_addr))) {
+ pr_warn("cannot map guest addr %p, error %ld\n",
+ (void *)guest_addr, PTR_ERR(host_addr));
+ return PTR_ERR(host_addr);
+ }
+
+ ret = get_user_pages(current, n->dev.mm, (unsigned long)host_addr, 1,
+ is_write, 0, &page, NULL);
+ BUG_ON(ret == 0); /* we should either get our page or fail */
+ if (ret < 0) {
+ pr_warn("get_user_pages faild: host_addr %p, %d\n",
+ host_addr, ret);
+ return ret;
+ }
+
+ offset = (uintptr_t)host_addr & ~PAGE_MASK;
+ nbytes = min_t(unsigned int, PAGE_SIZE - offset, len);
+ sg_set_page(sg, page, nbytes, offset);
+
+ return 0;
+}
+
+static int nvmet_vhost_map_prp(struct nvmet_vhost_ctrl *n, struct scatterlist *sgl,
+ u64 prp1, u64 prp2, unsigned int len)
+{
+ unsigned int trans_len = n->page_size - (prp1 % n->page_size);
+ int num_prps = (len >> n->page_bits) + 1;
+ //FIXME
+ int is_write = 1;
+
+ trans_len = min(len, trans_len);
+ if (!prp1)
+ return -1;
+
+ sg_init_table(sgl, num_prps);
+
+ nvmet_vhost_sglist_add(n, sgl, prp1, trans_len, is_write);
+
+ len -= trans_len;
+ if (len) {
+ if (!prp2)
+ goto error;
+ if (len > n->page_size) {
+ u64 prp_list[n->max_prp_ents];
+ u16 nents, prp_trans;
+ int i = 0;
+
+ nents = (len + n->page_size - 1) >> n->page_bits;
+ prp_trans = min(n->max_prp_ents, nents) * sizeof(u64);
+ nvmet_vhost_read(&n->dev, prp2, (void *)prp_list, prp_trans);
+
+ while (len != 0) {
+ u64 prp_ent = le64_to_cpu(prp_list[i]);
+
+ if (i == n->max_prp_ents - 1 && len > n->page_size) {
+ if (!prp_ent || prp_ent & (n->page_size - 1))
+ goto error;
+ i = 0;
+ nents = (len + n->page_size - 1) >> n->page_bits;
+ prp_trans = min(n->max_prp_ents, nents) * sizeof(u64);
+ nvmet_vhost_read(&n->dev, prp_ent, (void *)prp_list, prp_trans);
+ prp_ent = le64_to_cpu(prp_list[i]);
+ }
+
+ if (!prp_ent || prp_ent & (n->page_size - 1))
+ goto error;
+
+ trans_len = min(len, n->page_size);
+ nvmet_vhost_sglist_add(n, sgl, prp_ent, trans_len, is_write);
+ sgl++;
+ len -= trans_len;
+ i++;
+ }
+ } else {
+ if (prp2 & (n->page_size - 1))
+ goto error;
+ nvmet_vhost_sglist_add(n, sgl, prp2, trans_len, is_write);
+ }
+ }
+
+ return num_prps;
+
+error:
+ return -1;
+}
+
+static void nvmet_vhost_process_sq(struct nvmet_vhost_sq *sq)
+{
+ struct nvmet_vhost_ctrl *n = sq->ctrl;
+ struct nvmet_vhost_cq *cq = n->cqs[sq->sq.qid];
+ struct nvmet_vhost_iod *iod;
+ struct nvme_command *cmd;
+ int ret;
+
+ mutex_lock(&sq->lock);
+
+ while (!(nvmet_vhost_sq_empty(sq) || list_empty(&sq->req_list))) {
+ u64 addr = sq->dma_addr + sq->head * n->sqe_size;
+
+ nvmet_vhost_inc_sq_head(sq);
+ iod = list_first_entry(&sq->req_list,
+ struct nvmet_vhost_iod, entry);
+ list_del(&iod->entry);
+ mutex_unlock(&sq->lock);
+
+ cmd = &iod->cmd;
+ ret = nvmet_vhost_read(&n->dev, addr,
+ (void *)cmd, sizeof(*cmd));
+ if (ret) {
+ pr_warn("nvmet_vhost_read fail\n");
+ goto out;
+ }
+
+ ret = nvmet_req_init(&iod->req, &cq->cq, &sq->sq,
+ nvmet_vhost_queue_response);
+ if (ret) {
+ pr_warn("nvmet_req_init error: ret 0x%x, qid %d\n", ret,
sq->sq.qid);
+ goto out;
+ }
+ if (iod->req.data_len) {
+ ret = nvmet_vhost_map_prp(n, iod->sg, cmd->common.prp1,
+ cmd->common.prp2, iod->req.data_len);
+ if (ret > 0) {
+ iod->req.sg = iod->sg;
+ iod->req.sg_cnt = ret;
+ } else {
+ pr_warn("map prp error\n");
+ goto out;
+ }
+ }
+ iod->req.execute(&iod->req);
+ mutex_lock(&sq->lock);
+ }
+
+unlock:
+ sq->scheduled = 0;
+ mutex_unlock(&sq->lock);
+ return;
+
+out:
+ mutex_lock(&sq->lock);
+ list_add_tail(&iod->entry, &sq->req_list);
+ goto unlock;
+}
+
+static int nvmet_vhost_sq_thread(void *opaque)
+{
+ struct nvmet_vhost_sq *sq = opaque;
+
+ while (1) {
+ if (kthread_should_stop())
+ break;
+
+ nvmet_vhost_process_sq(sq);
+
+ schedule();
+ }
+
+ return 0;
+}
+
static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq,
struct nvmet_vhost_ctrl *n, u64 dma_addr,
u16 cqid, u16 size, struct eventfd_ctx *eventfd,
@@ -228,6 +535,12 @@ static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq,
cq->eventfd = eventfd;
n->cqs[cqid] = cq;
+ spin_lock_init(&cq->lock);
+ INIT_LIST_HEAD(&cq->req_list);
+ INIT_LIST_HEAD(&cq->sq_list);
+ cq->scheduled = 0;
+ cq->thread = kthread_create(nvmet_vhost_cq_thread, cq, "nvmet_vhost_cq");
+
nvmet_cq_init(n->ctrl, &cq->cq, cqid, size);
return 0;
@@ -237,12 +550,36 @@ static int nvmet_vhost_init_sq(struct nvmet_vhost_sq *sq,
struct nvmet_vhost_ctrl *n, u64 dma_addr,
u16 sqid, u16 cqid, u16 size)
{
+ struct nvmet_vhost_cq *cq;
+ struct nvmet_vhost_iod *iod;
+ int i;
+
sq->ctrl = n;
sq->dma_addr = dma_addr;
sq->cqid = cqid;
sq->head = sq->tail = 0;
n->sqs[sqid] = sq;
+ mutex_init(&sq->lock);
+ INIT_LIST_HEAD(&sq->req_list);
+ sq->io_req = kmalloc(sizeof(struct nvmet_vhost_iod) * size, GFP_KERNEL);
+ if (!sq->io_req)
+ return -ENOMEM;
+ for (i = 0; i < size; i++) {
+ iod = &sq->io_req[i];
+
+ iod->req.cmd = &iod->cmd;
+ iod->req.rsp = &iod->rsp;
+ iod->sq = sq;
+ list_add_tail(&iod->entry, &sq->req_list);
+ }
+ sq->scheduled = 0;
+ sq->thread = kthread_create(nvmet_vhost_sq_thread, sq, "nvmet_vhost_sq");
+
+ cq = n->cqs[cqid];
+ list_add_tail(&sq->entry, &cq->sq_list);
+ n->sqs[sqid] = sq;
+
nvmet_sq_init(n->ctrl, &sq->sq, sqid, size);
return 0;
@@ -564,12 +901,84 @@ static int nvmet_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val)
return status;
}
+static int nvmet_vhost_process_db(struct nvmet_ctrl *ctrl, int offset, u64 val)
+{
+ u16 qid;
+
+ if (offset & ((1 << 2) - 1))
+ return -EINVAL;
+
+ if (((offset - 0x1000) >> 2) & 1) {
+ u16 new_head = val & 0xffff;
+ int start_sqs;
+ struct nvmet_vhost_cq *vcq;
+ struct nvmet_cq *cq;
+ unsigned long flags;
+
+ qid = (offset - (0x1000 + (1 << 2))) >> 3;
+ if (nvmet_vhost_check_cqid(ctrl, qid))
+ return -EINVAL;
+
+ cq = ctrl->cqs[qid];
+ if (new_head >= cq->size)
+ return -EINVAL;
+
+ vcq = cq_to_vcq(cq);
+ spin_lock_irqsave(&vcq->lock, flags);
+ start_sqs = nvmet_vhost_cq_full(vcq) ? 1 : 0;
+ vcq->head = new_head;
+ spin_unlock_irqrestore(&vcq->lock, flags);
+ if (start_sqs) {
+ struct nvmet_vhost_sq *sq;
+ struct list_head *p;
+
+ list_for_each(p, &vcq->sq_list) {
+ sq = list_entry(p, struct nvmet_vhost_sq, entry);
+ if (!sq->scheduled) {
+ sq->scheduled = 1;
+ wake_up_process(sq->thread);
+ }
+ }
+ if (!vcq->scheduled) {
+ vcq->scheduled = 1;
+ wake_up_process(vcq->thread);
+ }
+ }
+
+ if (vcq->tail != vcq->head)
+ eventfd_signal(vcq->eventfd, 1);
+ } else {
+ struct nvmet_vhost_sq *vsq;
+ struct nvmet_sq *sq;
+ u16 new_tail = val & 0xffff;
+
+ qid = (offset - 0x1000) >> 3;
+ if (nvmet_vhost_check_sqid(ctrl, qid))
+ return -EINVAL;
+
+ sq = ctrl->sqs[qid];
+ if (new_tail >= sq->size)
+ return -ENOSPC;
+
+ vsq = sq_to_vsq(sq);
+ mutex_lock(&vsq->lock);
+ vsq->tail = new_tail;
+ if (!vsq->scheduled) {
+ vsq->scheduled = 1;
+ wake_up_process(vsq->thread);
+ }
+ mutex_unlock(&vsq->lock);
+ }
+
+ return 0;
+}
+
static int nvmet_vhost_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val)
{
if (offset < 0x1000)
return nvmet_bar_write(n, offset, val);
-
- return -1;
+ else
+ return nvmet_vhost_process_db(n->ctrl, offset, val);
}
static int nvmet_vhost_ioc_bar(struct nvmet_vhost_ctrl *n, void __user *argp)
@@ -612,6 +1021,8 @@ static void nvme_free_sq(struct nvmet_vhost_sq *sq,
struct nvmet_vhost_ctrl *n)
{
n->sqs[sq->sq.qid] = NULL;
+ kthread_stop(sq->thread);
+ kfree(sq->io_req);
if (sq->sq.qid)
kfree(sq);
}
@@ -620,6 +1031,7 @@ static void nvme_free_cq(struct nvmet_vhost_cq *cq,
struct nvmet_vhost_ctrl *n)
{
n->cqs[cq->cq.qid] = NULL;
+ kthread_stop(cq->thread);
if (cq->cq.qid)
kfree(cq);
}
--
1.9.1
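To make the doorbell decoding in nvmet_vhost_process_db() easier to follow, here is a worked example assuming a doorbell stride of 4 bytes (CAP.DSTRD = 0), which is what the shifts in the code above assume:

/*
 * Doorbell layout with a 4-byte stride:
 *   SQ y tail doorbell: offset = 0x1000 + (2*y)     * 4
 *   CQ y head doorbell: offset = 0x1000 + (2*y + 1) * 4
 *
 * A guest write to offset 0x1008 is even in doorbell units
 * (((0x1008 - 0x1000) >> 2) & 1 == 0), so it is an SQ tail update:
 *   qid = (0x1008 - 0x1000) >> 3 = 1
 * A write to offset 0x100c is odd, so it is a CQ head update:
 *   qid = (0x100c - (0x1000 + (1 << 2))) >> 3 = 1
 * The new tail/head value is the low 16 bits of the written value.
 */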
Christoph Hellwig
2015-Nov-20 05:13 UTC
[RFC PATCH 4/9] nvmet: add a controller "start" hook
On Thu, Nov 19, 2015 at 04:21:03PM -0800, Ming Lin wrote:
> #define NVMET_SUBSYS_NAME_LEN 256
>         char subsys_name[NVMET_SUBSYS_NAME_LEN];
> +
> +       void *opaque;
> +       void (*start)(void *);
> };

Why can't vhost use container_of to get at the containing structure, similar to what the loop driver does?

In addition I think we'll eventually need an ops structure here, but I can take care of that later.
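A rough sketch of that suggestion (not code from this series; it assumes nvmet could be changed to let the transport embed the controller instead of allocating it separately):

struct nvmet_vhost_ctrl {
	struct nvmet_ctrl	ctrl;	/* embedded, not a pointer */
	/* ... vhost-specific fields (eventfds, queues, ...) ... */
};

/* The start hook would then take the nvmet_ctrl itself, no opaque needed. */
static void nvmet_vhost_start_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_vhost_ctrl *n =
		container_of(ctrl, struct nvmet_vhost_ctrl, ctrl);

	/* ... set up the admin SQ/CQ from n->aqa/n->asq/n->acq ... */
}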
Christoph Hellwig
2015-Nov-20 05:16 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
Thanks Ming,

From a first quick view this looks great. I'll look over it in a bit more detail once I get a bit more time.
Ming Lin
2015-Nov-20 05:33 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On Fri, 2015-11-20 at 06:16 +0100, Christoph Hellwig wrote:
> Thanks Ming,
>
> from a first quick view this looks great. I'll look over it in a bit
> more detail once I get a bit more time.

Thanks to CC Nic :-)

But funny, I double-checked bash history. I actually CCed Nic. Don't know why it's lost.

mlin at ssi:~$ history | grep "nab"
 1861  git send-email --from "Ming Lin <mlin at kernel.org>" --to "linux-nvme at lists.infradead.org" --cc "qemu-devel at nongnu.org" --cc "virtualization at lists.linux-foundation.org" --cc "Christoph Hellwig <hch at lst.de>" --cc "Nicholas A. Bellinger <nab at linux-iscsi.org>" --compose ~/patches/*.patch
Paolo Bonzini
2015-Nov-21 13:11 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On 20/11/2015 01:20, Ming Lin wrote:
> One improvement could be to use Google's NVMe vendor extension that
> I sent in another thread, also here:
> https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-google-ext
>
> Qemu side:
> http://www.minggr.net/cgit/cgit.cgi/qemu/log/?h=vhost-nvme.0
> Kernel side also here:
> https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=vhost-nvme.0

How much do you get with vhost-nvme plus vendor extension, compared to 190 MB/s for QEMU?

Note that in all likelihood, QEMU can actually do better than 190 MB/s, and gain more parallelism too, by moving the processing of the ioeventfds to a separate thread. This is similar to hw/block/dataplane/virtio-blk.c.

It's actually pretty easy to do. Even though hw/block/dataplane/virtio-blk.c is still using some old APIs, all memory access in QEMU is now thread-safe. I have pending patches for 2.6 that cut that file down to a mere 200 lines of code; NVMe would probably be about the same.

Paolo
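The general pattern (illustrative only, not QEMU's actual dataplane code) is a dedicated thread that blocks on the ioeventfd KVM signals for the doorbell write and drains the queue there, off the main loop:

#include <pthread.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

static void process_sq(void)
{
	/* submit the pending NVMe commands with io_submit() etc. */
}

static void *ioeventfd_thread(void *arg)
{
	int efd = *(int *)arg;
	uint64_t cnt;

	for (;;) {
		/* Blocks until the guest kicks the doorbell. */
		if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
			process_sq();
	}
	return NULL;
}

int start_dataplane(int *efd_out)
{
	static int efd;
	pthread_t tid;

	efd = eventfd(0, 0);	/* registered with KVM via KVM_IOEVENTFD elsewhere */
	*efd_out = efd;
	return pthread_create(&tid, NULL, ioeventfd_thread, &efd);
}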
Paolo Bonzini
2015-Dec-01 16:02 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On 01/12/2015 00:20, Ming Lin wrote:
> qemu-nvme: 148MB/s
> vhost-nvme + google-ext: 230MB/s
> qemu-nvme + google-ext + eventfd: 294MB/s
> virtio-scsi: 296MB/s
> virtio-blk: 344MB/s
>
> "vhost-nvme + google-ext" didn't get good enough performance.

I'd expect it to be on par with qemu-nvme plus ioeventfd, but the question is: why should it be better? For vhost-net, the answer is that more zerocopy can be done if you put the data path in the kernel.

But qemu-nvme is already using io_submit for the data path, so perhaps there's not much to gain from vhost-nvme...

Paolo

> Still tuning.
Ming Lin
2015-Dec-01 16:26 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On Tue, 2015-12-01 at 17:02 +0100, Paolo Bonzini wrote:
> On 01/12/2015 00:20, Ming Lin wrote:
> > qemu-nvme: 148MB/s
> > vhost-nvme + google-ext: 230MB/s
> > qemu-nvme + google-ext + eventfd: 294MB/s
> > virtio-scsi: 296MB/s
> > virtio-blk: 344MB/s
> >
> > "vhost-nvme + google-ext" didn't get good enough performance.
>
> I'd expect it to be on par with qemu-nvme plus ioeventfd, but the question
> is: why should it be better? For vhost-net, the answer is that more
> zerocopy can be done if you put the data path in the kernel.
>
> But qemu-nvme is already using io_submit for the data path, so perhaps
> there's not much to gain from vhost-nvme...

What do you think about virtio-nvme + vhost-nvme?

I also have a patch for virtio-nvme:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-split/virtio

Just need to change vhost-nvme to work with it.

> Paolo
>
> > Still tuning.