Ming Lin
2015-Nov-20  00:20 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
Hi,

This is the first attempt to add a new qemu nvme backend that uses the in-kernel nvme target.

Most of the code is ported from qemu-nvme, and some is borrowed from Hannes Reinecke's rts-megasas. It is similar to vhost-scsi, but it does not use virtio. The advantage is that the guest can run an unmodified NVMe driver, so the guest can be any OS that has an NVMe driver.

The goal is to reach the same performance as vhost-scsi, but for now performance is poor and MMIO is the bottleneck. One improvement could be to use Google's NVMe vendor extension that I sent in another thread, also here:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-google-ext

Qemu side:
http://www.minggr.net/cgit/cgit.cgi/qemu/log/?h=vhost-nvme.0
Kernel side also here:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=vhost-nvme.0

Thanks for any comments,
Ming
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/Kconfig  | 11 +++++++++++
 drivers/nvme/target/Makefile |  2 ++
 drivers/nvme/target/vhost.c  | 16 ++++++++++++++++
 3 files changed, 29 insertions(+)
 create mode 100644 drivers/nvme/target/vhost.c
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 1bf92db..22760f5 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -12,3 +12,14 @@ config NVME_TARGET_LOOP
 	  to test NVMe host and target side features.
 
 	  If unsure, say N.
+
+config NVME_TARGET_VHOST
+	tristate "NVMe vhost support"
+	depends on BLK_DEV_NVME
+	select NVME_TARGET
+	select VHOST
+	select VHOST_RING
+	help
+	  This enables NVMe vhost support.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 21e9134..1d8d523 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -1,6 +1,8 @@
 
 obj-$(CONFIG_NVME_TARGET)		+= nvmet.o
 obj-$(CONFIG_NVME_TARGET_LOOP)		+= nvme-loop.o
+obj-$(CONFIG_NVME_TARGET_VHOST)		+= nvme-vhost.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o io-cmd.o
 nvme-loop-y	+= loop.o
+nvme-vhost-y	+= vhost.o
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
new file mode 100644
index 0000000..623af00
--- /dev/null
+++ b/drivers/nvme/target/vhost.c
@@ -0,0 +1,16 @@
+#include <linux/module.h>
+
+static int __init nvmet_vhost_init(void)
+{
+	return 0;
+}
+module_init(nvmet_vhost_init);
+
+static void nvmet_vhost_exit(void)
+{
+}
+module_exit(nvmet_vhost_exit);
+
+MODULE_AUTHOR("Ming Lin <ming.l at ssi.samsung.com>");
+MODULE_LICENSE("GPL v2");
+
-- 
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/core.c  |   1 +
 drivers/nvme/target/vhost.c | 264 +++++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/vhost.h  |  15 +++
 3 files changed, 279 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 5c770bf..1bfef66 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -378,6 +378,7 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
 {
 	kref_put(&ctrl->ref, nvmet_ctrl_free);
 }
+EXPORT_SYMBOL_GPL(nvmet_ctrl_put);
 
 struct nvmet_subsys *nvmet_find_subsys(char *subsys_name)
 {
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 623af00..fa2e668 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -1,13 +1,275 @@
 #include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include "../../vhost/vhost.h"
+#include "nvmet.h"
 
-static int __init nvmet_vhost_init(void)
+struct nvmet_vhost_ctrl_eventfd {
+	struct file *call;
+	struct eventfd_ctx *call_ctx;
+	int __user *irq_enabled;
+	int __user *vector;
+};
+
+struct nvmet_vhost_cq {
+	struct nvmet_cq		cq;
+
+	struct eventfd_ctx	*eventfd;
+};
+
+struct nvmet_vhost_sq {
+	struct nvmet_sq		sq;
+};
+
+struct nvmet_vhost_ctrl {
+	struct vhost_dev dev;
+	struct nvmet_vhost_ctrl_eventfd *eventfd;
+
+	u16 cntlid;
+	struct nvmet_ctrl *ctrl;
+	u32 num_queues;
+
+	struct nvmet_vhost_cq **cqs;
+	struct nvmet_vhost_sq **sqs;
+};
+
+static int
+nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
+			struct vhost_nvme_target *c)
 {
+	struct nvmet_subsys *subsys;
+	struct nvmet_ctrl *ctrl;
+	int num_queues;
+	int ret = 0;
+
+	subsys = nvmet_find_subsys(c->vhost_wwpn);
+	if (!subsys) {
+		pr_warn("connect request for invalid subsystem!\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&subsys->lock);
+	ctrl = nvmet_alloc_ctrl(subsys, c->vhost_wwpn);
+	if (IS_ERR(ctrl)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	n->cntlid = ctrl->cntlid;
+	n->ctrl = ctrl;
+	n->num_queues = subsys->max_qid + 1;
+
+	num_queues = ctrl->subsys->max_qid + 1;
+	n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL);
+	if (!n->cqs) {
+		ret = -ENOMEM;
+		goto out_ctrl_put;
+	}
+	n->sqs = kzalloc(sizeof(*n->sqs) * num_queues, GFP_KERNEL);
+	if (!n->sqs) {
+		ret = -ENOMEM;
+		goto free_cqs;
+	}
+
+	n->eventfd = kmalloc(sizeof(struct nvmet_vhost_ctrl_eventfd)
+				* num_queues, GFP_KERNEL);
+	if (!n->eventfd) {
+		ret = -ENOMEM;
+		goto free_sqs;
+	}
+
+	mutex_unlock(&subsys->lock);
 	return 0;
+
+free_sqs:
+	kfree(n->sqs);
+
+free_cqs:
+	kfree(n->cqs);
+
+out_ctrl_put:
+	nvmet_ctrl_put(ctrl);
+
+out_unlock:
+	mutex_unlock(&subsys->lock);
+	return ret;
+}
+
+static int nvmet_vhost_set_eventfd(struct nvmet_vhost_ctrl *n, void __user *argp)
+{
+	struct nvmet_vhost_eventfd eventfd;
+	int num;
+	int ret;
+
+	ret = copy_from_user(&eventfd, argp, sizeof(struct nvmet_vhost_eventfd));
+	if (unlikely(ret))
+		return ret;
+
+	num = eventfd.num;
+	if (num > n->ctrl->subsys->max_qid)
+		return -EINVAL;
+
+	n->eventfd[num].call = eventfd_fget(eventfd.fd);
+	if (IS_ERR(n->eventfd[num].call))
+		return -EBADF;
+	n->eventfd[num].call_ctx = eventfd_ctx_fileget(n->eventfd[num].call);
+	if (IS_ERR(n->eventfd[num].call_ctx)) {
+		fput(n->eventfd[num].call);
+		return -EBADF;
+	}
+
+	n->eventfd[num].irq_enabled = eventfd.irq_enabled;
+	n->eventfd[num].vector = eventfd.vector;
+
+	return 0;
+}
+
+static int nvmet_vhost_open(struct inode *inode, struct file *f)
+{
+	struct nvmet_vhost_ctrl *n = kzalloc(sizeof(*n), GFP_KERNEL);
+
+	if (!n)
+		return -ENOMEM;
+
+	/* We don't use virtqueue */
+	vhost_dev_init(&n->dev, NULL, 0);
+	f->private_data = n;
+
+	return 0;
+}
+
+static void nvme_free_sq(struct nvmet_vhost_sq *sq,
+		struct nvmet_vhost_ctrl *n)
+{
+	n->sqs[sq->sq.qid] = NULL;
+	if (sq->sq.qid)
+		kfree(sq);
+}
+
+static void nvme_free_cq(struct nvmet_vhost_cq *cq,
+		struct nvmet_vhost_ctrl *n)
+{
+	n->cqs[cq->cq.qid] = NULL;
+	if (cq->cq.qid)
+		kfree(cq);
+}
+
+static void nvmet_vhost_clear_ctrl(struct nvmet_vhost_ctrl *n)
+{
+	int i;
+
+	for (i = 0; i < n->num_queues; i++) {
+		if (n->sqs[i] != NULL)
+			nvme_free_sq(n->sqs[i], n);
+	}
+	for (i = 0; i < n->num_queues; i++) {
+		if (n->cqs[i] != NULL)
+			nvme_free_cq(n->cqs[i], n);
+	}
+
+	kfree(n->eventfd);
+	kfree(n->cqs);
+	kfree(n->sqs);
+	nvmet_ctrl_put(n->ctrl);
+}
+
+static void nvmet_vhost_clear_eventfd(struct nvmet_vhost_ctrl *n)
+{
+	int i;
+
+	for (i = 0; i < n->num_queues; i++) {
+		if (n->eventfd[i].call_ctx) {
+			eventfd_ctx_put(n->eventfd[i].call_ctx);
+			fput(n->eventfd[i].call);
+		}
+	}
+}
+
+static int nvmet_vhost_release(struct inode *inode, struct file *f)
+{
+	struct nvmet_vhost_ctrl *n = f->private_data;
+
+	nvmet_vhost_clear_eventfd(n);
+	nvmet_vhost_clear_ctrl(n);
+
+	vhost_dev_stop(&n->dev);
+	vhost_dev_cleanup(&n->dev, false);
+
+	kfree(n);
+	return 0;
+}
+
+static long nvmet_vhost_ioctl(struct file *f, unsigned int ioctl,
+			     unsigned long arg)
+{
+	struct nvmet_vhost_ctrl *n = f->private_data;
+	void __user *argp = (void __user *)arg;
+	u64 __user *featurep = argp;
+	u64 features;
+	int r;
+
+	switch (ioctl) {
+	case VHOST_NVME_SET_ENDPOINT:
+	{
+		struct vhost_nvme_target conf;
+		if (copy_from_user(&conf, argp, sizeof(conf)))
+			return -EFAULT;
+
+		return nvmet_vhost_set_endpoint(n, &conf);
+	}
+	case VHOST_NVME_SET_EVENTFD:
+		r = nvmet_vhost_set_eventfd(n, argp);
+		return r;
+	case VHOST_GET_FEATURES:
+		features = VHOST_FEATURES;
+		if (copy_to_user(featurep, &features, sizeof(features)))
+			return -EFAULT;
+		return 0;
+	default:
+		mutex_lock(&n->dev.mutex);
+		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
+		mutex_unlock(&n->dev.mutex);
+		return r;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long nvmet_vhost_compat_ioctl(struct file *f, unsigned int ioctl,
+				   unsigned long arg)
+{
+	return nvmet_vhost_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations nvmet_vhost_fops = {
+	.owner          = THIS_MODULE,
+	.release        = nvmet_vhost_release,
+	.unlocked_ioctl = nvmet_vhost_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = nvmet_vhost_compat_ioctl,
+#endif
+	.open           = nvmet_vhost_open,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice nvmet_vhost_misc = {
+	MISC_DYNAMIC_MINOR,
+	"vhost-nvme",
+	&nvmet_vhost_fops,
+};
+
+static int __init nvmet_vhost_init(void)
+{
+	return misc_register(&nvmet_vhost_misc);
 }
 module_init(nvmet_vhost_init);
 
 static void nvmet_vhost_exit(void)
 {
+	misc_deregister(&nvmet_vhost_misc);
 }
 module_exit(nvmet_vhost_exit);
 
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index ab373191..ae4b619 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -169,4 +169,19 @@ struct vhost_scsi_target {
 #define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
 #define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
 
+struct vhost_nvme_target {
+	char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */
+};
+
+struct nvmet_vhost_eventfd {
+	int num;
+	int fd;
+	int *irq_enabled;
+	int *vector;
+};
+
+#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x47, struct vhost_nvme_target)
+#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x48, struct vhost_nvme_target)
+#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x45, struct nvmet_vhost_eventfd)
+
 #endif
-- 
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/vhost.c | 102 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vhost.h  |  17 ++++++--
 2 files changed, 116 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index fa2e668..01c44b8 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -8,6 +8,8 @@
 #include "../../vhost/vhost.h"
 #include "nvmet.h"
 
+#define NVMET_VHOST_AQ_DEPTH		256
+
 struct nvmet_vhost_ctrl_eventfd {
 	struct file *call;
 	struct eventfd_ctx *call_ctx;
@@ -35,6 +37,10 @@ struct nvmet_vhost_ctrl {
 
 	struct nvmet_vhost_cq **cqs;
 	struct nvmet_vhost_sq **sqs;
+
+	u32 aqa;
+	u64 asq;
+	u64 acq;
 };
 
 static int
@@ -127,6 +133,100 @@ static int nvmet_vhost_set_eventfd(struct nvmet_vhost_ctrl *n, void __user *argp
 	return 0;
 }
 
+static int nvmet_vhost_bar_read(struct nvmet_ctrl *ctrl, int offset, u64 *val)
+{
+	int status = NVME_SC_SUCCESS;
+
+	switch(offset) {
+	case NVME_REG_CAP:
+		*val = ctrl->cap;
+		break;
+	case NVME_REG_CAP+4:
+		*val = ctrl->cap >> 32;
+		break;
+	case NVME_REG_VS:
+		*val = ctrl->subsys->ver;
+		break;
+	case NVME_REG_CC:
+		*val = ctrl->cc;
+		break;
+	case NVME_REG_CSTS:
+		*val = ctrl->csts;
+		break;
+	case NVME_REG_AQA:
+		*val = (NVMET_VHOST_AQ_DEPTH - 1) |
+		      (((NVMET_VHOST_AQ_DEPTH - 1) << 16));
+		break;
+	default:
+		printk("Unknown offset: 0x%x\n", offset);
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		break;
+	}
+
+	return status;
+}
+
+static int nvmet_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val)
+{
+	struct nvmet_ctrl *ctrl = n->ctrl;
+	int status = NVME_SC_SUCCESS;
+
+	switch(offset) {
+	case NVME_REG_CC:
+		nvmet_update_cc(ctrl, val);
+		break;
+	case NVME_REG_AQA:
+		n->aqa = val & 0xffffffff;
+		break;
+	case NVME_REG_ASQ:
+		n->asq = val;
+		break;
+	case NVME_REG_ASQ + 4:
+		n->asq |= val << 32;
+		break;
+	case NVME_REG_ACQ:
+		n->acq = val;
+		break;
+	case NVME_REG_ACQ + 4:
+		n->acq |= val << 32;
+		break;
+	default:
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		break;
+	}
+
+	return status;
+}
+
+static int nvmet_vhost_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val)
+{
+	if (offset < 0x1000)
+		return nvmet_bar_write(n, offset, val);
+
+	return -1;
+}
+
+static int nvmet_vhost_ioc_bar(struct nvmet_vhost_ctrl *n, void __user *argp)
+{
+	struct nvmet_vhost_bar bar;
+	struct nvmet_vhost_bar __user *user_bar = argp;
+	int ret = -EINVAL;
+
+	ret = copy_from_user(&bar, argp, sizeof(bar));
+	if (unlikely(ret))
+		return ret;
+
+	if (bar.type == VHOST_NVME_BAR_READ) {
+		u64 val;
+		ret = nvmet_vhost_bar_read(n->ctrl, bar.offset, &val);
+		if (ret != NVME_SC_SUCCESS)
+			return ret;
+		ret = copy_to_user(&user_bar->val, &val, sizeof(u64));
+	} else if (bar.type == VHOST_NVME_BAR_WRITE)
+		ret = nvmet_vhost_bar_write(n, bar.offset, bar.val);
+
+	return ret;
+}
+
 static int nvmet_vhost_open(struct inode *inode, struct file *f)
 {
 	struct nvmet_vhost_ctrl *n = kzalloc(sizeof(*n), GFP_KERNEL);
@@ -223,6 +323,8 @@ static long nvmet_vhost_ioctl(struct file *f, unsigned int ioctl,
 	case VHOST_NVME_SET_EVENTFD:
 		r = nvmet_vhost_set_eventfd(n, argp);
 		return r;
+	case VHOST_NVME_BAR:
+		return nvmet_vhost_ioc_bar(n, argp);
 	case VHOST_GET_FEATURES:
 		features = VHOST_FEATURES;
 		if (copy_to_user(featurep, &features, sizeof(features)))
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index ae4b619..a0cefcc 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -180,8 +180,19 @@ struct nvmet_vhost_eventfd {
 	int *vector;
 };
 
-#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x47, struct vhost_nvme_target)
-#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x48, struct vhost_nvme_target)
-#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x45, struct nvmet_vhost_eventfd)
+#define VHOST_NVME_BAR_READ 0
+#define VHOST_NVME_BAR_WRITE 1
+
+struct nvmet_vhost_bar {
+	int type; /* read/write */
+	u64 offset;
+	unsigned size;
+	u64 val;
+};
+
+#define VHOST_NVME_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x45, struct vhost_nvme_target)
+#define VHOST_NVME_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x46, struct vhost_nvme_target)
+#define VHOST_NVME_SET_EVENTFD _IOW(VHOST_VIRTIO, 0x47, struct nvmet_vhost_eventfd)
+#define VHOST_NVME_BAR _IOW(VHOST_VIRTIO, 0x48, struct nvmet_vhost_bar)
 
 #endif
-- 
1.9.1
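For reference, here is a rough userspace sketch of how a VMM (the qemu side) might drive the character device with the ioctls added above. It is illustrative only: the helper name is made up, error handling is omitted, and it assumes a linux/vhost.h that carries the uapi additions from this series.

#include <fcntl.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Illustrative only: bind /dev/vhost-nvme to an nvmet subsystem and
 * register a completion eventfd for the admin queue pair. */
static int vhost_nvme_setup(const char *subsys_name)
{
	struct vhost_nvme_target tgt;
	struct nvmet_vhost_eventfd efd;
	int fd = open("/dev/vhost-nvme", O_RDWR);

	/* standard vhost ioctl, handled by vhost_dev_ioctl() in the default case */
	ioctl(fd, VHOST_SET_OWNER, NULL);

	memset(&tgt, 0, sizeof(tgt));
	strncpy(tgt.vhost_wwpn, subsys_name, sizeof(tgt.vhost_wwpn) - 1);
	ioctl(fd, VHOST_NVME_SET_ENDPOINT, &tgt);

	memset(&efd, 0, sizeof(efd));
	efd.num = 0;			/* queue 0: admin SQ/CQ */
	efd.fd = eventfd(0, 0);		/* signalled by the kernel on completions */
	ioctl(fd, VHOST_NVME_SET_EVENTFD, &efd);

	/* guest BAR accesses (CC, AQA, ASQ/ACQ, doorbells) would then be
	 * forwarded with VHOST_NVME_BAR, passing a struct nvmet_vhost_bar */
	return fd;
}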
From: Ming Lin <ming.l at ssi.samsung.com>
This is used to execute controller-specific start code.
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/core.c  | 3 +++
 drivers/nvme/target/nvmet.h | 3 +++
 2 files changed, 6 insertions(+)
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 1bfef66..0a0fc48 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -251,6 +251,9 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 	}
 
 	ctrl->csts = NVME_CSTS_RDY;
+
+	if (ctrl->start)
+		ctrl->start(ctrl->opaque);
 }
 
 static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 9335584..eac008b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -59,6 +59,9 @@ struct nvmet_ctrl {
 	struct kref		ref;
 #define NVMET_SUBSYS_NAME_LEN		256
 	char			subsys_name[NVMET_SUBSYS_NAME_LEN];
+
+	void			*opaque;
+	void			(*start)(void *);
 };
 
 struct nvmet_subsys {
-- 
1.9.1
Ming Lin
2015-Nov-20  00:21 UTC
[RFC PATCH 5/9] nvme-vhost: add controller "start" callback
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/vhost.c | 106 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 01c44b8..4a147d6 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -10,6 +10,35 @@
 
 #define NVMET_VHOST_AQ_DEPTH		256
 
+enum NvmeCcShift {
+	CC_MPS_SHIFT	= 7,
+	CC_IOSQES_SHIFT	= 16,
+	CC_IOCQES_SHIFT	= 20,
+};
+
+enum NvmeCcMask {
+	CC_MPS_MASK	= 0xf,
+	CC_IOSQES_MASK	= 0xf,
+	CC_IOCQES_MASK	= 0xf,
+};
+
+#define NVME_CC_MPS(cc)    ((cc >> CC_MPS_SHIFT)    & CC_MPS_MASK)
+#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK)
+#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK)
+
+enum NvmeAqaShift {
+	AQA_ASQS_SHIFT	= 0,
+	AQA_ACQS_SHIFT	= 16,
+};
+
+enum NvmeAqaMask {
+	AQA_ASQS_MASK	= 0xfff,
+	AQA_ACQS_MASK	= 0xfff,
+};
+
+#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
+#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
+
 struct nvmet_vhost_ctrl_eventfd {
 	struct file *call;
 	struct eventfd_ctx *call_ctx;
@@ -19,12 +48,23 @@ struct nvmet_vhost_ctrl_eventfd {
 
 struct nvmet_vhost_cq {
 	struct nvmet_cq		cq;
+	struct nvmet_vhost_ctrl	*ctrl;
 
+	u32			head;
+	u32			tail;
+	u8			phase;
+	u64			dma_addr;
 	struct eventfd_ctx	*eventfd;
 };
 
 struct nvmet_vhost_sq {
 	struct nvmet_sq		sq;
+	struct nvmet_vhost_ctrl	*ctrl;
+
+	u32			head;
+	u32			tail;
+	u64			dma_addr;
+	u16			cqid;
 };
 
 struct nvmet_vhost_ctrl {
@@ -37,12 +77,76 @@ struct nvmet_vhost_ctrl {
 
 	struct nvmet_vhost_cq **cqs;
 	struct nvmet_vhost_sq **sqs;
+	struct nvmet_vhost_cq admin_cq;
+	struct nvmet_vhost_sq admin_sq;
 
 	u32 aqa;
 	u64 asq;
 	u64 acq;
+	u16 cqe_size;
+	u16 sqe_size;
+	u16 max_prp_ents;
+	u16 page_bits;
+	u32 page_size;
 };
 
+static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq,
+		struct nvmet_vhost_ctrl *n, u64 dma_addr,
+		u16 cqid, u16 size, struct eventfd_ctx *eventfd,
+		u16 vector, u16 irq_enabled)
+{
+	cq->ctrl = n;
+	cq->dma_addr = dma_addr;
+	cq->phase = 1;
+	cq->head = cq->tail = 0;
+	cq->eventfd = eventfd;
+	n->cqs[cqid] = cq;
+
+	nvmet_cq_init(n->ctrl, &cq->cq, cqid, size);
+
+	return 0;
+}
+
+static int nvmet_vhost_init_sq(struct nvmet_vhost_sq *sq,
+		struct nvmet_vhost_ctrl *n, u64 dma_addr,
+		u16 sqid, u16 cqid, u16 size)
+{
+	sq->ctrl = n;
+	sq->dma_addr = dma_addr;
+	sq->cqid = cqid;
+	sq->head = sq->tail = 0;
+	n->sqs[sqid] = sq;
+
+	nvmet_sq_init(n->ctrl, &sq->sq, sqid, size);
+
+	return 0;
+}
+
+static void nvmet_vhost_start_ctrl(void *opaque)
+{
+	struct nvmet_vhost_ctrl *n = opaque;
+	u32 page_bits = NVME_CC_MPS(n->ctrl->cc) + 12;
+	u32 page_size = 1 << page_bits;
+	int ret;
+
+	n->page_bits = page_bits;
+	n->page_size = page_size;
+	n->max_prp_ents = n->page_size / sizeof(uint64_t);
+	n->cqe_size = 1 << NVME_CC_IOCQES(n->ctrl->cc);
+	n->sqe_size = 1 << NVME_CC_IOSQES(n->ctrl->cc);
+
+	nvmet_vhost_init_cq(&n->admin_cq, n, n->acq, 0,
+		NVME_AQA_ACQS(n->aqa) + 1, n->eventfd[0].call_ctx,
+		0, 1);
+
+	ret = nvmet_vhost_init_sq(&n->admin_sq, n, n->asq, 0, 0,
+		NVME_AQA_ASQS(n->aqa) + 1);
+	if (ret) {
+		pr_warn("nvmet_vhost_init_sq failed!!!\n");
+		BUG_ON(1);
+	}
+}
+
 static int
 nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
 			struct vhost_nvme_target *c)
@@ -67,6 +171,8 @@ nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
 	n->cntlid = ctrl->cntlid;
 	n->ctrl = ctrl;
 	n->num_queues = subsys->max_qid + 1;
+	ctrl->opaque = n;
+	ctrl->start = nvmet_vhost_start_ctrl;
 
 	num_queues = ctrl->subsys->max_qid + 1;
 	n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL);
-- 
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
This is used to execute controller-specific command parse code.
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/admin-cmd.c | 7 +++++++
 drivers/nvme/target/nvmet.h     | 3 +++
 2 files changed, 10 insertions(+)
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index d9db0d4..f009c77 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -346,6 +346,13 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
 		req->data = 0;
 		return 0;
 #endif
+	default:
+		if (req->sq->ctrl->parse_extra_admin_cmd) {
+			int ret = req->sq->ctrl->parse_extra_admin_cmd(req);
+
+			if (!ret)
+				return 0;
+		}
 	}
 
 	pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index eac008b..ef79813 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -44,6 +44,8 @@ struct nvmet_sq {
 	u16			size;
 };
 
+struct nvmet_req;
+
 struct nvmet_ctrl {
 	struct nvmet_subsys	*subsys;
 	struct nvmet_cq		**cqs;
@@ -62,6 +64,7 @@ struct nvmet_ctrl {
 
 	void			*opaque;
 	void			(*start)(void *);
+	int			(*parse_extra_admin_cmd)(struct nvmet_req *);
 };
 
 struct nvmet_subsys {
-- 
1.9.1
Ming Lin
2015-Nov-20  00:21 UTC
[RFC PATCH 7/9] nvme-vhost: add "parse_extra_admin_cmd" callback
From: Ming Lin <ming.l at ssi.samsung.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/vhost.c | 153 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 4a147d6..04ed0bc 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -39,6 +39,11 @@ enum NvmeAqaMask {
 #define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
 #define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
 
+#define NVME_CQ_FLAGS_PC(cq_flags)	(cq_flags & 0x1)
+#define NVME_CQ_FLAGS_IEN(cq_flags)	((cq_flags >> 1) & 0x1)
+
+#define NVME_SQ_FLAGS_PC(sq_flags)	(sq_flags & 0x1)
+
 struct nvmet_vhost_ctrl_eventfd {
 	struct file *call;
 	struct eventfd_ctx *call_ctx;
@@ -90,6 +95,19 @@ struct nvmet_vhost_ctrl {
 	u32 page_size;
 };
 
+#define sq_to_vsq(sq) container_of(sq, struct nvmet_vhost_sq, sq)
+#define cq_to_vcq(cq) container_of(cq, struct nvmet_vhost_cq, cq)
+
+static int nvmet_vhost_check_sqid(struct nvmet_ctrl *n, u16 sqid)
+{
+	return sqid <= n->subsys->max_qid && n->sqs[sqid] != NULL ? 0 : -1;
+}
+
+static int nvmet_vhost_check_cqid(struct nvmet_ctrl *n, u16 cqid)
+{
+	return cqid <= n->subsys->max_qid && n->cqs[cqid] != NULL ? 0 : -1;
+}
+
 static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq,
 		struct nvmet_vhost_ctrl *n, u64 dma_addr,
 		u16 cqid, u16 size, struct eventfd_ctx *eventfd,
@@ -147,6 +165,140 @@ static void nvmet_vhost_start_ctrl(void *opaque)
 	}
 }
 
+static void nvmet_vhost_create_cq(struct nvmet_req *req)
+{
+	struct nvmet_cq *cq;
+	struct nvmet_vhost_cq *vcq;
+	struct nvmet_vhost_ctrl *n;
+	struct nvme_create_cq *c;
+	u16 cqid;
+	u16 vector;
+	u16 qsize;
+	u16 qflags;
+	u64 prp1;
+	int status;
+	int ret;
+
+	cq = req->cq;
+	vcq = cq_to_vcq(cq);
+	n = vcq->ctrl;
+	c = &req->cmd->create_cq;
+	cqid = le16_to_cpu(c->cqid);
+	vector = le16_to_cpu(c->irq_vector);
+	qsize = le16_to_cpu(c->qsize);
+	qflags = le16_to_cpu(c->cq_flags);
+	prp1 = le64_to_cpu(c->prp1);
+	status = NVME_SC_SUCCESS;
+
+	if (!cqid || (cqid && !nvmet_vhost_check_cqid(n->ctrl, cqid))) {
+		status = NVME_SC_QID_INVALID | NVME_SC_DNR;
+		goto out;
+	}
+	if (!qsize || qsize > NVME_CAP_MQES(n->ctrl->cap)) {
+		status = NVME_SC_QUEUE_SIZE | NVME_SC_DNR;
+		goto out;
+	}
+	if (!prp1) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		goto out;
+	}
+	if (vector > n->num_queues) {
+		status = NVME_SC_INVALID_VECTOR | NVME_SC_DNR;
+		goto out;
+	}
+	if (!(NVME_CQ_FLAGS_PC(qflags))) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		goto out;
+	}
+
+	vcq = kmalloc(sizeof(*vcq), GFP_KERNEL);
+	if (!vcq) {
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		goto out;
+	}
+
+	ret = nvmet_vhost_init_cq(vcq, n, prp1, cqid, qsize+1,
+		n->eventfd[cqid].call_ctx, vector,
+		NVME_CQ_FLAGS_IEN(qflags));
+	if (ret)
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+
+out:
+	nvmet_req_complete(req, status);
+}
+
+static void nvmet_vhost_create_sq(struct nvmet_req *req)
+{
+	struct nvme_create_sq *c = &req->cmd->create_sq;
+	u16 cqid = le16_to_cpu(c->cqid);
+	u16 sqid = le16_to_cpu(c->sqid);
+	u16 qsize = le16_to_cpu(c->qsize);
+	u16 qflags = le16_to_cpu(c->sq_flags);
+	u64 prp1 = le64_to_cpu(c->prp1);
+
+	struct nvmet_sq *sq = req->sq;
+	struct nvmet_vhost_sq *vsq;
+	struct nvmet_vhost_ctrl *n;
+	int status;
+	int ret;
+
+	status = NVME_SC_SUCCESS;
+	vsq = sq_to_vsq(sq);
+	n = vsq->ctrl;
+
+	if (!cqid || nvmet_vhost_check_cqid(n->ctrl, cqid)) {
+		status = NVME_SC_CQ_INVALID | NVME_SC_DNR;
+		goto out;
+	}
+	if (!sqid || (sqid && !nvmet_vhost_check_sqid(n->ctrl, sqid))) {
+		status = NVME_SC_QID_INVALID | NVME_SC_DNR;
+		goto out;
+	}
+	if (!qsize || qsize > NVME_CAP_MQES(n->ctrl->cap)) {
+		status = NVME_SC_QUEUE_SIZE | NVME_SC_DNR;
+		goto out;
+	}
+	if (!prp1 || prp1 & (n->page_size - 1)) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		goto out;
+	}
+	if (!(NVME_SQ_FLAGS_PC(qflags))) {
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		goto out;
+	}
+
+	vsq = kmalloc(sizeof(*vsq), GFP_KERNEL);
+	if (!vsq) {
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		goto out;
+	}
+
+	ret = nvmet_vhost_init_sq(vsq, n, prp1, sqid, cqid, qsize + 1);
+	if (ret)
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+
+out:
+	nvmet_req_complete(req, status);
+}
+
+static int nvmet_vhost_parse_admin_cmd(struct nvmet_req *req)
+{
+	struct nvme_command *cmd = req->cmd;
+
+	switch (cmd->common.opcode) {
+	case nvme_admin_create_cq:
+		req->execute = nvmet_vhost_create_cq;
+		req->data_len = 0;
+		return 0;
+	case nvme_admin_create_sq:
+		req->execute = nvmet_vhost_create_sq;
+		req->data_len = 0;
+		return 0;
+	}
+
+	return -1;
+}
+
 static int
 nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
 			struct vhost_nvme_target *c)
@@ -173,6 +325,7 @@ nvmet_vhost_set_endpoint(struct nvmet_vhost_ctrl *n,
 	n->num_queues = subsys->max_qid + 1;
 	ctrl->opaque = n;
 	ctrl->start = nvmet_vhost_start_ctrl;
+	ctrl->parse_extra_admin_cmd = nvmet_vhost_parse_admin_cmd;
 
 	num_queues = ctrl->subsys->max_qid + 1;
 	n->cqs = kzalloc(sizeof(*n->cqs) * num_queues, GFP_KERNEL);
-- 
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
This borrows code from Hannes Reinecke's rts-megasas.
Cc: Hannes Reinecke <hare at suse.de>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/vhost.c | 108 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 04ed0bc..6847c86 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -5,6 +5,7 @@
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/highmem.h>
 #include "../../vhost/vhost.h"
 #include "nvmet.h"
 
@@ -95,6 +96,113 @@ struct nvmet_vhost_ctrl {
 	u32 page_size;
 };
 
+const struct vhost_memory_region *
+find_region(struct vhost_dev *hba, __u64 addr, __u32 len)
+{
+	struct vhost_memory *mem;
+	struct vhost_memory_region *reg;
+	int i;
+
+	if (!hba->memory)
+		return NULL;
+
+	mem = hba->memory;
+	/* linear search is not brilliant, but we really have on the order of 6
+	 * regions in practice */
+	for (i = 0; i < mem->nregions; ++i) {
+		reg = mem->regions + i;
+		if (reg->guest_phys_addr <= addr &&
+		    reg->guest_phys_addr + reg->memory_size - 1 >= addr)
+			return reg;
+	}
+	return NULL;
+}
+
+static bool check_region_boundary(const struct vhost_memory_region *reg,
+				  uint64_t addr, size_t len)
+{
+	unsigned long max_size;
+
+	max_size = reg->memory_size - addr + reg->guest_phys_addr;
+	return (max_size < len);
+}
+
+static void __user *map_to_region(const struct vhost_memory_region *reg,
+				   uint64_t addr)
+{
+	return (void __user *)(unsigned long)
+		(reg->userspace_addr + addr - reg->guest_phys_addr);
+}
+
+static void __user *map_guest_to_host(struct vhost_dev *dev,
+				       uint64_t addr, int size)
+{
+	const struct vhost_memory_region *reg = NULL;
+
+	reg = find_region(dev, addr, size);
+	if (unlikely(!reg))
+		return ERR_PTR(-EPERM);
+
+	if (unlikely(check_region_boundary(reg, addr, size)))
+		return ERR_PTR(-EFAULT);
+
+	return map_to_region(reg, addr);
+}
+
+static int nvmet_vhost_rw(struct vhost_dev *dev, u64 guest_pa,
+		void *buf, uint32_t size, int write)
+{
+	void __user *host_user_va;
+	void *host_kernel_va;
+	struct page *page;
+	uintptr_t offset;
+	int ret;
+
+	host_user_va = map_guest_to_host(dev, guest_pa, size);
+	if (unlikely(IS_ERR(host_user_va))) {
+		pr_warn("cannot map guest addr %p, error %ld\n",
+			(void *)guest_pa, PTR_ERR(host_user_va));
+		return -EINVAL;
+	}
+
+	ret = get_user_pages(current, dev->mm,
+				(unsigned long)host_user_va, 1,
+				false, 0, &page, NULL);
+	if (unlikely(ret != 1)) {
+		pr_warn("get_user_pages fail!!!\n");
+		return -EINVAL;
+	}
+
+	host_kernel_va = kmap(page);
+	if (unlikely(!host_kernel_va)) {
+		pr_warn("kmap fail!!!\n");
+		put_page(page);
+		return -EINVAL;
+	}
+
+	offset = (uintptr_t)host_user_va & ~PAGE_MASK;
+	if (write)
+		memcpy(host_kernel_va + offset, buf, size);
+	else
+		memcpy(buf, host_kernel_va + offset, size);
+	kunmap(host_kernel_va);
+	put_page(page);
+
+	return 0;
+}
+
+int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa,
+		void *buf, uint32_t size)
+{
+	return nvmet_vhost_rw(dev, guest_pa, buf, size, 0);
+}
+
+int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa,
+		void *buf, uint32_t size)
+{
+	return nvmet_vhost_rw(dev, guest_pa, buf, size, 1);
+}
+
 #define sq_to_vsq(sq) container_of(sq, struct nvmet_vhost_sq, sq)
 #define cq_to_vcq(cq) container_of(cq, struct nvmet_vhost_cq, cq)
 
-- 
1.9.1
From: Ming Lin <ming.l at ssi.samsung.com>
This adds the NVMe submission/completion queue handlers, which are
ported from qemu-nvme, and hooks into nvme-target to do the real work.
Cc: Keith Busch <keith.busch at intel.com>
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
 drivers/nvme/target/vhost.c | 420 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 416 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/target/vhost.c b/drivers/nvme/target/vhost.c
index 6847c86..3ce1348 100644
--- a/drivers/nvme/target/vhost.c
+++ b/drivers/nvme/target/vhost.c
@@ -6,10 +6,12 @@
 #include <linux/mutex.h>
 #include <linux/file.h>
 #include <linux/highmem.h>
+#include <linux/kthread.h>
 #include "../../vhost/vhost.h"
 #include "nvmet.h"
 
 #define NVMET_VHOST_AQ_DEPTH		256
+#define NVMET_VHOST_MAX_SEGMENTS	32
 
 enum NvmeCcShift {
 	CC_MPS_SHIFT	= 7,
@@ -52,6 +54,15 @@ struct nvmet_vhost_ctrl_eventfd {
 	int __user *vector;
 };
 
+struct nvmet_vhost_iod {
+	struct nvmet_vhost_sq	*sq;
+	struct scatterlist	sg[NVMET_VHOST_MAX_SEGMENTS];
+	struct nvme_command	cmd;
+	struct nvme_completion	rsp;
+	struct nvmet_req	req;
+	struct list_head	entry;
+};
+
 struct nvmet_vhost_cq {
 	struct nvmet_cq		cq;
 	struct nvmet_vhost_ctrl	*ctrl;
@@ -61,6 +72,12 @@ struct nvmet_vhost_cq {
 	u8			phase;
 	u64			dma_addr;
 	struct eventfd_ctx	*eventfd;
+
+	struct list_head	sq_list;
+	struct list_head	req_list;
+	spinlock_t		lock;
+	struct task_struct	*thread;
+	int			scheduled;
 };
 
 struct nvmet_vhost_sq {
@@ -71,6 +88,13 @@ struct nvmet_vhost_sq {
 	u32			tail;
 	u64			dma_addr;
 	u16			cqid;
+
+	struct nvmet_vhost_iod	*io_req;
+	struct list_head	req_list;
+	struct list_head	entry;
+	struct mutex            lock;
+	struct task_struct	*thread;
+	int			scheduled;
 };
 
 struct nvmet_vhost_ctrl {
@@ -191,13 +215,13 @@ static int nvmet_vhost_rw(struct vhost_dev *dev, u64 guest_pa,
 	return 0;
 }
 
-int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa,
+static int nvmet_vhost_read(struct vhost_dev *dev, u64 guest_pa,
 		void *buf, uint32_t size)
 {
 	return nvmet_vhost_rw(dev, guest_pa, buf, size, 0);
 }
 
-int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa,
+static int nvmet_vhost_write(struct vhost_dev *dev, u64 guest_pa,
 		void *buf, uint32_t size)
 {
 	return nvmet_vhost_rw(dev, guest_pa, buf, size, 1);
@@ -216,6 +240,289 @@ static int nvmet_vhost_check_cqid(struct nvmet_ctrl *n, u16 cqid)
 	return cqid <= n->subsys->max_qid && n->cqs[cqid] != NULL ? 0 : -1;
 }
 
+static void nvmet_vhost_inc_cq_tail(struct nvmet_vhost_cq *cq)
+{
+	cq->tail++;
+	if (cq->tail >= cq->cq.size) {
+		cq->tail = 0;
+		cq->phase = !cq->phase;
+	}
+}
+
+static void nvmet_vhost_inc_sq_head(struct nvmet_vhost_sq *sq)
+{
+	sq->head = (sq->head + 1) % sq->sq.size;
+}
+
+static uint8_t nvmet_vhost_cq_full(struct nvmet_vhost_cq *cq)
+{
+	return (cq->tail + 1) % cq->cq.size == cq->head;
+}
+
+static uint8_t nvmet_vhost_sq_empty(struct nvmet_vhost_sq *sq)
+{
+	return sq->head == sq->tail;
+}
+
+static void nvmet_vhost_post_cqes(struct nvmet_vhost_cq *cq)
+{
+	struct nvmet_vhost_ctrl *n = cq->ctrl;
+	struct nvmet_vhost_iod *req;
+	struct list_head *p, *tmp;
+	int signal = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->lock, flags);
+	list_for_each_safe(p, tmp, &cq->req_list) {
+		struct nvmet_vhost_sq *sq;
+		u64 addr;
+
+		if (nvmet_vhost_cq_full(cq))
+			goto unlock;
+
+		req = list_entry(p, struct nvmet_vhost_iod, entry);
+		list_del(p);
+
+		sq = req->sq;
+		req->rsp.status |= cq->phase;
+		req->rsp.sq_id = cpu_to_le16(sq->sq.qid);
+		req->rsp.sq_head = cpu_to_le16(sq->head);
+		addr = cq->dma_addr + cq->tail * n->cqe_size;
+		nvmet_vhost_inc_cq_tail(cq);
+		spin_unlock_irqrestore(&cq->lock, flags);
+
+		nvmet_vhost_write(&n->dev, addr, (void *)&req->rsp,
+			sizeof(req->rsp));
+
+		mutex_lock(&sq->lock);
+		list_add_tail(p, &sq->req_list);
+		mutex_unlock(&sq->lock);
+
+		signal = 1;
+
+		spin_lock_irqsave(&cq->lock, flags);
+	}
+
+	if (signal)
+		eventfd_signal(cq->eventfd, 1);
+
+unlock:
+	cq->scheduled = 0;
+	spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+static int nvmet_vhost_cq_thread(void *arg)
+{
+	struct nvmet_vhost_cq *sq = arg;
+
+	while (1) {
+		if (kthread_should_stop())
+			break;
+
+		nvmet_vhost_post_cqes(sq);
+
+		schedule();
+	}
+
+	return 0;
+}
+
+static void nvmet_vhost_enqueue_req_completion(
+		struct nvmet_vhost_cq *cq, struct nvmet_vhost_iod *iod)
+{
+	unsigned long flags;
+
+	BUG_ON(cq->cq.qid != iod->sq->sq.qid);
+	spin_lock_irqsave(&cq->lock, flags);
+	list_add_tail(&iod->entry, &cq->req_list);
+	if (!cq->scheduled) {
+		wake_up_process(cq->thread);
+		cq->scheduled = 1;
+	}
+	spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+static void nvmet_vhost_queue_response(struct nvmet_req *req)
+{
+	struct nvmet_vhost_iod *iod =
+		container_of(req, struct nvmet_vhost_iod, req);
+	struct nvmet_vhost_sq *sq = iod->sq;
+	struct nvmet_vhost_ctrl *n = sq->ctrl;
+	struct nvmet_vhost_cq *cq = n->cqs[sq->sq.qid];
+
+	nvmet_vhost_enqueue_req_completion(cq, iod);
+}
+
+static int nvmet_vhost_sglist_add(struct nvmet_vhost_ctrl *n, struct scatterlist *sg,
+		u64 guest_addr, int len, int is_write)
+{
+	void __user *host_addr;
+	struct page *page;
+	unsigned int offset, nbytes;
+	int ret;
+
+	host_addr = map_guest_to_host(&n->dev, guest_addr, len);
+	if (unlikely(IS_ERR(host_addr))) {
+		pr_warn("cannot map guest addr %p, error %ld\n",
+			(void *)guest_addr, PTR_ERR(host_addr));
+		return PTR_ERR(host_addr);
+	}
+
+	ret = get_user_pages(current, n->dev.mm, (unsigned long)host_addr, 1,
+			is_write, 0, &page, NULL);
+	BUG_ON(ret == 0); /* we should either get our page or fail */
+	if (ret < 0) {
+		pr_warn("get_user_pages failed: host_addr %p, %d\n",
+			host_addr, ret);
+		return ret;
+	}
+
+	offset = (uintptr_t)host_addr & ~PAGE_MASK;
+	nbytes = min_t(unsigned int, PAGE_SIZE - offset, len);
+	sg_set_page(sg, page, nbytes, offset);
+
+	return 0;
+}
+
+static int nvmet_vhost_map_prp(struct nvmet_vhost_ctrl *n, struct scatterlist *sgl,
+	u64 prp1, u64 prp2, unsigned int len)
+{
+	unsigned int trans_len = n->page_size - (prp1 % n->page_size);
+	int num_prps = (len >> n->page_bits) + 1;
+	//FIXME
+	int is_write = 1;
+
+	trans_len = min(len, trans_len);
+	if (!prp1)
+		return -1;
+
+	sg_init_table(sgl, num_prps);
+
+	nvmet_vhost_sglist_add(n, sgl, prp1, trans_len, is_write);
+
+	len -= trans_len;
+	if (len) {
+		if (!prp2)
+			goto error;
+		if (len > n->page_size) {
+			u64 prp_list[n->max_prp_ents];
+			u16 nents, prp_trans;
+			int i = 0;
+
+			nents = (len + n->page_size - 1) >> n->page_bits;
+			prp_trans = min(n->max_prp_ents, nents) * sizeof(u64);
+			nvmet_vhost_read(&n->dev, prp2, (void *)prp_list, prp_trans);
+
+			while (len != 0) {
+				u64 prp_ent = le64_to_cpu(prp_list[i]);
+
+				if (i == n->max_prp_ents - 1 && len > n->page_size) {
+					if (!prp_ent || prp_ent & (n->page_size - 1))
+						goto error;
+					i = 0;
+					nents = (len + n->page_size - 1) >> n->page_bits;
+					prp_trans = min(n->max_prp_ents, nents) * sizeof(u64);
+					nvmet_vhost_read(&n->dev, prp_ent, (void *)prp_list, prp_trans);
+					prp_ent = le64_to_cpu(prp_list[i]);
+				}
+
+				if (!prp_ent || prp_ent & (n->page_size - 1))
+					goto error;
+
+				trans_len = min(len, n->page_size);
+				nvmet_vhost_sglist_add(n, sgl, prp_ent, trans_len, is_write);
+				sgl++;
+				len -= trans_len;
+				i++;
+			}
+		} else {
+			if (prp2 & (n->page_size - 1))
+				goto error;
+			nvmet_vhost_sglist_add(n, sgl, prp2, trans_len, is_write);
+		}
+	}
+
+	return num_prps;
+
+error:
+	return -1;
+}
+
+static void nvmet_vhost_process_sq(struct nvmet_vhost_sq *sq)
+{
+	struct nvmet_vhost_ctrl *n = sq->ctrl;
+	struct nvmet_vhost_cq *cq = n->cqs[sq->sq.qid];
+	struct nvmet_vhost_iod *iod;
+	struct nvme_command *cmd;
+	int ret;
+
+	mutex_lock(&sq->lock);
+
+	while (!(nvmet_vhost_sq_empty(sq) || list_empty(&sq->req_list))) {
+		u64 addr = sq->dma_addr + sq->head * n->sqe_size;
+
+		nvmet_vhost_inc_sq_head(sq);
+		iod = list_first_entry(&sq->req_list,
+					struct nvmet_vhost_iod, entry);
+		list_del(&iod->entry);
+		mutex_unlock(&sq->lock);
+
+		cmd = &iod->cmd;
+		ret = nvmet_vhost_read(&n->dev, addr,
+				(void *)cmd, sizeof(*cmd));
+		if (ret) {
+			pr_warn("nvmet_vhost_read fail\n");
+			goto out;
+		}
+
+		ret = nvmet_req_init(&iod->req, &cq->cq, &sq->sq,
+					nvmet_vhost_queue_response);
+		if (ret) {
+			pr_warn("nvmet_req_init error: ret 0x%x, qid %d\n", ret, sq->sq.qid);
+			goto out;
+		}
+		if (iod->req.data_len) {
+			ret = nvmet_vhost_map_prp(n, iod->sg, cmd->common.prp1,
+					cmd->common.prp2, iod->req.data_len);
+			if (ret > 0) {
+				iod->req.sg = iod->sg;
+				iod->req.sg_cnt = ret;
+			} else {
+				pr_warn("map prp error\n");
+				goto out;
+			}
+		}
+		iod->req.execute(&iod->req);
+		mutex_lock(&sq->lock);
+	}
+
+unlock:
+	sq->scheduled = 0;
+	mutex_unlock(&sq->lock);
+	return;
+
+out:
+	mutex_lock(&sq->lock);
+	list_add_tail(&iod->entry, &sq->req_list);
+	goto unlock;
+}
+
+static int nvmet_vhost_sq_thread(void *opaque)
+{
+	struct nvmet_vhost_sq *sq = opaque;
+
+	while (1) {
+		if (kthread_should_stop())
+			break;
+
+		nvmet_vhost_process_sq(sq);
+
+		schedule();
+	}
+
+	return 0;
+}
+
 static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq,
 		struct nvmet_vhost_ctrl *n, u64 dma_addr,
 		u16 cqid, u16 size, struct eventfd_ctx *eventfd,
@@ -228,6 +535,12 @@ static int nvmet_vhost_init_cq(struct nvmet_vhost_cq *cq,
 	cq->eventfd = eventfd;
 	n->cqs[cqid] = cq;
 
+	spin_lock_init(&cq->lock);
+	INIT_LIST_HEAD(&cq->req_list);
+	INIT_LIST_HEAD(&cq->sq_list);
+	cq->scheduled = 0;
+	cq->thread = kthread_create(nvmet_vhost_cq_thread, cq, "nvmet_vhost_cq");
+
 	nvmet_cq_init(n->ctrl, &cq->cq, cqid, size);
 
 	return 0;
@@ -237,12 +550,36 @@ static int nvmet_vhost_init_sq(struct nvmet_vhost_sq *sq,
 		struct nvmet_vhost_ctrl *n, u64 dma_addr,
 		u16 sqid, u16 cqid, u16 size)
 {
+	struct nvmet_vhost_cq *cq;
+	struct nvmet_vhost_iod *iod;
+	int i;
+
 	sq->ctrl = n;
 	sq->dma_addr = dma_addr;
 	sq->cqid = cqid;
 	sq->head = sq->tail = 0;
 	n->sqs[sqid] = sq;
 
+	mutex_init(&sq->lock);
+	INIT_LIST_HEAD(&sq->req_list);
+	sq->io_req = kmalloc(sizeof(struct nvmet_vhost_iod) * size, GFP_KERNEL);
+	if (!sq->io_req)
+		return -ENOMEM;
+	for (i = 0; i < size; i++) {
+		iod = &sq->io_req[i];
+
+		iod->req.cmd = &iod->cmd;
+		iod->req.rsp = &iod->rsp;
+		iod->sq = sq;
+		list_add_tail(&iod->entry, &sq->req_list);
+	}
+	sq->scheduled = 0;
+	sq->thread = kthread_create(nvmet_vhost_sq_thread, sq, "nvmet_vhost_sq");
+
+	cq = n->cqs[cqid];
+	list_add_tail(&sq->entry, &cq->sq_list);
+	n->sqs[sqid] = sq;
+
 	nvmet_sq_init(n->ctrl, &sq->sq, sqid, size);
 
 	return 0;
@@ -564,12 +901,84 @@ static int nvmet_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val)
 	return status;
 }
 
+static int nvmet_vhost_process_db(struct nvmet_ctrl *ctrl, int offset, u64 val)
+{
+	u16 qid;
+
+	if (offset & ((1 << 2) - 1))
+		return -EINVAL;
+
+	if (((offset - 0x1000) >> 2) & 1) {
+		u16 new_head = val & 0xffff;
+		int start_sqs;
+		struct nvmet_vhost_cq *vcq;
+		struct nvmet_cq *cq;
+		unsigned long flags;
+
+		qid = (offset - (0x1000 + (1 << 2))) >> 3;
+		if (nvmet_vhost_check_cqid(ctrl, qid))
+			return -EINVAL;
+
+		cq = ctrl->cqs[qid];
+		if (new_head >= cq->size)
+			return -EINVAL;
+
+		vcq = cq_to_vcq(cq);
+		spin_lock_irqsave(&vcq->lock, flags);
+		start_sqs = nvmet_vhost_cq_full(vcq) ? 1 : 0;
+		vcq->head = new_head;
+		spin_unlock_irqrestore(&vcq->lock, flags);
+		if (start_sqs) {
+			struct nvmet_vhost_sq *sq;
+			struct list_head *p;
+
+			list_for_each(p, &vcq->sq_list) {
+				sq = list_entry(p, struct nvmet_vhost_sq, entry);
+				if (!sq->scheduled) {
+					sq->scheduled = 1;
+					wake_up_process(sq->thread);
+				}
+			}
+			if (!vcq->scheduled) {
+				vcq->scheduled = 1;
+				wake_up_process(vcq->thread);
+			}
+		}
+
+		if (vcq->tail != vcq->head)
+			eventfd_signal(vcq->eventfd, 1);
+	} else {
+		struct nvmet_vhost_sq *vsq;
+		struct nvmet_sq *sq;
+		u16 new_tail = val & 0xffff;
+
+		qid = (offset - 0x1000) >> 3;
+		if (nvmet_vhost_check_sqid(ctrl, qid))
+			return -EINVAL;
+
+		sq = ctrl->sqs[qid];
+		if (new_tail >= sq->size)
+			return -ENOSPC;
+
+		vsq = sq_to_vsq(sq);
+		mutex_lock(&vsq->lock);
+		vsq->tail = new_tail;
+		if (!vsq->scheduled) {
+			vsq->scheduled = 1;
+			wake_up_process(vsq->thread);
+		}
+		mutex_unlock(&vsq->lock);
+	}
+
+	return 0;
+}
+
 static int nvmet_vhost_bar_write(struct nvmet_vhost_ctrl *n, int offset, u64 val)
 {
 	if (offset < 0x1000)
 		return nvmet_bar_write(n, offset, val);
-
-	return -1;
+	else
+		return nvmet_vhost_process_db(n->ctrl, offset, val);
 }
 
 static int nvmet_vhost_ioc_bar(struct nvmet_vhost_ctrl *n, void __user *argp)
@@ -612,6 +1021,8 @@ static void nvme_free_sq(struct nvmet_vhost_sq *sq,
 		struct nvmet_vhost_ctrl *n)
 {
 	n->sqs[sq->sq.qid] = NULL;
+	kthread_stop(sq->thread);
+	kfree(sq->io_req);
 	if (sq->sq.qid)
 		kfree(sq);
 }
@@ -620,6 +1031,7 @@ static void nvme_free_cq(struct nvmet_vhost_cq *cq,
 		struct nvmet_vhost_ctrl *n)
 {
 	n->cqs[cq->cq.qid] = NULL;
+	kthread_stop(cq->thread);
 	if (cq->cq.qid)
 		kfree(cq);
 }
-- 
1.9.1
Christoph Hellwig
2015-Nov-20  05:13 UTC
[RFC PATCH 4/9] nvmet: add a controller "start" hook
On Thu, Nov 19, 2015 at 04:21:03PM -0800, Ming Lin wrote:
> #define NVMET_SUBSYS_NAME_LEN		256
> 	char			subsys_name[NVMET_SUBSYS_NAME_LEN];
> +
> +	void			*opaque;
> +	void			(*start)(void *);
> };

Why can't vhost use container_of to get at the containing structure similar to what the loop driver does?

In addition I think we'll eventually need an ops structure here, but I can take care of that later.
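For illustration, a minimal sketch of the container_of approach being suggested here, assuming (hypothetically) that the vhost controller embedded struct nvmet_ctrl instead of holding a pointer to it, which is not how the posted patch is structured:

/* Hypothetical layout, for illustration only. */
struct nvmet_vhost_ctrl {
	struct nvmet_ctrl	ctrl;		/* embedded rather than a pointer */
	/* ... vhost-specific fields ... */
};

static void nvmet_vhost_start_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_vhost_ctrl *n =
		container_of(ctrl, struct nvmet_vhost_ctrl, ctrl);

	/* controller-specific start code; no ->opaque needed */
}

The core would then invoke a typed hook (or an ops structure, as suggested) with the nvmet_ctrl pointer, and the transport recovers its private state without a void *opaque.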
Christoph Hellwig
2015-Nov-20  05:16 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
Thanks Ming, from a first quick view this looks great. I'll look over it in a bit more detail once I get a bit more time.
Ming Lin
2015-Nov-20  05:33 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On Fri, 2015-11-20 at 06:16 +0100, Christoph Hellwig wrote:
> Thanks Ming,
>
> from a first quick view this looks great. I'll look over it in a bit
> more detail once I get a bit more time.

Thanks for CCing Nic :-)

But funny, I double-checked my bash history. I actually did CC Nic; I don't know why it got lost.

mlin at ssi:~$ history |grep "nab"
 1861  git send-email --from "Ming Lin <mlin at kernel.org>" --to "linux-nvme at lists.infradead.org" --cc "qemu-devel at nongnu.org" --cc "virtualization at lists.linux-foundation.org" --cc "Christoph Hellwig <hch at lst.de>" --cc "Nicholas A. Bellinger <nab at linux-iscsi.org>" --compose ~/patches/*.patch
Paolo Bonzini
2015-Nov-21  13:11 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On 20/11/2015 01:20, Ming Lin wrote:
> One improvement could be to use Google's NVMe vendor extension that
> I sent in another thread, also here:
> https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-google-ext
>
> Qemu side:
> http://www.minggr.net/cgit/cgit.cgi/qemu/log/?h=vhost-nvme.0
> Kernel side also here:
> https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=vhost-nvme.0

How much do you get with vhost-nvme plus vendor extension, compared to 190 MB/s for QEMU?

Note that in all likelihood, QEMU can actually do better than 190 MB/s, and gain more parallelism too, by moving the processing of the ioeventfds to a separate thread. This is similar to hw/block/dataplane/virtio-blk.c.

It's actually pretty easy to do. Even though hw/block/dataplane/virtio-blk.c is still using some old APIs, all memory access in QEMU is now thread-safe. I have pending patches for 2.6 that cut that file down to a mere 200 lines of code, NVMe would probably be about the same.

Paolo
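As a generic illustration of that idea (plain POSIX, not QEMU's actual dataplane API): a dedicated thread blocks on the doorbell ioeventfd and processes the queue outside the main loop.

#include <pthread.h>
#include <stdint.h>
#include <unistd.h>

struct db_ctx {
	int efd;			/* ioeventfd bound to a doorbell MMIO write */
	void (*process_queue)(void *);	/* e.g. the NVMe SQ handler */
	void *queue;
};

static void *doorbell_thread(void *opaque)
{
	struct db_ctx *ctx = opaque;
	uint64_t cnt;

	/* read() blocks until the guest rings the doorbell; the main loop
	 * is never involved, which is where the extra parallelism comes from */
	while (read(ctx->efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		ctx->process_queue(ctx->queue);

	return NULL;
}

static pthread_t start_doorbell_thread(struct db_ctx *ctx)
{
	pthread_t tid;

	pthread_create(&tid, NULL, doorbell_thread, ctx);
	return tid;
}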
Paolo Bonzini
2015-Dec-01  16:02 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On 01/12/2015 00:20, Ming Lin wrote:
> qemu-nvme: 148MB/s
> vhost-nvme + google-ext: 230MB/s
> qemu-nvme + google-ext + eventfd: 294MB/s
> virtio-scsi: 296MB/s
> virtio-blk: 344MB/s
>
> "vhost-nvme + google-ext" didn't get good enough performance.
>
> Still tuning.

I'd expect it to be on par with qemu-nvme with ioeventfd, but the question is: why should it be better? For vhost-net, the answer is that more zerocopy can be done if you put the data path in the kernel.

But qemu-nvme is already using io_submit for the data path, perhaps there's not much to gain from vhost-nvme...

Paolo
Ming Lin
2015-Dec-01  16:26 UTC
[RFC PATCH 0/9] vhost-nvme: new qemu nvme backend using nvme target
On Tue, 2015-12-01 at 17:02 +0100, Paolo Bonzini wrote:
> On 01/12/2015 00:20, Ming Lin wrote:
> > qemu-nvme: 148MB/s
> > vhost-nvme + google-ext: 230MB/s
> > qemu-nvme + google-ext + eventfd: 294MB/s
> > virtio-scsi: 296MB/s
> > virtio-blk: 344MB/s
> >
> > "vhost-nvme + google-ext" didn't get good enough performance.
>
> I'd expect it to be on par with qemu-nvme with ioeventfd, but the question
> is: why should it be better? For vhost-net, the answer is that more
> zerocopy can be done if you put the data path in the kernel.
>
> But qemu-nvme is already using io_submit for the data path, perhaps
> there's not much to gain from vhost-nvme...

What do you think about virtio-nvme + vhost-nvme? I also have a patch for virtio-nvme:
https://git.kernel.org/cgit/linux/kernel/git/mlin/linux.git/log/?h=nvme-split/virtio

I just need to change vhost-nvme to work with it.