thr3ads.net - Xen devel - [PATCH 0/5] Move stuff out of xenfs [Nov 2011]

If this information is useful, please help other people find it:
Share via:

Bastian Blank

2011-Nov-27 22:07 UTC

[PATCH 0/5] Move stuff out of xenfs

Over a year ago I started a discussion about xenfs. This is a first
attempt at exposing the functionality currently provided by xenfs as
regular devices and a sysfs file.

Patches for xen tools will follow.

Bastian

Bastian Blank

2011-Nov-27 22:07 UTC

head link

[PATCH 1/5] xen/sys/hypervisor: Export guest_properties/is_initial_domain

Signed-off-by: Bastian Blank <waldi@debian.org>
---
 drivers/xen/sys-hypervisor.c |   35 +++++++++++++++++++++++++++++++++++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 1e0fe01..d0916e8 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -355,6 +355,35 @@ static void xen_properties_destroy(void)
 	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }
 
+/* xen guest properties info */
+
+static ssize_t is_initial_domain_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	return sprintf(buffer, "%d\n", xen_initial_domain());
+}
+
+HYPERVISOR_ATTR_RO(is_initial_domain);
+
+static struct attribute *xen_guest_properties_attrs[] = {
+	&is_initial_domain_attr.attr,
+	NULL
+};
+
+static struct attribute_group xen_guest_properties_group = {
+	.name = "guest_properties",
+	.attrs = xen_guest_properties_attrs,
+};
+
+static int __init xen_guest_properties_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &xen_guest_properties_group);
+}
+
+static void xen_guest_properties_destroy(void)
+{
+	sysfs_remove_group(hypervisor_kobj, &xen_guest_properties_group);
+}
+
 static int __init hyper_sysfs_init(void)
 {
 	int ret;
@@ -377,9 +406,14 @@ static int __init hyper_sysfs_init(void)
 	ret = xen_properties_init();
 	if (ret)
 		goto prop_out;
+	ret = xen_guest_properties_init();
+	if (ret)
+		goto gprop_out;
 
 	goto out;
 
+gprop_out:
+	xen_properties_destroy();
 prop_out:
 	xen_sysfs_uuid_destroy();
 uuid_out:
@@ -394,6 +428,7 @@ out:
 
 static void __exit hyper_sysfs_exit(void)
 {
+	xen_guest_properties_destroy();
 	xen_properties_destroy();
 	xen_compilation_destroy();
 	xen_sysfs_uuid_destroy();
-- 
1.7.7.3

Bastian Blank

2011-Nov-27 22:07 UTC

head link

[PATCH 2/5] xen: Add privcmd device driver

Access to arbitrary hypercalls is currently provided via xenfs. This
adds a standard character device to handle this. The support in xenfs
remains for backward compatibility and uses the device driver code.

Signed-off-by: Bastian Blank <waldi@debian.org>
---
 drivers/xen/Kconfig         |    7 +
 drivers/xen/Makefile        |    2 +
 drivers/xen/privcmd.c       |  437 +++++++++++++++++++++++++++++++++++++++++++
 drivers/xen/privcmd.h       |    3 +
 drivers/xen/xenfs/Makefile  |    2 +-
 drivers/xen/xenfs/privcmd.c |  400 ---------------------------------------
 drivers/xen/xenfs/super.c   |    3 +-
 drivers/xen/xenfs/xenfs.h   |    1 -
 8 files changed, 452 insertions(+), 403 deletions(-)
 create mode 100644 drivers/xen/privcmd.c
 create mode 100644 drivers/xen/privcmd.h
 delete mode 100644 drivers/xen/xenfs/privcmd.c

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 5f7ff8e..eb7574c 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -86,6 +86,7 @@ config XEN_BACKEND
 
 config XENFS
 	tristate "Xen filesystem"
+	select XEN_PRIVCMD
 	default y
 	help
 	  The xen filesystem provides a way for domains to share
@@ -181,4 +182,10 @@ config XEN_PCIDEV_BACKEND
 	  xen-pciback.hide=(03:00.0)(04:00.0)
 
 	  If in doubt, say m.
+
+config XEN_PRIVCMD
+	tristate
+	depends on XEN_DOM0
+	default m
+
 endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 72bbb27..c35f65d 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -19,9 +19,11 @@ obj-$(CONFIG_XEN_TMEM)			+= tmem.o
 obj-$(CONFIG_SWIOTLB_XEN)		+= swiotlb-xen.o
 obj-$(CONFIG_XEN_DOM0)			+= pci.o
 obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/
+obj-$(CONFIG_XEN_PRIVCMD)		+= xen-privcmd.o
 
 xen-evtchn-y				:= evtchn.o
 xen-gntdev-y				:= gntdev.o
 xen-gntalloc-y				:= gntalloc.o
+xen-privcmd-y				:= privcmd.o
 
 xen-platform-pci-y			:= platform-pci.o
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
new file mode 100644
index 0000000..863fbd0
--- /dev/null
+++ b/drivers/xen/privcmd.c
@@ -0,0 +1,437 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#include "privcmd.h"
+
+MODULE_LICENSE("GPL");
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+	struct privcmd_hypercall hypercall;
+	long ret;
+
+	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+		return -EFAULT;
+
+	ret = privcmd_call(hypercall.op,
+			   hypercall.arg[0], hypercall.arg[1],
+			   hypercall.arg[2], hypercall.arg[3],
+			   hypercall.arg[4]);
+
+	return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+	struct page *p, *n;
+
+	list_for_each_entry_safe(p, n, pages, lru)
+		__free_page(p);
+
+	INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data.  If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; its up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+			unsigned nelem, size_t size,
+			void __user *data)
+{
+	unsigned pageidx;
+	void *pagedata;
+	int ret;
+
+	if (size > PAGE_SIZE)
+		return 0;
+
+	pageidx = PAGE_SIZE;
+	pagedata = NULL;	/* quiet, gcc */
+	while (nelem--) {
+		if (pageidx > PAGE_SIZE-size) {
+			struct page *page = alloc_page(GFP_KERNEL);
+
+			ret = -ENOMEM;
+			if (page == NULL)
+				goto fail;
+
+			pagedata = page_address(page);
+
+			list_add_tail(&page->lru, pagelist);
+			pageidx = 0;
+		}
+
+		ret = -EFAULT;
+		if (copy_from_user(pagedata + pageidx, data, size))
+			goto fail;
+
+		data += size;
+		pageidx += size;
+	}
+
+	ret = 0;
+
+fail:
+	return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+			  struct list_head *pos,
+			  int (*fn)(void *data, void *state),
+			  void *state)
+{
+	void *pagedata;
+	unsigned pageidx;
+	int ret = 0;
+
+	BUG_ON(size > PAGE_SIZE);
+
+	pageidx = PAGE_SIZE;
+	pagedata = NULL;	/* hush, gcc */
+
+	while (nelem--) {
+		if (pageidx > PAGE_SIZE-size) {
+			struct page *page;
+			pos = pos->next;
+			page = list_entry(pos, struct page, lru);
+			pagedata = page_address(page);
+			pageidx = 0;
+		}
+
+		ret = (*fn)(pagedata + pageidx, state);
+		if (ret)
+			break;
+		pageidx += size;
+	}
+
+	return ret;
+}
+
+struct mmap_mfn_state {
+	unsigned long va;
+	struct vm_area_struct *vma;
+	domid_t domain;
+};
+
+static int mmap_mfn_range(void *data, void *state)
+{
+	struct privcmd_mmap_entry *msg = data;
+	struct mmap_mfn_state *st = state;
+	struct vm_area_struct *vma = st->vma;
+	int rc;
+
+	/* Do not allow range to wrap the address space. */
+	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+		return -EINVAL;
+
+	/* Range chunks must be contiguous in va space. */
+	if ((msg->va != st->va) ||
+	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+		return -EINVAL;
+
+	rc = xen_remap_domain_mfn_range(vma,
+					msg->va & PAGE_MASK,
+					msg->mfn, msg->npages,
+					vma->vm_page_prot,
+					st->domain);
+	if (rc < 0)
+		return rc;
+
+	st->va += msg->npages << PAGE_SHIFT;
+
+	return 0;
+}
+
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+	struct privcmd_mmap mmapcmd;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int rc;
+	LIST_HEAD(pagelist);
+	struct mmap_mfn_state state;
+
+	if (!xen_initial_domain())
+		return -EPERM;
+
+	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+		return -EFAULT;
+
+	rc = gather_array(&pagelist,
+			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+			  mmapcmd.entry);
+
+	if (rc || list_empty(&pagelist))
+		goto out;
+
+	down_write(&mm->mmap_sem);
+
+	{
+		struct page *page = list_first_entry(&pagelist,
+						     struct page, lru);
+		struct privcmd_mmap_entry *msg = page_address(page);
+
+		vma = find_vma(mm, msg->va);
+		rc = -EINVAL;
+
+		if (!vma || (msg->va != vma->vm_start) ||
+		    !privcmd_enforce_singleshot_mapping(vma))
+			goto out_up;
+	}
+
+	state.va = vma->vm_start;
+	state.vma = vma;
+	state.domain = mmapcmd.dom;
+
+	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+			    &pagelist,
+			    mmap_mfn_range, &state);
+
+
+out_up:
+	up_write(&mm->mmap_sem);
+
+out:
+	free_page_list(&pagelist);
+
+	return rc;
+}
+
+struct mmap_batch_state {
+	domid_t domain;
+	unsigned long va;
+	struct vm_area_struct *vma;
+	int err;
+
+	xen_pfn_t __user *user;
+};
+
+static int mmap_batch_fn(void *data, void *state)
+{
+	xen_pfn_t *mfnp = data;
+	struct mmap_batch_state *st = state;
+
+	if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
+				       st->vma->vm_page_prot, st->domain) < 0) {
+		*mfnp |= 0xf0000000U;
+		st->err++;
+	}
+	st->va += PAGE_SIZE;
+
+	return 0;
+}
+
+static int mmap_return_errors(void *data, void *state)
+{
+	xen_pfn_t *mfnp = data;
+	struct mmap_batch_state *st = state;
+
+	return put_user(*mfnp, st->user++);
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+	int ret;
+	struct privcmd_mmapbatch m;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long nr_pages;
+	LIST_HEAD(pagelist);
+	struct mmap_batch_state state;
+
+	if (!xen_initial_domain())
+		return -EPERM;
+
+	if (copy_from_user(&m, udata, sizeof(m)))
+		return -EFAULT;
+
+	nr_pages = m.num;
+	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+		return -EINVAL;
+
+	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
+			   m.arr);
+
+	if (ret || list_empty(&pagelist))
+		goto out;
+
+	down_write(&mm->mmap_sem);
+
+	vma = find_vma(mm, m.addr);
+	ret = -EINVAL;
+	if (!vma ||
+	    vma->vm_ops != &privcmd_vm_ops ||
+	    (m.addr != vma->vm_start) ||
+	    ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+	    !privcmd_enforce_singleshot_mapping(vma)) {
+		up_write(&mm->mmap_sem);
+		goto out;
+	}
+
+	state.domain = m.dom;
+	state.vma = vma;
+	state.va = m.addr;
+	state.err = 0;
+
+	ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+			     &pagelist, mmap_batch_fn, &state);
+
+	up_write(&mm->mmap_sem);
+
+	if (state.err > 0) {
+		state.user = m.arr;
+		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+			       &pagelist,
+			       mmap_return_errors, &state);
+	}
+
+out:
+	free_page_list(&pagelist);
+
+	return ret;
+}
+
+static long privcmd_ioctl(struct file *file,
+			  unsigned int cmd, unsigned long data)
+{
+	int ret = -ENOSYS;
+	void __user *udata = (void __user *) data;
+
+	switch (cmd) {
+	case IOCTL_PRIVCMD_HYPERCALL:
+		ret = privcmd_ioctl_hypercall(udata);
+		break;
+
+	case IOCTL_PRIVCMD_MMAP:
+		ret = privcmd_ioctl_mmap(udata);
+		break;
+
+	case IOCTL_PRIVCMD_MMAPBATCH:
+		ret = privcmd_ioctl_mmap_batch(udata);
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+	       vma, vma->vm_start, vma->vm_end,
+	       vmf->pgoff, vmf->virtual_address);
+
+	return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+	.fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Unsupported for auto-translate guests. */
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return -ENOSYS;
+
+	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
+	 * how to recreate these mappings */
+	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
+	vma->vm_ops = &privcmd_vm_ops;
+	vma->vm_private_data = NULL;
+
+	return 0;
+}
+
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+	return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+#endif
+
+const struct file_operations xen_privcmd_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = privcmd_ioctl,
+	.mmap = privcmd_mmap,
+};
+EXPORT_SYMBOL_GPL(xen_privcmd_fops);
+
+static struct miscdevice privcmd_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "xen/privcmd",
+	.fops = &xen_privcmd_fops,
+};
+
+static int __init privcmd_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	err = misc_register(&privcmd_dev);
+	if (err != 0) {
+		printk(KERN_ERR "Could not register privcmd device\n");
+		return err;
+	}
+	return 0;
+}
+
+static void __exit privcmd_exit(void)
+{
+	misc_deregister(&privcmd_dev);
+}
+
+module_init(privcmd_init);
+module_exit(privcmd_exit);
diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h
new file mode 100644
index 0000000..14facae
--- /dev/null
+++ b/drivers/xen/privcmd.h
@@ -0,0 +1,3 @@
+#include <linux/fs.h>
+
+extern const struct file_operations xen_privcmd_fops;
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
index 4fde944..5d45ff1 100644
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_XENFS) += xenfs.o
 
-xenfs-y			  = super.o xenbus.o privcmd.o
+xenfs-y			  = super.o xenbus.o
 xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
deleted file mode 100644
index dbd3b16..0000000
--- a/drivers/xen/xenfs/privcmd.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/******************************************************************************
- * privcmd.c
- *
- * Interface to privileged domain-0 commands.
- *
- * Copyright (c) 2002-2004, K A Fraser, B Dragovic
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/uaccess.h>
-#include <linux/swap.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/seq_file.h>
-
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/tlb.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/xen.h>
-#include <xen/privcmd.h>
-#include <xen/interface/xen.h>
-#include <xen/features.h>
-#include <xen/page.h>
-#include <xen/xen-ops.h>
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
-#endif
-
-static long privcmd_ioctl_hypercall(void __user *udata)
-{
-	struct privcmd_hypercall hypercall;
-	long ret;
-
-	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
-		return -EFAULT;
-
-	ret = privcmd_call(hypercall.op,
-			   hypercall.arg[0], hypercall.arg[1],
-			   hypercall.arg[2], hypercall.arg[3],
-			   hypercall.arg[4]);
-
-	return ret;
-}
-
-static void free_page_list(struct list_head *pages)
-{
-	struct page *p, *n;
-
-	list_for_each_entry_safe(p, n, pages, lru)
-		__free_page(p);
-
-	INIT_LIST_HEAD(pages);
-}
-
-/*
- * Given an array of items in userspace, return a list of pages
- * containing the data.  If copying fails, either because of memory
- * allocation failure or a problem reading user memory, return an
- * error code; its up to the caller to dispose of any partial list.
- */
-static int gather_array(struct list_head *pagelist,
-			unsigned nelem, size_t size,
-			void __user *data)
-{
-	unsigned pageidx;
-	void *pagedata;
-	int ret;
-
-	if (size > PAGE_SIZE)
-		return 0;
-
-	pageidx = PAGE_SIZE;
-	pagedata = NULL;	/* quiet, gcc */
-	while (nelem--) {
-		if (pageidx > PAGE_SIZE-size) {
-			struct page *page = alloc_page(GFP_KERNEL);
-
-			ret = -ENOMEM;
-			if (page == NULL)
-				goto fail;
-
-			pagedata = page_address(page);
-
-			list_add_tail(&page->lru, pagelist);
-			pageidx = 0;
-		}
-
-		ret = -EFAULT;
-		if (copy_from_user(pagedata + pageidx, data, size))
-			goto fail;
-
-		data += size;
-		pageidx += size;
-	}
-
-	ret = 0;
-
-fail:
-	return ret;
-}
-
-/*
- * Call function "fn" on each element of the array fragmented
- * over a list of pages.
- */
-static int traverse_pages(unsigned nelem, size_t size,
-			  struct list_head *pos,
-			  int (*fn)(void *data, void *state),
-			  void *state)
-{
-	void *pagedata;
-	unsigned pageidx;
-	int ret = 0;
-
-	BUG_ON(size > PAGE_SIZE);
-
-	pageidx = PAGE_SIZE;
-	pagedata = NULL;	/* hush, gcc */
-
-	while (nelem--) {
-		if (pageidx > PAGE_SIZE-size) {
-			struct page *page;
-			pos = pos->next;
-			page = list_entry(pos, struct page, lru);
-			pagedata = page_address(page);
-			pageidx = 0;
-		}
-
-		ret = (*fn)(pagedata + pageidx, state);
-		if (ret)
-			break;
-		pageidx += size;
-	}
-
-	return ret;
-}
-
-struct mmap_mfn_state {
-	unsigned long va;
-	struct vm_area_struct *vma;
-	domid_t domain;
-};
-
-static int mmap_mfn_range(void *data, void *state)
-{
-	struct privcmd_mmap_entry *msg = data;
-	struct mmap_mfn_state *st = state;
-	struct vm_area_struct *vma = st->vma;
-	int rc;
-
-	/* Do not allow range to wrap the address space. */
-	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
-	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
-		return -EINVAL;
-
-	/* Range chunks must be contiguous in va space. */
-	if ((msg->va != st->va) ||
-	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
-		return -EINVAL;
-
-	rc = xen_remap_domain_mfn_range(vma,
-					msg->va & PAGE_MASK,
-					msg->mfn, msg->npages,
-					vma->vm_page_prot,
-					st->domain);
-	if (rc < 0)
-		return rc;
-
-	st->va += msg->npages << PAGE_SHIFT;
-
-	return 0;
-}
-
-static long privcmd_ioctl_mmap(void __user *udata)
-{
-	struct privcmd_mmap mmapcmd;
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	int rc;
-	LIST_HEAD(pagelist);
-	struct mmap_mfn_state state;
-
-	if (!xen_initial_domain())
-		return -EPERM;
-
-	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
-		return -EFAULT;
-
-	rc = gather_array(&pagelist,
-			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
-			  mmapcmd.entry);
-
-	if (rc || list_empty(&pagelist))
-		goto out;
-
-	down_write(&mm->mmap_sem);
-
-	{
-		struct page *page = list_first_entry(&pagelist,
-						     struct page, lru);
-		struct privcmd_mmap_entry *msg = page_address(page);
-
-		vma = find_vma(mm, msg->va);
-		rc = -EINVAL;
-
-		if (!vma || (msg->va != vma->vm_start) ||
-		    !privcmd_enforce_singleshot_mapping(vma))
-			goto out_up;
-	}
-
-	state.va = vma->vm_start;
-	state.vma = vma;
-	state.domain = mmapcmd.dom;
-
-	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
-			    &pagelist,
-			    mmap_mfn_range, &state);
-
-
-out_up:
-	up_write(&mm->mmap_sem);
-
-out:
-	free_page_list(&pagelist);
-
-	return rc;
-}
-
-struct mmap_batch_state {
-	domid_t domain;
-	unsigned long va;
-	struct vm_area_struct *vma;
-	int err;
-
-	xen_pfn_t __user *user;
-};
-
-static int mmap_batch_fn(void *data, void *state)
-{
-	xen_pfn_t *mfnp = data;
-	struct mmap_batch_state *st = state;
-
-	if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
-				       st->vma->vm_page_prot, st->domain) < 0) {
-		*mfnp |= 0xf0000000U;
-		st->err++;
-	}
-	st->va += PAGE_SIZE;
-
-	return 0;
-}
-
-static int mmap_return_errors(void *data, void *state)
-{
-	xen_pfn_t *mfnp = data;
-	struct mmap_batch_state *st = state;
-
-	return put_user(*mfnp, st->user++);
-}
-
-static struct vm_operations_struct privcmd_vm_ops;
-
-static long privcmd_ioctl_mmap_batch(void __user *udata)
-{
-	int ret;
-	struct privcmd_mmapbatch m;
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	unsigned long nr_pages;
-	LIST_HEAD(pagelist);
-	struct mmap_batch_state state;
-
-	if (!xen_initial_domain())
-		return -EPERM;
-
-	if (copy_from_user(&m, udata, sizeof(m)))
-		return -EFAULT;
-
-	nr_pages = m.num;
-	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
-		return -EINVAL;
-
-	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
-			   m.arr);
-
-	if (ret || list_empty(&pagelist))
-		goto out;
-
-	down_write(&mm->mmap_sem);
-
-	vma = find_vma(mm, m.addr);
-	ret = -EINVAL;
-	if (!vma ||
-	    vma->vm_ops != &privcmd_vm_ops ||
-	    (m.addr != vma->vm_start) ||
-	    ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
-	    !privcmd_enforce_singleshot_mapping(vma)) {
-		up_write(&mm->mmap_sem);
-		goto out;
-	}
-
-	state.domain = m.dom;
-	state.vma = vma;
-	state.va = m.addr;
-	state.err = 0;
-
-	ret = traverse_pages(m.num, sizeof(xen_pfn_t),
-			     &pagelist, mmap_batch_fn, &state);
-
-	up_write(&mm->mmap_sem);
-
-	if (state.err > 0) {
-		state.user = m.arr;
-		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
-			       &pagelist,
-			       mmap_return_errors, &state);
-	}
-
-out:
-	free_page_list(&pagelist);
-
-	return ret;
-}
-
-static long privcmd_ioctl(struct file *file,
-			  unsigned int cmd, unsigned long data)
-{
-	int ret = -ENOSYS;
-	void __user *udata = (void __user *) data;
-
-	switch (cmd) {
-	case IOCTL_PRIVCMD_HYPERCALL:
-		ret = privcmd_ioctl_hypercall(udata);
-		break;
-
-	case IOCTL_PRIVCMD_MMAP:
-		ret = privcmd_ioctl_mmap(udata);
-		break;
-
-	case IOCTL_PRIVCMD_MMAPBATCH:
-		ret = privcmd_ioctl_mmap_batch(udata);
-		break;
-
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
-	       vma, vma->vm_start, vma->vm_end,
-	       vmf->pgoff, vmf->virtual_address);
-
-	return VM_FAULT_SIGBUS;
-}
-
-static struct vm_operations_struct privcmd_vm_ops = {
-	.fault = privcmd_fault
-};
-
-static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	/* Unsupported for auto-translate guests. */
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return -ENOSYS;
-
-	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
-	 * how to recreate these mappings */
-	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
-	vma->vm_ops = &privcmd_vm_ops;
-	vma->vm_private_data = NULL;
-
-	return 0;
-}
-
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
-{
-	return (xchg(&vma->vm_private_data, (void *)1) == NULL);
-}
-#endif
-
-const struct file_operations privcmd_file_ops = {
-	.unlocked_ioctl = privcmd_ioctl,
-	.mmap = privcmd_mmap,
-};
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 1aa3897..a55fbf9 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -16,6 +16,7 @@
 #include <xen/xen.h>
 
 #include "xenfs.h"
+#include "../privcmd.h"
 
 #include <asm/xen/hypervisor.h>
 
@@ -84,7 +85,7 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
 		[1] = {},
 		{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
 		{ "capabilities", &capabilities_file_ops, S_IRUGO },
-		{ "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
+		{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
 		{""},
 	};
 	int rc;
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
index b68aa62..5056306 100644
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -2,7 +2,6 @@
 #define _XENFS_XENBUS_H
 
 extern const struct file_operations xenbus_file_ops;
-extern const struct file_operations privcmd_file_ops;
 extern const struct file_operations xsd_kva_file_ops;
 extern const struct file_operations xsd_port_file_ops;
 
-- 
1.7.7.3

Bastian Blank

2011-Nov-27 22:07 UTC

head link

[PATCH 3/5] xen/privcmd: Remove unused support for arch specific privcmd mmap

---
 drivers/xen/privcmd.c |    2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 863fbd0..c13d26a 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -365,7 +365,6 @@ static long privcmd_ioctl(struct file *file,
 	return ret;
 }
 
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
 static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
@@ -398,7 +397,6 @@ static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
 {
 	return (xchg(&vma->vm_private_data, (void *)1) == NULL);
 }
-#endif
 
 const struct file_operations xen_privcmd_fops = {
 	.owner = THIS_MODULE,
-- 
1.7.7.3

Bastian Blank

2011-Nov-27 22:07 UTC

head link

[PATCH 4/5] xen: Add xenbus device driver

Access to xenbus is currently handled via xenfs. This adds a device
driver for xenbus and makes xenfs use this code.

Signed-off-by: Bastian Blank <waldi@debian.org>
---
 drivers/xen/xenbus/Makefile              |    1 +
 drivers/xen/xenbus/xenbus_comms.h        |    4 +
 drivers/xen/xenbus/xenbus_dev_frontend.c |  624 ++++++++++++++++++++++++++++++
 drivers/xen/xenfs/Makefile               |    2 +-
 drivers/xen/xenfs/super.c                |    3 +-
 drivers/xen/xenfs/xenbus.c               |  593 ----------------------------
 drivers/xen/xenfs/xenfs.h                |    1 -
 7 files changed, 632 insertions(+), 596 deletions(-)
 create mode 100644 drivers/xen/xenbus/xenbus_dev_frontend.c
 delete mode 100644 drivers/xen/xenfs/xenbus.c

diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
index 8dca685..a2ea363 100644
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -1,4 +1,5 @@
 obj-y	+= xenbus.o
+obj-y	+= xenbus_dev_frontend.o
 
 xenbus-objs =
 xenbus-objs += xenbus_client.o
diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h
index c21db75..6e42800 100644
--- a/drivers/xen/xenbus/xenbus_comms.h
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@ -31,6 +31,8 @@
 #ifndef _XENBUS_COMMS_H
 #define _XENBUS_COMMS_H
 
+#include <linux/fs.h>
+
 int xs_init(void);
 int xb_init_comms(void);
 
@@ -43,4 +45,6 @@ int xs_input_avail(void);
 extern struct xenstore_domain_interface *xen_store_interface;
 extern int xen_store_evtchn;
 
+extern const struct file_operations xen_xenbus_fops;
+
 #endif /* _XENBUS_COMMS_H */
diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
new file mode 100644
index 0000000..fb30cff
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -0,0 +1,624 @@
+/*
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Changes:
+ * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
+ *                              and /proc/xen compatibility mount point.
+ *                              Turned xenfs into a loadable module.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include "xenbus_comms.h"
+
+#include <xen/xenbus.h>
+#include <asm/xen/hypervisor.h>
+
+MODULE_LICENSE("GPL");
+
+/*
+ * An element of a list of outstanding transactions, for which we're
+ * still waiting a reply.
+ */
+struct xenbus_transaction_holder {
+	struct list_head list;
+	struct xenbus_transaction handle;
+};
+
+/*
+ * A buffer of data on the queue.
+ */
+struct read_buffer {
+	struct list_head list;
+	unsigned int cons;
+	unsigned int len;
+	char msg[];
+};
+
+struct xenbus_file_priv {
+	/*
+	 * msgbuffer_mutex is held while partial requests are built up
+	 * and complete requests are acted on.  It therefore protects
+	 * the "transactions" and "watches" lists, and the partial
+	 * request length and buffer.
+	 *
+	 * reply_mutex protects the reply being built up to return to
+	 * usermode.  It nests inside msgbuffer_mutex but may be held
+	 * alone during a watch callback.
+	 */
+	struct mutex msgbuffer_mutex;
+
+	/* In-progress transactions */
+	struct list_head transactions;
+
+	/* Active watches. */
+	struct list_head watches;
+
+	/* Partial request. */
+	unsigned int len;
+	union {
+		struct xsd_sockmsg msg;
+		char buffer[PAGE_SIZE];
+	} u;
+
+	/* Response queue. */
+	struct mutex reply_mutex;
+	struct list_head read_buffers;
+	wait_queue_head_t read_waitq;
+
+};
+
+/* Read out any raw xenbus messages queued up. */
+static ssize_t xenbus_file_read(struct file *filp,
+			       char __user *ubuf,
+			       size_t len, loff_t *ppos)
+{
+	struct xenbus_file_priv *u = filp->private_data;
+	struct read_buffer *rb;
+	unsigned i;
+	int ret;
+
+	mutex_lock(&u->reply_mutex);
+again:
+	while (list_empty(&u->read_buffers)) {
+		mutex_unlock(&u->reply_mutex);
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		ret = wait_event_interruptible(u->read_waitq,
+					       !list_empty(&u->read_buffers));
+		if (ret)
+			return ret;
+		mutex_lock(&u->reply_mutex);
+	}
+
+	rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+	i = 0;
+	while (i < len) {
+		unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
+
+		ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
+
+		i += sz - ret;
+		rb->cons += sz - ret;
+
+		if (ret != 0) {
+			if (i == 0)
+				i = -EFAULT;
+			goto out;
+		}
+
+		/* Clear out buffer if it has been consumed */
+		if (rb->cons == rb->len) {
+			list_del(&rb->list);
+			kfree(rb);
+			if (list_empty(&u->read_buffers))
+				break;
+			rb = list_entry(u->read_buffers.next,
+					struct read_buffer, list);
+		}
+	}
+	if (i == 0)
+		goto again;
+
+out:
+	mutex_unlock(&u->reply_mutex);
+	return i;
+}
+
+/*
+ * Add a buffer to the queue.  Caller must hold the appropriate lock
+ * if the queue is not local.  (Commonly the caller will build up
+ * multiple queued buffers on a temporary local list, and then add it
+ * to the appropriate list under lock once all the buffers have been
+ * successfully allocated.)
+ */
+static int queue_reply(struct list_head *queue, const void *data, size_t len)
+{
+	struct read_buffer *rb;
+
+	if (len == 0)
+		return 0;
+
+	rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
+	if (rb == NULL)
+		return -ENOMEM;
+
+	rb->cons = 0;
+	rb->len = len;
+
+	memcpy(rb->msg, data, len);
+
+	list_add_tail(&rb->list, queue);
+	return 0;
+}
+
+/*
+ * Free all the read_buffer s on a list.
+ * Caller must have sole reference to list.
+ */
+static void queue_cleanup(struct list_head *list)
+{
+	struct read_buffer *rb;
+
+	while (!list_empty(list)) {
+		rb = list_entry(list->next, struct read_buffer, list);
+		list_del(list->next);
+		kfree(rb);
+	}
+}
+
+struct watch_adapter {
+	struct list_head list;
+	struct xenbus_watch watch;
+	struct xenbus_file_priv *dev_data;
+	char *token;
+};
+
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+	kfree(watch->watch.node);
+	kfree(watch->token);
+	kfree(watch);
+}
+
+static struct watch_adapter *alloc_watch_adapter(const char *path,
+						 const char *token)
+{
+	struct watch_adapter *watch;
+
+	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+	if (watch == NULL)
+		goto out_fail;
+
+	watch->watch.node = kstrdup(path, GFP_KERNEL);
+	if (watch->watch.node == NULL)
+		goto out_free;
+
+	watch->token = kstrdup(token, GFP_KERNEL);
+	if (watch->token == NULL)
+		goto out_free;
+
+	return watch;
+
+out_free:
+	free_watch_adapter(watch);
+
+out_fail:
+	return NULL;
+}
+
+static void watch_fired(struct xenbus_watch *watch,
+			const char **vec,
+			unsigned int len)
+{
+	struct watch_adapter *adap;
+	struct xsd_sockmsg hdr;
+	const char *path, *token;
+	int path_len, tok_len, body_len, data_len = 0;
+	int ret;
+	LIST_HEAD(staging_q);
+
+	adap = container_of(watch, struct watch_adapter, watch);
+
+	path = vec[XS_WATCH_PATH];
+	token = adap->token;
+
+	path_len = strlen(path) + 1;
+	tok_len = strlen(token) + 1;
+	if (len > 2)
+		data_len = vec[len] - vec[2] + 1;
+	body_len = path_len + tok_len + data_len;
+
+	hdr.type = XS_WATCH_EVENT;
+	hdr.len = body_len;
+
+	mutex_lock(&adap->dev_data->reply_mutex);
+
+	ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
+	if (!ret)
+		ret = queue_reply(&staging_q, path, path_len);
+	if (!ret)
+		ret = queue_reply(&staging_q, token, tok_len);
+	if (!ret && len > 2)
+		ret = queue_reply(&staging_q, vec[2], data_len);
+
+	if (!ret) {
+		/* success: pass reply list onto watcher */
+		list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
+		wake_up(&adap->dev_data->read_waitq);
+	} else
+		queue_cleanup(&staging_q);
+
+	mutex_unlock(&adap->dev_data->reply_mutex);
+}
+
+static int xenbus_write_transaction(unsigned msg_type,
+				    struct xenbus_file_priv *u)
+{
+	int rc;
+	void *reply;
+	struct xenbus_transaction_holder *trans = NULL;
+	LIST_HEAD(staging_q);
+
+	if (msg_type == XS_TRANSACTION_START) {
+		trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+		if (!trans) {
+			rc = -ENOMEM;
+			goto out;
+		}
+	}
+
+	reply = xenbus_dev_request_and_reply(&u->u.msg);
+	if (IS_ERR(reply)) {
+		kfree(trans);
+		rc = PTR_ERR(reply);
+		goto out;
+	}
+
+	if (msg_type == XS_TRANSACTION_START) {
+		trans->handle.id = simple_strtoul(reply, NULL, 0);
+
+		list_add(&trans->list, &u->transactions);
+	} else if (msg_type == XS_TRANSACTION_END) {
+		list_for_each_entry(trans, &u->transactions, list)
+			if (trans->handle.id == u->u.msg.tx_id)
+				break;
+		BUG_ON(&trans->list == &u->transactions);
+		list_del(&trans->list);
+
+		kfree(trans);
+	}
+
+	mutex_lock(&u->reply_mutex);
+	rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
+	if (!rc)
+		rc = queue_reply(&staging_q, reply, u->u.msg.len);
+	if (!rc) {
+		list_splice_tail(&staging_q, &u->read_buffers);
+		wake_up(&u->read_waitq);
+	} else {
+		queue_cleanup(&staging_q);
+	}
+	mutex_unlock(&u->reply_mutex);
+
+	kfree(reply);
+
+out:
+	return rc;
+}
+
+static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
+{
+	struct watch_adapter *watch, *tmp_watch;
+	char *path, *token;
+	int err, rc;
+	LIST_HEAD(staging_q);
+
+	path = u->u.buffer + sizeof(u->u.msg);
+	token = memchr(path, 0, u->u.msg.len);
+	if (token == NULL) {
+		rc = -EILSEQ;
+		goto out;
+	}
+	token++;
+
+	if (msg_type == XS_WATCH) {
+		watch = alloc_watch_adapter(path, token);
+		if (watch == NULL) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		watch->watch.callback = watch_fired;
+		watch->dev_data = u;
+
+		err = register_xenbus_watch(&watch->watch);
+		if (err) {
+			free_watch_adapter(watch);
+			rc = err;
+			goto out;
+		}
+		list_add(&watch->list, &u->watches);
+	} else {
+		list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+			if (!strcmp(watch->token, token) &&
+			    !strcmp(watch->watch.node, path)) {
+				unregister_xenbus_watch(&watch->watch);
+				list_del(&watch->list);
+				free_watch_adapter(watch);
+				break;
+			}
+		}
+	}
+
+	/* Success.  Synthesize a reply to say all is OK. */
+	{
+		struct {
+			struct xsd_sockmsg hdr;
+			char body[3];
+		} __packed reply = {
+			{
+				.type = msg_type,
+				.len = sizeof(reply.body)
+			},
+			"OK"
+		};
+
+		mutex_lock(&u->reply_mutex);
+		rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
+		wake_up(&u->read_waitq);
+		mutex_unlock(&u->reply_mutex);
+	}
+
+out:
+	return rc;
+}
+
+static ssize_t xenbus_file_write(struct file *filp,
+				const char __user *ubuf,
+				size_t len, loff_t *ppos)
+{
+	struct xenbus_file_priv *u = filp->private_data;
+	uint32_t msg_type;
+	int rc = len;
+	int ret;
+	LIST_HEAD(staging_q);
+
+	/*
+	 * We're expecting usermode to be writing properly formed
+	 * xenbus messages.  If they write an incomplete message we
+	 * buffer it up.  Once it is complete, we act on it.
+	 */
+
+	/*
+	 * Make sure concurrent writers can't stomp all over each
+	 * other's messages and make a mess of our partial message
+	 * buffer.  We don't make any attempt to stop multiple
+	 * writers from making a mess of each other's incomplete
+	 * messages; we're just trying to guarantee our own internal
+	 * consistency and make sure that single writes are handled
+	 * atomically.
+	 */
+	mutex_lock(&u->msgbuffer_mutex);
+
+	/* Get this out of the way early to avoid confusion */
+	if (len == 0)
+		goto out;
+
+	/* Can't write a xenbus message larger than we can buffer */
+	if ((len + u->len) > sizeof(u->u.buffer)) {
+		/* On error, dump existing buffer */
+		u->len = 0;
+		rc = -EINVAL;
+		goto out;
+	}
+
+	ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
+
+	if (ret != 0) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	/* Deal with a partial copy. */
+	len -= ret;
+	rc = len;
+
+	u->len += len;
+
+	/* Return if we haven't got a full message yet */
+	if (u->len < sizeof(u->u.msg))
+		goto out;	/* not even the header yet */
+
+	/* If we're expecting a message that's larger than we can
+	   possibly send, dump what we have and return an error. */
+	if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) {
+		rc = -E2BIG;
+		u->len = 0;
+		goto out;
+	}
+
+	if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
+		goto out;	/* incomplete data portion */
+
+	/*
+	 * OK, now we have a complete message.  Do something with it.
+	 */
+
+	msg_type = u->u.msg.type;
+
+	switch (msg_type) {
+	case XS_WATCH:
+	case XS_UNWATCH:
+		/* (Un)Ask for some path to be watched for changes */
+		ret = xenbus_write_watch(msg_type, u);
+		break;
+
+	default:
+		/* Send out a transaction */
+		ret = xenbus_write_transaction(msg_type, u);
+		break;
+	}
+	if (ret != 0)
+		rc = ret;
+
+	/* Buffered message consumed */
+	u->len = 0;
+
+ out:
+	mutex_unlock(&u->msgbuffer_mutex);
+	return rc;
+}
+
+static int xenbus_file_open(struct inode *inode, struct file *filp)
+{
+	struct xenbus_file_priv *u;
+
+	if (xen_store_evtchn == 0)
+		return -ENOENT;
+
+	nonseekable_open(inode, filp);
+
+	u = kzalloc(sizeof(*u), GFP_KERNEL);
+	if (u == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&u->transactions);
+	INIT_LIST_HEAD(&u->watches);
+	INIT_LIST_HEAD(&u->read_buffers);
+	init_waitqueue_head(&u->read_waitq);
+
+	mutex_init(&u->reply_mutex);
+	mutex_init(&u->msgbuffer_mutex);
+
+	filp->private_data = u;
+
+	return 0;
+}
+
+static int xenbus_file_release(struct inode *inode, struct file *filp)
+{
+	struct xenbus_file_priv *u = filp->private_data;
+	struct xenbus_transaction_holder *trans, *tmp;
+	struct watch_adapter *watch, *tmp_watch;
+	struct read_buffer *rb, *tmp_rb;
+
+	/*
+	 * No need for locking here because there are no other users,
+	 * by definition.
+	 */
+
+	list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
+		xenbus_transaction_end(trans->handle, 1);
+		list_del(&trans->list);
+		kfree(trans);
+	}
+
+	list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+		unregister_xenbus_watch(&watch->watch);
+		list_del(&watch->list);
+		free_watch_adapter(watch);
+	}
+
+	list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
+		list_del(&rb->list);
+		kfree(rb);
+	}
+	kfree(u);
+
+	return 0;
+}
+
+static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
+{
+	struct xenbus_file_priv *u = file->private_data;
+
+	poll_wait(file, &u->read_waitq, wait);
+	if (!list_empty(&u->read_buffers))
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+const struct file_operations xen_xenbus_fops = {
+	.read = xenbus_file_read,
+	.write = xenbus_file_write,
+	.open = xenbus_file_open,
+	.release = xenbus_file_release,
+	.poll = xenbus_file_poll,
+	.llseek = no_llseek,
+};
+EXPORT_SYMBOL_GPL(xen_xenbus_fops);
+
+static struct miscdevice xenbus_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "xen/xenbus",
+	.fops = &xen_xenbus_fops,
+};
+
+static int __init xenbus_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	err = misc_register(&xenbus_dev);
+	if (err)
+		printk(KERN_ERR "Could not register xenbus device\n");
+	return err;
+}
+
+static void __exit xenbus_exit(void)
+{
+	misc_deregister(&xenbus_dev);
+}
+
+module_init(xenbus_init);
+module_exit(xenbus_exit);
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
index 5d45ff1..b019865 100644
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_XENFS) += xenfs.o
 
-xenfs-y			  = super.o xenbus.o
+xenfs-y			  = super.o
 xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index a55fbf9..a84b53c 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -17,6 +17,7 @@
 
 #include "xenfs.h"
 #include "../privcmd.h"
+#include "../xenbus/xenbus_comms.h"
 
 #include <asm/xen/hypervisor.h>
 
@@ -83,7 +84,7 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	static struct tree_descr xenfs_files[] = {
 		[1] = {},
-		{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+		{ "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },
 		{ "capabilities", &capabilities_file_ops, S_IRUGO },
 		{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
 		{""},
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
deleted file mode 100644
index bbd000f..0000000
--- a/drivers/xen/xenfs/xenbus.c
+++ /dev/null
@@ -1,593 +0,0 @@
-/*
- * Driver giving user-space access to the kernel's xenbus connection
- * to xenstore.
- *
- * Copyright (c) 2005, Christian Limpach
- * Copyright (c) 2005, Rusty Russell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Changes:
- * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
- *                              and /proc/xen compatibility mount point.
- *                              Turned xenfs into a loadable module.
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/uio.h>
-#include <linux/notifier.h>
-#include <linux/wait.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/uaccess.h>
-#include <linux/init.h>
-#include <linux/namei.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-
-#include "xenfs.h"
-#include "../xenbus/xenbus_comms.h"
-
-#include <xen/xenbus.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * An element of a list of outstanding transactions, for which we're
- * still waiting a reply.
- */
-struct xenbus_transaction_holder {
-	struct list_head list;
-	struct xenbus_transaction handle;
-};
-
-/*
- * A buffer of data on the queue.
- */
-struct read_buffer {
-	struct list_head list;
-	unsigned int cons;
-	unsigned int len;
-	char msg[];
-};
-
-struct xenbus_file_priv {
-	/*
-	 * msgbuffer_mutex is held while partial requests are built up
-	 * and complete requests are acted on.  It therefore protects
-	 * the "transactions" and "watches" lists, and the partial
-	 * request length and buffer.
-	 *
-	 * reply_mutex protects the reply being built up to return to
-	 * usermode.  It nests inside msgbuffer_mutex but may be held
-	 * alone during a watch callback.
-	 */
-	struct mutex msgbuffer_mutex;
-
-	/* In-progress transactions */
-	struct list_head transactions;
-
-	/* Active watches. */
-	struct list_head watches;
-
-	/* Partial request. */
-	unsigned int len;
-	union {
-		struct xsd_sockmsg msg;
-		char buffer[PAGE_SIZE];
-	} u;
-
-	/* Response queue. */
-	struct mutex reply_mutex;
-	struct list_head read_buffers;
-	wait_queue_head_t read_waitq;
-
-};
-
-/* Read out any raw xenbus messages queued up. */
-static ssize_t xenbus_file_read(struct file *filp,
-			       char __user *ubuf,
-			       size_t len, loff_t *ppos)
-{
-	struct xenbus_file_priv *u = filp->private_data;
-	struct read_buffer *rb;
-	unsigned i;
-	int ret;
-
-	mutex_lock(&u->reply_mutex);
-again:
-	while (list_empty(&u->read_buffers)) {
-		mutex_unlock(&u->reply_mutex);
-		if (filp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-
-		ret = wait_event_interruptible(u->read_waitq,
-					       !list_empty(&u->read_buffers));
-		if (ret)
-			return ret;
-		mutex_lock(&u->reply_mutex);
-	}
-
-	rb = list_entry(u->read_buffers.next, struct read_buffer, list);
-	i = 0;
-	while (i < len) {
-		unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
-
-		ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
-
-		i += sz - ret;
-		rb->cons += sz - ret;
-
-		if (ret != 0) {
-			if (i == 0)
-				i = -EFAULT;
-			goto out;
-		}
-
-		/* Clear out buffer if it has been consumed */
-		if (rb->cons == rb->len) {
-			list_del(&rb->list);
-			kfree(rb);
-			if (list_empty(&u->read_buffers))
-				break;
-			rb = list_entry(u->read_buffers.next,
-					struct read_buffer, list);
-		}
-	}
-	if (i == 0)
-		goto again;
-
-out:
-	mutex_unlock(&u->reply_mutex);
-	return i;
-}
-
-/*
- * Add a buffer to the queue.  Caller must hold the appropriate lock
- * if the queue is not local.  (Commonly the caller will build up
- * multiple queued buffers on a temporary local list, and then add it
- * to the appropriate list under lock once all the buffers have een
- * successfully allocated.)
- */
-static int queue_reply(struct list_head *queue, const void *data, size_t len)
-{
-	struct read_buffer *rb;
-
-	if (len == 0)
-		return 0;
-
-	rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
-	if (rb == NULL)
-		return -ENOMEM;
-
-	rb->cons = 0;
-	rb->len = len;
-
-	memcpy(rb->msg, data, len);
-
-	list_add_tail(&rb->list, queue);
-	return 0;
-}
-
-/*
- * Free all the read_buffer s on a list.
- * Caller must have sole reference to list.
- */
-static void queue_cleanup(struct list_head *list)
-{
-	struct read_buffer *rb;
-
-	while (!list_empty(list)) {
-		rb = list_entry(list->next, struct read_buffer, list);
-		list_del(list->next);
-		kfree(rb);
-	}
-}
-
-struct watch_adapter {
-	struct list_head list;
-	struct xenbus_watch watch;
-	struct xenbus_file_priv *dev_data;
-	char *token;
-};
-
-static void free_watch_adapter(struct watch_adapter *watch)
-{
-	kfree(watch->watch.node);
-	kfree(watch->token);
-	kfree(watch);
-}
-
-static struct watch_adapter *alloc_watch_adapter(const char *path,
-						 const char *token)
-{
-	struct watch_adapter *watch;
-
-	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
-	if (watch == NULL)
-		goto out_fail;
-
-	watch->watch.node = kstrdup(path, GFP_KERNEL);
-	if (watch->watch.node == NULL)
-		goto out_free;
-
-	watch->token = kstrdup(token, GFP_KERNEL);
-	if (watch->token == NULL)
-		goto out_free;
-
-	return watch;
-
-out_free:
-	free_watch_adapter(watch);
-
-out_fail:
-	return NULL;
-}
-
-static void watch_fired(struct xenbus_watch *watch,
-			const char **vec,
-			unsigned int len)
-{
-	struct watch_adapter *adap;
-	struct xsd_sockmsg hdr;
-	const char *path, *token;
-	int path_len, tok_len, body_len, data_len = 0;
-	int ret;
-	LIST_HEAD(staging_q);
-
-	adap = container_of(watch, struct watch_adapter, watch);
-
-	path = vec[XS_WATCH_PATH];
-	token = adap->token;
-
-	path_len = strlen(path) + 1;
-	tok_len = strlen(token) + 1;
-	if (len > 2)
-		data_len = vec[len] - vec[2] + 1;
-	body_len = path_len + tok_len + data_len;
-
-	hdr.type = XS_WATCH_EVENT;
-	hdr.len = body_len;
-
-	mutex_lock(&adap->dev_data->reply_mutex);
-
-	ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
-	if (!ret)
-		ret = queue_reply(&staging_q, path, path_len);
-	if (!ret)
-		ret = queue_reply(&staging_q, token, tok_len);
-	if (!ret && len > 2)
-		ret = queue_reply(&staging_q, vec[2], data_len);
-
-	if (!ret) {
-		/* success: pass reply list onto watcher */
-		list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
-		wake_up(&adap->dev_data->read_waitq);
-	} else
-		queue_cleanup(&staging_q);
-
-	mutex_unlock(&adap->dev_data->reply_mutex);
-}
-
-static int xenbus_write_transaction(unsigned msg_type,
-				    struct xenbus_file_priv *u)
-{
-	int rc;
-	void *reply;
-	struct xenbus_transaction_holder *trans = NULL;
-	LIST_HEAD(staging_q);
-
-	if (msg_type == XS_TRANSACTION_START) {
-		trans = kmalloc(sizeof(*trans), GFP_KERNEL);
-		if (!trans) {
-			rc = -ENOMEM;
-			goto out;
-		}
-	}
-
-	reply = xenbus_dev_request_and_reply(&u->u.msg);
-	if (IS_ERR(reply)) {
-		kfree(trans);
-		rc = PTR_ERR(reply);
-		goto out;
-	}
-
-	if (msg_type == XS_TRANSACTION_START) {
-		trans->handle.id = simple_strtoul(reply, NULL, 0);
-
-		list_add(&trans->list, &u->transactions);
-	} else if (msg_type == XS_TRANSACTION_END) {
-		list_for_each_entry(trans, &u->transactions, list)
-			if (trans->handle.id == u->u.msg.tx_id)
-				break;
-		BUG_ON(&trans->list == &u->transactions);
-		list_del(&trans->list);
-
-		kfree(trans);
-	}
-
-	mutex_lock(&u->reply_mutex);
-	rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
-	if (!rc)
-		rc = queue_reply(&staging_q, reply, u->u.msg.len);
-	if (!rc) {
-		list_splice_tail(&staging_q, &u->read_buffers);
-		wake_up(&u->read_waitq);
-	} else {
-		queue_cleanup(&staging_q);
-	}
-	mutex_unlock(&u->reply_mutex);
-
-	kfree(reply);
-
-out:
-	return rc;
-}
-
-static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
-{
-	struct watch_adapter *watch, *tmp_watch;
-	char *path, *token;
-	int err, rc;
-	LIST_HEAD(staging_q);
-
-	path = u->u.buffer + sizeof(u->u.msg);
-	token = memchr(path, 0, u->u.msg.len);
-	if (token == NULL) {
-		rc = -EILSEQ;
-		goto out;
-	}
-	token++;
-
-	if (msg_type == XS_WATCH) {
-		watch = alloc_watch_adapter(path, token);
-		if (watch == NULL) {
-			rc = -ENOMEM;
-			goto out;
-		}
-
-		watch->watch.callback = watch_fired;
-		watch->dev_data = u;
-
-		err = register_xenbus_watch(&watch->watch);
-		if (err) {
-			free_watch_adapter(watch);
-			rc = err;
-			goto out;
-		}
-		list_add(&watch->list, &u->watches);
-	} else {
-		list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
-			if (!strcmp(watch->token, token) &&
-			    !strcmp(watch->watch.node, path)) {
-				unregister_xenbus_watch(&watch->watch);
-				list_del(&watch->list);
-				free_watch_adapter(watch);
-				break;
-			}
-		}
-	}
-
-	/* Success.  Synthesize a reply to say all is OK. */
-	{
-		struct {
-			struct xsd_sockmsg hdr;
-			char body[3];
-		} __packed reply = {
-			{
-				.type = msg_type,
-				.len = sizeof(reply.body)
-			},
-			"OK"
-		};
-
-		mutex_lock(&u->reply_mutex);
-		rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
-		wake_up(&u->read_waitq);
-		mutex_unlock(&u->reply_mutex);
-	}
-
-out:
-	return rc;
-}
-
-static ssize_t xenbus_file_write(struct file *filp,
-				const char __user *ubuf,
-				size_t len, loff_t *ppos)
-{
-	struct xenbus_file_priv *u = filp->private_data;
-	uint32_t msg_type;
-	int rc = len;
-	int ret;
-	LIST_HEAD(staging_q);
-
-	/*
-	 * We're expecting usermode to be writing properly formed
-	 * xenbus messages.  If they write an incomplete message we
-	 * buffer it up.  Once it is complete, we act on it.
-	 */
-
-	/*
-	 * Make sure concurrent writers can't stomp all over each
-	 * other's messages and make a mess of our partial message
-	 * buffer.  We don't make any attemppt to stop multiple
-	 * writers from making a mess of each other's incomplete
-	 * messages; we're just trying to guarantee our own internal
-	 * consistency and make sure that single writes are handled
-	 * atomically.
-	 */
-	mutex_lock(&u->msgbuffer_mutex);
-
-	/* Get this out of the way early to avoid confusion */
-	if (len == 0)
-		goto out;
-
-	/* Can't write a xenbus message larger we can buffer */
-	if ((len + u->len) > sizeof(u->u.buffer)) {
-		/* On error, dump existing buffer */
-		u->len = 0;
-		rc = -EINVAL;
-		goto out;
-	}
-
-	ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
-
-	if (ret != 0) {
-		rc = -EFAULT;
-		goto out;
-	}
-
-	/* Deal with a partial copy. */
-	len -= ret;
-	rc = len;
-
-	u->len += len;
-
-	/* Return if we haven't got a full message yet */
-	if (u->len < sizeof(u->u.msg))
-		goto out;	/* not even the header yet */
-
-	/* If we're expecting a message that's larger than we can
-	   possibly send, dump what we have and return an error. */
-	if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) {
-		rc = -E2BIG;
-		u->len = 0;
-		goto out;
-	}
-
-	if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
-		goto out;	/* incomplete data portion */
-
-	/*
-	 * OK, now we have a complete message.  Do something with it.
-	 */
-
-	msg_type = u->u.msg.type;
-
-	switch (msg_type) {
-	case XS_WATCH:
-	case XS_UNWATCH:
-		/* (Un)Ask for some path to be watched for changes */
-		ret = xenbus_write_watch(msg_type, u);
-		break;
-
-	default:
-		/* Send out a transaction */
-		ret = xenbus_write_transaction(msg_type, u);
-		break;
-	}
-	if (ret != 0)
-		rc = ret;
-
-	/* Buffered message consumed */
-	u->len = 0;
-
- out:
-	mutex_unlock(&u->msgbuffer_mutex);
-	return rc;
-}
-
-static int xenbus_file_open(struct inode *inode, struct file *filp)
-{
-	struct xenbus_file_priv *u;
-
-	if (xen_store_evtchn == 0)
-		return -ENOENT;
-
-	nonseekable_open(inode, filp);
-
-	u = kzalloc(sizeof(*u), GFP_KERNEL);
-	if (u == NULL)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&u->transactions);
-	INIT_LIST_HEAD(&u->watches);
-	INIT_LIST_HEAD(&u->read_buffers);
-	init_waitqueue_head(&u->read_waitq);
-
-	mutex_init(&u->reply_mutex);
-	mutex_init(&u->msgbuffer_mutex);
-
-	filp->private_data = u;
-
-	return 0;
-}
-
-static int xenbus_file_release(struct inode *inode, struct file *filp)
-{
-	struct xenbus_file_priv *u = filp->private_data;
-	struct xenbus_transaction_holder *trans, *tmp;
-	struct watch_adapter *watch, *tmp_watch;
-	struct read_buffer *rb, *tmp_rb;
-
-	/*
-	 * No need for locking here because there are no other users,
-	 * by definition.
-	 */
-
-	list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
-		xenbus_transaction_end(trans->handle, 1);
-		list_del(&trans->list);
-		kfree(trans);
-	}
-
-	list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
-		unregister_xenbus_watch(&watch->watch);
-		list_del(&watch->list);
-		free_watch_adapter(watch);
-	}
-
-	list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
-		list_del(&rb->list);
-		kfree(rb);
-	}
-	kfree(u);
-
-	return 0;
-}
-
-static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
-{
-	struct xenbus_file_priv *u = file->private_data;
-
-	poll_wait(file, &u->read_waitq, wait);
-	if (!list_empty(&u->read_buffers))
-		return POLLIN | POLLRDNORM;
-	return 0;
-}
-
-const struct file_operations xenbus_file_ops = {
-	.read = xenbus_file_read,
-	.write = xenbus_file_write,
-	.open = xenbus_file_open,
-	.release = xenbus_file_release,
-	.poll = xenbus_file_poll,
-	.llseek = no_llseek,
-};
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
index 5056306..6b80c77 100644
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -1,7 +1,6 @@
 #ifndef _XENFS_XENBUS_H
 #define _XENFS_XENBUS_H
 
-extern const struct file_operations xenbus_file_ops;
 extern const struct file_operations xsd_kva_file_ops;
 extern const struct file_operations xsd_port_file_ops;
 
-- 
1.7.7.3

Bastian Blank

2011-Nov-27 22:07 UTC

head link

[PATCH 5/5] xen: Add xenbusd device driver

Access for xenstored to the event channel and pre-allocated ring is
managed via xenfs.  This adds its own device driver featuring mmap for
the ring and an ioctl for the event channel.

Signed-off-by: Bastian Blank <waldi@debian.org>
---
 drivers/xen/xenbus/Makefile             |    1 +
 drivers/xen/xenbus/xenbus_dev_backend.c |   79 +++++++++++++++++++++++++++++++
 include/xen/xenbus_dev.h                |   41 ++++++++++++++++
 3 files changed, 121 insertions(+), 0 deletions(-)
 create mode 100644 drivers/xen/xenbus/xenbus_dev_backend.c
 create mode 100644 include/xen/xenbus_dev.h

diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
index a2ea363..7e1aa85 100644
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -10,4 +10,5 @@ xenbus-objs += xenbus_probe.o
 xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
 xenbus-objs += $(xenbus-be-objs-y)
 
+obj-$(CONFIG_XEN_DOM0) += xenbus_dev_backend.o
 obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c
new file mode 100644
index 0000000..5d77cee
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev_backend.c
@@ -0,0 +1,79 @@
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include <xen/page.h>
+#include <xen/xenbus_dev.h>
+
+#include "xenbus_comms.h"
+
+MODULE_LICENSE("GPL");
+
+static long xenbusd_ioctl(struct file *file, unsigned int cmd, unsigned long data)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	switch (cmd) {
+		case IOCTL_XENBUSD_EVTCHN:
+			if (xen_store_evtchn > 0)
+				return xen_store_evtchn;
+			return -EINVAL;
+
+		default:
+			return -ENOTTY;
+	}
+}
+
+static int xenbusd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+		return -EINVAL;
+
+	if (remap_pfn_range(vma, vma->vm_start,
+			    virt_to_pfn(xen_store_interface),
+			    size, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+const struct file_operations xenbusd_fops = {
+	.mmap = xenbusd_mmap,
+	.unlocked_ioctl = xenbusd_ioctl,
+};
+
+static struct miscdevice xenbusd_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "xen/xenbusd",
+	.fops = &xenbusd_fops,
+};
+
+static int __init xenbusd_init(void)
+{
+	int err;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	err = misc_register(&xenbusd_dev);
+	if (err)
+		printk(KERN_ERR "Could not register xenbus device\n");
+	return err;
+}
+
+static void __exit xenbusd_exit(void)
+{
+	misc_deregister(&xenbusd_dev);
+}
+
+module_init(xenbusd_init);
+module_exit(xenbusd_exit);
diff --git a/include/xen/xenbus_dev.h b/include/xen/xenbus_dev.h
new file mode 100644
index 0000000..f551404
--- /dev/null
+++ b/include/xen/xenbus_dev.h
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * evtchn.h
+ *
+ * Interface to /dev/xen/xenbusd.
+ *
+ * Copyright (c) 2011 Bastian Blank <waldi@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_XEN_XENBUS_DEV_H__
+#define __LINUX_XEN_XENBUS_DEV_H__
+
+#include <linux/ioctl.h>
+
+#define IOCTL_XENBUSD_EVTCHN				\
+	_IOC(_IOC_NONE, 'X', 0, 0)
+
+#endif /* __LINUX_XEN_XENBUS_DEV_H__ */
-- 
1.7.7.3

Christoph Egger

2011-Nov-28 09:03 UTC

head link

Re: [PATCH 0/5] Move stuff out of xenfs

On 11/27/11 23:07, Bastian Blank wrote:
> Over a year ago I started a discussion about xenfs. This is the first
> try to add the stuff in xenfs as regular devices and a sysfs file.
Is this intended to go into the xen kernel?

Christoph

> Patches for xen tools will follow.
>
> Bastian

-- 
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85689 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

Ian Campbell

2011-Nov-28 09:31 UTC

head link

Re: [PATCH 0/5] Move stuff out of xenfs

On Mon, 2011-11-28 at 09:03 +0000, Christoph Egger wrote:
> On 11/27/11 23:07, Bastian Blank wrote:
> > Over a year ago I started a discussion about xenfs. This is the first
> > try to add the stuff in xenfs as regular devices and a sysfs file.
> 
> Is this intended to go into the xen kernel?
No, these are Linux kernel patches.

Ian.

Christoph Egger

2011-Nov-28 09:39 UTC

head link

Re: [PATCH 0/5] Move stuff out of xenfs

On 11/28/11 10:31, Ian Campbell wrote:
> On Mon, 2011-11-28 at 09:03 +0000, Christoph Egger wrote:
>> On 11/27/11 23:07, Bastian Blank wrote:
>>> Over a year ago I started a discussion about xenfs. This is the first
>>> try to add the stuff in xenfs as regular devices and a sysfs file.
>>
>> Is this intended to go into the xen kernel?
>
> No, these are Linux kernel patches.
Shouldn't they go to lkml instead?

Christoph


-- 
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85689 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

Ian Campbell

2011-Nov-28 09:51 UTC

head link

Re: [PATCH 0/5] Move stuff out of xenfs

On Mon, 2011-11-28 at 09:39 +0000, Christoph Egger wrote:
> On 11/28/11 10:31, Ian Campbell wrote:
> > On Mon, 2011-11-28 at 09:03 +0000, Christoph Egger wrote:
> >> On 11/27/11 23:07, Bastian Blank wrote:
> >>> Over a year ago I started a discussion about xenfs. This is the first
> >>> try to add the stuff in xenfs as regular devices and a sysfs file.
> >>
> >> Is this intended to go into the xen kernel?
> >
> > No, these are Linux kernel patches.
> 
> Shouldn't they go to lkml instead?
Ideally they would go to both lists and whoever is listed in MAINTAINERS
for these files.

Ian.

Ian Campbell

2011-Nov-28 16:26 UTC

head link

Re: [PATCH 2/5] xen: Add privcmd device driver

On Sun, 2011-11-27 at 22:07 +0000, Bastian Blank wrote:
> @@ -84,7 +85,7 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
>                 [1] = {},
>                 { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
>                 { "capabilities", &capabilities_file_ops, S_IRUGO },
> -               { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
> +               { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
>                 {""},
>         };
>         int rc;
I wonder if we could do something dumb like make /proc/xen/privcmd be a
symlink to /dev/xen/privcmd instead of introducing this cross module
dependency?

The main reason would be to avoid the select since selecting on user
visible symbols is a recipe for confusion and is generally advised
against.

Perhaps the xen-privcmd.ko should simply call a newly introduced
xenfs_register()? This would be a nop if CONFIG_XENFS=n and therefore
the select would not be necessary. If CONFIG_XENFS=[my] then modprobe
will do the right thing.

Ian.

Bastian Blank

2011-Nov-28 17:39 UTC

head link

Re: [PATCH 2/5] xen: Add privcmd device driver

On Mon, Nov 28, 2011 at 04:26:53PM +0000, Ian Campbell wrote:
> I wonder if we could do something dumb like make /proc/xen/privcmd by a
> symlink to /dev/xen/privcmd instead of introducing this cross module
> dependency?
Sure. However, this makes a dependency from kernel to the userspace naming
policy. And the xenfs module still needs some sort of dependency on
privcmd, so it gets loaded.
> The main reason would be to avoid the select since selecting on user
> visible symbols is a recipe for confusion and is generally advised
> against.
Right. It is just the easiest solution.
> Perhaps the xen-privcmd.ko should simply call a newly introduced
> xenfs_register()? This would be a nop if CONFIG_XENFS=n and therefore
> the select would not be necessary. If COFIG_XENFS=[my] then modprobe
> will do the right thing.
This would not be backward compatible. And I'd like to avoid a
dependency from privcmd to xenfs.

Bastian

-- 
Not one hundred percent efficient, of course ... but nothing ever is.
		-- Kirk, "Metamorphosis", stardate 3219.8

Ian Campbell

2011-Nov-28 18:00 UTC

head link

Re: [PATCH 2/5] xen: Add privcmd device driver

On Mon, 2011-11-28 at 17:39 +0000, Bastian Blank wrote:
> On Mon, Nov 28, 2011 at 04:26:53PM +0000, Ian Campbell wrote:
> > I wonder if we could do something dumb like make /proc/xen/privcmd by a
> > symlink to /dev/xen/privcmd instead of introducing this cross module
> > dependency?
> 
> Sure. However this make a dependency from kernel to the userspace naming
> policy. And the xenfs module still needs some sort of dependency on
> privcmd, so it gets loaded.
> 
> > The main reason would be to avoid the select since selecting on user
> > visible symbols is a recipe for confusion and is generally advised
> > against.
> 
> Right. It is just the easiest solution.
XENFS could depend on XEN_PRIVCMD and whatever else it needs?
> > Perhaps the xen-privcmd.ko should simply call a newly introduced
> > xenfs_register()? This would be a nop if CONFIG_XENFS=n and therefore
> > the select would not be necessary. If COFIG_XENFS=[my] then modprobe
> > will do the right thing.
> 
> This would be not backward compatible. And I'd like to avoid a
> dependency from privcmd to xenfs.
That's a reasonable goal.

Ian.

Konrad Rzeszutek Wilk

2011-Nov-28 18:10 UTC

head link

Re: [PATCH 4/5] xen: Add xenbus device driver

On Sun, Nov 27, 2011 at 11:07:07PM +0100, Bastian Blank wrote:
> Access to xenbus is currently handled via xenfs. This adds a device
> driver for xenbus and makes xenfs use this code.
> 
> Signed-off-by: Bastian Blank <waldi@debian.org>
> ---
>  drivers/xen/xenbus/Makefile              |    1 +
>  drivers/xen/xenbus/xenbus_comms.h        |    4 +
>  drivers/xen/xenbus/xenbus_dev_frontend.c |  624 ++++++++++++++++++++++++++++++
>  drivers/xen/xenfs/Makefile               |    2 +-
>  drivers/xen/xenfs/super.c                |    3 +-
>  drivers/xen/xenfs/xenbus.c               |  593 ----------------------------
>  drivers/xen/xenfs/xenfs.h                |    1 -
Can you use 'git mv' please?

This looks like you are just moving the file around.
>  7 files changed, 632 insertions(+), 596 deletions(-)
>  create mode 100644 drivers/xen/xenbus/xenbus_dev_frontend.c
>  delete mode 100644 drivers/xen/xenfs/xenbus.c
> 
> diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
> index 8dca685..a2ea363 100644
> --- a/drivers/xen/xenbus/Makefile
> +++ b/drivers/xen/xenbus/Makefile
> @@ -1,4 +1,5 @@
>  obj-y	+= xenbus.o
> +obj-y	+= xenbus_dev_frontend.o
>  
>  xenbus-objs =
>  xenbus-objs += xenbus_client.o
> diff --git a/drivers/xen/xenbus/xenbus_comms.h
b/drivers/xen/xenbus/xenbus_comms.h
> index c21db75..6e42800 100644
> --- a/drivers/xen/xenbus/xenbus_comms.h
> +++ b/drivers/xen/xenbus/xenbus_comms.h
> @@ -31,6 +31,8 @@
>  #ifndef _XENBUS_COMMS_H
>  #define _XENBUS_COMMS_H
>  
> +#include <linux/fs.h>
> +
>  int xs_init(void);
>  int xb_init_comms(void);
>  
> @@ -43,4 +45,6 @@ int xs_input_avail(void);
>  extern struct xenstore_domain_interface *xen_store_interface;
>  extern int xen_store_evtchn;
>  
> +extern const struct file_operations xen_xenbus_fops;
> +
>  #endif /* _XENBUS_COMMS_H */
> diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c
b/drivers/xen/xenbus/xenbus_dev_frontend.c
> new file mode 100644
> index 0000000..fb30cff
> --- /dev/null
> +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
> @@ -0,0 +1,624 @@
> +/*
> + * Driver giving user-space access to the kernel''s xenbus
connection
> + * to xenstore.
> + *
> + * Copyright (c) 2005, Christian Limpach
> + * Copyright (c) 2005, Rusty Russell, IBM Corporation
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version 2
> + * as published by the Free Software Foundation; or, when distributed
> + * separately from the Linux kernel or incorporated into other
> + * software packages, subject to the following license:
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
copy
> + * of this source file (the "Software"), to deal in the Software
without
> + * restriction, including without limitation the rights to use, copy,
modify,
> + * merge, publish, distribute, sublicense, and/or sell copies of the
Software,
> + * and to permit persons to whom the Software is furnished to do so,
subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included
in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE
> + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * Changes:
> + * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs
filesystem
> + *                              and /proc/xen compatibility mount point.
> + *                              Turned xenfs into a loadable module.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/errno.h>
> +#include <linux/uio.h>
> +#include <linux/notifier.h>
> +#include <linux/wait.h>
> +#include <linux/fs.h>
> +#include <linux/poll.h>
> +#include <linux/mutex.h>
> +#include <linux/sched.h>
> +#include <linux/spinlock.h>
> +#include <linux/mount.h>
> +#include <linux/pagemap.h>
> +#include <linux/uaccess.h>
> +#include <linux/init.h>
> +#include <linux/namei.h>
> +#include <linux/string.h>
> +#include <linux/slab.h>
> +#include <linux/miscdevice.h>
> +#include <linux/module.h>
> +
> +#include "xenbus_comms.h"
> +
> +#include <xen/xenbus.h>
> +#include <asm/xen/hypervisor.h>
> +
> +MODULE_LICENSE("GPL");
> +
> +/*
> + * An element of a list of outstanding transactions, for which
we''re
> + * still waiting a reply.
> + */
> +struct xenbus_transaction_holder {
> +	struct list_head list;
> +	struct xenbus_transaction handle;
> +};
> +
> +/*
> + * A buffer of data on the queue.
> + */
> +struct read_buffer {
> +	struct list_head list;
> +	unsigned int cons;
> +	unsigned int len;
> +	char msg[];
> +};
> +
> +struct xenbus_file_priv {
> +	/*
> +	 * msgbuffer_mutex is held while partial requests are built up
> +	 * and complete requests are acted on.  It therefore protects
> +	 * the "transactions" and "watches" lists, and the
partial
> +	 * request length and buffer.
> +	 *
> +	 * reply_mutex protects the reply being built up to return to
> +	 * usermode.  It nests inside msgbuffer_mutex but may be held
> +	 * alone during a watch callback.
> +	 */
> +	struct mutex msgbuffer_mutex;
> +
> +	/* In-progress transactions */
> +	struct list_head transactions;
> +
> +	/* Active watches. */
> +	struct list_head watches;
> +
> +	/* Partial request. */
> +	unsigned int len;
> +	union {
> +		struct xsd_sockmsg msg;
> +		char buffer[PAGE_SIZE];
> +	} u;
> +
> +	/* Response queue. */
> +	struct mutex reply_mutex;
> +	struct list_head read_buffers;
> +	wait_queue_head_t read_waitq;
> +
> +};
> +
> +/* Read out any raw xenbus messages queued up. */
> +static ssize_t xenbus_file_read(struct file *filp,
> +			       char __user *ubuf,
> +			       size_t len, loff_t *ppos)
> +{
> +	struct xenbus_file_priv *u = filp->private_data;
> +	struct read_buffer *rb;
> +	unsigned i;
> +	int ret;
> +
> +	mutex_lock(&u->reply_mutex);
> +again:
> +	while (list_empty(&u->read_buffers)) {
> +		mutex_unlock(&u->reply_mutex);
> +		if (filp->f_flags & O_NONBLOCK)
> +			return -EAGAIN;
> +
> +		ret = wait_event_interruptible(u->read_waitq,
> +					       !list_empty(&u->read_buffers));
> +		if (ret)
> +			return ret;
> +		mutex_lock(&u->reply_mutex);
> +	}
> +
> +	rb = list_entry(u->read_buffers.next, struct read_buffer, list);
> +	i = 0;
> +	while (i < len) {
> +		unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
> +
> +		ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
> +
> +		i += sz - ret;
> +		rb->cons += sz - ret;
> +
> +		if (ret != 0) {
> +			if (i == 0)
> +				i = -EFAULT;
> +			goto out;
> +		}
> +
> +		/* Clear out buffer if it has been consumed */
> +		if (rb->cons == rb->len) {
> +			list_del(&rb->list);
> +			kfree(rb);
> +			if (list_empty(&u->read_buffers))
> +				break;
> +			rb = list_entry(u->read_buffers.next,
> +					struct read_buffer, list);
> +		}
> +	}
> +	if (i == 0)
> +		goto again;
> +
> +out:
> +	mutex_unlock(&u->reply_mutex);
> +	return i;
> +}
> +
> +/*
> + * Add a buffer to the queue.  Caller must hold the appropriate lock
> + * if the queue is not local.  (Commonly the caller will build up
> + * multiple queued buffers on a temporary local list, and then add it
> + * to the appropriate list under lock once all the buffers have een
> + * successfully allocated.)
> + */
> +static int queue_reply(struct list_head *queue, const void *data, size_t
len)
> +{
> +	struct read_buffer *rb;
> +
> +	if (len == 0)
> +		return 0;
> +
> +	rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
> +	if (rb == NULL)
> +		return -ENOMEM;
> +
> +	rb->cons = 0;
> +	rb->len = len;
> +
> +	memcpy(rb->msg, data, len);
> +
> +	list_add_tail(&rb->list, queue);
> +	return 0;
> +}
> +
> +/*
> + * Free all the read_buffer s on a list.
> + * Caller must have sole reference to list.
> + */
> +static void queue_cleanup(struct list_head *list)
> +{
> +	struct read_buffer *rb;
> +
> +	while (!list_empty(list)) {
> +		rb = list_entry(list->next, struct read_buffer, list);
> +		list_del(list->next);
> +		kfree(rb);
> +	}
> +}
> +
> +struct watch_adapter {
> +	struct list_head list;
> +	struct xenbus_watch watch;
> +	struct xenbus_file_priv *dev_data;
> +	char *token;
> +};
> +
> +static void free_watch_adapter(struct watch_adapter *watch)
> +{
> +	kfree(watch->watch.node);
> +	kfree(watch->token);
> +	kfree(watch);
> +}
> +
> +static struct watch_adapter *alloc_watch_adapter(const char *path,
> +						 const char *token)
> +{
> +	struct watch_adapter *watch;
> +
> +	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
> +	if (watch == NULL)
> +		goto out_fail;
> +
> +	watch->watch.node = kstrdup(path, GFP_KERNEL);
> +	if (watch->watch.node == NULL)
> +		goto out_free;
> +
> +	watch->token = kstrdup(token, GFP_KERNEL);
> +	if (watch->token == NULL)
> +		goto out_free;
> +
> +	return watch;
> +
> +out_free:
> +	free_watch_adapter(watch);
> +
> +out_fail:
> +	return NULL;
> +}
> +
> +static void watch_fired(struct xenbus_watch *watch,
> +			const char **vec,
> +			unsigned int len)
> +{
> +	struct watch_adapter *adap;
> +	struct xsd_sockmsg hdr;
> +	const char *path, *token;
> +	int path_len, tok_len, body_len, data_len = 0;
> +	int ret;
> +	LIST_HEAD(staging_q);
> +
> +	adap = container_of(watch, struct watch_adapter, watch);
> +
> +	path = vec[XS_WATCH_PATH];
> +	token = adap->token;
> +
> +	path_len = strlen(path) + 1;
> +	tok_len = strlen(token) + 1;
> +	if (len > 2)
> +		data_len = vec[len] - vec[2] + 1;
> +	body_len = path_len + tok_len + data_len;
> +
> +	hdr.type = XS_WATCH_EVENT;
> +	hdr.len = body_len;
> +
> +	mutex_lock(&adap->dev_data->reply_mutex);
> +
> +	ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
> +	if (!ret)
> +		ret = queue_reply(&staging_q, path, path_len);
> +	if (!ret)
> +		ret = queue_reply(&staging_q, token, tok_len);
> +	if (!ret && len > 2)
> +		ret = queue_reply(&staging_q, vec[2], data_len);
> +
> +	if (!ret) {
> +		/* success: pass reply list onto watcher */
> +		list_splice_tail(&staging_q,
&adap->dev_data->read_buffers);
> +		wake_up(&adap->dev_data->read_waitq);
> +	} else
> +		queue_cleanup(&staging_q);
> +
> +	mutex_unlock(&adap->dev_data->reply_mutex);
> +}
> +
> +static int xenbus_write_transaction(unsigned msg_type,
> +				    struct xenbus_file_priv *u)
> +{
> +	int rc;
> +	void *reply;
> +	struct xenbus_transaction_holder *trans = NULL;
> +	LIST_HEAD(staging_q);
> +
> +	if (msg_type == XS_TRANSACTION_START) {
> +		trans = kmalloc(sizeof(*trans), GFP_KERNEL);
> +		if (!trans) {
> +			rc = -ENOMEM;
> +			goto out;
> +		}
> +	}
> +
> +	reply = xenbus_dev_request_and_reply(&u->u.msg);
> +	if (IS_ERR(reply)) {
> +		kfree(trans);
> +		rc = PTR_ERR(reply);
> +		goto out;
> +	}
> +
> +	if (msg_type == XS_TRANSACTION_START) {
> +		trans->handle.id = simple_strtoul(reply, NULL, 0);
> +
> +		list_add(&trans->list, &u->transactions);
> +	} else if (msg_type == XS_TRANSACTION_END) {
> +		list_for_each_entry(trans, &u->transactions, list)
> +			if (trans->handle.id == u->u.msg.tx_id)
> +				break;
> +		BUG_ON(&trans->list == &u->transactions);
> +		list_del(&trans->list);
> +
> +		kfree(trans);
> +	}
> +
> +	mutex_lock(&u->reply_mutex);
> +	rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
> +	if (!rc)
> +		rc = queue_reply(&staging_q, reply, u->u.msg.len);
> +	if (!rc) {
> +		list_splice_tail(&staging_q, &u->read_buffers);
> +		wake_up(&u->read_waitq);
> +	} else {
> +		queue_cleanup(&staging_q);
> +	}
> +	mutex_unlock(&u->reply_mutex);
> +
> +	kfree(reply);
> +
> +out:
> +	return rc;
> +}
> +
> +static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv
*u)
> +{
> +	struct watch_adapter *watch, *tmp_watch;
> +	char *path, *token;
> +	int err, rc;
> +	LIST_HEAD(staging_q);
> +
> +	path = u->u.buffer + sizeof(u->u.msg);
> +	token = memchr(path, 0, u->u.msg.len);
> +	if (token == NULL) {
> +		rc = -EILSEQ;
> +		goto out;
> +	}
> +	token++;
> +
> +	if (msg_type == XS_WATCH) {
> +		watch = alloc_watch_adapter(path, token);
> +		if (watch == NULL) {
> +			rc = -ENOMEM;
> +			goto out;
> +		}
> +
> +		watch->watch.callback = watch_fired;
> +		watch->dev_data = u;
> +
> +		err = register_xenbus_watch(&watch->watch);
> +		if (err) {
> +			free_watch_adapter(watch);
> +			rc = err;
> +			goto out;
> +		}
> +		list_add(&watch->list, &u->watches);
> +	} else {
> +		list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
> +			if (!strcmp(watch->token, token) &&
> +			    !strcmp(watch->watch.node, path)) {
> +				unregister_xenbus_watch(&watch->watch);
> +				list_del(&watch->list);
> +				free_watch_adapter(watch);
> +				break;
> +			}
> +		}
> +	}
> +
> +	/* Success.  Synthesize a reply to say all is OK. */
> +	{
> +		struct {
> +			struct xsd_sockmsg hdr;
> +			char body[3];
> +		} __packed reply = {
> +			{
> +				.type = msg_type,
> +				.len = sizeof(reply.body)
> +			},
> +			"OK"
> +		};
> +
> +		mutex_lock(&u->reply_mutex);
> +		rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
> +		wake_up(&u->read_waitq);
> +		mutex_unlock(&u->reply_mutex);
> +	}
> +
> +out:
> +	return rc;
> +}
> +
> +static ssize_t xenbus_file_write(struct file *filp,
> +				const char __user *ubuf,
> +				size_t len, loff_t *ppos)
> +{
> +	struct xenbus_file_priv *u = filp->private_data;
> +	uint32_t msg_type;
> +	int rc = len;
> +	int ret;
> +	LIST_HEAD(staging_q);
> +
> +	/*
> +	 * We''re expecting usermode to be writing properly formed
> +	 * xenbus messages.  If they write an incomplete message we
> +	 * buffer it up.  Once it is complete, we act on it.
> +	 */
> +
> +	/*
> +	 * Make sure concurrent writers can''t stomp all over each
> +	 * other''s messages and make a mess of our partial message
> +	 * buffer.  We don''t make any attemppt to stop multiple
> +	 * writers from making a mess of each other''s incomplete
> +	 * messages; we''re just trying to guarantee our own internal
> +	 * consistency and make sure that single writes are handled
> +	 * atomically.
> +	 */
> +	mutex_lock(&u->msgbuffer_mutex);
> +
> +	/* Get this out of the way early to avoid confusion */
> +	if (len == 0)
> +		goto out;
> +
> +	/* Can''t write a xenbus message larger we can buffer */
> +	if ((len + u->len) > sizeof(u->u.buffer)) {
> +		/* On error, dump existing buffer */
> +		u->len = 0;
> +		rc = -EINVAL;
> +		goto out;
> +	}
> +
> +	ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
> +
> +	if (ret != 0) {
> +		rc = -EFAULT;
> +		goto out;
> +	}
> +
> +	/* Deal with a partial copy. */
> +	len -= ret;
> +	rc = len;
> +
> +	u->len += len;
> +
> +	/* Return if we haven''t got a full message yet */
> +	if (u->len < sizeof(u->u.msg))
> +		goto out;	/* not even the header yet */
> +
> +	/* If we''re expecting a message that''s larger than we
can
> +	   possibly send, dump what we have and return an error. */
> +	if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer))
{
> +		rc = -E2BIG;
> +		u->len = 0;
> +		goto out;
> +	}
> +
> +	if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
> +		goto out;	/* incomplete data portion */
> +
> +	/*
> +	 * OK, now we have a complete message.  Do something with it.
> +	 */
> +
> +	msg_type = u->u.msg.type;
> +
> +	switch (msg_type) {
> +	case XS_WATCH:
> +	case XS_UNWATCH:
> +		/* (Un)Ask for some path to be watched for changes */
> +		ret = xenbus_write_watch(msg_type, u);
> +		break;
> +
> +	default:
> +		/* Send out a transaction */
> +		ret = xenbus_write_transaction(msg_type, u);
> +		break;
> +	}
> +	if (ret != 0)
> +		rc = ret;
> +
> +	/* Buffered message consumed */
> +	u->len = 0;
> +
> + out:
> +	mutex_unlock(&u->msgbuffer_mutex);
> +	return rc;
> +}
> +
> +static int xenbus_file_open(struct inode *inode, struct file *filp)
> +{
> +	struct xenbus_file_priv *u;
> +
> +	if (xen_store_evtchn == 0)
> +		return -ENOENT;
> +
> +	nonseekable_open(inode, filp);
> +
> +	u = kzalloc(sizeof(*u), GFP_KERNEL);
> +	if (u == NULL)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&u->transactions);
> +	INIT_LIST_HEAD(&u->watches);
> +	INIT_LIST_HEAD(&u->read_buffers);
> +	init_waitqueue_head(&u->read_waitq);
> +
> +	mutex_init(&u->reply_mutex);
> +	mutex_init(&u->msgbuffer_mutex);
> +
> +	filp->private_data = u;
> +
> +	return 0;
> +}
> +
> +static int xenbus_file_release(struct inode *inode, struct file *filp)
> +{
> +	struct xenbus_file_priv *u = filp->private_data;
> +	struct xenbus_transaction_holder *trans, *tmp;
> +	struct watch_adapter *watch, *tmp_watch;
> +	struct read_buffer *rb, *tmp_rb;
> +
> +	/*
> +	 * No need for locking here because there are no other users,
> +	 * by definition.
> +	 */
> +
> +	list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
> +		xenbus_transaction_end(trans->handle, 1);
> +		list_del(&trans->list);
> +		kfree(trans);
> +	}
> +
> +	list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
> +		unregister_xenbus_watch(&watch->watch);
> +		list_del(&watch->list);
> +		free_watch_adapter(watch);
> +	}
> +
> +	list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
> +		list_del(&rb->list);
> +		kfree(rb);
> +	}
> +	kfree(u);
> +
> +	return 0;
> +}
> +
> +static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
> +{
> +	struct xenbus_file_priv *u = file->private_data;
> +
> +	poll_wait(file, &u->read_waitq, wait);
> +	if (!list_empty(&u->read_buffers))
> +		return POLLIN | POLLRDNORM;
> +	return 0;
> +}
> +
> +const struct file_operations xen_xenbus_fops = {
> +	.read = xenbus_file_read,
> +	.write = xenbus_file_write,
> +	.open = xenbus_file_open,
> +	.release = xenbus_file_release,
> +	.poll = xenbus_file_poll,
> +	.llseek = no_llseek,
> +};
> +EXPORT_SYMBOL_GPL(xen_xenbus_fops);
> +
> +static struct miscdevice xenbus_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "xen/xenbus",
> +	.fops = &xen_xenbus_fops,
> +};
> +
> +static int __init xenbus_init(void)
> +{
> +	int err;
> +
> +	if (!xen_domain())
> +		return -ENODEV;
> +
> +	err = misc_register(&xenbus_dev);
> +	if (err)
> +		printk(KERN_ERR "Could not register xenbus device\n");
> +	return err;
> +}
> +
> +static void __exit xenbus_exit(void)
> +{
> +	misc_deregister(&xenbus_dev);
> +}
> +
> +module_init(xenbus_init);
> +module_exit(xenbus_exit);
> diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
> index 5d45ff1..b019865 100644
> --- a/drivers/xen/xenfs/Makefile
> +++ b/drivers/xen/xenfs/Makefile
> @@ -1,4 +1,4 @@
>  obj-$(CONFIG_XENFS) += xenfs.o
>  
> -xenfs-y			  = super.o xenbus.o
> +xenfs-y			  = super.o
>  xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
> diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
> index a55fbf9..a84b53c 100644
> --- a/drivers/xen/xenfs/super.c
> +++ b/drivers/xen/xenfs/super.c
> @@ -17,6 +17,7 @@
>  
>  #include "xenfs.h"
>  #include "../privcmd.h"
> +#include "../xenbus/xenbus_comms.h"
>  
>  #include <asm/xen/hypervisor.h>
>  
> @@ -83,7 +84,7 @@ static int xenfs_fill_super(struct super_block *sb, void
*data, int silent)
>  {
>  	static struct tree_descr xenfs_files[] = {
>  		[1] = {},
> -		{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
> +		{ "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },
>  		{ "capabilities", &capabilities_file_ops, S_IRUGO },
>  		{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
>  		{""},
> diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
> deleted file mode 100644
> index bbd000f..0000000
> --- a/drivers/xen/xenfs/xenbus.c
> +++ /dev/null
> @@ -1,593 +0,0 @@
> -/*
> - * Driver giving user-space access to the kernel''s xenbus
connection
> - * to xenstore.
> - *
> - * Copyright (c) 2005, Christian Limpach
> - * Copyright (c) 2005, Rusty Russell, IBM Corporation
> - *
> - * This program is free software; you can redistribute it and/or
> - * modify it under the terms of the GNU General Public License version 2
> - * as published by the Free Software Foundation; or, when distributed
> - * separately from the Linux kernel or incorporated into other
> - * software packages, subject to the following license:
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
copy
> - * of this source file (the "Software"), to deal in the Software
without
> - * restriction, including without limitation the rights to use, copy,
modify,
> - * merge, publish, distribute, sublicense, and/or sell copies of the
Software,
> - * and to permit persons to whom the Software is furnished to do so,
subject to
> - * the following conditions:
> - *
> - * The above copyright notice and this permission notice shall be included
in
> - * all copies or substantial portions of the Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE
> - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS
> - * IN THE SOFTWARE.
> - *
> - * Changes:
> - * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs
filesystem
> - *                              and /proc/xen compatibility mount point.
> - *                              Turned xenfs into a loadable module.
> - */
> -
> -#include <linux/kernel.h>
> -#include <linux/errno.h>
> -#include <linux/uio.h>
> -#include <linux/notifier.h>
> -#include <linux/wait.h>
> -#include <linux/fs.h>
> -#include <linux/poll.h>
> -#include <linux/mutex.h>
> -#include <linux/sched.h>
> -#include <linux/spinlock.h>
> -#include <linux/mount.h>
> -#include <linux/pagemap.h>
> -#include <linux/uaccess.h>
> -#include <linux/init.h>
> -#include <linux/namei.h>
> -#include <linux/string.h>
> -#include <linux/slab.h>
> -
> -#include "xenfs.h"
> -#include "../xenbus/xenbus_comms.h"
> -
> -#include <xen/xenbus.h>
> -#include <asm/xen/hypervisor.h>
> -
> -/*
> - * An element of a list of outstanding transactions, for which we're
> - * still waiting a reply.
> - */
> -struct xenbus_transaction_holder {
> -	struct list_head list;
> -	struct xenbus_transaction handle;
> -};
> -
> -/*
> - * A buffer of data on the queue.
> - */
> -struct read_buffer {
> -	struct list_head list;
> -	unsigned int cons;
> -	unsigned int len;
> -	char msg[];
> -};
> -
> -struct xenbus_file_priv {
> -	/*
> -	 * msgbuffer_mutex is held while partial requests are built up
> -	 * and complete requests are acted on.  It therefore protects
> -	 * the "transactions" and "watches" lists, and the partial
> -	 * request length and buffer.
> -	 *
> -	 * reply_mutex protects the reply being built up to return to
> -	 * usermode.  It nests inside msgbuffer_mutex but may be held
> -	 * alone during a watch callback.
> -	 */
> -	struct mutex msgbuffer_mutex;
> -
> -	/* In-progress transactions */
> -	struct list_head transactions;
> -
> -	/* Active watches. */
> -	struct list_head watches;
> -
> -	/* Partial request. */
> -	unsigned int len;
> -	union {
> -		struct xsd_sockmsg msg;
> -		char buffer[PAGE_SIZE];
> -	} u;
> -
> -	/* Response queue. */
> -	struct mutex reply_mutex;
> -	struct list_head read_buffers;
> -	wait_queue_head_t read_waitq;
> -
> -};
> -
> -/* Read out any raw xenbus messages queued up. */
> -static ssize_t xenbus_file_read(struct file *filp,
> -			       char __user *ubuf,
> -			       size_t len, loff_t *ppos)
> -{
> -	struct xenbus_file_priv *u = filp->private_data;
> -	struct read_buffer *rb;
> -	unsigned i;
> -	int ret;
> -
> -	mutex_lock(&u->reply_mutex);
> -again:
> -	while (list_empty(&u->read_buffers)) {
> -		mutex_unlock(&u->reply_mutex);
> -		if (filp->f_flags & O_NONBLOCK)
> -			return -EAGAIN;
> -
> -		ret = wait_event_interruptible(u->read_waitq,
> -					       !list_empty(&u->read_buffers));
> -		if (ret)
> -			return ret;
> -		mutex_lock(&u->reply_mutex);
> -	}
> -
> -	rb = list_entry(u->read_buffers.next, struct read_buffer, list);
> -	i = 0;
> -	while (i < len) {
> -		unsigned sz = min((unsigned)len - i, rb->len - rb->cons);
> -
> -		ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
> -
> -		i += sz - ret;
> -		rb->cons += sz - ret;
> -
> -		if (ret != 0) {
> -			if (i == 0)
> -				i = -EFAULT;
> -			goto out;
> -		}
> -
> -		/* Clear out buffer if it has been consumed */
> -		if (rb->cons == rb->len) {
> -			list_del(&rb->list);
> -			kfree(rb);
> -			if (list_empty(&u->read_buffers))
> -				break;
> -			rb = list_entry(u->read_buffers.next,
> -					struct read_buffer, list);
> -		}
> -	}
> -	if (i == 0)
> -		goto again;
> -
> -out:
> -	mutex_unlock(&u->reply_mutex);
> -	return i;
> -}
> -
> -/*
> - * Add a buffer to the queue.  Caller must hold the appropriate lock
> - * if the queue is not local.  (Commonly the caller will build up
> - * multiple queued buffers on a temporary local list, and then add it
> - * to the appropriate list under lock once all the buffers have een
> - * successfully allocated.)
> - */
> -static int queue_reply(struct list_head *queue, const void *data, size_t
len)
> -{
> -	struct read_buffer *rb;
> -
> -	if (len == 0)
> -		return 0;
> -
> -	rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
> -	if (rb == NULL)
> -		return -ENOMEM;
> -
> -	rb->cons = 0;
> -	rb->len = len;
> -
> -	memcpy(rb->msg, data, len);
> -
> -	list_add_tail(&rb->list, queue);
> -	return 0;
> -}
> -
> -/*
> - * Free all the read_buffer s on a list.
> - * Caller must have sole reference to list.
> - */
> -static void queue_cleanup(struct list_head *list)
> -{
> -	struct read_buffer *rb;
> -
> -	while (!list_empty(list)) {
> -		rb = list_entry(list->next, struct read_buffer, list);
> -		list_del(list->next);
> -		kfree(rb);
> -	}
> -}
> -
> -struct watch_adapter {
> -	struct list_head list;
> -	struct xenbus_watch watch;
> -	struct xenbus_file_priv *dev_data;
> -	char *token;
> -};
> -
> -static void free_watch_adapter(struct watch_adapter *watch)
> -{
> -	kfree(watch->watch.node);
> -	kfree(watch->token);
> -	kfree(watch);
> -}
> -
> -static struct watch_adapter *alloc_watch_adapter(const char *path,
> -						 const char *token)
> -{
> -	struct watch_adapter *watch;
> -
> -	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
> -	if (watch == NULL)
> -		goto out_fail;
> -
> -	watch->watch.node = kstrdup(path, GFP_KERNEL);
> -	if (watch->watch.node == NULL)
> -		goto out_free;
> -
> -	watch->token = kstrdup(token, GFP_KERNEL);
> -	if (watch->token == NULL)
> -		goto out_free;
> -
> -	return watch;
> -
> -out_free:
> -	free_watch_adapter(watch);
> -
> -out_fail:
> -	return NULL;
> -}
> -
> -static void watch_fired(struct xenbus_watch *watch,
> -			const char **vec,
> -			unsigned int len)
> -{
> -	struct watch_adapter *adap;
> -	struct xsd_sockmsg hdr;
> -	const char *path, *token;
> -	int path_len, tok_len, body_len, data_len = 0;
> -	int ret;
> -	LIST_HEAD(staging_q);
> -
> -	adap = container_of(watch, struct watch_adapter, watch);
> -
> -	path = vec[XS_WATCH_PATH];
> -	token = adap->token;
> -
> -	path_len = strlen(path) + 1;
> -	tok_len = strlen(token) + 1;
> -	if (len > 2)
> -		data_len = vec[len] - vec[2] + 1;
> -	body_len = path_len + tok_len + data_len;
> -
> -	hdr.type = XS_WATCH_EVENT;
> -	hdr.len = body_len;
> -
> -	mutex_lock(&adap->dev_data->reply_mutex);
> -
> -	ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
> -	if (!ret)
> -		ret = queue_reply(&staging_q, path, path_len);
> -	if (!ret)
> -		ret = queue_reply(&staging_q, token, tok_len);
> -	if (!ret && len > 2)
> -		ret = queue_reply(&staging_q, vec[2], data_len);
> -
> -	if (!ret) {
> -		/* success: pass reply list onto watcher */
> -		list_splice_tail(&staging_q,
&adap->dev_data->read_buffers);
> -		wake_up(&adap->dev_data->read_waitq);
> -	} else
> -		queue_cleanup(&staging_q);
> -
> -	mutex_unlock(&adap->dev_data->reply_mutex);
> -}
> -
> -static int xenbus_write_transaction(unsigned msg_type,
> -				    struct xenbus_file_priv *u)
> -{
> -	int rc;
> -	void *reply;
> -	struct xenbus_transaction_holder *trans = NULL;
> -	LIST_HEAD(staging_q);
> -
> -	if (msg_type == XS_TRANSACTION_START) {
> -		trans = kmalloc(sizeof(*trans), GFP_KERNEL);
> -		if (!trans) {
> -			rc = -ENOMEM;
> -			goto out;
> -		}
> -	}
> -
> -	reply = xenbus_dev_request_and_reply(&u->u.msg);
> -	if (IS_ERR(reply)) {
> -		kfree(trans);
> -		rc = PTR_ERR(reply);
> -		goto out;
> -	}
> -
> -	if (msg_type == XS_TRANSACTION_START) {
> -		trans->handle.id = simple_strtoul(reply, NULL, 0);
> -
> -		list_add(&trans->list, &u->transactions);
> -	} else if (msg_type == XS_TRANSACTION_END) {
> -		list_for_each_entry(trans, &u->transactions, list)
> -			if (trans->handle.id == u->u.msg.tx_id)
> -				break;
> -		BUG_ON(&trans->list == &u->transactions);
> -		list_del(&trans->list);
> -
> -		kfree(trans);
> -	}
> -
> -	mutex_lock(&u->reply_mutex);
> -	rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
> -	if (!rc)
> -		rc = queue_reply(&staging_q, reply, u->u.msg.len);
> -	if (!rc) {
> -		list_splice_tail(&staging_q, &u->read_buffers);
> -		wake_up(&u->read_waitq);
> -	} else {
> -		queue_cleanup(&staging_q);
> -	}
> -	mutex_unlock(&u->reply_mutex);
> -
> -	kfree(reply);
> -
> -out:
> -	return rc;
> -}
> -
> -static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv
*u)
> -{
> -	struct watch_adapter *watch, *tmp_watch;
> -	char *path, *token;
> -	int err, rc;
> -	LIST_HEAD(staging_q);
> -
> -	path = u->u.buffer + sizeof(u->u.msg);
> -	token = memchr(path, 0, u->u.msg.len);
> -	if (token == NULL) {
> -		rc = -EILSEQ;
> -		goto out;
> -	}
> -	token++;
> -
> -	if (msg_type == XS_WATCH) {
> -		watch = alloc_watch_adapter(path, token);
> -		if (watch == NULL) {
> -			rc = -ENOMEM;
> -			goto out;
> -		}
> -
> -		watch->watch.callback = watch_fired;
> -		watch->dev_data = u;
> -
> -		err = register_xenbus_watch(&watch->watch);
> -		if (err) {
> -			free_watch_adapter(watch);
> -			rc = err;
> -			goto out;
> -		}
> -		list_add(&watch->list, &u->watches);
> -	} else {
> -		list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
> -			if (!strcmp(watch->token, token) &&
> -			    !strcmp(watch->watch.node, path)) {
> -				unregister_xenbus_watch(&watch->watch);
> -				list_del(&watch->list);
> -				free_watch_adapter(watch);
> -				break;
> -			}
> -		}
> -	}
> -
> -	/* Success.  Synthesize a reply to say all is OK. */
> -	{
> -		struct {
> -			struct xsd_sockmsg hdr;
> -			char body[3];
> -		} __packed reply = {
> -			{
> -				.type = msg_type,
> -				.len = sizeof(reply.body)
> -			},
> -			"OK"
> -		};
> -
> -		mutex_lock(&u->reply_mutex);
> -		rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
> -		wake_up(&u->read_waitq);
> -		mutex_unlock(&u->reply_mutex);
> -	}
> -
> -out:
> -	return rc;
> -}
> -
> -static ssize_t xenbus_file_write(struct file *filp,
> -				const char __user *ubuf,
> -				size_t len, loff_t *ppos)
> -{
> -	struct xenbus_file_priv *u = filp->private_data;
> -	uint32_t msg_type;
> -	int rc = len;
> -	int ret;
> -	LIST_HEAD(staging_q);
> -
> -	/*
> -	 * We're expecting usermode to be writing properly formed
> -	 * xenbus messages.  If they write an incomplete message we
> -	 * buffer it up.  Once it is complete, we act on it.
> -	 */
> -
> -	/*
> -	 * Make sure concurrent writers can't stomp all over each
> -	 * other's messages and make a mess of our partial message
> -	 * buffer.  We don't make any attemppt to stop multiple
> -	 * writers from making a mess of each other's incomplete
> -	 * messages; we're just trying to guarantee our own internal
> -	 * consistency and make sure that single writes are handled
> -	 * atomically.
> -	 */
> -	mutex_lock(&u->msgbuffer_mutex);
> -
> -	/* Get this out of the way early to avoid confusion */
> -	if (len == 0)
> -		goto out;
> -
> -	/* Can't write a xenbus message larger we can buffer */
> -	if ((len + u->len) > sizeof(u->u.buffer)) {
> -		/* On error, dump existing buffer */
> -		u->len = 0;
> -		rc = -EINVAL;
> -		goto out;
> -	}
> -
> -	ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
> -
> -	if (ret != 0) {
> -		rc = -EFAULT;
> -		goto out;
> -	}
> -
> -	/* Deal with a partial copy. */
> -	len -= ret;
> -	rc = len;
> -
> -	u->len += len;
> -
> -	/* Return if we haven't got a full message yet */
> -	if (u->len < sizeof(u->u.msg))
> -		goto out;	/* not even the header yet */
> -
> -	/* If we're expecting a message that's larger than we can
> -	   possibly send, dump what we have and return an error. */
> -	if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer))
{
> -		rc = -E2BIG;
> -		u->len = 0;
> -		goto out;
> -	}
> -
> -	if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
> -		goto out;	/* incomplete data portion */
> -
> -	/*
> -	 * OK, now we have a complete message.  Do something with it.
> -	 */
> -
> -	msg_type = u->u.msg.type;
> -
> -	switch (msg_type) {
> -	case XS_WATCH:
> -	case XS_UNWATCH:
> -		/* (Un)Ask for some path to be watched for changes */
> -		ret = xenbus_write_watch(msg_type, u);
> -		break;
> -
> -	default:
> -		/* Send out a transaction */
> -		ret = xenbus_write_transaction(msg_type, u);
> -		break;
> -	}
> -	if (ret != 0)
> -		rc = ret;
> -
> -	/* Buffered message consumed */
> -	u->len = 0;
> -
> - out:
> -	mutex_unlock(&u->msgbuffer_mutex);
> -	return rc;
> -}
> -
> -static int xenbus_file_open(struct inode *inode, struct file *filp)
> -{
> -	struct xenbus_file_priv *u;
> -
> -	if (xen_store_evtchn == 0)
> -		return -ENOENT;
> -
> -	nonseekable_open(inode, filp);
> -
> -	u = kzalloc(sizeof(*u), GFP_KERNEL);
> -	if (u == NULL)
> -		return -ENOMEM;
> -
> -	INIT_LIST_HEAD(&u->transactions);
> -	INIT_LIST_HEAD(&u->watches);
> -	INIT_LIST_HEAD(&u->read_buffers);
> -	init_waitqueue_head(&u->read_waitq);
> -
> -	mutex_init(&u->reply_mutex);
> -	mutex_init(&u->msgbuffer_mutex);
> -
> -	filp->private_data = u;
> -
> -	return 0;
> -}
> -
> -static int xenbus_file_release(struct inode *inode, struct file *filp)
> -{
> -	struct xenbus_file_priv *u = filp->private_data;
> -	struct xenbus_transaction_holder *trans, *tmp;
> -	struct watch_adapter *watch, *tmp_watch;
> -	struct read_buffer *rb, *tmp_rb;
> -
> -	/*
> -	 * No need for locking here because there are no other users,
> -	 * by definition.
> -	 */
> -
> -	list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
> -		xenbus_transaction_end(trans->handle, 1);
> -		list_del(&trans->list);
> -		kfree(trans);
> -	}
> -
> -	list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
> -		unregister_xenbus_watch(&watch->watch);
> -		list_del(&watch->list);
> -		free_watch_adapter(watch);
> -	}
> -
> -	list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
> -		list_del(&rb->list);
> -		kfree(rb);
> -	}
> -	kfree(u);
> -
> -	return 0;
> -}
> -
> -static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
> -{
> -	struct xenbus_file_priv *u = file->private_data;
> -
> -	poll_wait(file, &u->read_waitq, wait);
> -	if (!list_empty(&u->read_buffers))
> -		return POLLIN | POLLRDNORM;
> -	return 0;
> -}
> -
> -const struct file_operations xenbus_file_ops = {
> -	.read = xenbus_file_read,
> -	.write = xenbus_file_write,
> -	.open = xenbus_file_open,
> -	.release = xenbus_file_release,
> -	.poll = xenbus_file_poll,
> -	.llseek = no_llseek,
> -};
> diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
> index 5056306..6b80c77 100644
> --- a/drivers/xen/xenfs/xenfs.h
> +++ b/drivers/xen/xenfs/xenfs.h
> @@ -1,7 +1,6 @@
>  #ifndef _XENFS_XENBUS_H
>  #define _XENFS_XENBUS_H
>  
> -extern const struct file_operations xenbus_file_ops;
>  extern const struct file_operations xsd_kva_file_ops;
>  extern const struct file_operations xsd_port_file_ops;
>  
> -- 
> 1.7.7.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

Konrad Rzeszutek Wilk

2011-Nov-28 18:14 UTC

head link

Re: [PATCH 1/5] xen/sys/hypervisor: Export guest_properties/is_initial_domain

On Sun, Nov 27, 2011 at 11:07:04PM +0100, Bastian Blank wrote:
> Signed-off-by: Bastian Blank <waldi@debian.org>
> ---
>  drivers/xen/sys-hypervisor.c |   35 +++++++++++++++++++++++++++++++++++
You also need a patch to the Documentation ABI (sysfs something).

>  1 files changed, 35 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
> index 1e0fe01..d0916e8 100644
> --- a/drivers/xen/sys-hypervisor.c
> +++ b/drivers/xen/sys-hypervisor.c
> @@ -355,6 +355,35 @@ static void xen_properties_destroy(void)
>  	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
>  }
>  
> +/* xen guest properties info */
Properties is plural, but this is a single attribute.

The 'guest_properties' does not tell _what_ type of property this
is? Nor its purpose. Perhaps the name 'is_initial_domain' would be a
better name? What is the purpose of this attribute? Who/what tools
benefit from this? Is there a corresponding patch in the Xen tool stack
to utilize this?

Thanks!
> +
> +static ssize_t is_initial_domain_show(struct hyp_sysfs_attr *attr, char
*buffer)
> +{
> +	return sprintf(buffer, "%d\n", xen_initial_domain());
> +}
> +
> +HYPERVISOR_ATTR_RO(is_initial_domain);
> +
> +static struct attribute *xen_guest_properties_attrs[] = {
> +	&is_initial_domain_attr.attr,
> +	NULL
> +};
> +
> +static struct attribute_group xen_guest_properties_group = {
> +	.name = "guest_properties",
> +	.attrs = xen_guest_properties_attrs,
> +};
> +
> +static int __init xen_guest_properties_init(void)
> +{
> +	return sysfs_create_group(hypervisor_kobj,
&xen_guest_properties_group);
> +}
> +
> +static void xen_guest_properties_destroy(void)
> +{
> +	sysfs_remove_group(hypervisor_kobj, &xen_guest_properties_group);
> +}
> +
>  static int __init hyper_sysfs_init(void)
>  {
>  	int ret;
> @@ -377,9 +406,14 @@ static int __init hyper_sysfs_init(void)
>  	ret = xen_properties_init();
>  	if (ret)
>  		goto prop_out;
> +	ret = xen_guest_properties_init();
> +	if (ret)
> +		goto gprop_out;
>  
>  	goto out;
>  
> +gprop_out:
> +	xen_properties_destroy();
>  prop_out:
>  	xen_sysfs_uuid_destroy();
>  uuid_out:
> @@ -394,6 +428,7 @@ out:
>  
>  static void __exit hyper_sysfs_exit(void)
>  {
> +	xen_guest_properties_destroy();
>  	xen_properties_destroy();
>  	xen_compilation_destroy();
>  	xen_sysfs_uuid_destroy();
> -- 
> 1.7.7.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

Konrad Rzeszutek Wilk

2011-Nov-28 18:22 UTC

head link

Re: [PATCH 2/5] xen: Add privcmd device driver

On Sun, Nov 27, 2011 at 11:07:05PM +0100, Bastian Blank wrote:
> Access to arbitrary hypercalls is currently provided via xenfs. This
> adds a standard character device to handle this. The support in xenfs
Ok, what is the benefit of that? You mentioned in the prologue "about a
year ago I started", but you didn't provide any links to the
conversation. Could you include the details please?
> remains for backward compatibility and uses the device driver code.
> 
> Signed-off-by: Bastian Blank <waldi@debian.org>
> ---
>  drivers/xen/Kconfig         |    7 +
>  drivers/xen/Makefile        |    2 +
>  drivers/xen/privcmd.c       |  437
+++++++++++++++++++++++++++++++++++++++++++
>  drivers/xen/privcmd.h       |    3 +
>  drivers/xen/xenfs/Makefile  |    2 +-
>  drivers/xen/xenfs/privcmd.c |  400 ---------------------------------------
It looks like you are doing a move of the file. Can you use 'git mv'
instead please.

If it breaks compile build, you can modify the Kconfig to inhibit the
build (say make it dependent on a symbol that won't be turned on).

And then in a later patch, reenable the Kconfig. The idea here is to
keep 'git bisection working properly'.

Another way to make this work is to provide some scaffolding code in the
existing code so that it can build in another directory. And then
in another patch remove it.
>  drivers/xen/xenfs/super.c   |    3 +-
>  drivers/xen/xenfs/xenfs.h   |    1 -
>  8 files changed, 452 insertions(+), 403 deletions(-)
>  create mode 100644 drivers/xen/privcmd.c
>  create mode 100644 drivers/xen/privcmd.h
>  delete mode 100644 drivers/xen/xenfs/privcmd.c
> 
> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> index 5f7ff8e..eb7574c 100644
> --- a/drivers/xen/Kconfig
> +++ b/drivers/xen/Kconfig
> @@ -86,6 +86,7 @@ config XEN_BACKEND
>  
>  config XENFS
>  	tristate "Xen filesystem"
> +	select XEN_PRIVCMD
>  	default y
>  	help
>  	  The xen filesystem provides a way for domains to share
> @@ -181,4 +182,10 @@ config XEN_PCIDEV_BACKEND
>  	  xen-pciback.hide=(03:00.0)(04:00.0)
>  
>  	  If in doubt, say m.
> +
> +config XEN_PRIVCMD
> +	tristate
> +	depends on XEN_DOM0
Would it be possible for HVM domains that have the backend drivers in
them (so blkback for example) to use these hypercalls? If so should this
XEN_DOM0 be perhaps changed to something else?
> +	default m
> +
>  endmenu
> diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
> index 72bbb27..c35f65d 100644
> --- a/drivers/xen/Makefile
> +++ b/drivers/xen/Makefile
> @@ -19,9 +19,11 @@ obj-$(CONFIG_XEN_TMEM)			+= tmem.o
>  obj-$(CONFIG_SWIOTLB_XEN)		+= swiotlb-xen.o
>  obj-$(CONFIG_XEN_DOM0)			+= pci.o
>  obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/
> +obj-$(CONFIG_XEN_PRIVCMD)		+= xen-privcmd.o
>  
>  xen-evtchn-y				:= evtchn.o
>  xen-gntdev-y				:= gntdev.o
>  xen-gntalloc-y				:= gntalloc.o
> +xen-privcmd-y				:= privcmd.o
>  
>  xen-platform-pci-y			:= platform-pci.o
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> new file mode 100644
> index 0000000..863fbd0
> --- /dev/null
> +++ b/drivers/xen/privcmd.c
> @@ -0,0 +1,437 @@
>
+/******************************************************************************
> + * privcmd.c
> + *
> + * Interface to privileged domain-0 commands.
> + *
> + * Copyright (c) 2002-2004, K A Fraser, B Dragovic
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +#include <linux/errno.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/uaccess.h>
> +#include <linux/swap.h>
> +#include <linux/highmem.h>
> +#include <linux/pagemap.h>
> +#include <linux/seq_file.h>
> +#include <linux/miscdevice.h>
> +
> +#include <asm/pgalloc.h>
> +#include <asm/pgtable.h>
> +#include <asm/tlb.h>
> +#include <asm/xen/hypervisor.h>
> +#include <asm/xen/hypercall.h>
> +
> +#include <xen/xen.h>
> +#include <xen/privcmd.h>
> +#include <xen/interface/xen.h>
> +#include <xen/features.h>
> +#include <xen/page.h>
> +#include <xen/xen-ops.h>
> +
> +#include "privcmd.h"
> +
> +MODULE_LICENSE("GPL");
> +
> +#ifndef HAVE_ARCH_PRIVCMD_MMAP
> +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
> +#endif
> +
> +static long privcmd_ioctl_hypercall(void __user *udata)
> +{
> +	struct privcmd_hypercall hypercall;
> +	long ret;
> +
> +	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
> +		return -EFAULT;
> +
> +	ret = privcmd_call(hypercall.op,
> +			   hypercall.arg[0], hypercall.arg[1],
> +			   hypercall.arg[2], hypercall.arg[3],
> +			   hypercall.arg[4]);
> +
> +	return ret;
> +}
> +
> +static void free_page_list(struct list_head *pages)
> +{
> +	struct page *p, *n;
> +
> +	list_for_each_entry_safe(p, n, pages, lru)
> +		__free_page(p);
> +
> +	INIT_LIST_HEAD(pages);
> +}
> +
> +/*
> + * Given an array of items in userspace, return a list of pages
> + * containing the data.  If copying fails, either because of memory
> + * allocation failure or a problem reading user memory, return an
> + * error code; its up to the caller to dispose of any partial list.
> + */
> +static int gather_array(struct list_head *pagelist,
> +			unsigned nelem, size_t size,
> +			void __user *data)
> +{
> +	unsigned pageidx;
> +	void *pagedata;
> +	int ret;
> +
> +	if (size > PAGE_SIZE)
> +		return 0;
> +
> +	pageidx = PAGE_SIZE;
> +	pagedata = NULL;	/* quiet, gcc */
> +	while (nelem--) {
> +		if (pageidx > PAGE_SIZE-size) {
> +			struct page *page = alloc_page(GFP_KERNEL);
> +
> +			ret = -ENOMEM;
> +			if (page == NULL)
> +				goto fail;
> +
> +			pagedata = page_address(page);
> +
> +			list_add_tail(&page->lru, pagelist);
> +			pageidx = 0;
> +		}
> +
> +		ret = -EFAULT;
> +		if (copy_from_user(pagedata + pageidx, data, size))
> +			goto fail;
> +
> +		data += size;
> +		pageidx += size;
> +	}
> +
> +	ret = 0;
> +
> +fail:
> +	return ret;
> +}
> +
> +/*
> + * Call function "fn" on each element of the array fragmented
> + * over a list of pages.
> + */
> +static int traverse_pages(unsigned nelem, size_t size,
> +			  struct list_head *pos,
> +			  int (*fn)(void *data, void *state),
> +			  void *state)
> +{
> +	void *pagedata;
> +	unsigned pageidx;
> +	int ret = 0;
> +
> +	BUG_ON(size > PAGE_SIZE);
> +
> +	pageidx = PAGE_SIZE;
> +	pagedata = NULL;	/* hush, gcc */
> +
> +	while (nelem--) {
> +		if (pageidx > PAGE_SIZE-size) {
> +			struct page *page;
> +			pos = pos->next;
> +			page = list_entry(pos, struct page, lru);
> +			pagedata = page_address(page);
> +			pageidx = 0;
> +		}
> +
> +		ret = (*fn)(pagedata + pageidx, state);
> +		if (ret)
> +			break;
> +		pageidx += size;
> +	}
> +
> +	return ret;
> +}
> +
> +struct mmap_mfn_state {
> +	unsigned long va;
> +	struct vm_area_struct *vma;
> +	domid_t domain;
> +};
> +
> +static int mmap_mfn_range(void *data, void *state)
> +{
> +	struct privcmd_mmap_entry *msg = data;
> +	struct mmap_mfn_state *st = state;
> +	struct vm_area_struct *vma = st->vma;
> +	int rc;
> +
> +	/* Do not allow range to wrap the address space. */
> +	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
> +	    ((unsigned long)(msg->npages << PAGE_SHIFT) >=
-st->va))
> +		return -EINVAL;
> +
> +	/* Range chunks must be contiguous in va space. */
> +	if ((msg->va != st->va) ||
> +	    ((msg->va+(msg->npages<<PAGE_SHIFT)) >
vma->vm_end))
> +		return -EINVAL;
> +
> +	rc = xen_remap_domain_mfn_range(vma,
> +					msg->va & PAGE_MASK,
> +					msg->mfn, msg->npages,
> +					vma->vm_page_prot,
> +					st->domain);
> +	if (rc < 0)
> +		return rc;
> +
> +	st->va += msg->npages << PAGE_SHIFT;
> +
> +	return 0;
> +}
> +
> +static long privcmd_ioctl_mmap(void __user *udata)
> +{
> +	struct privcmd_mmap mmapcmd;
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma;
> +	int rc;
> +	LIST_HEAD(pagelist);
> +	struct mmap_mfn_state state;
> +
> +	if (!xen_initial_domain())
> +		return -EPERM;
> +
> +	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
> +		return -EFAULT;
> +
> +	rc = gather_array(&pagelist,
> +			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
> +			  mmapcmd.entry);
> +
> +	if (rc || list_empty(&pagelist))
> +		goto out;
> +
> +	down_write(&mm->mmap_sem);
> +
> +	{
> +		struct page *page = list_first_entry(&pagelist,
> +						     struct page, lru);
> +		struct privcmd_mmap_entry *msg = page_address(page);
> +
> +		vma = find_vma(mm, msg->va);
> +		rc = -EINVAL;
> +
> +		if (!vma || (msg->va != vma->vm_start) ||
> +		    !privcmd_enforce_singleshot_mapping(vma))
> +			goto out_up;
> +	}
> +
> +	state.va = vma->vm_start;
> +	state.vma = vma;
> +	state.domain = mmapcmd.dom;
> +
> +	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
> +			    &pagelist,
> +			    mmap_mfn_range, &state);
> +
> +
> +out_up:
> +	up_write(&mm->mmap_sem);
> +
> +out:
> +	free_page_list(&pagelist);
> +
> +	return rc;
> +}
> +
> +struct mmap_batch_state {
> +	domid_t domain;
> +	unsigned long va;
> +	struct vm_area_struct *vma;
> +	int err;
> +
> +	xen_pfn_t __user *user;
> +};
> +
> +static int mmap_batch_fn(void *data, void *state)
> +{
> +	xen_pfn_t *mfnp = data;
> +	struct mmap_batch_state *st = state;
> +
> +	if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK,
*mfnp, 1,
> +				       st->vma->vm_page_prot, st->domain) < 0) {
> +		*mfnp |= 0xf0000000U;
> +		st->err++;
> +	}
> +	st->va += PAGE_SIZE;
> +
> +	return 0;
> +}
> +
> +static int mmap_return_errors(void *data, void *state)
> +{
> +	xen_pfn_t *mfnp = data;
> +	struct mmap_batch_state *st = state;
> +
> +	return put_user(*mfnp, st->user++);
> +}
> +
> +static struct vm_operations_struct privcmd_vm_ops;
> +
> +static long privcmd_ioctl_mmap_batch(void __user *udata)
> +{
> +	int ret;
> +	struct privcmd_mmapbatch m;
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma;
> +	unsigned long nr_pages;
> +	LIST_HEAD(pagelist);
> +	struct mmap_batch_state state;
> +
> +	if (!xen_initial_domain())
> +		return -EPERM;
> +
> +	if (copy_from_user(&m, udata, sizeof(m)))
> +		return -EFAULT;
> +
> +	nr_pages = m.num;
> +	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
> +		return -EINVAL;
> +
> +	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
> +			   m.arr);
> +
> +	if (ret || list_empty(&pagelist))
> +		goto out;
> +
> +	down_write(&mm->mmap_sem);
> +
> +	vma = find_vma(mm, m.addr);
> +	ret = -EINVAL;
> +	if (!vma ||
> +	    vma->vm_ops != &privcmd_vm_ops ||
> +	    (m.addr != vma->vm_start) ||
> +	    ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
> +	    !privcmd_enforce_singleshot_mapping(vma)) {
> +		up_write(&mm->mmap_sem);
> +		goto out;
> +	}
> +
> +	state.domain = m.dom;
> +	state.vma = vma;
> +	state.va = m.addr;
> +	state.err = 0;
> +
> +	ret = traverse_pages(m.num, sizeof(xen_pfn_t),
> +			     &pagelist, mmap_batch_fn, &state);
> +
> +	up_write(&mm->mmap_sem);
> +
> +	if (state.err > 0) {
> +		state.user = m.arr;
> +		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
> +			       &pagelist,
> +			       mmap_return_errors, &state);
> +	}
> +
> +out:
> +	free_page_list(&pagelist);
> +
> +	return ret;
> +}
> +
> +static long privcmd_ioctl(struct file *file,
> +			  unsigned int cmd, unsigned long data)
> +{
> +	int ret = -ENOSYS;
> +	void __user *udata = (void __user *) data;
> +
> +	switch (cmd) {
> +	case IOCTL_PRIVCMD_HYPERCALL:
> +		ret = privcmd_ioctl_hypercall(udata);
> +		break;
> +
> +	case IOCTL_PRIVCMD_MMAP:
> +		ret = privcmd_ioctl_mmap(udata);
> +		break;
> +
> +	case IOCTL_PRIVCMD_MMAPBATCH:
> +		ret = privcmd_ioctl_mmap_batch(udata);
> +		break;
> +
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
> +#ifndef HAVE_ARCH_PRIVCMD_MMAP
> +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx,
uv=%p\n",
> +	       vma, vma->vm_start, vma->vm_end,
> +	       vmf->pgoff, vmf->virtual_address);
> +
> +	return VM_FAULT_SIGBUS;
> +}
> +
> +static struct vm_operations_struct privcmd_vm_ops = {
> +	.fault = privcmd_fault
> +};
> +
> +static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	/* Unsupported for auto-translate guests. */
> +	if (xen_feature(XENFEAT_auto_translated_physmap))
> +		return -ENOSYS;
> +
> +	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
> +	 * how to recreate these mappings */
> +	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
> +	vma->vm_ops = &privcmd_vm_ops;
> +	vma->vm_private_data = NULL;
> +
> +	return 0;
> +}
> +
> +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
> +{
> +	return (xchg(&vma->vm_private_data, (void *)1) == NULL);
> +}
> +#endif
> +
> +const struct file_operations xen_privcmd_fops = {
> +	.owner = THIS_MODULE,
> +	.unlocked_ioctl = privcmd_ioctl,
> +	.mmap = privcmd_mmap,
> +};
> +EXPORT_SYMBOL_GPL(xen_privcmd_fops);
> +
> +static struct miscdevice privcmd_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "xen/privcmd",
> +	.fops = &xen_privcmd_fops,
> +};
> +
> +static int __init privcmd_init(void)
> +{
> +	int err;
> +
> +	if (!xen_domain())
> +		return -ENODEV;
> +
> +	err = misc_register(&privcmd_dev);
> +	if (err != 0) {
> +		printk(KERN_ERR "Could not register privcmd device\n");
> +		return err;
> +	}
> +	return 0;
> +}
> +
> +static void __exit privcmd_exit(void)
> +{
> +	misc_deregister(&privcmd_dev);
> +}
> +
> +module_init(privcmd_init);
> +module_exit(privcmd_exit);
> diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h
> new file mode 100644
> index 0000000..14facae
> --- /dev/null
> +++ b/drivers/xen/privcmd.h
> @@ -0,0 +1,3 @@
> +#include <linux/fs.h>
> +
> +extern const struct file_operations xen_privcmd_fops;
> diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
> index 4fde944..5d45ff1 100644
> --- a/drivers/xen/xenfs/Makefile
> +++ b/drivers/xen/xenfs/Makefile
> @@ -1,4 +1,4 @@
>  obj-$(CONFIG_XENFS) += xenfs.o
>  
> -xenfs-y			  = super.o xenbus.o privcmd.o
> +xenfs-y			  = super.o xenbus.o
>  xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
> diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
> deleted file mode 100644
> index dbd3b16..0000000
> --- a/drivers/xen/xenfs/privcmd.c
> +++ /dev/null
> @@ -1,400 +0,0 @@
>
-/******************************************************************************
> - * privcmd.c
> - *
> - * Interface to privileged domain-0 commands.
> - *
> - * Copyright (c) 2002-2004, K A Fraser, B Dragovic
> - */
> -
> -#include <linux/kernel.h>
> -#include <linux/sched.h>
> -#include <linux/slab.h>
> -#include <linux/string.h>
> -#include <linux/errno.h>
> -#include <linux/mm.h>
> -#include <linux/mman.h>
> -#include <linux/uaccess.h>
> -#include <linux/swap.h>
> -#include <linux/highmem.h>
> -#include <linux/pagemap.h>
> -#include <linux/seq_file.h>
> -
> -#include <asm/pgalloc.h>
> -#include <asm/pgtable.h>
> -#include <asm/tlb.h>
> -#include <asm/xen/hypervisor.h>
> -#include <asm/xen/hypercall.h>
> -
> -#include <xen/xen.h>
> -#include <xen/privcmd.h>
> -#include <xen/interface/xen.h>
> -#include <xen/features.h>
> -#include <xen/page.h>
> -#include <xen/xen-ops.h>
> -
> -#ifndef HAVE_ARCH_PRIVCMD_MMAP
> -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
> -#endif
> -
> -static long privcmd_ioctl_hypercall(void __user *udata)
> -{
> -	struct privcmd_hypercall hypercall;
> -	long ret;
> -
> -	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
> -		return -EFAULT;
> -
> -	ret = privcmd_call(hypercall.op,
> -			   hypercall.arg[0], hypercall.arg[1],
> -			   hypercall.arg[2], hypercall.arg[3],
> -			   hypercall.arg[4]);
> -
> -	return ret;
> -}
> -
> -static void free_page_list(struct list_head *pages)
> -{
> -	struct page *p, *n;
> -
> -	list_for_each_entry_safe(p, n, pages, lru)
> -		__free_page(p);
> -
> -	INIT_LIST_HEAD(pages);
> -}
> -
> -/*
> - * Given an array of items in userspace, return a list of pages
> - * containing the data.  If copying fails, either because of memory
> - * allocation failure or a problem reading user memory, return an
> - * error code; its up to the caller to dispose of any partial list.
> - */
> -static int gather_array(struct list_head *pagelist,
> -			unsigned nelem, size_t size,
> -			void __user *data)
> -{
> -	unsigned pageidx;
> -	void *pagedata;
> -	int ret;
> -
> -	if (size > PAGE_SIZE)
> -		return 0;
> -
> -	pageidx = PAGE_SIZE;
> -	pagedata = NULL;	/* quiet, gcc */
> -	while (nelem--) {
> -		if (pageidx > PAGE_SIZE-size) {
> -			struct page *page = alloc_page(GFP_KERNEL);
> -
> -			ret = -ENOMEM;
> -			if (page == NULL)
> -				goto fail;
> -
> -			pagedata = page_address(page);
> -
> -			list_add_tail(&page->lru, pagelist);
> -			pageidx = 0;
> -		}
> -
> -		ret = -EFAULT;
> -		if (copy_from_user(pagedata + pageidx, data, size))
> -			goto fail;
> -
> -		data += size;
> -		pageidx += size;
> -	}
> -
> -	ret = 0;
> -
> -fail:
> -	return ret;
> -}
> -
> -/*
> - * Call function "fn" on each element of the array fragmented
> - * over a list of pages.
> - */
> -static int traverse_pages(unsigned nelem, size_t size,
> -			  struct list_head *pos,
> -			  int (*fn)(void *data, void *state),
> -			  void *state)
> -{
> -	void *pagedata;
> -	unsigned pageidx;
> -	int ret = 0;
> -
> -	BUG_ON(size > PAGE_SIZE);
> -
> -	pageidx = PAGE_SIZE;
> -	pagedata = NULL;	/* hush, gcc */
> -
> -	while (nelem--) {
> -		if (pageidx > PAGE_SIZE-size) {
> -			struct page *page;
> -			pos = pos->next;
> -			page = list_entry(pos, struct page, lru);
> -			pagedata = page_address(page);
> -			pageidx = 0;
> -		}
> -
> -		ret = (*fn)(pagedata + pageidx, state);
> -		if (ret)
> -			break;
> -		pageidx += size;
> -	}
> -
> -	return ret;
> -}
> -
> -struct mmap_mfn_state {
> -	unsigned long va;
> -	struct vm_area_struct *vma;
> -	domid_t domain;
> -};
> -
> -static int mmap_mfn_range(void *data, void *state)
> -{
> -	struct privcmd_mmap_entry *msg = data;
> -	struct mmap_mfn_state *st = state;
> -	struct vm_area_struct *vma = st->vma;
> -	int rc;
> -
> -	/* Do not allow range to wrap the address space. */
> -	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
> -	    ((unsigned long)(msg->npages << PAGE_SHIFT) >=
-st->va))
> -		return -EINVAL;
> -
> -	/* Range chunks must be contiguous in va space. */
> -	if ((msg->va != st->va) ||
> -	    ((msg->va+(msg->npages<<PAGE_SHIFT)) >
vma->vm_end))
> -		return -EINVAL;
> -
> -	rc = xen_remap_domain_mfn_range(vma,
> -					msg->va & PAGE_MASK,
> -					msg->mfn, msg->npages,
> -					vma->vm_page_prot,
> -					st->domain);
> -	if (rc < 0)
> -		return rc;
> -
> -	st->va += msg->npages << PAGE_SHIFT;
> -
> -	return 0;
> -}
> -
> -static long privcmd_ioctl_mmap(void __user *udata)
> -{
> -	struct privcmd_mmap mmapcmd;
> -	struct mm_struct *mm = current->mm;
> -	struct vm_area_struct *vma;
> -	int rc;
> -	LIST_HEAD(pagelist);
> -	struct mmap_mfn_state state;
> -
> -	if (!xen_initial_domain())
> -		return -EPERM;
> -
> -	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
> -		return -EFAULT;
> -
> -	rc = gather_array(&pagelist,
> -			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
> -			  mmapcmd.entry);
> -
> -	if (rc || list_empty(&pagelist))
> -		goto out;
> -
> -	down_write(&mm->mmap_sem);
> -
> -	{
> -		struct page *page = list_first_entry(&pagelist,
> -						     struct page, lru);
> -		struct privcmd_mmap_entry *msg = page_address(page);
> -
> -		vma = find_vma(mm, msg->va);
> -		rc = -EINVAL;
> -
> -		if (!vma || (msg->va != vma->vm_start) ||
> -		    !privcmd_enforce_singleshot_mapping(vma))
> -			goto out_up;
> -	}
> -
> -	state.va = vma->vm_start;
> -	state.vma = vma;
> -	state.domain = mmapcmd.dom;
> -
> -	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
> -			    &pagelist,
> -			    mmap_mfn_range, &state);
> -
> -
> -out_up:
> -	up_write(&mm->mmap_sem);
> -
> -out:
> -	free_page_list(&pagelist);
> -
> -	return rc;
> -}
> -
> -struct mmap_batch_state {
> -	domid_t domain;
> -	unsigned long va;
> -	struct vm_area_struct *vma;
> -	int err;
> -
> -	xen_pfn_t __user *user;
> -};
> -
> -static int mmap_batch_fn(void *data, void *state)
> -{
> -	xen_pfn_t *mfnp = data;
> -	struct mmap_batch_state *st = state;
> -
> -	if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK,
*mfnp, 1,
> -				       st->vma->vm_page_prot, st->domain) < 0) {
> -		*mfnp |= 0xf0000000U;
> -		st->err++;
> -	}
> -	st->va += PAGE_SIZE;
> -
> -	return 0;
> -}
> -
> -static int mmap_return_errors(void *data, void *state)
> -{
> -	xen_pfn_t *mfnp = data;
> -	struct mmap_batch_state *st = state;
> -
> -	return put_user(*mfnp, st->user++);
> -}
> -
> -static struct vm_operations_struct privcmd_vm_ops;
> -
> -static long privcmd_ioctl_mmap_batch(void __user *udata)
> -{
> -	int ret;
> -	struct privcmd_mmapbatch m;
> -	struct mm_struct *mm = current->mm;
> -	struct vm_area_struct *vma;
> -	unsigned long nr_pages;
> -	LIST_HEAD(pagelist);
> -	struct mmap_batch_state state;
> -
> -	if (!xen_initial_domain())
> -		return -EPERM;
> -
> -	if (copy_from_user(&m, udata, sizeof(m)))
> -		return -EFAULT;
> -
> -	nr_pages = m.num;
> -	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
> -		return -EINVAL;
> -
> -	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
> -			   m.arr);
> -
> -	if (ret || list_empty(&pagelist))
> -		goto out;
> -
> -	down_write(&mm->mmap_sem);
> -
> -	vma = find_vma(mm, m.addr);
> -	ret = -EINVAL;
> -	if (!vma ||
> -	    vma->vm_ops != &privcmd_vm_ops ||
> -	    (m.addr != vma->vm_start) ||
> -	    ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
> -	    !privcmd_enforce_singleshot_mapping(vma)) {
> -		up_write(&mm->mmap_sem);
> -		goto out;
> -	}
> -
> -	state.domain = m.dom;
> -	state.vma = vma;
> -	state.va = m.addr;
> -	state.err = 0;
> -
> -	ret = traverse_pages(m.num, sizeof(xen_pfn_t),
> -			     &pagelist, mmap_batch_fn, &state);
> -
> -	up_write(&mm->mmap_sem);
> -
> -	if (state.err > 0) {
> -		state.user = m.arr;
> -		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
> -			       &pagelist,
> -			       mmap_return_errors, &state);
> -	}
> -
> -out:
> -	free_page_list(&pagelist);
> -
> -	return ret;
> -}
> -
> -static long privcmd_ioctl(struct file *file,
> -			  unsigned int cmd, unsigned long data)
> -{
> -	int ret = -ENOSYS;
> -	void __user *udata = (void __user *) data;
> -
> -	switch (cmd) {
> -	case IOCTL_PRIVCMD_HYPERCALL:
> -		ret = privcmd_ioctl_hypercall(udata);
> -		break;
> -
> -	case IOCTL_PRIVCMD_MMAP:
> -		ret = privcmd_ioctl_mmap(udata);
> -		break;
> -
> -	case IOCTL_PRIVCMD_MMAPBATCH:
> -		ret = privcmd_ioctl_mmap_batch(udata);
> -		break;
> -
> -	default:
> -		ret = -EINVAL;
> -		break;
> -	}
> -
> -	return ret;
> -}
> -
> -#ifndef HAVE_ARCH_PRIVCMD_MMAP
> -static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> -{
> -	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx,
uv=%p\n",
> -	       vma, vma->vm_start, vma->vm_end,
> -	       vmf->pgoff, vmf->virtual_address);
> -
> -	return VM_FAULT_SIGBUS;
> -}
> -
> -static struct vm_operations_struct privcmd_vm_ops = {
> -	.fault = privcmd_fault
> -};
> -
> -static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
> -{
> -	/* Unsupported for auto-translate guests. */
> -	if (xen_feature(XENFEAT_auto_translated_physmap))
> -		return -ENOSYS;
> -
> -	/* DONTCOPY is essential for Xen because copy_page_range doesn''t
know
> -	 * how to recreate these mappings */
> -	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
> -	vma->vm_ops = &privcmd_vm_ops;
> -	vma->vm_private_data = NULL;
> -
> -	return 0;
> -}
> -
> -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
> -{
> -	return (xchg(&vma->vm_private_data, (void *)1) == NULL);
> -}
> -#endif
> -
> -const struct file_operations privcmd_file_ops = {
> -	.unlocked_ioctl = privcmd_ioctl,
> -	.mmap = privcmd_mmap,
> -};
> diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
> index 1aa3897..a55fbf9 100644
> --- a/drivers/xen/xenfs/super.c
> +++ b/drivers/xen/xenfs/super.c
> @@ -16,6 +16,7 @@
>  #include <xen/xen.h>
>  
>  #include "xenfs.h"
> +#include "../privcmd.h"
>  
>  #include <asm/xen/hypervisor.h>
>  
> @@ -84,7 +85,7 @@ static int xenfs_fill_super(struct super_block *sb, void
*data, int silent)
>  		[1] = {},
>  		{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
>  		{ "capabilities", &capabilities_file_ops, S_IRUGO },
> -		{ "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
> +		{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
>  		{""},
>  	};
>  	int rc;
> diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
> index b68aa62..5056306 100644
> --- a/drivers/xen/xenfs/xenfs.h
> +++ b/drivers/xen/xenfs/xenfs.h
> @@ -2,7 +2,6 @@
>  #define _XENFS_XENBUS_H
>  
>  extern const struct file_operations xenbus_file_ops;
> -extern const struct file_operations privcmd_file_ops;
>  extern const struct file_operations xsd_kva_file_ops;
>  extern const struct file_operations xsd_port_file_ops;
>  
> -- 
> 1.7.7.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

Bastian Blank

2011-Nov-28 18:23 UTC

head link

Re: [PATCH 2/5] xen: Add privcmd device driver

On Mon, Nov 28, 2011 at 06:00:21PM +0000, Ian Campbell wrote:
> On Mon, 2011-11-28 at 17:39 +0000, Bastian Blank wrote:
> > On Mon, Nov 28, 2011 at 04:26:53PM +0000, Ian Campbell wrote:
> > > The main reason would be to avoid the select since selecting on user
> > > visible symbols is a recipe for confusion and is generally advised
> > > against.
> > Right. It is just the easiest solution.
> XENFS could depend on XEN_PRIVCMD and whatever else it needs?
This makes sure the module is built. But it does not make sure it is
also loaded.

Bastian

-- 
Another dream that failed.  There's nothing sadder.
		-- Kirk, "This side of Paradise", stardate 3417.3

Konrad Rzeszutek Wilk

2011-Nov-28 18:37 UTC

head link

Re: [PATCH 5/5] xen: Add xenbusd device driver

On Sun, Nov 27, 2011 at 11:07:08PM +0100, Bastian Blank wrote:
> Access for xenstored to the event channel and pre-allocated ring is
> managed via xenfs.  This adds its own device driver featuring mmap for
> the ring and an ioctl for the event channel.
> 
> Signed-off-by: Bastian Blank <waldi@debian.org>
> ---
>  drivers/xen/xenbus/Makefile             |    1 +
>  drivers/xen/xenbus/xenbus_dev_backend.c |   79
+++++++++++++++++++++++++++++++
>  include/xen/xenbus_dev.h                |   41 ++++++++++++++++
>  3 files changed, 121 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/xen/xenbus/xenbus_dev_backend.c
>  create mode 100644 include/xen/xenbus_dev.h
> 
> diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
> index a2ea363..7e1aa85 100644
> --- a/drivers/xen/xenbus/Makefile
> +++ b/drivers/xen/xenbus/Makefile
> @@ -10,4 +10,5 @@ xenbus-objs += xenbus_probe.o
>  xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
>  xenbus-objs += $(xenbus-be-objs-y)
>  
> +obj-$(CONFIG_XEN_DOM0) += xenbus_dev_backend.o
I think this needs to depend on XEN_BACKEND ?

You could have a dom0 without any backends... (which is one of the goals
of disaggregated device driver domains).
>  obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
> diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c
b/drivers/xen/xenbus/xenbus_dev_backend.c
> new file mode 100644
> index 0000000..5d77cee
> --- /dev/null
> +++ b/drivers/xen/xenbus/xenbus_dev_backend.c
> @@ -0,0 +1,79 @@
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/mm.h>
> +#include <linux/fs.h>
> +#include <linux/miscdevice.h>
> +#include <linux/module.h>
> +
> +#include <xen/page.h>
> +#include <xen/xenbus_dev.h>
> +
> +#include "xenbus_comms.h"
> +
> +MODULE_LICENSE("GPL");
> +
> +static long xenbusd_ioctl(struct file *file, unsigned int cmd, unsigned
long data)
> +{
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
> +	switch (cmd) {
> +		case IOCTL_XENBUSD_EVTCHN:
> +			if (xen_store_evtchn > 0)
> +				return xen_store_evtchn;
> +			return -EINVAL;
Not -ENODEV? After all, the command arguments were OK; it is just that
the event channel has not been set.
> +
> +		default:
> +			return -ENOTTY;
> +	}
> +}
> +
> +static int xenbusd_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	size_t size = vma->vm_end - vma->vm_start;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
> +	if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
> +		return -EINVAL;
> +
> +	if (remap_pfn_range(vma, vma->vm_start,
> +			    virt_to_pfn(xen_store_interface),
> +			    size, vma->vm_page_prot))
> +		return -EAGAIN;
> +
> +	return 0;
> +}
> +
> +const struct file_operations xenbusd_fops = {
> +	.mmap = xenbusd_mmap,
> +	.unlocked_ioctl = xenbusd_ioctl,
> +};
> +
> +static struct miscdevice xenbusd_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "xen/xenbusd",
> +	.fops = &xenbusd_fops,
> +};
> +
> +static int __init xenbusd_init(void)
> +{
> +	int err;
> +
> +	if (!xen_initial_domain())
With the disaggregated domains (and the patches that Daniel posted), I
think this needs to relax a bit. Perhaps just make it 'xen_domain'?

Let's CC him here.
> +		return -ENODEV;
> +
> +	err = misc_register(&xenbusd_dev);
> +	if (err)
> +		printk(KERN_ERR "Could not register xenbus device\n");
> +	return err;
> +}
> +
> +static void __exit xenbusd_exit(void)
> +{
> +	misc_deregister(&xenbusd_dev);
> +}
> +
> +module_init(xenbusd_init);
> +module_exit(xenbusd_exit);
> diff --git a/include/xen/xenbus_dev.h b/include/xen/xenbus_dev.h
> new file mode 100644
> index 0000000..f551404
> --- /dev/null
> +++ b/include/xen/xenbus_dev.h
> @@ -0,0 +1,41 @@
>
+/******************************************************************************
> + * evtchn.h
> + *
> + * Interface to /dev/xen/xenbusd.
> + *
> + * Copyright (c) 2011 Bastian Blank <waldi@debian.org>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version 2
> + * as published by the Free Software Foundation; or, when distributed
> + * separately from the Linux kernel or incorporated into other
> + * software packages, subject to the following license:
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
copy
> + * of this source file (the "Software"), to deal in the Software
without
> + * restriction, including without limitation the rights to use, copy,
modify,
> + * merge, publish, distribute, sublicense, and/or sell copies of the
Software,
> + * and to permit persons to whom the Software is furnished to do so,
subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included
in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE
> + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#ifndef __LINUX_XEN_XENBUS_DEV_H__
> +#define __LINUX_XEN_XENBUS_DEV_H__
> +
> +#include <linux/ioctl.h>
> +
> +#define IOCTL_XENBUSD_EVTCHN				\
> +	_IOC(_IOC_NONE, ''X'', 0, 0)
_IOC_READ ?

So why 'X', not 'B' for bus?
> +
> +#endif /* __LINUX_XEN_XENBUS_DEV_H__ */
> -- 
> 1.7.7.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

Bastian Blank

2011-Nov-28 18:38 UTC

head link

Re: [PATCH 1/5] xen/sys/hypervisor: Export guest_properties/is_initial_domain

On Mon, Nov 28, 2011 at 02:14:39PM -0400, Konrad Rzeszutek Wilk wrote:
> On Sun, Nov 27, 2011 at 11:07:04PM +0100, Bastian Blank wrote:
> > diff --git a/drivers/xen/sys-hypervisor.c
b/drivers/xen/sys-hypervisor.c
> > index 1e0fe01..d0916e8 100644
> > --- a/drivers/xen/sys-hypervisor.c
> > +++ b/drivers/xen/sys-hypervisor.c
> > @@ -355,6 +355,35 @@ static void xen_properties_destroy(void)
> >  	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
> >  }
> >  
> > +/* xen guest properties info */
> 
> Properties is plural, but this is a single attribute.
Just like the old /proc/xen/capabilities, it only ever supported one
attribute. However, it could export a flag for HVM domains.
>                      Perhaps the name 'is_initial_domain' would be a
> better name?
It is already called this way.
>              What is the purpose of this attribute?
Replace /proc/xen/capabilities. See
<20100605162947.GA31336@wavehammer.waldi.eu.org>
>                                                     Who/what tools
> benefit from this?
The init scripts are the only users.

Bastian

-- 
Deflector shields just came on, Captain.

Bastian Blank

2011-Nov-28 18:46 UTC

head link

Re: [PATCH 2/5] xen: Add privcmd device driver

On Mon, Nov 28, 2011 at 02:22:05PM -0400, Konrad Rzeszutek Wilk wrote:
> On Sun, Nov 27, 2011 at 11:07:05PM +0100, Bastian Blank wrote:
> > Access to arbitrary hypercalls is currently provided via xenfs. This
> > adds a standard character device to handle this. The support in xenfs
> Ok, what is the benefit of that? You mentioned in the prologue "about a
> year ago I started", but you didn't provide any links to the
> conversation. Could you include the details please?
<20100605162947.GA31336@wavehammer.waldi.eu.org>
> It looks like you are doing a move of the file. Can you use 'git mv'
> instead please.
I did actually. But git format-patch needs an option to provide diffs of
moves.
> If it breaks compile build, you can modify the Kconfig to inhibit the
> build (say make it dependent on a symbol that won't be turned on).
No, it does not break the build.
> > +config XEN_PRIVCMD
> > +	tristate
> > +	depends on XEN_DOM0
> 
> Would it be possible for HVM domains that have the backend drivers in
> them (so blkback for example) to use these hypercalls? If so should this
> XEN_DOM0 be perhaps changed to something else?
The depends is wrong, privcmd can be used in any domain. Currently it
can be only activated through XENFS, but this should be changed.

Bastian

-- 
Love sometimes expresses itself in sacrifice.
		-- Kirk, "Metamorphosis", stardate 3220.3

Bastian Blank

2011-Nov-28 18:52 UTC

head link

Re: [PATCH 5/5] xen: Add xenbusd device driver

On Mon, Nov 28, 2011 at 02:37:23PM -0400, Konrad Rzeszutek Wilk wrote:
> On Sun, Nov 27, 2011 at 11:07:08PM +0100, Bastian Blank wrote:
> > +obj-$(CONFIG_XEN_DOM0) += xenbus_dev_backend.o
> I think this needs to depend on XEN_BACKEND ?
Right.
> > +		case IOCTL_XENBUSD_EVTCHN:
> > +			if (xen_store_evtchn > 0)
> > +				return xen_store_evtchn;
> > +			return -EINVAL;
> 
> Not -ENODEV? After all, the command arguments were OK, it is just that
> the eventchannel has not been set.
Ups.
> > +	if (!xen_initial_domain())
> 
> With the disaggregated domains (and the patches that Daniel posted), I
> think this needs to relax a bit. Perhaps just make it 'xen_domain'?
Right now, xenstored needs to run where the communication ring is
located. And this ring is allocated in dom0. How would any domU run
without xenstored available?
> > +#define IOCTL_XENBUSD_EVTCHN				\
> > +	_IOC(_IOC_NONE, ''X'', 0, 0)
> _IOC_READ ?
No. It does not communicate via the parameter, only the return value.

Bastian

-- 
Schshschshchsch.
		-- The Gorn, "Arena", stardate 3046.2

Daniel De Graaf

2011-Nov-28 19:12 UTC

head link

Re: [PATCH 5/5] xen: Add xenbusd device driver

On 11/28/2011 01:52 PM, Bastian Blank wrote:
> On Mon, Nov 28, 2011 at 02:37:23PM -0400, Konrad Rzeszutek Wilk wrote:
>> On Sun, Nov 27, 2011 at 11:07:08PM +0100, Bastian Blank wrote:
>>> +	if (!xen_initial_domain())
>>
>> With the disaggregated domains (and the patches that Daniel posted), I
>> think this needs to relax a bit. Perhaps just make it 'xen_domain'?
What we want is for this device to appear any time xenstored is in
the local domain. In xenbus_probe, I use the xen_start_info structure
to determine this - xen_start_info->store_evtchn is nonzero if there
is a remote xenstored.
> Right now, xenstored needs to run where the communication ring is
> located. And this ring is allocated in dom0. How would any domU run
> without xenstored available?
> 
A domU can run just fine without xenstored available - in particular,
it is possible to run xenstored itself in a domU. This requires either
dom0 patches to enable xenbus to be relocated after launch, or for
xenstored to be started prior to the Linux initial domain. In both
cases, it also requires a modified domain builder that doesn't talk
to xenstored.

This change is only important if xenstored is running in a Linux-based
stub domain instead of a minios-based one.

Ian Campbell

2011-Nov-28 19:13 UTC

head link

Re: [PATCH 2/5] xen: Add privcmd device driver

On Mon, 2011-11-28 at 18:23 +0000, Bastian Blank wrote:
> On Mon, Nov 28, 2011 at 06:00:21PM +0000, Ian Campbell wrote:
> > On Mon, 2011-11-28 at 17:39 +0000, Bastian Blank wrote:
> > > On Mon, Nov 28, 2011 at 04:26:53PM +0000, Ian Campbell wrote:
> > > > The main reason would be to avoid the select since selecting on user
> > > > visible symbols is a recipe for confusion and is generally advised
> > > > against.
> > > Right. It is just the easiest solution.
> > XENFS could depend on XEN_PRIVCMD and whatever else it needs?
> 
> This makes sure the module is built. But it does not make sure it is
> also loaded.
Since xenfs uses symbols from xen-privcmd "modprobe xenfs" will also
cause xen-privcmd to be loaded.

Ian.

Bastian Blank

2011-Nov-28 19:42 UTC

head link

Re: [PATCH 5/5] xen: Add xenbusd device driver

On Mon, Nov 28, 2011 at 02:12:19PM -0500, Daniel De Graaf wrote:
> On 11/28/2011 01:52 PM, Bastian Blank wrote:
> > On Mon, Nov 28, 2011 at 02:37:23PM -0400, Konrad Rzeszutek Wilk wrote:
> >> On Sun, Nov 27, 2011 at 11:07:08PM +0100, Bastian Blank wrote:
> >>> +	if (!xen_initial_domain())
> >>
> >> With the disaggregated domains (and the patches that Daniel posted), I
> >> think this needs to relax a bit. Perhaps just make it 'xen_domain'?
> 
> What we want is for this device to appear any time xenstored is in
> the local domain. In xenbus_probe, I use the xen_start_info structure
> to determine this - xen_start_info->store_evtchn is nonzero if there
> is a remote xenstored.
Ah. I missed this change. So the best way to do this is to register the
device from xenstored_local_init.

Bastian

-- 
Prepare for tomorrow -- get ready.
		-- Edith Keeler, "The City On the Edge of Forever",
		   stardate unknown

Konrad Rzeszutek Wilk

2011-Nov-29 23:24 UTC

head link

Re: [PATCH 1/5] xen/sys/hypervisor: Export guest_properties/is_initial_domain

On Mon, Nov 28, 2011 at 07:38:30PM +0100, Bastian Blank wrote:
> On Mon, Nov 28, 2011 at 02:14:39PM -0400, Konrad Rzeszutek Wilk wrote:
> > On Sun, Nov 27, 2011 at 11:07:04PM +0100, Bastian Blank wrote:
> > > diff --git a/drivers/xen/sys-hypervisor.c
b/drivers/xen/sys-hypervisor.c
> > > index 1e0fe01..d0916e8 100644
> > > --- a/drivers/xen/sys-hypervisor.c
> > > +++ b/drivers/xen/sys-hypervisor.c
> > > @@ -355,6 +355,35 @@ static void xen_properties_destroy(void)
> > >  	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
> > >  }
> > >  
> > > +/* xen guest properties info */
> > 
> > Properties is plural, but this is a single attribute.
> 
> Just like the old /proc/xen/capabilites, it only supported one attribute
> ever. However it could export a flag for hvm domain.
> 
> > >                      Perhaps the name 'is_initial_domain' would be a
> > > better name?
> > 
> > It is already called this way.
Ah yes. Somehow I was thinking it was guest_properties.
> 
> >              What is the purpose of this attribute?
> 
> Replace /proc/xen/capabilities. See
> <20100605162947.GA31336@wavehammer.waldi.eu.org>
> 
> >                                                     Who/what tools
> > benefit from this?
> 
> The init scripts are the only users.
> 
> Bastian
> 
> -- 
> Deflector shields just came on, Captain.

Xen devel - Nov 2011 - [PATCH 0/5] Move stuff out of xenfs

[PATCH 0/5] Move stuff out of xenfs

[PATCH 1/5] xen/sys/hypervisor: Export guest_properties/is_initial_domain

[PATCH 2/5] xen: Add privcmd device driver

[PATCH 3/5] xen/privcmd: Remove unused support for arch specific privcmp mmap

[PATCH 4/5] xen: Add xenbus device driver

[PATCH 5/5] xen: Add xenbusd device driver

Re: [PATCH 0/5] Move stuff out of xenfs

Re: [PATCH 0/5] Move stuff out of xenfs

Re: [PATCH 0/5] Move stuff out of xenfs

Re: [PATCH 0/5] Move stuff out of xenfs

Re: [PATCH 2/5] xen: Add privcmd device driver

Re: [PATCH 2/5] xen: Add privcmd device driver

Re: [PATCH 2/5] xen: Add privcmd device driver

Re: [PATCH 4/5] xen: Add xenbus device driver

Re: [PATCH 1/5] xen/sys/hypervisor: Export guest_properties/is_initial_domain

Re: [PATCH 2/5] xen: Add privcmd device driver

Re: [PATCH 2/5] xen: Add privcmd device driver

Re: [PATCH 5/5] xen: Add xenbusd device driver

Re: [PATCH 1/5] xen/sys/hypervisor: Export guest_properties/is_initial_domain

Re: [PATCH 2/5] xen: Add privcmd device driver

Re: [PATCH 5/5] xen: Add xenbusd device driver

Re: [PATCH 5/5] xen: Add xenbusd device driver

Re: [PATCH 2/5] xen: Add privcmd device driver

Re: [PATCH 5/5] xen: Add xenbusd device driver

Re: [PATCH 1/5] xen/sys/hypervisor: Export guest_properties/is_initial_domain