In a hosted VMM like LinuxOnLinux or UML, context switch time can be a
major problem (as mmap when repeated for each guest page frame takes a
long time). One solution is to allow the host kernel to keep a cache of
address space contexts, and switch between them in a single
operation.
The attached patch is a start at this. It works well for
LinuxOnLinux, but I'd be interested to hear from the maintainers of
other userspace VMMs whether it (or something similar) would be
helpful for you too.
For LinuxOnLinux, the simplest delivery mechanism is system calls.
The patch below doesn't have the system call infrastructure in it
(it's trivial). For UML, a ptrace interface may be more useful; but
I'm unsure of the locking and other implications involved in
manipulating a process's address space remotely.
The way you use the patches is something like this:
... clone()
/*
 * Mark the data region as shared.
 * You may want to do something with the stack as well.
 *
 * Do this *after* the clone to get the right sharing.
 * We want to share with multiplexed address spaces, not
 * with other virtual processors.
 */
{
	extern char data[];
	unsigned long sd = (unsigned long)data & ~(pagesize - 1);
	long len = roundup((unsigned long)sbrk(0), pagesize) - sd;
	char *p;

	/* Anonymous mapping: pass fd -1, as portable code is expected to. */
	p = mmap((void *)NULL, len,
		 PROT_READ|PROT_WRITE,
		 MAP_SHARED|MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		deal with it
	}
	/* Copy the live data out, then move the shared copy back over it. */
	memcpy(p, (char *)sd, len);
	p = mremap(p, len, len, MREMAP_MAYMOVE|MREMAP_FIXED, (char *)sd);
	if (p == MAP_FAILED)
		deal with it
}
/* Test the limit before calling new_as(), so no surplus context is created. */
while (nas < NCONTEXTS && new_as() >= 0)
	nas++;
switch_as(0);
After this, you can call switch_as in a signal handler running on a
stack in the shared data region. Anything else is liable to lead to
stack corruption or SEGFAULTs.
There are a couple of places where the semantics could be changed.
For instance if a process with multiple address spaces forks, should
it get all the address spaces, or just the current one? I've chosen
to duplicate just the current address space, as it's slightly simpler
and works for all the use cases I can think of.
It's up to the userspace caller to manage its own address space after
calling these kernel entry points. In particular, anything you want
shared between address spaces will have to mmap MAP_SHARED in each space.
Provide multiple address spaces in a single process.
It's the invoker's responsibility to make sure that the areas to be
shared (data and stack) are mapped MAP_SHARED before the address spaces
are created.
---
include/linux/sched.h | 17 +++++++++
kernel/exit.c | 1
kernel/fork.c | 12 ++++++
mm/Kconfig | 7 ++++
mm/Makefile | 1
mm/multias.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 124 insertions(+), 1 deletion(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h 2007-06-13 20:30:15.000000000 -0400
+++ linux-2.6/include/linux/sched.h 2007-06-25 16:14:35.000000000 -0400
@@ -864,6 +864,10 @@
struct mm_struct *mm, *active_mm;
+#ifdef CONFIG_VMMHOST
+ /* array of N_MM pointers to struct mm_structs that can be switched between
*/
+ struct mm_struct **mm_avail;
+#endif
/* task state */
struct linux_binfmt *binfmt;
int exit_state;
@@ -1712,6 +1716,19 @@
}
#endif
+#ifdef CONFIG_VMMHOST
+extern struct mm_struct *dup_mm(struct task_struct *);
+extern void __exit_as(struct task_struct *);
+/* Only a task that has an mm_avail array needs the out-of-line teardown. */
+static inline void exit_as(struct task_struct *tsk)
+{
+	if (tsk->mm_avail != NULL)
+		__exit_as(tsk);
+}
+#else
+static inline void exit_as(struct task_struct *tsk) { }
+#endif
+
#endif /* __KERNEL__ */
#endif
Index: linux-2.6/kernel/exit.c
===================================================================
--- linux-2.6.orig/kernel/exit.c 2007-06-13 20:30:15.000000000 -0400
+++ linux-2.6/kernel/exit.c 2007-06-25 13:50:32.000000000 -0400
@@ -943,6 +943,7 @@
taskstats_exit(tsk, group_dead);
+ exit_as(tsk);
exit_mm(tsk);
if (group_dead)
Index: linux-2.6/mm/multias.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/mm/multias.c 2007-06-26 07:05:21.000000000 -0400
@@ -0,0 +1,87 @@
+#include <linux/sched.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#define N_MM 16 /* Number of address-space contexts */
+
+/*
+ * sys_new_as duplicates current's address space into the first free
+ * mm_avail slot and returns that slot's index.  The first call also
+ * allocates the N_MM-entry array and records the active mm in slot 0.
+ * Returns -ENOMEM if the array or dup_mm() fails, -ENOENT if no slot is free.
+ */
+asmlinkage unsigned long sys_new_as(void)
+{
+	struct mm_struct **mp, **end;
+	/* Locking?  A process acts on its own task_struct; but threads? */
+	if (!current->mm_avail) {
+		current->mm_avail = kzalloc(N_MM * sizeof(*current->mm_avail),
+					    GFP_KERNEL);
+		if (!current->mm_avail)
+			return -ENOMEM;
+		atomic_inc(&current->mm->mm_users);
+		current->mm_avail[0] = current->active_mm;
+	}
+	/* Test the bound before dereferencing: a full array has no NULL slot. */
+	end = current->mm_avail + N_MM;
+	for (mp = current->mm_avail; mp < end && *mp; mp++)
+		;
+	if (mp == end)
+		return -ENOENT;
+	*mp = dup_mm(current);
+	if (!*mp)
+		return -ENOMEM;
+	(*mp)->token_priority = 0;
+	(*mp)->last_interval = 0;
+	return mp - current->mm_avail;
+}
+
+/*
+ * sys_switch_as makes slot n of current's mm_avail array the task's mm.
+ * Returns 0 on success, or -ENOENT when n is out of range or names a
+ * slot that has not been created.  (Creation on demand could be folded in.)
+ */
+asmlinkage unsigned long sys_switch_as(int n)
+{
+	struct mm_struct *prev, *next;
+
+	if (n < 0 || n >= N_MM)
+		return -ENOENT;
+	if (!current->mm_avail || !current->mm_avail[n])
+		return -ENOENT;
+	/* task_lock keeps /proc and ptrace from looking at mm mid-switch */
+	task_lock(current);
+	prev = current->mm;
+	next = current->mm_avail[n];
+	current->mm = next;
+	BUG_ON(current->active_mm != prev);
+	/*
+	 * Semantics still open: the use counts are moved on every switch so
+	 * they stay accurate.  The alternative would be to make current be
+	 * mm_avail[0] in __exit_as.
+	 */
+	atomic_dec(&prev->mm_users);
+	atomic_inc(&next->mm_users);
+	current->active_mm = next;
+	switch_mm(prev, next, current);
+	task_unlock(current);
+
+	return 0;
+}
+
+/* Drop the array's reference on each created mm, then free the array. */
+void __exit_as(struct task_struct *tsk)
+{
+	struct mm_struct **mmp, **end;
+	BUG_ON(!tsk->mm_avail);
+	/* Test the bound before dereferencing: a full array has no NULL slot. */
+	end = tsk->mm_avail + N_MM;
+	for (mmp = tsk->mm_avail; mmp < end && *mmp; mmp++)
+		mmput(*mmp);
+	kfree(tsk->mm_avail);
+	tsk->mm_avail = NULL;	/* paranoia */
+}
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c 2007-06-07 19:13:00.000000000 -0400
+++ linux-2.6/kernel/fork.c 2007-06-26 06:30:18.000000000 -0400
@@ -477,7 +477,7 @@
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
*/
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
int err;
@@ -557,6 +557,16 @@
if (!mm)
goto fail_nomem;
+#ifdef CONFIG_VMMHOST
+ /*
+ * Could forbid forking a multi-address space process,
+ * could copy all, or could copy just the current address space.
+ * I've chosen to do the latter.
+ * Although I'm not anticipating many multi-as processes calling fork.
+ */
+ tsk->mm_avail = NULL;
+#endif
+
good_mm:
/* Initializing for Swap token stuff */
mm->token_priority = 0;
Index: linux-2.6/mm/Makefile
===================================================================
--- linux-2.6.orig/mm/Makefile 2007-06-07 19:13:02.000000000 -0400
+++ linux-2.6/mm/Makefile 2007-06-25 13:50:32.000000000 -0400
@@ -30,5 +30,6 @@
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
+obj-$(CONFIG_VMMHOST) += multias.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
Index: linux-2.6/mm/Kconfig
===================================================================
--- linux-2.6.orig/mm/Kconfig 2007-06-07 19:13:02.000000000 -0400
+++ linux-2.6/mm/Kconfig 2007-06-25 13:50:32.000000000 -0400
@@ -168,3 +168,10 @@
depends on QUICKLIST
default "2" if (SUPERH && !SUPERH64)
default "1"
+
+config VMMHOST
+	bool "Multiple address spaces for userspace virtualisation acceleration"
+	default n
+	help
+	  This option allows a process to have more than one address space,
+	  and enables the new_as and switch_as system calls.
--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
http://www.ertos.nicta.com.au ERTOS within National ICT Australia