Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 13 of 13] Put .note.* sections into a PT_NOTE segment in vmlinux
3 files changed, 101 insertions(+), 2 deletions(-) arch/i386/kernel/vmlinux.lds.S | 12 ++++- include/asm-generic/vmlinux.lds.h | 3 + include/linux/elfnote.h | 88 +++++++++++++++++++++++++++++++++++++ This patch will pack any .note.* section into a PT_NOTE segment in the output file. To do this, we tell ld that we need a PT_NOTE segment. This requires us to start explicitly mapping sections to segments, so we also need to explicitly create PT_LOAD segments for text and data, and map the sections to them appropriately. Fortunately, each section will default to its previous section's segment, so it doesn't take many changes to vmlinux.lds.S. This only changes i386 for now, but I presume the corresponding changes for other architectures will be as simple. This change also adds <linux/elfnote.h>, which defines C and Assembler macros for actually creating ELF notes. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Hollis Blanchard <hollisb@us.ibm.com> ================================================================== diff -r 8235caea9d68 -r 2bf2abf6e970 arch/i386/kernel/vmlinux.lds.S --- a/arch/i386/kernel/vmlinux.lds.S Tue Aug 01 01:32:01 2006 -0700 +++ b/arch/i386/kernel/vmlinux.lds.S Tue Aug 01 01:32:01 2006 -0700 @@ -13,6 +13,12 @@ OUTPUT_ARCH(i386) OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __KERNEL_START; @@ -26,7 +32,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 _etext = .; /* End of text section */ @@ -48,7 +54,7 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) CONSTRUCTORS - } + } :data . = ALIGN(4096); __nosave_begin = .; @@ -184,4 +190,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + NOTES } diff -r 8235caea9d68 -r 2bf2abf6e970 include/asm-generic/vmlinux.lds.h --- a/include/asm-generic/vmlinux.lds.h Tue Aug 01 01:32:01 2006 -0700 +++ b/include/asm-generic/vmlinux.lds.h Tue Aug 01 01:32:01 2006 -0700 @@ -194,3 +194,6 @@ .stab.index 0 : { *(.stab.index) } \ .stab.indexstr 0 : { *(.stab.indexstr) } \ .comment 0 : { *(.comment) } + +#define NOTES \ + .notes : { *(.note.*) } :note diff -r 8235caea9d68 -r 2bf2abf6e970 include/linux/elfnote.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/elfnote.h Tue Aug 01 01:32:01 2006 -0700 @@ -0,0 +1,88 @@ +#ifndef _LINUX_ELFNOTE_H +#define _LINUX_ELFNOTE_H +/* + * Helper macros to generate ELF Note structures, which are put into a + * PT_NOTE segment of the final vmlinux image. These are useful for + * including name-value pairs of metadata into the kernel binary (or + * modules?) for use by external programs. + * + * Each note has three parts: a name, a type and a desc. The name is + * intended to distinguish the note's originator, so it would be a + * company, project, subsystem, etc; it must be in a suitable form for + * use in a section name. The type is an integer which is used to tag + * the data, and is considered to be within the "name" namespace (so + * "FooCo"'s type 42 is distinct from "BarProj"'s type 42). The + * "desc" field is the actual data. There are no constraints on the + * desc field's contents, though typically they're fairly small. + * + * All notes from a given NAME are put into a section named + * .note.NAME. 
When the kernel image is finally linked, all the notes + * are packed into a single .notes section, which is mapped into the + * PT_NOTE segment. Because notes for a given name are grouped into + * the same section, they'll all be adjacent the output file. + * + * This file defines macros for both C and assembler use. Their + * syntax is slightly different, but they're semantically similar. + * + * See the ELF specification for more detail about ELF notes. + */ + +#ifdef __ASSEMBLER__ +/* + * Generate a structure with the same shape as Elf{32,64}_Nhdr (which + * turn out to be the same size and shape), followed by the name and + * desc data with appropriate padding. The 'desc' argument includes + * the assembler pseudo op defining the type of the data: .asciz + * "hello, world" + */ +.macro ELFNOTE name type desc:vararg +.pushsection ".note.\name" + .align 4 + .long 2f - 1f /* namesz */ + .long 4f - 3f /* descsz */ + .long \type +1:.asciz "\name" +2:.align 4 +3:\desc +4:.align 4 +.popsection +.endm +#else /* !__ASSEMBLER__ */ +#include <linux/elf.h> +/* + * Use an anonymous structure which matches the shape of + * Elf{32,64}_Nhdr, but includes the name and desc data. The size and + * type of name and desc depend on the macro arguments. "name" must + * be a literal string, and "desc" must be passed by value. You may + * only define one note per line, since __LINE__ is used to generate + * unique symbols. + */ +#define _ELFNOTE_PASTE(a,b) a##b +#define _ELFNOTE(size, name, unique, type, desc) \ + static const struct { \ + struct elf##size##_note _nhdr; \ + unsigned char _name[sizeof(name)] \ + __attribute__((aligned(sizeof(Elf##size##_Word)))); \ + typeof(desc) _desc \ + __attribute__((aligned(sizeof(Elf##size##_Word)))); \ + } _ELFNOTE_PASTE(_note_, unique) \ + __attribute_used__ \ + __attribute__((section(".note." name), \ + aligned(sizeof(Elf##size##_Word)), \ + unused)) = { \ + { \ + sizeof(name), \ + sizeof(desc), \ + type, \ + }, \ + name, \ + desc \ + } +#define ELFNOTE(size, name, type, desc) \ + _ELFNOTE(size, name, __LINE__, type, desc) + +#define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc) +#define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc) +#endif /* __ASSEMBLER__ */ + +#endif /* _LINUX_ELFNOTE_H */
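For a sense of how these macros get used, here is a minimal C-side sketch; the "ExampleCo" vendor name, the type value and the payload are invented for illustration and are not part of the patch:

#include <linux/elfnote.h>

/* Type values are interpreted within the "ExampleCo" namespace only. */
#define EXAMPLECO_NOTE_ABI_VERSION      1

/* Emits a note into a ".note.ExampleCo" section; at link time it is
 * packed into .notes and covered by the PT_NOTE program header. */
ELFNOTE32("ExampleCo", EXAMPLECO_NOTE_ABI_VERSION, 0x00010002);

The resulting notes can then be picked out of the vmlinux image with any ELF tool that understands PT_NOTE (readelf -n, for instance), which is what makes them useful for exporting metadata to bootloaders and hypervisors.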
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 7 of 13] Make __FIXADDR_TOP variable to allow it to make space for a hypervisor
5 files changed, 68 insertions(+), 2 deletions(-) arch/i386/Kconfig | 1 + arch/i386/mm/init.c | 42 ++++++++++++++++++++++++++++++++++++++++++ arch/i386/mm/pgtable.c | 18 ++++++++++++++++++ include/asm-i386/fixmap.h | 7 ++++++- include/asm-i386/page.h | 2 +- Make __FIXADDR_TOP a variable, so that it can be set to not get in the way of address space a hypervisor may want to reserve. Original patch by Gerd Hoffmann <kraxel@suse.de> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Gerd Hoffmann <kraxel@suse.de> ================================================================== diff -r 730b4fe6bc1e -r b6c100bb5ca5 arch/i386/Kconfig --- a/arch/i386/Kconfig Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/Kconfig Tue Aug 01 01:32:00 2006 -0700 @@ -792,6 +792,7 @@ config COMPAT_VDSO config COMPAT_VDSO bool "Compat VDSO support" default y + depends on !PARAVIRT help Map the VDSO to the predictable old-style address too. ---help--- diff -r 730b4fe6bc1e -r b6c100bb5ca5 arch/i386/mm/init.c --- a/arch/i386/mm/init.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/mm/init.c Tue Aug 01 01:32:00 2006 -0700 @@ -629,6 +629,48 @@ void __init mem_init(void) (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); +#if 1 /* double-sanity-check paranoia */ + printk("virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +#endif + + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); +#endif /* double-sanity-check paranoia */ + #ifdef CONFIG_X86_PAE if (!cpu_has_pae) panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); diff -r 730b4fe6bc1e -r b6c100bb5ca5 arch/i386/mm/pgtable.c --- a/arch/i386/mm/pgtable.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/mm/pgtable.c Tue Aug 01 01:32:00 2006 -0700 @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/spinlock.h> +#include <linux/module.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -137,6 +138,12 @@ void set_pmd_pfn(unsigned long vaddr, un __flush_tlb_one(vaddr); } +static int fixmaps = 0; +#ifndef CONFIG_COMPAT_VDSO +unsigned long __FIXADDR_TOP = 0xfffff000; +EXPORT_SYMBOL(__FIXADDR_TOP); +#endif + void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { unsigned long address = __fix_to_virt(idx); @@ -146,6 +153,17 @@ void __set_fixmap (enum fixed_addresses return; } set_pte_pfn(address, phys >> PAGE_SHIFT, 
flags); + fixmaps++; +} + +void set_fixaddr_top(unsigned long top) +{ + BUG_ON(fixmaps > 0); +#ifdef CONFIG_COMPAT_VDSO + BUG_ON(top - PAGE_SIZE != __FIXADDR_TOP); +#else + __FIXADDR_TOP = top - PAGE_SIZE; +#endif } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) diff -r 730b4fe6bc1e -r b6c100bb5ca5 include/asm-i386/fixmap.h --- a/include/asm-i386/fixmap.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/fixmap.h Tue Aug 01 01:32:00 2006 -0700 @@ -19,7 +19,11 @@ * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. */ -#define __FIXADDR_TOP 0xfffff000 +#ifndef CONFIG_COMPAT_VDSO +extern unsigned long __FIXADDR_TOP; +#else +#define __FIXADDR_TOP 0xfffff000 +#endif #ifndef __ASSEMBLY__ #include <linux/kernel.h> @@ -93,6 +97,7 @@ enum fixed_addresses { extern void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags); +extern void set_fixaddr_top(unsigned long top); #define set_fixmap(idx, phys) \ __set_fixmap(idx, phys, PAGE_KERNEL) diff -r 730b4fe6bc1e -r b6c100bb5ca5 include/asm-i386/page.h --- a/include/asm-i386/page.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/page.h Tue Aug 01 01:32:00 2006 -0700 @@ -122,7 +122,7 @@ extern int page_is_ram(unsigned long pag #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) -#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
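To show where the new hook is intended to sit, here is a hypothetical early-boot caller; the function and constant names are made up, and it assumes COMPAT_VDSO is off (which the Kconfig change above guarantees for paravirt kernels):

#include <asm/fixmap.h>

#define EXAMPLE_HV_HOLE         (160UL << 20)   /* illustrative 160MB hole */

static void __init example_reserve_hypervisor_hole(void)
{
        /* Must run before the first __set_fixmap(): set_fixaddr_top()
         * BUGs once any fixmap slot has been established, since the
         * already-placed mappings could not be moved afterwards. */
        set_fixaddr_top(-EXAMPLE_HV_HOLE);
}

Everything derived from __FIXADDR_TOP (the fixmap itself, and through MAXMEM the amount of lowmem) then shifts down automatically.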
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 3 of 13] Implement always-locked bit ops, for memory shared with an SMP hypervisor
2 files changed, 192 insertions(+) include/asm-i386/sync_bitops.h | 156 ++++++++++++++++++++++++++++++++++++++++ include/asm-i386/system.h | 36 +++++++++ Add "always lock'd" implementations of set_bit, clear_bit and change_bit and the corresponding test_and_ functions. Also add "always lock'd" implementation of cmpxchg. These give guaranteed strong synchronisation and are required for non-SMP kernels running on an SMP hypervisor. Signed-off-by: Ian Pratt <ian.pratt@xensource.com> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Christoph Lameter <clameter@sgi.com> ================================================================== diff -r ffccb62e9244 -r 85e7eadfaea1 include/asm-i386/system.h --- a/include/asm-i386/system.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/system.h Tue Aug 01 01:32:00 2006 -0700 @@ -261,6 +261,9 @@ static inline unsigned long __xchg(unsig #define cmpxchg(ptr,o,n)\ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ (unsigned long)(n),sizeof(*(ptr)))) +#define sync_cmpxchg(ptr,o,n)\ + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\ + (unsigned long)(n),sizeof(*(ptr)))) #endif static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, @@ -282,6 +285,39 @@ static inline unsigned long __cmpxchg(vo return prev; case 4: __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +/* + * Always use locked operations when touching memory shared with a + * hypervisor, since the system may be SMP even if the guest kernel + * isn't. + */ +static inline unsigned long __sync_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" : "=a"(prev) : "r"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); diff -r ffccb62e9244 -r 85e7eadfaea1 include/asm-i386/sync_bitops.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/asm-i386/sync_bitops.h Tue Aug 01 01:32:00 2006 -0700 @@ -0,0 +1,156 @@ +#ifndef _I386_SYNC_BITOPS_H +#define _I386_SYNC_BITOPS_H + +/* + * Copyright 1992, Linus Torvalds. + */ + +/* + * These have to be done with inline assembly: that way the bit-setting + * is guaranteed to be atomic. All bit operations return 0 if the bit + * was cleared before the operation and != 0 if it was not. + * + * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). + */ + +#define ADDR (*(volatile long *) addr) + +/** + * sync_set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * + * Note: there are no guarantees that this function will not be reordered + * on non x86 architectures, so if you are writting portable code, + * make sure not to rely on its reordering guarantees. + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. 
+ */ +static inline void sync_set_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btsl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * sync_clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static inline void sync_clear_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btrl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_change_bit - Toggle a bit in memory + * @nr: Bit to change + * @addr: Address to start counting from + * + * change_bit() is atomic and may not be reordered. It may be + * reordered on other architectures than x86. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void sync_change_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btcl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It may be reordered on other architectures than x86. + * It also implies a memory barrier. + */ +static inline int sync_test_and_set_bit(int nr, volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btsl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +/** + * sync_test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It can be reorderdered on other architectures other than x86. + * It also implies a memory barrier. + */ +static inline int sync_test_and_clear_bit(int nr, volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +/** + * sync_test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int sync_test_and_change_bit(int nr, volatile unsigned long* addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btcl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +static __always_inline int sync_const_test_bit(int nr, const volatile unsigned long *addr) +{ + return ((1UL << (nr & 31)) & + (((const volatile unsigned int *)addr)[nr >> 5])) != 0; +} + +static inline int sync_var_test_bit(int nr, const volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit) + :"m" (ADDR),"Ir" (nr)); + return oldbit; +} + +#define sync_test_bit(nr,addr) \ + (__builtin_constant_p(nr) ? \ + sync_constant_test_bit((nr),(addr)) : \ + sync_var_test_bit((nr),(addr))) + +#undef ADDR + +#endif /* _I386_SYNC_BITOPS_H */
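As a usage sketch, suppose a page is shared with an SMP hypervisor or another domain; even a UP guest kernel must then use the always-lock'd operations. The shared-flags layout and the helpers below are purely illustrative:

#include <asm/sync_bitops.h>
#include <asm/system.h>

struct example_shared_flags {
        unsigned long pending[4];       /* also written by the hypervisor */
        unsigned int generation;
};

static int example_claim_event(struct example_shared_flags *s, int nr)
{
        /* Non-zero means we saw the bit set and cleared it ourselves,
         * so neither another CPU nor the hypervisor can also claim it. */
        return sync_test_and_clear_bit(nr, s->pending);
}

static unsigned int example_bump_generation(struct example_shared_flags *s,
                                            unsigned int old)
{
        /* Strongly-synchronised compare-and-exchange on shared memory;
         * returns the value actually found (== old on success). */
        return sync_cmpxchg(&s->generation, old, old + 1);
}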
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 11 of 13] Implement lazy MMU update hooks which are SMP safe for both direct and
5 files changed, 34 insertions(+) include/asm-generic/pgtable.h | 20 ++++++++++++++++++++ mm/memory.c | 8 ++++++++ mm/mprotect.c | 2 ++ mm/mremap.c | 2 ++ mm/msync.c | 2 ++ shadow page tables. The idea is that PTE updates and page invalidations while in lazy mode can be batched into a single hypercall. We use this in VMI for shadow page table synchronization, and it is a win. It also can be used by PPC and for direct page tables on Xen. For SMP, the enter / leave must happen under protection of the page table locks for page tables which are being modified. This is because otherwise, you end up with stale state in the batched hypercall, which other CPUs can race ahead of. Doing this under the protection of the locks guarantees the synchronization is correct, and also means that spurious faults which are generated during this window by remote CPUs are properly handled, as the page fault handler must re-check the PTE under protection of the same lock. Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 553154516a1b -r 398f8fd6b334 include/asm-generic/pgtable.h --- a/include/asm-generic/pgtable.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-generic/pgtable.h Tue Aug 01 01:32:01 2006 -0700 @@ -164,6 +164,26 @@ static inline void ptep_set_wrprotect(st #endif /* + * A facility to provide lazy MMU batching. This allows PTE updates and + * page invalidations to be delayed until a call to leave lazy MMU mode + * is issued. Some architectures may benefit from doing this, and it is + * beneficial for both shadow and direct mode hypervisors, which may batch + * the PTE updates which happen during this window. Note that using this + * interface requires that read hazards be removed from the code. A read + * hazard could result in the direct mode hypervisor case, since the actual + * write to the page tables may not yet have taken place, so reads though + * a raw PTE pointer after it has been modified are not guaranteed to be + * up to date. This mode can only be entered and left under the protection of + * the page table locks for all page tables which may be modified. In the UP + * case, this is required so that preemption is disabled, and in the SMP case, + * it must synchronize the delayed page table writes properly on other CPUs. + */ +#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#define arch_enter_lazy_mmu_mode() do {} while (0) +#define arch_leave_lazy_mmu_mode() do {} while (0) +#endif + +/* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. 
diff -r 553154516a1b -r 398f8fd6b334 mm/memory.c --- a/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/memory.c Tue Aug 01 01:32:01 2006 -0700 @@ -505,6 +505,7 @@ again: src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); do { /* @@ -526,6 +527,7 @@ again: progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); @@ -627,6 +629,7 @@ static unsigned long zap_pte_range(struc int anon_rss = 0; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) { @@ -693,6 +696,7 @@ static unsigned long zap_pte_range(struc } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return addr; @@ -1108,6 +1112,7 @@ static int zeromap_pte_range(struct mm_s pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { struct page *page = ZERO_PAGE(addr); pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); @@ -1117,6 +1122,7 @@ static int zeromap_pte_range(struct mm_s BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1269,11 +1275,13 @@ static int remap_pte_range(struct mm_str pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } diff -r 553154516a1b -r 398f8fd6b334 mm/mprotect.c --- a/mm/mprotect.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/mprotect.c Tue Aug 01 01:32:01 2006 -0700 @@ -33,6 +33,7 @@ static void change_pte_range(struct mm_s spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; if (pte_present(oldpte)) { @@ -62,6 +63,7 @@ static void change_pte_range(struct mm_s } } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); } diff -r 553154516a1b -r 398f8fd6b334 mm/mremap.c --- a/mm/mremap.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/mremap.c Tue Aug 01 01:32:01 2006 -0700 @@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_str new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_str set_pte_at(mm, new_addr, new_pte, pte); } + arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap_nested(new_pte - 1); diff -r 553154516a1b -r 398f8fd6b334 mm/msync.c --- a/mm/msync.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/msync.c Tue Aug 01 01:32:01 2006 -0700 @@ -30,6 +30,7 @@ static unsigned long msync_pte_range(str again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { struct page *page; @@ -51,6 +52,7 @@ again: ret += set_page_dirty(page); progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); 
pte_unmap_unlock(pte - 1, ptl); cond_resched(); if (addr != end)
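For concreteness, this is roughly what an architecture with a batching hypervisor backend might plug in; the hv_*() helpers are hypothetical stand-ins for whatever queueing/hypercall mechanism the port provides:

#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE

static inline void arch_enter_lazy_mmu_mode(void)
{
        /* Start queueing set_pte()/flush operations instead of issuing
         * them immediately; always runs under the page table lock. */
        hv_start_pte_batch();
}

static inline void arch_leave_lazy_mmu_mode(void)
{
        /* Push the whole queue to the hypervisor in one hypercall. */
        hv_flush_pte_batch();
}

Because the enter/leave pair brackets the locked loops in copy_pte_range(), zap_pte_range(), change_pte_range() and friends, a remote CPU that takes a spurious fault on one of the affected PTEs re-checks the entry under the same lock, i.e. only after the batch has been flushed.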
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 4 of 13] Allow a kernel to not be in ring 0
6 files changed, 22 insertions(+), 13 deletions(-) arch/i386/kernel/entry.S | 5 +++-- arch/i386/kernel/process.c | 2 +- arch/i386/mm/extable.c | 2 +- arch/i386/mm/fault.c | 11 ++++------- include/asm-i386/ptrace.h | 5 +++-- include/asm-i386/segment.h | 10 ++++++++++ We allow for the fact that the guest kernel may not run in ring 0. This requires some abstraction in a few places when setting %cs or checking privilege level (user vs kernel). This is Chris' [RFC PATCH 15/33] move segment checks to subarch, except rather than using #define USER_MODE_MASK which depends on a config option, we use Zach's more flexible approach of assuming ring 3 == userspace. I also used "get_kernel_rpl()" over "get_kernel_cs()" because I think it reads better in the code... 1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3 2) Add a get_kernel_rpl() macro, and don't assume it's zero. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/kernel/entry.S --- a/arch/i386/kernel/entry.S Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/entry.S Tue Aug 01 01:32:00 2006 -0700 @@ -229,8 +229,9 @@ check_userspace: check_userspace: movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 3), %eax - jz resume_kernel + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + cmpl $SEGMENT_RPL_MASK, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) cli # make sure we don't miss an interrupt # setting need_resched or sigpending diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/kernel/process.c --- a/arch/i386/kernel/process.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/process.c Tue Aug 01 01:32:00 2006 -0700 @@ -346,7 +346,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xes = __USER_DS; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/mm/extable.c --- a/arch/i386/mm/extable.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/mm/extable.c Tue Aug 01 01:32:00 2006 -0700 @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs const struct exception_table_entry *fixup; #ifdef CONFIG_PNPBIOS - if (unlikely((regs->xcs & ~15) == (GDT_ENTRY_PNPBIOS_BASE << 3))) + if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) { extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; extern u32 pnp_bios_is_utter_crap; diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/mm/fault.c --- a/arch/i386/mm/fault.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/mm/fault.c Tue Aug 01 01:32:00 2006 -0700 @@ -27,6 +27,7 @@ #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/kdebug.h> +#include <asm/segment.h> extern void die(const char *,struct pt_regs *,long); @@ -119,10 +120,10 @@ static inline unsigned long get_segment_ } /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; + *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; /* By far the most common cases. 
*/ - if (likely(seg == __USER_CS || seg == __KERNEL_CS)) + if (likely(SEGMENT_IS_FLAT_CODE(seg))) return eip; /* Check the segment exists, is within the current LDT/GDT size, @@ -436,11 +437,7 @@ good_area: write = 0; switch (error_code & 3) { default: /* 3: write, present */ -#ifdef TEST_VERIFY_AREA - if (regs->cs == KERNEL_CS) - printk("WP fault at %08lx\n", regs->eip); -#endif - /* fall through */ + /* fall through */ case 2: /* write, not present */ if (!(vma->vm_flags & VM_WRITE)) goto bad_area; diff -r 85e7eadfaea1 -r cf6767d9babb include/asm-i386/ptrace.h --- a/include/asm-i386/ptrace.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/ptrace.h Tue Aug 01 01:32:00 2006 -0700 @@ -60,6 +60,7 @@ struct pt_regs { #ifdef __KERNEL__ #include <asm/vm86.h> +#include <asm/segment.h> struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); @@ -73,11 +74,11 @@ extern void send_sigtrap(struct task_str */ static inline int user_mode(struct pt_regs *regs) { - return (regs->xcs & 3) != 0; + return (regs->xcs & SEGMENT_RPL_MASK) == 3; } static inline int user_mode_vm(struct pt_regs *regs) { - return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0; + return (((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= 3); } #define instruction_pointer(regs) ((regs)->eip) #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) diff -r 85e7eadfaea1 -r cf6767d9babb include/asm-i386/segment.h --- a/include/asm-i386/segment.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/segment.h Tue Aug 01 01:32:00 2006 -0700 @@ -83,6 +83,12 @@ #define GDT_SIZE (GDT_ENTRIES * 8) +/* + * Some tricky tests to match code segments after a fault + */ +#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8) +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) + /* Simple and small GDT entries for booting only */ #define GDT_ENTRY_BOOT_CS 2 @@ -112,4 +118,8 @@ */ #define IDT_ENTRIES 256 +/* Bottom three bits of xcs give the ring privilege level */ +#define SEGMENT_RPL_MASK 0x3 + +#define get_kernel_rpl() 0 #endif
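The essential idea can be seen in a small sketch (an illustration of the check, not code from the patch):

#include <asm/ptrace.h>
#include <asm/segment.h>

static inline int example_came_from_userspace(struct pt_regs *regs)
{
        /* Ring 3 is always userspace; rings 0-2 belong to the kernel or
         * hypervisor.  The old (xcs & 3) != 0 test would misclassify a
         * guest kernel running in ring 1 as userspace. */
        return (regs->xcs & SEGMENT_RPL_MASK) == 3;
}

The same reasoning is what turns the entry.S test into "mask with VM_MASK | SEGMENT_RPL_MASK, then compare against SEGMENT_RPL_MASK" instead of a simple test-for-zero.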
Andi Kleen
2007-Apr-18 13:02 UTC
[PATCH 0 of 13] Basic infrastructure patches for a paravirtualized kernel
On Tuesday 01 August 2006 22:00, Jeremy Fitzhardinge wrote:
> [ REPOST: Apologies to anyone who has seen this before. It
> didn't make it onto any of the lists it should have. -J ]

I tried to apply these patches (except the ones I didn't like: 8, 10, 12)
to my tree, but couldn't because they are all MIME damaged:

+ pte =3D (mm =3D=3D &init_mm) ? etc.

Can you please repost a version without that (and ideally fix 8, 10, 12)?

-Andi
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 9 of 13] Remove the read hazard from the COW path in copy_one_pte
1 file changed, 1 insertion(+), 1 deletion(-) mm/memory.c | 2 +- We don't want to read PTEs directly like this after they have been modified, as a lazy MMU implementation of direct page tables may not have written the updated PTE back to memory yet. Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 0adfc39039c7 -r 20f9c0c451af mm/memory.c --- a/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 @@ -466,7 +466,7 @@ copy_one_pte(struct mm_struct *dst_mm, s */ if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); - pte = *src_pte; + pte = pte_wrprotect(pte); } /*
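In miniature, the hazard being removed looks like this (a simplified sketch of the wrprotect step, not the actual copy_one_pte() code):

#include <linux/mm.h>

static void example_cow_wrprotect(struct mm_struct *src_mm, unsigned long addr,
                                  pte_t *src_pte, pte_t *child_pte)
{
        pte_t pte = *src_pte;                   /* read once, up front */

        /* Under lazy/batched PTE updates this may still be queued, so
         * the memory behind src_pte is not yet guaranteed to change. */
        ptep_set_wrprotect(src_mm, addr, src_pte);

        /* BAD:  pte = *src_pte;   - may still observe the writable value */
        /* GOOD: derive the protected value from the local copy instead.  */
        *child_pte = pte_wrprotect(pte);
}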
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 2 of 13] Remove locally-defined ldt structure in favour of standard type
1 file changed, 4 insertions(+), 8 deletions(-) arch/i386/kernel/reboot.c | 12 ++++-------- arch/i386/kernel/reboot.c defines its own struct to describe an ldt entry: it should use struct Xgt_desc_struct (currently load_ldt is a macro, so doesn't complain: paravirt patches make it warn). Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 79a98a10911f -r ffccb62e9244 arch/i386/kernel/reboot.c --- a/arch/i386/kernel/reboot.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/reboot.c Tue Aug 01 01:32:00 2006 -0700 @@ -145,14 +145,10 @@ real_mode_gdt_entries [3] 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct -{ - unsigned short size __attribute__ ((packed)); - unsigned long long * base __attribute__ ((packed)); -} -real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, NULL }, -no_idt = { 0, NULL }; +static struct Xgt_desc_struct +real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, +real_mode_idt = { 0x3ff, 0 }, +no_idt = { 0, 0 }; /* This is 16-bit protected mode code to disable paging and the cache,
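For reference, the standard descriptor-table type being switched to looked roughly like this in kernels of that era (paraphrased from include/asm-i386/desc.h, so treat the exact attributes as approximate):

struct Xgt_desc_struct {
        unsigned short size;
        unsigned long address __attribute__((packed));
        unsigned short pad;
} __attribute__ ((packed));

Since the base is stored as an unsigned long rather than a pointer, the patch casts real_mode_gdt_entries to (long) when initialising real_mode_gdt.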
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 6 of 13] Roll all the cpuid asm into one __cpuid call
1 file changed, 34 insertions(+), 40 deletions(-) include/asm-i386/processor.h | 74 +++++++++++++++++++----------------------- It's a little neater, and also means only one place to patch for paravirtualization. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 09b35e6bc0ca -r 730b4fe6bc1e include/asm-i386/processor.h --- a/include/asm-i386/processor.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/processor.h Tue Aug 01 01:32:00 2006 -0700 @@ -143,31 +143,37 @@ static inline void detect_ht(struct cpui #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) -{ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ __asm__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op), "c"(0)); + : "0" (*eax), "2" (*ecx)); +} + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); } /* Some CPUID calls want 'count' to be placed in ecx */ static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, - int *edx) -{ - __asm__("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (op), "c" (count)); + int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); } /* @@ -175,42 +181,30 @@ static inline void cpuid_count(int op, i */ static inline unsigned int cpuid_eax(unsigned int op) { - unsigned int eax; - - __asm__("cpuid" - : "=a" (eax) - : "0" (op) - : "bx", "cx", "dx"); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return eax; } static inline unsigned int cpuid_ebx(unsigned int op) { - unsigned int eax, ebx; - - __asm__("cpuid" - : "=a" (eax), "=b" (ebx) - : "0" (op) - : "cx", "dx" ); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return ebx; } static inline unsigned int cpuid_ecx(unsigned int op) { - unsigned int eax, ecx; - - __asm__("cpuid" - : "=a" (eax), "=c" (ecx) - : "0" (op) - : "bx", "dx" ); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return ecx; } static inline unsigned int cpuid_edx(unsigned int op) { - unsigned int eax, edx; - - __asm__("cpuid" - : "=a" (eax), "=d" (edx) - : "0" (op) - : "bx", "cx"); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return edx; }
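A short usage sketch of the resulting helpers; everything below funnels through the single __cpuid() asm block, which is the one site a paravirt backend would need to intercept:

#include <linux/kernel.h>
#include <asm/processor.h>

static void example_report_cpu(void)
{
        unsigned int eax, ebx, ecx, edx;

        cpuid(0, &eax, &ebx, &ecx, &edx);       /* leaf 0: max leaf + vendor id */
        printk("max basic cpuid leaf: %u\n", eax);

        if (cpuid_edx(1) & (1 << 25))           /* leaf 1, EDX bit 25 = SSE */
                printk("SSE is supported\n");
}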
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 8 of 13] Add a bootparameter to reserve high linear address space for hypervisors
1 file changed, 13 insertions(+) arch/i386/kernel/setup.c | 13 +++++++++++++ This is necessary to allow dynamically loaded hypervisor modules, which might not happen until userspace is already running, and also provides a useful tool to benchmark the performance impact of reduced lowmem address space. Signed-off-by: Zachary Amsden <zach@vmware.com> ================================================================== diff -r b6c100bb5ca5 -r 0adfc39039c7 arch/i386/kernel/setup.c --- a/arch/i386/kernel/setup.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/setup.c Tue Aug 01 01:32:00 2006 -0700 @@ -917,6 +917,19 @@ static void __init parse_cmdline_early ( else if (!memcmp(from, "vmalloc=", 8)) __VMALLOC_RESERVE = memparse(from+8, &from); + /* + * reservedtop=size reserves a hole at the top of the kernel + * address space which a hypervisor can load into later. + * Needed for dynamically loaded hypervisors, so relocating + * the fixmap can be done before paging initialization. + * This hole must be a multiple of 4M. + */ + else if (!memcmp(from, "reservedtop=", 12)) { + unsigned long reserved = memparse(from+12, &from); + reserved &= ~0x3fffff; + set_fixaddr_top(-reserved); + } + next_char: c = *(from++); if (!c)
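As a worked example (numbers for illustration only): booting with reservedtop=64M makes memparse() return 0x04000000; the mask keeps that a multiple of 4MB, and set_fixaddr_top(-reserved) then lowers __FIXADDR_TOP from 0xfffff000 to 0xfbfff000, so the top 64MB of the linear address space is never touched by the kernel:

#include <asm/fixmap.h>

static void __init example_reservedtop_64m(void)
{
        unsigned long reserved = 64UL << 20;    /* memparse("64M", ...) */

        reserved &= ~0x3fffff;                  /* 4MB multiple (a no-op here) */
        set_fixaddr_top(-reserved);             /* -0x04000000 == 0xfc000000,  */
                                                /* __FIXADDR_TOP = 0xfbfff000  */
}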
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 0 of 13] Basic infrastructure patches for a paravirtualized kernel
[ REPOST: Apologies to anyone who has seen this before.  It
  didn't make it onto any of the lists it should have. -J ]

Hi Andrew,

This series of patches lays the basic ground work for the paravirtualized
kernel patches coming later on.  I think this lot is ready for the
rough-and-tumble world of the -mm tree.

For the most part, these patches do nothing or very little.  The patches
should be self explanatory, but the overview is:

Helper functions for later use:
  1/13: Add apply_to_page_range()...
  3/13: Implement always-locked bit ops...
 13/13: Put .note.* sections into a PT_NOTE segment in vmlinux

Cleanups:
  2/13: Remove locally-defined ldt structure in favour of standard type
  4/13: Allow a kernel to not be in ring 0
  6/13: Roll all the cpuid asm into one __cpuid call
  9/13: Remove the read hazard from the COW path in copy_one_pte
 10/13: Change pte_clear_full to a more appropriately named...

Hooks:
  5/13: Replace sensitive instructions with macros
  7/13: Make __FIXADDR_TOP variable to allow it to make space...
  8/13: Add a bootparameter to reserve high linear address...
 11/13: Add lazy MMU mode hooks for batching PTE updates
 12/13: Pass the mm struct into the pgd_free code so the mm...

Probably the most subtle changes here are 11/13 and 9/13, since they add a
new constraint to page-table manipulation code.  In a paravirtualized
system, pte updates may be batched and performed lazily, so their effects
will not be immediately visible on the pte itself.  To avoid this, code
which modifies ptes in a loop needs to avoid looking at the modified ptes.
9/13 fixes the one place where it happens.

11/13 depends on removing these read hazards for correctness when running
under a direct page table hypervisor which batches updates.  However, it
is generally agreed that using an _explicit_, rather than an _implicit_
notion of batching makes it easy to find and reason about the paths which
are doing batching.  This allows easy inspection to remove read hazards
from the code.

13/13 "Put .note.* sections into a PT_NOTE segment in vmlinux" is mostly
here to shake out problems early.  It slightly changes the way the vmlinux
image is linked together, and it uses the somewhat esoteric PHDRS command
in vmlinux.lds.  I want to make sure that this doesn't provoke any
problems in the various binutils people are using.

Thanks,
	J

-------------- next part --------------
 30 files changed, 609 insertions(+), 92 deletions(-)
 arch/i386/Kconfig                 |    1
 arch/i386/kernel/entry.S          |   43 +++++-----
 arch/i386/kernel/process.c        |    2
 arch/i386/kernel/reboot.c         |   12 --
 arch/i386/kernel/setup.c          |   13 +++
 arch/i386/kernel/vmlinux.lds.S    |   12 ++
 arch/i386/mm/extable.c            |    2
 arch/i386/mm/fault.c              |   11 --
 arch/i386/mm/init.c               |   42 +++++++++
 arch/i386/mm/pgtable.c            |   21 ++++
 include/asm-generic/pgtable.h     |   24 +++++
 include/asm-generic/vmlinux.lds.h |    3
 include/asm-i386/fixmap.h         |    7 +
 include/asm-i386/page.h           |    2
 include/asm-i386/pgalloc.h        |    4
 include/asm-i386/pgtable.h        |    1
 include/asm-i386/processor.h      |   74 ++++++++---------
 include/asm-i386/ptrace.h         |    5 -
 include/asm-i386/segment.h        |   10 ++
 include/asm-i386/spinlock.h       |    7 +
 include/asm-i386/sync_bitops.h    |  156 +++++++++++++++++++++++++++++++++++++
 include/asm-i386/system.h         |   36 ++++++++
 include/linux/elfnote.h           |   88 ++++++++++++++++++++
 include/linux/mm.h                |    9 ++
 kernel/fork.c                     |    2
 mm/fremap.c                       |    2
 mm/memory.c                       |  106 ++++++++++++++++++++++++-
 mm/mprotect.c                     |    2
 mm/mremap.c                       |    2
 mm/msync.c                        |    2
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 5 of 13] Replace sensitive instructions with macros
2 files changed, 27 insertions(+), 18 deletions(-) arch/i386/kernel/entry.S | 38 ++++++++++++++++++++++---------------- include/asm-i386/spinlock.h | 7 +++++-- Abstract sensitive instructions in assembler code, replacing them with macros (which currently are #defined to the native versions). We use long names: assembler is case-insensitive, so if something goes wrong and macros do not expand, it would assemble anyway. Resulting object files are exactly the same as before. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r cf6767d9babb -r 09b35e6bc0ca arch/i386/kernel/entry.S --- a/arch/i386/kernel/entry.S Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/entry.S Tue Aug 01 01:32:00 2006 -0700 @@ -76,8 +76,15 @@ NT_MASK = 0x00004000 NT_MASK = 0x00004000 VM_MASK = 0x00020000 +/* These are replaces for paravirtualization */ +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax + #ifdef CONFIG_PREEMPT -#define preempt_stop cli; TRACE_IRQS_OFF +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck @@ -233,7 +240,7 @@ check_userspace: cmpl $SEGMENT_RPL_MASK, %eax jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -244,7 +251,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cli + DISABLE_INTERRUPTS cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: @@ -272,7 +279,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -317,7 +324,7 @@ 1: movl (%ebp),%ebp jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -327,8 +334,7 @@ 1: movl (%ebp),%ebp movl OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON - sti - sysexit + ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC @@ -353,7 +359,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) # store the return value syscall_exit: - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -378,11 +384,11 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 -1: iret +1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -406,7 +412,7 @@ ldt_ss: * dosemu and wine happy. 
*/ subl $8, %esp # reserve space for switch16 pointer CFI_ADJUST_CFA_OFFSET 8 - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, @@ -416,7 +422,7 @@ ldt_ss: TRACE_IRQS_IRET RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack -1: iret +1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 .long 1b,iret_exc @@ -431,7 +437,7 @@ work_pending: jz work_notifysig work_resched: call schedule - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -487,7 +493,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - sti # could let do_syscall_trace() call + ENABLE_INTERRUPTS # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -666,7 +672,7 @@ ENTRY(device_not_available) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - movl %cr0, %eax + GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate preempt_stop @@ -796,7 +802,7 @@ nmi_16bit_stack: call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to 16bit stack -1: iret +1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" .align 4 diff -r cf6767d9babb -r 09b35e6bc0ca include/asm-i386/spinlock.h --- a/include/asm-i386/spinlock.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/spinlock.h Tue Aug 01 01:32:00 2006 -0700 @@ -16,6 +16,9 @@ * * (the type definitions are in asm/spinlock_types.h) */ + +#define CLI_STRING "cli" +#define STI_STRING "sti" #define __raw_spin_is_locked(x) \ (*(volatile signed char *)(&(x)->slock) <= 0) @@ -43,12 +46,12 @@ "2:\t" \ "testl $0x200, %1\n\t" \ "jz 4f\n\t" \ - "sti\n" \ + STI_STRING "\n" \ "3:\t" \ "rep;nop\n\t" \ "cmpb $0, %0\n\t" \ "jle 3b\n\t" \ - "cli\n\t" \ + CLI_STRING "\n\t" \ "jmp 1b\n" \ "4:\t" \ "rep;nop\n\t" \
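To make the intent concrete, a paravirtualized build could later substitute its own definitions without touching the body of entry.S again; the config symbol and helper names below are illustrative only:

#ifdef CONFIG_EXAMPLE_PARAVIRT
#define DISABLE_INTERRUPTS              call example_irq_disable
#define ENABLE_INTERRUPTS               call example_irq_enable
#define ENABLE_INTERRUPTS_SYSEXIT       jmp  example_sysexit
#define INTERRUPT_RETURN                jmp  example_iret
#define GET_CR0_INTO_EAX                call example_read_cr0_eax
#else
#define DISABLE_INTERRUPTS              cli
#define ENABLE_INTERRUPTS               sti
#define ENABLE_INTERRUPTS_SYSEXIT       sti; sysexit
#define INTERRUPT_RETURN                iret
#define GET_CR0_INTO_EAX                movl %cr0, %eax
#endif

The CLI_STRING/STI_STRING definitions in spinlock.h serve the same purpose for the inline-asm spin-wait loop.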
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 1 of 13] Add apply_to_page_range() which applies a function to a pte range
2 files changed, 99 insertions(+) include/linux/mm.h | 5 ++ mm/memory.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Add a new mm function apply_to_page_range() which applies a given function to every pte in a given virtual address range in a given mm structure. This is a generic alternative to cut-and-pasting the Linux idiomatic pagetable walking code in every place that a sequence of PTEs must be accessed. Although this interface is intended to be useful in a wide range of situations, it is currently used specifically by several Xen subsystems, for example: to ensure that pagetables have been allocated for a virtual address range, and to construct batched special pagetable update requests to map I/O memory (in ioremap()). Signed-off-by: Ian Pratt <ian.pratt@xensource.com> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 521d1bcdaa86 -r 79a98a10911f include/linux/mm.h --- a/include/linux/mm.h Sun Jul 30 07:00:26 2006 +0000 +++ b/include/linux/mm.h Tue Aug 01 01:32:00 2006 -0700 @@ -1026,6 +1026,11 @@ struct page *follow_page(struct vm_area_ #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ +typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr, + void *data); +extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size, pte_fn_t fn, void *data); + #ifdef CONFIG_PROC_FS void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); #else diff -r 521d1bcdaa86 -r 79a98a10911f mm/memory.c --- a/mm/memory.c Sun Jul 30 07:00:26 2006 +0000 +++ b/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 @@ -1369,6 +1369,100 @@ int remap_pfn_range(struct vm_area_struc } EXPORT_SYMBOL(remap_pfn_range); +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + struct page *pmd_page; + spinlock_t *ptl; + + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + pmd_page = pmd_page(*pmd); + + do { + err = fn(pte, pmd_page, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. 
+ */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + /* * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically. Before making any commitment, on
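A small usage sketch: count the PTEs that are already present in a range. The callback and wrapper are invented for illustration, but the real Xen users (pagetable pre-allocation, batched ioremap mapping requests) have exactly this shape. Note that the walk allocates intermediate page tables for the range as it goes:

#include <linux/mm.h>

static int example_count_pte(pte_t *pte, struct page *pmd_page,
                             unsigned long addr, void *data)
{
        unsigned long *count = data;

        if (!pte_none(*pte))
                (*count)++;
        return 0;                       /* non-zero aborts the walk */
}

static unsigned long example_count_present(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long len)
{
        unsigned long count = 0;

        apply_to_page_range(mm, start, len, example_count_pte, &count);
        return count;
}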
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 10 of 13] Change pte_clear_full to a more appropriately named pte_clear_not_present,
3 files changed, 4 insertions(+), 4 deletions(-) include/asm-generic/pgtable.h | 4 ++-- mm/fremap.c | 2 +- mm/memory.c | 2 +- allowing optimizations when not-present mapping changes need not be reflected in the hardware TLB for protected page table modes. There is also another case that can use it in the fremap code. Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 20f9c0c451af -r 553154516a1b include/asm-generic/pgtable.h --- a/include/asm-generic/pgtable.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-generic/pgtable.h Tue Aug 01 01:32:00 2006 -0700 @@ -110,8 +110,8 @@ do { \ }) #endif -#ifndef __HAVE_ARCH_PTE_CLEAR_FULL -#define pte_clear_full(__mm, __address, __ptep, __full) \ +#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL +#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ do { \ pte_clear((__mm), (__address), (__ptep)); \ } while (0) diff -r 20f9c0c451af -r 553154516a1b mm/fremap.c --- a/mm/fremap.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/fremap.c Tue Aug 01 01:32:00 2006 -0700 @@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(mm, addr, ptep); + pte_clear_not_present_full(mm, addr, ptep, 0); } return !!page; } diff -r 20f9c0c451af -r 553154516a1b mm/memory.c --- a/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 @@ -689,7 +689,7 @@ static unsigned long zap_pte_range(struc continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(mm, addr, pte, tlb->fullmm); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss);
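An architecture (or hypervisor backend) that wants the optimization would supply its own definition along these lines; native_pte_clear_fast() is a hypothetical helper standing in for whatever cheap store the port has available:

/* The PTE is known to be not-present, so no hardware TLB entry can
 * reference it: the backend can use its cheapest store and skip any
 * flush or shadow-pagetable sync, whatever the "full teardown" flag says. */
#define __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
#define pte_clear_not_present_full(__mm, __address, __ptep, __full)    \
        native_pte_clear_fast((__mm), (__address), (__ptep))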
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 12 of 13] Pass the mm struct into the pgd_free code so the mm is available here
5 files changed, 10 insertions(+), 4 deletions(-) arch/i386/mm/pgtable.c | 3 ++- include/asm-i386/pgalloc.h | 4 ++-- include/asm-i386/pgtable.h | 1 + include/linux/mm.h | 4 ++++ kernel/fork.c | 2 +- Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 398f8fd6b334 -r 8235caea9d68 arch/i386/mm/pgtable.c --- a/arch/i386/mm/pgtable.c Tue Aug 01 01:32:01 2006 -0700 +++ b/arch/i386/mm/pgtable.c Tue Aug 01 01:32:01 2006 -0700 @@ -275,9 +275,10 @@ out_oom: return NULL; } -void pgd_free(pgd_t *pgd) +void pgd_free(struct mm_struct *mm) { int i; + pgd_t *pgd = mm->pgd; /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) diff -r 398f8fd6b334 -r 8235caea9d68 include/asm-i386/pgalloc.h --- a/include/asm-i386/pgalloc.h Tue Aug 01 01:32:01 2006 -0700 +++ b/include/asm-i386/pgalloc.h Tue Aug 01 01:32:01 2006 -0700 @@ -3,7 +3,6 @@ #include <asm/fixmap.h> #include <linux/threads.h> -#include <linux/mm.h> /* for struct page */ #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) @@ -16,7 +15,8 @@ * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); +extern void pgd_free(struct mm_struct *); +#define pgd_free_mm(mm) pgd_free(mm) extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); diff -r 398f8fd6b334 -r 8235caea9d68 include/asm-i386/pgtable.h --- a/include/asm-i386/pgtable.h Tue Aug 01 01:32:01 2006 -0700 +++ b/include/asm-i386/pgtable.h Tue Aug 01 01:32:01 2006 -0700 @@ -393,6 +393,7 @@ extern pte_t *lookup_address(unsigned lo extern void noexec_setup(const char *str); +#include <asm/pgalloc.h> #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) diff -r 398f8fd6b334 -r 8235caea9d68 include/linux/mm.h --- a/include/linux/mm.h Tue Aug 01 01:32:01 2006 -0700 +++ b/include/linux/mm.h Tue Aug 01 01:32:01 2006 -0700 @@ -1077,5 +1077,9 @@ extern int randomize_va_space; const char *arch_vma_name(struct vm_area_struct *vma); +#ifndef pgd_free_mm +#define pgd_free_mm(mm) pgd_free((mm)->pgd) +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -r 398f8fd6b334 -r 8235caea9d68 kernel/fork.c --- a/kernel/fork.c Tue Aug 01 01:32:01 2006 -0700 +++ b/kernel/fork.c Tue Aug 01 01:32:01 2006 -0700 @@ -299,7 +299,7 @@ static inline int mm_alloc_pgd(struct mm static inline void mm_free_pgd(struct mm_struct * mm) { - pgd_free(mm->pgd); + pgd_free_mm(mm); } #else #define dup_mmap(mm, oldmm) (0)
Ian Campbell
2007-Apr-18 17:49 UTC
[PATCH 1 of 1] x86_64: Put .note.* sections into a PT_NOTE segment in vmlinux
On Tue, 2006-08-01 at 13:00 -0700, Jeremy Fitzhardinge wrote:> This patch will pack any .note.* section into a PT_NOTE segment in the > output file.[...]> This only changes i386 for now, but I presume the corresponding > changes for other architectures will be as simple.Here is the patch for x86_64. Signed-off-by: Ian Campbell <ian.campbell@xensource.com> diff -urN ref-linux-2.6.16.13/arch/x86_64/kernel/vmlinux.lds.S x86-64_elfnotes/arch/x86_64/kernel/vmlinux.lds.S --- ref-linux-2.6.16.13/arch/x86_64/kernel/vmlinux.lds.S 2006-05-02 22:38:44.000000000 +0100 +++ x86-64_elfnotes/arch/x86_64/kernel/vmlinux.lds.S 2006-08-22 11:39:14.000000000 +0100 @@ -14,6 +14,11 @@ OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __START_KERNEL; @@ -26,7 +31,7 @@ KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 /* out-of-line lock text */ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } @@ -43,7 +48,7 @@ .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS - } + } :data _edata = .; /* End of data section */ @@ -201,4 +206,6 @@ STABS_DEBUG DWARF_DEBUG + + NOTES }