Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 13 of 13] Put .note.* sections into a PT_NOTE segment in vmlinux
3 files changed, 101 insertions(+), 2 deletions(-) arch/i386/kernel/vmlinux.lds.S | 12 ++++- include/asm-generic/vmlinux.lds.h | 3 + include/linux/elfnote.h | 88 +++++++++++++++++++++++++++++++++++++ This patch will pack any .note.* section into a PT_NOTE segment in the output file. To do this, we tell ld that we need a PT_NOTE segment. This requires us to start explicitly mapping sections to segments, so we also need to explicitly create PT_LOAD segments for text and data, and map the sections to them appropriately. Fortunately, each section will default to its previous section's segment, so it doesn't take many changes to vmlinux.lds.S. This only changes i386 for now, but I presume the corresponding changes for other architectures will be as simple. This change also adds <linux/elfnote.h>, which defines C and Assembler macros for actually creating ELF notes. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Hollis Blanchard <hollisb@us.ibm.com> ================================================================== diff -r 8235caea9d68 -r 2bf2abf6e970 arch/i386/kernel/vmlinux.lds.S --- a/arch/i386/kernel/vmlinux.lds.S Tue Aug 01 01:32:01 2006 -0700 +++ b/arch/i386/kernel/vmlinux.lds.S Tue Aug 01 01:32:01 2006 -0700 @@ -13,6 +13,12 @@ OUTPUT_ARCH(i386) OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __KERNEL_START; @@ -26,7 +32,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 _etext = .; /* End of text section */ @@ -48,7 +54,7 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) CONSTRUCTORS - } + } :data . = ALIGN(4096); __nosave_begin = .; @@ -184,4 +190,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + NOTES } diff -r 8235caea9d68 -r 2bf2abf6e970 include/asm-generic/vmlinux.lds.h --- a/include/asm-generic/vmlinux.lds.h Tue Aug 01 01:32:01 2006 -0700 +++ b/include/asm-generic/vmlinux.lds.h Tue Aug 01 01:32:01 2006 -0700 @@ -194,3 +194,6 @@ .stab.index 0 : { *(.stab.index) } \ .stab.indexstr 0 : { *(.stab.indexstr) } \ .comment 0 : { *(.comment) } + +#define NOTES \ + .notes : { *(.note.*) } :note diff -r 8235caea9d68 -r 2bf2abf6e970 include/linux/elfnote.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/elfnote.h Tue Aug 01 01:32:01 2006 -0700 @@ -0,0 +1,88 @@ +#ifndef _LINUX_ELFNOTE_H +#define _LINUX_ELFNOTE_H +/* + * Helper macros to generate ELF Note structures, which are put into a + * PT_NOTE segment of the final vmlinux image. These are useful for + * including name-value pairs of metadata into the kernel binary (or + * modules?) for use by external programs. + * + * Each note has three parts: a name, a type and a desc. The name is + * intended to distinguish the note's originator, so it would be a + * company, project, subsystem, etc; it must be in a suitable form for + * use in a section name. The type is an integer which is used to tag + * the data, and is considered to be within the "name" namespace (so + * "FooCo"'s type 42 is distinct from "BarProj"'s type 42). The + * "desc" field is the actual data. There are no constraints on the + * desc field's contents, though typically they're fairly small. + * + * All notes from a given NAME are put into a section named + * .note.NAME. 
When the kernel image is finally linked, all the notes + * are packed into a single .notes section, which is mapped into the + * PT_NOTE segment. Because notes for a given name are grouped into + * the same section, they'll all be adjacent the output file. + * + * This file defines macros for both C and assembler use. Their + * syntax is slightly different, but they're semantically similar. + * + * See the ELF specification for more detail about ELF notes. + */ + +#ifdef __ASSEMBLER__ +/* + * Generate a structure with the same shape as Elf{32,64}_Nhdr (which + * turn out to be the same size and shape), followed by the name and + * desc data with appropriate padding. The 'desc' argument includes + * the assembler pseudo op defining the type of the data: .asciz + * "hello, world" + */ +.macro ELFNOTE name type desc:vararg +.pushsection ".note.\name" + .align 4 + .long 2f - 1f /* namesz */ + .long 4f - 3f /* descsz */ + .long \type +1:.asciz "\name" +2:.align 4 +3:\desc +4:.align 4 +.popsection +.endm +#else /* !__ASSEMBLER__ */ +#include <linux/elf.h> +/* + * Use an anonymous structure which matches the shape of + * Elf{32,64}_Nhdr, but includes the name and desc data. The size and + * type of name and desc depend on the macro arguments. "name" must + * be a literal string, and "desc" must be passed by value. You may + * only define one note per line, since __LINE__ is used to generate + * unique symbols. + */ +#define _ELFNOTE_PASTE(a,b) a##b +#define _ELFNOTE(size, name, unique, type, desc) \ + static const struct { \ + struct elf##size##_note _nhdr; \ + unsigned char _name[sizeof(name)] \ + __attribute__((aligned(sizeof(Elf##size##_Word)))); \ + typeof(desc) _desc \ + __attribute__((aligned(sizeof(Elf##size##_Word)))); \ + } _ELFNOTE_PASTE(_note_, unique) \ + __attribute_used__ \ + __attribute__((section(".note." name), \ + aligned(sizeof(Elf##size##_Word)), \ + unused)) = { \ + { \ + sizeof(name), \ + sizeof(desc), \ + type, \ + }, \ + name, \ + desc \ + } +#define ELFNOTE(size, name, type, desc) \ + _ELFNOTE(size, name, __LINE__, type, desc) + +#define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc) +#define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc) +#endif /* __ASSEMBLER__ */ + +#endif /* _LINUX_ELFNOTE_H */
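As an illustration of how the C macro would be used (the originator name, type number and payload below are made up for the example, not anything defined by this patch), a subsystem could emit a note like this and then confirm the result on the linked image with readelf -n vmlinux and readelf -l vmlinux:

/* Hypothetical example -- "ExampleCo" and type 1 are invented values. */
#include <linux/elfnote.h>

#define EXAMPLE_NOTE_NAME "ExampleCo"        /* lands in section .note.ExampleCo */
#define EXAMPLE_NOTE_TYPE 1                  /* type within the "ExampleCo" namespace */

/* Emits an Elf32 note whose desc is the 6-byte string "hello" (including NUL). */
ELFNOTE32(EXAMPLE_NOTE_NAME, EXAMPLE_NOTE_TYPE, "hello");

At link time all .note.* input sections are collected into the single .notes output section, which the ":note" mapping above places in the PT_NOTE program header.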
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 7 of 13] Make __FIXADDR_TOP variable to allow it to make space for a hypervisor
5 files changed, 68 insertions(+), 2 deletions(-) arch/i386/Kconfig | 1 + arch/i386/mm/init.c | 42 ++++++++++++++++++++++++++++++++++++++++++ arch/i386/mm/pgtable.c | 18 ++++++++++++++++++ include/asm-i386/fixmap.h | 7 ++++++- include/asm-i386/page.h | 2 +- Make __FIXADDR_TOP a variable, so that it can be set to not get in the way of address space a hypervisor may want to reserve. Original patch by Gerd Hoffmann <kraxel@suse.de> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Gerd Hoffmann <kraxel@suse.de> ================================================================== diff -r 730b4fe6bc1e -r b6c100bb5ca5 arch/i386/Kconfig --- a/arch/i386/Kconfig Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/Kconfig Tue Aug 01 01:32:00 2006 -0700 @@ -792,6 +792,7 @@ config COMPAT_VDSO config COMPAT_VDSO bool "Compat VDSO support" default y + depends on !PARAVIRT help Map the VDSO to the predictable old-style address too. ---help--- diff -r 730b4fe6bc1e -r b6c100bb5ca5 arch/i386/mm/init.c --- a/arch/i386/mm/init.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/mm/init.c Tue Aug 01 01:32:00 2006 -0700 @@ -629,6 +629,48 @@ void __init mem_init(void) (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); +#if 1 /* double-sanity-check paranoia */ + printk("virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +#endif + + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); +#endif /* double-sanity-check paranoia */ + #ifdef CONFIG_X86_PAE if (!cpu_has_pae) panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); diff -r 730b4fe6bc1e -r b6c100bb5ca5 arch/i386/mm/pgtable.c --- a/arch/i386/mm/pgtable.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/mm/pgtable.c Tue Aug 01 01:32:00 2006 -0700 @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/spinlock.h> +#include <linux/module.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -137,6 +138,12 @@ void set_pmd_pfn(unsigned long vaddr, un __flush_tlb_one(vaddr); } +static int fixmaps = 0; +#ifndef CONFIG_COMPAT_VDSO +unsigned long __FIXADDR_TOP = 0xfffff000; +EXPORT_SYMBOL(__FIXADDR_TOP); +#endif + void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { unsigned long address = __fix_to_virt(idx); @@ -146,6 +153,17 @@ void __set_fixmap (enum fixed_addresses return; } set_pte_pfn(address, phys >> PAGE_SHIFT, 
flags); + fixmaps++; +} + +void set_fixaddr_top(unsigned long top) +{ + BUG_ON(fixmaps > 0); +#ifdef CONFIG_COMPAT_VDSO + BUG_ON(top - PAGE_SIZE != __FIXADDR_TOP); +#else + __FIXADDR_TOP = top - PAGE_SIZE; +#endif } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) diff -r 730b4fe6bc1e -r b6c100bb5ca5 include/asm-i386/fixmap.h --- a/include/asm-i386/fixmap.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/fixmap.h Tue Aug 01 01:32:00 2006 -0700 @@ -19,7 +19,11 @@ * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. */ -#define __FIXADDR_TOP 0xfffff000 +#ifndef CONFIG_COMPAT_VDSO +extern unsigned long __FIXADDR_TOP; +#else +#define __FIXADDR_TOP 0xfffff000 +#endif #ifndef __ASSEMBLY__ #include <linux/kernel.h> @@ -93,6 +97,7 @@ enum fixed_addresses { extern void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags); +extern void set_fixaddr_top(unsigned long top); #define set_fixmap(idx, phys) \ __set_fixmap(idx, phys, PAGE_KERNEL) diff -r 730b4fe6bc1e -r b6c100bb5ca5 include/asm-i386/page.h --- a/include/asm-i386/page.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/page.h Tue Aug 01 01:32:00 2006 -0700 @@ -122,7 +122,7 @@ extern int page_is_ram(unsigned long pag #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) -#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
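As a usage sketch (hypothetical, not part of the patch): a hypervisor backend that wants to keep, say, 64MB of linear address space to itself would call set_fixaddr_top() once during early setup, before the first __set_fixmap() call -- the BUG_ON(fixmaps > 0) enforces exactly that ordering.

/* Illustration only -- the function name and the 64MB figure are invented. */
#include <linux/init.h>
#include <asm/fixmap.h>

#define EXAMPLE_HYPERVISOR_HOLE (64UL << 20)    /* reserve the top 64MB of linear space */

static void __init example_reserve_hypervisor_space(void)
{
        /*
         * Pull the fixmap (and everything that sits below it) down so the
         * top EXAMPLE_HYPERVISOR_HOLE bytes are never touched by the kernel.
         * Must run before any fixmap entry has been established.
         */
        set_fixaddr_top(-EXAMPLE_HYPERVISOR_HOLE);
}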
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 3 of 13] Implement always-locked bit ops, for memory shared with an SMP hypervisor
2 files changed, 192 insertions(+) include/asm-i386/sync_bitops.h | 156 ++++++++++++++++++++++++++++++++++++++++ include/asm-i386/system.h | 36 +++++++++ Add "always lock'd" implementations of set_bit, clear_bit and change_bit and the corresponding test_and_ functions. Also add "always lock'd" implementation of cmpxchg. These give guaranteed strong synchronisation and are required for non-SMP kernels running on an SMP hypervisor. Signed-off-by: Ian Pratt <ian.pratt@xensource.com> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Christoph Lameter <clameter@sgi.com> ================================================================== diff -r ffccb62e9244 -r 85e7eadfaea1 include/asm-i386/system.h --- a/include/asm-i386/system.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/system.h Tue Aug 01 01:32:00 2006 -0700 @@ -261,6 +261,9 @@ static inline unsigned long __xchg(unsig #define cmpxchg(ptr,o,n)\ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ (unsigned long)(n),sizeof(*(ptr)))) +#define sync_cmpxchg(ptr,o,n)\ + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\ + (unsigned long)(n),sizeof(*(ptr)))) #endif static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, @@ -282,6 +285,39 @@ static inline unsigned long __cmpxchg(vo return prev; case 4: __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +/* + * Always use locked operations when touching memory shared with a + * hypervisor, since the system may be SMP even if the guest kernel + * isn't. + */ +static inline unsigned long __sync_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" : "=a"(prev) : "r"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); diff -r ffccb62e9244 -r 85e7eadfaea1 include/asm-i386/sync_bitops.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/asm-i386/sync_bitops.h Tue Aug 01 01:32:00 2006 -0700 @@ -0,0 +1,156 @@ +#ifndef _I386_SYNC_BITOPS_H +#define _I386_SYNC_BITOPS_H + +/* + * Copyright 1992, Linus Torvalds. + */ + +/* + * These have to be done with inline assembly: that way the bit-setting + * is guaranteed to be atomic. All bit operations return 0 if the bit + * was cleared before the operation and != 0 if it was not. + * + * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). + */ + +#define ADDR (*(volatile long *) addr) + +/** + * sync_set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * + * Note: there are no guarantees that this function will not be reordered + * on non x86 architectures, so if you are writting portable code, + * make sure not to rely on its reordering guarantees. + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. 
+ */ +static inline void sync_set_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btsl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * sync_clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static inline void sync_clear_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btrl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_change_bit - Toggle a bit in memory + * @nr: Bit to change + * @addr: Address to start counting from + * + * change_bit() is atomic and may not be reordered. It may be + * reordered on other architectures than x86. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void sync_change_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btcl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It may be reordered on other architectures than x86. + * It also implies a memory barrier. + */ +static inline int sync_test_and_set_bit(int nr, volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btsl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +/** + * sync_test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It can be reorderdered on other architectures other than x86. + * It also implies a memory barrier. + */ +static inline int sync_test_and_clear_bit(int nr, volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +/** + * sync_test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int sync_test_and_change_bit(int nr, volatile unsigned long* addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btcl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +static __always_inline int sync_const_test_bit(int nr, const volatile unsigned long *addr) +{ + return ((1UL << (nr & 31)) & + (((const volatile unsigned int *)addr)[nr >> 5])) != 0; +} + +static inline int sync_var_test_bit(int nr, const volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit) + :"m" (ADDR),"Ir" (nr)); + return oldbit; +} + +#define sync_test_bit(nr,addr) \ + (__builtin_constant_p(nr) ? \ + sync_constant_test_bit((nr),(addr)) : \ + sync_var_test_bit((nr),(addr))) + +#undef ADDR + +#endif /* _I386_SYNC_BITOPS_H */
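A hypothetical caller, just to show the intent (the structure and field names are invented for this example): a uniprocessor guest updating memory that a possibly-SMP hypervisor also writes must use these always-lock'd variants, because in a !CONFIG_SMP build the ordinary bitops and cmpxchg drop the lock prefix.

/* Illustration only -- struct and field names are made up. */
#include <asm/sync_bitops.h>
#include <asm/system.h>

struct example_shared_page {
        unsigned long pending;          /* bits set by the hypervisor */
        unsigned long state;            /* word updated with compare-and-swap */
};

static int example_consume_event(struct example_shared_page *sp, int port)
{
        /* Atomic even against hypervisor writes from another physical CPU. */
        return sync_test_and_clear_bit(port, &sp->pending);
}

static unsigned long example_update_state(struct example_shared_page *sp,
                                          unsigned long old, unsigned long new)
{
        /* Always emits the lock prefix, unlike plain cmpxchg in a UP build. */
        return sync_cmpxchg(&sp->state, old, new);
}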
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 11 of 13] Implement lazy MMU update hooks which are SMP safe for both direct and shadow page tables
5 files changed, 34 insertions(+) include/asm-generic/pgtable.h | 20 ++++++++++++++++++++ mm/memory.c | 8 ++++++++ mm/mprotect.c | 2 ++ mm/mremap.c | 2 ++ mm/msync.c | 2 ++ shadow page tables. The idea is that PTE updates and page invalidations while in lazy mode can be batched into a single hypercall. We use this in VMI for shadow page table synchronization, and it is a win. It also can be used by PPC and for direct page tables on Xen. For SMP, the enter / leave must happen under protection of the page table locks for page tables which are being modified. This is because otherwise, you end up with stale state in the batched hypercall, which other CPUs can race ahead of. Doing this under the protection of the locks guarantees the synchronization is correct, and also means that spurious faults which are generated during this window by remote CPUs are properly handled, as the page fault handler must re-check the PTE under protection of the same lock. Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 553154516a1b -r 398f8fd6b334 include/asm-generic/pgtable.h --- a/include/asm-generic/pgtable.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-generic/pgtable.h Tue Aug 01 01:32:01 2006 -0700 @@ -164,6 +164,26 @@ static inline void ptep_set_wrprotect(st #endif /* + * A facility to provide lazy MMU batching. This allows PTE updates and + * page invalidations to be delayed until a call to leave lazy MMU mode + * is issued. Some architectures may benefit from doing this, and it is + * beneficial for both shadow and direct mode hypervisors, which may batch + * the PTE updates which happen during this window. Note that using this + * interface requires that read hazards be removed from the code. A read + * hazard could result in the direct mode hypervisor case, since the actual + * write to the page tables may not yet have taken place, so reads though + * a raw PTE pointer after it has been modified are not guaranteed to be + * up to date. This mode can only be entered and left under the protection of + * the page table locks for all page tables which may be modified. In the UP + * case, this is required so that preemption is disabled, and in the SMP case, + * it must synchronize the delayed page table writes properly on other CPUs. + */ +#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#define arch_enter_lazy_mmu_mode() do {} while (0) +#define arch_leave_lazy_mmu_mode() do {} while (0) +#endif + +/* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. 
diff -r 553154516a1b -r 398f8fd6b334 mm/memory.c --- a/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/memory.c Tue Aug 01 01:32:01 2006 -0700 @@ -505,6 +505,7 @@ again: src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); do { /* @@ -526,6 +527,7 @@ again: progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); @@ -627,6 +629,7 @@ static unsigned long zap_pte_range(struc int anon_rss = 0; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) { @@ -693,6 +696,7 @@ static unsigned long zap_pte_range(struc } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return addr; @@ -1108,6 +1112,7 @@ static int zeromap_pte_range(struct mm_s pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { struct page *page = ZERO_PAGE(addr); pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); @@ -1117,6 +1122,7 @@ static int zeromap_pte_range(struct mm_s BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1269,11 +1275,13 @@ static int remap_pte_range(struct mm_str pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } diff -r 553154516a1b -r 398f8fd6b334 mm/mprotect.c --- a/mm/mprotect.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/mprotect.c Tue Aug 01 01:32:01 2006 -0700 @@ -33,6 +33,7 @@ static void change_pte_range(struct mm_s spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; if (pte_present(oldpte)) { @@ -62,6 +63,7 @@ static void change_pte_range(struct mm_s } } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); } diff -r 553154516a1b -r 398f8fd6b334 mm/mremap.c --- a/mm/mremap.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/mremap.c Tue Aug 01 01:32:01 2006 -0700 @@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_str new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_str set_pte_at(mm, new_addr, new_pte, pte); } + arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap_nested(new_pte - 1); diff -r 553154516a1b -r 398f8fd6b334 mm/msync.c --- a/mm/msync.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/msync.c Tue Aug 01 01:32:01 2006 -0700 @@ -30,6 +30,7 @@ static unsigned long msync_pte_range(str again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { struct page *page; @@ -51,6 +52,7 @@ again: ret += set_page_dirty(page); progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); 
pte_unmap_unlock(pte - 1, ptl); cond_resched(); if (addr != end)
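For reference, a sketch of what an architecture overriding these hooks might provide; the batching functions are invented names standing in for whatever mechanism the port uses (a VMI call queue, a Xen multicall buffer, ...), not interfaces defined by this patch.

/* Hypothetical asm/pgtable.h fragment -- illustration of the hook contract. */
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE

extern void example_mmu_begin_batch(void);      /* invented */
extern void example_mmu_flush_batch(void);      /* invented */

static inline void arch_enter_lazy_mmu_mode(void)
{
        /*
         * Start queueing PTE updates instead of applying them immediately.
         * The caller holds the page table lock, so no other CPU can race
         * with the queued-but-unapplied state for this page table.
         */
        example_mmu_begin_batch();
}

static inline void arch_leave_lazy_mmu_mode(void)
{
        /*
         * Apply every queued update (ideally in a single hypercall) and
         * return to direct mode before the page table lock is released.
         */
        example_mmu_flush_batch();
}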
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 4 of 13] Allow a kernel to not be in ring 0
6 files changed, 22 insertions(+), 13 deletions(-) arch/i386/kernel/entry.S | 5 +++-- arch/i386/kernel/process.c | 2 +- arch/i386/mm/extable.c | 2 +- arch/i386/mm/fault.c | 11 ++++------- include/asm-i386/ptrace.h | 5 +++-- include/asm-i386/segment.h | 10 ++++++++++ We allow for the fact that the guest kernel may not run in ring 0. This requires some abstraction in a few places when setting %cs or checking privilege level (user vs kernel). This is Chris' [RFC PATCH 15/33] move segment checks to subarch, except rather than using #define USER_MODE_MASK which depends on a config option, we use Zach's more flexible approach of assuming ring 3 == userspace. I also used "get_kernel_rpl()" over "get_kernel_cs()" because I think it reads better in the code... 1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3 2) Add a get_kernel_rpl() macro, and don't assume it's zero. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/kernel/entry.S --- a/arch/i386/kernel/entry.S Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/entry.S Tue Aug 01 01:32:00 2006 -0700 @@ -229,8 +229,9 @@ check_userspace: check_userspace: movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 3), %eax - jz resume_kernel + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + cmpl $SEGMENT_RPL_MASK, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) cli # make sure we don't miss an interrupt # setting need_resched or sigpending diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/kernel/process.c --- a/arch/i386/kernel/process.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/process.c Tue Aug 01 01:32:00 2006 -0700 @@ -346,7 +346,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xes = __USER_DS; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/mm/extable.c --- a/arch/i386/mm/extable.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/mm/extable.c Tue Aug 01 01:32:00 2006 -0700 @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs const struct exception_table_entry *fixup; #ifdef CONFIG_PNPBIOS - if (unlikely((regs->xcs & ~15) == (GDT_ENTRY_PNPBIOS_BASE << 3))) + if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) { extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; extern u32 pnp_bios_is_utter_crap; diff -r 85e7eadfaea1 -r cf6767d9babb arch/i386/mm/fault.c --- a/arch/i386/mm/fault.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/mm/fault.c Tue Aug 01 01:32:00 2006 -0700 @@ -27,6 +27,7 @@ #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/kdebug.h> +#include <asm/segment.h> extern void die(const char *,struct pt_regs *,long); @@ -119,10 +120,10 @@ static inline unsigned long get_segment_ } /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; + *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; /* By far the most common cases. 
*/ - if (likely(seg == __USER_CS || seg == __KERNEL_CS)) + if (likely(SEGMENT_IS_FLAT_CODE(seg))) return eip; /* Check the segment exists, is within the current LDT/GDT size, @@ -436,11 +437,7 @@ good_area: write = 0; switch (error_code & 3) { default: /* 3: write, present */ -#ifdef TEST_VERIFY_AREA - if (regs->cs == KERNEL_CS) - printk("WP fault at %08lx\n", regs->eip); -#endif - /* fall through */ + /* fall through */ case 2: /* write, not present */ if (!(vma->vm_flags & VM_WRITE)) goto bad_area; diff -r 85e7eadfaea1 -r cf6767d9babb include/asm-i386/ptrace.h --- a/include/asm-i386/ptrace.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/ptrace.h Tue Aug 01 01:32:00 2006 -0700 @@ -60,6 +60,7 @@ struct pt_regs { #ifdef __KERNEL__ #include <asm/vm86.h> +#include <asm/segment.h> struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); @@ -73,11 +74,11 @@ extern void send_sigtrap(struct task_str */ static inline int user_mode(struct pt_regs *regs) { - return (regs->xcs & 3) != 0; + return (regs->xcs & SEGMENT_RPL_MASK) == 3; } static inline int user_mode_vm(struct pt_regs *regs) { - return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0; + return (((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= 3); } #define instruction_pointer(regs) ((regs)->eip) #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) diff -r 85e7eadfaea1 -r cf6767d9babb include/asm-i386/segment.h --- a/include/asm-i386/segment.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/segment.h Tue Aug 01 01:32:00 2006 -0700 @@ -83,6 +83,12 @@ #define GDT_SIZE (GDT_ENTRIES * 8) +/* + * Some tricky tests to match code segments after a fault + */ +#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8) +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) + /* Simple and small GDT entries for booting only */ #define GDT_ENTRY_BOOT_CS 2 @@ -112,4 +118,8 @@ */ #define IDT_ENTRIES 256 +/* Bottom three bits of xcs give the ring privilege level */ +#define SEGMENT_RPL_MASK 0x3 + +#define get_kernel_rpl() 0 #endif
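Two small helpers, written out only to make the new definitions concrete (the example_* names are invented; user_mode() and user_mode_vm() above are the real users): with the kernel possibly running in ring 1, "CPL != 0" no longer means "userspace", so checks compare the RPL bits against 3 or against get_kernel_rpl() instead of testing for non-zero.

/* Illustration only. */
#include <asm/ptrace.h>
#include <asm/segment.h>

static inline int example_came_from_guest_kernel(struct pt_regs *regs)
{
        /* On native, get_kernel_rpl() is 0; a ring-1 paravirt port would
         * arrange for it to return 1.  Either way this stays correct. */
        return (regs->xcs & SEGMENT_RPL_MASK) == get_kernel_rpl();
}

static inline int example_came_from_userspace(struct pt_regs *regs)
{
        /* Same test as the new user_mode(): userspace is always RPL 3. */
        return (regs->xcs & SEGMENT_RPL_MASK) == 3;
}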
Andi Kleen
2007-Apr-18 13:02 UTC
[PATCH 0 of 13] Basic infrastructure patches for a paravirtualized kernel
On Tuesday 01 August 2006 22:00, Jeremy Fitzhardinge wrote:
> [ REPOST: Apologies to anyone who has seen this before. It
> didn't make it onto any of the lists it should have. -J ]

I tried to apply these patches (except the ones I didn't like: 8, 10, 12) to my tree, but couldn't because they are all MIME damaged:

+ pte =3D (mm =3D=3D &init_mm) ?

etc. Can you please repost a version without that (and ideally fix 8, 10, 12)?

-Andi
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 9 of 13] Remove the read hazard from the COW path in copy_one_pte
1 file changed, 1 insertion(+), 1 deletion(-) mm/memory.c | 2 +- We don't want to read PTEs directly like this after they have been modified, as a lazy MMU implementation of direct page tables may not have written the updated PTE back to memory yet. Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 0adfc39039c7 -r 20f9c0c451af mm/memory.c --- a/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 @@ -466,7 +466,7 @@ copy_one_pte(struct mm_struct *dst_mm, s */ if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); - pte = *src_pte; + pte = pte_wrprotect(pte); } /*
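Spelled out, the two variants from the hunk above look like this; the "before" form is only hazardous once a lazy, batching implementation of the PTE operations is in use, which is exactly what the later patches introduce.

        /* before -- read hazard under lazy PTE updates: */
        ptep_set_wrprotect(src_mm, addr, src_pte);
        pte = *src_pte;            /* the wrprotect may still be queued, so this
                                    * can read the stale, still-writable PTE */

        /* after -- no re-read; derive the value from the copy already held: */
        ptep_set_wrprotect(src_mm, addr, src_pte);
        pte = pte_wrprotect(pte);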
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 2 of 13] Remove locally-defined ldt structure in favour of standard type
1 file changed, 4 insertions(+), 8 deletions(-) arch/i386/kernel/reboot.c | 12 ++++-------- arch/i386/kernel/reboot.c defines its own struct to describe an ldt entry: it should use struct Xgt_desc_struct (currently load_ldt is a macro, so doesn't complain: paravirt patches make it warn). Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 79a98a10911f -r ffccb62e9244 arch/i386/kernel/reboot.c --- a/arch/i386/kernel/reboot.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/reboot.c Tue Aug 01 01:32:00 2006 -0700 @@ -145,14 +145,10 @@ real_mode_gdt_entries [3] 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct -{ - unsigned short size __attribute__ ((packed)); - unsigned long long * base __attribute__ ((packed)); -} -real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, NULL }, -no_idt = { 0, NULL }; +static struct Xgt_desc_struct +real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, +real_mode_idt = { 0x3ff, 0 }, +no_idt = { 0, 0 }; /* This is 16-bit protected mode code to disable paging and the cache,
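For readers without the tree at hand: struct Xgt_desc_struct is the generic pseudo-descriptor type used with lgdt/lidt, declared in include/asm-i386/desc.h roughly as below (approximate -- padding and exact attributes elided) as a 16-bit limit followed by a 32-bit linear base, which is why the initializers above cast to (long) instead of keeping the old "long long *" base field.

/* Approximate shape of the standard type being switched to: */
struct Xgt_desc_struct {
        unsigned short size;                            /* limit */
        unsigned long address __attribute__((packed));  /* linear base */
};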
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 6 of 13] Roll all the cpuid asm into one __cpuid call
1 file changed, 34 insertions(+), 40 deletions(-) include/asm-i386/processor.h | 74 +++++++++++++++++++----------------------- It's a little neater, and also means only one place to patch for paravirtualization. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 09b35e6bc0ca -r 730b4fe6bc1e include/asm-i386/processor.h --- a/include/asm-i386/processor.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/processor.h Tue Aug 01 01:32:00 2006 -0700 @@ -143,31 +143,37 @@ static inline void detect_ht(struct cpui #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) -{ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ __asm__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op), "c"(0)); + : "0" (*eax), "2" (*ecx)); +} + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); } /* Some CPUID calls want 'count' to be placed in ecx */ static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, - int *edx) -{ - __asm__("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (op), "c" (count)); + int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); } /* @@ -175,42 +181,30 @@ static inline void cpuid_count(int op, i */ static inline unsigned int cpuid_eax(unsigned int op) { - unsigned int eax; - - __asm__("cpuid" - : "=a" (eax) - : "0" (op) - : "bx", "cx", "dx"); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return eax; } static inline unsigned int cpuid_ebx(unsigned int op) { - unsigned int eax, ebx; - - __asm__("cpuid" - : "=a" (eax), "=b" (ebx) - : "0" (op) - : "cx", "dx" ); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return ebx; } static inline unsigned int cpuid_ecx(unsigned int op) { - unsigned int eax, ecx; - - __asm__("cpuid" - : "=a" (eax), "=c" (ecx) - : "0" (op) - : "bx", "dx" ); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return ecx; } static inline unsigned int cpuid_edx(unsigned int op) { - unsigned int eax, edx; - - __asm__("cpuid" - : "=a" (eax), "=d" (edx) - : "0" (op) - : "bx", "cx"); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return edx; }
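A trivial caller, purely for illustration (the example_* function is invented): the external semantics are unchanged -- cpuid() still clears %ecx for Cyrix's benefit -- but every path now funnels through the single __cpuid(), which is the one place a paravirtualized kernel would need to patch.

/* Illustration only. */
#include <linux/kernel.h>
#include <linux/string.h>
#include <asm/processor.h>

static void example_report_cpu(void)
{
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        /* Leaf 0: max standard leaf in eax, vendor string in ebx/edx/ecx. */
        cpuid(0, &eax, &ebx, &ecx, &edx);
        memcpy(vendor + 0, &ebx, 4);
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';

        /* Leaf 1: feature flags; cpuid_edx() is now a thin wrapper too. */
        printk(KERN_INFO "example: vendor %s, edx features %08x\n",
               vendor, cpuid_edx(1));
}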
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 8 of 13] Add a bootparameter to reserve high linear address space for hypervisors
1 file changed, 13 insertions(+) arch/i386/kernel/setup.c | 13 +++++++++++++ This is necessary to allow dynamically loaded hypervisor modules, which might not happen until userspace is already running, and also provides a useful tool to benchmark the performance impact of reduced lowmem address space. Signed-off-by: Zachary Amsden <zach@vmware.com> ================================================================== diff -r b6c100bb5ca5 -r 0adfc39039c7 arch/i386/kernel/setup.c --- a/arch/i386/kernel/setup.c Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/setup.c Tue Aug 01 01:32:00 2006 -0700 @@ -917,6 +917,19 @@ static void __init parse_cmdline_early ( else if (!memcmp(from, "vmalloc=", 8)) __VMALLOC_RESERVE = memparse(from+8, &from); + /* + * reservedtop=size reserves a hole at the top of the kernel + * address space which a hypervisor can load into later. + * Needed for dynamically loaded hypervisors, so relocating + * the fixmap can be done before paging initialization. + * This hole must be a multiple of 4M. + */ + else if (!memcmp(from, "reservedtop=", 12)) { + unsigned long reserved = memparse(from+12, &from); + reserved &= ~0x3fffff; + set_fixaddr_top(-reserved); + } + next_char: c = *(from++); if (!c)
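A worked example of the arithmetic (the command-line value is hypothetical):

/*
 * Boot with:  reservedtop=64M
 *
 *   reserved  = memparse("64M")  = 0x04000000
 *   reserved &= ~0x3fffff        = 0x04000000   (already a 4MB multiple)
 *   set_fixaddr_top(-reserved)   -> top = 0xfc000000
 *   __FIXADDR_TOP                = top - PAGE_SIZE = 0xfbfff000
 *
 * so the fixmap, vmalloc area and lowmem all shift down, and the top 64MB
 * of the linear address space is left free for a later-loaded hypervisor.
 */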
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 0 of 13] Basic infrastructure patches for a paravirtualized kernel
[ REPOST: Apologies to anyone who has seen this before. It didn't make it onto any of the lists it should have. -J ] Hi Andrew, This series of patches lays the basic ground work for the paravirtualized kernel patches coming later on. I think this lot is ready for the rough-and-tumble world of the -mm tree. For the most part, these patches do nothing or very little. The patches should be self explanatory, but the overview is: Helper functions for later use: 1/13: Add apply_to_page_range()... 3/13: Implement always-locked bit ops... 13/13: Put .note.* sections into a PT_NOTE segment in vmlinux Cleanups: 2/13: Remove locally-defined ldt structure in favour of standard type 4/13: Allow a kernel to not be in ring 0 6/13: Roll all the cpuid asm into one __cpuid call 9/13: Remove the read hazard from the COW path in copy_one_pte 10/13: Change pte_clear_full to a more appropriately named... Hooks: 5/13: Replace sensitive instructions with macros 7/13: Make __FIXADDR_TOP variable to allow it to make space... 8/13: Add a bootparameter to reserve high linear address... 11/13: Add lazy MMU mode hooks for batching PTE updates 12/13: Pass the mm struct into the pgd_free code so the mm... Probably the most subtle changes here are 11/13 and 9/13, since they add a new constraint to page-table manipulation code. In a paravirtualized system, pte updates may be batched and performed lazily, so their effects will not be immediately visible on the pte itself. To avoid this, code which modifies ptes in a loop needs to avoid looking at the modified ptes. 9/13 fixes the one place where it happens. 11/13 depends on removing these read hazards for correctness when running under a direct page table hypervisor which batches updates. However, it is generally agreed that using an _explicit_, rather than an _implicit_ notion of batching makes it easy to find and reason about the paths which are doing batching. This allows easy inspection to remove read hazards from the code. 13/13 "Put .note.* sections into a PT_NOTE segment in vmlinux" is mostly here to shake out problems early. It slightly changes the way the vmlinux image is linked together, and it uses the somewhat esoteric PHDRS command in vmlinux.lds. I want to make sure that this doesn't provoke any problems in the various binutils people are using. Thanks, J -------------- next part -------------- 30 files changed, 609 insertions(+), 92 deletions(-) arch/i386/Kconfig | 1 arch/i386/kernel/entry.S | 43 +++++----- arch/i386/kernel/process.c | 2 arch/i386/kernel/reboot.c | 12 -- arch/i386/kernel/setup.c | 13 +++ arch/i386/kernel/vmlinux.lds.S | 12 ++ arch/i386/mm/extable.c | 2 arch/i386/mm/fault.c | 11 -- arch/i386/mm/init.c | 42 +++++++++ arch/i386/mm/pgtable.c | 21 ++++ include/asm-generic/pgtable.h | 24 +++++ include/asm-generic/vmlinux.lds.h | 3 include/asm-i386/fixmap.h | 7 + include/asm-i386/page.h | 2 include/asm-i386/pgalloc.h | 4 include/asm-i386/pgtable.h | 1 include/asm-i386/processor.h | 74 ++++++++--------- include/asm-i386/ptrace.h | 5 - include/asm-i386/segment.h | 10 ++ include/asm-i386/spinlock.h | 7 + include/asm-i386/sync_bitops.h | 156 +++++++++++++++++++++++++++++++++++++ include/asm-i386/system.h | 36 ++++++++ include/linux/elfnote.h | 88 ++++++++++++++++++++ include/linux/mm.h | 9 ++ kernel/fork.c | 2 mm/fremap.c | 2 mm/memory.c | 106 ++++++++++++++++++++++++- mm/mprotect.c | 2 mm/mremap.c | 2 mm/msync.c | 2
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 5 of 13] Replace sensitive instructions with macros
2 files changed, 27 insertions(+), 18 deletions(-) arch/i386/kernel/entry.S | 38 ++++++++++++++++++++++---------------- include/asm-i386/spinlock.h | 7 +++++-- Abstract sensitive instructions in assembler code, replacing them with macros (which currently are #defined to the native versions). We use long names: assembler is case-insensitive, so if something goes wrong and macros do not expand, it would assemble anyway. Resulting object files are exactly the same as before. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r cf6767d9babb -r 09b35e6bc0ca arch/i386/kernel/entry.S --- a/arch/i386/kernel/entry.S Tue Aug 01 01:32:00 2006 -0700 +++ b/arch/i386/kernel/entry.S Tue Aug 01 01:32:00 2006 -0700 @@ -76,8 +76,15 @@ NT_MASK = 0x00004000 NT_MASK = 0x00004000 VM_MASK = 0x00020000 +/* These are replaces for paravirtualization */ +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax + #ifdef CONFIG_PREEMPT -#define preempt_stop cli; TRACE_IRQS_OFF +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck @@ -233,7 +240,7 @@ check_userspace: cmpl $SEGMENT_RPL_MASK, %eax jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -244,7 +251,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cli + DISABLE_INTERRUPTS cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: @@ -272,7 +279,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -317,7 +324,7 @@ 1: movl (%ebp),%ebp jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -327,8 +334,7 @@ 1: movl (%ebp),%ebp movl OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON - sti - sysexit + ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC @@ -353,7 +359,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) # store the return value syscall_exit: - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -378,11 +384,11 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 -1: iret +1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -406,7 +412,7 @@ ldt_ss: * dosemu and wine happy. 
*/ subl $8, %esp # reserve space for switch16 pointer CFI_ADJUST_CFA_OFFSET 8 - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, @@ -416,7 +422,7 @@ ldt_ss: TRACE_IRQS_IRET RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack -1: iret +1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 .long 1b,iret_exc @@ -431,7 +437,7 @@ work_pending: jz work_notifysig work_resched: call schedule - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -487,7 +493,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - sti # could let do_syscall_trace() call + ENABLE_INTERRUPTS # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -666,7 +672,7 @@ ENTRY(device_not_available) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - movl %cr0, %eax + GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate preempt_stop @@ -796,7 +802,7 @@ nmi_16bit_stack: call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to 16bit stack -1: iret +1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" .align 4 diff -r cf6767d9babb -r 09b35e6bc0ca include/asm-i386/spinlock.h --- a/include/asm-i386/spinlock.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-i386/spinlock.h Tue Aug 01 01:32:00 2006 -0700 @@ -16,6 +16,9 @@ * * (the type definitions are in asm/spinlock_types.h) */ + +#define CLI_STRING "cli" +#define STI_STRING "sti" #define __raw_spin_is_locked(x) \ (*(volatile signed char *)(&(x)->slock) <= 0) @@ -43,12 +46,12 @@ "2:\t" \ "testl $0x200, %1\n\t" \ "jz 4f\n\t" \ - "sti\n" \ + STI_STRING "\n" \ "3:\t" \ "rep;nop\n\t" \ "cmpb $0, %0\n\t" \ "jle 3b\n\t" \ - "cli\n\t" \ + CLI_STRING "\n\t" \ "jmp 1b\n" \ "4:\t" \ "rep;nop\n\t" \
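To make the indirection concrete, here is the sort of substitution a paravirtualized build could make later (CONFIG_EXAMPLE_PARAVIRT and the example_* symbols are invented; this patch only introduces the native forms, and the resulting object files are identical):

/* Hypothetical override -- a real version must also preserve whatever
 * registers the called stubs clobber. */
#ifdef CONFIG_EXAMPLE_PARAVIRT
/* entry.S side: */
#define DISABLE_INTERRUPTS      call example_irq_disable
#define ENABLE_INTERRUPTS       call example_irq_enable
/* spinlock.h side, spliced into the inline asm instead of cli/sti: */
#define CLI_STRING              "call example_irq_disable"
#define STI_STRING              "call example_irq_enable"
#else
#define DISABLE_INTERRUPTS      cli
#define ENABLE_INTERRUPTS       sti
#define CLI_STRING              "cli"
#define STI_STRING              "sti"
#endif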
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 1 of 13] Add apply_to_page_range() which applies a function to a pte range
2 files changed, 99 insertions(+) include/linux/mm.h | 5 ++ mm/memory.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Add a new mm function apply_to_page_range() which applies a given function to every pte in a given virtual address range in a given mm structure. This is a generic alternative to cut-and-pasting the Linux idiomatic pagetable walking code in every place that a sequence of PTEs must be accessed. Although this interface is intended to be useful in a wide range of situations, it is currently used specifically by several Xen subsystems, for example: to ensure that pagetables have been allocated for a virtual address range, and to construct batched special pagetable update requests to map I/O memory (in ioremap()). Signed-off-by: Ian Pratt <ian.pratt@xensource.com> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 521d1bcdaa86 -r 79a98a10911f include/linux/mm.h --- a/include/linux/mm.h Sun Jul 30 07:00:26 2006 +0000 +++ b/include/linux/mm.h Tue Aug 01 01:32:00 2006 -0700 @@ -1026,6 +1026,11 @@ struct page *follow_page(struct vm_area_ #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ +typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr, + void *data); +extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size, pte_fn_t fn, void *data); + #ifdef CONFIG_PROC_FS void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); #else diff -r 521d1bcdaa86 -r 79a98a10911f mm/memory.c --- a/mm/memory.c Sun Jul 30 07:00:26 2006 +0000 +++ b/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 @@ -1369,6 +1369,100 @@ int remap_pfn_range(struct vm_area_struc } EXPORT_SYMBOL(remap_pfn_range); +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + struct page *pmd_page; + spinlock_t *ptl; + + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + pmd_page = pmd_page(*pmd); + + do { + err = fn(pte, pmd_page, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. 
+ */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + /* * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically. Before making any commitment, on
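A minimal caller, for illustration (the example_* names are invented): apply_to_page_range() fills in any missing pgd/pud/pmd/pte levels for the range and then calls the pte_fn_t once per PTE slot, taking the PTE lock for user mms and no lock for init_mm.

/* Illustration only. */
#include <linux/mm.h>
#include <linux/sched.h>        /* for init_mm */

/* Callback: invoked for every pte slot covering [addr, addr + size). */
static int example_count_present(pte_t *pte, struct page *pmd_page,
                                 unsigned long addr, void *data)
{
        unsigned long *count = data;

        if (pte_present(*pte))
                (*count)++;
        return 0;               /* returning non-zero aborts the walk */
}

static unsigned long example_present_pages(unsigned long addr, unsigned long size)
{
        unsigned long count = 0;

        /* Note the side effect described above: page tables that do not
         * yet exist for this range are allocated during the walk. */
        if (apply_to_page_range(&init_mm, addr, size,
                                example_count_present, &count))
                return 0;
        return count;
}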
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 10 of 13] Change pte_clear_full to a more appropriately named pte_clear_not_present,
3 files changed, 4 insertions(+), 4 deletions(-) include/asm-generic/pgtable.h | 4 ++-- mm/fremap.c | 2 +- mm/memory.c | 2 +- allowing optimizations when not-present mapping changes need not be reflected in the hardware TLB for protected page table modes. There is also another case that can use it in the fremap code. Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 20f9c0c451af -r 553154516a1b include/asm-generic/pgtable.h --- a/include/asm-generic/pgtable.h Tue Aug 01 01:32:00 2006 -0700 +++ b/include/asm-generic/pgtable.h Tue Aug 01 01:32:00 2006 -0700 @@ -110,8 +110,8 @@ do { \ }) #endif -#ifndef __HAVE_ARCH_PTE_CLEAR_FULL -#define pte_clear_full(__mm, __address, __ptep, __full) \ +#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL +#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ do { \ pte_clear((__mm), (__address), (__ptep)); \ } while (0) diff -r 20f9c0c451af -r 553154516a1b mm/fremap.c --- a/mm/fremap.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/fremap.c Tue Aug 01 01:32:00 2006 -0700 @@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(mm, addr, ptep); + pte_clear_not_present_full(mm, addr, ptep, 0); } return !!page; } diff -r 20f9c0c451af -r 553154516a1b mm/memory.c --- a/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 +++ b/mm/memory.c Tue Aug 01 01:32:00 2006 -0700 @@ -689,7 +689,7 @@ static unsigned long zap_pte_range(struc continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(mm, addr, pte, tlb->fullmm); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss);
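For illustration, the kind of override an architecture could add to take advantage of this (the raw-write helper is an invented name, not i386 code): since the old entry was already not present, no TLB entry -- and no hypervisor shadow entry -- can exist for it, so a plain store suffices.

/* Hypothetical asm/pgtable.h override -- illustration only. */
#define __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
#define pte_clear_not_present_full(__mm, __address, __ptep, __full)    \
do {                                                                    \
        /* no flush, no queued update, no shadow synchronisation */    \
        example_raw_pte_write((__ptep), __pte(0));                      \
} while (0)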
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 12 of 13] Pass the mm struct into the pgd_free code so the mm is available here
5 files changed, 10 insertions(+), 4 deletions(-) arch/i386/mm/pgtable.c | 3 ++- include/asm-i386/pgalloc.h | 4 ++-- include/asm-i386/pgtable.h | 1 + include/linux/mm.h | 4 ++++ kernel/fork.c | 2 +- Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> ================================================================== diff -r 398f8fd6b334 -r 8235caea9d68 arch/i386/mm/pgtable.c --- a/arch/i386/mm/pgtable.c Tue Aug 01 01:32:01 2006 -0700 +++ b/arch/i386/mm/pgtable.c Tue Aug 01 01:32:01 2006 -0700 @@ -275,9 +275,10 @@ out_oom: return NULL; } -void pgd_free(pgd_t *pgd) +void pgd_free(struct mm_struct *mm) { int i; + pgd_t *pgd = mm->pgd; /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) diff -r 398f8fd6b334 -r 8235caea9d68 include/asm-i386/pgalloc.h --- a/include/asm-i386/pgalloc.h Tue Aug 01 01:32:01 2006 -0700 +++ b/include/asm-i386/pgalloc.h Tue Aug 01 01:32:01 2006 -0700 @@ -3,7 +3,6 @@ #include <asm/fixmap.h> #include <linux/threads.h> -#include <linux/mm.h> /* for struct page */ #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) @@ -16,7 +15,8 @@ * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); +extern void pgd_free(struct mm_struct *); +#define pgd_free_mm(mm) pgd_free(mm) extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); diff -r 398f8fd6b334 -r 8235caea9d68 include/asm-i386/pgtable.h --- a/include/asm-i386/pgtable.h Tue Aug 01 01:32:01 2006 -0700 +++ b/include/asm-i386/pgtable.h Tue Aug 01 01:32:01 2006 -0700 @@ -393,6 +393,7 @@ extern pte_t *lookup_address(unsigned lo extern void noexec_setup(const char *str); +#include <asm/pgalloc.h> #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) diff -r 398f8fd6b334 -r 8235caea9d68 include/linux/mm.h --- a/include/linux/mm.h Tue Aug 01 01:32:01 2006 -0700 +++ b/include/linux/mm.h Tue Aug 01 01:32:01 2006 -0700 @@ -1077,5 +1077,9 @@ extern int randomize_va_space; const char *arch_vma_name(struct vm_area_struct *vma); +#ifndef pgd_free_mm +#define pgd_free_mm(mm) pgd_free((mm)->pgd) +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -r 398f8fd6b334 -r 8235caea9d68 kernel/fork.c --- a/kernel/fork.c Tue Aug 01 01:32:01 2006 -0700 +++ b/kernel/fork.c Tue Aug 01 01:32:01 2006 -0700 @@ -299,7 +299,7 @@ static inline int mm_alloc_pgd(struct mm static inline void mm_free_pgd(struct mm_struct * mm) { - pgd_free(mm->pgd); + pgd_free_mm(mm); } #else #define dup_mmap(mm, oldmm) (0)
Ian Campbell
2007-Apr-18 17:49 UTC
[PATCH 1 of 1] x86_64: Put .note.* sections into a PT_NOTE segment in vmlinux
On Tue, 2006-08-01 at 13:00 -0700, Jeremy Fitzhardinge wrote:> This patch will pack any .note.* section into a PT_NOTE segment in the > output file.[...]> This only changes i386 for now, but I presume the corresponding > changes for other architectures will be as simple.Here is the patch for x86_64. Signed-off-by: Ian Campbell <ian.campbell@xensource.com> diff -urN ref-linux-2.6.16.13/arch/x86_64/kernel/vmlinux.lds.S x86-64_elfnotes/arch/x86_64/kernel/vmlinux.lds.S --- ref-linux-2.6.16.13/arch/x86_64/kernel/vmlinux.lds.S 2006-05-02 22:38:44.000000000 +0100 +++ x86-64_elfnotes/arch/x86_64/kernel/vmlinux.lds.S 2006-08-22 11:39:14.000000000 +0100 @@ -14,6 +14,11 @@ OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __START_KERNEL; @@ -26,7 +31,7 @@ KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 /* out-of-line lock text */ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } @@ -43,7 +48,7 @@ .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS - } + } :data _edata = .; /* End of data section */ @@ -201,4 +206,6 @@ STABS_DEBUG DWARF_DEBUG + + NOTES }