We allow for the fact that the guest kernel may not run in ring 0. This requires some abstraction in a few places when setting %cs or checking privilege level (user vs kernel). This is Chris' [RFC PATCH 15/33] move segment checks to subarch, except rather than using #define USER_MODE_MASK which depends on a config option, we use Zach's more flexible approach of assuming ring 3 == userspace. I also used "get_kernel_rpl()" over "get_kernel_cs()" because I think it reads better in the code... 1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3 2) Add a get_kernel_rpl() macro, and don't assume it's zero. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- arch/i386/kernel/entry.S | 5 +++-- arch/i386/kernel/process.c | 2 +- arch/i386/mm/extable.c | 2 +- arch/i386/mm/fault.c | 11 ++++------- include/asm-i386/ptrace.h | 5 +++-- include/asm-i386/segment.h | 10 ++++++++++ 6 files changed, 22 insertions(+), 13 deletions(-) ==================================================================--- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -229,8 +229,9 @@ check_userspace: check_userspace: movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 3), %eax - jz resume_kernel + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + cmpl $SEGMENT_RPL_MASK, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) cli # make sure we don't miss an interrupt # setting need_resched or sigpending ==================================================================--- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -346,7 +346,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xes = __USER_DS; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ ==================================================================--- a/arch/i386/mm/extable.c +++ b/arch/i386/mm/extable.c @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs const struct exception_table_entry *fixup; #ifdef CONFIG_PNPBIOS - if (unlikely((regs->xcs & ~15) == (GDT_ENTRY_PNPBIOS_BASE << 3))) + if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) { extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; extern u32 pnp_bios_is_utter_crap; ==================================================================--- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -27,6 +27,7 @@ #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/kdebug.h> +#include <asm/segment.h> extern void die(const char *,struct pt_regs *,long); @@ -119,10 +120,10 @@ static inline unsigned long get_segment_ } /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; + *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; /* By far the most common cases. */ - if (likely(seg == __USER_CS || seg == __KERNEL_CS)) + if (likely(SEGMENT_IS_FLAT_CODE(seg))) return eip; /* Check the segment exists, is within the current LDT/GDT size, @@ -436,11 +437,7 @@ good_area: write = 0; switch (error_code & 3) { default: /* 3: write, present */ -#ifdef TEST_VERIFY_AREA - if (regs->cs == KERNEL_CS) - printk("WP fault at %08lx\n", regs->eip); -#endif - /* fall through */ + /* fall through */ case 2: /* write, not present */ if (!(vma->vm_flags & VM_WRITE)) goto bad_area; ==================================================================--- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -60,6 +60,7 @@ struct pt_regs { #ifdef __KERNEL__ #include <asm/vm86.h> +#include <asm/segment.h> struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); @@ -73,11 +74,11 @@ extern void send_sigtrap(struct task_str */ static inline int user_mode(struct pt_regs *regs) { - return (regs->xcs & 3) != 0; + return (regs->xcs & SEGMENT_RPL_MASK) == 3; } static inline int user_mode_vm(struct pt_regs *regs) { - return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0; + return (((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= 3); } #define instruction_pointer(regs) ((regs)->eip) #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) ==================================================================--- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -83,6 +83,12 @@ #define GDT_SIZE (GDT_ENTRIES * 8) +/* + * Some tricky tests to match code segments after a fault + */ +#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8) +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) + /* Simple and small GDT entries for booting only */ #define GDT_ENTRY_BOOT_CS 2 @@ -112,4 +118,8 @@ */ #define IDT_ENTRIES 256 +/* Bottom three bits of xcs give the ring privilege level */ +#define SEGMENT_RPL_MASK 0x3 + +#define get_kernel_rpl() 0 #endif --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[patch 6/8] Make __FIXADDR_TOP variable to allow it to make space for a hypervisor.
Make __FIXADDR_TOP a variable, so that it can be set to not get in the way of address space a hypervisor may want to reserve. Original patch by Gerd Hoffmann <kraxel@suse.de> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Cc: Gerd Hoffmann <kraxel@suse.de> --- arch/i386/Kconfig | 1 + arch/i386/mm/init.c | 42 ++++++++++++++++++++++++++++++++++++++++++ arch/i386/mm/pgtable.c | 26 ++++++++++++++++++++++++++ include/asm-i386/fixmap.h | 7 ++++++- 4 files changed, 75 insertions(+), 1 deletion(-) ==================================================================--- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -792,6 +792,7 @@ config COMPAT_VDSO config COMPAT_VDSO bool "Compat VDSO support" default y + depends on !PARAVIRT help Map the VDSO to the predictable old-style address too. ---help--- ==================================================================--- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -629,6 +629,48 @@ void __init mem_init(void) (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); +#if 1 /* double-sanity-check paranoia */ + printk("virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +#endif + + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); +#endif /* double-sanity-check paranoia */ + #ifdef CONFIG_X86_PAE if (!cpu_has_pae) panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); ==================================================================--- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/spinlock.h> +#include <linux/module.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -137,6 +138,12 @@ void set_pmd_pfn(unsigned long vaddr, un __flush_tlb_one(vaddr); } +static int fixmaps; +#ifndef CONFIG_COMPAT_VDSO +unsigned long __FIXADDR_TOP = 0xfffff000; +EXPORT_SYMBOL(__FIXADDR_TOP); +#endif + void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { unsigned long address = __fix_to_virt(idx); @@ -146,6 +153,25 @@ void __set_fixmap (enum fixed_addresses return; } set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + fixmaps++; +} + +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void reserve_top_address(unsigned long reserve) +{ + BUG_ON(fixmaps > 0); +#ifdef CONFIG_COMPAT_VDSO + BUG_ON(reserve != 0); +#else + __FIXADDR_TOP = -reserve - PAGE_SIZE; + __VMALLOC_RESERVE += reserve; +#endif } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) ==================================================================--- a/include/asm-i386/fixmap.h +++ b/include/asm-i386/fixmap.h @@ -19,7 +19,11 @@ * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. */ -#define __FIXADDR_TOP 0xfffff000 +#ifndef CONFIG_COMPAT_VDSO +extern unsigned long __FIXADDR_TOP; +#else +#define __FIXADDR_TOP 0xfffff000 +#endif #ifndef __ASSEMBLY__ #include <linux/kernel.h> @@ -93,6 +97,7 @@ enum fixed_addresses { extern void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags); +extern void reserve_top_address(unsigned long reserve); #define set_fixmap(idx, phys) \ __set_fixmap(idx, phys, PAGE_KERNEL) --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[patch 0/8] Basic infrastructure patches for a paravirtualized kernel
Hi Andrew, This series of patches lays the basic ground work for the paravirtualized kernel patches coming later on. I think this lot is ready for the rough-and-tumble world of the -mm tree. The main change from the last posting is that all the page-table related patches have been moved out, and will be posted separately. Also, the off-by-one in reserving the top of address space has been fixed. For the most part, these patches do nothing or very little. The patches should be self explanatory, but the overview is: Helper functions for later use: 2/8: Implement always-locked bit ops... 8/8: Put .note.* sections into a PT_NOTE segment in vmlinux Cleanups: 1/8: Remove locally-defined ldt structure in favour of standard type 3/8: Allow a kernel to not be in ring 0 5/8: Roll all the cpuid asm into one __cpuid call Hooks: 4/8: Replace sensitive instructions with macros 6/8: Make __FIXADDR_TOP variable to allow it to make space... 7/8: Add a bootparameter to reserve high linear address... 8/8 "Put .note.* sections into a PT_NOTE segment in vmlinux" is mostly here to shake out problems early. It slightly changes the way the vmlinux image is linked together, and it uses the somewhat esoteric PHDRS command in vmlinux.lds. I want to make sure that this doesn't provoke any problems in the various binutils people are using. Thanks, J
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[patch 1/8] Remove locally-defined ldt structure in favour of standard type.
arch/i386/kernel/reboot.c defines its own struct to describe an ldt entry: it should use struct Xgt_desc_struct (currently load_ldt is a macro, so doesn't complain: paravirt patches make it warn). Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- arch/i386/kernel/reboot.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) ==================================================================--- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -145,14 +145,10 @@ real_mode_gdt_entries [3] 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct -{ - unsigned short size __attribute__ ((packed)); - unsigned long long * base __attribute__ ((packed)); -} -real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, NULL }, -no_idt = { 0, NULL }; +static struct Xgt_desc_struct +real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, +real_mode_idt = { 0x3ff, 0 }, +no_idt = { 0, 0 }; /* This is 16-bit protected mode code to disable paging and the cache, --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[patch 7/8] Add a bootparameter to reserve high linear address space.
Add a bootparameter to reserve high linear address space for hypervisors. This is necessary to allow dynamically loaded hypervisor modules, which might not happen until userspace is already running, and also provides a useful tool to benchmark the performance impact of reduced lowmem address space. Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> --- Documentation/kernel-parameters.txt | 5 +++++ arch/i386/kernel/setup.c | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) ==================================================================--- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1357,6 +1357,11 @@ running once the system is up. reserve= [KNL,BUGS] Force the kernel to ignore some iomem area + reservetop= [IA-32] + Format: nn[KMG] + Reserves a hole at the top of the kernel virtual + address space. + resume= [SWSUSP] Specify the partition device for software suspend ==================================================================--- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -160,6 +160,12 @@ static char command_line[COMMAND_LINE_SI static char command_line[COMMAND_LINE_SIZE]; unsigned char __initdata boot_params[PARAM_SIZE]; + +static int __init setup_reservetop(char *s) +{ + return 1; +} +__setup("reservetop", setup_reservetop); static struct resource data_resource = { .name = "Kernel data", @@ -917,6 +923,19 @@ static void __init parse_cmdline_early ( else if (!memcmp(from, "vmalloc=", 8)) __VMALLOC_RESERVE = memparse(from+8, &from); + /* + * reservetop=size reserves a hole at the top of the kernel + * address space which a hypervisor can load into later. + * Needed for dynamically loaded hypervisors, so relocating + * the fixmap can be done before paging initialization. + * This hole must be a multiple of 4M. + */ + else if (!memcmp(from, "reservetop=", 11)) { + unsigned long reserve = memparse(from+11, &from); + reserve &= ~0x3fffff; + reserve_top_address(reserve); + } + next_char: c = *(from++); if (!c) --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[patch 5/8] Roll all the cpuid asm into one __cpuid call.
It's a little neater, and also means only one place to patch for paravirtualization. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- include/asm-i386/processor.h | 74 +++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 40 deletions(-) ==================================================================--- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -143,31 +143,37 @@ static inline void detect_ht(struct cpui #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) -{ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ __asm__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op), "c"(0)); + : "0" (*eax), "2" (*ecx)); +} + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); } /* Some CPUID calls want 'count' to be placed in ecx */ static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, - int *edx) -{ - __asm__("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (op), "c" (count)); + int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); } /* @@ -175,42 +181,30 @@ static inline void cpuid_count(int op, i */ static inline unsigned int cpuid_eax(unsigned int op) { - unsigned int eax; - - __asm__("cpuid" - : "=a" (eax) - : "0" (op) - : "bx", "cx", "dx"); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return eax; } static inline unsigned int cpuid_ebx(unsigned int op) { - unsigned int eax, ebx; - - __asm__("cpuid" - : "=a" (eax), "=b" (ebx) - : "0" (op) - : "cx", "dx" ); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return ebx; } static inline unsigned int cpuid_ecx(unsigned int op) { - unsigned int eax, ecx; - - __asm__("cpuid" - : "=a" (eax), "=c" (ecx) - : "0" (op) - : "bx", "dx" ); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return ecx; } static inline unsigned int cpuid_edx(unsigned int op) { - unsigned int eax, edx; - - __asm__("cpuid" - : "=a" (eax), "=d" (edx) - : "0" (op) - : "bx", "cx"); + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); return edx; } --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[patch 8/8] Put .note.* sections into a PT_NOTE segment in vmlinux.
This patch will pack any .note.* section into a PT_NOTE segment in the output file. To do this, we tell ld that we need a PT_NOTE segment. This requires us to start explicitly mapping sections to segments, so we also need to explicitly create PT_LOAD segments for text and data, and map the sections to them appropriately. Fortunately, each section will default to its previous section's segment, so it doesn't take many changes to vmlinux.lds.S. This only changes i386 for now, but I presume the corresponding changes for other architectures will be as simple. This change also adds <linux/elfnote.h>, which defines C and Assembler macros for actually creating ELF notes. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Hollis Blanchard <hollisb@us.ibm.com> --- arch/i386/kernel/vmlinux.lds.S | 12 ++++- include/asm-generic/vmlinux.lds.h | 3 + include/linux/elfnote.h | 88 +++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 2 deletions(-) ==================================================================--- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -13,6 +13,12 @@ OUTPUT_ARCH(i386) OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __KERNEL_START; @@ -26,7 +32,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 _etext = .; /* End of text section */ @@ -48,7 +54,7 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) CONSTRUCTORS - } + } :data . = ALIGN(4096); __nosave_begin = .; @@ -184,4 +190,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + NOTES } ==================================================================--- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -194,3 +194,6 @@ .stab.index 0 : { *(.stab.index) } \ .stab.indexstr 0 : { *(.stab.indexstr) } \ .comment 0 : { *(.comment) } + +#define NOTES \ + .notes : { *(.note.*) } :note ==================================================================--- /dev/null +++ b/include/linux/elfnote.h @@ -0,0 +1,88 @@ +#ifndef _LINUX_ELFNOTE_H +#define _LINUX_ELFNOTE_H +/* + * Helper macros to generate ELF Note structures, which are put into a + * PT_NOTE segment of the final vmlinux image. These are useful for + * including name-value pairs of metadata into the kernel binary (or + * modules?) for use by external programs. + * + * Each note has three parts: a name, a type and a desc. The name is + * intended to distinguish the note's originator, so it would be a + * company, project, subsystem, etc; it must be in a suitable form for + * use in a section name. The type is an integer which is used to tag + * the data, and is considered to be within the "name" namespace (so + * "FooCo"'s type 42 is distinct from "BarProj"'s type 42). The + * "desc" field is the actual data. There are no constraints on the + * desc field's contents, though typically they're fairly small. + * + * All notes from a given NAME are put into a section named + * .note.NAME. When the kernel image is finally linked, all the notes + * are packed into a single .notes section, which is mapped into the + * PT_NOTE segment. Because notes for a given name are grouped into + * the same section, they'll all be adjacent the output file. + * + * This file defines macros for both C and assembler use. Their + * syntax is slightly different, but they're semantically similar. + * + * See the ELF specification for more detail about ELF notes. + */ + +#ifdef __ASSEMBLER__ +/* + * Generate a structure with the same shape as Elf{32,64}_Nhdr (which + * turn out to be the same size and shape), followed by the name and + * desc data with appropriate padding. The 'desc' argument includes + * the assembler pseudo op defining the type of the data: .asciz + * "hello, world" + */ +.macro ELFNOTE name type desc:vararg +.pushsection ".note.\name" + .align 4 + .long 2f - 1f /* namesz */ + .long 4f - 3f /* descsz */ + .long \type +1:.asciz "\name" +2:.align 4 +3:\desc +4:.align 4 +.popsection +.endm +#else /* !__ASSEMBLER__ */ +#include <linux/elf.h> +/* + * Use an anonymous structure which matches the shape of + * Elf{32,64}_Nhdr, but includes the name and desc data. The size and + * type of name and desc depend on the macro arguments. "name" must + * be a literal string, and "desc" must be passed by value. You may + * only define one note per line, since __LINE__ is used to generate + * unique symbols. + */ +#define _ELFNOTE_PASTE(a,b) a##b +#define _ELFNOTE(size, name, unique, type, desc) \ + static const struct { \ + struct elf##size##_note _nhdr; \ + unsigned char _name[sizeof(name)] \ + __attribute__((aligned(sizeof(Elf##size##_Word)))); \ + typeof(desc) _desc \ + __attribute__((aligned(sizeof(Elf##size##_Word)))); \ + } _ELFNOTE_PASTE(_note_, unique) \ + __attribute_used__ \ + __attribute__((section(".note." name), \ + aligned(sizeof(Elf##size##_Word)), \ + unused)) = { \ + { \ + sizeof(name), \ + sizeof(desc), \ + type, \ + }, \ + name, \ + desc \ + } +#define ELFNOTE(size, name, type, desc) \ + _ELFNOTE(size, name, __LINE__, type, desc) + +#define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc) +#define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc) +#endif /* __ASSEMBLER__ */ + +#endif /* _LINUX_ELFNOTE_H */ --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[patch 4/8] Replace sensitive instructions with macros.
Abstract sensitive instructions in assembler code, replacing them with macros (which currently are #defined to the native versions). We use long names: assembler is case-insensitive, so if something goes wrong and macros do not expand, it would assemble anyway. Resulting object files are exactly the same as before. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- arch/i386/kernel/entry.S | 38 ++++++++++++++++++++++---------------- include/asm-i386/spinlock.h | 7 +++++-- 2 files changed, 27 insertions(+), 18 deletions(-) ==================================================================--- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -76,8 +76,15 @@ NT_MASK = 0x00004000 NT_MASK = 0x00004000 VM_MASK = 0x00020000 +/* These are replaces for paravirtualization */ +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax + #ifdef CONFIG_PREEMPT -#define preempt_stop cli; TRACE_IRQS_OFF +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck @@ -233,7 +240,7 @@ check_userspace: cmpl $SEGMENT_RPL_MASK, %eax jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -244,7 +251,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cli + DISABLE_INTERRUPTS cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: @@ -272,7 +279,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -317,7 +324,7 @@ 1: movl (%ebp),%ebp jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -327,8 +334,7 @@ 1: movl (%ebp),%ebp movl OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON - sti - sysexit + ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC @@ -353,7 +359,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) # store the return value syscall_exit: - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -378,11 +384,11 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 -1: iret +1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -406,7 +412,7 @@ ldt_ss: * dosemu and wine happy. */ subl $8, %esp # reserve space for switch16 pointer CFI_ADJUST_CFA_OFFSET 8 - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, @@ -416,7 +422,7 @@ ldt_ss: TRACE_IRQS_IRET RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack -1: iret +1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 .long 1b,iret_exc @@ -431,7 +437,7 @@ work_pending: jz work_notifysig work_resched: call schedule - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -487,7 +493,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - sti # could let do_syscall_trace() call + ENABLE_INTERRUPTS # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -666,7 +672,7 @@ ENTRY(device_not_available) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - movl %cr0, %eax + GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate preempt_stop @@ -796,7 +802,7 @@ nmi_16bit_stack: call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to 16bit stack -1: iret +1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" .align 4 ==================================================================--- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -16,6 +16,9 @@ * * (the type definitions are in asm/spinlock_types.h) */ + +#define CLI_STRING "cli" +#define STI_STRING "sti" #define __raw_spin_is_locked(x) \ (*(volatile signed char *)(&(x)->slock) <= 0) @@ -43,12 +46,12 @@ "2:\t" \ "testl $0x200, %1\n\t" \ "jz 4f\n\t" \ - "sti\n" \ + STI_STRING "\n" \ "3:\t" \ "rep;nop\n\t" \ "cmpb $0, %0\n\t" \ "jle 3b\n\t" \ - "cli\n\t" \ + CLI_STRING "\n\t" \ "jmp 1b\n" \ "4:\t" \ "rep;nop\n\t" \ --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[patch 2/8] Implement always-locked bit ops, for memory shared with an SMP hypervisor.
Add "always lock'd" implementations of set_bit, clear_bit and change_bit and the corresponding test_and_ functions. Also add "always lock'd" implementation of cmpxchg. These give guaranteed strong synchronisation and are required for non-SMP kernels running on an SMP hypervisor. Signed-off-by: Ian Pratt <ian.pratt@xensource.com> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Christoph Lameter <clameter@sgi.com> --- include/asm-i386/sync_bitops.h | 156 ++++++++++++++++++++++++++++++++++++++++ include/asm-i386/system.h | 36 +++++++++ 2 files changed, 192 insertions(+) ==================================================================--- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -261,6 +261,9 @@ static inline unsigned long __xchg(unsig #define cmpxchg(ptr,o,n)\ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ (unsigned long)(n),sizeof(*(ptr)))) +#define sync_cmpxchg(ptr,o,n)\ + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\ + (unsigned long)(n),sizeof(*(ptr)))) #endif static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, @@ -282,6 +285,39 @@ static inline unsigned long __cmpxchg(vo return prev; case 4: __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +/* + * Always use locked operations when touching memory shared with a + * hypervisor, since the system may be SMP even if the guest kernel + * isn't. + */ +static inline unsigned long __sync_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" : "=a"(prev) : "r"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); ==================================================================--- /dev/null +++ b/include/asm-i386/sync_bitops.h @@ -0,0 +1,156 @@ +#ifndef _I386_SYNC_BITOPS_H +#define _I386_SYNC_BITOPS_H + +/* + * Copyright 1992, Linus Torvalds. + */ + +/* + * These have to be done with inline assembly: that way the bit-setting + * is guaranteed to be atomic. All bit operations return 0 if the bit + * was cleared before the operation and != 0 if it was not. + * + * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). + */ + +#define ADDR (*(volatile long *) addr) + +/** + * sync_set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * + * Note: there are no guarantees that this function will not be reordered + * on non x86 architectures, so if you are writting portable code, + * make sure not to rely on its reordering guarantees. + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void sync_set_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btsl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * sync_clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static inline void sync_clear_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btrl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_change_bit - Toggle a bit in memory + * @nr: Bit to change + * @addr: Address to start counting from + * + * change_bit() is atomic and may not be reordered. It may be + * reordered on other architectures than x86. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void sync_change_bit(int nr, volatile unsigned long * addr) +{ + __asm__ __volatile__("lock; btcl %1,%0" + :"+m" (ADDR) + :"Ir" (nr) + : "memory"); +} + +/** + * sync_test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It may be reordered on other architectures than x86. + * It also implies a memory barrier. + */ +static inline int sync_test_and_set_bit(int nr, volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btsl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +/** + * sync_test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It can be reorderdered on other architectures other than x86. + * It also implies a memory barrier. + */ +static inline int sync_test_and_clear_bit(int nr, volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +/** + * sync_test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int sync_test_and_change_bit(int nr, volatile unsigned long* addr) +{ + int oldbit; + + __asm__ __volatile__("lock; btcl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +static __always_inline int sync_const_test_bit(int nr, const volatile unsigned long *addr) +{ + return ((1UL << (nr & 31)) & + (((const volatile unsigned int *)addr)[nr >> 5])) != 0; +} + +static inline int sync_var_test_bit(int nr, const volatile unsigned long * addr) +{ + int oldbit; + + __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit) + :"m" (ADDR),"Ir" (nr)); + return oldbit; +} + +#define sync_test_bit(nr,addr) \ + (__builtin_constant_p(nr) ? \ + sync_constant_test_bit((nr),(addr)) : \ + sync_var_test_bit((nr),(addr))) + +#undef ADDR + +#endif /* _I386_SYNC_BITOPS_H */ --