Rusty Russell wrote:
> Hi all,
>
> 	I've been looking at finding common ground between the VMI, Xen and
> other paravirtualization approaches, and after some discussion, we're
> getting somewhere.
>
> 	These first two patches are the fundamentals, stolen mainly from the
> VMI patches: removing assumptions about the kernel running in ring 0,
> and macro-izing all the obvious para-virtualize-needing insns.  The
> third patch is more ambitious: it introduces a "paravirt_ops" structure
> (a-la PPC's ppc_md) through which all these ops are indirected.  This
> should allow Xen, VMI and other variants to build on a common base.
>
> 	These patches also live at
> http://kernel.org/pub/linux/kernel/people/rusty/Paravirt
>
> Feedback welcome!
> Rusty.
>
> Name: Kernel Ring Cleanups
> Status: Booted on 2.6.16-rc2-git7
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>
> This is Zach's patch to clean up assumptions about the kernel running
> in ring 0 (which it doesn't when running paravirtualized).
>
> 1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3
> 2) Add a get_kernel_rpl() function
> 3) Create COMPARE_SEGMENT_STACK and COMPARE_SEGMENT_REG macros which
>    can mask out the bottom two bits (RPL) when comparing for
>    paravirtualization.

This looks good to me (obviously), but I always seem to run into
problems with UML whenever I touch ptrace.h.  I did make sure UML
worked when I sent the original patch, but these things do change.
I'll give it a spin again to make sure UML compiles.

Zach
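The effect of the ring cleanup is easiest to see in a small standalone
sketch.  This is illustration only, not code from the patch: once the
kernel may run outside ring 0 (Xen, for instance, runs guest kernels in
ring 1), an "is this kernel mode?" test has to compare the selector's
RPL bits against get_kernel_rpl() rather than against a literal 0 or 3:

	/* Sketch only -- not from the patch.  Using SEGMENT_RPL_MASK and
	 * get_kernel_rpl() from the patch above, a ring-agnostic
	 * kernel-mode test looks like this.  On native kernels
	 * get_kernel_rpl() is 0, so this compiles down to the familiar
	 * (xcs & 3) == 0 check. */
	static inline int regs_in_kernel_mode(struct pt_regs *regs)
	{
		return (regs->xcs & SEGMENT_RPL_MASK) == get_kernel_rpl();
	}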
Rusty Russell
2007-Apr-18 13:02 UTC
[PATCH 2/3] Paravirtualization: Abstract sensitive instructions
Gruntwork, basically.  You'll see why it's paravirt.h and no_paravirt.h
in the next patch.

Name: Make Paravirtualization-requiring Instructions into Macros
Status: Booted on 2.6.16-rc2-git7
Depends: ring_assumptions_cleanup.patch.gz
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

The basic requirement of paravirtualization is that certain critical
operations call into the hypervisor.  So the first step towards this is
to identify the minimal subset of "sensitive" instructions and wrap them
in macros so they can be replaced.  This patch does that: it has no
effect on the compiled kernel result.

I stole this fairly wholesale from the VMI patches by Zachary Amsden
<zach@vmware.com> and the Xen patches by Chris Wright
<chrisw@sous-sol.org>.

We create some new headers: paravirt.h and paravirt_desc.h.  These
currently just include no_paravirt.h and no_paravirt_desc.h: we move
all the native versions of these macros there.

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/arch/i386/kernel/entry.S tmp2/arch/i386/kernel/entry.S
--- tmp/arch/i386/kernel/entry.S	2006-05-02 16:00:17.000000000 +1000
+++ tmp2/arch/i386/kernel/entry.S	2006-05-02 15:58:15.000000000 +1000
@@ -48,6 +48,7 @@
 #include <asm/smp.h>
 #include <asm/page.h>
 #include <asm/desc.h>
+#include <asm/paravirt.h>
 #include "irq_vectors.h"
 
 #define nr_syscalls ((syscall_table_size)/4)
@@ -76,7 +77,7 @@ NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		cli
+#define preempt_stop		CLI
 #else
 #define preempt_stop
 #define resume_kernel		restore_nocheck
@@ -150,7 +151,7 @@ ret_from_intr:
 	cmpl $SEGMENT_RPL_MASK, %eax
 	jb resume_kernel		# returning to kernel or vm86-space
 ENTRY(resume_userspace)
-	cli				# make sure we don't miss an interrupt
+	CLI				# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -161,7 +162,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	cli
+	CLI
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -181,7 +182,7 @@ need_resched:
 ENTRY(sysenter_entry)
 	movl TSS_sysenter_esp0(%esp),%esp
 sysenter_past_esp:
-	sti
+	STI
 	pushl $(__USER_DS)
 	pushl %ebp
 	pushfl
@@ -211,7 +212,7 @@ sysenter_past_esp:
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
-	cli
+	CLI
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
 	jne syscall_exit_work
@@ -219,8 +220,7 @@ sysenter_past_esp:
 	movl EIP(%esp), %edx
 	movl OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
-	sti
-	sysexit
+	STI_SYSEXIT
 
 
 	# system call handler stub
@@ -242,7 +242,7 @@ syscall_call:
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)		# store the return value
 syscall_exit:
-	cli				# make sure we don't miss an interrupt
+	CLI				# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -262,14 +262,14 @@ restore_all:
 restore_nocheck:
 	RESTORE_REGS
 	addl $4, %esp
-1:	iret
-.section .fixup,"ax"
+1:	IRET
+.pushsection .fixup,"ax"
 iret_exc:
-	sti
+	STI
 	pushl $0			# no error code
 	pushl $do_iret_error
 	jmp error_code
-.previous
+.popsection
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
@@ -287,14 +287,14 @@ ldt_ss:
 	 * CPUs, which we can try to work around to make
 	 * dosemu and wine happy. */
 	subl $8, %esp		# reserve space for switch16 pointer
-	cli
+	CLI
 	movl %esp, %eax
 	/* Set up the 16bit stack frame with switch32 pointer on top,
 	 * and a switch16 pointer on top of the current frame. */
 	call setup_x86_bogus_stack
 	RESTORE_REGS
 	lss 20+4(%esp), %esp	# switch to 16bit stack
-1:	iret
+1:	IRET
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
@@ -307,7 +307,7 @@ work_pending:
 	jz work_notifysig
 work_resched:
 	call schedule
-	cli				# make sure we don't miss an interrupt
+	CLI				# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -359,7 +359,7 @@ syscall_trace_entry:
 syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
-	sti				# could let do_syscall_trace() call
+	STI				# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
 	movl $1, %edx
@@ -479,7 +479,7 @@ ENTRY(simd_coprocessor_error)
 ENTRY(device_not_available)
 	pushl $-1			# mark this as an int
 	SAVE_ALL
-	movl %cr0, %eax
+	GET_CR0
 	testl $0x4, %eax		# EM (math emulation bit)
 	jne device_not_available_emulate
 	preempt_stop
@@ -587,7 +587,7 @@ nmi_16bit_stack:
 	call do_nmi
 	RESTORE_REGS
 	lss 12+4(%esp), %esp		# back to 16bit stack
-1:	iret
+1:	IRET
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/arch/i386/kernel/ioport.c tmp2/arch/i386/kernel/ioport.c
--- tmp/arch/i386/kernel/ioport.c	2006-03-23 12:44:26.000000000 +1100
+++ tmp2/arch/i386/kernel/ioport.c	2006-05-02 15:58:15.000000000 +1000
@@ -16,6 +16,7 @@
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <linux/thread_info.h>
+#include <asm/desc.h>
 
 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
 static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/arch/i386/kernel/traps.c tmp2/arch/i386/kernel/traps.c
--- tmp/arch/i386/kernel/traps.c	2006-05-02 15:57:41.000000000 +1000
+++ tmp2/arch/i386/kernel/traps.c	2006-05-02 15:58:15.000000000 +1000
@@ -1086,20 +1086,6 @@ void __init trap_init_f00f_bug(void)
 }
 #endif
 
-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
-  int __d0, __d1; \
-  __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
-	"movw %4,%%dx\n\t" \
-	"movl %%eax,%0\n\t" \
-	"movl %%edx,%1" \
-	:"=m" (*((long *) (gate_addr))), \
-	 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
-	:"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
-	 "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
 /*
  * This needs to use 'idt_table' rather than 'idt', and
  * thus use the _nonmapped_ version of the IDT, as the
@@ -1108,7 +1094,7 @@ do { \
  */
 void set_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
 }
 
 /*
@@ -1116,22 +1102,22 @@ void set_intr_gate(unsigned int n, void
  */
 static inline void set_system_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
+	_set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
 }
 
 static void __init set_trap_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
 }
 
 static void __init set_system_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
 }
 
 static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
 {
-	_set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+	_set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
 }
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/desc.h tmp2/include/asm-i386/desc.h
--- tmp/include/asm-i386/desc.h	2006-03-23 12:44:59.000000000 +1100
+++ tmp2/include/asm-i386/desc.h	2006-05-02 15:58:15.000000000 +1000
@@ -33,50 +33,66 @@ static inline struct desc_struct *get_cp
 	return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
 }
 
-#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
-
-#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
-#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
-#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
-
-#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
-#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
-#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
-#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
-
 /*
  * This is the ldt that every process will get unless we need
  * something other than this.
  */
 extern struct desc_struct default_ldt[];
+extern struct desc_struct idt_table[];
 extern void set_intr_gate(unsigned int irq, void * addr);
 
-#define _set_tssldt_desc(n,addr,limit,type) \
-__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
-	"movw %w1,2(%2)\n\t" \
-	"rorl $16,%1\n\t" \
-	"movb %b1,4(%2)\n\t" \
-	"movb %4,5(%2)\n\t" \
-	"movb $0,6(%2)\n\t" \
-	"movb %h1,7(%2)\n\t" \
-	"rorl $16,%1" \
-	: "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
+static inline void pack_descriptor(__u32 *a, __u32 *b,
+	unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
+{
+	*a = ((base & 0xffff) << 16) | (limit & 0xffff);
+	*b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+		((type & 0xff) << 8) | ((flags & 0xf) << 12);
+}
 
-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
+static inline void pack_gate(__u32 *a, __u32 *b,
+	unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
 {
-	_set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
-		offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
+	*a = (seg << 16) | (base & 0xffff);
+	*b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
 }
 
-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+#define DESCTYPE_LDT	0x82	/* present, system, DPL-0, LDT */
+#define DESCTYPE_TSS	0x89	/* present, system, DPL-0, 32-bit TSS */
+#define DESCTYPE_TASK	0x85	/* present, system, DPL-0, task gate */
+#define DESCTYPE_INT	0x8e	/* present, system, DPL-0, interrupt gate */
+#define DESCTYPE_TRAP	0x8f	/* present, system, DPL-0, trap gate */
+#define DESCTYPE_DPL3	0x60	/* DPL-3 */
+#define DESCTYPE_S	0x10	/* !system */
 
-static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
+#include <asm/paravirt_desc.h>
+
+static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
 {
-	_set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
+	__u32 a, b;
+	pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
+	write_idt_entry(idt_table, gate, a, b);
+}
+
+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
+{
+	__u32 a, b;
+	pack_descriptor(&a, &b, (unsigned long)addr,
+		offsetof(struct tss_struct, __cacheline_filler) - 1,
+		DESCTYPE_TSS, 0);
+	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
+}
+
+static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
+{
+	__u32 a, b;
+	pack_descriptor(&a, &b, (unsigned long)addr,
+		entries * sizeof(struct desc_struct) - 1,
+		DESCTYPE_LDT, 0);
+	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
 }
 
+#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
 #define LDT_entry_a(info) \
 	((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
@@ -102,24 +118,6 @@ static inline void set_ldt_desc(unsigned
 	(info)->seg_not_present	== 1 && \
 	(info)->useable		== 0	)
 
-static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
-{
-	__u32 *lp = (__u32 *)((char *)ldt + entry*8);
-	*lp	= entry_a;
-	*(lp+1)	= entry_b;
-}
-
-#if TLS_SIZE != 24
-# error update this code.
-#endif
-
-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
-{
-#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
-	C(0); C(1); C(2);
-#undef C
-}
-
 static inline void clear_LDT(void)
 {
 	int cpu = get_cpu();
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/no_paravirt.h tmp2/include/asm-i386/no_paravirt.h
--- tmp/include/asm-i386/no_paravirt.h	1970-01-01 10:00:00.000000000 +1000
+++ tmp2/include/asm-i386/no_paravirt.h	2006-05-02 15:58:15.000000000 +1000
@@ -0,0 +1,141 @@
+#ifndef __ASM_NO_PARAVIRT_H
+#define __ASM_NO_PARAVIRT_H
+/* This is the native implementation of the paravirtualized
+ * instruction wrappers. */
+
+#ifndef __ASSEMBLY__
+/* The non-paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+			   unsigned int *ecx, unsigned int *edx)
+{
+	/* ecx is often an input as well: see processor.h. */
+	__asm__("cpuid"
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), "2" (*ecx));
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+	__asm__("movl %%db" #register ", %0" \
+		:"=r" (var))
+#define set_debugreg(value, register) \
+	__asm__("movl %0,%%db" #register \
+		: /* no output */ \
+		:"r" (value))
+
+/* Stop speculative execution */
+static inline void sync_core(void)
+{
+	unsigned int eax = 1, ebx, ecx, edx;
+	__cpuid(&eax, &ebx, &ecx, &edx);
+}
+
+/*
+ * Clear and set 'TS' bit respectively
+ */
+#define clts() __asm__ __volatile__ ("clts")
+#define read_cr0() ({ \
+	unsigned int __dummy; \
+	__asm__ __volatile__( \
+		"movl %%cr0,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr0(x) \
+	__asm__ __volatile__("movl %0,%%cr0": :"r" (x));
+
+#define read_cr2() ({ \
+	unsigned int __dummy; \
+	__asm__ __volatile__( \
+		"movl %%cr2,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr2(x) \
+	__asm__ __volatile__("movl %0,%%cr2": :"r" (x));
+
+#define read_cr3() ({ \
+	unsigned int __dummy; \
+	__asm__ ( \
+		"movl %%cr3,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr3(x) \
+	__asm__ __volatile__("movl %0,%%cr3": :"r" (x));
+
+#define read_cr4() ({ \
+	unsigned int __dummy; \
+	__asm__( \
+		"movl %%cr4,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+
+#define read_cr4_safe() ({ \
+	unsigned int __dummy; \
+	/* This could fault if %cr4 does not exist */ \
+	__asm__("1: movl %%cr4, %0	\n" \
+		"2:			\n" \
+		".section __ex_table,\"a\"	\n" \
+		".long 1b,2b		\n" \
+		".previous		\n" \
+		: "=r" (__dummy): "0" (0)); \
+	__dummy; \
+})
+
+#define write_cr4(x) \
+	__asm__ __volatile__("movl %0,%%cr4": :"r" (x));
+
+static inline unsigned long __local_save_flags(void)
+{
+	unsigned long f;
+	__asm__ __volatile__("pushfl ; popl %0":"=g" (f): /* no input */);
+	return f;
+}
+
+static inline void __local_irq_restore(unsigned long f)
+{
+	__asm__ __volatile__("pushl %0 ; popfl": /* no output */
+			     :"g" (f)
+			     :"memory", "cc");
+}
+
+#define local_irq_disable()	__asm__ __volatile__("cli": : :"memory")
+#define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
+/* used in the idle loop; sti takes one instruction cycle to complete */
+#define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
+/* used when interrupts are already enabled or to shutdown the processor */
+#define halt()			__asm__ __volatile__("hlt": : :"memory")
+#define wbinvd()		__asm__ __volatile__("wbinvd": : :"memory");
+
+#define get_kernel_rpl()  0
+
+#else /* ... __ASSEMBLY__ */
+#define IRET		iret
+#define CLI		cli
+#define STI		sti
+#define STI_SYSEXIT	sti; sysexit
+#define GET_CR0		mov %cr0, %eax
+#define WRMSR		wrmsr
+#define RDMSR		rdmsr
+#define CPUID		cpuid
+
+#define COMPARE_SEGMENT_STACK(segment, offset) \
+	cmpw $segment, offset(%esp);
+
+#define COMPARE_SEGMENT_REG(segment, reg) \
+	pushl %eax; \
+	mov reg, %eax; \
+	cmpw $segment,%ax; \
+	popl %eax;
+#endif /* __ASSEMBLY__ */
+
+#define CLI_STRING	"cli"
+#define STI_STRING	"sti"
+#endif /* __ASM_NO_PARAVIRT_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/no_paravirt_desc.h tmp2/include/asm-i386/no_paravirt_desc.h
--- tmp/include/asm-i386/no_paravirt_desc.h	1970-01-01 10:00:00.000000000 +1000
+++ tmp2/include/asm-i386/no_paravirt_desc.h	2006-05-02 15:58:15.000000000 +1000
@@ -0,0 +1,57 @@
+#ifndef __ASM_NO_PARAVIRT_DESC_H
+#define __ASM_NO_PARAVIRT_DESC_H
+/* The GDT instructions are here, not in paravirt.h because they need
+ * processor.h, which needs paravirt.h... */
+
+#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
+#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
+
+#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
+#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
+#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
+#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
+
+#if TLS_SIZE != 24
+# error update this code.
+#endif
+
+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+	C(0); C(1); C(2);
+#undef C
+}
+
+static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
+{
+	__u32 *lp = (__u32 *)((char *)dt + entry*8);
+	*lp = entry_a;
+	*(lp+1) = entry_b;
+}
+
+#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void set_iopl_mask(unsigned mask)
+{
+	unsigned int reg;
+	__asm__ __volatile__ ("pushfl;"
+			      "popl %0;"
+			      "andl %1, %0;"
+			      "orl %2, %0;"
+			      "pushl %0;"
+			      "popfl"
+				: "=&r" (reg)
+				: "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+
+#endif /* __ASM_NO_PARAVIRT_DESC_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/paravirt.h tmp2/include/asm-i386/paravirt.h
--- tmp/include/asm-i386/paravirt.h	1970-01-01 10:00:00.000000000 +1000
+++ tmp2/include/asm-i386/paravirt.h	2006-05-02 15:58:15.000000000 +1000
@@ -0,0 +1,7 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <asm/no_paravirt.h>
+
+#endif /* __ASM_PARAVIRT_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/paravirt_desc.h tmp2/include/asm-i386/paravirt_desc.h
--- tmp/include/asm-i386/paravirt_desc.h	1970-01-01 10:00:00.000000000 +1000
+++ tmp2/include/asm-i386/paravirt_desc.h	2006-05-02 15:58:15.000000000 +1000
@@ -0,0 +1,6 @@
+#ifndef __ASM_PARAVIRT_DESC_H
+#define __ASM_PARAVIRT_DESC_H
+/* A separate header because they need processor.h, which needs paravirt.h */
+#include <asm/no_paravirt_desc.h>
+
+#endif /* __ASM_PARAVIRT_DESC_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/processor.h tmp2/include/asm-i386/processor.h
--- tmp/include/asm-i386/processor.h	2006-04-21 12:05:46.000000000 +1000
+++ tmp2/include/asm-i386/processor.h	2006-05-02 15:58:15.000000000 +1000
@@ -21,6 +21,7 @@
 #include <linux/threads.h>
 #include <asm/percpu.h>
 #include <linux/cpumask.h>
+#include <asm/paravirt.h>
 
 /* flag for disabling the tsc */
 extern int tsc_disable;
@@ -148,24 +149,18 @@ static inline void detect_ht(struct cpui
  */
 static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
 {
-	__asm__("cpuid"
-		: "=a" (*eax),
-		  "=b" (*ebx),
-		  "=c" (*ecx),
-		  "=d" (*edx)
-		: "0" (op), "c"(0));
+	*eax = op;
+	*ecx = 0;
+	__cpuid(eax, ebx, ecx, edx);
 }
 
 /* Some CPUID calls want 'count' to be placed in ecx */
 static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-		int *edx)
+	       int *edx)
 {
-	__asm__("cpuid"
-		: "=a" (*eax),
-		  "=b" (*ebx),
-		  "=c" (*ecx),
-		  "=d" (*edx)
-		: "0" (op), "c" (count));
+	*eax = op;
+	*ecx = count;
+	__cpuid(eax, ebx, ecx, edx);
 }
 
 /*
@@ -173,42 +168,30 @@ static inline void cpuid_count(int op, i
  */
 static inline unsigned int cpuid_eax(unsigned int op)
 {
-	unsigned int eax;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__("cpuid"
-		: "=a" (eax)
-		: "0" (op)
-		: "bx", "cx", "dx");
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return eax;
 }
 static inline unsigned int cpuid_ebx(unsigned int op)
 {
-	unsigned int eax, ebx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__("cpuid"
-		: "=a" (eax), "=b" (ebx)
-		: "0" (op)
-		: "cx", "dx" );
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return ebx;
 }
 static inline unsigned int cpuid_ecx(unsigned int op)
 {
-	unsigned int eax, ecx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__("cpuid"
-		: "=a" (eax), "=c" (ecx)
-		: "0" (op)
-		: "bx", "dx" );
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return ecx;
 }
 static inline unsigned int cpuid_edx(unsigned int op)
 {
-	unsigned int eax, edx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__("cpuid"
-		: "=a" (eax), "=d" (edx)
-		: "0" (op)
-		: "bx", "cx");
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return edx;
 }
 
@@ -286,13 +269,6 @@ static inline void clear_in_cr4 (unsigne
 	outb((data), 0x23); \
 } while (0)
 
-/* Stop speculative execution */
-static inline void sync_core(void)
-{
-	int tmp;
-	asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
-}
-
 static inline void __monitor(const void *eax, unsigned long ecx,
 		unsigned long edx)
 {
@@ -512,33 +488,6 @@ static inline void load_esp0(struct tss_
 	regs->esp = new_esp; \
 } while (0)
 
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register) \
-	__asm__("movl %%db" #register ", %0" \
-		:"=r" (var))
-#define set_debugreg(value, register) \
-	__asm__("movl %0,%%db" #register \
-		: /* no output */ \
-		:"r" (value))
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void set_iopl_mask(unsigned mask)
-{
-	unsigned int reg;
-	__asm__ __volatile__ ("pushfl;"
-			      "popl %0;"
-			      "andl %1, %0;"
-			      "orl %2, %0;"
-			      "pushl %0;"
-			      "popfl"
-				: "=&r" (reg)
-				: "i" (~X86_EFLAGS_IOPL), "r" (mask));
-}
-
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/segment.h tmp2/include/asm-i386/segment.h
--- tmp/include/asm-i386/segment.h	2006-05-02 15:57:41.000000000 +1000
+++ tmp2/include/asm-i386/segment.h	2006-05-02 15:58:15.000000000 +1000
@@ -112,15 +112,5 @@
 
 /* Bottom three bits of xcs give the ring privilege level */
 #define SEGMENT_RPL_MASK	0x3
 
-#define get_kernel_rpl()  0
-
-#define COMPARE_SEGMENT_STACK(segment, offset) \
-	cmpw $segment, offset(%esp);
-
-#define COMPARE_SEGMENT_REG(segment, reg) \
-	pushl %eax; \
-	mov reg, %eax; \
-	cmpw $segment,%ax; \
-	popl %eax;
-
 #endif
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/spinlock.h tmp2/include/asm-i386/spinlock.h
--- tmp/include/asm-i386/spinlock.h	2006-04-21 12:05:46.000000000 +1000
+++ tmp2/include/asm-i386/spinlock.h	2006-05-02 15:58:15.000000000 +1000
@@ -6,6 +6,7 @@
 #include <asm/page.h>
 #include <linux/config.h>
 #include <linux/compiler.h>
+#include <asm/paravirt.h>
 
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
@@ -39,12 +40,12 @@
 	"2:\t" \
 	"testl $0x200, %1\n\t" \
 	"jz 4f\n\t" \
-	"sti\n" \
+	STI_STRING "\n" \
 	"3:\t" \
 	"rep;nop\n\t" \
 	"cmpb $0, %0\n\t" \
 	"jle 3b\n\t" \
-	"cli\n\t" \
+	CLI_STRING "\n\t" \
 	"jmp 1b\n" \
 	"4:\t" \
 	"rep;nop\n\t" \
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal tmp/include/asm-i386/system.h tmp2/include/asm-i386/system.h
--- tmp/include/asm-i386/system.h	2006-04-21 12:05:46.000000000 +1000
+++ tmp2/include/asm-i386/system.h	2006-05-02 15:58:15.000000000 +1000
@@ -6,6 +6,7 @@
 #include <asm/segment.h>
 #include <asm/cpufeature.h>
 #include <linux/bitops.h> /* for LOCK_PREFIX */
+#include <asm/paravirt.h>
 
 #ifdef __KERNEL__
 
@@ -83,69 +84,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define savesegment(seg, value) \
 	asm volatile("mov %%" #seg ",%0":"=rm" (value))
 
-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
-#define read_cr0() ({ \
-	unsigned int __dummy; \
-	__asm__ __volatile__( \
-		"movl %%cr0,%0\n\t" \
-		:"=r" (__dummy)); \
-	__dummy; \
-})
-#define write_cr0(x) \
-	__asm__ __volatile__("movl %0,%%cr0": :"r" (x));
-
-#define read_cr2() ({ \
-	unsigned int __dummy; \
-	__asm__ __volatile__( \
-		"movl %%cr2,%0\n\t" \
-		:"=r" (__dummy)); \
-	__dummy; \
-})
-#define write_cr2(x) \
-	__asm__ __volatile__("movl %0,%%cr2": :"r" (x));
-
-#define read_cr3() ({ \
-	unsigned int __dummy; \
-	__asm__ ( \
-		"movl %%cr3,%0\n\t" \
-		:"=r" (__dummy)); \
-	__dummy; \
-})
-#define write_cr3(x) \
-	__asm__ __volatile__("movl %0,%%cr3": :"r" (x));
-
-#define read_cr4() ({ \
-	unsigned int __dummy; \
-	__asm__( \
-		"movl %%cr4,%0\n\t" \
-		:"=r" (__dummy)); \
-	__dummy; \
-})
-
-#define read_cr4_safe() ({ \
-	unsigned int __dummy; \
-	/* This could fault if %cr4 does not exist */ \
-	__asm__("1: movl %%cr4, %0	\n" \
-		"2:			\n" \
-		".section __ex_table,\"a\"	\n" \
-		".long 1b,2b		\n" \
-		".previous		\n" \
-		: "=r" (__dummy): "0" (0)); \
-	__dummy; \
-})
-
-#define write_cr4(x) \
-	__asm__ __volatile__("movl %0,%%cr4": :"r" (x));
 #define stts() write_cr0(8 | read_cr0())
-
 #endif	/* __KERNEL__ */
 
-#define wbinvd() \
-	__asm__ __volatile__ ("wbinvd": : :"memory");
-
 static inline unsigned long get_limit(unsigned long segment)
 {
 	unsigned long __limit;
@@ -457,16 +398,6 @@ static inline unsigned long long __cmpxc
 
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)
 
-/* interrupt control.. */
-#define local_save_flags(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0)
-#define local_irq_restore(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0)
-#define local_irq_disable()	__asm__ __volatile__("cli": : :"memory")
-#define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
-/* used in the idle loop; sti takes one instruction cycle to complete */
-#define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
-/* used when interrupts are already enabled or to shutdown the processor */
-#define halt()			__asm__ __volatile__("hlt": : :"memory")
-
 #define irqs_disabled() \
 ({ \
 	unsigned long flags; \
@@ -475,7 +406,9 @@ static inline unsigned long long __cmpxc
 })
 
 /* For spinlocks etc */
-#define local_irq_save(x)	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory")
+#define local_irq_save(x)	do { local_save_flags(x); local_irq_disable(); } while (0)
+#define local_save_flags(x)	do { typecheck(unsigned long,x); (x) = __local_save_flags(); } while (0)
+#define local_irq_restore(x)	do { typecheck(unsigned long,x); __local_irq_restore(x); } while (0)
 
 /*
  * disable hlt during certain critical i/o operations

-- 
ccontrol: http://ccontrol.ozlabs.org
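To see what the macro indirection buys, it helps to sketch the header a
hypervisor port could drop in where no_paravirt.h sits today.  This is
a hypothetical illustration only: the stub names (hyper_irq_disable and
friends) are invented here and do not come from the Xen or VMI patches.

	/* Hypothetical paravirt.h for some hypervisor -- a sketch only.
	 * Each "sensitive" instruction the patch wrapped becomes a call
	 * into a stub that asks the hypervisor to do the privileged
	 * work on the guest's behalf. */
	#ifndef __ASM_PARAVIRT_H
	#define __ASM_PARAVIRT_H
	#ifdef __ASSEMBLY__
	#define CLI		call hyper_irq_disable
	#define STI		call hyper_irq_enable
	#define IRET		jmp hyper_iret		/* never returns */
	#define STI_SYSEXIT	jmp hyper_sti_sysexit	/* never returns */
	#define GET_CR0		call hyper_read_cr0	/* result in %eax */
	#else
	/* Strings for use inside inline asm, e.g. the spinlock loop. */
	#define CLI_STRING	"call hyper_irq_disable"
	#define STI_STRING	"call hyper_irq_enable"
	#endif /* __ASSEMBLY__ */
	#endif /* __ASM_PARAVIRT_H */

Since the native macros expand to the bare instructions, a kernel built
against no_paravirt.h produces the same object code as before -- which
is the "no effect on the compiled kernel result" claim above.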
Hi all,

	I've been looking at finding common ground between the VMI, Xen and
other paravirtualization approaches, and after some discussion, we're
getting somewhere.

	These first two patches are the fundamentals, stolen mainly from the
VMI patches: removing assumptions about the kernel running in ring 0,
and macro-izing all the obvious para-virtualize-needing insns.  The
third patch is more ambitious: it introduces a "paravirt_ops" structure
(a-la PPC's ppc_md) through which all these ops are indirected.  This
should allow Xen, VMI and other variants to build on a common base.

	These patches also live at
http://kernel.org/pub/linux/kernel/people/rusty/Paravirt

Feedback welcome!
Rusty.

Name: Kernel Ring Cleanups
Status: Booted on 2.6.16-rc2-git7
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

This is Zach's patch to clean up assumptions about the kernel running
in ring 0 (which it doesn't when running paravirtualized).

1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3
2) Add a get_kernel_rpl() function
3) Create COMPARE_SEGMENT_STACK and COMPARE_SEGMENT_REG macros which
   can mask out the bottom two bits (RPL) when comparing for
   paravirtualization.

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/arch/i386/kernel/entry.S tmp/arch/i386/kernel/entry.S
--- linux-2.6.17-rc2-git7/arch/i386/kernel/entry.S	2006-04-21 12:05:02.000000000 +1000
+++ tmp/arch/i386/kernel/entry.S	2006-05-02 16:00:17.000000000 +1000
@@ -144,9 +144,11 @@ ret_from_exception:
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
+	andl $VM_MASK, %eax
 	movb CS(%esp), %al
-	testl $(VM_MASK | 3), %eax
-	jz resume_kernel
+	andb $SEGMENT_RPL_MASK, %al
+	cmpl $SEGMENT_RPL_MASK, %eax
+	jb resume_kernel		# returning to kernel or vm86-space
 ENTRY(resume_userspace)
 	cli				# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -386,17 +388,14 @@ syscall_badsys:
 	/* put ESP to the proper location */ \
 	movl %eax, %esp;
 #define UNWIND_ESPFIX_STACK \
-	pushl %eax; \
-	movl %ss, %eax; \
-	/* see if on 16bit stack */ \
-	cmpw $__ESPFIX_SS, %ax; \
+	COMPARE_SEGMENT_REG(__ESPFIX_SS, %ss); \
 	jne 28f; \
-	movl $__KERNEL_DS, %edx; \
+	movl $__USER_DS, %edx; \
 	movl %edx, %ds; \
 	movl %edx, %es; \
 	/* switch to 32bit stack */ \
 	FIXUP_ESPFIX_STACK \
-28:	popl %eax;
+28:;
 
 /*
  * Build the entry stubs and pointer table with
@@ -455,6 +454,7 @@ error_code:
 	pushl %es
 	UNWIND_ESPFIX_STACK
 	popl %ecx
+	movl EAX(%esp), %eax
 	movl ES(%esp), %edi		# get the function address
 	movl ORIG_EAX(%esp), %edx	# get the error code
 	movl %eax, ORIG_EAX(%esp)
@@ -505,12 +505,12 @@ device_not_available_emulate:
  * the instruction that would have done it for sysenter.
  */
 #define FIX_STACK(offset, ok, label) \
-	cmpw $__KERNEL_CS,4(%esp); \
+	COMPARE_SEGMENT_STACK(__KERNEL_CS, 4); \
 	jne ok; \
 label: \
 	movl TSS_sysenter_esp0+offset(%esp),%esp; \
 	pushfl; \
-	pushl $__KERNEL_CS; \
+	push %cs; \
 	pushl $sysenter_past_esp
 
 KPROBE_ENTRY(debug)
@@ -534,10 +534,7 @@ debug_stack_correct:
  * fault happened on the sysenter path.
  */
 ENTRY(nmi)
-	pushl %eax
-	movl %ss, %eax
-	cmpw $__ESPFIX_SS, %ax
-	popl %eax
+	COMPARE_SEGMENT_REG(__ESPFIX_SS, %ss)
 	je nmi_16bit_stack
 	cmpl $sysenter_entry,(%esp)
 	je nmi_stack_fixup
@@ -564,7 +561,7 @@ nmi_stack_fixup:
 	FIX_STACK(12,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
 nmi_debug_stack_check:
-	cmpw $__KERNEL_CS,16(%esp)
+	COMPARE_SEGMENT_STACK(__KERNEL_CS, 16)
 	jne nmi_stack_correct
 	cmpl $debug,(%esp)
 	jb nmi_stack_correct
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/arch/i386/kernel/process.c tmp/arch/i386/kernel/process.c
--- linux-2.6.17-rc2-git7/arch/i386/kernel/process.c	2006-04-21 12:05:02.000000000 +1000
+++ tmp/arch/i386/kernel/process.c	2006-05-02 15:57:41.000000000 +1000
@@ -347,7 +347,7 @@ int kernel_thread(int (*fn)(void *), voi
 	regs.xes = __USER_DS;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
-	regs.xcs = __KERNEL_CS;
+	regs.xcs = __KERNEL_CS | get_kernel_rpl();
 	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
 
 	/* Ok, create the new process.. */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/arch/i386/kernel/traps.c tmp/arch/i386/kernel/traps.c
--- linux-2.6.17-rc2-git7/arch/i386/kernel/traps.c	2006-04-21 12:05:02.000000000 +1000
+++ tmp/arch/i386/kernel/traps.c	2006-05-02 15:57:41.000000000 +1000
@@ -1013,10 +1013,10 @@ fastcall void setup_x86_bogus_stack(unsi
 	memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
 	/* fill in the switch pointers */
 	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
-	switch16_ptr[1] = __ESPFIX_SS;
+	switch16_ptr[1] = __ESPFIX_SS | get_kernel_rpl();
 	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
 		8 - CPU_16BIT_STACK_SIZE;
-	switch32_ptr[1] = __KERNEL_DS;
+	switch32_ptr[1] = __KERNEL_DS | get_kernel_rpl();
 }
 
 fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/include/asm-i386/ptrace.h tmp/include/asm-i386/ptrace.h
--- linux-2.6.17-rc2-git7/include/asm-i386/ptrace.h	2006-03-23 12:44:59.000000000 +1100
+++ tmp/include/asm-i386/ptrace.h	2006-05-02 15:57:41.000000000 +1000
@@ -60,6 +60,7 @@ struct pt_regs {
 #ifdef __KERNEL__
 
 #include <asm/vm86.h>
+#include <asm/segment.h>
 
 struct task_struct;
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
@@ -73,11 +74,11 @@ extern void send_sigtrap(struct task_str
  */
 static inline int user_mode(struct pt_regs *regs)
 {
-	return (regs->xcs & 3) != 0;
+	return (regs->xcs & SEGMENT_RPL_MASK) == 3;
 }
 static inline int user_mode_vm(struct pt_regs *regs)
 {
-	return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0;
+	return (((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= 3);
 }
 #define instruction_pointer(regs) ((regs)->eip)
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/include/asm-i386/segment.h tmp/include/asm-i386/segment.h
--- linux-2.6.17-rc2-git7/include/asm-i386/segment.h	2006-03-23 12:44:59.000000000 +1100
+++ tmp/include/asm-i386/segment.h	2006-05-02 15:57:41.000000000 +1000
@@ -112,4 +112,18 @@
  */
 #define IDT_ENTRIES 256
 
+/* Bottom three bits of xcs give the ring privilege level */
+#define SEGMENT_RPL_MASK	0x3
+
+#define get_kernel_rpl()  0
+
+#define COMPARE_SEGMENT_STACK(segment, offset) \
+	cmpw $segment, offset(%esp);
+
+#define COMPARE_SEGMENT_REG(segment, reg) \
+	pushl %eax; \
+	mov reg, %eax; \
+	cmpw $segment,%ax; \
+	popl %eax;
+
 #endif

-- 
ccontrol: http://ccontrol.ozlabs.org
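Note that the native COMPARE_SEGMENT_STACK above is just a bare cmpw;
the "mask out the bottom two bits" in point 3 only comes into play once
a paravirtualized kernel actually runs at a nonzero RPL.  A
hypothetical paravirt replacement (illustration only, not from these
patches) might look like:

	/* Sketch only: strip the RPL bits from the saved selector
	 * before comparing, so $__KERNEL_CS (RPL 0) still matches a
	 * kernel running at RPL 1.  offset+4 compensates for the
	 * pushl %eax, and popl preserves the flags from cmpw, so the
	 * caller's jne/je still work unchanged. */
	#define COMPARE_SEGMENT_STACK(segment, offset) \
		pushl %eax; \
		movl offset+4(%esp), %eax; \
		andl $~SEGMENT_RPL_MASK, %eax; \
		cmpw $segment, %ax; \
		popl %eax;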