(And Andi's request, I drew out the noop-writing part of alternative.c, but unfortunately not much else can be shared.) ==It turns out that the most called ops, by several orders of magnitude, are the interrupt manipulation ops. These are obvious candidates for patching, so mark them up and create infrastructure for it. The method used is that the ops structure has a patch function, which is called for each place which needs to be patched: this returns a number of instructions (the rest are NOP-padded). Usually we can spare a register (%eax) for the binary patched code to use, but in a couple of critical places in entry.S we can't: we make the clobbers explicit at the call site, and manually clobber the allowed registers in debug mode as an extra check. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Zachary Amsden <zach@vmware.com> ==================================================================--- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -123,6 +123,20 @@ static unsigned char** find_nop_table(vo #endif /* CONFIG_X86_64 */ +static void nop_out(void *insns, unsigned int len) +{ + unsigned char **noptable = find_nop_table(); + + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, noptable[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; @@ -137,10 +151,9 @@ extern u8 __smp_alt_begin[], __smp_alt_e void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { - unsigned char **noptable = find_nop_table(); struct alt_instr *a; u8 *instr; - int diff, i, k; + int diff; DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); for (a = start; a < end; a++) { @@ -158,13 +171,7 @@ void apply_alternatives(struct alt_instr #endif memcpy(instr, a->replacement, a->replacementlen); diff = a->instrlen - a->replacementlen; - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - memcpy(a->instr + i, noptable[k], k); - } + nop_out(instr + a->replacementlen, diff); } } @@ -208,7 +215,6 @@ static void alternatives_smp_lock(u8 **s static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) { - unsigned char **noptable = find_nop_table(); u8 **ptr; for (ptr = start; ptr < end; ptr++) { @@ -216,7 +222,7 @@ static void alternatives_smp_unlock(u8 * continue; if (*ptr > text_end) continue; - **ptr = noptable[1][0]; + nop_out(*ptr, 1); }; } @@ -341,6 +347,45 @@ void alternatives_smp_switch(int smp) } #endif + +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{ + struct paravirt_patch *p; + int i; + + local_irq_disable(); + for (p = start; p < end; p++) { + unsigned int used; + + used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, + p->len); +#ifdef CONFIG_DEBUG_KERNEL + /* Deliberately clobber regs using "not %reg" to find bugs. 
*/ + for (i = 0; i < 3; i++) { + if (p->len - used >= 2 && (p->clobbers & (1 << i))) { + memcpy(p->instr + used, "\xf7\xd0", 2); + p->instr[used+1] |= i; + used += 2; + } + } +#endif + /* Pad the rest with nops */ + nop_out(p->instr + used, p->len - used); + } + + /* Sync in case we patched this coming "sti" last. */ + sync_core(); + local_irq_enable(); +} +extern struct paravirt_patch __start_parainstructions[], + __stop_parainstructions[]; +#else +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{ +} +extern struct paravirt_patch *__start_parainstructions, *__stop_parainstructions; +#endif /* CONFIG_PARAVIRT */ void __init alternative_instructions(void) { @@ -386,4 +430,6 @@ void __init alternative_instructions(voi alternatives_smp_switch(0); } #endif -} + + apply_paravirt(__start_parainstructions, __stop_parainstructions); +} ==================================================================--- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -77,9 +77,9 @@ VM_MASK = 0x00020000 VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else -#define preempt_stop +#define preempt_stop(clobbers) #define resume_kernel restore_nocheck #endif @@ -223,7 +223,7 @@ ENTRY(ret_from_fork) ALIGN RING0_PTREGS_FRAME ret_from_exception: - preempt_stop + preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: @@ -233,7 +233,7 @@ check_userspace: cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -244,7 +244,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: @@ -272,7 +272,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -317,7 +317,7 @@ 1: movl (%ebp),%ebp jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -352,7 +352,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) # store the return value syscall_exit: - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -381,7 +381,7 @@ 1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -405,7 +405,7 @@ ldt_ss: * dosemu and wine happy. 
*/ subl $8, %esp # reserve space for switch16 pointer CFI_ADJUST_CFA_OFFSET 8 - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_EAX) TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, @@ -430,7 +430,7 @@ work_pending: jz work_notifysig work_resched: call schedule - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -486,7 +486,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -667,7 +667,7 @@ ENTRY(device_not_available) GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate - preempt_stop + preempt_stop(CLBR_ANY) call math_state_restore jmp ret_from_exception device_not_available_emulate: ==================================================================--- a/arch/i386/kernel/module.c +++ b/arch/i386/kernel/module.c @@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -132,6 +135,10 @@ int module_finalize(const Elf_Ehdr *hdr, lseg, lseg + locks->sh_size, tseg, tseg + text->sh_size); } + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } return 0; } ==================================================================--- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -322,8 +322,52 @@ extern fastcall void native_iret(void); extern fastcall void native_iret(void); extern fastcall void native_irq_enable_sysexit(void); +/* Simple instruction patching code. 
*/ +#define DEF_NATIVE(name, code) \ + extern const char start_##name[], end_##name[]; \ + asm("start_" #name ": " code "; end_" #name ":") +DEF_NATIVE(cli, "cli"); +DEF_NATIVE(sti, "sti"); +DEF_NATIVE(popf, "push %eax; popf"); +DEF_NATIVE(pushf, "pushf; pop %eax"); +DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); +DEF_NATIVE(iret, "iret"); +DEF_NATIVE(sti_sysexit, "sti; sysexit"); + +static const struct native_insns +{ + const char *start, *end; +} native_insns[] = { + [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli }, + [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti }, + [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf }, + [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf }, + [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli }, + [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, + [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit }, +}; + +static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + unsigned int insn_len; + + /* Don't touch it if we don't have a replacement */ + if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) + return len; + + insn_len = native_insns[type].end - native_insns[type].start; + + /* Similarly if we can't fit replacement. */ + if (len < insn_len) + return len; + + memcpy(insns, native_insns[type].start, insn_len); + return insn_len; +} + struct paravirt_ops paravirt_ops = { .kernel_rpl = 0, + .patch = native_patch, .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, ==================================================================--- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -152,6 +152,12 @@ SECTIONS .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } + . = ALIGN(4); + __start_parainstructions = .; + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + *(.parainstructions) + } + __stop_parainstructions = .; /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } ==================================================================--- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -138,4 +138,7 @@ static inline void alternatives_smp_swit #define LOCK_PREFIX "" #endif +struct paravirt_patch; +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end); + #endif /* _I386_ALTERNATIVE_H */ ==================================================================--- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -79,8 +79,8 @@ static inline unsigned long __raw_local_ } #else -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti +#define DISABLE_INTERRUPTS(clobbers) cli +#define ENABLE_INTERRUPTS(clobbers) sti #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit #define INTERRUPT_RETURN iret #define GET_CR0_INTO_EAX movl %cr0, %eax ==================================================================--- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -3,14 +3,34 @@ /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. 
*/ #include <linux/linkage.h> +#include <linux/stringify.h> #ifdef CONFIG_PARAVIRT +/* These are the most performance critical ops, so we want to be able to patch + * callers */ +#define PARAVIRT_IRQ_DISABLE 0 +#define PARAVIRT_IRQ_ENABLE 1 +#define PARAVIRT_RESTORE_FLAGS 2 +#define PARAVIRT_SAVE_FLAGS 3 +#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4 +#define PARAVIRT_INTERRUPT_RETURN 5 +#define PARAVIRT_STI_SYSEXIT 6 + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0x0 +#define CLBR_EAX 0x1 +#define CLBR_ECX 0x2 +#define CLBR_EDX 0x4 +#define CLBR_ANY 0x7 + #ifndef __ASSEMBLY__ struct thread_struct; struct Xgt_desc_struct; struct paravirt_ops { unsigned int kernel_rpl; + + unsigned (*patch)(u8 type, u16 clobber, void *firstinsn, unsigned len); /* All the function pointers here are declared as "fastcall" so that we get a specific register-based calling @@ -101,35 +121,6 @@ static inline void __cpuid(unsigned int #define read_cr4() paravirt_ops.read_cr4() #define read_cr4_safe(x) paravirt_ops.read_cr4_safe() #define write_cr4(x) paravirt_ops.write_cr4(x) - -static inline unsigned long __raw_local_save_flags(void) -{ - return paravirt_ops.save_fl(); -} - -static inline void raw_local_irq_restore(unsigned long flags) -{ - return paravirt_ops.restore_fl(flags); -} - -static inline void raw_local_irq_disable(void) -{ - paravirt_ops.irq_disable(); -} - -static inline void raw_local_irq_enable(void) -{ - paravirt_ops.irq_enable(); -} - -static inline unsigned long __raw_local_irq_save(void) -{ - unsigned long flags = paravirt_ops.save_fl(); - - paravirt_ops.irq_disable(); - - return flags; -} static inline void raw_safe_halt(void) { @@ -209,15 +200,130 @@ static inline void halt(void) #define write_idt_entry(dt, entry, a, b) (paravirt_ops.write_idt_entry((dt), (entry), ((u64)a) << 32 | b)) #define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask)) -#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax" -#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax" +/* These all sit in the .parainstructions section to tell us what to patch. 
*/ +struct paravirt_patch { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +#define paravirt_alt(insn_string, typenum, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + " .long 771b\n" \ + " .byte " __stringify(typenum) "\n" \ + " .byte 772b-771b\n" \ + " .short " __stringify(clobber) "\n" \ + ".popsection" + +static inline unsigned long __raw_local_save_flags(void) +{ + unsigned long f; + + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1;" + "popl %%edx; popl %%ecx", + PARAVIRT_SAVE_FLAGS, CLBR_NONE) + : "=a"(f): "m"(paravirt_ops.save_fl) + : "memory", "cc"); + return f; +} + +static inline void raw_local_irq_restore(unsigned long f) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1;" + "popl %%edx; popl %%ecx", + PARAVIRT_RESTORE_FLAGS, CLBR_EAX) + : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f) + : "memory", "cc"); +} + +static inline void raw_local_irq_disable(void) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%0;" + "popl %%edx; popl %%ecx", + PARAVIRT_IRQ_DISABLE, CLBR_EAX) + : : "m" (paravirt_ops.irq_disable) + : "memory", "eax", "cc"); +} + +static inline void raw_local_irq_enable(void) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%0;" + "popl %%edx; popl %%ecx", + PARAVIRT_IRQ_ENABLE, CLBR_EAX) + : : "m" (paravirt_ops.irq_enable) + : "memory", "eax", "cc"); +} + +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long f; + + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1; pushl %%eax;" + "call *%2; popl %%eax;" + "popl %%edx; popl %%ecx", + PARAVIRT_SAVE_FLAGS_IRQ_DISABLE, + CLBR_NONE) + : "=a"(f) + : "m" (paravirt_ops.save_fl), + "m" (paravirt_ops.irq_disable) + : "memory", "cc"); + return f; +} + +#define CLI_STRING paravirt_alt("pushl %ecx; pushl %edx;" \ + "call *paravirt_ops+PARAVIRT_irq_disable;" \ + "popl %edx; popl %ecx", \ + PARAVIRT_IRQ_DISABLE, CLBR_EAX) + +#define STI_STRING paravirt_alt("pushl %ecx; pushl %edx;" \ + "call *paravirt_ops+PARAVIRT_irq_enable;" \ + "popl %edx; popl %ecx", \ + PARAVIRT_IRQ_ENABLE, CLBR_EAX) +#define CLI_STI_CLOBBERS , "%eax" + #else /* __ASSEMBLY__ */ - -#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret -#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax -#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax -#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit -#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0 + +#define PARA_PATCH(ptype, clobbers, ops) \ +771:; \ + ops; \ +772:; \ + .pushsection .parainstructions,"a"; \ + .long 771b; \ + .byte ptype; \ + .byte 772b-771b; \ + .short clobbers; \ + .popsection + +#define INTERRUPT_RETURN \ + PARA_PATCH(PARAVIRT_INTERRUPT_RETURN, CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_iret) + +#define DISABLE_INTERRUPTS(clobbers) \ + PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \ + pushl %ecx; pushl %edx; \ + call *paravirt_ops+PARAVIRT_irq_disable; \ + popl %edx; popl %ecx) \ + +#define ENABLE_INTERRUPTS(clobbers) \ + PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \ + pushl %ecx; pushl %edx; \ + call 
*%cs:paravirt_ops+PARAVIRT_irq_enable; \ + popl %edx; popl %ecx) + +#define ENABLE_INTERRUPTS_SYSEXIT \ + PARA_PATCH(PARAVIRT_STI_SYSEXIT, CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) + +#define GET_CR0_INTO_EAX \ + call *paravirt_ops+PARAVIRT_read_cr0 + #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PARAVIRT */ #endif /* __ASM_PARAVIRT_H */ ==================================================================--- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -22,6 +22,7 @@ #else #define CLI_STRING "cli" #define STI_STRING "sti" +#define CLI_STI_CLOBBERS #endif /* CONFIG_PARAVIRT */ #define __raw_spin_is_locked(x) \ @@ -86,7 +87,8 @@ static inline void __raw_spin_lock_flags alternative_smp( __raw_spin_lock_string_flags, __raw_spin_lock_string_up, - "+m" (lock->slock) : "r" (flags) : "memory"); + "+m" (lock->slock) : "r" (flags) + : "memory" CLI_STI_CLOBBERS); } #endif -- Help! Save Australia from the worst of the DMCA: http://linux.org.au/law
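[The patching flow described above is easiest to see end to end in a tiny userspace simulation: a per-site patch callback either emits a shorter native replacement or declines, and the caller NOP-pads whatever is left of the original call sequence, mirroring nop_out() and native_patch() from the diff. This is only a sketch under simplified assumptions: the fake_* names are invented for illustration, a single 0x90 byte stands in for the optimal NOP table in alternative.c, and the real site records are the struct paravirt_patch entries emitted into .parainstructions.]

#include <stdio.h>
#include <string.h>

struct fake_site {
	unsigned char insn[8];		/* bytes at the call site */
	unsigned char type;		/* e.g. PARAVIRT_IRQ_DISABLE == 0 */
	unsigned char len;		/* length of the original call sequence */
};

/* Stand-in for paravirt_ops.patch: emit a native replacement if it fits,
 * otherwise return len so the site is left alone (indirect call stays). */
static unsigned fake_patch(unsigned char type, unsigned char *insns, unsigned len)
{
	static const unsigned char native_cli[] = { 0xfa };	/* "cli" */

	if (type != 0 || len < sizeof(native_cli))
		return len;
	memcpy(insns, native_cli, sizeof(native_cli));
	return sizeof(native_cli);
}

/* Stand-in for apply_paravirt(): patch, then NOP-pad the remainder
 * (the real code pads via nop_out() using the per-CPU NOP table). */
static void fake_apply(struct fake_site *s)
{
	unsigned used = fake_patch(s->type, s->insn, s->len);

	memset(s->insn + used, 0x90, s->len - used);
}

int main(void)
{
	struct fake_site site = { .type = 0, .len = 7 };
	unsigned i;

	memset(site.insn, 0xcc, sizeof(site.insn));	/* pretend original bytes */
	fake_apply(&site);
	for (i = 0; i < site.len; i++)
		printf("%02x ", site.insn[i]);
	printf("\n");		/* prints: fa 90 90 90 90 90 90 */
	return 0;
}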
OK, this is the revised paravirt.h (Andi has seen this before), then the second is the binary patching stuff. More things get added to the paravirt struct in future patches, but this basic stuff hasn't changed for some time. ===This patch does the dumbest possible replacement of paravirtualized instructions: calls through a "paravirt_ops" structure. Currently these are function implementations of native hardware: hypervisors will override the ops structure with their own variants. All the pv-ops functions are declared "fastcall" so that a specific register-based ABI is used, to make inlining assember easier. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> --- arch/i386/Kconfig | 11 + arch/i386/kernel/Makefile | 1 arch/i386/kernel/asm-offsets.c | 8 arch/i386/kernel/entry.S | 9 arch/i386/kernel/paravirt.c | 392 ++++++++++++++++++++++++++++++++++++++ include/asm-i386/msr.h | 5 include/asm-i386/paravirt.h | 226 +++++++++++++++++++++ include/asm-i386/paravirt_desc.h | 5 8 files changed, 657 insertions(+) ==================================================================--- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -180,6 +180,17 @@ config X86_ES7000 should say N here. endchoice + +config PARAVIRT + bool "Paravirtualization support (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Paravirtualization is a way of running multiple instances of + Linux on the same machine, under a hypervisor. This option + changes the kernel so it can modify itself when it is run + under a hypervisor, improving performance significantly. + However, when run without a hypervisor the kernel is + theoretically slower. If in doubt, say N. config ACPI_SRAT bool ==================================================================--- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional ==================================================================--- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -15,6 +15,7 @@ #include <asm/processor.h> #include <asm/thread_info.h> #include <asm/elf.h> +#include <asm/paravirt.h> #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -74,4 +75,11 @@ void foo(void) DEFINE(VDSO_PRELINK, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); +#ifdef CONFIG_PARAVIRT + OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); + OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); + OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); + OFFSET(PARAVIRT_iret, paravirt_ops, iret); + OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); +#endif } ==================================================================--- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -803,6 +803,15 @@ 1: INTERRUPT_RETURN .long 1b,iret_exc .previous +#ifdef CONFIG_PARAVIRT +ENTRY(nopara_iret) + iret + +ENTRY(nopara_irq_enable_sysexit) + sti + sysexit +#endif + KPROBE_ENTRY(int3) RING0_INT_FRAME pushl $-1 # mark this as an int ==================================================================--- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -1,5 +1,9 @@ #ifndef __ASM_MSR_H #define __ASM_MSR_H + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else /* * Access to 
machine-specific registers (available on 586 and better only) @@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long __asm__ __volatile__("rdpmc" \ : "=a" (low), "=d" (high) \ : "c" (counter)) +#endif /* !CONFIG_PARAVIRT */ /* symbolic names for some interesting MSRs */ /* Intel defined MSRs. */ ==================================================================--- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -2,6 +2,232 @@ #define __ASM_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ +#include <linux/linkage.h> + +#ifndef CONFIG_PARAVIRT #include <asm/no_paravirt.h> +#else + +#ifndef __ASSEMBLY__ +struct thread_struct; +struct Xgt_desc_struct; +struct paravirt_ops +{ + unsigned int kernel_rpl; + + /* All the function pointers here are declared as "fastcall" + so that we get a specific register-based calling + convention. This makes it easier to implement inline + assembler replacements. */ + + void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + unsigned int (fastcall *get_debugreg)(int regno); + void (fastcall *set_debugreg)(int regno, unsigned int value); + + void (fastcall *sync_core)(void); + + void (fastcall *clts)(void); + + unsigned int (fastcall *read_cr0)(void); + void (fastcall *write_cr0)(unsigned int); + + unsigned int (fastcall *read_cr2)(void); + void (fastcall *write_cr2)(unsigned int); + + unsigned int (fastcall *read_cr3)(void); + void (fastcall *write_cr3)(unsigned int); + + unsigned int (fastcall *read_cr4_safe)(void); + unsigned int (fastcall *read_cr4)(void); + void (fastcall *write_cr4)(unsigned int); + + unsigned long (fastcall *save_fl)(void); + void (fastcall *restore_fl)(unsigned long); + unsigned long (fastcall *save_fl_irq_disable)(void); + void (fastcall *irq_disable)(void); + void (fastcall *irq_enable)(void); + void (fastcall *safe_halt)(void); + void (fastcall *halt)(void); + void (fastcall *wbinvd)(void); + + /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (fastcall *read_msr)(unsigned int msr, int *err); + int (fastcall *write_msr)(unsigned int msr, u64 val); + + u64 (fastcall *read_tsc)(void); + u64 (fastcall *read_pmc)(void); + + void (fastcall *load_tr_desc)(void); + void (fastcall *load_ldt_desc)(void); + void (fastcall *load_gdt)(const struct Xgt_desc_struct *); + void (fastcall *load_idt)(const struct Xgt_desc_struct *); + void (fastcall *store_gdt)(struct Xgt_desc_struct *); + void (fastcall *store_idt)(struct Xgt_desc_struct *); + unsigned long (fastcall *store_tr)(void); + void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu); + void (fastcall *write_ldt_entry)(void *dt, int entrynum, u64 entry); + void (fastcall *write_gdt_entry)(void *dt, int entrynum, u64 entry); + void (fastcall *write_idt_entry)(void *dt, int entrynum, u64 entry); + + void (fastcall *set_iopl_mask)(unsigned mask); + + /* These two are jmp to, not actually called. */ + void (fastcall *irq_enable_sysexit)(void); + void (fastcall *iret)(void); +}; + +extern struct paravirt_ops paravirt_ops; + +/* The paravirtualized CPUID instruction. 
*/ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + paravirt_ops.cpuid(eax, ebx, ecx, edx); +} + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg) +#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val) + +/* Stop speculative execution */ +static inline void sync_core(void) +{ + paravirt_ops.sync_core(); +} + +#define clts() paravirt_ops.clts() + +#define read_cr0() paravirt_ops.read_cr0() +#define write_cr0(x) paravirt_ops.write_cr0(x) + +#define read_cr2() paravirt_ops.read_cr2() +#define write_cr2(x) paravirt_ops.write_cr2(x) + +#define read_cr3() paravirt_ops.read_cr3() +#define write_cr3(x) paravirt_ops.write_cr3(x) + +#define read_cr4() paravirt_ops.read_cr4() +#define read_cr4_safe(x) paravirt_ops.read_cr4_safe() +#define write_cr4(x) paravirt_ops.write_cr4(x) + +static inline unsigned long __raw_local_save_flags(void) +{ + return paravirt_ops.save_fl(); +} + +static inline void raw_local_irq_restore(unsigned long flags) +{ + return paravirt_ops.restore_fl(flags); +} + +static inline void raw_local_irq_disable(void) +{ + paravirt_ops.irq_disable(); +} + +static inline void raw_local_irq_enable(void) +{ + paravirt_ops.irq_enable(); +} + +static inline unsigned long __raw_local_irq_save(void) +{ + return paravirt_ops.save_fl_irq_disable(); +} + +static inline void raw_safe_halt(void) +{ + paravirt_ops.safe_halt(); +} + +static inline void halt(void) +{ + paravirt_ops.safe_halt(); +} +#define wbinvd() paravirt_ops.wbinvd() + +#define get_kernel_rpl() (paravirt_ops.kernel_rpl) + +#define rdmsr(msr,val1,val2) do { \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + val1 = (u32)_l; \ + val2 = _l >> 32; \ +} while(0) + +#define wrmsr(msr,val1,val2) do { \ + u64 _l = ((u64)(val2) << 32) | (val1); \ + paravirt_ops.write_msr((msr), _l); \ +} while(0) + +#define rdmsrl(msr,val) do { \ + int _err; \ + val = paravirt_ops.read_msr((msr),&_err); \ +} while(0) + +#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val))) +#define wrmsr_safe(msr,a,b) ({ \ + u64 _l = ((u64)(b) << 32) | (a); \ + paravirt_ops.write_msr((msr),_l); \ +}) + +/* rdmsr with exception handling */ +#define rdmsr_safe(msr,a,b) ({ \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; }) + +#define rdtsc(low,high) do { \ + u64 _l = paravirt_ops.read_tsc(); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while(0) + +#define rdtscl(low) do { \ + u64 _l = paravirt_ops.read_tsc(); \ + low = (int)_l; \ +} while(0) + +#define rdtscll(val) (val = paravirt_ops.read_tsc()) + +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2) + +#define rdpmc(counter,low,high) do { \ + u64 _l = paravirt_ops.read_pmc(); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while(0) + +#define load_TR_desc() (paravirt_ops.load_tr_desc()) +#define load_LDT_desc() (paravirt_ops.load_ldt_desc()) +#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr)) +#define load_idt(dtr) (paravirt_ops.load_idt(dtr)) +#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr)) +#define store_idt(dtr) (paravirt_ops.store_idt(dtr)) +#define store_tr(tr) ((tr) = paravirt_ops.store_tr()) +#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu))) +#define write_ldt_entry(dt, entry, a, b) (paravirt_ops.write_ldt_entry((dt), (entry), ((u64)a) << 32 | b)) +#define write_gdt_entry(dt, entry, a, b) (paravirt_ops.write_gdt_entry((dt), (entry), ((u64)a) << 32 | 
b)) +#define write_idt_entry(dt, entry, a, b) (paravirt_ops.write_idt_entry((dt), (entry), ((u64)a) << 32 | b)) +#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask)) + +#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax" +#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax" +#else /* __ASSEMBLY__ */ + +#define INTERRUPT_RETURN jmp *paravirt_ops+PARAVIRT_iret +#define DISABLE_INTERRUPTS call *paravirt_ops+PARAVIRT_irq_disable +#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax +#define ENABLE_INTERRUPTS_SYSEXIT jmp *paravirt_ops+PARAVIRT_irq_enable_sysexit +#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0 +#endif /* __ASSEMBLY__ */ + +#endif /* PARAVIRT */ #endif /* __ASM_PARAVIRT_H */ ==================================================================--- a/include/asm-i386/paravirt_desc.h +++ b/include/asm-i386/paravirt_desc.h @@ -1,6 +1,10 @@ #ifndef __ASM_PARAVIRT_DESC_H #define __ASM_PARAVIRT_DESC_H /* A separate header because they need processor.h, which needs paravirt.h */ +#ifndef CONFIG_PARAVIRT #include <asm/no_paravirt_desc.h> +#else +#include <asm/paravirt.h> +#endif #endif /* __ASM_PARAVIRT_DESC_H */ ==================================================================--- /dev/null +++ b/arch/i386/kernel/paravirt.c @@ -0,0 +1,382 @@ +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include <linux/errno.h> +#include <linux/module.h> +#include <asm/bug.h> +#include <asm/paravirt.h> +#include <asm/desc.h> + +static fastcall void nopara_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* must be "asm volatile" so that it won't be optimised out in + nopara_sync_core */ + asm volatile ("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static fastcall unsigned int nopara_get_debugreg(int regno) +{ + unsigned int val = 0; /* Damn you, gcc! 
*/ + + switch (regno) { + case 0: + __asm__("movl %%db0, %0" :"=r" (val)); break; + case 1: + __asm__("movl %%db1, %0" :"=r" (val)); break; + case 2: + __asm__("movl %%db2, %0" :"=r" (val)); break; + case 3: + __asm__("movl %%db3, %0" :"=r" (val)); break; + case 6: + __asm__("movl %%db6, %0" :"=r" (val)); break; + case 7: + __asm__("movl %%db7, %0" :"=r" (val)); break; + default: + BUG(); + } + return val; +} + +static fastcall void nopara_set_debugreg(int regno, unsigned int value) +{ + switch (regno) { + case 0: + __asm__("movl %0,%%db0" : /* no output */ :"r" (value)); + break; + case 1: + __asm__("movl %0,%%db1" : /* no output */ :"r" (value)); + break; + case 2: + __asm__("movl %0,%%db2" : /* no output */ :"r" (value)); + break; + case 3: + __asm__("movl %0,%%db3" : /* no output */ :"r" (value)); + break; + case 6: + __asm__("movl %0,%%db6" : /* no output */ :"r" (value)); + break; + case 7: + __asm__("movl %0,%%db7" : /* no output */ :"r" (value)); + break; + default: + BUG(); + } +} + +static fastcall void nopara_sync_core(void) +{ + unsigned int eax = 1, ebx, ecx = 0, edx; + nopara_cpuid(&eax, &ebx, &ecx, &edx); +} + +static fastcall void nopara_clts(void) +{ + __asm__ __volatile__ ("clts"); +} + +static fastcall unsigned int nopara_read_cr0(void) +{ + unsigned int val; + __asm__ __volatile__("movl %%cr0,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void nopara_write_cr0(unsigned int val) +{ + __asm__ __volatile__("movl %0,%%cr0": :"r" (val)); +} + +static fastcall unsigned int nopara_read_cr2(void) +{ + unsigned int val; + __asm__ __volatile__("movl %%cr2,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void nopara_write_cr2(unsigned int val) +{ + __asm__ __volatile__("movl %0,%%cr2": :"r" (val)); +} + +static fastcall unsigned int nopara_read_cr3(void) +{ + unsigned int val; + __asm__ __volatile__("movl %%cr3,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void nopara_write_cr3(unsigned int val) +{ + __asm__ __volatile__("movl %0,%%cr3": :"r" (val)); +} + +static fastcall unsigned int nopara_read_cr4(void) +{ + unsigned int val; + __asm__ __volatile__("movl %%cr4,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall unsigned int nopara_read_cr4_safe(void) +{ + unsigned int val; + /* This could fault if %cr4 does not exist */ + __asm__("1: movl %%cr4, %0 \n" + "2: \n" + ".section __ex_table,\"a\" \n" + ".long 1b,2b \n" + ".previous \n" + : "=r" (val): "0" (0)); + return val; +} + +static fastcall void nopara_write_cr4(unsigned int val) +{ + __asm__ __volatile__("movl %0,%%cr4": :"r" (val)); +} + +static fastcall unsigned long nopara_save_fl(void) +{ + unsigned long f; + __asm__ __volatile__("pushfl ; popl %0":"=g" (f): /* no input */); + return f; +} + +static fastcall unsigned long nopara_save_fl_irq_disable(void) +{ + unsigned long f; + __asm__ __volatile__("pushfl ; popl %0; cli":"=g" (f): : "memory"); + return f; +} + +static fastcall void nopara_restore_fl(unsigned long f) +{ + __asm__ __volatile__("pushl %0 ; popfl": /* no output */ + :"g" (f) + :"memory", "cc"); +} + +static fastcall void nopara_irq_disable(void) +{ + __asm__ __volatile__("cli": : :"memory"); +} + +static fastcall void nopara_irq_enable(void) +{ + __asm__ __volatile__("sti": : :"memory"); +} + +static fastcall void nopara_safe_halt(void) +{ + __asm__ __volatile__("sti; hlt": : :"memory"); +} + +static fastcall void nopara_halt(void) +{ + __asm__ __volatile__("hlt": : :"memory"); +} + +static fastcall void nopara_wbinvd(void) +{ + __asm__ __volatile__("wbinvd": : 
:"memory"); +} + +static fastcall unsigned long long nopara_read_msr(unsigned int msr, int *err) +{ + unsigned long long val; + + asm volatile("2: rdmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %3,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=r" (*err), "=A" (val) + : "c" (msr), "i" (-EFAULT)); + + return val; +} + +static fastcall int nopara_write_msr(unsigned int msr, unsigned long long val) +{ + int err; + asm volatile("2: wrmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %4,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=a" (err) + : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), + "i" (-EFAULT)); + return err; +} + +static fastcall unsigned long long nopara_read_tsc(void) +{ + unsigned long long val; + __asm__ __volatile__("rdtsc" : "=A" (val)); + return val; +} + +static fastcall unsigned long long nopara_read_pmc(void) +{ + unsigned long long val; + __asm__ __volatile__("rdpmc" : "=A" (val)); + return val; +} + +static fastcall void nopara_load_tr_desc(void) +{ + __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static fastcall void nopara_load_ldt_desc(void) +{ + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); +} + +static fastcall void nopara_load_gdt(const struct Xgt_desc_struct *dtr) +{ + __asm__ __volatile("lgdt %0"::"m" (*dtr)); +} + +static fastcall void nopara_load_idt(const struct Xgt_desc_struct *dtr) +{ + __asm__ __volatile("lidt %0"::"m" (*dtr)); +} + +static fastcall void nopara_store_gdt(struct Xgt_desc_struct *dtr) +{ + __asm__ ("sgdt %0":"=m" (*dtr)); +} + +static fastcall void nopara_store_idt(struct Xgt_desc_struct *dtr) +{ + __asm__ ("sidt %0":"=m" (*dtr)); +} + +static fastcall unsigned long nopara_store_tr(void) +{ + unsigned long tr; + __asm__ ("str %0":"=r" (tr)); + return tr; +} + +static fastcall void nopara_load_tls(struct thread_struct *t, unsigned int cpu) +{ +#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] + C(0); C(1); C(2); +#undef C +} + +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) +{ + __u32 *lp = (__u32 *)((char *)dt + entry*8); + *lp = entry_a; + *(lp+1) = entry_b; +} + +static fastcall void nopara_write_ldt_entry(void *dt, int entrynum, u64 entry) +{ + write_dt_entry(dt, entrynum, entry >> 32, entry); +} + +static fastcall void nopara_write_gdt_entry(void *dt, int entrynum, u64 entry) +{ + write_dt_entry(dt, entrynum, entry >> 32, entry); +} + +static fastcall void nopara_write_idt_entry(void *dt, int entrynum, u64 entry) +{ + write_dt_entry(dt, entrynum, entry >> 32, entry); +} + +static fastcall void nopara_set_iopl_mask(unsigned mask) +{ + unsigned int reg; + __asm__ __volatile__ ("pushfl;" + "popl %0;" + "andl %1, %0;" + "orl %2, %0;" + "pushl %0;" + "popfl" + : "=&r" (reg) + : "i" (~X86_EFLAGS_IOPL), "r" (mask)); +} + +/* These are in entry.S */ +extern fastcall void nopara_iret(void); +extern fastcall void nopara_irq_enable_sysexit(void); + +struct paravirt_ops paravirt_ops = { + .kernel_rpl = 0, + .cpuid = nopara_cpuid, + .get_debugreg = nopara_get_debugreg, + .set_debugreg = nopara_set_debugreg, + .sync_core = nopara_sync_core, + .clts = nopara_clts, + .read_cr0 = nopara_read_cr0, + .write_cr0 = nopara_write_cr0, + .read_cr2 = nopara_read_cr2, + .write_cr2 = nopara_write_cr2, + .read_cr3 = nopara_read_cr3, + .write_cr3 = nopara_write_cr3, + 
.read_cr4 = nopara_read_cr4, + .read_cr4_safe = nopara_read_cr4_safe, + .write_cr4 = nopara_write_cr4, + .save_fl = nopara_save_fl, + .restore_fl = nopara_restore_fl, + .save_fl_irq_disable = nopara_save_fl_irq_disable, + .irq_disable = nopara_irq_disable, + .irq_enable = nopara_irq_enable, + .safe_halt = nopara_safe_halt, + .halt = nopara_halt, + .wbinvd = nopara_wbinvd, + .read_msr = nopara_read_msr, + .write_msr = nopara_write_msr, + .read_tsc = nopara_read_tsc, + .read_pmc = nopara_read_pmc, + .load_tr_desc = nopara_load_tr_desc, + .load_ldt_desc = nopara_load_ldt_desc, + .load_gdt = nopara_load_gdt, + .load_idt = nopara_load_idt, + .store_gdt = nopara_store_gdt, + .store_idt = nopara_store_idt, + .store_tr = nopara_store_tr, + .load_tls = nopara_load_tls, + .write_ldt_entry = nopara_write_ldt_entry, + .write_gdt_entry = nopara_write_gdt_entry, + .write_idt_entry = nopara_write_idt_entry, + + .set_iopl_mask = nopara_set_iopl_mask, + .irq_enable_sysexit = nopara_irq_enable_sysexit, + .iret = nopara_iret, +}; +EXPORT_SYMBOL_GPL(paravirt_ops); -- Help! Save Australia from the worst of the DMCA: http://linux.org.au/law
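[The changelog above says hypervisors will override the ops structure with their own variants; concretely that just means replacing members of the global paravirt_ops very early in boot, before anything depends on the hooks. The sketch below is purely illustrative: no "myhv" port exists in these patches, and a real hypervisor would issue hypercalls rather than toggle a variable; it only shows the shape of an override using the members actually declared in the struct (kernel_rpl, save_fl, restore_fl, irq_disable, irq_enable).]

#include <linux/init.h>
#include <asm/paravirt.h>

/* Hypothetical: a shared "virtual IF" flag instead of real cli/sti. */
static unsigned long myhv_virtual_if = 1;

static fastcall unsigned long myhv_save_fl(void)
{
	return myhv_virtual_if ? (1 << 9) : 0;	/* synthesize EFLAGS.IF */
}

static fastcall void myhv_restore_fl(unsigned long flags)
{
	myhv_virtual_if = !!(flags & (1 << 9));
}

static fastcall void myhv_irq_disable(void)
{
	myhv_virtual_if = 0;
}

static fastcall void myhv_irq_enable(void)
{
	myhv_virtual_if = 1;
}

/* Run very early in boot, before the first interrupt-flag manipulation. */
void __init myhv_setup_paravirt(void)
{
	paravirt_ops.kernel_rpl = 1;		/* e.g. kernel demoted to ring 1 */
	paravirt_ops.save_fl = myhv_save_fl;
	paravirt_ops.restore_fl = myhv_restore_fl;
	paravirt_ops.irq_disable = myhv_irq_disable;
	paravirt_ops.irq_enable = myhv_irq_enable;
	/* ...and so on for the other hooks the hypervisor cares about. */
}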
On Tue, 22 Aug 2006 15:55:15 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:

> OK, this is the revised paravirt.h (Andi has seen this before), then the
> second is the binary patching stuff. More things get added to the
> paravirt struct in future patches, but this basic stuff hasn't changed
> for some time.

Thanks. I plan to revisit this after the .19 merge is done. Currently there is too much noise in the tree and even your tree seems to be still changing too quickly.

The patch looks reasonable, except that you still have far too many of these nasty __s around asm and volatile.

-Andi
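[The "__s" Andi objects to are the reserved-identifier spellings __asm__ and __volatile__; in kernel code the plain gcc keywords mean the same thing, and the resend below already makes the switch, e.g.:]

	/* first posting */
	__asm__ __volatile__("cli": : :"memory");

	/* resend below */
	asm volatile("cli": : :"memory");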
On Tue, 2006-08-22 at 15:55 +1000, Rusty Russell wrote:> OK, this is the revised paravirt.h (Andi has seen this before),No, it wasn't. That was the old patch, which won't apply. This is the right one (and applied with a little fuzz against rc4-mm2). Sorry, Rusty. Create a paravirt.h header for all the critical operations which need to be replaced with hypervisor calls, and include that instead of defining native operations, when CONFIG_PARAVIRT. This patch does the dumbest possible replacement of paravirtualized instructions: calls through a "paravirt_ops" structure. Currently these are function implementations of native hardware: hypervisors will override the ops structure with their own variants. All the pv-ops functions are declared "fastcall" so that a specific register-based ABI is used, to make inlining assember easier. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Chris Wright <chrisw@sous-sol.org> ==================================================================--- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -179,6 +179,17 @@ config X86_ES7000 should say N here. endchoice + +config PARAVIRT + bool "Paravirtualization support (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Paravirtualization is a way of running multiple instances of + Linux on the same machine, under a hypervisor. This option + changes the kernel so it can modify itself when it is run + under a hypervisor, improving performance significantly. + However, when run without a hypervisor the kernel is + theoretically slower. If in doubt, say N. config ACPI_SRAT bool ==================================================================--- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional ==================================================================--- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -74,4 +74,11 @@ void foo(void) DEFINE(VDSO_PRELINK, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); +#ifdef CONFIG_PARAVIRT + OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); + OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); + OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); + OFFSET(PARAVIRT_iret, paravirt_ops, iret); + OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); +#endif } ==================================================================--- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -76,13 +76,6 @@ NT_MASK = 0x00004000 NT_MASK = 0x00004000 VM_MASK = 0x00020000 -/* These are replaces for paravirtualization */ -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti -#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit -#define INTERRUPT_RETURN iret -#define GET_CR0_INTO_EAX movl %cr0, %eax - #ifdef CONFIG_PREEMPT #define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else @@ -809,6 +802,19 @@ 1: INTERRUPT_RETURN .long 1b,iret_exc .previous +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +#endif + KPROBE_ENTRY(int3) RING0_INT_FRAME pushl $-1 # mark this as an int ==================================================================--- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -64,6 +64,9 @@ static inline void pack_gate(__u32 *a, _ 
#define DESCTYPE_DPL3 0x60 /* DPL-3 */ #define DESCTYPE_S 0x10 /* !system */ +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)) @@ -98,6 +101,7 @@ static inline void write_dt_entry(void * #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#endif /* CONFIG_PARAVIRT */ static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) { ==================================================================--- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -10,6 +10,9 @@ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else #ifndef __ASSEMBLY__ static inline unsigned long __raw_local_save_flags(void) @@ -24,9 +27,6 @@ static inline unsigned long __raw_local_ return flags; } - -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) static inline void raw_local_irq_restore(unsigned long flags) { @@ -66,18 +66,6 @@ static inline void halt(void) __asm__ __volatile__("hlt": : :"memory"); } -static inline int raw_irqs_disabled_flags(unsigned long flags) -{ - return !(flags & (1 << 9)); -} - -static inline int raw_irqs_disabled(void) -{ - unsigned long flags = __raw_local_save_flags(); - - return raw_irqs_disabled_flags(flags); -} - /* * For spinlocks, etc: */ @@ -90,9 +78,33 @@ static inline unsigned long __raw_local_ return flags; } +#else +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ + +#ifndef __ASSEMBLY__ +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + #define raw_local_irq_save(flags) \ do { (flags) = __raw_local_irq_save(); } while (0) +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & (1 << 9)); +} + +static inline int raw_irqs_disabled(void) +{ + unsigned long flags = __raw_local_save_flags(); + + return raw_irqs_disabled_flags(flags); +} #endif /* __ASSEMBLY__ */ /* ==================================================================--- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -1,5 +1,9 @@ #ifndef __ASM_MSR_H #define __ASM_MSR_H + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else /* * Access to machine-specific registers (available on 586 and better only) @@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long __asm__ __volatile__("rdpmc" \ : "=a" (low), "=d" (high) \ : "c" (counter)) +#endif /* !CONFIG_PARAVIRT */ /* symbolic names for some interesting MSRs */ /* Intel defined MSRs. 
*/ ==================================================================--- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -143,6 +143,9 @@ static inline void detect_ht(struct cpui #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { @@ -154,6 +157,34 @@ static inline void __cpuid(unsigned int "=d" (*edx) : "0" (*eax), "2" (*ecx)); } + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + __asm__("movl %%db" #register ", %0" \ + :"=r" (var)) +#define set_debugreg(value, register) \ + __asm__("movl %0,%%db" #register \ + : /* no output */ \ + :"r" (value)) + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void set_iopl_mask(unsigned mask) +{ + unsigned int reg; + __asm__ __volatile__ ("pushfl;" + "popl %0;" + "andl %1, %0;" + "orl %2, %0;" + "pushl %0;" + "popfl" + : "=&r" (reg) + : "i" (~X86_EFLAGS_IOPL), "r" (mask)); +} +#endif /* CONFIG_PARAVIRT */ /* * Generic CPUID function @@ -508,33 +539,6 @@ static inline void load_esp0(struct tss_ regs->esp = new_esp; \ } while (0) -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - __asm__("movl %%db" #register ", %0" \ - :"=r" (var)) -#define set_debugreg(value, register) \ - __asm__("movl %0,%%db" #register \ - : /* no output */ \ - :"r" (value)) - -/* - * Set IOPL bits in EFLAGS from given mask - */ -static inline void set_iopl_mask(unsigned mask) -{ - unsigned int reg; - __asm__ __volatile__ ("pushfl;" - "popl %0;" - "andl %1, %0;" - "orl %2, %0;" - "pushl %0;" - "popfl" - : "=&r" (reg) - : "i" (~X86_EFLAGS_IOPL), "r" (mask)); -} - /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; ==================================================================--- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -128,5 +128,7 @@ #define SEGMENT_LDT 0x4 #define SEGMENT_GDT 0x0 +#ifndef CONFIG_PARAVIRT #define get_kernel_rpl() 0 #endif +#endif ==================================================================--- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -17,8 +17,12 @@ * (the type definitions are in asm/spinlock_types.h) */ +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else #define CLI_STRING "cli" #define STI_STRING "sti" +#endif /* CONFIG_PARAVIRT */ #define __raw_spin_is_locked(x) \ (*(volatile signed char *)(&(x)->slock) <= 0) ==================================================================--- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -82,6 +82,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" #define savesegment(seg, value) \ asm volatile("mov %%" #seg ",%0":"=rm" (value)) +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else #define read_cr0() ({ \ unsigned int __dummy; \ __asm__ __volatile__( \ @@ -133,16 +136,17 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" #define write_cr4(x) \ __asm__ __volatile__("movl %0,%%cr4": :"r" (x)) -/* - * Clear and set 'TS' bit respectively - */ -#define clts() __asm__ __volatile__ ("clts") -#define stts() write_cr0(8 | read_cr0()) - -#endif /* __KERNEL__ */ - #define wbinvd() \ __asm__ __volatile__ ("wbinvd": : :"memory") + +/* Clear the 'TS' bit */ +#define clts() __asm__ __volatile__ ("clts") +#endif/* 
CONFIG_PARAVIRT */ + +/* Set the 'TS' bit */ +#define stts() write_cr0(8 | read_cr0()) + +#endif /* __KERNEL__ */ static inline unsigned long get_limit(unsigned long segment) { ==================================================================--- /dev/null +++ b/arch/i386/kernel/paravirt.c @@ -0,0 +1,367 @@ +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include <linux/errno.h> +#include <linux/module.h> +#include <asm/bug.h> +#include <asm/paravirt.h> +#include <asm/desc.h> + +static fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* must be "asm volatile" so that it won't be optimised out in + native_sync_core */ + asm volatile ("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static fastcall unsigned int native_get_debugreg(int regno) +{ + unsigned int val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("movl %%db0, %0" :"=r" (val)); break; + case 1: + asm("movl %%db1, %0" :"=r" (val)); break; + case 2: + asm("movl %%db2, %0" :"=r" (val)); break; + case 3: + asm("movl %%db3, %0" :"=r" (val)); break; + case 6: + asm("movl %%db6, %0" :"=r" (val)); break; + case 7: + asm("movl %%db7, %0" :"=r" (val)); break; + default: + BUG(); + } + return val; +} + +static fastcall void native_set_debugreg(int regno, unsigned int value) +{ + switch (regno) { + case 0: + asm("movl %0,%%db0" : /* no output */ :"r" (value)); + break; + case 1: + asm("movl %0,%%db1" : /* no output */ :"r" (value)); + break; + case 2: + asm("movl %0,%%db2" : /* no output */ :"r" (value)); + break; + case 3: + asm("movl %0,%%db3" : /* no output */ :"r" (value)); + break; + case 6: + asm("movl %0,%%db6" : /* no output */ :"r" (value)); + break; + case 7: + asm("movl %0,%%db7" : /* no output */ :"r" (value)); + break; + default: + BUG(); + } +} + +static fastcall void native_clts(void) +{ + asm volatile ("clts"); +} + +static fastcall unsigned int native_read_cr0(void) +{ + unsigned int val; + asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr0(unsigned int val) +{ + asm volatile("movl %0,%%cr0": :"r" (val)); +} + +static fastcall unsigned int native_read_cr2(void) +{ + unsigned int val; + asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr2(unsigned int val) +{ + asm volatile("movl %0,%%cr2": :"r" (val)); +} + +static fastcall unsigned int native_read_cr3(void) +{ + unsigned int val; + asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr3(unsigned int val) +{ + asm volatile("movl %0,%%cr3": :"r" (val)); +} + +static fastcall unsigned int native_read_cr4(void) +{ + unsigned int val; + asm volatile("movl %%cr4,%0\n\t" 
:"=r" (val)); + return val; +} + +static fastcall unsigned int native_read_cr4_safe(void) +{ + unsigned int val; + /* This could fault if %cr4 does not exist */ + asm("1: movl %%cr4, %0 \n" + "2: \n" + ".section __ex_table,\"a\" \n" + ".long 1b,2b \n" + ".previous \n" + : "=r" (val): "0" (0)); + return val; +} + +static fastcall void native_write_cr4(unsigned int val) +{ + asm volatile("movl %0,%%cr4": :"r" (val)); +} + +static fastcall unsigned long native_save_fl(void) +{ + unsigned long f; + asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); + return f; +} + +static fastcall void native_restore_fl(unsigned long f) +{ + asm volatile("pushl %0 ; popfl": /* no output */ + :"g" (f) + :"memory", "cc"); +} + +static fastcall void native_irq_disable(void) +{ + asm volatile("cli": : :"memory"); +} + +static fastcall void native_irq_enable(void) +{ + asm volatile("sti": : :"memory"); +} + +static fastcall void native_safe_halt(void) +{ + asm volatile("sti; hlt": : :"memory"); +} + +static fastcall void native_halt(void) +{ + asm volatile("hlt": : :"memory"); +} + +static fastcall void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) +{ + unsigned long long val; + + asm volatile("2: rdmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %3,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=r" (*err), "=A" (val) + : "c" (msr), "i" (-EFAULT)); + + return val; +} + +static fastcall int native_write_msr(unsigned int msr, unsigned long long val) +{ + int err; + asm volatile("2: wrmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %4,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=a" (err) + : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), + "i" (-EFAULT)); + return err; +} + +static fastcall unsigned long long native_read_tsc(void) +{ + unsigned long long val; + asm volatile("rdtsc" : "=A" (val)); + return val; +} + +static fastcall unsigned long long native_read_pmc(void) +{ + unsigned long long val; + asm volatile("rdpmc" : "=A" (val)); + return val; +} + +static fastcall void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static fastcall void native_load_ldt_desc(void) +{ + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); +} + +static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +{ + asm ("sgdt %0":"=m" (*dtr)); +} + +static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +{ + asm ("sidt %0":"=m" (*dtr)); +} + +static fastcall unsigned long native_store_tr(void) +{ + unsigned long tr; + asm ("str %0":"=r" (tr)); + return tr; +} + +static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ +#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] + C(0); C(1); C(2); +#undef C +} + +static inline void native_write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) +{ + __u32 *lp = (__u32 *)((char *)dt + entry*8); + *lp = entry_a; + *(lp+1) = entry_b; +} + +static fastcall void native_write_ldt_entry(void *dt, int entrynum, u64 entry) +{ + 
native_write_dt_entry(dt, entrynum, entry >> 32, entry); +} + +static fastcall void native_write_gdt_entry(void *dt, int entrynum, u64 entry) +{ + native_write_dt_entry(dt, entrynum, entry >> 32, entry); +} + +static fastcall void native_write_idt_entry(void *dt, int entrynum, u64 entry) +{ + native_write_dt_entry(dt, entrynum, entry >> 32, entry); +} + +static fastcall void native_set_iopl_mask(unsigned mask) +{ + unsigned int reg; + asm volatile ("pushfl;" + "popl %0;" + "andl %1, %0;" + "orl %2, %0;" + "pushl %0;" + "popfl" + : "=&r" (reg) + : "i" (~X86_EFLAGS_IOPL), "r" (mask)); +} + +/* These are in entry.S */ +extern fastcall void native_iret(void); +extern fastcall void native_irq_enable_sysexit(void); + +struct paravirt_ops paravirt_ops = { + .kernel_rpl = 0, + .cpuid = native_cpuid, + .get_debugreg = native_get_debugreg, + .set_debugreg = native_set_debugreg, + .clts = native_clts, + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = native_write_cr3, + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = native_write_cr4, + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + .safe_halt = native_safe_halt, + .halt = native_halt, + .wbinvd = native_wbinvd, + .read_msr = native_read_msr, + .write_msr = native_write_msr, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + .load_tr_desc = native_load_tr_desc, + .load_ldt_desc = native_load_ldt_desc, + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = native_store_tr, + .load_tls = native_load_tls, + .write_ldt_entry = native_write_ldt_entry, + .write_gdt_entry = native_write_gdt_entry, + .write_idt_entry = native_write_idt_entry, + + .set_iopl_mask = native_set_iopl_mask, + .irq_enable_sysexit = native_irq_enable_sysexit, + .iret = native_iret, +}; +EXPORT_SYMBOL(paravirt_ops); ==================================================================--- /dev/null +++ b/include/asm-i386/paravirt.h @@ -0,0 +1,223 @@ +#ifndef __ASM_PARAVIRT_H +#define __ASM_PARAVIRT_H +/* Various instructions on x86 need to be replaced for + * para-virtualization: those hooks are defined here. */ +#include <linux/linkage.h> + +#ifdef CONFIG_PARAVIRT +#ifndef __ASSEMBLY__ +struct thread_struct; +struct Xgt_desc_struct; +struct paravirt_ops +{ + unsigned int kernel_rpl; + + /* All the function pointers here are declared as "fastcall" + so that we get a specific register-based calling + convention. This makes it easier to implement inline + assembler replacements. 
*/ + + void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + unsigned int (fastcall *get_debugreg)(int regno); + void (fastcall *set_debugreg)(int regno, unsigned int value); + + void (fastcall *clts)(void); + + unsigned int (fastcall *read_cr0)(void); + void (fastcall *write_cr0)(unsigned int); + + unsigned int (fastcall *read_cr2)(void); + void (fastcall *write_cr2)(unsigned int); + + unsigned int (fastcall *read_cr3)(void); + void (fastcall *write_cr3)(unsigned int); + + unsigned int (fastcall *read_cr4_safe)(void); + unsigned int (fastcall *read_cr4)(void); + void (fastcall *write_cr4)(unsigned int); + + unsigned long (fastcall *save_fl)(void); + void (fastcall *restore_fl)(unsigned long); + void (fastcall *irq_disable)(void); + void (fastcall *irq_enable)(void); + void (fastcall *safe_halt)(void); + void (fastcall *halt)(void); + void (fastcall *wbinvd)(void); + + /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (fastcall *read_msr)(unsigned int msr, int *err); + int (fastcall *write_msr)(unsigned int msr, u64 val); + + u64 (fastcall *read_tsc)(void); + u64 (fastcall *read_pmc)(void); + + void (fastcall *load_tr_desc)(void); + void (fastcall *load_ldt_desc)(void); + void (fastcall *load_gdt)(const struct Xgt_desc_struct *); + void (fastcall *load_idt)(const struct Xgt_desc_struct *); + void (fastcall *store_gdt)(struct Xgt_desc_struct *); + void (fastcall *store_idt)(struct Xgt_desc_struct *); + unsigned long (fastcall *store_tr)(void); + void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu); + void (fastcall *write_ldt_entry)(void *dt, int entrynum, u64 entry); + void (fastcall *write_gdt_entry)(void *dt, int entrynum, u64 entry); + void (fastcall *write_idt_entry)(void *dt, int entrynum, u64 entry); + + void (fastcall *set_iopl_mask)(unsigned mask); + + /* These two are jmp to, not actually called. */ + void (fastcall *irq_enable_sysexit)(void); + void (fastcall *iret)(void); +}; + +extern struct paravirt_ops paravirt_ops; + +/* The paravirtualized CPUID instruction. 
*/ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + paravirt_ops.cpuid(eax, ebx, ecx, edx); +} + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg) +#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val) + +#define clts() paravirt_ops.clts() + +#define read_cr0() paravirt_ops.read_cr0() +#define write_cr0(x) paravirt_ops.write_cr0(x) + +#define read_cr2() paravirt_ops.read_cr2() +#define write_cr2(x) paravirt_ops.write_cr2(x) + +#define read_cr3() paravirt_ops.read_cr3() +#define write_cr3(x) paravirt_ops.write_cr3(x) + +#define read_cr4() paravirt_ops.read_cr4() +#define read_cr4_safe(x) paravirt_ops.read_cr4_safe() +#define write_cr4(x) paravirt_ops.write_cr4(x) + +static inline unsigned long __raw_local_save_flags(void) +{ + return paravirt_ops.save_fl(); +} + +static inline void raw_local_irq_restore(unsigned long flags) +{ + return paravirt_ops.restore_fl(flags); +} + +static inline void raw_local_irq_disable(void) +{ + paravirt_ops.irq_disable(); +} + +static inline void raw_local_irq_enable(void) +{ + paravirt_ops.irq_enable(); +} + +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long flags = paravirt_ops.save_fl(); + + paravirt_ops.irq_disable(); + + return flags; +} + +static inline void raw_safe_halt(void) +{ + paravirt_ops.safe_halt(); +} + +static inline void halt(void) +{ + paravirt_ops.safe_halt(); +} +#define wbinvd() paravirt_ops.wbinvd() + +#define get_kernel_rpl() (paravirt_ops.kernel_rpl) + +#define rdmsr(msr,val1,val2) do { \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + val1 = (u32)_l; \ + val2 = _l >> 32; \ +} while(0) + +#define wrmsr(msr,val1,val2) do { \ + u64 _l = ((u64)(val2) << 32) | (val1); \ + paravirt_ops.write_msr((msr), _l); \ +} while(0) + +#define rdmsrl(msr,val) do { \ + int _err; \ + val = paravirt_ops.read_msr((msr),&_err); \ +} while(0) + +#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val))) +#define wrmsr_safe(msr,a,b) ({ \ + u64 _l = ((u64)(b) << 32) | (a); \ + paravirt_ops.write_msr((msr),_l); \ +}) + +/* rdmsr with exception handling */ +#define rdmsr_safe(msr,a,b) ({ \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; }) + +#define rdtsc(low,high) do { \ + u64 _l = paravirt_ops.read_tsc(); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while(0) + +#define rdtscl(low) do { \ + u64 _l = paravirt_ops.read_tsc(); \ + low = (int)_l; \ +} while(0) + +#define rdtscll(val) (val = paravirt_ops.read_tsc()) + +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2) + +#define rdpmc(counter,low,high) do { \ + u64 _l = paravirt_ops.read_pmc(); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while(0) + +#define load_TR_desc() (paravirt_ops.load_tr_desc()) +#define load_LDT_desc() (paravirt_ops.load_ldt_desc()) +#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr)) +#define load_idt(dtr) (paravirt_ops.load_idt(dtr)) +#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr)) +#define store_idt(dtr) (paravirt_ops.store_idt(dtr)) +#define store_tr(tr) ((tr) = paravirt_ops.store_tr()) +#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu))) +#define write_ldt_entry(dt, entry, a, b) (paravirt_ops.write_ldt_entry((dt), (entry), ((u64)a) << 32 | b)) +#define write_gdt_entry(dt, entry, a, b) (paravirt_ops.write_gdt_entry((dt), (entry), ((u64)a) << 32 | b)) +#define write_idt_entry(dt, entry, a, b) 
(paravirt_ops.write_idt_entry((dt), (entry), ((u64)a) << 32 | b)) +#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask)) + +#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax" +#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax" +#else /* __ASSEMBLY__ */ + +#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret +#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax +#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax +#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit +#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0 +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ +#endif /* __ASM_PARAVIRT_H */
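To give a feel for how a backend is meant to use this structure, here is a minimal sketch of a hypervisor hooking a few of the interrupt ops at early boot. The example_* names, and keeping the virtual IF flag in a plain variable rather than in a shared page, are illustrative assumptions only, not part of this patch:

#include <linux/init.h>
#include <asm/paravirt.h>

/* Illustrative stand-in for wherever a real guest keeps its virtual IF. */
static unsigned long example_eflags = 0x200;		/* IF initially set */

static fastcall void example_irq_disable(void)
{
	example_eflags &= ~0x200UL;
}

static fastcall void example_irq_enable(void)
{
	example_eflags |= 0x200UL;
}

static fastcall unsigned long example_save_fl(void)
{
	return example_eflags;
}

static fastcall void example_restore_fl(unsigned long flags)
{
	example_eflags = flags & 0x200UL;
}

void __init example_paravirt_init(void)
{
	paravirt_ops.kernel_rpl  = 1;	/* guest kernel runs outside ring 0 */
	paravirt_ops.irq_disable = example_irq_disable;
	paravirt_ops.irq_enable  = example_irq_enable;
	paravirt_ops.save_fl     = example_save_fl;
	paravirt_ops.restore_fl  = example_restore_fl;
}

Once the pointers are replaced, every local_irq_disable()/local_irq_save() in generic code should go through the hooks above rather than executing cli/pushf directly, which is the point of the indirection.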
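The MSR hooks fold the exception-table handling into the ops themselves: read_msr() stores 0 or -EFAULT through its err pointer, and the rdmsr_safe() macro above just forwards that. A usage sketch follows; the helper name is hypothetical, 0x1a0 is IA32_MISC_ENABLE chosen only as an example, and it assumes <asm/msr.h> routes to these definitions under CONFIG_PARAVIRT:

#include <linux/kernel.h>
#include <asm/msr.h>

static void example_probe_misc_enable(void)
{
	u32 lo, hi;

	/* 0 on success, -EFAULT if the rdmsr faulted */
	if (rdmsr_safe(0x1a0, &lo, &hi) == 0)
		printk(KERN_INFO "IA32_MISC_ENABLE = %08x%08x\n", hi, lo);
	else
		printk(KERN_INFO "IA32_MISC_ENABLE not readable here\n");
}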
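The assembler macros above index into paravirt_ops with PARAVIRT_iret, PARAVIRT_irq_disable and friends. Those constants are not defined anywhere in this mail; presumably they come out of asm-offsets. A sketch of the assumed additions (inside the generator function of arch/i386/kernel/asm-offsets.c, using the existing OFFSET()/BLANK() helpers):

#ifdef CONFIG_PARAVIRT
	BLANK();
	OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
	OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
	OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif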