Jan Beulich
2010-Dec-22 12:17 UTC
[Xen-devel] [PATCH, RFC 4/5] x86: avoid unlikely taken forward branches
... since these get statically mis-predicted by most CPUs and increase the cache footprint. This mostly concerns hypercall tracing and vm86 mode handling. Signed-off-by: Jan Beulich <jbeulich@novell.com> --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -1330,7 +1330,7 @@ asmlinkage void do_page_fault(struct cpu * during early boot (an issue was seen once, but was most likely a hardware * problem). */ -asmlinkage void do_early_page_fault(struct cpu_user_regs *regs) +asmlinkage void __init do_early_page_fault(struct cpu_user_regs *regs) { static int stuck; static unsigned long prev_eip, prev_cr2; --- a/xen/arch/x86/x86_32/entry.S +++ b/xen/arch/x86/x86_32/entry.S @@ -142,7 +142,7 @@ restore_all_xen: ENTRY(hypercall) subl $4,%esp FIXUP_RING0_GUEST_STACK - SAVE_ALL(1f,1f) + SAVE_ALL(,1f) 1: sti GET_CURRENT(%ebx) cmpl $NR_hypercalls,%eax @@ -182,12 +182,14 @@ ENTRY(hypercall) #define SHADOW_BYTES 24 /* 6 shadow parameters */ #endif cmpb $0,tb_init_done - je 1f +UNLIKELY_START(ne, trace) call trace_hypercall /* Now restore all the registers that trace_hypercall clobbered */ movl UREGS_eax+SHADOW_BYTES(%esp),%eax /* Hypercall # */ +UNLIKELY_END(trace) + call *hypercall_table(,%eax,4) + movl %eax,UREGS_eax+SHADOW_BYTES(%esp) # save the return value #undef SHADOW_BYTES -1: call *hypercall_table(,%eax,4) addl $24,%esp # Discard the shadow parameters #ifndef NDEBUG /* Deliberately corrupt real parameter regs used by this hypercall. 
*/ @@ -197,13 +199,10 @@ ENTRY(hypercall) jne skip_clobber # If EIP has changed then don't clobber movzb hypercall_args_table(,%ecx,1),%ecx movl %esp,%edi - movl %eax,%esi movl $0xDEADBEEF,%eax rep stosl - movl %esi,%eax skip_clobber: #endif - movl %eax,UREGS_eax(%esp) # save the return value test_all_events: xorl %ecx,%ecx @@ -293,8 +292,8 @@ create_bounce_frame: jz ring1 /* jump if returning to an existing ring-1 activation */ movl VCPU_kernel_sp(%ebx),%esi .Lft6: mov VCPU_kernel_ss(%ebx),%gs - testl $X86_EFLAGS_VM,UREGS_eflags+4(%esp) - jz .Lnvm86_1 + testl $X86_EFLAGS_VM,%ecx +UNLIKELY_START(nz, bounce_vm86_1) subl $16,%esi /* push ES/DS/FS/GS (VM86 stack frame) */ movl UREGS_es+4(%esp),%eax .Lft7: movl %eax,%gs:(%esi) @@ -304,7 +303,7 @@ create_bounce_frame: .Lft9: movl %eax,%gs:8(%esi) movl UREGS_gs+4(%esp),%eax .Lft10: movl %eax,%gs:12(%esi) -.Lnvm86_1: +UNLIKELY_END(bounce_vm86_1) subl $8,%esi /* push SS/ESP (inter-priv iret) */ movl UREGS_esp+4(%esp),%eax .Lft11: movl %eax,%gs:(%esi) @@ -346,17 +345,10 @@ ring1: /* obtain ss/esp from oldss/olde movl TRAPBOUNCE_error_code(%edx),%eax .Lft17: movl %eax,%gs:(%esi) 1: testb $TBF_FAILSAFE,%cl - jz 2f +UNLIKELY_START(nz, bounce_failsafe) subl $16,%esi # add DS/ES/FS/GS to failsafe stack frame testl $X86_EFLAGS_VM,UREGS_eflags+4(%esp) - jz .Lnvm86_2 - xorl %eax,%eax # VM86: we write zero selector values -.Lft18: movl %eax,%gs:(%esi) -.Lft19: movl %eax,%gs:4(%esi) -.Lft20: movl %eax,%gs:8(%esi) -.Lft21: movl %eax,%gs:12(%esi) - jmp 2f -.Lnvm86_2: + jnz .Lvm86_2 movl UREGS_ds+4(%esp),%eax # non-VM86: write real selector values .Lft22: movl %eax,%gs:(%esi) movl UREGS_es+4(%esp),%eax @@ -365,13 +357,22 @@ ring1: /* obtain ss/esp from oldss/olde .Lft24: movl %eax,%gs:8(%esi) movl UREGS_gs+4(%esp),%eax .Lft25: movl %eax,%gs:12(%esi) -2: testl $X86_EFLAGS_VM,UREGS_eflags+4(%esp) - jz .Lnvm86_3 + jmp .Lnvm86_3 +.Lvm86_2: + xorl %eax,%eax # VM86: we write zero selector values +.Lft18: movl %eax,%gs:(%esi) +.Lft19: movl 
%eax,%gs:4(%esi) +.Lft20: movl %eax,%gs:8(%esi) +.Lft21: movl %eax,%gs:12(%esi) +UNLIKELY_END(bounce_failsafe) + testl $X86_EFLAGS_VM,UREGS_eflags+4(%esp) +UNLIKELY_START(nz, bounce_vm86_3) xorl %eax,%eax /* zero DS-GS, just as a real CPU would */ movl %eax,UREGS_ds+4(%esp) movl %eax,UREGS_es+4(%esp) movl %eax,UREGS_fs+4(%esp) movl %eax,UREGS_gs+4(%esp) +UNLIKELY_END(bounce_vm86_3) .Lnvm86_3: /* Rewrite our stack frame and return to ring 1. */ /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */ @@ -564,6 +565,7 @@ ENTRY(spurious_interrupt_bug) pushl $TRAP_spurious_int<<16 jmp handle_exception + .pushsection .init.text, "ax", @progbits ENTRY(early_page_fault) SAVE_ALL(1f,1f) 1: movl %esp,%eax @@ -571,6 +573,7 @@ ENTRY(early_page_fault) call do_early_page_fault addl $4,%esp jmp restore_all_xen + .popsection handle_nmi_mce: #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL --- a/xen/arch/x86/x86_64/compat/entry.S +++ b/xen/arch/x86/x86_64/compat/entry.S @@ -49,7 +49,7 @@ ENTRY(compat_hypercall) #define SHADOW_BYTES 0 /* No on-stack shadow state */ #endif cmpb $0,tb_init_done(%rip) - je 1f +UNLIKELY_START(ne, compat_trace) call trace_hypercall /* Now restore all the registers that trace_hypercall clobbered */ movl UREGS_rax+SHADOW_BYTES(%rsp),%eax /* Hypercall # */ @@ -60,7 +60,8 @@ ENTRY(compat_hypercall) movl UREGS_rdi+SHADOW_BYTES(%rsp),%r8d /* Arg 5 */ movl UREGS_rbp+SHADOW_BYTES(%rsp),%r9d /* Arg 6 */ #undef SHADOW_BYTES -1: leaq compat_hypercall_table(%rip),%r10 +UNLIKELY_END(compat_trace) + leaq compat_hypercall_table(%rip),%r10 PERFC_INCR(PERFC_hypercalls, %rax, %rbx) callq *(%r10,%rax,8) #ifndef NDEBUG @@ -295,7 +296,7 @@ compat_create_bounce_frame: .Lft8: movl %eax,%fs:(%rsi) # ERROR CODE 1: testb $TBF_FAILSAFE,%cl - jz 2f +UNLIKELY_START(nz, compat_bounce_failsafe) subl $4*4,%esi movl %gs,%eax .Lft9: movl %eax,%fs:3*4(%rsi) # GS @@ -304,7 +305,7 @@ compat_create_bounce_frame: .Lft11: movl %eax,%fs:1*4(%rsi) # ES movl %ds,%eax .Lft12: movl 
%eax,%fs:0*4(%rsi) # DS -2: +UNLIKELY_END(compat_bounce_failsafe) /* Rewrite our stack frame and return to guest-OS mode. */ /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */ andl $~(X86_EFLAGS_VM|X86_EFLAGS_RF|\ --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -148,7 +148,7 @@ ENTRY(syscall_enter) #define SHADOW_BYTES 0 /* No on-stack shadow state */ #endif cmpb $0,tb_init_done(%rip) - je 1f +UNLIKELY_START(ne, trace) call trace_hypercall /* Now restore all the registers that trace_hypercall clobbered */ movq UREGS_rax+SHADOW_BYTES(%rsp),%rax /* Hypercall # */ @@ -159,7 +159,8 @@ ENTRY(syscall_enter) movq UREGS_r8 +SHADOW_BYTES(%rsp),%r8 /* Arg 5 */ movq UREGS_r9 +SHADOW_BYTES(%rsp),%r9 /* Arg 6 */ #undef SHADOW_BYTES -1: leaq hypercall_table(%rip),%r10 +UNLIKELY_END(trace) + leaq hypercall_table(%rip),%r10 PERFC_INCR(PERFC_hypercalls, %rax, %rbx) callq *(%r10,%rax,8) #ifndef NDEBUG @@ -341,11 +342,12 @@ create_bounce_frame: 2: andq $~0xf,%rsi # Stack frames are 16-byte aligned. movq $HYPERVISOR_VIRT_START,%rax cmpq %rax,%rsi - jb 1f # In +ve address space? Then okay. movq $HYPERVISOR_VIRT_END+60,%rax + sbb %ecx,%ecx # In +ve address space? Then okay. cmpq %rax,%rsi - jb domain_crash_synchronous # Above Xen private area? Then okay. -1: movb TRAPBOUNCE_flags(%rdx),%cl + adc %ecx,%ecx # Above Xen private area? Then okay. 
+ jg domain_crash_synchronous + movb TRAPBOUNCE_flags(%rdx),%cl subq $40,%rsi movq UREGS_ss+8(%rsp),%rax .Lft2: movq %rax,32(%rsi) # SS @@ -376,7 +378,7 @@ create_bounce_frame: movl TRAPBOUNCE_error_code(%rdx),%eax .Lft7: movq %rax,(%rsi) # ERROR CODE 1: testb $TBF_FAILSAFE,%cl - jz 2f +UNLIKELY_START(nz, bounce_failsafe) subq $32,%rsi movl %gs,%eax .Lft8: movq %rax,24(%rsi) # GS @@ -386,7 +388,8 @@ create_bounce_frame: .Lft10: movq %rax,8(%rsi) # ES movl %ds,%eax .Lft11: movq %rax,(%rsi) # DS -2: subq $16,%rsi +UNLIKELY_END(bounce_failsafe) + subq $16,%rsi movq UREGS_r11+8(%rsp),%rax .Lft12: movq %rax,8(%rsi) # R11 movq UREGS_rcx+8(%rsp),%rax @@ -601,11 +604,13 @@ ENTRY(double_fault) call do_double_fault ud2 + .pushsection .init.text, "ax", @progbits ENTRY(early_page_fault) SAVE_ALL movq %rsp,%rdi call do_early_page_fault jmp restore_all_xen + .popsection handle_ist_exception: SAVE_ALL --- a/xen/include/asm-x86/asm_defns.h +++ b/xen/include/asm-x86/asm_defns.h @@ -32,4 +32,18 @@ #define _ASM_EXTABLE(from, to) _ASM__EXTABLE(, from, to) #define _ASM_PRE_EXTABLE(from, to) _ASM__EXTABLE(.pre, from, to) +#ifdef __ASSEMBLY__ + +#define UNLIKELY_START(cond, tag) \ + j##cond .Lunlikely.tag; \ + .subsection 1; \ + .Lunlikely.tag: + +#define UNLIKELY_END(tag) \ + jmp .Llikely.tag; \ + .subsection 0; \ + .Llikely.tag: + +#endif + #endif /* __X86_ASM_DEFNS_H__ */ --- a/xen/include/asm-x86/x86_32/asm_defns.h +++ b/xen/include/asm-x86/x86_32/asm_defns.h @@ -1,6 +1,7 @@ #ifndef __X86_32_ASM_DEFNS_H__ #define __X86_32_ASM_DEFNS_H__ +#include <xen/stringify.h> #include <asm/percpu.h> #ifdef CONFIG_FRAME_POINTER @@ -53,12 +54,14 @@ mov %es,%esi; \ mov $(__HYPERVISOR_DS),%ecx; \ jnz 86f; \ - .text 1; \ + .subsection 1; \ 86: call setup_vm86_frame; \ jmp vm86_lbl; \ .previous; \ + .ifnes __stringify(xen_lbl), ""; \ testb $3,UREGS_cs(%esp); \ jz xen_lbl; \ + .endif; \ /* \ * We are the outermost Xen context, but our \ * life is complicated by NMIs and MCEs. 
These \ --- /dev/null +++ b/xen/include/xen/stringify.h @@ -0,0 +1,12 @@ +#ifndef __XEN_STRINGIFY_H +#define __XEN_STRINGIFY_H + +/* Indirect stringification. Doing two levels allows the parameter to be a + * macro itself. For example, compile with -DFOO=bar, __stringify(FOO) + * converts to "bar". + */ + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +#endif /* !__XEN_STRINGIFY_H */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2010-Dec-22 12:24 UTC
Re: [Xen-devel] [PATCH, RFC 4/5] x86: avoid unlikely taken forward branches
>>> On 22.12.10 at 13:17, "Jan Beulich" <JBeulich@novell.com> wrote: > ... since these get statically mis-predicted by most CPUs and increase > the cache footprint. This mostly concerns hypercall tracing and vm86 > mode handling. > > Signed-off-by: Jan Beulich <jbeulich@novell.com> Oops, I'm sorry - I just noticed that I had already submitted this one. Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel