1: enhance rsp-relative calculations
2: use compiler visible "add" instead of inline assembly "or" in
   get_cpu_info()
3: slightly streamline __prepare_to_wait() inline assembly
4: clean up interrupt stub generation

Note that some of this may look less worthwhile now that the
unification of 32- and 64-bit code isn't an aspect anymore, but
I think the net result is still an improvement, so I decided to
retain and post these patches unchanged (apart from dropping
the 32-bit specific pieces).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
The use of "or" in GET_CPUINFO_FIELD so far wasn''t ideal, as it doesn''t lend itself to folding this operation with a possibly subsequent one (e.g. the well known mov+add=lea conversion). Split out the sub- operations, and shorten assembly code slightly with this. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -445,10 +445,10 @@ domain_crash_synchronous_string: ENTRY(domain_crash_synchronous) # Get out of the guest-save area of the stack. - GET_CPUINFO_FIELD(CPUINFO_guest_cpu_user_regs,%rax) - movq %rax,%rsp + GET_STACK_BASE(%rax) + leaq STACK_CPUINFO_FIELD(guest_cpu_user_regs)(%rax),%rsp # create_bounce_frame() temporarily clobbers CS.RPL. Fix up. - GET_CURRENT(%rax) + __GET_CURRENT(%rax) movq VCPU_domain(%rax),%rax testb $1,DOMAIN_is_32bit_pv(%rax) setz %al @@ -622,7 +622,7 @@ handle_ist_exception: testb $3,UREGS_cs(%rsp) jz 1f /* Interrupted guest context. Copy the context to stack bottom. */ - GET_CPUINFO_FIELD(CPUINFO_guest_cpu_user_regs,%rdi) + GET_CPUINFO_FIELD(guest_cpu_user_regs,%rdi) movq %rsp,%rsi movl $UREGS_kernel_sizeof/8,%ecx movq %rdi,%rsp --- a/xen/include/asm-x86/asm_defns.h +++ b/xen/include/asm-x86/asm_defns.h @@ -44,6 +44,21 @@ void ret_from_intr(void); .subsection 0; \ .Llikely.tag: +#define STACK_CPUINFO_FIELD(field) (STACK_SIZE-CPUINFO_sizeof+CPUINFO_##field) +#define GET_STACK_BASE(reg) \ + movq $~(STACK_SIZE-1),reg; \ + andq %rsp,reg + +#define GET_CPUINFO_FIELD(field, reg) \ + GET_STACK_BASE(reg); \ + addq $STACK_CPUINFO_FIELD(field),reg + +#define __GET_CURRENT(reg) \ + movq STACK_CPUINFO_FIELD(current_vcpu)(reg),reg +#define GET_CURRENT(reg) \ + GET_STACK_BASE(reg); \ + __GET_CURRENT(reg) + #endif #endif /* __X86_ASM_DEFNS_H__ */ --- a/xen/include/asm-x86/x86_64/asm_defns.h +++ b/xen/include/asm-x86/x86_64/asm_defns.h @@ -111,14 +111,6 @@ STR(IRQ) #nr "_interrupt:\n\t" "movl $"#nr",4(%rsp)\n\t" \ "jmp common_interrupt"); -#define GET_CPUINFO_FIELD(field,reg) \ - movq $~(STACK_SIZE-1),reg; \ - andq %rsp,reg; \ - orq $(STACK_SIZE-CPUINFO_sizeof+field),reg; -#define GET_CURRENT(reg) \ - GET_CPUINFO_FIELD(CPUINFO_current_vcpu,reg) \ - movq (reg),reg; - #ifdef __ASSEMBLY__ # define _ASM_EX(p) p-. #else
Jan Beulich
2012-Sep-21 12:21 UTC
[PATCH 2/4] x86: use compiler visible "add" instead of inline assembly "or" in get_cpu_info()
This follows the same idea as the previous patch, just that the
effect is much more visible here: With a half-way [dr]ecent gcc this
reduced .text size by over 12k for me.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -25,12 +25,9 @@ struct cpu_info {
 
 static inline struct cpu_info *get_cpu_info(void)
 {
-    struct cpu_info *cpu_info;
-    __asm__ ( "and %%"__OP"sp,%0; or %2,%0"
-              : "=r" (cpu_info)
-              : "0" (~(STACK_SIZE-1)), "i" (STACK_SIZE-sizeof(struct cpu_info))
-        );
-    return cpu_info;
+    unsigned long tos;
+    __asm__ ( "and %%rsp,%0" : "=r" (tos) : "0" (~(STACK_SIZE-1)) );
+    return (struct cpu_info *)(tos + STACK_SIZE) - 1;
 }
 
 #define get_current()         (get_cpu_info()->current_vcpu)
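A minimal standalone sketch of the resulting pattern (assumptions: a
power-of-two STACK_SIZE and a made-up cpu_info layout; the real Xen
structure and offsets differ). Only the "and" remains opaque inline
assembly; the "+ STACK_SIZE" and "- 1" are ordinary C arithmetic, so
gcc can fold a field access such as get_cpu_info()->current_vcpu into
the and plus a single mov with displacement, with no separate or/add
instruction:

#include <stdio.h>

#define STACK_SIZE 0x8000UL

struct cpu_info {                /* made-up layout */
    unsigned int processor_id;
    void *current_vcpu;
};

static inline struct cpu_info *get_cpu_info(void)
{
    unsigned long tos;
    __asm__ ( "and %%rsp,%0" : "=r" (tos) : "0" (~(STACK_SIZE-1)) );
    return (struct cpu_info *)(tos + STACK_SIZE) - 1;
}

int main(void)
{
    /* An ordinary process stack is not STACK_SIZE-aligned the way
     * Xen's is, so only print the would-be address here rather than
     * dereferencing it. */
    printf("would-be cpu_info at %p\n", (void *)get_cpu_info());
    return 0;
}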
Jan Beulich
2012-Sep-21 12:22 UTC
[PATCH 3/4] x86: slightly streamline __prepare_to_wait() inline assembly
Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/common/wait.c
+++ b/xen/common/wait.c
@@ -143,15 +143,13 @@ static void __prepare_to_wait(struct wai
         "push %%rax; push %%rbx; push %%rdx; "
         "push %%rbp; push %%r8; push %%r9; push %%r10; push %%r11; "
         "push %%r12; push %%r13; push %%r14; push %%r15; call 1f; "
-        "1: mov %%rsp,%%rsi; addq $2f-1b,(%%rsp); "
-        "sub %%rsi,%%rcx; cmp %3,%%rcx; jbe 2f; "
-        "xor %%esi,%%esi; jmp 3f; "
-        "2: rep movsb; mov %%rsp,%%rsi; 3: pop %%rax; "
+        "1: addq $2f-1b,(%%rsp); sub %%esp,%%ecx; cmp %3,%%ecx; ja 3f; "
+        "mov %%rsp,%%rsi; 2: rep movsb; mov %%rsp,%%rsi; 3: pop %%rax; "
         "pop %%r15; pop %%r14; pop %%r13; pop %%r12; "
         "pop %%r11; pop %%r10; pop %%r9; pop %%r8; "
         "pop %%rbp; pop %%rdx; pop %%rbx; pop %%rax"
         : "=&S" (wqv->esp), "=&c" (dummy), "=&D" (dummy)
-        : "i" (PAGE_SIZE), "1" (cpu_info), "2" (wqv->stack)
+        : "i" (PAGE_SIZE), "0" (0), "1" (cpu_info), "2" (wqv->stack)
         : "memory" );
 
     if ( unlikely(wqv->esp == 0) )
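The patch itself carries no description, so a note on what it changes
may help: %rsi is now preloaded with 0 via the added "0" (0) input,
which removes the explicit xor/jmp failure path, and subtracting %esp
directly makes the early mov %rsp,%rsi unnecessary. A rough C model of
the overall operation (names and types here are made up; the real code
must be assembly because it snapshots the live hypervisor stack,
including the just-pushed GPRs):

#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096UL

struct waitqueue_vcpu {
    char *esp;               /* 0 => frame did not fit; caller crashes domain */
    char stack[PAGE_SIZE];   /* per-vcpu snapshot buffer */
};

static void prepare_to_wait_model(struct waitqueue_vcpu *wqv,
                                  char *stack_ptr,   /* %rsp after the pushes */
                                  char *cpu_info)    /* block at the stack top */
{
    size_t len = (size_t)(cpu_info - stack_ptr);

    if ( len > PAGE_SIZE )          /* the "cmp %3,%%ecx; ja 3f" path */
    {
        wqv->esp = NULL;            /* %rsi was preloaded with 0 */
        return;
    }
    memcpy(wqv->stack, stack_ptr, len);   /* the "rep movsb" */
    wqv->esp = stack_ptr;                 /* resume stack pointer */
}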
Apart from moving some code that is only used here from the header
file to the actual source one, this also
- moves interrupt[] into .init.data,
- prevents generating (unused) stubs for vectors below
  FIRST_DYNAMIC_VECTOR, and
- shortens and sanitizes the names of the stubs.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/i8259.c
+++ b/xen/arch/x86/i8259.c
@@ -37,26 +37,35 @@ __asm__(".section .text");
 
 BUILD_COMMON_IRQ()
 
-#define BI(x,y) \
-    BUILD_IRQ(x##y)
+#define IRQ_NAME(nr) VEC##nr##_interrupt
+
+#define BI(nr)                                               \
+void IRQ_NAME(nr)(void);                                     \
+__asm__(                                                     \
+".if " STR(0x##nr) " >= " STR(FIRST_DYNAMIC_VECTOR) "\n"    \
+__ALIGN_STR "\n"                                             \
+STR(IRQ_NAME(nr)) ":\n\t"                                    \
+BUILD_IRQ(0x##nr) "\n"                                       \
+".else\n"                                                    \
+".equ " STR(IRQ_NAME(nr)) ", 0\n"                            \
+".endif\n")
 
 #define BUILD_16_IRQS(x) \
-    BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
-    BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
-    BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
-    BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-BUILD_16_IRQS(0x0) BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
+    BI(x##0); BI(x##1); BI(x##2); BI(x##3); \
+    BI(x##4); BI(x##5); BI(x##6); BI(x##7); \
+    BI(x##8); BI(x##9); BI(x##a); BI(x##b); \
+    BI(x##c); BI(x##d); BI(x##e); BI(x##f)
+
+BUILD_16_IRQS(0); BUILD_16_IRQS(1); BUILD_16_IRQS(2); BUILD_16_IRQS(3);
+BUILD_16_IRQS(4); BUILD_16_IRQS(5); BUILD_16_IRQS(6); BUILD_16_IRQS(7);
+BUILD_16_IRQS(8); BUILD_16_IRQS(9); BUILD_16_IRQS(a); BUILD_16_IRQS(b);
+BUILD_16_IRQS(c); BUILD_16_IRQS(d); BUILD_16_IRQS(e); BUILD_16_IRQS(f);
 
 #undef BUILD_16_IRQS
 #undef BI
 
-#define IRQ(x,y) \
-    IRQ##x##y##_interrupt
+#define IRQ(x,y) IRQ_NAME(x##y)
 
 #define IRQLIST_16(x) \
     IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
@@ -64,12 +73,12 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BU
     IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
     IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
 
-    static void (*interrupt[])(void) = {
-        IRQLIST_16(0x0), IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
-        IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
-        IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
-        IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-    };
+static void (*__initdata interrupt[NR_VECTORS])(void) = {
+    IRQLIST_16(0), IRQLIST_16(1), IRQLIST_16(2), IRQLIST_16(3),
+    IRQLIST_16(4), IRQLIST_16(5), IRQLIST_16(6), IRQLIST_16(7),
+    IRQLIST_16(8), IRQLIST_16(9), IRQLIST_16(a), IRQLIST_16(b),
+    IRQLIST_16(c), IRQLIST_16(d), IRQLIST_16(e), IRQLIST_16(f)
+};
 
 #undef IRQ
 #undef IRQLIST_16
@@ -400,6 +409,7 @@ void __init init_IRQ(void)
     {
         if (vector == HYPERCALL_VECTOR || vector == LEGACY_SYSCALL_VECTOR)
             continue;
+        BUG_ON(!interrupt[vector]);
         set_intr_gate(vector, interrupt[vector]);
     }
--- a/xen/include/asm-x86/x86_64/asm_defns.h
+++ b/xen/include/asm-x86/x86_64/asm_defns.h
@@ -99,17 +99,10 @@ __asm__(
     "callq " STR(do_IRQ) "\n\t"                     \
     "jmp ret_from_intr\n");
 
-#define IRQ_NAME2(nr) nr##_interrupt(void)
-#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
 #define BUILD_IRQ(nr)                               \
-void IRQ_NAME(nr);                                  \
-__asm__(                                            \
-"\n"__ALIGN_STR"\n"                                 \
-STR(IRQ) #nr "_interrupt:\n\t"                      \
     "pushq $0\n\t"                                  \
     "movl $"#nr",4(%rsp)\n\t"                       \
-    "jmp common_interrupt");
+    "jmp common_interrupt"
 
 #ifdef __ASSEMBLY__
 # define _ASM_EX(p) p-.
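To see the .if/.equ trick in isolation, here is a reduced standalone
demo (hypothetical throughout: the 0x20 threshold stands in for
FIRST_DYNAMIC_VECTOR, a bare "ret" stands in for the real BUILD_IRQ()
body, and it should be built with gcc -fno-pie so the absolute
zero-valued symbols link without PIC relocation issues). Vectors below
the threshold produce no code at all, only a symbol equated to 0,
which is what the new BUG_ON(!interrupt[vector]) in init_IRQ() guards
against:

#include <stdio.h>

#define STR_(x) #x
#define STR(x)  STR_(x)

#define FIRST_DYNAMIC_VECTOR 0x20    /* hypothetical threshold */

#define IRQ_NAME(nr) VEC##nr##_interrupt

#define BI(nr)                                               \
void IRQ_NAME(nr)(void);                                     \
__asm__(                                                     \
".if " STR(0x##nr) " >= " STR(FIRST_DYNAMIC_VECTOR) "\n"    \
STR(IRQ_NAME(nr)) ":\n\t"                                    \
"ret\n"              /* stand-in for BUILD_IRQ(0x##nr) */    \
".else\n"                                                    \
".equ " STR(IRQ_NAME(nr)) ", 0\n"                            \
".endif\n")

BI(1f);  /* 0x1f < 0x20: only ".equ VEC1f_interrupt, 0" is emitted */
BI(21);  /* 0x21 >= 0x20: a real VEC21_interrupt stub is generated */

static void (*const vec[])(void) = { IRQ_NAME(1f), IRQ_NAME(21) };

int main(void)
{
    unsigned int i;

    /* The suppressed stub resolves to NULL, the real one to code. */
    for ( i = 0; i < 2; ++i )
        printf("vec[%u] = %p\n", i, (void *)vec[i]);
    return 0;
}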
On 21/09/2012 13:17, "Jan Beulich" <JBeulich@suse.com> wrote:

> 1: enhance rsp-relative calculations
> 2: use compiler visible "add" instead of inline assembly "or" in
>    get_cpu_info()
> 3: slightly streamline __prepare_to_wait() inline assembly
> 4: clean up interrupt stub generation
>
> Note that some of this may look less worthwhile now that the
> unification of 32- and 64-bit code isn't an aspect anymore, but
> I think the net result is still an improvement, so I decided to
> retain and post these patches unchanged (apart from dropping
> the 32-bit specific pieces).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

I don't see anything really contentious here.

Acked-by: Keir Fraser <keir@xen.org>