1: enhance rsp-relative calculations
2: use compiler visible "add" instead of inline assembly "or" in
   get_cpu_info()
3: slightly streamline __prepare_to_wait() inline assembly
4: clean up interrupt stub generation

Note that some of this may look less worthwhile now that the
unification of 32- and 64-bit code isn't an aspect anymore, but I
think the net result is still an improvement, so I decided to retain
and post these patches unchanged (apart from dropping the 32-bit
specific pieces).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

[PATCH 1/4] x86: enhance rsp-relative calculations
The use of "or" in GET_CPUINFO_FIELD so far wasn''t ideal, as
it doesn''t
lend itself to folding this operation with a possibly subsequent one
(e.g. the well known mov+add=lea conversion). Split out the sub-
operations, and shorten assembly code slightly with this.
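To illustrate (this is merely the expansion of the old and new macros
from the hunks below, not separate code), a GET_CURRENT-style access
changes like so:

    /* Before: "or" completes the address in the register, leaving
       the subsequent load with a zero-displacement memory operand. */
    movq $~(STACK_SIZE-1),%rax
    andq %rsp,%rax
    orq  $(STACK_SIZE-CPUINFO_sizeof+CPUINFO_current_vcpu),%rax
    movq (%rax),%rax

    /* After: the constant is an ordinary displacement and folds into
       the memory operand of the load. */
    movq $~(STACK_SIZE-1),%rax
    andq %rsp,%rax
    movq STACK_CPUINFO_FIELD(current_vcpu)(%rax),%rax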
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -445,10 +445,10 @@ domain_crash_synchronous_string:
ENTRY(domain_crash_synchronous)
# Get out of the guest-save area of the stack.
- GET_CPUINFO_FIELD(CPUINFO_guest_cpu_user_regs,%rax)
- movq %rax,%rsp
+ GET_STACK_BASE(%rax)
+ leaq STACK_CPUINFO_FIELD(guest_cpu_user_regs)(%rax),%rsp
# create_bounce_frame() temporarily clobbers CS.RPL. Fix up.
- GET_CURRENT(%rax)
+ __GET_CURRENT(%rax)
movq VCPU_domain(%rax),%rax
testb $1,DOMAIN_is_32bit_pv(%rax)
setz %al
@@ -622,7 +622,7 @@ handle_ist_exception:
testb $3,UREGS_cs(%rsp)
jz 1f
/* Interrupted guest context. Copy the context to stack bottom. */
- GET_CPUINFO_FIELD(CPUINFO_guest_cpu_user_regs,%rdi)
+ GET_CPUINFO_FIELD(guest_cpu_user_regs,%rdi)
movq %rsp,%rsi
movl $UREGS_kernel_sizeof/8,%ecx
movq %rdi,%rsp
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -44,6 +44,21 @@ void ret_from_intr(void);
.subsection 0; \
.Llikely.tag:
+#define STACK_CPUINFO_FIELD(field) (STACK_SIZE-CPUINFO_sizeof+CPUINFO_##field)
+#define GET_STACK_BASE(reg) \
+ movq $~(STACK_SIZE-1),reg; \
+ andq %rsp,reg
+
+#define GET_CPUINFO_FIELD(field, reg) \
+ GET_STACK_BASE(reg); \
+ addq $STACK_CPUINFO_FIELD(field),reg
+
+#define __GET_CURRENT(reg) \
+ movq STACK_CPUINFO_FIELD(current_vcpu)(reg),reg
+#define GET_CURRENT(reg) \
+ GET_STACK_BASE(reg); \
+ __GET_CURRENT(reg)
+
#endif
#endif /* __X86_ASM_DEFNS_H__ */
--- a/xen/include/asm-x86/x86_64/asm_defns.h
+++ b/xen/include/asm-x86/x86_64/asm_defns.h
@@ -111,14 +111,6 @@ STR(IRQ) #nr "_interrupt:\n\t"
"movl $"#nr",4(%rsp)\n\t" \
"jmp common_interrupt");
-#define GET_CPUINFO_FIELD(field,reg) \
- movq $~(STACK_SIZE-1),reg; \
- andq %rsp,reg; \
- orq $(STACK_SIZE-CPUINFO_sizeof+field),reg;
-#define GET_CURRENT(reg) \
- GET_CPUINFO_FIELD(CPUINFO_current_vcpu,reg) \
- movq (reg),reg;
-
#ifdef __ASSEMBLY__
# define _ASM_EX(p) p-.
#else
Jan Beulich
2012-Sep-21 12:21 UTC
[PATCH 2/4] x86: use compiler visible "add" instead of inline assembly "or" in get_cpu_info()
This follows the same idea as the previous patch, just that the effect
is much more visible here: With a half-way [dr]ecent gcc this reduced
.text size by over 12k for me.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -25,12 +25,9 @@ struct cpu_info {
static inline struct cpu_info *get_cpu_info(void)
{
- struct cpu_info *cpu_info;
- __asm__ ( "and %%"__OP"sp,%0; or %2,%0"
-           : "=r" (cpu_info)
-           : "0" (~(STACK_SIZE-1)), "i" (STACK_SIZE-sizeof(struct cpu_info)) );
- return cpu_info;
+ unsigned long tos;
+ __asm__ ( "and %%rsp,%0" : "=r" (tos) : "0" (~(STACK_SIZE-1)) );
+ return (struct cpu_info *)(tos + STACK_SIZE) - 1;
}
#define get_current() (get_cpu_info()->current_vcpu)
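To see where the savings come from, consider (my sketch of the
expected code generation, not code from the patch) a typical use:

    struct vcpu *v = get_current();

With the "+ STACK_SIZE" and "- 1" now done in compiler visible
arithmetic, this can compile to something like

    movq $~(STACK_SIZE-1),%rax
    andq %rsp,%rax
    movq <disp>(%rax),%rax    # <disp> = STACK_SIZE
                              #   - sizeof(struct cpu_info)
                              #   + offsetof field

i.e. the constants fold into the load's displacement, whereas the old
"or" hidden inside the asm forced an extra instruction at every use
site of get_cpu_info().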
Jan Beulich
2012-Sep-21 12:22 UTC
[PATCH 3/4] x86: slightly streamline __prepare_to_wait() inline assembly
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/common/wait.c
+++ b/xen/common/wait.c
@@ -143,15 +143,13 @@ static void __prepare_to_wait(struct wai
"push %%rax; push %%rbx; push %%rdx; "
"push %%rbp; push %%r8; push %%r9; push %%r10; push %%r11; "
"push %%r12; push %%r13; push %%r14; push %%r15; call 1f; "
- "1: mov %%rsp,%%rsi; addq $2f-1b,(%%rsp); "
- "sub %%rsi,%%rcx; cmp %3,%%rcx; jbe 2f; "
- "xor %%esi,%%esi; jmp 3f; "
- "2: rep movsb; mov %%rsp,%%rsi; 3: pop %%rax; "
+ "1: addq $2f-1b,(%%rsp); sub %%esp,%%ecx; cmp %3,%%ecx; jbe 3f;
"
+ "mov %%rsp,%%rsi; 2: rep movsb; mov %%rsp,%%rsi; 3: pop %%rax;
"
"pop %%r15; pop %%r14; pop %%r13; pop %%r12; "
"pop %%r11; pop %%r10; pop %%r9; pop %%r8; "
"pop %%rbp; pop %%rdx; pop %%rbx; pop %%rax"
: "=&S" (wqv->esp), "=&c" (dummy),
"=&D" (dummy)
- : "i" (PAGE_SIZE), "1" (cpu_info), "2"
(wqv->stack)
+ : "i" (PAGE_SIZE), "0" (0), "1"
(cpu_info), "2" (wqv->stack)
: "memory" );
if ( unlikely(wqv->esp == 0) )
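For reference, a rough C rendering of what the (new) asm sequence does
-- an illustration only, since the real code must stay at the asm
level (it snapshots its own live stack), and all names here are just
for exposition:

    unsigned long used = (unsigned long)cpu_info - sp;  /* sp = stack
                                   pointer after the register pushes */
    if ( used > PAGE_SIZE )
        wqv->esp = 0;              /* doesn't fit: report failure; rsi
                                      stays at its "0" (0) input value */
    else
    {
        memcpy(wqv->stack, (void *)sp, used); /* save the used stack */
        wqv->esp = (void *)sp;     /* record where to restore it */
    }

Note that the return address fixed up by "addq $2f-1b,(%%rsp)" points
at label 2, so the woken-up path can resume at the same "rep movsb",
re-using it to copy the saved stack contents back.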
[PATCH 4/4] x86: clean up interrupt stub generation

Apart from moving some code that is only used here from the header
file to the actual source file, this also
- moves interrupt[] into .init.data,
- prevents generating (unused) stubs for vectors below
  FIRST_DYNAMIC_VECTOR, and
- shortens and sanitizes the names of the stubs.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/i8259.c
+++ b/xen/arch/x86/i8259.c
@@ -37,26 +37,35 @@ __asm__(".section .text");
BUILD_COMMON_IRQ()
-#define BI(x,y) \
- BUILD_IRQ(x##y)
+#define IRQ_NAME(nr) VEC##nr##_interrupt
+
+#define BI(nr) \
+void IRQ_NAME(nr)(void); \
+__asm__( \
+".if " STR(0x##nr) " >= " STR(FIRST_DYNAMIC_VECTOR)
"\n" \
+__ALIGN_STR "\n" \
+STR(IRQ_NAME(nr)) ":\n\t" \
+BUILD_IRQ(0x##nr) "\n" \
+".else\n" \
+".equ " STR(IRQ_NAME(nr)) ", 0\n" \
+".endif\n")
#define BUILD_16_IRQS(x) \
- BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
- BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
- BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
- BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-BUILD_16_IRQS(0x0) BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
+ BI(x##0); BI(x##1); BI(x##2); BI(x##3); \
+ BI(x##4); BI(x##5); BI(x##6); BI(x##7); \
+ BI(x##8); BI(x##9); BI(x##a); BI(x##b); \
+ BI(x##c); BI(x##d); BI(x##e); BI(x##f)
+
+BUILD_16_IRQS(0); BUILD_16_IRQS(1); BUILD_16_IRQS(2); BUILD_16_IRQS(3);
+BUILD_16_IRQS(4); BUILD_16_IRQS(5); BUILD_16_IRQS(6); BUILD_16_IRQS(7);
+BUILD_16_IRQS(8); BUILD_16_IRQS(9); BUILD_16_IRQS(a); BUILD_16_IRQS(b);
+BUILD_16_IRQS(c); BUILD_16_IRQS(d); BUILD_16_IRQS(e); BUILD_16_IRQS(f);
#undef BUILD_16_IRQS
#undef BI
-#define IRQ(x,y) \
- IRQ##x##y##_interrupt
+#define IRQ(x,y) IRQ_NAME(x##y)
#define IRQLIST_16(x) \
IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
@@ -64,12 +73,12 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BU
IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
- static void (*interrupt[])(void) = {
- IRQLIST_16(0x0), IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
- IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
- IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
- IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
- };
+static void (*__initdata interrupt[NR_VECTORS])(void) = {
+ IRQLIST_16(0), IRQLIST_16(1), IRQLIST_16(2), IRQLIST_16(3),
+ IRQLIST_16(4), IRQLIST_16(5), IRQLIST_16(6), IRQLIST_16(7),
+ IRQLIST_16(8), IRQLIST_16(9), IRQLIST_16(a), IRQLIST_16(b),
+ IRQLIST_16(c), IRQLIST_16(d), IRQLIST_16(e), IRQLIST_16(f)
+};
#undef IRQ
#undef IRQLIST_16
@@ -400,6 +409,7 @@ void __init init_IRQ(void)
{
if (vector == HYPERCALL_VECTOR || vector == LEGACY_SYSCALL_VECTOR)
continue;
+ BUG_ON(!interrupt[vector]);
set_intr_gate(vector, interrupt[vector]);
}
--- a/xen/include/asm-x86/x86_64/asm_defns.h
+++ b/xen/include/asm-x86/x86_64/asm_defns.h
@@ -99,17 +99,10 @@ __asm__(
"callq " STR(do_IRQ) "\n\t" \
"jmp ret_from_intr\n");
-#define IRQ_NAME2(nr) nr##_interrupt(void)
-#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
#define BUILD_IRQ(nr) \
-void IRQ_NAME(nr); \
-__asm__( \
-"\n"__ALIGN_STR"\n" \
-STR(IRQ) #nr "_interrupt:\n\t" \
"pushq $0\n\t" \
"movl $"#nr",4(%rsp)\n\t" \
- "jmp common_interrupt");
+ "jmp common_interrupt"
#ifdef __ASSEMBLY__
# define _ASM_EX(p) p-.
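As an illustration of the result (my expansion of the new macros, not
text from the patch): for a dynamic vector, e.g. BI(a0), the stub now
comes out as

    VECa0_interrupt:
        pushq $0
        movl  $0xa0,4(%rsp)
        jmp   common_interrupt

while for a vector below FIRST_DYNAMIC_VECTOR the .else branch merely
emits ".equ VECnn_interrupt, 0", so no stub code is generated and the
corresponding interrupt[] slot is zero -- which the new BUG_ON() in
init_IRQ() guards against for the vectors that actually get set up.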
On 21/09/2012 13:17, "Jan Beulich" <JBeulich@suse.com> wrote:

> 1: enhance rsp-relative calculations
> 2: use compiler visible "add" instead of inline assembly "or" in
>    get_cpu_info()
> 3: slightly streamline __prepare_to_wait() inline assembly
> 4: clean up interrupt stub generation
>
> Note that some of this may look less worthwhile now that the
> unification of 32- and 64-bit code isn't an aspect anymore, but
> I think the net result is still an improvement, so I decided to
> retain and post these patches unchanged (apart from dropping
> the 32-bit specific pieces).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

I don't see anything really contentious here.

Acked-by: Keir Fraser <keir@xen.org>