Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 0/6] mini-os: check and fix up against nested events in x86-64 kernel entry
mini-os almost always uses a direct iret to return from interrupts. But this
operation is not atomic, because Xen uses an event mask to enable/disable
event delivery. So there is a window in which nested events can happen, after
event delivery has been re-enabled and before the direct iret. The issues
arising from this non-atomicity have been discussed in:

http://lists.xen.org/archives/html/xen-devel/2007-06/msg00142.html

And also on xen-devel:

http://markmail.org/message/jkzhzy6fyes6igcf

This series checks for and fixes up nested events in a fashion similar to
32-bit mini-os and to Fitzhardinge's pvops code (who is also CCed). It checks
for re-entry of the critical section in the event-handling callback, and
tries to fix up by looking up the number of bytes already restored when the
second event arrived and coalescing the two stack frames into one, resuming
execution as if the second event had never happened. It also refactors
mini-os's x86-64 kernel entry assembly code.

Xu Zhang (6):
  mini-os/x86-64 entry: code clean-ups
  mini-os/x86-64 entry: define macros for registers partial save and restore
  mini-os/x86-64 entry: code refactoring; no functional changes
  mini-os/x86-64 entry: remove unnecessary event blocking
  mini-os/x86-64 entry: defer RESTORE_REST until return
  mini-os/x86-64 entry: check against nested events and try to fix up

 extras/mini-os/arch/x86/x86_64.S |  261 +++++++++++++++++++++++++-------------
 1 files changed, 175 insertions(+), 86 deletions(-)

--
1.7.7.6
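To make the window described in the cover letter concrete, here is a minimal
C model of the race — a sketch only, not mini-os code: upcall_mask and
upcall_pending stand in for evtchn_upcall_mask/evtchn_upcall_pending in
shared_info_t, and the functions are stand-ins for the assembly paths.

#include <stdio.h>

static volatile unsigned char upcall_mask = 1;   /* 1 = delivery blocked */
static volatile unsigned char upcall_pending;    /* set by Xen           */

static void handle_events(void)     { puts("handling events"); }
static void restore_registers(void) { puts("popping saved regs"); }
static void direct_iret(void)       { puts("iretq"); }

static void return_from_event(void)
{
    upcall_mask = 0;         /* XEN_UNBLOCK_EVENTS: delivery is live now,  */
                             /* but the return still takes several steps   */
    if (upcall_pending)      /* XEN_TEST_PENDING / jnz 14f                 */
        handle_events();
    restore_registers();     /* RESTORE_ALL: an event arriving here nests  */
                             /* on a partially restored frame              */
    direct_iret();           /* HYPERVISOR_IRET 0: nothing since unmasking */
                             /* made these steps atomic                    */
}

int main(void) { return_from_event(); return 0; }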
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 1/6] mini-os/x86-64 entry: code clean-ups
Make use of tabs and spaces consistent in arch/x86/x86_64.S

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |   54 +++++++++++++++++++-------------------
 1 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index a65e5d5..6139c2d 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -111,9 +111,9 @@ error_call_handler:
 	jmp error_exit
 
 .macro zeroentry sym
-    movq (%rsp),%rcx
-    movq 8(%rsp),%r11
-    addq $0x10,%rsp /* skip rcx and r11 */
+	movq (%rsp),%rcx
+	movq 8(%rsp),%r11
+	addq $0x10,%rsp		/* skip rcx and r11 */
 	pushq $0		/* push error code/oldrax */
 	pushq %rax		/* push real oldrax to the rdi slot */
 	leaq  \sym(%rip),%rax
@@ -121,9 +121,9 @@ error_call_handler:
 .endm
 
 .macro errorentry sym
-    movq (%rsp),%rcx
-    movq 8(%rsp),%r11
-    addq $0x10,%rsp /* rsp points to the error code */
+	movq (%rsp),%rcx
+	movq 8(%rsp),%r11
+	addq $0x10,%rsp		/* rsp points to the error code */
 	pushq %rax
 	leaq  \sym(%rip),%rax
 	jmp error_entry
@@ -147,18 +147,18 @@ error_call_handler:
 
 ENTRY(hypervisor_callback)
-    zeroentry hypervisor_callback2
+	zeroentry hypervisor_callback2
 
 ENTRY(hypervisor_callback2)
-    movq %rdi, %rsp
-11: movq %gs:8,%rax
-    incl %gs:0
-    cmovzq %rax,%rsp
-    pushq %rdi
-    call do_hypervisor_callback
-    popq %rsp
-    decl %gs:0
-    jmp error_exit
+	movq %rdi, %rsp
+11:	movq %gs:8,%rax
+	incl %gs:0
+	cmovzq %rax,%rsp
+	pushq %rdi
+	call do_hypervisor_callback
+	popq %rsp
+	decl %gs:0
+	jmp error_exit
 
 restore_all_enable_events:
 	XEN_UNBLOCK_EVENTS(%rsi)	# %rsi is already set up...
@@ -167,19 +167,19 @@ scrit:	/**** START OF CRITICAL REGION ****/
 	XEN_TEST_PENDING(%rsi)
 	jnz  14f			# process more events if necessary...
 	XEN_PUT_VCPU_INFO(%rsi)
-    RESTORE_ALL
-    HYPERVISOR_IRET 0
-    
+	RESTORE_ALL
+	HYPERVISOR_IRET 0
+
 14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
 	XEN_PUT_VCPU_INFO(%rsi)
 	subq $6*8,%rsp
-    movq %rbx,5*8(%rsp)
-    movq %rbp,4*8(%rsp)
-    movq %r12,3*8(%rsp)
-    movq %r13,2*8(%rsp)
-    movq %r14,1*8(%rsp)
-    movq %r15,(%rsp)
-    movq %rsp,%rdi # set the argument again
+	movq %rbx,5*8(%rsp)
+	movq %rbp,4*8(%rsp)
+	movq %r12,3*8(%rsp)
+	movq %r13,2*8(%rsp)
+	movq %r14,1*8(%rsp)
+	movq %r15,(%rsp)
+	movq %rsp,%rdi			# set the argument again
 	jmp  11b
 ecrit:	/**** END OF CRITICAL REGION ****/
@@ -193,7 +193,7 @@ retint_restore_args:
 	andb $1,%al			# EAX[0] == IRET_EFLAGS.IF & event_mask
 	jnz restore_all_enable_events	#        != 0 => enable event delivery
 	XEN_PUT_VCPU_INFO(%rsi)
-    
+
 	RESTORE_ALL
 	HYPERVISOR_IRET 0
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 2/6] mini-os/x86-64 entry: define macros for registers partial save and restore
For saving and restoring the registers rbx, rbp, and r12-r15, define and use
the macros SAVE_REST and RESTORE_REST, respectively.

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |   38 +++++++++++++++++++++++---------------
 1 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index 6139c2d..addb7b1 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -46,6 +46,26 @@ NMI_MASK = 0x80000000
 #define ORIG_RAX 120       /* + error_code */
 #define EFLAGS 144
 
+.macro SAVE_REST
+	subq $6*8,%rsp
+	movq %rbx,5*8(%rsp)
+	movq %rbp,4*8(%rsp)
+	movq %r12,3*8(%rsp)
+	movq %r13,2*8(%rsp)
+	movq %r14,1*8(%rsp)
+	movq %r15,(%rsp)
+.endm
+
+.macro RESTORE_REST
+	movq (%rsp),%r15
+	movq 1*8(%rsp),%r14
+	movq 2*8(%rsp),%r13
+	movq 3*8(%rsp),%r12
+	movq 4*8(%rsp),%rbp
+	movq 5*8(%rsp),%rbx
+	addq $6*8,%rsp
+.endm
+
 .macro RESTORE_ALL
 	movq (%rsp),%r11
 	movq 1*8(%rsp),%r10
@@ -172,13 +192,7 @@ scrit:	/**** START OF CRITICAL REGION ****/
 
 14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
 	XEN_PUT_VCPU_INFO(%rsi)
-	subq $6*8,%rsp
-	movq %rbx,5*8(%rsp)
-	movq %rbp,4*8(%rsp)
-	movq %r12,3*8(%rsp)
-	movq %r13,2*8(%rsp)
-	movq %r14,1*8(%rsp)
-	movq %r15,(%rsp)
+	SAVE_REST
 	movq %rsp,%rdi			# set the argument again
 	jmp  11b
 ecrit:	/**** END OF CRITICAL REGION ****/
@@ -198,14 +212,8 @@ retint_restore_args:
 	HYPERVISOR_IRET 0
 
 
-error_exit: 
-	movq (%rsp),%r15
-	movq 1*8(%rsp),%r14
-	movq 2*8(%rsp),%r13
-	movq 3*8(%rsp),%r12
-	movq 4*8(%rsp),%rbp
-	movq 5*8(%rsp),%rbx
-	addq $6*8,%rsp
+error_exit:
+	RESTORE_REST
 	XEN_BLOCK_EVENTS(%rsi)
 	jmp retint_kernel
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 3/6] mini-os/x86-64 entry: code refactoring; no functional changes
Re-arrange assembly code blocks so that they are in called order instead of
jumping around, enhancing readability. Macros are grouped together as well.

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |  118 +++++++++++++++++++-------------------
 1 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index addb7b1..79e893f 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -36,6 +36,22 @@ hypercall_page:
 .org 0x3000
 
+#define XEN_GET_VCPU_INFO(reg)	movq HYPERVISOR_shared_info,reg
+#define XEN_PUT_VCPU_INFO(reg)
+#define XEN_PUT_VCPU_INFO_fixup
+#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
+#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
+#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
+
+#define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
+				XEN_LOCKED_BLOCK_EVENTS(reg)	; \
+				XEN_PUT_VCPU_INFO(reg)
+
+#define XEN_UNBLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
+				XEN_LOCKED_UNBLOCK_EVENTS(reg)	; \
+				XEN_PUT_VCPU_INFO(reg)
+
+
 /* Offsets into shared_info_t. */
 #define evtchn_upcall_pending	/* 0 */
 #define evtchn_upcall_mask	1
@@ -46,6 +62,27 @@ NMI_MASK = 0x80000000
 #define ORIG_RAX 120       /* + error_code */
 #define EFLAGS 144
 
+
+/* Macros */
+.macro zeroentry sym
+	movq (%rsp),%rcx
+	movq 8(%rsp),%r11
+	addq $0x10,%rsp		/* skip rcx and r11 */
+	pushq $0		/* push error code/oldrax */
+	pushq %rax		/* push real oldrax to the rdi slot */
+	leaq  \sym(%rip),%rax
+	jmp error_entry
+.endm
+
+.macro errorentry sym
+	movq (%rsp),%rcx
+	movq 8(%rsp),%r11
+	addq $0x10,%rsp		/* rsp points to the error code */
+	pushq %rax
+	leaq  \sym(%rip),%rax
+	jmp error_entry
+.endm
+
 .macro SAVE_REST
 	subq $6*8,%rsp
 	movq %rbx,5*8(%rsp)
@@ -79,7 +116,6 @@ NMI_MASK = 0x80000000
 	addq $9*8+8,%rsp
 .endm
 
-
 .macro HYPERVISOR_IRET flag
 	testl $NMI_MASK,2*8(%rsp)
 	jnz   2f
@@ -98,6 +134,8 @@ NMI_MASK = 0x80000000
 	jmp hypercall_page + (__HYPERVISOR_iret * 32)
 .endm
 
+
+
 /*
  * Exception entry point. This expects an error code/orig_rax on the stack
  * and the exception handler in %rax.
@@ -130,73 +168,24 @@ error_call_handler:
 	call *%rax
 	jmp error_exit
 
-.macro zeroentry sym
-	movq (%rsp),%rcx
-	movq 8(%rsp),%r11
-	addq $0x10,%rsp		/* skip rcx and r11 */
-	pushq $0		/* push error code/oldrax */
-	pushq %rax		/* push real oldrax to the rdi slot */
-	leaq  \sym(%rip),%rax
-	jmp error_entry
-.endm
-
-.macro errorentry sym
-	movq (%rsp),%rcx
-	movq 8(%rsp),%r11
-	addq $0x10,%rsp		/* rsp points to the error code */
-	pushq %rax
-	leaq  \sym(%rip),%rax
-	jmp error_entry
-.endm
-
-#define XEN_GET_VCPU_INFO(reg)	movq HYPERVISOR_shared_info,reg
-#define XEN_PUT_VCPU_INFO(reg)
-#define XEN_PUT_VCPU_INFO_fixup
-#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
-#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
-#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
-
-#define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
-				XEN_LOCKED_BLOCK_EVENTS(reg)	; \
-				XEN_PUT_VCPU_INFO(reg)
-
-#define XEN_UNBLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
-				XEN_LOCKED_UNBLOCK_EVENTS(reg)	; \
-				XEN_PUT_VCPU_INFO(reg)
-
 ENTRY(hypervisor_callback)
 	zeroentry hypervisor_callback2
 
 ENTRY(hypervisor_callback2)
-        movq %rdi, %rsp
+	movq %rdi, %rsp
 11:	movq %gs:8,%rax
 	incl %gs:0
 	cmovzq %rax,%rsp
 	pushq %rdi
-        call do_hypervisor_callback
+	call do_hypervisor_callback
 	popq %rsp
 	decl %gs:0
-	jmp error_exit
-
-restore_all_enable_events:
-	XEN_UNBLOCK_EVENTS(%rsi)	# %rsi is already set up...
-
-scrit:	/**** START OF CRITICAL REGION ****/
-	XEN_TEST_PENDING(%rsi)
-	jnz  14f			# process more events if necessary...
-	XEN_PUT_VCPU_INFO(%rsi)
-	RESTORE_ALL
-	HYPERVISOR_IRET 0
-
-14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
-	XEN_PUT_VCPU_INFO(%rsi)
-	SAVE_REST
-	movq %rsp,%rdi			# set the argument again
-	jmp  11b
-ecrit:	/**** END OF CRITICAL REGION ****/
 
+error_exit:
+	RESTORE_REST
+	XEN_BLOCK_EVENTS(%rsi)
 
 retint_kernel:
 retint_restore_args:
@@ -211,11 +200,22 @@ retint_restore_args:
 	RESTORE_ALL
 	HYPERVISOR_IRET 0
 
+restore_all_enable_events:
+	XEN_UNBLOCK_EVENTS(%rsi)	# %rsi is already set up...
 
-error_exit:
-	RESTORE_REST
-	XEN_BLOCK_EVENTS(%rsi)
-	jmp retint_kernel
+scrit:	/**** START OF CRITICAL REGION ****/
+	XEN_TEST_PENDING(%rsi)
+	jnz  14f			# process more events if necessary...
+	XEN_PUT_VCPU_INFO(%rsi)
+	RESTORE_ALL
+	HYPERVISOR_IRET 0
+
+14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
+	XEN_PUT_VCPU_INFO(%rsi)
+	SAVE_REST
+	movq %rsp,%rdi			# set the argument again
+	jmp  11b
+ecrit:	/**** END OF CRITICAL REGION ****/
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 4/6] mini-os/x86-64 entry: remove unnecessary event blocking
We don't need to block events here because:
 - if we came from "hypervisor_callback", events are disabled at this point,
   no need to block again;
 - if we came from "error_entry", we shouldn't touch event mask, for
   exception hanlding are meant to be interrupted by Xen events (virtual
   irq).

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index 79e893f..5e0021b 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -185,7 +185,6 @@ ENTRY(hypervisor_callback2)
 
 error_exit:
 	RESTORE_REST
-	XEN_BLOCK_EVENTS(%rsi)
 
 retint_kernel:
 retint_restore_args:
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 5/6] mini-os/x86-64 entry: defer RESTORE_REST until return
No need to do a RESTORE_REST at this point because if we saw pending
events after we enabled event delivery, we have to do a SAVE_REST again.
Instead, we do a "lazy" RESTORE_REST, deferring it until immediately before
the actual return.

The macro naming the offset of the saved-on-stack rflags register is
changed as well (EFLAGS -> RFLAGS).

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |   16 ++++++++--------
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index 5e0021b..25add86 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -60,7 +60,7 @@ NMI_MASK = 0x80000000
 
 #define RDI 112
 #define ORIG_RAX 120       /* + error_code */
-#define EFLAGS 144
+#define RFLAGS 144
 
 
 /* Macros */
@@ -184,18 +184,17 @@ ENTRY(hypervisor_callback2)
 	decl %gs:0
 
 error_exit:
-	RESTORE_REST
-
 retint_kernel:
-retint_restore_args:
-	movl EFLAGS-6*8(%rsp), %eax
-	shr $9, %eax			# EAX[0] == IRET_EFLAGS.IF
+	movl RFLAGS(%rsp), %eax
+	shr $9, %eax			# EAX[0] == IRET_RFLAGS.IF
 	XEN_GET_VCPU_INFO(%rsi)
 	andb evtchn_upcall_mask(%rsi),%al
-	andb $1,%al			# EAX[0] == IRET_EFLAGS.IF & event_mask
+	andb $1,%al			# EAX[0] == IRET_RFLAGS.IF & event_mask
 	jnz restore_all_enable_events	#        != 0 => enable event delivery
 	XEN_PUT_VCPU_INFO(%rsi)
 
+retint_restore_args:
+	RESTORE_REST
 	RESTORE_ALL
 	HYPERVISOR_IRET 0
@@ -206,12 +205,13 @@ scrit:	/**** START OF CRITICAL REGION ****/
 	XEN_TEST_PENDING(%rsi)
 	jnz  14f			# process more events if necessary...
 	XEN_PUT_VCPU_INFO(%rsi)
+
+	RESTORE_REST
 	RESTORE_ALL
 	HYPERVISOR_IRET 0
 
 14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
 	XEN_PUT_VCPU_INFO(%rsi)
-	SAVE_REST
 	movq %rsp,%rdi			# set the argument again
 	jmp  11b
 ecrit:	/**** END OF CRITICAL REGION ****/
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
In hypervisor_callback, check for event re-entry. If that's the case,
try to fix up by looking up the number of bytes restored when the
second event came and coalescing the two stack frames.
The execution is resumed as if the second event never happened.

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |   90 ++++++++++++++++++++++++++++++++++++--
 1 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index 25add86..0a13d3e 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -57,10 +57,14 @@ hypercall_page:
 #define evtchn_upcall_mask	1
 
 NMI_MASK = 0x80000000
+KERNEL_CS_MASK = 0xfc
 
-#define RDI 112
-#define ORIG_RAX 120       /* + error_code */
-#define RFLAGS 144
+#define RDI 112
+#define ORIG_RAX 120       /* + error_code */
+#define RIP 128
+#define CS 136
+#define RFLAGS 144
+#define RSP 152
 
 
 /* Macros */
@@ -161,7 +165,7 @@ ENTRY(error_entry)
 	movq %r15,(%rsp)
 
 error_call_handler:
-	movq %rdi, RDI(%rsp) 
+	movq %rdi, RDI(%rsp)
 	movq %rsp,%rdi
 	movq ORIG_RAX(%rsp),%rsi	# get error code
 	movq $-1,ORIG_RAX(%rsp)
@@ -175,6 +179,14 @@ ENTRY(hypervisor_callback)
 
 ENTRY(hypervisor_callback2)
 	movq %rdi, %rsp
+
+	/* check for event re-entry */
+	movq RIP(%rsp),%rax
+	cmpq $scrit,%rax
+	jb   11f
+	cmpq $ecrit,%rax
+	jb   critical_region_fixup
+
 11:	movq %gs:8,%rax
 	incl %gs:0
 	cmovzq %rax,%rsp
@@ -217,6 +229,76 @@ scrit:	/**** START OF CRITICAL REGION ****/
 ecrit:	/**** END OF CRITICAL REGION ****/
 
 
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of bytes which have already been popped from the
+# interrupted stack frame. This is the number of bytes from the current stack
+# that we need to copy at the end of the previous activation frame so that
+# we can continue as if we've never even reached 11 running in the old
+# activation frame.
+critical_region_fixup:
+	addq $critical_fixup_table - scrit, %rax
+	movzbq (%rax),%rax		# %rax contains num bytes popped
+	mov  %rsp,%rsi
+	add  %rax,%rsi			# %esi points at end of src region
+
+	movq RSP(%rsp),%rdi		# acquire interrupted %rsp from current stack frame
+					# %edi points at end of dst region
+	mov  %rax,%rcx
+	shr  $3,%rcx			# convert bytes into count of 64-bit entities
+	je   16f			# skip loop if nothing to copy
+15:	subq $8,%rsi			# pre-decrementing copy loop
+	subq $8,%rdi
+	movq (%rsi),%rax
+	movq %rax,(%rdi)
+	loop 15b
+16:	movq %rdi,%rsp			# final %rdi is top of merged stack
+	andb $KERNEL_CS_MASK,CS(%rsp)	# CS on stack might have changed
+	jmp  11b
+
+
+/* Nested event fixup look-up table */
+critical_fixup_table:
+	.byte 0x00,0x00,0x00			# XEN_TEST_PENDING(%rsi)
+	.byte 0x00,0x00,0x00,0x00,0x00,0x00	# jnz 14f
+	.byte 0x00,0x00,0x00,0x00		# mov (%rsp),%r15
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x8(%rsp),%r14
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x10(%rsp),%r13
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x18(%rsp),%r12
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x20(%rsp),%rbp
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x28(%rsp),%rbx
+	.byte 0x00,0x00,0x00,0x00		# add $0x30,%rsp
+	.byte 0x30,0x30,0x30,0x30		# mov (%rsp),%r11
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x8(%rsp),%r10
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x10(%rsp),%r9
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x18(%rsp),%r8
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x20(%rsp),%rax
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x28(%rsp),%rcx
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x30(%rsp),%rdx
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x38(%rsp),%rsi
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x40(%rsp),%rdi
+	.byte 0x30,0x30,0x30,0x30		# add $0x50,%rsp
+	.byte 0x80,0x80,0x80,0x80		# testl $NMI_MASK,2*8(%rsp)
+	.byte 0x80,0x80,0x80,0x80
+	.byte 0x80,0x80				# jnz 2f
+	.byte 0x80,0x80,0x80,0x80		# testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
+	.byte 0x80,0x80,0x80,0x80
+	.byte 0x80,0x80				# jnz 1f
+	.byte 0x80,0x80,0x80,0x80,0x80		# orb $3,1*8(%rsp)
+	.byte 0x80,0x80,0x80,0x80,0x80		# orb $3,4*8(%rsp)
+	.byte 0x80,0x80				# iretq
+	.byte 0x80,0x80,0x80,0x80		# andl $~NMI_MASK, 16(%rsp)
+	.byte 0x80,0x80,0x80,0x80
+	.byte 0x80,0x80				# pushq $\flag
+	.byte 0x78,0x78,0x78,0x78,0x78		# jmp hypercall_page + (__HYPERVISOR_iret * 32)
+	.byte 0x00,0x00,0x00,0x00		# XEN_LOCKED_BLOCK_EVENTS(%rsi)
+	.byte 0x00,0x00,0x00			# mov %rsp,%rdi
+	.byte 0x00,0x00,0x00,0x00,0x00		# jmp 11b
+
+
 ENTRY(failsafe_callback)
 	popq  %rcx
--
1.7.7.6
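To make the merge concrete, here is a standalone C model of the copy loop in
critical_region_fixup — a sketch only: frames are modeled as arrays of
8-byte slots, 'popped' stands for the fixup-table lookup result, and the
values in main() are made up.

#include <stdint.h>
#include <stdio.h>

/* Mirror of labels 15/16: copy the top 'popped' bytes of the nested
 * frame to just below the interrupted frame's saved %rsp, then use the
 * final destination pointer as the merged stack pointer. */
static uint64_t *merge_frames(uint64_t *new_rsp, uint64_t *old_rsp,
                              size_t popped)
{
    size_t n = popped / 8;           /* shr $3,%rcx                      */
    uint64_t *src = new_rsp + n;     /* %rsi: end of the source region   */
    uint64_t *dst = old_rsp;         /* %rdi: end of the dest region     */
    while (n--)                      /* 15: pre-decrementing copy loop   */
        *--dst = *--src;
    return dst;                      /* 16: final %rdi tops merged stack */
}

int main(void)
{
    uint64_t old_frame[16] = {0};          /* interrupted activation     */
    uint64_t new_frame[8]  = {1, 2, 3};    /* frame pushed by 2nd event  */
    /* pretend the fixup table said 0x18 bytes were already popped */
    uint64_t *rsp = merge_frames(new_frame, old_frame + 8, 0x18);
    printf("merged rsp holds %llu\n", (unsigned long long)rsp[0]);
    return 0;
}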
Samuel Thibault
2013-Mar-09 20:55 UTC
Re: [PATCH 2/6] mini-os/x86-64 entry: define macros for registers partial save and restore
Xu Zhang, on Fri 08 Mar 2013 15:30:15 -0600, wrote:
> +.macro SAVE_REST
> +.macro RESTORE_REST
> .macro RESTORE_ALL

I'd say rather put them after RESTORE_ALL, so that "REST" makes sense.

We could also go even further in this kind of cleanup, but we can do that
later.

Samuel
Samuel Thibault
2013-Mar-09 20:57 UTC
Re: [PATCH 1/6] mini-os/x86-64 entry: code clean-ups
Xu Zhang, on Fri 08 Mar 2013 15:30:14 -0600, wrote:
> Make use of tabs and spaces consistent in arch/x86/x86_64.S

Acked-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Samuel Thibault
2013-Mar-09 21:03 UTC
Re: [PATCH 3/6] mini-os/x86-64 entry: code refactoring; no functional changes
Xu Zhang, on Fri 08 Mar 2013 15:30:16 -0600, wrote:
> +#define XEN_GET_VCPU_INFO(reg)	movq HYPERVISOR_shared_info,reg
> +#define XEN_PUT_VCPU_INFO(reg)
> +#define XEN_PUT_VCPU_INFO_fixup
> +#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
> +#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
> +#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
> +
> +#define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
> +				XEN_LOCKED_BLOCK_EVENTS(reg)	; \
> +				XEN_PUT_VCPU_INFO(reg)
> +
> +#define XEN_UNBLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
> +				XEN_LOCKED_UNBLOCK_EVENTS(reg)	; \
> +				XEN_PUT_VCPU_INFO(reg)

I agree on moving those, as they don't really have a context.

> +/* Macros */
> +.macro zeroentry sym
> +	movq (%rsp),%rcx
> +	movq 8(%rsp),%r11
> +	addq $0x10,%rsp		/* skip rcx and r11 */
> +	pushq $0		/* push error code/oldrax */
> +	pushq %rax		/* push real oldrax to the rdi slot */
> +	leaq  \sym(%rip),%rax
> +	jmp error_entry
> +.endm
> +
> +.macro errorentry sym
> +	movq (%rsp),%rcx
> +	movq 8(%rsp),%r11
> +	addq $0x10,%rsp		/* rsp points to the error code */
> +	pushq %rax
> +	leaq  \sym(%rip),%rax
> +	jmp error_entry
> +.endm

I disagree on moving those, as they do have some context.

> 
> ENTRY(hypervisor_callback)
> 	zeroentry hypervisor_callback2
> 
> ENTRY(hypervisor_callback2)
> -        movq %rdi, %rsp
> +	movq %rdi, %rsp

Please avoid such kind of difference, or make it another patch.

> retint_kernel:
> retint_restore_args:

Please also drop these labels, they're not used.

I agree on the rest.

Samuel
Samuel Thibault
2013-Mar-09 21:07 UTC
Re: [PATCH 4/6] mini-os/x86-64 entry: remove unnecessary event blocking
Xu Zhang, on Fri 08 Mar 2013 15:30:17 -0600, wrote:
> We don't need to block events here because:
>  - if we came from "hypervisor_callback", events are disabled at this point,
>    no need to block again;
>  - if we came from "error_entry", we shouldn't touch event mask, for
>    exception hanlding are meant to be interrupted by Xen events (virtual
>    irq).
> 
> Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>

Acked-by: Samuel Thibault <samuel.thibault@ens-lyon.org>

> ---
>  extras/mini-os/arch/x86/x86_64.S |    1 -
>  1 files changed, 0 insertions(+), 1 deletions(-)
> 
> diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
> index 79e893f..5e0021b 100644
> --- a/extras/mini-os/arch/x86/x86_64.S
> +++ b/extras/mini-os/arch/x86/x86_64.S
> @@ -185,7 +185,6 @@ ENTRY(hypervisor_callback2)
>  
>  error_exit:
>  	RESTORE_REST
> -	XEN_BLOCK_EVENTS(%rsi)
>  
>  retint_kernel:
>  retint_restore_args:
> --
> 1.7.7.6

-- 
Samuel
> It [e2fsck] started fine, but handed control right back to me, saying
> "whoa, your thing is not a pretty sight, I'd rather you held the
> chainsaw yourself" (free translation)
NC, in Guide du linuxien pervers: "Configuring your chainsaw properly."
Samuel Thibault
2013-Mar-09 21:15 UTC
Re: [PATCH 5/6] mini-os/x86-64 entry: defer RESTORE_REST until return
Xu Zhang, on Fri 08 Mar 2013 15:30:18 -0600, wrote:
> No need to do a RESTORE_REST at this point because if we saw pending
> events after we enabled event delivery, we have to do a SAVE_REST again.
> Instead, we do a "lazy" RESTORE_REST, deferring it until immediately before
> the actual return.

So in the end RESTORE_REST always goes along with RESTORE_ALL? I guess
after this series of patches, they'll just get merged then?

One could say that it's better to avoid restoring many registers on the
critical path, but I tend to prefer something simple (to maintain), even
if long.

Acked-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Samuel Thibault
2013-Mar-09 21:19 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
Xu Zhang, on Fri 08 Mar 2013 15:30:19 -0600, wrote:
> In hypervisor_callback, check for event re-entry. If that's the case,
> try to fix up by looking up the number of bytes restored when the
> second event came and coalescing the two stack frames.
> The execution is resumed as if the second event never happened.

> -	movq %rdi, RDI(%rsp) 
> +	movq %rdi, RDI(%rsp)

Again, please avoid.

> +	andb $KERNEL_CS_MASK,CS(%rsp)	# CS on stack might have changed

I don't see why this.

Samuel
Jeremy Fitzhardinge
2013-Mar-09 22:44 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 03/08/2013 01:30 PM, Xu Zhang wrote:
> +# [How we do the fixup]. We want to merge the current stack frame with the
> +# just-interrupted frame. How we do this depends on where in the critical
> +# region the interrupted handler was executing, and so how many saved
> +# registers are in each frame. We do this quickly using the lookup table
> +# 'critical_fixup_table'. For each byte offset in the critical region, it
> +# provides the number of bytes which have already been popped from the
> +# interrupted stack frame. This is the number of bytes from the current stack
> +# that we need to copy at the end of the previous activation frame so that
> +# we can continue as if we've never even reached 11 running in the old
> +# activation frame.
> +critical_region_fixup:
> +	addq $critical_fixup_table - scrit, %rax
> +	movzbq (%rax),%rax		# %rax contains num bytes popped
> +	mov  %rsp,%rsi
> +	add  %rax,%rsi			# %esi points at end of src region
> +
> +	movq RSP(%rsp),%rdi		# acquire interrupted %rsp from current stack frame
> +					# %edi points at end of dst region
> +	mov  %rax,%rcx
> +	shr  $3,%rcx			# convert bytes into count of 64-bit entities
> +	je   16f			# skip loop if nothing to copy
> +15:	subq $8,%rsi			# pre-decrementing copy loop
> +	subq $8,%rdi
> +	movq (%rsi),%rax
> +	movq %rax,(%rdi)
> +	loop 15b
> +16:	movq %rdi,%rsp			# final %rdi is top of merged stack
> +	andb $KERNEL_CS_MASK,CS(%rsp)	# CS on stack might have changed
> +	jmp  11b
> +
> +
> +/* Nested event fixup look-up table */
> +critical_fixup_table:
> +	.byte 0x00,0x00,0x00			# XEN_TEST_PENDING(%rsi)
> +	.byte 0x00,0x00,0x00,0x00,0x00,0x00	# jnz 14f
> +	.byte 0x00,0x00,0x00,0x00		# mov (%rsp),%r15
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x8(%rsp),%r14
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x10(%rsp),%r13
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x18(%rsp),%r12
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x20(%rsp),%rbp
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x28(%rsp),%rbx
> +	.byte 0x00,0x00,0x00,0x00		# add $0x30,%rsp
> +	.byte 0x30,0x30,0x30,0x30		# mov (%rsp),%r11
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x8(%rsp),%r10
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x10(%rsp),%r9
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x18(%rsp),%r8
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x20(%rsp),%rax
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x28(%rsp),%rcx
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x30(%rsp),%rdx
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x38(%rsp),%rsi
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x40(%rsp),%rdi
> +	.byte 0x30,0x30,0x30,0x30		# add $0x50,%rsp
> +	.byte 0x80,0x80,0x80,0x80		# testl $NMI_MASK,2*8(%rsp)
> +	.byte 0x80,0x80,0x80,0x80
> +	.byte 0x80,0x80				# jnz 2f
> +	.byte 0x80,0x80,0x80,0x80		# testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
> +	.byte 0x80,0x80,0x80,0x80
> +	.byte 0x80,0x80				# jnz 1f
> +	.byte 0x80,0x80,0x80,0x80,0x80		# orb $3,1*8(%rsp)
> +	.byte 0x80,0x80,0x80,0x80,0x80		# orb $3,4*8(%rsp)
> +	.byte 0x80,0x80				# iretq
> +	.byte 0x80,0x80,0x80,0x80		# andl $~NMI_MASK, 16(%rsp)
> +	.byte 0x80,0x80,0x80,0x80
> +	.byte 0x80,0x80				# pushq $\flag
> +	.byte 0x78,0x78,0x78,0x78,0x78		# jmp hypercall_page + (__HYPERVISOR_iret * 32)
> +	.byte 0x00,0x00,0x00,0x00		# XEN_LOCKED_BLOCK_EVENTS(%rsi)
> +	.byte 0x00,0x00,0x00			# mov %rsp,%rdi
> +	.byte 0x00,0x00,0x00,0x00,0x00		# jmp 11b

This looks super-fragile. The original Xen-linux kernel code had a
similar kind of fixup table, but I went to some lengths to make it as
simple and robust as possible in the pvops kernels. See the comment in
xen-asm_32.S:

 * Because the nested interrupt handler needs to deal with the current
 * stack state in whatever form its in, we keep things simple by only
 * using a single register which is pushed/popped on the stack.

64-bit pvops Linux always uses the iret hypercall, so the issue is moot
there. (In principle a nested kernel interrupt could avoid the iret,
but it wasn't obvious that the extra complexity was worth it.)

    J
Xu Zhang
2013-Mar-13 02:42 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 03/09/2013 03:19 PM, Samuel Thibault wrote:
>> +	andb $KERNEL_CS_MASK,CS(%rsp)	# CS on stack might have changed
> I don't see why this.

Thank you for all your comments and acks. :-)

    ...
    /* Direct iret to kernel space. Correct CS and SS. */
    orb   $3,1*8(%rsp)
    orb   $3,4*8(%rsp)
1:  iretq
    ...

HYPERVISOR_IRET is part of the critical section. It could be the case that a
nested event comes immediately after we set the lowest byte of the on-stack
CS from 0 to 3 (orb $3,1*8(%rsp)). After invoking the fixup routine, we want
the stack frame to be exactly the same as the original one.

One scenario where this could be useful is if mini-os had a userspace. If the
on-stack CS had changed before the nested event came and we didn't reset it,
we could end up doing the return with a hypercall iret instead of a direct
iret — assuming that on x86-64 we return to kernel with a direct iret if the
on-stack CS is 0 (yet have to change it to 3 before doing so, because the
guest kernel runs in ring 3) and return to userspace with a hypercall iret if
it is 3. The scenario looks something like this:

 - enable events; in critical section: returning to kernel via direct iret
   (on-stack CS is 0);
   ...
 - orb $3,1*8(%rsp): set on-stack CS to 3;
 - nested event comes: fix up the stack frame;
 - go back to handle events; perform iret again; if we didn't reset the
   on-stack CS to 0 in the fixup routine, we would return with a hypercall
   iret instead of a direct iret (assuming that if the on-stack CS is 3, we
   return to userspace with a hypercall iret).

In x86-64 mini-os, the critical section only gets executed iff events need to
be enabled and we are returning to kernel. So does the fixup routine. Thus it
is safe to reset the lowest byte of CS to 0 in the fixup routine. It doesn't
hurt not to do this, but I think conceptually the stack frame should be the
same after fixing up. ;-)

Thanks,
-- xu
:q!
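As a small standalone sketch of the bit manipulation being discussed (the CS
value below is a made-up placeholder, not Xen's actual selector encoding):

#include <stdio.h>

int main(void)
{
    unsigned char cs_low = 0xe0;   /* hypothetical low byte of a ring-0 CS */

    cs_low |= 3;                   /* orb $3,1*8(%rsp): prepared for the   */
                                   /* direct iret                          */
    /* ...nested event arrives here; the frames get coalesced...           */
    cs_low &= 0xfc;                /* andb $KERNEL_CS_MASK,CS(%rsp): clear */
                                   /* the RPL bits so the merged frame     */
                                   /* matches the original one             */
    printf("%#x\n", cs_low);       /* back to 0xe0 */
    return 0;
}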
Xu Zhang
2013-Mar-13 02:42 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 03/09/2013 04:44 PM, Jeremy Fitzhardinge wrote:
> On 03/08/2013 01:30 PM, Xu Zhang wrote:
>> +# [How we do the fixup]. We want to merge the current stack frame with the
>> +# just-interrupted frame. How we do this depends on where in the critical
>> +# region the interrupted handler was executing, and so how many saved
>> +# registers are in each frame. We do this quickly using the lookup table
>> +# 'critical_fixup_table'. For each byte offset in the critical region, it
>> +# provides the number of bytes which have already been popped from the
>> +# interrupted stack frame. This is the number of bytes from the current stack
>> +# that we need to copy at the end of the previous activation frame so that
>> +# we can continue as if we've never even reached 11 running in the old
>> +# activation frame.
>> +critical_region_fixup:
>> [...]
>> +/* Nested event fixup look-up table */
>> +critical_fixup_table:
>> +	.byte 0x00,0x00,0x00			# XEN_TEST_PENDING(%rsi)
>> [...]
>> +	.byte 0x00,0x00,0x00,0x00,0x00		# jmp 11b
> This looks super-fragile. The original Xen-linux kernel code had a
> similar kind of fixup table, but I went to some lengths to make it as
> simple and robust as possible in the pvops kernels. See the comment in
> xen-asm_32.S:
>
>  * Because the nested interrupt handler needs to deal with the current
>  * stack state in whatever form its in, we keep things simple by only
>  * using a single register which is pushed/popped on the stack.
>
> 64-bit pvops Linux always uses the iret hypercall, so the issue is moot
> there. (In principle a nested kernel interrupt could avoid the iret,
> but it wasn't obvious that the extra complexity was worth it.)
>
> J

Thanks very much for your feedback.

I guess the concern is that using a chunky look-up table like this is
somewhat complicated and would be less easy to maintain? Samuel made some
suggestions, making it less "fragile" to some extent:

    Also, it would be good to check against critical section size change, in
    case somebody e.g. changes a value, or a macro like XEN_PUT_VCPU_INFO.
    For instance, stuff right after the table:

    .if (ecrit-scrit) != (critical_fixup_table_end - critical_fixup_table)
    .error "The critical section has changed, the fixup table needs updating"
    .endif

I read through the Xen-Linux 32-bit implementation for this issue. If I
understand it right, the use of a look-up table is avoided by leaving
RESTORE_ALL/REST out of the critical section. So when a nested event comes,
you are always certain about the stack frame layout and how many bytes to
copy over when fixing up (in Xen-Linux 32-bit, PT_EIP/4 - 1). And the
register that would be clobbered in order to read and write the event mask
(%eax in the Xen-Linux 32-bit case) is saved on the stack. This way we only
need to worry about this one register instead of many when fixing up the
stack frame.

Basically both approaches are correct to me and do the same job. Xen-Linux
32-bit is definitely a neater solution. However, one thing to notice is that
mini-os x86 32-bit also uses a fixup table. I guess we could apply these
patches as a temporary solution to make sure everything is correct and
consistent, and have following patches do the refinement.

Thanks,
-- xu
:q!
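For comparison, a tiny C sketch of the two ways of sizing the copy — purely
illustrative: the table values are toys, and PT_EIP here is just a
hypothetical pt_regs offset, not a verified constant.

#include <stdio.h>
#include <stdint.h>

/* Table-driven (mini-os, this series): the bytes already popped depend
 * on where in [scrit, ecrit) the nested event landed, hence a
 * per-byte-offset table. */
static const uint8_t fixup_table[] = { 0x00, 0x00, 0x30, 0x80 };  /* toy */

/* Fixed-size (the Xen-Linux 32-bit idea): nothing inside the critical
 * section pops frame registers, so the count is a compile-time constant
 * -- Xu's "PT_EIP/4 - 1". */
enum { PT_EIP = 48 };                      /* hypothetical offset */

int main(void)
{
    unsigned offset = 2;                   /* derived from faulting EIP */
    printf("table-driven: %u bytes popped\n", fixup_table[offset]);
    printf("fixed-size:   %d words to copy\n", PT_EIP / 4 - 1);
    return 0;
}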
Xu Zhang
2013-Mar-13 05:53 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 3/12/2013 6:42 PM, Xu Zhang wrote:
> Basically both approaches are correct to me and do the same job. Xen-Linux
> 32-bit is definitely a neater solution. However, one thing to notice is
> that mini-os x86 32-bit also uses a fixup table. I guess we could apply
> these patches as a temporary solution to make sure everything is correct
> and consistent, and have following patches do the refinement.

Ah, somewhat "consistent". :-)

And by refinement I mean refinement of both the 32-bit and 64-bit mini-os x86
kernel entries: basically replacing the use of a look-up table with Xen-Linux
x86-32-bit's approach.

Thanks,
- Xu
Jeremy Fitzhardinge
2013-Mar-14 01:09 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 03/12/2013 07:42 PM, Xu Zhang wrote:
> On 03/09/2013 04:44 PM, Jeremy Fitzhardinge wrote:
>> On 03/08/2013 01:30 PM, Xu Zhang wrote:
>>> +# [How we do the fixup]. We want to merge the current stack frame with
>>> +# the just-interrupted frame. How we do this depends on where in the
>>> +# critical region the interrupted handler was executing, and so how many
>>> +# saved registers are in each frame. We do this quickly using the lookup
>>> +# table 'critical_fixup_table'. For each byte offset in the critical
>>> +# region, it provides the number of bytes which have already been popped
>>> +# from the interrupted stack frame. This is the number of bytes from the
>>> +# current stack that we need to copy at the end of the previous
>>> +# activation frame so that we can continue as if we've never even
>>> +# reached 11 running in the old activation frame.
>>> +critical_region_fixup:
>>> [...]
>>> +/* Nested event fixup look-up table */
>>> +critical_fixup_table:
>>> +	.byte 0x00,0x00,0x00			# XEN_TEST_PENDING(%rsi)
>>> [...]
>>> +	.byte 0x00,0x00,0x00,0x00,0x00		# jmp 11b
>> This looks super-fragile. The original Xen-linux kernel code had a
>> similar kind of fixup table, but I went to some lengths to make it as
>> simple and robust as possible in the pvops kernels. See the comment in
>> xen-asm_32.S:
>>
>>  * Because the nested interrupt handler needs to deal with the current
>>  * stack state in whatever form its in, we keep things simple by only
>>  * using a single register which is pushed/popped on the stack.
>>
>> 64-bit pvops Linux always uses the iret hypercall, so the issue is moot
>> there. (In principle a nested kernel interrupt could avoid the iret,
>> but it wasn't obvious that the extra complexity was worth it.)
>>
>> J
>
> Thanks very much for your feedback.
>
> I guess the concern is that using a chunky look-up table like this is
> somewhat complicated and would be less easy to maintain? Samuel made some
> suggestions, making it less "fragile" to some extent:
>
>     Also, it would be good to check against critical section size change,
>     in case somebody e.g. changes a value, or a macro like
>     XEN_PUT_VCPU_INFO. For instance, stuff right after the table:
>
>     .if (ecrit-scrit) != (critical_fixup_table_end - critical_fixup_table)
>     .error "The critical section has changed, the fixup table needs updating"
>     .endif

That would catch some problems, but not all. I worry that the whole
construction is too error-prone, and it's too easy for some later
unsuspecting developer to come in and introduce a subtle bug that only
occurs under very particular timing conditions.

> I read through the Xen-Linux 32-bit implementation for this issue. If I
> understand it right, the use of a look-up table is avoided by leaving
> RESTORE_ALL/REST out of the critical section. So when a nested event comes,
> you are always certain about the stack frame layout and how many bytes to
> copy over when fixing up (in Xen-Linux 32-bit, PT_EIP/4 - 1). And the
> register that would be clobbered in order to read and write the event mask
> (%eax in the Xen-Linux 32-bit case) is saved on the stack. This way we only
> need to worry about this one register instead of many when fixing up the
> stack frame.

Right.

> Basically both approaches are correct to me and do the same job. Xen-Linux
> 32-bit is definitely a neater solution. However, one thing to notice is
> that mini-os x86 32-bit also uses a fixup table.

I don't have mini-os in front of me at the moment, but I wouldn't be
surprised if the original Xen-Linux and mini-os share heritage in this area.

> I guess we could apply these patches as a temporary solution to make sure
> everything is correct and consistent, and have following patches do the
> refinement.

That makes sense to me if the refinement is a large amount of new work on
top of what you've already done, but I don't think that's obviously true.
Given that you're familiar with the code and the problem domain, I expect
you could knock out a clean solution pretty quickly. I guess you're
interested in this path for 64-bit as well, because a same-ring interrupt
return is much more likely in minios.

    J
Konrad Rzeszutek Wilk
2013-Mar-15 20:16 UTC
Re: [PATCH 4/6] mini-os/x86-64 entry: remove unnecessary event blocking
On Fri, Mar 08, 2013 at 03:30:17PM -0600, Xu Zhang wrote:
> We don't need to block events here because:
> - if we came from "hypervisor_callback", events are disabled at this point,
>   no need to block again;

Hm, I think I am not following it. It does a bit of %gs manipulation which
looks to point to the cpu0_pda.irqcount.

?

> - if we came from "error_entry", we shouldn't touch event mask, for
>   exception hanlding are meant to be interrupted by Xen events (virtual

handling

> irq).
> 
> Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
> ---
>  extras/mini-os/arch/x86/x86_64.S |    1 -
>  1 files changed, 0 insertions(+), 1 deletions(-)
> 
> diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
> index 79e893f..5e0021b 100644
> --- a/extras/mini-os/arch/x86/x86_64.S
> +++ b/extras/mini-os/arch/x86/x86_64.S
> @@ -185,7 +185,6 @@ ENTRY(hypervisor_callback2)
>  
>  error_exit:
>  	RESTORE_REST
> -	XEN_BLOCK_EVENTS(%rsi)
>  
>  retint_kernel:
>  retint_restore_args:
> --
> 1.7.7.6
Xu Zhang
2013-Apr-11 04:40 UTC
Re: [PATCH 4/6] mini-os/x86-64 entry: remove unnecessary event blocking
Sorry about my delayed response. I just got some time to take a look at and
revise these patches.

It seems to me the bit of %gs manipulation is used for executing
"do_hypervisor_callback" on a different stack. After it's done, it switches
back to the original stack and resumes from there. The event mask is not
touched during this process.

I put mini-os under gdb just to verify, and for the runs I observed, the
event mask remained untouched. Is there anything I am missing?

On 03/15/2013 03:16 PM, Konrad Rzeszutek Wilk wrote:
> On Fri, Mar 08, 2013 at 03:30:17PM -0600, Xu Zhang wrote:
>> We don't need to block events here because:
>> - if we came from "hypervisor_callback", events are disabled at this point,
>>   no need to block again;
> Hm, I think I am not following it. It does a bit of %gs manipulation which
> looks to point to the cpu0_pda.irqcount.
>
> ?
>> - if we came from "error_entry", we shouldn't touch event mask, for
>>   exception hanlding are meant to be interrupted by Xen events (virtual
> handling
>> irq).
>>
>> Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
>> [...]
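As a reference for this exchange, a C model of the %gs sequence in
hypervisor_callback2 — a sketch under the assumption (per the thread's
cpu0_pda reference) that %gs:0 is an irqcount initialized to -1 and %gs:8
holds the top of a dedicated IRQ stack; the names are stand-ins:

#include <stddef.h>

static long irqcount = -1;     /* %gs:0 -- -1 while no callback is running */
static char irqstack[8192];
static char *irqstacktop = irqstack + sizeof(irqstack);     /* %gs:8 */

static void do_hypervisor_callback(void *regs) { (void)regs; }

static void hypervisor_callback2(void *regs)
{
    char *sp = (char *)regs;      /* movq %rdi,%rsp                     */
    if (++irqcount == 0)          /* incl %gs:0; cmovzq %rax,%rsp: only */
        sp = irqstacktop;         /* the outermost entry switches to    */
    (void)sp;                     /* the dedicated IRQ stack            */
    do_hypervisor_callback(regs); /* pushq %rdi; call ...; popq %rsp    */
                                  /* then restores the original stack;  */
                                  /* the event mask is never written    */
    --irqcount;                   /* decl %gs:0                         */
}

int main(void) { hypervisor_callback2(NULL); return 0; }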
Xu Zhang
2013-Apr-11 04:40 UTC
Re: [PATCH 3/6] mini-os/x86-64 entry: code refactoring; no functional changes
On 03/09/2013 03:03 PM, Samuel Thibault wrote:
>> +/* Macros */
>> +.macro zeroentry sym
>> +	movq (%rsp),%rcx
>> +	movq 8(%rsp),%r11
>> +	addq $0x10,%rsp		/* skip rcx and r11 */
>> +	pushq $0		/* push error code/oldrax */
>> +	pushq %rax		/* push real oldrax to the rdi slot */
>> +	leaq  \sym(%rip),%rax
>> +	jmp error_entry
>> +.endm
>> +
>> +.macro errorentry sym
>> +	movq (%rsp),%rcx
>> +	movq 8(%rsp),%r11
>> +	addq $0x10,%rsp		/* rsp points to the error code */
>> +	pushq %rax
>> +	leaq  \sym(%rip),%rax
>> +	jmp error_entry
>> +.endm
> I disagree on moving those, as they do have some context.

Well, if they have some context, they probably end up better placed just
before they first get called, say before hypervisor_callback. Note they are
also used in the exception handlers. Besides, it's kind of annoying to have
these macros sit in between assembly routines. Probably it's just me. If you
prefer to leave them untouched, I am perfectly fine with that.

>> retint_kernel:
>> retint_restore_args:
> Please also drop these labels, they're not used.

I would argue it's going to be a different story if mini-os gets a user
space: "retint_kernel" returns to kernel, and presumably "retint_user" would
do scheduling, lazy x87 switching, handle softirqs, etc. But since it does
not, I guess they can be dropped.