Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 0/6] mini-os: check and fix up against nested events in x86-64 kernel entry
mini-os almost always uses a direct iret to return from interrupts. But this
operation is not atomic, because Xen uses an event mask to enable/disable
event delivery. So there is a window in which nested events can happen, after
event delivery has been re-enabled and before the direct iret. The issues
arising from this non-atomicity have been discussed in:

http://lists.xen.org/archives/html/xen-devel/2007-06/msg00142.html

And also on xen-devel:

http://markmail.org/message/jkzhzy6fyes6igcf

This series checks for and fixes up nested events in a fashion similar to
32-bit mini-os and to Fitzhardinge's pvops code (who is also CCed). It checks
for re-entry of the critical section in the event-handling callback, and
tries to fix up by looking up the number of bytes already restored when the
second event arrived and coalescing the two stack frames into one, resuming
execution as if the second event had never happened. It also refactors
mini-os's x86-64 kernel entry assembly code.

Xu Zhang (6):
  mini-os/x86-64 entry: code clean-ups
  mini-os/x86-64 entry: define macros for registers partial save and restore
  mini-os/x86-64 entry: code refactoring; no functional changes
  mini-os/x86-64 entry: remove unnecessary event blocking
  mini-os/x86-64 entry: defer RESTORE_REST until return
  mini-os/x86-64 entry: check against nested events and try to fix up

 extras/mini-os/arch/x86/x86_64.S |  261 +++++++++++++++++++++++++-------------
 1 files changed, 175 insertions(+), 86 deletions(-)

--
1.7.7.6
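To make the window described in the cover letter concrete, here is a minimal
C model of the race — a sketch only, not mini-os code: upcall_mask and
upcall_pending stand in for evtchn_upcall_mask/evtchn_upcall_pending in
shared_info_t, and the functions are stand-ins for the assembly paths.

#include <stdio.h>

static volatile unsigned char upcall_mask = 1;   /* 1 = delivery blocked */
static volatile unsigned char upcall_pending;    /* set by Xen           */

static void handle_events(void)     { puts("handling events"); }
static void restore_registers(void) { puts("popping saved regs"); }
static void direct_iret(void)       { puts("iretq"); }

static void return_from_event(void)
{
    upcall_mask = 0;         /* XEN_UNBLOCK_EVENTS: delivery is live now,  */
                             /* but the return still takes several steps   */
    if (upcall_pending)      /* XEN_TEST_PENDING / jnz 14f                 */
        handle_events();
    restore_registers();     /* RESTORE_ALL: an event arriving here nests  */
                             /* on a partially restored frame              */
    direct_iret();           /* HYPERVISOR_IRET 0: nothing since unmasking */
                             /* made these steps atomic                    */
}

int main(void) { return_from_event(); return 0; }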
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 1/6] mini-os/x86-64 entry: code clean-ups
Make use of tabs and spaces consistent in arch/x86/x86_64.S

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |   54 +++++++++++++++++++-------------------
 1 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index a65e5d5..6139c2d 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -111,9 +111,9 @@ error_call_handler:
 	jmp error_exit
 
 .macro zeroentry sym
-    movq (%rsp),%rcx
-    movq 8(%rsp),%r11
-    addq $0x10,%rsp /* skip rcx and r11 */
+	movq (%rsp),%rcx
+	movq 8(%rsp),%r11
+	addq $0x10,%rsp		/* skip rcx and r11 */
 	pushq $0		/* push error code/oldrax */
 	pushq %rax		/* push real oldrax to the rdi slot */
 	leaq  \sym(%rip),%rax
@@ -121,9 +121,9 @@ error_call_handler:
 .endm
 
 .macro errorentry sym
-    movq (%rsp),%rcx
-    movq 8(%rsp),%r11
-    addq $0x10,%rsp /* rsp points to the error code */
+	movq (%rsp),%rcx
+	movq 8(%rsp),%r11
+	addq $0x10,%rsp		/* rsp points to the error code */
 	pushq %rax
 	leaq  \sym(%rip),%rax
 	jmp error_entry
@@ -147,18 +147,18 @@ error_call_handler:
 
 ENTRY(hypervisor_callback)
-    zeroentry hypervisor_callback2
+	zeroentry hypervisor_callback2
 
 ENTRY(hypervisor_callback2)
-    movq %rdi, %rsp
-11: movq %gs:8,%rax
-    incl %gs:0
-    cmovzq %rax,%rsp
-    pushq %rdi
-    call do_hypervisor_callback
-    popq %rsp
-    decl %gs:0
-    jmp error_exit
+	movq %rdi, %rsp
+11:	movq %gs:8,%rax
+	incl %gs:0
+	cmovzq %rax,%rsp
+	pushq %rdi
+	call do_hypervisor_callback
+	popq %rsp
+	decl %gs:0
+	jmp error_exit
 
 restore_all_enable_events:
 	XEN_UNBLOCK_EVENTS(%rsi)	# %rsi is already set up...
@@ -167,19 +167,19 @@ scrit:	/**** START OF CRITICAL REGION ****/
 	XEN_TEST_PENDING(%rsi)
 	jnz  14f			# process more events if necessary...
 	XEN_PUT_VCPU_INFO(%rsi)
-    RESTORE_ALL
-    HYPERVISOR_IRET 0
-    
+	RESTORE_ALL
+	HYPERVISOR_IRET 0
+
 14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
 	XEN_PUT_VCPU_INFO(%rsi)
 	subq $6*8,%rsp
-    movq %rbx,5*8(%rsp)
-    movq %rbp,4*8(%rsp)
-    movq %r12,3*8(%rsp)
-    movq %r13,2*8(%rsp)
-    movq %r14,1*8(%rsp)
-    movq %r15,(%rsp)
-    movq %rsp,%rdi # set the argument again
+	movq %rbx,5*8(%rsp)
+	movq %rbp,4*8(%rsp)
+	movq %r12,3*8(%rsp)
+	movq %r13,2*8(%rsp)
+	movq %r14,1*8(%rsp)
+	movq %r15,(%rsp)
+	movq %rsp,%rdi			# set the argument again
 	jmp  11b
 ecrit:	/**** END OF CRITICAL REGION ****/
@@ -193,7 +193,7 @@ retint_restore_args:
 	andb $1,%al			# EAX[0] == IRET_EFLAGS.IF & event_mask
 	jnz restore_all_enable_events	#        != 0 => enable event delivery
 	XEN_PUT_VCPU_INFO(%rsi)
-    
+
 	RESTORE_ALL
 	HYPERVISOR_IRET 0
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 2/6] mini-os/x86-64 entry: define macros for registers partial save and restore
For saving and restoring the registers rbx, rbp, and r12-r15, define and use
the macros SAVE_REST and RESTORE_REST, respectively.

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |   38 +++++++++++++++++++++++---------------
 1 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index 6139c2d..addb7b1 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -46,6 +46,26 @@ NMI_MASK = 0x80000000
 #define ORIG_RAX 120       /* + error_code */
 #define EFLAGS 144
 
+.macro SAVE_REST
+	subq $6*8,%rsp
+	movq %rbx,5*8(%rsp)
+	movq %rbp,4*8(%rsp)
+	movq %r12,3*8(%rsp)
+	movq %r13,2*8(%rsp)
+	movq %r14,1*8(%rsp)
+	movq %r15,(%rsp)
+.endm
+
+.macro RESTORE_REST
+	movq (%rsp),%r15
+	movq 1*8(%rsp),%r14
+	movq 2*8(%rsp),%r13
+	movq 3*8(%rsp),%r12
+	movq 4*8(%rsp),%rbp
+	movq 5*8(%rsp),%rbx
+	addq $6*8,%rsp
+.endm
+
 .macro RESTORE_ALL
 	movq (%rsp),%r11
 	movq 1*8(%rsp),%r10
@@ -172,13 +192,7 @@ scrit:	/**** START OF CRITICAL REGION ****/
 
 14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
 	XEN_PUT_VCPU_INFO(%rsi)
-	subq $6*8,%rsp
-	movq %rbx,5*8(%rsp)
-	movq %rbp,4*8(%rsp)
-	movq %r12,3*8(%rsp)
-	movq %r13,2*8(%rsp)
-	movq %r14,1*8(%rsp)
-	movq %r15,(%rsp)
+	SAVE_REST
 	movq %rsp,%rdi			# set the argument again
 	jmp  11b
 ecrit:	/**** END OF CRITICAL REGION ****/
@@ -198,14 +212,8 @@ retint_restore_args:
 	HYPERVISOR_IRET 0
 
 
-error_exit: 
-	movq (%rsp),%r15
-	movq 1*8(%rsp),%r14
-	movq 2*8(%rsp),%r13
-	movq 3*8(%rsp),%r12
-	movq 4*8(%rsp),%rbp
-	movq 5*8(%rsp),%rbx
-	addq $6*8,%rsp
+error_exit:
+	RESTORE_REST
 	XEN_BLOCK_EVENTS(%rsi)
 	jmp retint_kernel
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 3/6] mini-os/x86-64 entry: code refactoring; no functional changes
Re-arrange assembly code blocks so that they are in called order instead of
jumping around, enhancing readability. Macros are grouped together as well.

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |  118 +++++++++++++++++++-------------------
 1 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index addb7b1..79e893f 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -36,6 +36,22 @@ hypercall_page:
 .org 0x3000
 
+#define XEN_GET_VCPU_INFO(reg)	movq HYPERVISOR_shared_info,reg
+#define XEN_PUT_VCPU_INFO(reg)
+#define XEN_PUT_VCPU_INFO_fixup
+#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
+#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
+#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
+
+#define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
+				XEN_LOCKED_BLOCK_EVENTS(reg)	; \
+				XEN_PUT_VCPU_INFO(reg)
+
+#define XEN_UNBLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
+				XEN_LOCKED_UNBLOCK_EVENTS(reg)	; \
+				XEN_PUT_VCPU_INFO(reg)
+
+
 /* Offsets into shared_info_t. */
 #define evtchn_upcall_pending	/* 0 */
 #define evtchn_upcall_mask	1
@@ -46,6 +62,27 @@ NMI_MASK = 0x80000000
 #define ORIG_RAX 120       /* + error_code */
 #define EFLAGS 144
 
+
+/* Macros */
+.macro zeroentry sym
+	movq (%rsp),%rcx
+	movq 8(%rsp),%r11
+	addq $0x10,%rsp		/* skip rcx and r11 */
+	pushq $0		/* push error code/oldrax */
+	pushq %rax		/* push real oldrax to the rdi slot */
+	leaq  \sym(%rip),%rax
+	jmp error_entry
+.endm
+
+.macro errorentry sym
+	movq (%rsp),%rcx
+	movq 8(%rsp),%r11
+	addq $0x10,%rsp		/* rsp points to the error code */
+	pushq %rax
+	leaq  \sym(%rip),%rax
+	jmp error_entry
+.endm
+
 .macro SAVE_REST
 	subq $6*8,%rsp
 	movq %rbx,5*8(%rsp)
@@ -79,7 +116,6 @@ NMI_MASK = 0x80000000
 	addq $9*8+8,%rsp
 .endm
 
-
 .macro HYPERVISOR_IRET flag
 	testl $NMI_MASK,2*8(%rsp)
 	jnz   2f
@@ -98,6 +134,8 @@ NMI_MASK = 0x80000000
 	jmp hypercall_page + (__HYPERVISOR_iret * 32)
 .endm
 
+
+
 /*
  * Exception entry point. This expects an error code/orig_rax on the stack
  * and the exception handler in %rax.
@@ -130,73 +168,24 @@ error_call_handler:
 	call *%rax
 	jmp error_exit
 
-.macro zeroentry sym
-	movq (%rsp),%rcx
-	movq 8(%rsp),%r11
-	addq $0x10,%rsp		/* skip rcx and r11 */
-	pushq $0		/* push error code/oldrax */
-	pushq %rax		/* push real oldrax to the rdi slot */
-	leaq  \sym(%rip),%rax
-	jmp error_entry
-.endm
-
-.macro errorentry sym
-	movq (%rsp),%rcx
-	movq 8(%rsp),%r11
-	addq $0x10,%rsp		/* rsp points to the error code */
-	pushq %rax
-	leaq  \sym(%rip),%rax
-	jmp error_entry
-.endm
-
-#define XEN_GET_VCPU_INFO(reg)	movq HYPERVISOR_shared_info,reg
-#define XEN_PUT_VCPU_INFO(reg)
-#define XEN_PUT_VCPU_INFO_fixup
-#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
-#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
-#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
-
-#define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
-				XEN_LOCKED_BLOCK_EVENTS(reg)	; \
-				XEN_PUT_VCPU_INFO(reg)
-
-#define XEN_UNBLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
-				XEN_LOCKED_UNBLOCK_EVENTS(reg)	; \
-				XEN_PUT_VCPU_INFO(reg)
-
 ENTRY(hypervisor_callback)
 	zeroentry hypervisor_callback2
 
 ENTRY(hypervisor_callback2)
-        movq %rdi, %rsp
+	movq %rdi, %rsp
 11:	movq %gs:8,%rax
 	incl %gs:0
 	cmovzq %rax,%rsp
 	pushq %rdi
-        call do_hypervisor_callback
+	call do_hypervisor_callback
 	popq %rsp
 	decl %gs:0
-	jmp error_exit
-
-restore_all_enable_events:
-	XEN_UNBLOCK_EVENTS(%rsi)	# %rsi is already set up...
-
-scrit:	/**** START OF CRITICAL REGION ****/
-	XEN_TEST_PENDING(%rsi)
-	jnz  14f			# process more events if necessary...
-	XEN_PUT_VCPU_INFO(%rsi)
-	RESTORE_ALL
-	HYPERVISOR_IRET 0
-
-14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
-	XEN_PUT_VCPU_INFO(%rsi)
-	SAVE_REST
-	movq %rsp,%rdi			# set the argument again
-	jmp  11b
-ecrit:	/**** END OF CRITICAL REGION ****/
 
+error_exit:
+	RESTORE_REST
+	XEN_BLOCK_EVENTS(%rsi)
 
 retint_kernel:
 retint_restore_args:
@@ -211,11 +200,22 @@ retint_restore_args:
 	RESTORE_ALL
 	HYPERVISOR_IRET 0
 
+restore_all_enable_events:
+	XEN_UNBLOCK_EVENTS(%rsi)	# %rsi is already set up...
 
-error_exit:
-	RESTORE_REST
-	XEN_BLOCK_EVENTS(%rsi)
-	jmp retint_kernel
+scrit:	/**** START OF CRITICAL REGION ****/
+	XEN_TEST_PENDING(%rsi)
+	jnz  14f			# process more events if necessary...
+	XEN_PUT_VCPU_INFO(%rsi)
+	RESTORE_ALL
+	HYPERVISOR_IRET 0
+
+14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
+	XEN_PUT_VCPU_INFO(%rsi)
+	SAVE_REST
+	movq %rsp,%rdi			# set the argument again
+	jmp  11b
+ecrit:	/**** END OF CRITICAL REGION ****/
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 4/6] mini-os/x86-64 entry: remove unnecessary event blocking
We don't need to block events here because:
 - if we came from "hypervisor_callback", events are disabled at this point,
   no need to block again;
 - if we came from "error_entry", we shouldn't touch event mask, for
   exception hanlding are meant to be interrupted by Xen events (virtual
   irq).

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index 79e893f..5e0021b 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -185,7 +185,6 @@ ENTRY(hypervisor_callback2)
 
 error_exit:
 	RESTORE_REST
-	XEN_BLOCK_EVENTS(%rsi)
 
 retint_kernel:
 retint_restore_args:
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 5/6] mini-os/x86-64 entry: defer RESTORE_REST until return
No need to do a RESTORE_REST at this point because if we saw pending
events after we enabled event delivery, we have to do a SAVE_REST again.
Instead, we do a "lazy" RESTORE_REST, deferring it until immediately before
the actual return.

The macro naming the offset of the saved-on-stack rflags register is
changed as well (EFLAGS -> RFLAGS).

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |   16 ++++++++--------
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index 5e0021b..25add86 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -60,7 +60,7 @@ NMI_MASK = 0x80000000
 
 #define RDI 112
 #define ORIG_RAX 120       /* + error_code */
-#define EFLAGS 144
+#define RFLAGS 144
 
 
 /* Macros */
@@ -184,18 +184,17 @@ ENTRY(hypervisor_callback2)
 	decl %gs:0
 
 error_exit:
-	RESTORE_REST
-
 retint_kernel:
-retint_restore_args:
-	movl EFLAGS-6*8(%rsp), %eax
-	shr $9, %eax			# EAX[0] == IRET_EFLAGS.IF
+	movl RFLAGS(%rsp), %eax
+	shr $9, %eax			# EAX[0] == IRET_RFLAGS.IF
 	XEN_GET_VCPU_INFO(%rsi)
 	andb evtchn_upcall_mask(%rsi),%al
-	andb $1,%al			# EAX[0] == IRET_EFLAGS.IF & event_mask
+	andb $1,%al			# EAX[0] == IRET_RFLAGS.IF & event_mask
 	jnz restore_all_enable_events	#        != 0 => enable event delivery
 	XEN_PUT_VCPU_INFO(%rsi)
 
+retint_restore_args:
+	RESTORE_REST
 	RESTORE_ALL
 	HYPERVISOR_IRET 0
@@ -206,12 +205,13 @@ scrit:	/**** START OF CRITICAL REGION ****/
 	XEN_TEST_PENDING(%rsi)
 	jnz  14f			# process more events if necessary...
 	XEN_PUT_VCPU_INFO(%rsi)
+
+	RESTORE_REST
 	RESTORE_ALL
 	HYPERVISOR_IRET 0
 
 14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
 	XEN_PUT_VCPU_INFO(%rsi)
-	SAVE_REST
 	movq %rsp,%rdi			# set the argument again
 	jmp  11b
 ecrit:	/**** END OF CRITICAL REGION ****/
--
1.7.7.6
Xu Zhang
2013-Mar-08 21:30 UTC
[PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
In hypervisor_callback, check for event re-entry. If that's the case,
try to fix up by looking up the number of bytes restored when the
second event came and coalescing the two stack frames.
The execution is resumed as if the second event never happened.

Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
---
 extras/mini-os/arch/x86/x86_64.S |   90 ++++++++++++++++++++++++++++++++++++--
 1 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
index 25add86..0a13d3e 100644
--- a/extras/mini-os/arch/x86/x86_64.S
+++ b/extras/mini-os/arch/x86/x86_64.S
@@ -57,10 +57,14 @@ hypercall_page:
 #define evtchn_upcall_mask	1
 
 NMI_MASK = 0x80000000
+KERNEL_CS_MASK = 0xfc
 
-#define RDI 112
-#define ORIG_RAX 120       /* + error_code */
-#define RFLAGS 144
+#define RDI 112
+#define ORIG_RAX 120       /* + error_code */
+#define RIP 128
+#define CS 136
+#define RFLAGS 144
+#define RSP 152
 
 
 /* Macros */
@@ -161,7 +165,7 @@ ENTRY(error_entry)
 	movq %r15,(%rsp)
 
 error_call_handler:
-	movq %rdi, RDI(%rsp) 
+	movq %rdi, RDI(%rsp)
 	movq %rsp,%rdi
 	movq ORIG_RAX(%rsp),%rsi	# get error code
 	movq $-1,ORIG_RAX(%rsp)
@@ -175,6 +179,14 @@ ENTRY(hypervisor_callback)
 
 ENTRY(hypervisor_callback2)
 	movq %rdi, %rsp
+
+	/* check for event re-entry */
+	movq RIP(%rsp),%rax
+	cmpq $scrit,%rax
+	jb   11f
+	cmpq $ecrit,%rax
+	jb   critical_region_fixup
+
 11:	movq %gs:8,%rax
 	incl %gs:0
 	cmovzq %rax,%rsp
@@ -217,6 +229,76 @@ scrit:	/**** START OF CRITICAL REGION ****/
 ecrit:	/**** END OF CRITICAL REGION ****/
 
 
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of bytes which have already been popped from the
+# interrupted stack frame. This is the number of bytes from the current stack
+# that we need to copy at the end of the previous activation frame so that
+# we can continue as if we've never even reached 11 running in the old
+# activation frame.
+critical_region_fixup:
+	addq $critical_fixup_table - scrit, %rax
+	movzbq (%rax),%rax		# %rax contains num bytes popped
+	mov  %rsp,%rsi
+	add  %rax,%rsi			# %esi points at end of src region
+
+	movq RSP(%rsp),%rdi		# acquire interrupted %rsp from current stack frame
+					# %edi points at end of dst region
+	mov  %rax,%rcx
+	shr  $3,%rcx			# convert bytes into count of 64-bit entities
+	je   16f			# skip loop if nothing to copy
+15:	subq $8,%rsi			# pre-decrementing copy loop
+	subq $8,%rdi
+	movq (%rsi),%rax
+	movq %rax,(%rdi)
+	loop 15b
+16:	movq %rdi,%rsp			# final %rdi is top of merged stack
+	andb $KERNEL_CS_MASK,CS(%rsp)	# CS on stack might have changed
+	jmp  11b
+
+
+/* Nested event fixup look-up table */
+critical_fixup_table:
+	.byte 0x00,0x00,0x00			# XEN_TEST_PENDING(%rsi)
+	.byte 0x00,0x00,0x00,0x00,0x00,0x00	# jnz 14f
+	.byte 0x00,0x00,0x00,0x00		# mov (%rsp),%r15
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x8(%rsp),%r14
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x10(%rsp),%r13
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x18(%rsp),%r12
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x20(%rsp),%rbp
+	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x28(%rsp),%rbx
+	.byte 0x00,0x00,0x00,0x00		# add $0x30,%rsp
+	.byte 0x30,0x30,0x30,0x30		# mov (%rsp),%r11
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x8(%rsp),%r10
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x10(%rsp),%r9
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x18(%rsp),%r8
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x20(%rsp),%rax
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x28(%rsp),%rcx
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x30(%rsp),%rdx
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x38(%rsp),%rsi
+	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x40(%rsp),%rdi
+	.byte 0x30,0x30,0x30,0x30		# add $0x50,%rsp
+	.byte 0x80,0x80,0x80,0x80		# testl $NMI_MASK,2*8(%rsp)
+	.byte 0x80,0x80,0x80,0x80
+	.byte 0x80,0x80				# jnz 2f
+	.byte 0x80,0x80,0x80,0x80		# testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
+	.byte 0x80,0x80,0x80,0x80
+	.byte 0x80,0x80				# jnz 1f
+	.byte 0x80,0x80,0x80,0x80,0x80		# orb $3,1*8(%rsp)
+	.byte 0x80,0x80,0x80,0x80,0x80		# orb $3,4*8(%rsp)
+	.byte 0x80,0x80				# iretq
+	.byte 0x80,0x80,0x80,0x80		# andl $~NMI_MASK, 16(%rsp)
+	.byte 0x80,0x80,0x80,0x80
+	.byte 0x80,0x80				# pushq $\flag
+	.byte 0x78,0x78,0x78,0x78,0x78		# jmp hypercall_page + (__HYPERVISOR_iret * 32)
+	.byte 0x00,0x00,0x00,0x00		# XEN_LOCKED_BLOCK_EVENTS(%rsi)
+	.byte 0x00,0x00,0x00			# mov %rsp,%rdi
+	.byte 0x00,0x00,0x00,0x00,0x00		# jmp 11b
+
+
 ENTRY(failsafe_callback)
 	popq  %rcx
--
1.7.7.6
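To make the merge concrete, here is a standalone C model of the copy loop in
critical_region_fixup — a sketch only: frames are modeled as arrays of
8-byte slots, 'popped' stands for the fixup-table lookup result, and the
values in main() are made up.

#include <stdint.h>
#include <stdio.h>

/* Mirror of labels 15/16: copy the top 'popped' bytes of the nested
 * frame to just below the interrupted frame's saved %rsp, then use the
 * final destination pointer as the merged stack pointer. */
static uint64_t *merge_frames(uint64_t *new_rsp, uint64_t *old_rsp,
                              size_t popped)
{
    size_t n = popped / 8;           /* shr $3,%rcx                      */
    uint64_t *src = new_rsp + n;     /* %rsi: end of the source region   */
    uint64_t *dst = old_rsp;         /* %rdi: end of the dest region     */
    while (n--)                      /* 15: pre-decrementing copy loop   */
        *--dst = *--src;
    return dst;                      /* 16: final %rdi tops merged stack */
}

int main(void)
{
    uint64_t old_frame[16] = {0};          /* interrupted activation     */
    uint64_t new_frame[8]  = {1, 2, 3};    /* frame pushed by 2nd event  */
    /* pretend the fixup table said 0x18 bytes were already popped */
    uint64_t *rsp = merge_frames(new_frame, old_frame + 8, 0x18);
    printf("merged rsp holds %llu\n", (unsigned long long)rsp[0]);
    return 0;
}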
Samuel Thibault
2013-Mar-09 20:55 UTC
Re: [PATCH 2/6] mini-os/x86-64 entry: define macros for registers partial save and restore
Xu Zhang, on Fri 08 Mar 2013 15:30:15 -0600, wrote:
> +.macro SAVE_REST
> +.macro RESTORE_REST
> .macro RESTORE_ALL

I'd say rather put them after RESTORE_ALL, so that "REST" makes sense.

We could also go even further in this kind of cleanup, but we can do that
later.

Samuel
Samuel Thibault
2013-Mar-09 20:57 UTC
Re: [PATCH 1/6] mini-os/x86-64 entry: code clean-ups
Xu Zhang, on Fri 08 Mar 2013 15:30:14 -0600, wrote:
> Make use of tabs and spaces consistent in arch/x86/x86_64.S

Acked-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Samuel Thibault
2013-Mar-09 21:03 UTC
Re: [PATCH 3/6] mini-os/x86-64 entry: code refactoring; no functional changes
Xu Zhang, on Fri 08 Mar 2013 15:30:16 -0600, wrote:
> +#define XEN_GET_VCPU_INFO(reg)	movq HYPERVISOR_shared_info,reg
> +#define XEN_PUT_VCPU_INFO(reg)
> +#define XEN_PUT_VCPU_INFO_fixup
> +#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
> +#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
> +#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
> +
> +#define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
> +				XEN_LOCKED_BLOCK_EVENTS(reg)	; \
> +				XEN_PUT_VCPU_INFO(reg)
> +
> +#define XEN_UNBLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)		; \
> +				XEN_LOCKED_UNBLOCK_EVENTS(reg)	; \
> +				XEN_PUT_VCPU_INFO(reg)

I agree on moving those, as they don't really have a context.

> +/* Macros */
> +.macro zeroentry sym
> +	movq (%rsp),%rcx
> +	movq 8(%rsp),%r11
> +	addq $0x10,%rsp		/* skip rcx and r11 */
> +	pushq $0		/* push error code/oldrax */
> +	pushq %rax		/* push real oldrax to the rdi slot */
> +	leaq  \sym(%rip),%rax
> +	jmp error_entry
> +.endm
> +
> +.macro errorentry sym
> +	movq (%rsp),%rcx
> +	movq 8(%rsp),%r11
> +	addq $0x10,%rsp		/* rsp points to the error code */
> +	pushq %rax
> +	leaq  \sym(%rip),%rax
> +	jmp error_entry
> +.endm

I disagree on moving those, as they do have some context.

> 
> ENTRY(hypervisor_callback)
> 	zeroentry hypervisor_callback2
> 
> ENTRY(hypervisor_callback2)
> -        movq %rdi, %rsp
> +	movq %rdi, %rsp

Please avoid such kind of difference, or make it another patch.

> retint_kernel:
> retint_restore_args:

Please also drop these labels, they're not used.

I agree on the rest.

Samuel
Samuel Thibault
2013-Mar-09 21:07 UTC
Re: [PATCH 4/6] mini-os/x86-64 entry: remove unnecessary event blocking
Xu Zhang, on Fri 08 Mar 2013 15:30:17 -0600, wrote:
> We don't need to block events here because:
>  - if we came from "hypervisor_callback", events are disabled at this point,
>    no need to block again;
>  - if we came from "error_entry", we shouldn't touch event mask, for
>    exception hanlding are meant to be interrupted by Xen events (virtual
>    irq).
> 
> Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>

Acked-by: Samuel Thibault <samuel.thibault@ens-lyon.org>

> ---
>  extras/mini-os/arch/x86/x86_64.S |    1 -
>  1 files changed, 0 insertions(+), 1 deletions(-)
> 
> diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
> index 79e893f..5e0021b 100644
> --- a/extras/mini-os/arch/x86/x86_64.S
> +++ b/extras/mini-os/arch/x86/x86_64.S
> @@ -185,7 +185,6 @@ ENTRY(hypervisor_callback2)
>  
>  error_exit:
>  	RESTORE_REST
> -	XEN_BLOCK_EVENTS(%rsi)
>  
>  retint_kernel:
>  retint_restore_args:
> --
> 1.7.7.6

-- 
Samuel
> It [e2fsck] started fine, but handed control right back to me, saying
> "whoa, your thing is not a pretty sight, I'd rather you held the
> chainsaw yourself" (free translation)
NC, in Guide du linuxien pervers: "Configuring your chainsaw properly."
Samuel Thibault
2013-Mar-09 21:15 UTC
Re: [PATCH 5/6] mini-os/x86-64 entry: defer RESTORE_REST until return
Xu Zhang, on Fri 08 Mar 2013 15:30:18 -0600, wrote:
> No need to do a RESTORE_REST at this point because if we saw pending
> events after we enabled event delivery, we have to do a SAVE_REST again.
> Instead, we do a "lazy" RESTORE_REST, deferring it until immediately before
> the actual return.

So in the end RESTORE_REST always goes along with RESTORE_ALL? I guess
after this series of patches, they'll just get merged then?

One could say that it's better to avoid restoring many registers on the
critical path, but I tend to prefer something simple (to maintain), even
if long.

Acked-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Samuel Thibault
2013-Mar-09 21:19 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
Xu Zhang, on Fri 08 Mar 2013 15:30:19 -0600, wrote:
> In hypervisor_callback, check for event re-entry. If that's the case,
> try to fix up by looking up the number of bytes restored when the
> second event came and coalescing the two stack frames.
> The execution is resumed as if the second event never happened.

> -	movq %rdi, RDI(%rsp) 
> +	movq %rdi, RDI(%rsp)

Again, please avoid.

> +	andb $KERNEL_CS_MASK,CS(%rsp)	# CS on stack might have changed

I don't see why this.

Samuel
Jeremy Fitzhardinge
2013-Mar-09 22:44 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 03/08/2013 01:30 PM, Xu Zhang wrote:
> +# [How we do the fixup]. We want to merge the current stack frame with the
> +# just-interrupted frame. How we do this depends on where in the critical
> +# region the interrupted handler was executing, and so how many saved
> +# registers are in each frame. We do this quickly using the lookup table
> +# 'critical_fixup_table'. For each byte offset in the critical region, it
> +# provides the number of bytes which have already been popped from the
> +# interrupted stack frame. This is the number of bytes from the current stack
> +# that we need to copy at the end of the previous activation frame so that
> +# we can continue as if we've never even reached 11 running in the old
> +# activation frame.
> +critical_region_fixup:
> +	addq $critical_fixup_table - scrit, %rax
> +	movzbq (%rax),%rax		# %rax contains num bytes popped
> +	mov  %rsp,%rsi
> +	add  %rax,%rsi			# %esi points at end of src region
> +
> +	movq RSP(%rsp),%rdi		# acquire interrupted %rsp from current stack frame
> +					# %edi points at end of dst region
> +	mov  %rax,%rcx
> +	shr  $3,%rcx			# convert bytes into count of 64-bit entities
> +	je   16f			# skip loop if nothing to copy
> +15:	subq $8,%rsi			# pre-decrementing copy loop
> +	subq $8,%rdi
> +	movq (%rsi),%rax
> +	movq %rax,(%rdi)
> +	loop 15b
> +16:	movq %rdi,%rsp			# final %rdi is top of merged stack
> +	andb $KERNEL_CS_MASK,CS(%rsp)	# CS on stack might have changed
> +	jmp  11b
> +
> +
> +/* Nested event fixup look-up table */
> +critical_fixup_table:
> +	.byte 0x00,0x00,0x00			# XEN_TEST_PENDING(%rsi)
> +	.byte 0x00,0x00,0x00,0x00,0x00,0x00	# jnz 14f
> +	.byte 0x00,0x00,0x00,0x00		# mov (%rsp),%r15
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x8(%rsp),%r14
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x10(%rsp),%r13
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x18(%rsp),%r12
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x20(%rsp),%rbp
> +	.byte 0x00,0x00,0x00,0x00,0x00		# mov 0x28(%rsp),%rbx
> +	.byte 0x00,0x00,0x00,0x00		# add $0x30,%rsp
> +	.byte 0x30,0x30,0x30,0x30		# mov (%rsp),%r11
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x8(%rsp),%r10
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x10(%rsp),%r9
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x18(%rsp),%r8
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x20(%rsp),%rax
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x28(%rsp),%rcx
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x30(%rsp),%rdx
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x38(%rsp),%rsi
> +	.byte 0x30,0x30,0x30,0x30,0x30		# mov 0x40(%rsp),%rdi
> +	.byte 0x30,0x30,0x30,0x30		# add $0x50,%rsp
> +	.byte 0x80,0x80,0x80,0x80		# testl $NMI_MASK,2*8(%rsp)
> +	.byte 0x80,0x80,0x80,0x80
> +	.byte 0x80,0x80				# jnz 2f
> +	.byte 0x80,0x80,0x80,0x80		# testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
> +	.byte 0x80,0x80,0x80,0x80
> +	.byte 0x80,0x80				# jnz 1f
> +	.byte 0x80,0x80,0x80,0x80,0x80		# orb $3,1*8(%rsp)
> +	.byte 0x80,0x80,0x80,0x80,0x80		# orb $3,4*8(%rsp)
> +	.byte 0x80,0x80				# iretq
> +	.byte 0x80,0x80,0x80,0x80		# andl $~NMI_MASK, 16(%rsp)
> +	.byte 0x80,0x80,0x80,0x80
> +	.byte 0x80,0x80				# pushq $\flag
> +	.byte 0x78,0x78,0x78,0x78,0x78		# jmp hypercall_page + (__HYPERVISOR_iret * 32)
> +	.byte 0x00,0x00,0x00,0x00		# XEN_LOCKED_BLOCK_EVENTS(%rsi)
> +	.byte 0x00,0x00,0x00			# mov %rsp,%rdi
> +	.byte 0x00,0x00,0x00,0x00,0x00		# jmp 11b

This looks super-fragile. The original Xen-linux kernel code had a
similar kind of fixup table, but I went to some lengths to make it as
simple and robust as possible in the pvops kernels. See the comment in
xen-asm_32.S:

 * Because the nested interrupt handler needs to deal with the current
 * stack state in whatever form its in, we keep things simple by only
 * using a single register which is pushed/popped on the stack.

64-bit pvops Linux always uses the iret hypercall, so the issue is moot
there. (In principle a nested kernel interrupt could avoid the iret,
but it wasn't obvious that the extra complexity was worth it.)

    J
Xu Zhang
2013-Mar-13 02:42 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 03/09/2013 03:19 PM, Samuel Thibault wrote:
>> +	andb $KERNEL_CS_MASK,CS(%rsp)	# CS on stack might have changed
> I don't see why this.

Thank you for all your comments and acks. :-)

    ...
    /* Direct iret to kernel space. Correct CS and SS. */
    orb   $3,1*8(%rsp)
    orb   $3,4*8(%rsp)
1:  iretq
    ...

HYPERVISOR_IRET is part of the critical section. It could be the case that a
nested event comes immediately after we set the lowest byte of the on-stack
CS from 0 to 3 (orb $3,1*8(%rsp)). After invoking the fixup routine, we want
the stack frame to be exactly the same as the original one.

One scenario where this could be useful is if mini-os had a userspace. If the
on-stack CS had changed before the nested event came and we didn't reset it,
we could end up doing the return with a hypercall iret instead of a direct
iret — assuming that on x86-64 we return to kernel with a direct iret if the
on-stack CS is 0 (yet have to change it to 3 before doing so, because the
guest kernel runs in ring 3) and return to userspace with a hypercall iret if
it is 3. The scenario looks something like this:

 - enable events; in critical section: returning to kernel via direct iret
   (on-stack CS is 0);
   ...
 - orb $3,1*8(%rsp): set on-stack CS to 3;
 - nested event comes: fix up the stack frame;
 - go back to handle events; perform iret again; if we didn't reset the
   on-stack CS to 0 in the fixup routine, we would return with a hypercall
   iret instead of a direct iret (assuming that if the on-stack CS is 3, we
   return to userspace with a hypercall iret).

In x86-64 mini-os, the critical section only gets executed iff events need to
be enabled and we are returning to kernel. So does the fixup routine. Thus it
is safe to reset the lowest byte of CS to 0 in the fixup routine. It doesn't
hurt not to do this, but I think conceptually the stack frame should be the
same after fixing up. ;-)

Thanks,
-- xu
:q!
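As a small standalone sketch of the bit manipulation being discussed (the CS
value below is a made-up placeholder, not Xen's actual selector encoding):

#include <stdio.h>

int main(void)
{
    unsigned char cs_low = 0xe0;   /* hypothetical low byte of a ring-0 CS */

    cs_low |= 3;                   /* orb $3,1*8(%rsp): prepared for the   */
                                   /* direct iret                          */
    /* ...nested event arrives here; the frames get coalesced...           */
    cs_low &= 0xfc;                /* andb $KERNEL_CS_MASK,CS(%rsp): clear */
                                   /* the RPL bits so the merged frame     */
                                   /* matches the original one             */
    printf("%#x\n", cs_low);       /* back to 0xe0 */
    return 0;
}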
Xu Zhang
2013-Mar-13 02:42 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 03/09/2013 04:44 PM, Jeremy Fitzhardinge wrote:
> On 03/08/2013 01:30 PM, Xu Zhang wrote:
>> +# [How we do the fixup]. We want to merge the current stack frame with the
>> +# just-interrupted frame. How we do this depends on where in the critical
>> +# region the interrupted handler was executing, and so how many saved
>> +# registers are in each frame. We do this quickly using the lookup table
>> +# 'critical_fixup_table'. For each byte offset in the critical region, it
>> +# provides the number of bytes which have already been popped from the
>> +# interrupted stack frame. This is the number of bytes from the current stack
>> +# that we need to copy at the end of the previous activation frame so that
>> +# we can continue as if we've never even reached 11 running in the old
>> +# activation frame.
>> +critical_region_fixup:
>> [...]
>> +/* Nested event fixup look-up table */
>> +critical_fixup_table:
>> +	.byte 0x00,0x00,0x00			# XEN_TEST_PENDING(%rsi)
>> [...]
>> +	.byte 0x00,0x00,0x00,0x00,0x00		# jmp 11b
> This looks super-fragile. The original Xen-linux kernel code had a
> similar kind of fixup table, but I went to some lengths to make it as
> simple and robust as possible in the pvops kernels. See the comment in
> xen-asm_32.S:
>
>  * Because the nested interrupt handler needs to deal with the current
>  * stack state in whatever form its in, we keep things simple by only
>  * using a single register which is pushed/popped on the stack.
>
> 64-bit pvops Linux always uses the iret hypercall, so the issue is moot
> there. (In principle a nested kernel interrupt could avoid the iret,
> but it wasn't obvious that the extra complexity was worth it.)
>
> J

Thanks very much for your feedback.

I guess the concern is that using a chunky look-up table like this is
somewhat complicated and would be less easy to maintain? Samuel made some
suggestions, making it less "fragile" to some extent:

    Also, it would be good to check against critical section size change, in
    case somebody e.g. changes a value, or a macro like XEN_PUT_VCPU_INFO.
    For instance, stuff right after the table:

    .if (ecrit-scrit) != (critical_fixup_table_end - critical_fixup_table)
    .error "The critical section has changed, the fixup table needs updating"
    .endif

I read through the Xen-Linux 32-bit implementation for this issue. If I
understand it right, the use of a look-up table is avoided by leaving
RESTORE_ALL/REST out of the critical section. So when a nested event comes,
you are always certain about the stack frame layout and how many bytes to
copy over when fixing up (in Xen-Linux 32-bit, PT_EIP/4 - 1). And the
register that would be clobbered in order to read and write the event mask
(%eax in the Xen-Linux 32-bit case) is saved on the stack. This way we only
need to worry about this one register instead of many when fixing up the
stack frame.

Basically both approaches are correct to me and do the same job. Xen-Linux
32-bit is definitely a neater solution. However, one thing to notice is that
mini-os x86 32-bit also uses a fixup table. I guess we could apply these
patches as a temporary solution to make sure everything is correct and
consistent, and have following patches do the refinement.

Thanks,
-- xu
:q!
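For comparison, a tiny C sketch of the two ways of sizing the copy — purely
illustrative: the table values are toys, and PT_EIP here is just a
hypothetical pt_regs offset, not a verified constant.

#include <stdio.h>
#include <stdint.h>

/* Table-driven (mini-os, this series): the bytes already popped depend
 * on where in [scrit, ecrit) the nested event landed, hence a
 * per-byte-offset table. */
static const uint8_t fixup_table[] = { 0x00, 0x00, 0x30, 0x80 };  /* toy */

/* Fixed-size (the Xen-Linux 32-bit idea): nothing inside the critical
 * section pops frame registers, so the count is a compile-time constant
 * -- Xu's "PT_EIP/4 - 1". */
enum { PT_EIP = 48 };                      /* hypothetical offset */

int main(void)
{
    unsigned offset = 2;                   /* derived from faulting EIP */
    printf("table-driven: %u bytes popped\n", fixup_table[offset]);
    printf("fixed-size:   %d words to copy\n", PT_EIP / 4 - 1);
    return 0;
}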
Xu Zhang
2013-Mar-13 05:53 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 3/12/2013 6:42 PM, Xu Zhang wrote:
> Basically both approaches are correct to me and do the same job. Xen-Linux
> 32-bit is definitely a neater solution. However, one thing to notice is
> that mini-os x86 32-bit also uses a fixup table. I guess we could apply
> these patches as a temporary solution to make sure everything is correct
> and consistent, and have following patches do the refinement.

Ah, somewhat "consistent". :-)

And by refinement I mean refinement of both the 32-bit and 64-bit mini-os x86
kernel entries: basically replacing the use of a look-up table with Xen-Linux
x86-32-bit's approach.

Thanks,
- Xu
Jeremy Fitzhardinge
2013-Mar-14 01:09 UTC
Re: [PATCH 6/6] mini-os/x86-64 entry: check against nested events and try to fix up
On 03/12/2013 07:42 PM, Xu Zhang wrote:
> On 03/09/2013 04:44 PM, Jeremy Fitzhardinge wrote:
>> On 03/08/2013 01:30 PM, Xu Zhang wrote:
>>> +# [How we do the fixup]. We want to merge the current stack frame with
>>> +# the just-interrupted frame. How we do this depends on where in the
>>> +# critical region the interrupted handler was executing, and so how many
>>> +# saved registers are in each frame. We do this quickly using the lookup
>>> +# table 'critical_fixup_table'. For each byte offset in the critical
>>> +# region, it provides the number of bytes which have already been popped
>>> +# from the interrupted stack frame. This is the number of bytes from the
>>> +# current stack that we need to copy at the end of the previous
>>> +# activation frame so that we can continue as if we've never even
>>> +# reached 11 running in the old activation frame.
>>> +critical_region_fixup:
>>> [...]
>>> +/* Nested event fixup look-up table */
>>> +critical_fixup_table:
>>> +	.byte 0x00,0x00,0x00			# XEN_TEST_PENDING(%rsi)
>>> [...]
>>> +	.byte 0x00,0x00,0x00,0x00,0x00		# jmp 11b
>> This looks super-fragile. The original Xen-linux kernel code had a
>> similar kind of fixup table, but I went to some lengths to make it as
>> simple and robust as possible in the pvops kernels. See the comment in
>> xen-asm_32.S:
>>
>>  * Because the nested interrupt handler needs to deal with the current
>>  * stack state in whatever form its in, we keep things simple by only
>>  * using a single register which is pushed/popped on the stack.
>>
>> 64-bit pvops Linux always uses the iret hypercall, so the issue is moot
>> there. (In principle a nested kernel interrupt could avoid the iret,
>> but it wasn't obvious that the extra complexity was worth it.)
>>
>> J
>
> Thanks very much for your feedback.
>
> I guess the concern is that using a chunky look-up table like this is
> somewhat complicated and would be less easy to maintain? Samuel made some
> suggestions, making it less "fragile" to some extent:
>
>     Also, it would be good to check against critical section size change,
>     in case somebody e.g. changes a value, or a macro like
>     XEN_PUT_VCPU_INFO. For instance, stuff right after the table:
>
>     .if (ecrit-scrit) != (critical_fixup_table_end - critical_fixup_table)
>     .error "The critical section has changed, the fixup table needs updating"
>     .endif

That would catch some problems, but not all. I worry that the whole
construction is too error-prone, and it's too easy for some later
unsuspecting developer to come in and introduce a subtle bug that only
occurs under very particular timing conditions.

> I read through the Xen-Linux 32-bit implementation for this issue. If I
> understand it right, the use of a look-up table is avoided by leaving
> RESTORE_ALL/REST out of the critical section. So when a nested event comes,
> you are always certain about the stack frame layout and how many bytes to
> copy over when fixing up (in Xen-Linux 32-bit, PT_EIP/4 - 1). And the
> register that would be clobbered in order to read and write the event mask
> (%eax in the Xen-Linux 32-bit case) is saved on the stack. This way we only
> need to worry about this one register instead of many when fixing up the
> stack frame.

Right.

> Basically both approaches are correct to me and do the same job. Xen-Linux
> 32-bit is definitely a neater solution. However, one thing to notice is
> that mini-os x86 32-bit also uses a fixup table.

I don't have mini-os in front of me at the moment, but I wouldn't be
surprised if the original Xen-Linux and mini-os share heritage in this area.

> I guess we could apply these patches as a temporary solution to make sure
> everything is correct and consistent, and have following patches do the
> refinement.

That makes sense to me if the refinement is a large amount of new work on
top of what you've already done, but I don't think that's obviously true.
Given that you're familiar with the code and the problem domain, I expect
you could knock out a clean solution pretty quickly. I guess you're
interested in this path for 64-bit as well, because a same-ring interrupt
return is much more likely in minios.

    J
Konrad Rzeszutek Wilk
2013-Mar-15 20:16 UTC
Re: [PATCH 4/6] mini-os/x86-64 entry: remove unnecessary event blocking
On Fri, Mar 08, 2013 at 03:30:17PM -0600, Xu Zhang wrote:
> We don't need to block events here because:
> - if we came from "hypervisor_callback", events are disabled at this point,
>   no need to block again;

Hm, I think I am not following it. It does a bit of %gs manipulation which
looks to point to the cpu0_pda.irqcount.

?

> - if we came from "error_entry", we shouldn't touch event mask, for
>   exception hanlding are meant to be interrupted by Xen events (virtual

handling

> irq).
> 
> Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
> ---
>  extras/mini-os/arch/x86/x86_64.S |    1 -
>  1 files changed, 0 insertions(+), 1 deletions(-)
> 
> diff --git a/extras/mini-os/arch/x86/x86_64.S b/extras/mini-os/arch/x86/x86_64.S
> index 79e893f..5e0021b 100644
> --- a/extras/mini-os/arch/x86/x86_64.S
> +++ b/extras/mini-os/arch/x86/x86_64.S
> @@ -185,7 +185,6 @@ ENTRY(hypervisor_callback2)
>  
>  error_exit:
>  	RESTORE_REST
> -	XEN_BLOCK_EVENTS(%rsi)
>  
>  retint_kernel:
>  retint_restore_args:
> --
> 1.7.7.6
Xu Zhang
2013-Apr-11 04:40 UTC
Re: [PATCH 4/6] mini-os/x86-64 entry: remove unnecessary event blocking
Sorry about my delayed response. I just got some time to take a look at and
revise these patches.

It seems to me the bit of %gs manipulation is used for executing
"do_hypervisor_callback" on a different stack. After it's done, it switches
back to the original stack and resumes from there. The event mask is not
touched during this process.

I put mini-os under gdb just to verify, and for the runs I observed, the
event mask remained untouched. Is there anything I am missing?

On 03/15/2013 03:16 PM, Konrad Rzeszutek Wilk wrote:
> On Fri, Mar 08, 2013 at 03:30:17PM -0600, Xu Zhang wrote:
>> We don't need to block events here because:
>> - if we came from "hypervisor_callback", events are disabled at this point,
>>   no need to block again;
> Hm, I think I am not following it. It does a bit of %gs manipulation which
> looks to point to the cpu0_pda.irqcount.
>
> ?
>> - if we came from "error_entry", we shouldn't touch event mask, for
>>   exception hanlding are meant to be interrupted by Xen events (virtual
> handling
>> irq).
>>
>> Signed-off-by: Xu Zhang <xzhang@cs.uic.edu>
>> [...]
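As a reference for this exchange, a C model of the %gs sequence in
hypervisor_callback2 — a sketch under the assumption (per the thread's
cpu0_pda reference) that %gs:0 is an irqcount initialized to -1 and %gs:8
holds the top of a dedicated IRQ stack; the names are stand-ins:

#include <stddef.h>

static long irqcount = -1;     /* %gs:0 -- -1 while no callback is running */
static char irqstack[8192];
static char *irqstacktop = irqstack + sizeof(irqstack);     /* %gs:8 */

static void do_hypervisor_callback(void *regs) { (void)regs; }

static void hypervisor_callback2(void *regs)
{
    char *sp = (char *)regs;      /* movq %rdi,%rsp                     */
    if (++irqcount == 0)          /* incl %gs:0; cmovzq %rax,%rsp: only */
        sp = irqstacktop;         /* the outermost entry switches to    */
    (void)sp;                     /* the dedicated IRQ stack            */
    do_hypervisor_callback(regs); /* pushq %rdi; call ...; popq %rsp    */
                                  /* then restores the original stack;  */
                                  /* the event mask is never written    */
    --irqcount;                   /* decl %gs:0                         */
}

int main(void) { hypervisor_callback2(NULL); return 0; }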
Xu Zhang
2013-Apr-11 04:40 UTC
Re: [PATCH 3/6] mini-os/x86-64 entry: code refactoring; no functional changes
On 03/09/2013 03:03 PM, Samuel Thibault wrote:
>> +/* Macros */
>> +.macro zeroentry sym
>> +	movq (%rsp),%rcx
>> +	movq 8(%rsp),%r11
>> +	addq $0x10,%rsp		/* skip rcx and r11 */
>> +	pushq $0		/* push error code/oldrax */
>> +	pushq %rax		/* push real oldrax to the rdi slot */
>> +	leaq  \sym(%rip),%rax
>> +	jmp error_entry
>> +.endm
>> +
>> +.macro errorentry sym
>> +	movq (%rsp),%rcx
>> +	movq 8(%rsp),%r11
>> +	addq $0x10,%rsp		/* rsp points to the error code */
>> +	pushq %rax
>> +	leaq  \sym(%rip),%rax
>> +	jmp error_entry
>> +.endm
> I disagree on moving those, as they do have some context.

Well, if they have some context, they probably end up better placed just
before they first get called, say before hypervisor_callback. Note they are
also used in the exception handlers. Besides, it's kind of annoying to have
these macros sit in between assembly routines. Probably it's just me. If you
prefer to leave them untouched, I am perfectly fine with that.

>> retint_kernel:
>> retint_restore_args:
> Please also drop these labels, they're not used.

I would argue it's going to be a different story if mini-os gets a user
space: "retint_kernel" returns to kernel, and presumably "retint_user" would
do scheduling, lazy x87 switching, handle softirqs, etc. But since it does
not, I guess they can be dropped.