Make the Linux kernel able to run at CPL 0, 1, or 2.  There are a few
limited places where CPL-0 is assumed, and they can be converted very
efficiently into a supervisor CPL check instead of a CPL-0 check.
This step prepares the kernel for running in direct execution under
a hypervisor.
Note the user_mode_vm macro used in ptrace.h is very similar to the
flag mixing of EFLAGS and CS used to test in one branch in entry.S.
To make the COMPARE_SEGMENT_REG macro work, it must contain both the
push and the pop, requiring an additional load of EAX after the
possible stack fixup.  This is because FIXUP_ESPFIX_STACK destroys
%EAX again with a call to C code.
In all, the overhead is couple of instructions, and no extra branches.
Note that I prefer to use the testing of selectors in the form:
  SELECTOR_CLEAR_RPL(sel) == __KERNEL_SEL
Instead of SELECTOR == (%seg), where %seg is the live register value.
This is because moves from segment registers are more costly than a
single ALU instruction, and both cost a temporary register.  Still,
pushing the raw %cs value in the NMI after sysenter debug trap fixup
code instead of __KERNEL_CS | RPL is just as efficient, since fetching
RPL would require a %cs load anyway.
The switch from __KERNEL_DS to __USER_DS at one point is for
convenience, since they are both equivalent for %ds and %es segments;
__KERNEL_DS, contrary to the name, is only useful for the %ss segment.
Signed-off-by: Zachary Amsden <zach@vmware.com>
Index: linux-2.6.16-rc5/include/asm-i386/ptrace.h
==================================================================---
linux-2.6.16-rc5.orig/include/asm-i386/ptrace.h	2006-03-08 16:58:49.000000000
-0800
+++ linux-2.6.16-rc5/include/asm-i386/ptrace.h	2006-03-08 17:10:26.000000000
-0800
@@ -60,6 +60,7 @@ struct pt_regs {
 #ifdef __KERNEL__
 
 #include <asm/vm86.h>
+#include <asm/segment.h>
 
 struct task_struct;
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int
error_code);
@@ -73,11 +74,11 @@ extern void send_sigtrap(struct task_str
  */
 static inline int user_mode(struct pt_regs *regs)
 {
-	return (regs->xcs & 3) != 0;
+	return (regs->xcs & SEGMENT_RPL_MASK) == 3;
 }
 static inline int user_mode_vm(struct pt_regs *regs)
 {
-	return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0;
+	return (((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags &
VM_MASK)) >= 3);
 }
 #define instruction_pointer(regs) ((regs)->eip)
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
Index: linux-2.6.16-rc5/include/asm-i386/segment.h
==================================================================---
linux-2.6.16-rc5.orig/include/asm-i386/segment.h	2006-03-08 16:58:49.000000000
-0800
+++ linux-2.6.16-rc5/include/asm-i386/segment.h	2006-03-08 17:10:26.000000000
-0800
@@ -112,4 +112,9 @@
  */
 #define IDT_ENTRIES 256
 
+#define SEGMENT_RPL_MASK	0x03
+#define SEGMENT_TI_MASK		0x04
+
+#include <mach_segment.h>
+
 #endif
Index: linux-2.6.16-rc5/include/asm-i386/mach-vmi/mach_segment.h
==================================================================---
linux-2.6.16-rc5.orig/include/asm-i386/mach-vmi/mach_segment.h	2006-03-08
17:10:26.000000000 -0800
+++ linux-2.6.16-rc5/include/asm-i386/mach-vmi/mach_segment.h	2006-03-08
17:10:26.000000000 -0800
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to zach@vmware.com
+ *
+ */
+
+
+#ifndef __MACH_SEGMENT_H
+#define __MACH_SEGMENT_H
+
+#if !defined(CONFIG_X86_VMI)
+# error invalid sub-arch include
+#endif
+
+#ifndef __ASSEMBLY__
+static inline unsigned get_kernel_rpl(void)
+{
+	unsigned cs;
+	__asm__ ("movl %%cs,%0" : "=r"(cs):);
+	return cs & SEGMENT_RPL_MASK;
+}
+#endif
+
+#define COMPARE_SEGMENT_STACK(segment, offset)	\
+	pushl %eax;				\
+	mov   offset+4(%esp), %eax;		\
+	andl  $~SEGMENT_RPL_MASK, %eax;		\
+	cmpw  $segment,%ax;			\
+	popl  %eax;
+
+#define COMPARE_SEGMENT_REG(segment, reg)	\
+	pushl %eax;				\
+	mov   reg, %eax;			\
+	andl  $~SEGMENT_RPL_MASK, %eax;		\
+	cmpw  $segment,%ax;			\
+	popl  %eax;
+
+#endif
Index: linux-2.6.16-rc5/include/asm-i386/mach-default/mach_segment.h
==================================================================---
linux-2.6.16-rc5.orig/include/asm-i386/mach-default/mach_segment.h	2006-03-08
17:10:26.000000000 -0800
+++ linux-2.6.16-rc5/include/asm-i386/mach-default/mach_segment.h	2006-03-09
15:51:42.000000000 -0800
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to zach@vmware.com
+ *
+ */
+
+
+#ifndef __MACH_SEGMENT_H
+#define __MACH_SEGMENT_H
+
+#define get_kernel_rpl()  0
+
+#define COMPARE_SEGMENT_STACK(segment, offset)	\
+	cmpw $segment, offset(%esp);
+
+#define COMPARE_SEGMENT_REG(segment, reg)	\
+	pushl %eax;				\
+	mov   reg, %eax;			\
+	cmpw  $segment,%ax;			\
+	popl  %eax;
+#endif
Index: linux-2.6.16-rc5/arch/i386/kernel/entry.S
==================================================================---
linux-2.6.16-rc5.orig/arch/i386/kernel/entry.S	2006-03-08 17:10:25.000000000
-0800
+++ linux-2.6.16-rc5/arch/i386/kernel/entry.S	2006-03-08 17:10:26.000000000
-0800
@@ -145,9 +145,11 @@ ret_from_exception:
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
+	andl $VM_MASK, %eax
 	movb CS(%esp), %al
-	testl $(VM_MASK | 3), %eax
-	jz resume_kernel
+	andb $SEGMENT_RPL_MASK, %al
+	cmpl $SEGMENT_RPL_MASK, %eax
+	jb resume_kernel		# returning to kernel or vm86-space
 ENTRY(resume_userspace)
  	CLI				# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
@@ -382,17 +384,14 @@ syscall_badsys:
 	/* put ESP to the proper location */ \
 	movl %eax, %esp;
 #define UNWIND_ESPFIX_STACK \
-	pushl %eax; \
-	movl %ss, %eax; \
-	/* see if on 16bit stack */ \
-	cmpw $__ESPFIX_SS, %ax; \
+	COMPARE_SEGMENT_REG(__ESPFIX_SS, %ss) \
 	jne 28f; \
-	movl $__KERNEL_DS, %edx; \
+	movl $__USER_DS, %edx; \
 	movl %edx, %ds; \
 	movl %edx, %es; \
 	/* switch to 32bit stack */ \
 	FIXUP_ESPFIX_STACK \
-28:	popl %eax;
+28:;
 
 /*
  * Build the entry stubs and pointer table with
@@ -451,6 +450,7 @@ error_code:
 	pushl %es
 	UNWIND_ESPFIX_STACK
 	popl %ecx
+	movl EAX(%esp), %eax
 	movl ES(%esp), %edi		# get the function address
 	movl ORIG_EAX(%esp), %edx	# get the error code
 	movl %eax, ORIG_EAX(%esp)
@@ -501,12 +501,12 @@ device_not_available_emulate:
  * the instruction that would have done it for sysenter.
  */
 #define FIX_STACK(offset, ok, label)		\
-	cmpw $__KERNEL_CS,4(%esp);		\
+	COMPARE_SEGMENT_STACK(__KERNEL_CS, 4)	\
 	jne ok;					\
 label:						\
 	movl TSS_sysenter_esp0+offset(%esp),%esp;	\
 	pushfl;					\
-	pushl $__KERNEL_CS;			\
+	push  %cs;				\
 	pushl $sysenter_past_esp
 
 KPROBE_ENTRY(debug)
@@ -530,10 +530,7 @@ debug_stack_correct:
  * fault happened on the sysenter path.
  */
 ENTRY(nmi)
-	pushl %eax
-	movl %ss, %eax
-	cmpw $__ESPFIX_SS, %ax
-	popl %eax
+	COMPARE_SEGMENT_REG(__ESPFIX_SS, %ss)
 	je nmi_16bit_stack
 	cmpl $sysenter_entry,(%esp)
 	je nmi_stack_fixup
@@ -560,7 +557,7 @@ nmi_stack_fixup:
 	FIX_STACK(12,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
 nmi_debug_stack_check:
-	cmpw $__KERNEL_CS,16(%esp)
+	COMPARE_SEGMENT_STACK(__KERNEL_CS, 16)
 	jne nmi_stack_correct
 	cmpl $debug,(%esp)
 	jb nmi_stack_correct
Index: linux-2.6.16-rc5/arch/i386/kernel/process.c
==================================================================---
linux-2.6.16-rc5.orig/arch/i386/kernel/process.c	2006-03-08 16:58:49.000000000
-0800
+++ linux-2.6.16-rc5/arch/i386/kernel/process.c	2006-03-09 15:52:17.000000000
-0800
@@ -348,7 +348,7 @@ int kernel_thread(int (*fn)(void *), voi
 	regs.xes = __USER_DS;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
-	regs.xcs = __KERNEL_CS;
+	regs.xcs = __KERNEL_CS | get_kernel_rpl();
 	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
 
 	/* Ok, create the new process.. */
Index: linux-2.6.16-rc5/arch/i386/kernel/traps.c
==================================================================---
linux-2.6.16-rc5.orig/arch/i386/kernel/traps.c	2006-03-08 17:10:25.000000000
-0800
+++ linux-2.6.16-rc5/arch/i386/kernel/traps.c	2006-03-08 17:10:26.000000000
-0800
@@ -970,10 +970,10 @@ fastcall void setup_x86_bogus_stack(unsi
 	memcpy((void *)(stack_bot + iret_frame16_off), ®s->eip, 20);
 	/* fill in the switch pointers */
 	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
-	switch16_ptr[1] = __ESPFIX_SS;
+	switch16_ptr[1] = __ESPFIX_SS | get_kernel_rpl();
 	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
 		8 - CPU_16BIT_STACK_SIZE;
-	switch32_ptr[1] = __KERNEL_DS;
+	switch32_ptr[1] = __KERNEL_DS | get_kernel_rpl();
 }
 
 fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)