As we realized while trying out NetWare's ring 3 support, call gates didn't
work for 32-bit guests on a 64-bit hypervisor. Since x86-64 doesn't support
16- or 32-bit call gates, the only option is to emulate them. The code here
was developed against 3.0.4, so it hasn't been checked for potential
integration with the much improved emulator; nevertheless I want to supply
this patch.
As was realized in the course of creating this patch, 64-bit gates don't
work either, and will also need to be emulated if any environment intends
to use them. The patch changes behavior here: rather than silently
permitting the use of 64-bit gates (with possibly hard-to-understand
exceptions happening on the first instruction of the call/jump target),
the call/jump itself will now fault, with the error code indicating the
gate whose use was attempted. I intend to complete the emulation to also
cover 64-bit gates, but there is one issue that first needs to be
addressed: whether a gate transitions from user to kernel mode doesn't
depend on the gate itself, but rather on the descriptor referenced by the
selector held in the gate. As the two can change independently, this
decision can be made only at the point the gate is used, and consequently
descriptors for kernel code segments must become distinguishable from user
ones, which they currently aren't, as both get their DPL forced to 3. An
initial thought is to leverage the otherwise meaningless conforming bit,
i.e. force it on for all user code segments and off for kernel ones, and
make the distinction at the point the descriptor gets verified/fixed up,
based on the kernel-supplied DPL (this wouldn't work for old guests, where
setting the DPL to 3 still had to be done by the guest).
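To make this concrete, here is a minimal sketch (not part of the patch;
the helper name is made up) of how check_descriptor() could tag code
segments via the conforming bit:

    /* Sketch only: force the conforming bit on for user code segments
     * and off for kernel ones, so that the gate emulation could later
     * tell, at the point of use, whether the gate enters the kernel. */
    static void tag_code_segment(struct desc_struct *d,
                                 unsigned int guest_dpl,
                                 unsigned int kernel_rpl)
    {
        if ( !(d->b & _SEGMENT_S) || !(d->b & _SEGMENT_CODE) )
            return; /* not a code segment */
        if ( guest_dpl == kernel_rpl )
            d->b &= ~_SEGMENT_EC; /* kernel code: non-conforming */
        else
            d->b |= _SEGMENT_EC;  /* user code: conforming */
    }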
The patch also changes the behavior of check_descriptor() in that no
modification is done to the descriptor anymore unless all verification
steps passed, and in that the RPL of selectors in call gates no longer
gets fixed up (a comment elsewhere in the code correctly states that the
RPL field here isn't used for anything by the processor); indeed, this
field is now used on 64-bit to store the original DPL of the gate, because
the architectural DPL now gets forced to zero.
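For reference, this is how the stashed DPL gets recovered when the gate is
later read back (condensed from read_gate_descriptor() in the patch below;
desc.a bits 31:16 hold the gate's target selector):

    sel = (desc.a >> 16) & 0x0000fffc;          /* selector, RPL stripped */
    ar  = desc.b & 0x0000ffff;                  /* access rights, DPL == 0 */
    ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL; /* move stashed RPL to DPL */

So a gate installed with selector 0xe008 and DPL 3, for example, gets
stored with selector 0xe00b and an architectural DPL of 0, and reads back
with its original DPL of 3.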
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Index: 2007-01-08/xen/arch/x86/traps.c
===================================================================
--- 2007-01-08.orig/xen/arch/x86/traps.c 2007-01-25 13:55:26.000000000 +0100
+++ 2007-01-08/xen/arch/x86/traps.c 2007-01-26 10:04:04.000000000 +0100
@@ -1087,6 +1087,63 @@ static int read_descriptor(unsigned int
return 1;
}
+#ifdef CONFIG_COMPAT /* XXX __x86_64__ */
+static int read_gate_descriptor(unsigned int gate_sel,
+ const struct vcpu *v,
+ unsigned int *sel,
+ unsigned long *off,
+ unsigned int *ar)
+{
+ struct desc_struct desc;
+ const struct desc_struct *pdesc;
+
+ pdesc = (const struct desc_struct *)(!(gate_sel & 4) ?
+ GDT_VIRT_START(v) :
+ LDT_VIRT_START(v))
+ + (gate_sel >> 3);
+ if ( gate_sel < 4 ||
+ (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) ||
+ __get_user(desc, pdesc) )
+ return 0;
+
+ *sel = (desc.a >> 16) & 0x0000fffc;
+ *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000);
+ *ar = desc.b & 0x0000ffff;
+ /*
+ * check_descriptor() clears the DPL field and stores the
+ * guest requested DPL in the selector's RPL field.
+ */
+ ASSERT(!(*ar & _SEGMENT_DPL));
+ *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL;
+
+ if ( !IS_COMPAT(v->domain) )
+ {
+ if ( (*ar & 0x1f00) != 0x0c00 ||
+ (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
+ __get_user(desc, pdesc + 1) ||
+ (desc.b & 0x1f00) )
+ return 0;
+
+ *off |= (unsigned long)desc.a << 32;
+ return 1;
+ }
+
+ switch ( *ar & 0x1f00 )
+ {
+ case 0x0400:
+ *off &= 0xffff;
+ break;
+ case 0x0c00:
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+#endif
+
/* Has the guest requested sufficient permission for this I/O access? */
static inline int guest_io_okay(
unsigned int port, unsigned int bytes,
@@ -1154,6 +1211,8 @@ unsigned long guest_to_host_gpr_switch(u
#define insn_fetch(type, base, eip, limit) \
({ unsigned long _rc, _ptr = (base) + (eip); \
type _x; \
+ if ( ad_default < 8 ) \
+ _ptr = (unsigned int)_ptr; \
if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
goto fail; \
if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
@@ -1756,6 +1815,336 @@ static int emulate_privileged_op(struct
return 0;
}
+static inline int check_stack_limit(unsigned int ar, unsigned int limit,
+ unsigned int esp, unsigned int decr)
+{
+ return esp - decr < esp - 1 &&
+ (!(ar & _SEGMENT_EC) ? esp - 1 <= limit : esp - decr > limit);
+}
+
+static int emulate_gate_op(struct cpu_user_regs *regs)
+{
+#ifdef CONFIG_COMPAT /* XXX __x86_64__ */
+ struct vcpu *v = current;
+ unsigned int sel, ar, dpl, nparm, opnd_sel;
+ unsigned int op_default, op_bytes, ad_default, ad_bytes;
+ unsigned long off, eip, opnd_off, base, limit;
+ int jump;
+
+ /* Check whether this fault is due to the use of a call gate. */
+ if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
+ ((ar >> 13) & 3) < (regs->cs & 3) ||
+ (ar & _SEGMENT_TYPE) != 0xc00 )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ if ( !(ar & _SEGMENT_P) )
+ return do_guest_trap(TRAP_no_segment, regs, 1);
+ dpl = (ar >> 13) & 3;
+ nparm = ar & 0x1f;
+
+ /*
+ * Decode instruction (and perhaps operand) to determine RPL,
+ * whether this is a jump or a call, and the call return offset.
+ */
+ if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ !(ar & _SEGMENT_CODE) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+ op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
+ ad_default = ad_bytes = op_default;
+ opnd_sel = opnd_off = 0;
+ jump = -1;
+ for ( eip = regs->eip; eip - regs->_eip < 10; )
+ {
+ switch ( insn_fetch(u8, base, eip, limit) )
+ {
+ case 0x66: /* operand-size override */
+ op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
+ continue;
+ case 0x67: /* address-size override */
+ ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
+ continue;
+ case 0x2e: /* CS override */
+ opnd_sel = regs->cs;
+ ASSERT(opnd_sel);
+ continue;
+ case 0x3e: /* DS override */
+ opnd_sel = read_sreg(regs, ds);
+ if ( !opnd_sel )
+ opnd_sel = dpl;
+ continue;
+ case 0x26: /* ES override */
+ opnd_sel = read_sreg(regs, es);
+ if ( !opnd_sel )
+ opnd_sel = dpl;
+ continue;
+ case 0x64: /* FS override */
+ opnd_sel = read_sreg(regs, fs);
+ if ( !opnd_sel )
+ opnd_sel = dpl;
+ continue;
+ case 0x65: /* GS override */
+ opnd_sel = read_sreg(regs, gs);
+ if ( !opnd_sel )
+ opnd_sel = dpl;
+ continue;
+ case 0x36: /* SS override */
+ opnd_sel = regs->ss;
+ if ( !opnd_sel )
+ opnd_sel = dpl;
+ continue;
+ case 0xea:
+ ++jump;
+ /* FALLTHROUGH */
+ case 0x9a:
+ ++jump;
+ opnd_sel = regs->cs;
+ opnd_off = eip;
+ ad_bytes = ad_default;
+ eip += op_bytes + 2;
+ break;
+ case 0xff:
+ {
+ unsigned int modrm;
+
+ switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
+ {
+ case 0x28: case 0x68: case 0xa8:
+ ++jump;
+ /* FALLTHROUGH */
+ case 0x18: case 0x58: case 0x98:
+ ++jump;
+ if ( ad_bytes != 2 )
+ {
+ if ( (modrm & 7) == 4 )
+ {
+ unsigned int sib = insn_fetch(u8, base, eip, limit);
+
+ modrm = (modrm & ~7) | (sib & 7);
+ if ( (sib >>= 3) != 4 )
+ opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0);
+ opnd_off <<= sib >> 3;
+ }
+ if ( (modrm & 7) != 5 || (modrm & 0xc0) )
+ opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0);
+ else
+ modrm |= 0x87;
+ if ( !opnd_sel )
+ {
+ switch ( modrm & 7 )
+ {
+ default:
+ opnd_sel = read_sreg(regs, ds);
+ break;
+ case 4: case 5:
+ opnd_sel = regs->ss;
+ break;
+ }
+ }
+ }
+ else
+ {
+ switch ( modrm & 7 )
+ {
+ case 0: case 1: case 7:
+ opnd_off = regs->ebx;
+ break;
+ case 6:
+ if ( !(modrm & 0xc0) )
+ modrm |= 0x80;
+ else
+ case 2: case 3:
+ {
+ opnd_off = regs->ebp;
+ if ( !opnd_sel )
+ opnd_sel = regs->ss;
+ }
+ break;
+ }
+ if ( !opnd_sel )
+ opnd_sel = read_sreg(regs, ds);
+ switch ( modrm & 7 )
+ {
+ case 0: case 2: case 4:
+ opnd_off += regs->esi;
+ break;
+ case 1: case 3: case 5:
+ opnd_off += regs->edi;
+ break;
+ }
+ }
+ switch ( modrm & 0xc0 )
+ {
+ case 0x40:
+ opnd_off += insn_fetch(s8, base, eip, limit);
+ break;
+ case 0x80:
+ opnd_off += insn_fetch(s32, base, eip, limit);
+ break;
+ }
+ if ( ad_bytes == 4 )
+ opnd_off = (unsigned int)opnd_off;
+ else if ( ad_bytes == 2 )
+ opnd_off = (unsigned short)opnd_off;
+ break;
+ }
+ }
+ break;
+ }
+ break;
+ }
+
+ if ( jump < 0 )
+ {
+ fail:
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ }
+
+ if ( (opnd_sel != regs->cs &&
+ !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+ opnd_off += op_bytes;
+#define ad_default ad_bytes
+ opnd_sel = insn_fetch(u16, base, opnd_off, limit);
+#undef ad_default
+ ASSERT((opnd_sel & ~3) == regs->error_code);
+ if ( dpl < (opnd_sel & 3) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+
+ if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_CODE) ||
+ (!jump || (ar & _SEGMENT_EC) ?
+ ((ar >> 13) & 3) > (regs->cs & 3) :
+ ((ar >> 13) & 3) != (regs->cs & 3)) )
+ {
+ regs->error_code = sel;
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ }
+ if ( !(ar & _SEGMENT_P) )
+ {
+ regs->error_code = sel;
+ return do_guest_trap(TRAP_no_segment, regs, 1);
+ }
+ if ( off > limit )
+ {
+ regs->error_code = 0;
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ }
+
+ if ( !jump )
+ {
+ unsigned int ss, esp, *stkp;
+ int rc;
+#define push(item) do \
+ { \
+ --stkp; \
+ esp -= 4; \
+ rc = __put_user(item, stkp); \
+ if ( rc ) \
+ { \
+ propagate_page_fault((unsigned long)(stkp + 1) - rc, \
+ PFEC_write_access); \
+ return 0; \
+ } \
+ } while ( 0 )
+
+ if ( ((ar >> 13) & 3) < (regs->cs & 3) )
+ {
+ sel |= (ar >> 13) & 3;
+ /* Inner stack known only for kernel ring. */
+ if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ esp = v->arch.guest_context.kernel_sp;
+ ss = v->arch.guest_context.kernel_ss;
+ if ( (ss & 3) != (sel & 3) ||
+ !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
+ ((ar >> 13) & 3) != (sel & 3) ||
+ !(ar & _SEGMENT_S) ||
+ (ar & _SEGMENT_CODE) ||
+ !(ar & _SEGMENT_WR) )
+ {
+ regs->error_code = ss & ~3;
+ return do_guest_trap(TRAP_invalid_tss, regs, 1);
+ }
+ if ( !(ar & _SEGMENT_P) ||
+ !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) )
+ {
+ regs->error_code = ss & ~3;
+ return do_guest_trap(TRAP_stack_error, regs, 1);
+ }
+ stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+ if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ push(regs->ss);
+ push(regs->esp);
+ if ( nparm )
+ {
+ const unsigned int *ustkp;
+
+ if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) ||
+ ((ar >> 13) & 3) != (regs->cs & 3) ||
+ !(ar & _SEGMENT_S) ||
+ (ar & _SEGMENT_CODE) ||
+ !(ar & _SEGMENT_WR) ||
+ !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4);
+ if ( !compat_access_ok(ustkp - nparm, nparm * 4) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ do
+ {
+ unsigned int parm;
+
+ --ustkp;
+ rc = __get_user(parm, ustkp);
+ if ( rc )
+ {
+ propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0);
+ return 0;
+ }
+ push(parm);
+ } while ( --nparm );
+ }
+ }
+ else
+ {
+ sel |= (regs->cs & 3);
+ esp = regs->esp;
+ ss = regs->ss;
+ if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) ||
+ ((ar >> 13) & 3) != (sel & 3) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ if ( !check_stack_limit(ar, limit, esp, 2 * 4) )
+ {
+ regs->error_code = 0;
+ return do_guest_trap(TRAP_stack_error, regs, 1);
+ }
+ stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp);
+ if ( !compat_access_ok(stkp - 2, 2 * 4) )
+ return do_guest_trap(TRAP_gp_fault, regs, 1);
+ }
+ push(regs->cs);
+ push(eip);
+#undef push
+ regs->esp = esp;
+ regs->ss = ss;
+ }
+ else
+ sel |= (regs->cs & 3);
+
+ regs->eip = off;
+ regs->cs = sel;
+#endif
+
+ return 0;
+}
+
asmlinkage int do_general_protection(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
@@ -1801,6 +2190,8 @@ asmlinkage int do_general_protection(str
return do_guest_trap(vector, regs, 0);
}
}
+ else if ( IS_COMPAT(v->domain) && regs->error_code )
+ return emulate_gate_op(regs);
/* Emulate some simple privileged and I/O instructions. */
if ( (regs->error_code == 0) &&
Index: 2007-01-08/xen/arch/x86/x86_64/mm.c
===================================================================
--- 2007-01-08.orig/xen/arch/x86/x86_64/mm.c 2007-01-25 16:01:22.000000000 +0100
+++ 2007-01-08/xen/arch/x86/x86_64/mm.c 2007-01-26 09:16:06.000000000 +0100
@@ -366,14 +366,16 @@ int check_descriptor(const struct domain
{
u32 a = d->a, b = d->b;
u16 cs;
+ unsigned int dpl;
/* A not-present descriptor will always fault, so is safe. */
if ( !(b & _SEGMENT_P) )
goto good;
/* Check and fix up the DPL. */
- if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL(dom) << 13) )
- d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL(dom) << 13);
+ dpl = (b >> 13) & 3;
+ __fixup_guest_selector(dom, dpl);
+ b = (b & ~_SEGMENT_DPL) | (dpl << 13);
/* All code and data segments are okay. No base/limit checking. */
if ( (b & _SEGMENT_S) )
@@ -391,18 +393,33 @@ int check_descriptor(const struct domain
if ( (b & _SEGMENT_TYPE) != 0xc00 )
goto bad;
- /* Validate and fix up the target code selector. */
+ /* Validate the target code selector. */
cs = a >> 16;
- fixup_guest_code_selector(dom, cs);
if ( !guest_gate_selector_okay(dom, cs) )
goto bad;
- a = d->a = (d->a & 0xffffU) | (cs << 16);
+#ifdef __x86_64__
+ /*
+ * Force DPL to zero, causing a GP fault with its error code indicating
+ * the gate in use, allowing emulation. This is necessary because with
+ * native guests (kernel in ring 3) call gates cannot be used directly
+ * to transition from user to kernel mode (and whether a gate is used
+ * to enter the kernel can only be determined when the gate is being
+ * used), and with compat guests call gates cannot be used at all as
+ * there are only 64-bit ones.
+ * Store the original DPL in the selector's RPL field.
+ */
+ b &= ~_SEGMENT_DPL;
+ cs = (cs & ~3) | dpl;
+#endif
+ a = (a & 0xffffU) | (cs << 16);
/* Reserved bits must be zero. */
- if ( (b & 0xe0) != 0 )
+ if ( b & (CONFIG_PAGING_LEVELS < 4 || IS_COMPAT(dom) ? 0xe0 : 0xff) )
goto bad;
good:
+ d->a = a;
+ d->b = b;
return 1;
bad:
return 0;