Jan Beulich
2007-Nov-15 11:30 UTC
[Xen-devel] [PATCH] i386: adjustment to segment fixup code
Since SLES9SP4 is unfortunately expected to ship with a glibc that does use negative offsets into the TLS area, I looked more closely at the fixup code in Xen. Besides lacking support for SIB addressing and many general purpose instructions, I realized that the non-atomic updates of descriptor table entries provide a hole (albeit very small) for attacks on the hypervisor address space: Accessing negative offsets from a descriptor set up to have a base of e.g. 0xF0000000 (and its limit therefore truncated by Xen) would result in the low 16 bits of the limit in the descriptor being updated first (thus bumping the limit to the 4Gb boundary), and setting the expand down bit only in a subsequent memory access. Thus, if a guest tried hard enough it might be possible to load the %gs selector register at exactly the right point to access this partially updated descriptor. The kernel of that guest would then, by avoiding a reload of the selector register, be able to access Xen''s private space. Even worse - since the code in xen/arch/x86/x86_32/seg_fixup.c doesn''t appear to be serialized in any way, under SMP there was a small potential for producing permanently corrupted descriptors. Remains one question: What does BIGLOCK protect in do_update_descriptor()? If all it''s about is to avoid racing updates, I would think that its use could be dropped with this patch applied. Signed-off-by: Jan Beulich <jbeulich@novell.com> Index: 2007-11-13/xen/arch/x86/mm.c ==================================================================--- 2007-11-13.orig/xen/arch/x86/mm.c 2007-11-13 15:50:56.000000000 +0100 +++ 2007-11-13/xen/arch/x86/mm.c 2007-11-13 15:18:35.000000000 +0100 @@ -3113,7 +3113,7 @@ long do_update_descriptor(u64 pa, u64 de /* All is good so make the update. */ gdt_pent = map_domain_page(mfn); - memcpy(&gdt_pent[offset], &d, 8); + write_guest_descriptor(gdt_pent + offset, d); unmap_domain_page(gdt_pent); put_page_type(page); Index: 2007-11-13/xen/arch/x86/x86_32/seg_fixup.c ==================================================================--- 2007-11-13.orig/xen/arch/x86/x86_32/seg_fixup.c 2007-11-13 15:50:56.000000000 +0100 +++ 2007-11-13/xen/arch/x86/x86_32/seg_fixup.c 2007-11-13 15:25:48.000000000 +0100 @@ -42,7 +42,7 @@ #define O OPCODE_BYTE #define M HAS_MODRM -static unsigned char insn_decode[256] = { +static const unsigned char insn_decode[256] = { /* 0x00 - 0x0F */ O|M, O|M, O|M, O|M, X, X, X, X, O|M, O|M, O|M, O|M, X, X, X, X, @@ -69,7 +69,7 @@ static unsigned char insn_decode[256] = X, X, X, X, X, X, X, X, /* 0x80 - 0x8F */ O|M|1, O|M|4, O|M|1, O|M|1, O|M, O|M, O|M, O|M, - O|M, O|M, O|M, O|M, O|M, O|M, O|M, X, + O|M, O|M, O|M, O|M, O|M, X|M, O|M, O|M, /* 0x90 - 0x9F */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, @@ -89,17 +89,17 @@ static unsigned char insn_decode[256] = X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0xF0 - 0xFF */ - X, X, X, X, X, X, X, X, + X, X, X, X, X, X, O|M, O|M, X, X, X, X, X, X, O|M, O|M }; -static unsigned char twobyte_decode[256] = { +static const unsigned char twobyte_decode[256] = { /* 0x00 - 0x0F */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x10 - 0x1F */ X, X, X, X, X, X, X, X, - X, X, X, X, X, X, X, X, + O|M, X, X, X, X, X, X, X, /* 0x20 - 0x2F */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, @@ -122,16 +122,16 @@ static unsigned char twobyte_decode[256] X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x90 - 0x9F */ - X, X, X, X, X, X, X, X, - X, X, X, X, X, X, X, X, + O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, + O|M, O|M, O|M, O|M, O|M, O|M, O|M, O|M, /* 0xA0 - 0xAF */ - X, X, X, X, X, X, X, X, - X, X, X, X, X, X, X, X, + X, X, X, O|M, O|M|1, O|M, O|M, X, + X, X, X, O|M, O|M|1, O|M, X, O|M, /* 0xB0 - 0xBF */ - X, X, X, X, X, X, X, X, - X, X, X, X, X, X, X, X, + X, X, X, O|M, X, X, O|M, O|M, + X, X, O|M|1, O|M, O|M, O|M, O|M, O|M, /* 0xC0 - 0xCF */ - X, X, X, X, X, X, X, X, + O|M, O|M, X, O|M, X, X, X, O|M, X, X, X, X, X, X, X, X, /* 0xD0 - 0xDF */ X, X, X, X, X, X, X, X, @@ -153,41 +153,40 @@ static unsigned char twobyte_decode[256] * @base (OUT): Decoded linear base address. * @limit (OUT): Decoded segment limit, in bytes. 0 == unlimited (4GB). */ -int get_baselimit(u16 seg, unsigned long *base, unsigned long *limit) +static int get_baselimit(u16 seg, unsigned long *base, unsigned long *limit) { - struct vcpu *d = current; - unsigned long *table, a, b; + struct vcpu *curr = current; + struct desc_struct *table, d; int ldt = !!(seg & 4); int idx = (seg >> 3) & 8191; /* Get base and check limit. */ if ( ldt ) { - table = (unsigned long *)LDT_VIRT_START(d); - if ( idx >= d->arch.guest_context.ldt_ents ) + table = (void *)LDT_VIRT_START(curr); + if ( idx >= curr->arch.guest_context.ldt_ents ) goto fail; } else /* gdt */ { - table = (unsigned long *)GDT_VIRT_START(d); - if ( idx >= d->arch.guest_context.gdt_ents ) + table = (void *)GDT_VIRT_START(curr); + if ( idx >= curr->arch.guest_context.gdt_ents ) goto fail; } /* Grab the segment descriptor. */ - if ( __get_user(a, &table[2*idx+0]) || - __get_user(b, &table[2*idx+1]) ) + if ( read_guest_descriptor(table + idx, &d) ) goto fail; /* Barking up the wrong tree. Decode needs a page fault.*/ /* We only parse 32-bit code and data segments. */ - if ( (b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB)) != + if ( (d.b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB)) ! (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB) ) goto fail; /* Decode base and limit. */ - *base = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16); - *limit = ((b & 0xf0000) | (a & 0x0ffff)) + 1; - if ( (b & _SEGMENT_G) ) + *base = (d.b&(0xff<<24)) | ((d.b&0xff)<<16) | (d.a>>16); + *limit = ((d.b & 0xf0000) | (d.a & 0x0ffff)) + 1; + if ( (d.b & _SEGMENT_G) ) *limit <<= 12; /* @@ -204,7 +203,7 @@ int get_baselimit(u16 seg, unsigned long } /* Turn a segment+offset into a linear address. */ -int linearise_address(u16 seg, unsigned long off, unsigned long *linear) +static int linearise_address(u16 seg, unsigned long off, unsigned long *linear) { unsigned long base, limit; @@ -219,57 +218,57 @@ int linearise_address(u16 seg, unsigned return 1; } -int fixup_seg(u16 seg, unsigned long offset) +static int fixup_seg(u16 seg, unsigned long offset) { - struct vcpu *d = current; - unsigned long *table, a, b, base, limit; + struct vcpu *curr = current; + struct desc_struct *table, d; + unsigned long base, limit; int ldt = !!(seg & 4); int idx = (seg >> 3) & 8191; /* Get base and check limit. */ if ( ldt ) { - table = (unsigned long *)LDT_VIRT_START(d); - if ( idx >= d->arch.guest_context.ldt_ents ) + table = (void *)LDT_VIRT_START(curr); + if ( idx >= curr->arch.guest_context.ldt_ents ) { dprintk(XENLOG_DEBUG, "Segment %04x out of LDT range (%ld)\n", - seg, d->arch.guest_context.ldt_ents); + seg, curr->arch.guest_context.ldt_ents); goto fail; } } else /* gdt */ { - table = (unsigned long *)GDT_VIRT_START(d); - if ( idx >= d->arch.guest_context.gdt_ents ) + table = (void *)GDT_VIRT_START(curr); + if ( idx >= curr->arch.guest_context.gdt_ents ) { dprintk(XENLOG_DEBUG, "Segment %04x out of GDT range (%ld)\n", - seg, d->arch.guest_context.gdt_ents); + seg, curr->arch.guest_context.gdt_ents); goto fail; } } /* Grab the segment descriptor. */ - if ( __get_user(a, &table[2*idx+0]) || - __get_user(b, &table[2*idx+1]) ) + if ( read_guest_descriptor(table + idx, &d) ) { dprintk(XENLOG_DEBUG, "Fault while reading segment %04x\n", seg); goto fail; /* Barking up the wrong tree. Decode needs a page fault.*/ } /* We only parse 32-bit page-granularity non-privileged data segments. */ - if ( (b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB| - _SEGMENT_G|_SEGMENT_CODE|_SEGMENT_DPL)) != + if ( (d.b & (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB| + _SEGMENT_G|_SEGMENT_CODE|_SEGMENT_DPL)) ! (_SEGMENT_P|_SEGMENT_S|_SEGMENT_DB|_SEGMENT_G|_SEGMENT_DPL) ) { - dprintk(XENLOG_DEBUG, "Bad segment %08lx:%08lx\n", a, b); + dprintk(XENLOG_DEBUG, "Bad segment %08x:%08x\n", d.a, d.b); goto fail; } /* Decode base and limit. */ - base = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16); - limit = (((b & 0xf0000) | (a & 0x0ffff)) + 1) << 12; + base = (d.b&(0xff<<24)) | ((d.b&0xff)<<16) | (d.a>>16); + limit = (((d.b & 0xf0000) | (d.a & 0x0ffff)) + 1) << 12; - if ( b & _SEGMENT_EC ) + if ( d.b & _SEGMENT_EC ) { /* Expands-down: All the way to zero? Assume 4GB if so. */ if ( ((base + limit) < PAGE_SIZE) && (offset <= limit) ) @@ -292,20 +291,19 @@ int fixup_seg(u16 seg, unsigned long off } dprintk(XENLOG_DEBUG, "None of the above! " - "(%08lx:%08lx, %08lx, %08lx, %08lx)\n", - a, b, base, limit, base+limit); + "(%08x:%08x, %08lx, %08lx, %08lx)\n", + d.a, d.b, base, limit, base+limit); fail: return 0; flip: limit = (limit >> 12) - 1; - a &= ~0x0ffff; a |= limit & 0x0ffff; - b &= ~0xf0000; b |= limit & 0xf0000; - b ^= _SEGMENT_EC; /* grows-up <-> grows-down */ - /* NB. These can''t fault. Checked readable above; must also be writable. */ - table[2*idx+0] = a; - table[2*idx+1] = b; + d.a &= ~0x0ffff; d.a |= limit & 0x0ffff; + d.b &= ~0xf0000; d.b |= limit & 0xf0000; + d.b ^= _SEGMENT_EC; /* grows-up <-> grows-down */ + /* NB. This can''t fault. Checked readable above; must also be writable. */ + write_guest_descriptor(table + idx, d); return 1; } @@ -315,18 +313,15 @@ int fixup_seg(u16 seg, unsigned long off */ int gpf_emulate_4gb(struct cpu_user_regs *regs) { - struct vcpu *d = current; - struct trap_info *ti; - struct trap_bounce *tb; - u8 modrm, mod, reg, rm, decode; - void *memreg; - unsigned long offset; - u8 disp8; - u32 disp32 = 0; + struct vcpu *curr = current; + u8 modrm, mod, rm, decode; + const u32 *base, *index = NULL; + unsigned long offset; + s8 disp8; + s32 disp32 = 0; u8 *eip; /* ptr to instruction start */ u8 *pb, b; /* ptr into instr. / current instr. byte */ - int gs_override = 0; - int twobyte = 0; + int gs_override = 0, scale = 0, twobyte = 0; /* WARNING: We only work for ring-3 segments. */ if ( unlikely(vm86_mode(regs)) || unlikely(!ring_3(regs)) ) @@ -357,6 +352,9 @@ int gpf_emulate_4gb(struct cpu_user_regs goto fail; } + if ( twobyte ) + break; + switch ( b ) { case 0x67: /* Address-size override */ @@ -375,6 +373,9 @@ int gpf_emulate_4gb(struct cpu_user_regs case 0x65: /* GS override */ gs_override = 1; break; + case 0x0f: /* Not really a prefix byte */ + twobyte = 1; + break; default: /* Not a prefix byte */ goto done_prefix; } @@ -387,32 +388,10 @@ int gpf_emulate_4gb(struct cpu_user_regs goto fail; } - decode = insn_decode[b]; /* opcode byte */ + decode = (!twobyte ? insn_decode : twobyte_decode)[b]; pb++; - if ( decode == 0 && b == 0x0f ) - { - twobyte = 1; - if ( get_user(b, pb) ) - { - dprintk(XENLOG_DEBUG, - "Fault while accessing byte %ld of instruction\n", - (long)(pb-eip)); - goto page_fault; - } - - if ( (pb - eip) >= 15 ) - { - dprintk(XENLOG_DEBUG, "Too many opcode bytes for a " - "legal instruction\n"); - goto fail; - } - - decode = twobyte_decode[b]; - pb++; - } - - if ( decode == 0 ) + if ( !(decode & OPCODE_BYTE) ) { dprintk(XENLOG_DEBUG, "Unsupported %sopcode %02x\n", twobyte ? "two byte " : "", b); @@ -422,12 +401,12 @@ int gpf_emulate_4gb(struct cpu_user_regs if ( !(decode & HAS_MODRM) ) { /* Must be a <disp32>, or bail. */ - if ( (decode & 7) != 4 ) + if ( (decode & INSN_SUFFIX_BYTES) != 4 ) goto fail; if ( get_user(offset, (u32 *)pb) ) { - dprintk(XENLOG_DEBUG, "Fault while extracting <disp32>.\n"); + dprintk(XENLOG_DEBUG, "Fault while extracting <moffs32>.\n"); goto page_fault; } pb += 4; @@ -448,29 +427,39 @@ int gpf_emulate_4gb(struct cpu_user_regs pb++; mod = (modrm >> 6) & 3; - reg = (modrm >> 3) & 7; rm = (modrm >> 0) & 7; if ( rm == 4 ) { - dprintk(XENLOG_DEBUG, "FIXME: Add decoding for the SIB byte.\n"); - goto fixme; + u8 sib; + + if ( get_user(sib, pb) ) + { + dprintk(XENLOG_DEBUG, "Fault while extracting sib byte\n"); + goto page_fault; + } + + pb++; + + rm = sib & 7; + if ( (sib & 0x38) != 0x20 ) + index = decode_register((sib >> 3) & 7, regs, 0); + scale = sib >> 6; } /* Decode R/M field. */ - memreg = decode_register(rm, regs, 0); + base = decode_register(rm, regs, 0); /* Decode Mod field. */ - switch ( modrm >> 6 ) + switch ( mod ) { case 0: - disp32 = 0; if ( rm == 5 ) /* disp32 rather than (EBP) */ { - memreg = NULL; + base = NULL; if ( get_user(disp32, (u32 *)pb) ) { - dprintk(XENLOG_DEBUG, "Fault while extracting <disp8>.\n"); + dprintk(XENLOG_DEBUG, "Fault while extracting <base32>.\n"); goto page_fault; } pb += 4; @@ -484,13 +473,13 @@ int gpf_emulate_4gb(struct cpu_user_regs goto page_fault; } pb++; - disp32 = (disp8 & 0x80) ? (disp8 | ~0xff) : disp8;; + disp32 = disp8; break; case 2: if ( get_user(disp32, (u32 *)pb) ) { - dprintk(XENLOG_DEBUG, "Fault while extracting <disp8>.\n"); + dprintk(XENLOG_DEBUG, "Fault while extracting <disp32>.\n"); goto page_fault; } pb += 4; @@ -502,8 +491,10 @@ int gpf_emulate_4gb(struct cpu_user_regs } offset = disp32; - if ( memreg != NULL ) - offset += *(u32 *)memreg; + if ( base != NULL ) + offset += *base; + if ( index != NULL ) + offset += *index << scale; skip_modrm: if ( !fixup_seg((u16)regs->gs, offset) ) @@ -513,10 +504,11 @@ int gpf_emulate_4gb(struct cpu_user_regs perfc_incr(seg_fixups); /* If requested, give a callback on otherwise unused vector 15. */ - if ( VM_ASSIST(d->domain, VMASST_TYPE_4gb_segments_notify) ) + if ( VM_ASSIST(curr->domain, VMASST_TYPE_4gb_segments_notify) ) { - ti = &d->arch.guest_context.trap_ctxt[15]; - tb = &d->arch.trap_bounce; + struct trap_info *ti = &curr->arch.guest_context.trap_ctxt[15]; + struct trap_bounce *tb = &curr->arch.trap_bounce; + tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE; tb->error_code = pb - eip; tb->cs = ti->cs; @@ -527,13 +519,6 @@ int gpf_emulate_4gb(struct cpu_user_regs return EXCRET_fault_fixed; - fixme: - dprintk(XENLOG_DEBUG, "Undecodable instruction " - "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x " - "caused GPF(0) at %04x:%08x\n", - eip[0], eip[1], eip[2], eip[3], - eip[4], eip[5], eip[6], eip[7], - regs->cs, regs->eip); fail: return 0; Index: 2007-11-13/xen/arch/x86/x86_emulate.c ==================================================================--- 2007-11-13.orig/xen/arch/x86/x86_emulate.c 2007-11-13 15:50:56.000000000 +0100 +++ 2007-11-13/xen/arch/x86/x86_emulate.c 2007-11-13 15:51:41.000000000 +0100 @@ -2263,7 +2263,7 @@ x86_emulate( case 0x09: /* wbinvd */ generate_exception_if(!mode_ring0(), EXC_GP); fail_if(ops->wbinvd == NULL); - if ( (rc = ops->wbinvd(ctxt)) != 0 ) + if ( (rc = (ops->wbinvd)(ctxt)) != 0 ) goto done; break; Index: 2007-11-13/xen/include/asm-x86/desc.h ==================================================================--- 2007-11-13.orig/xen/include/asm-x86/desc.h 2007-11-13 15:50:56.000000000 +0100 +++ 2007-11-13/xen/include/asm-x86/desc.h 2007-11-13 15:18:35.000000000 +0100 @@ -137,6 +137,11 @@ struct desc_struct { #if defined(__x86_64__) +#include <asm/uaccess.h> + +/* Use __put_user() just to ensure the compiler doesn''t split the write. */ +#define write_guest_descriptor(pent, d) ((void)__put_user(d, pent)) + typedef struct { u64 a, b; } idt_entry_t; @@ -168,6 +173,29 @@ do { #elif defined(__i386__) +#include <asm/system.h> + +static inline int read_guest_descriptor(struct desc_struct *pent, + struct desc_struct *pd) +{ + union { u64 raw; struct desc_struct d; } val = { 0 }; + int rc = cmpxchg_user(pent, val.raw, val.raw); + + *pd = val.d; + return rc; +} + +static inline void write_guest_descriptor(struct desc_struct *pent, + struct desc_struct d) +{ + u64 cur = ((u64)pent->b << 32) | pent->a, old; + + do + { + old = cmpxchg((u64 *)pent, cur, ((u64)d.b << 32) | d.a); + } while ( old ^ cur ); +} + typedef struct desc_struct idt_entry_t; #define _set_gate(gate_addr,type,dpl,addr) \ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel