This is now the full set of changes needed to eliminate the assumption that segments in protected mode always have zero base addresses. At once, this further simplifies the instruction length determination code used for MMIO of HVM domains. Functionality tested on 3.0.3, compile tested on -unstable. Signed-off-by: Jan Beulich <jbeulich@novell.com> Index: 2006-11-27/xen/arch/x86/hvm/instrlen.c ==================================================================--- 2006-11-27.orig/xen/arch/x86/hvm/instrlen.c 2006-11-28 09:22:06.000000000 +0100 +++ 2006-11-27/xen/arch/x86/hvm/instrlen.c 2006-11-28 09:24:15.000000000 +0100 @@ -20,7 +20,6 @@ #include <xen/config.h> #include <xen/sched.h> #include <xen/mm.h> -#include <asm/regs.h> #include <asm-x86/x86_emulate.h> /* read from guest memory */ @@ -121,9 +120,7 @@ static uint8_t opcode_table[256] = { ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, ByteOp|ImplicitOps, ImplicitOps, /* 0xB0 - 0xBF */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0 - 0xC7 */ ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM, 0, 0, 0, 0, ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImm|ModRM, @@ -195,54 +192,52 @@ static uint8_t twobyte_table[256] = { }; /* - * insn_fetch - fetch the next 1 to 4 bytes from instruction stream - * @_type: u8, u16, u32, s8, s16, or s32 - * @_size: 1, 2, or 4 bytes + * insn_fetch - fetch the next byte from instruction stream */ -#define insn_fetch(_type, _size) \ -({ unsigned long _x, _ptr = _regs.eip; \ - if ( mode == X86EMUL_MODE_REAL ) _ptr += _regs.cs << 4; \ - rc = inst_copy_from_guest((unsigned char *)(&(_x)), _ptr, _size); \ - if ( rc != _size ) goto done; \ - _regs.eip += (_size); \ - length += (_size); \ - (_type)_x; \ +#define insn_fetch() \ +({ uint8_t _x; \ + if ( length >= 15 ) \ + return -1; \ + if ( inst_copy_from_guest(&_x, pc, 1) != 1 ) { \ + gdprintk(XENLOG_WARNING, \ + "Cannot 
read from address %lx (eip %lx, mode %d)\n", \ + pc, org_pc, mode); \ + return -1; \ + } \ + pc += 1; \ + length += 1; \ + _x; \ }) /** * hvm_instruction_length - returns the current instructions length * - * @regs: guest register state + * @org_pc: guest instruction pointer * @mode: guest operating mode * * EXTERNAL this routine calculates the length of the current instruction - * pointed to by eip. The guest state is _not_ changed by this routine. + * pointed to by org_pc. The guest state is _not_ changed by this routine. */ -int hvm_instruction_length(struct cpu_user_regs *regs, int mode) +int hvm_instruction_length(unsigned long org_pc, int mode) { - uint8_t b, d, twobyte = 0, rex_prefix = 0; - uint8_t modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; - unsigned int op_bytes, ad_bytes, i; - int rc = 0; + uint8_t b, d, twobyte = 0, rex_prefix = 0, modrm_reg = 0; + unsigned int op_default, op_bytes, ad_default, ad_bytes, tmp; int length = 0; - unsigned int tmp; - - /* Shadow copy of register state. Committed on successful emulation. */ - struct cpu_user_regs _regs = *regs; + unsigned long pc = org_pc; switch ( mode ) { case X86EMUL_MODE_REAL: case X86EMUL_MODE_PROT16: - op_bytes = ad_bytes = 2; + op_bytes = op_default = ad_bytes = ad_default = 2; break; case X86EMUL_MODE_PROT32: - op_bytes = ad_bytes = 4; + op_bytes = op_default = ad_bytes = ad_default = 4; break; #ifdef __x86_64__ case X86EMUL_MODE_PROT64: - op_bytes = 4; - ad_bytes = 8; + op_bytes = op_default = 4; + ad_bytes = ad_default = 8; break; #endif default: @@ -250,18 +245,18 @@ int hvm_instruction_length(struct cpu_us } /* Legacy prefixes. 
*/ - for ( i = 0; i < 8; i++ ) + for ( ; ; ) { - switch ( b = insn_fetch(uint8_t, 1) ) + switch ( b = insn_fetch() ) { case 0x66: /* operand-size override */ - op_bytes ^= 6; /* switch between 2/4 bytes */ + op_bytes = op_default ^ 6; /* switch between 2/4 bytes */ break; case 0x67: /* address-size override */ if ( mode == X86EMUL_MODE_PROT64 ) - ad_bytes ^= 12; /* switch between 4/8 bytes */ + ad_bytes = ad_default ^ 12; /* switch between 4/8 bytes */ else - ad_bytes ^= 6; /* switch between 2/4 bytes */ + ad_bytes = ad_default ^ 6; /* switch between 2/4 bytes */ break; case 0x2e: /* CS override */ case 0x3e: /* DS override */ @@ -273,22 +268,26 @@ int hvm_instruction_length(struct cpu_us case 0xf3: /* REP/REPE/REPZ */ case 0xf2: /* REPNE/REPNZ */ break; +#ifdef __x86_64__ + case 0x40 ... 0x4f: + if ( mode == X86EMUL_MODE_PROT64 ) + { + rex_prefix = b; + continue; + } + /* FALLTHRU */ +#endif default: goto done_prefixes; } + rex_prefix = 0; } done_prefixes: /* REX prefix. */ - if ( (mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40) ) - { - rex_prefix = b; - if ( b & 8 ) - op_bytes = 8; /* REX.W */ - modrm_reg = (b & 4) << 1; /* REX.R */ - /* REX.B and REX.X do not need to be decoded. */ - b = insn_fetch(uint8_t, 1); - } + if ( rex_prefix & 8 ) + op_bytes = 8; /* REX.W */ + /* REX.B, REX.R, and REX.X do not need to be decoded. */ /* Opcode byte(s). */ d = opcode_table[b]; @@ -298,7 +297,7 @@ done_prefixes: if ( b == 0x0f ) { twobyte = 1; - b = insn_fetch(uint8_t, 1); + b = insn_fetch(); d = twobyte_table[b]; } @@ -310,11 +309,11 @@ done_prefixes: /* ModRM and SIB bytes. 
*/ if ( d & ModRM ) { - modrm = insn_fetch(uint8_t, 1); - modrm_mod |= (modrm & 0xc0) >> 6; - modrm_reg |= (modrm & 0x38) >> 3; - modrm_rm |= (modrm & 0x07); + uint8_t modrm = insn_fetch(); + uint8_t modrm_mod = (modrm & 0xc0) >> 6; + uint8_t modrm_rm = (modrm & 0x07); + modrm_reg = (modrm & 0x38) >> 3; if ( modrm_mod == 3 ) { gdprintk(XENLOG_WARNING, "Cannot parse ModRM.mod == 3.\n"); @@ -330,16 +329,16 @@ done_prefixes: if ( modrm_rm == 6 ) { length += 2; - _regs.eip += 2; /* skip disp16 */ + pc += 2; /* skip disp16 */ } break; case 1: length += 1; - _regs.eip += 1; /* skip disp8 */ + pc += 1; /* skip disp8 */ break; case 2: length += 2; - _regs.eip += 2; /* skip disp16 */ + pc += 2; /* skip disp16 */ break; } } @@ -350,33 +349,34 @@ done_prefixes: { case 0: if ( (modrm_rm == 4) && - (((insn_fetch(uint8_t, 1)) & 7) - == 5) ) + ((insn_fetch() & 7) == 5) ) { length += 4; - _regs.eip += 4; /* skip disp32 specified by SIB.base */ + pc += 4; /* skip disp32 specified by SIB.base */ } else if ( modrm_rm == 5 ) { length += 4; - _regs.eip += 4; /* skip disp32 */ + pc += 4; /* skip disp32 */ } break; case 1: if ( modrm_rm == 4 ) { - insn_fetch(uint8_t, 1); + length += 1; + pc += 1; } length += 1; - _regs.eip += 1; /* skip disp8 */ + pc += 1; /* skip disp8 */ break; case 2: if ( modrm_rm == 4 ) { - insn_fetch(uint8_t, 1); + length += 1; + pc += 1; } length += 4; - _regs.eip += 4; /* skip disp32 */ + pc += 4; /* skip disp32 */ break; } } @@ -397,15 +397,12 @@ done_prefixes: tmp = (d & ByteOp) ? 1 : op_bytes; if ( tmp == 8 ) tmp = 4; /* NB. Immediates are sign-extended as necessary. */ - switch ( tmp ) - { - case 1: insn_fetch(int8_t, 1); break; - case 2: insn_fetch(int16_t, 2); break; - case 4: insn_fetch(int32_t, 4); break; - } + length += tmp; + pc += tmp; break; case SrcImmByte: - insn_fetch(int8_t, 1); + length += 1; + pc += 1; break; } @@ -414,13 +411,9 @@ done_prefixes: switch ( b ) { - case 0xa0 ... 
0xa1: /* mov */ - length += ad_bytes; - _regs.eip += ad_bytes; /* skip src displacement */ - break; - case 0xa2 ... 0xa3: /* mov */ + case 0xa0 ... 0xa3: /* mov */ length += ad_bytes; - _regs.eip += ad_bytes; /* skip dst displacement */ + pc += ad_bytes; /* skip src/dst displacement */ break; case 0xf6 ... 0xf7: /* Grp3 */ switch ( modrm_reg ) @@ -429,23 +422,19 @@ done_prefixes: /* Special case in Grp3: test has an immediate source operand. */ tmp = (d & ByteOp) ? 1 : op_bytes; if ( tmp == 8 ) tmp = 4; - switch ( tmp ) - { - case 1: insn_fetch(int8_t, 1); break; - case 2: insn_fetch(int16_t, 2); break; - case 4: insn_fetch(int32_t, 4); break; - } - goto done; + length += tmp; + pc += tmp; + break; } break; } done: - return length; + return length < 16 ? length : -1; cannot_emulate: gdprintk(XENLOG_WARNING, - "Cannot emulate %02x at address %lx (eip %lx, mode %d)\n", - b, (unsigned long)_regs.eip, (unsigned long)regs->eip, mode); + "Cannot emulate %02x at address %lx (%lx, mode %d)\n", + b, pc - 1, org_pc, mode); return -1; } Index: 2006-11-27/xen/arch/x86/hvm/platform.c ==================================================================--- 2006-11-27.orig/xen/arch/x86/hvm/platform.c 2006-11-28 09:22:06.000000000 +0100 +++ 2006-11-27/xen/arch/x86/hvm/platform.c 2006-11-28 09:39:05.000000000 +0100 @@ -28,6 +28,7 @@ #include <xen/trace.h> #include <xen/sched.h> #include <asm/regs.h> +#include <asm/x86_emulate.h> #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> #include <asm/hvm/io.h> @@ -168,13 +169,15 @@ long get_reg_value(int size, int index, static inline unsigned char *check_prefix(unsigned char *inst, struct hvm_io_op *mmio_op, + unsigned char *ad_size, unsigned char *op_size, + unsigned char *seg_sel, unsigned char *rex_p) { while ( 1 ) { switch ( *inst ) { /* rex prefix for em64t instructions */ - case 0x40 ... 0x4e: + case 0x40 ... 
0x4f: *rex_p = *inst; break; case 0xf3: /* REPZ */ @@ -191,12 +194,13 @@ static inline unsigned char *check_prefi case 0x26: /* ES */ case 0x64: /* FS */ case 0x65: /* GS */ - //mmio_op->seg_sel = *inst; + *seg_sel = *inst; break; case 0x66: /* 32bit->16bit */ *op_size = WORD; break; case 0x67: + *ad_size = WORD; break; default: return inst; @@ -205,7 +209,7 @@ static inline unsigned char *check_prefi } } -static inline unsigned long get_immediate(int op16, const unsigned char *inst, int op_size) +static inline unsigned long get_immediate(int ad_size, const unsigned char *inst, int op_size) { int mod, reg, rm; unsigned long val = 0; @@ -216,16 +220,19 @@ static inline unsigned long get_immediat rm = *inst & 7; inst++; //skip ModR/M byte - if ( mod != 3 && rm == 4 ) { + if ( ad_size != WORD && mod != 3 && rm == 4 ) { + rm = *inst & 7; inst++; //skip SIB byte } switch ( mod ) { case 0: - if ( rm == 5 || rm == 4 ) { - if ( op16 ) + if ( ad_size == WORD ) { + if ( rm == 6 ) inst = inst + 2; //disp16, skip 2 bytes - else + } + else { + if ( rm == 5 ) inst = inst + 4; //disp32, skip 4 bytes } break; @@ -233,7 +240,7 @@ static inline unsigned long get_immediat inst++; //disp8, skip 1 byte break; case 2: - if ( op16 ) + if ( ad_size == WORD ) inst = inst + 2; //disp16, skip 2 bytes else inst = inst + 4; //disp32, skip 4 bytes @@ -276,7 +283,6 @@ static void init_instruction(struct hvm_ mmio_op->instr = 0; mmio_op->flags = 0; - //mmio_op->seg_sel = 0; mmio_op->operand[0] = 0; mmio_op->operand[1] = 0; @@ -346,25 +352,52 @@ static int reg_mem(unsigned char size, u return DECODE_success; } -static int mmio_decode(int realmode, unsigned char *opcode, - struct hvm_io_op *mmio_op, unsigned char *op_size) +static int mmio_decode(int mode, unsigned char *opcode, + struct hvm_io_op *mmio_op, + unsigned char *ad_size, unsigned char *op_size, + unsigned char *seg_sel) { unsigned char size_reg = 0; unsigned char rex = 0; int index; + *ad_size = 0; *op_size = 0; + *seg_sel = 0; 
init_instruction(mmio_op); - opcode = check_prefix(opcode, mmio_op, op_size, &rex); + opcode = check_prefix(opcode, mmio_op, ad_size, op_size, seg_sel, &rex); - if ( realmode ) { /* meaning is reversed */ + switch ( mode ) { + case X86EMUL_MODE_REAL: /* meaning is reversed */ + case X86EMUL_MODE_PROT16: if ( *op_size == WORD ) *op_size = LONG; else if ( *op_size == LONG ) *op_size = WORD; else if ( *op_size == 0 ) *op_size = WORD; + if ( *ad_size == WORD ) + *ad_size = LONG; + else if ( *ad_size == LONG ) + *ad_size = WORD; + else if ( *ad_size == 0 ) + *ad_size = WORD; + break; + case X86EMUL_MODE_PROT32: + if ( *op_size == 0 ) + *op_size = LONG; + if ( *ad_size == 0 ) + *ad_size = LONG; + break; +#ifdef __x86_64__ + case X86EMUL_MODE_PROT64: + if ( *op_size == 0 ) + *op_size = rex & 0x8 ? QUAD : LONG; + if ( *ad_size == 0 ) + *ad_size = QUAD; + break; +#endif } /* the operands order in comments conforms to AT&T convention */ @@ -471,10 +504,10 @@ static int mmio_decode(int realmode, uns /* opcode 0x83 always has a single byte operand */ if ( opcode[0] == 0x83 ) mmio_op->immediate - (signed char)get_immediate(realmode, opcode + 1, BYTE); + (signed char)get_immediate(*ad_size, opcode + 1, BYTE); else mmio_op->immediate - get_immediate(realmode, opcode + 1, *op_size); + get_immediate(*ad_size, opcode + 1, *op_size); mmio_op->operand[0] = mk_operand(size_reg, 0, 0, IMMEDIATE); mmio_op->operand[1] = mk_operand(size_reg, 0, 0, MEMORY); @@ -598,7 +631,7 @@ static int mmio_decode(int realmode, uns mmio_op->operand[0] = mk_operand(*op_size, 0, 0, IMMEDIATE); mmio_op->immediate - get_immediate(realmode, opcode + 1, *op_size); + get_immediate(*ad_size, opcode + 1, *op_size); mmio_op->operand[1] = mk_operand(*op_size, 0, 0, MEMORY); return DECODE_success; @@ -612,7 +645,7 @@ static int mmio_decode(int realmode, uns mmio_op->operand[0] = mk_operand(*op_size, 0, 0, IMMEDIATE); mmio_op->immediate - get_immediate(realmode, opcode + 1, *op_size); + get_immediate(*ad_size, opcode 
+ 1, *op_size); mmio_op->operand[1] = mk_operand(*op_size, 0, 0, MEMORY); return DECODE_success; @@ -634,7 +667,7 @@ static int mmio_decode(int realmode, uns mmio_op->operand[0] = mk_operand(size_reg, 0, 0, IMMEDIATE); mmio_op->immediate - get_immediate(realmode, opcode + 1, *op_size); + get_immediate(*ad_size, opcode + 1, *op_size); mmio_op->operand[1] = mk_operand(size_reg, 0, 0, MEMORY); return DECODE_success; @@ -697,7 +730,7 @@ static int mmio_decode(int realmode, uns GET_OP_SIZE_FOR_NONEBYTE(*op_size); mmio_op->operand[0] = mk_operand(BYTE, 0, 0, IMMEDIATE); mmio_op->immediate - (signed char)get_immediate(realmode, opcode + 1, BYTE); + (signed char)get_immediate(*ad_size, opcode + 1, BYTE); mmio_op->operand[1] = mk_operand(*op_size, 0, 0, MEMORY); return DECODE_success; } @@ -866,15 +899,15 @@ static void mmio_operands(int type, unsi } #define GET_REPEAT_COUNT() \ - (mmio_op->flags & REPZ ? (realmode ? regs->ecx & 0xFFFF : regs->ecx) : 1) + (mmio_op->flags & REPZ ? (ad_size == WORD ? regs->ecx & 0xFFFF : regs->ecx) : 1) void handle_mmio(unsigned long gpa) { unsigned long inst_addr; struct hvm_io_op *mmio_op; struct cpu_user_regs *regs; - unsigned char inst[MAX_INST_LEN], op_size; - int i, realmode, df, inst_len; + unsigned char inst[MAX_INST_LEN], ad_size, op_size, seg_sel; + int i, mode, df, inst_len; struct vcpu *v = current; mmio_op = &v->arch.hvm_vcpu.io_op; @@ -886,27 +919,23 @@ void handle_mmio(unsigned long gpa) df = regs->eflags & X86_EFLAGS_DF ? 
1 : 0; - inst_len = hvm_instruction_length(regs, hvm_guest_x86_mode(v)); + mode = hvm_guest_x86_mode(v); + inst_addr = hvm_get_segment_base(v, seg_cs) + regs->eip; + inst_len = hvm_instruction_length(inst_addr, mode); if ( inst_len <= 0 ) { printk("handle_mmio: failed to get instruction length\n"); domain_crash_synchronous(); } - realmode = hvm_realmode(v); - if ( realmode ) - inst_addr = regs->cs << 4; - else - inst_addr = hvm_get_segment_base(current, seg_cs); - inst_addr += regs->eip; - memset(inst, 0, MAX_INST_LEN); if ( inst_copy_from_guest(inst, inst_addr, inst_len) != inst_len ) { printk("handle_mmio: failed to copy instruction\n"); domain_crash_synchronous(); } - if ( mmio_decode(realmode, inst, mmio_op, &op_size) == DECODE_failure ) { + if ( mmio_decode(mode, inst, mmio_op, &ad_size, &op_size, &seg_sel) + == DECODE_failure ) { printk("handle_mmio: failed to decode instruction\n"); printk("mmio opcode: gpa 0x%lx, len %d:", gpa, inst_len); for ( i = 0; i < inst_len; i++ ) @@ -926,29 +955,39 @@ void handle_mmio(unsigned long gpa) { unsigned long count = GET_REPEAT_COUNT(); int sign = regs->eflags & X86_EFLAGS_DF ? 
-1 : 1; - unsigned long addr = 0; + unsigned long addr; int dir, size = op_size; ASSERT(count); /* determine non-MMIO address */ - if ( realmode ) { - if ( ((regs->es << 4) + (regs->edi & 0xFFFF)) == gpa ) { - dir = IOREQ_WRITE; - addr = (regs->ds << 4) + (regs->esi & 0xFFFF); - } else { - dir = IOREQ_READ; - addr = (regs->es << 4) + (regs->edi & 0xFFFF); - } - } else { - if ( gpa == regs->edi ) { - dir = IOREQ_WRITE; - addr = regs->esi; - } else { - dir = IOREQ_READ; - addr = regs->edi; + addr = regs->edi; + if ( ad_size == WORD ) + addr &= 0xFFFF; + addr += hvm_get_segment_base(v, seg_es); + if ( addr == gpa ) + { + enum segment seg; + + dir = IOREQ_WRITE; + addr = regs->esi; + if ( ad_size == WORD ) + addr &= 0xFFFF; + switch ( seg_sel ) + { + case 0x26: seg = seg_es; break; + case 0x2e: seg = seg_cs; break; + case 0x36: seg = seg_ss; break; + case 0: + case 0x3e: seg = seg_ds; break; + case 0x64: seg = seg_fs; break; + case 0x65: seg = seg_gs; break; + default: domain_crash_synchronous(); } + addr += hvm_get_segment_base(v, seg); } + else + dir = IOREQ_READ; if ( addr & (size - 1) ) gdprintk(XENLOG_WARNING, Index: 2006-11-27/xen/arch/x86/hvm/svm/svm.c ==================================================================--- 2006-11-27.orig/xen/arch/x86/hvm/svm/svm.c 2006-11-28 09:22:06.000000000 +0100 +++ 2006-11-27/xen/arch/x86/hvm/svm/svm.c 2006-11-28 09:43:47.000000000 +0100 @@ -511,18 +511,24 @@ unsigned long svm_get_ctrl_reg(struct vc static unsigned long svm_get_segment_base(struct vcpu *v, enum segment seg) { + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + int long_mode = 0; + +#ifdef __x86_64__ + long_mode = vmcb->cs.attributes.fields.l && (vmcb->efer & EFER_LMA); +#endif switch ( seg ) { - case seg_cs: return v->arch.hvm_svm.vmcb->cs.base; - case seg_ds: return v->arch.hvm_svm.vmcb->ds.base; - case seg_es: return v->arch.hvm_svm.vmcb->es.base; - case seg_fs: return v->arch.hvm_svm.vmcb->fs.base; - case seg_gs: return v->arch.hvm_svm.vmcb->gs.base; - 
case seg_ss: return v->arch.hvm_svm.vmcb->ss.base; - case seg_tr: return v->arch.hvm_svm.vmcb->tr.base; - case seg_gdtr: return v->arch.hvm_svm.vmcb->gdtr.base; - case seg_idtr: return v->arch.hvm_svm.vmcb->idtr.base; - case seg_ldtr: return v->arch.hvm_svm.vmcb->ldtr.base; + case seg_cs: return long_mode ? 0 : vmcb->cs.base; + case seg_ds: return long_mode ? 0 : vmcb->ds.base; + case seg_es: return long_mode ? 0 : vmcb->es.base; + case seg_fs: return vmcb->fs.base; + case seg_gs: return vmcb->gs.base; + case seg_ss: return long_mode ? 0 : vmcb->ss.base; + case seg_tr: return vmcb->tr.base; + case seg_gdtr: return vmcb->gdtr.base; + case seg_idtr: return vmcb->idtr.base; + case seg_ldtr: return vmcb->ldtr.base; } BUG(); return 0; @@ -832,7 +838,6 @@ int start_svm(void) hvm_funcs.store_cpu_guest_regs = svm_store_cpu_guest_regs; hvm_funcs.load_cpu_guest_regs = svm_load_cpu_guest_regs; - hvm_funcs.realmode = svm_realmode; hvm_funcs.paging_enabled = svm_paging_enabled; hvm_funcs.long_mode_enabled = svm_long_mode_enabled; hvm_funcs.pae_enabled = svm_pae_enabled; @@ -925,7 +930,7 @@ static void svm_do_general_protection_fa printk("Huh? 
We got a GP Fault with an invalid IDTR!\n"); svm_dump_vmcb(__func__, vmcb); svm_dump_regs(__func__, regs); - svm_dump_inst(vmcb->rip); + svm_dump_inst(svm_rip2pointer(vmcb)); domain_crash(v->domain); return; } @@ -1223,22 +1228,21 @@ static void svm_get_prefix_info( /* Get the address of INS/OUTS instruction */ static inline int svm_get_io_address( - struct vcpu *v, - struct cpu_user_regs *regs, unsigned int dir, + struct vcpu *v, struct cpu_user_regs *regs, + unsigned int size, ioio_info_t info, unsigned long *count, unsigned long *addr) { unsigned long reg; unsigned int asize = 0; unsigned int isize; - int long_mode; - ioio_info_t info; + int long_mode = 0; segment_selector_t *seg = NULL; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - info.bytes = vmcb->exitinfo1; - +#ifdef __x86_64__ /* If we''re in long mode, we shouldn''t check the segment presence & limit */ long_mode = vmcb->cs.attributes.fields.l && vmcb->efer & EFER_LMA; +#endif /* d field of cs.attributes is 1 for 32-bit, 0 for 16 or 64 bit. * l field combined with EFER_LMA -> longmode says whether it''s 16 or 64 bit. @@ -1256,11 +1260,9 @@ static inline int svm_get_io_address( isize --; if (isize > 1) - svm_get_prefix_info(vmcb, dir, &seg, &asize); + svm_get_prefix_info(vmcb, info.fields.type, &seg, &asize); - ASSERT(dir == IOREQ_READ || dir == IOREQ_WRITE); - - if (dir == IOREQ_WRITE) + if (info.fields.type == IOREQ_WRITE) { reg = regs->esi; if (!seg) /* If no prefix, used DS. */ @@ -1289,6 +1291,8 @@ static inline int svm_get_io_address( *addr = reg; *count = regs->ecx; } + if (!info.fields.rep) + *count = 1; if (!long_mode) { if (*addr > seg->limit) @@ -1301,7 +1305,8 @@ static inline int svm_get_io_address( *addr += seg->base; } } - + else if (seg == &vmcb->fs || seg == &vmcb->gs) + *addr += seg->base; return 1; } @@ -1351,7 +1356,7 @@ static void svm_io_instruction(struct vc unsigned long addr, count; int sign = regs->eflags & X86_EFLAGS_DF ? 
-1 : 1; - if (!svm_get_io_address(v, regs, dir, &count, &addr)) + if (!svm_get_io_address(v, regs, size, info, &count, &addr)) { /* We failed to get a valid address, so don''t do the IO operation - * it would just get worse if we do! Hopefully the guest is handing @@ -1365,10 +1370,6 @@ static void svm_io_instruction(struct vc { pio_opp->flags |= REPZ; } - else - { - count = 1; - } /* * Handle string pio instructions that cross pages or that Index: 2006-11-27/xen/arch/x86/hvm/vmx/vmx.c ==================================================================--- 2006-11-27.orig/xen/arch/x86/hvm/vmx/vmx.c 2006-11-28 09:22:06.000000000 +0100 +++ 2006-11-27/xen/arch/x86/hvm/vmx/vmx.c 2006-11-28 10:34:07.000000000 +0100 @@ -503,23 +503,32 @@ static unsigned long vmx_get_ctrl_reg(st static unsigned long vmx_get_segment_base(struct vcpu *v, enum segment seg) { - unsigned long base; + unsigned long base = 0; + int long_mode = 0; + + ASSERT(v == current); + +#ifdef __x86_64__ + if ( vmx_long_mode_enabled(v) && + (__vmread(GUEST_CS_AR_BYTES) & (1u<<13)) ) + long_mode = 1; +#endif - BUG_ON(v != current); switch ( seg ) { - case seg_cs: base = __vmread(GUEST_CS_BASE); break; - case seg_ds: base = __vmread(GUEST_DS_BASE); break; - case seg_es: base = __vmread(GUEST_ES_BASE); break; + case seg_cs: if ( !long_mode ) base = __vmread(GUEST_CS_BASE); break; + case seg_ds: if ( !long_mode ) base = __vmread(GUEST_DS_BASE); break; + case seg_es: if ( !long_mode ) base = __vmread(GUEST_ES_BASE); break; case seg_fs: base = __vmread(GUEST_FS_BASE); break; case seg_gs: base = __vmread(GUEST_GS_BASE); break; - case seg_ss: base = __vmread(GUEST_SS_BASE); break; + case seg_ss: if ( !long_mode ) base = __vmread(GUEST_SS_BASE); break; case seg_tr: base = __vmread(GUEST_TR_BASE); break; case seg_gdtr: base = __vmread(GUEST_GDTR_BASE); break; case seg_idtr: base = __vmread(GUEST_IDTR_BASE); break; case seg_ldtr: base = __vmread(GUEST_LDTR_BASE); break; - default: BUG(); base = 0; break; + default: 
BUG(); break; } + return base; } @@ -635,7 +644,6 @@ static void vmx_setup_hvm_funcs(void) hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs; hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs; - hvm_funcs.realmode = vmx_realmode; hvm_funcs.paging_enabled = vmx_paging_enabled; hvm_funcs.long_mode_enabled = vmx_long_mode_enabled; hvm_funcs.pae_enabled = vmx_pae_enabled; @@ -949,63 +957,100 @@ static void vmx_do_invlpg(unsigned long } -static int check_for_null_selector(unsigned long eip, int inst_len, int dir) +static int vmx_check_descriptor(int long_mode, unsigned long eip, int inst_len, + enum segment seg, unsigned long *base) { - unsigned char inst[MAX_INST_LEN]; - unsigned long sel; - int i; - int inst_copy_from_guest(unsigned char *, unsigned long, int); + enum vmcs_field ar_field, base_field; + u32 ar_bytes; - /* INS can only use ES segment register, and it can''t be overridden */ - if ( dir == IOREQ_READ ) + *base = 0; + if ( seg != seg_es ) { - sel = __vmread(GUEST_ES_SELECTOR); - return sel == 0 ? 1 : 0; + unsigned char inst[MAX_INST_LEN]; + int i; + extern int inst_copy_from_guest(unsigned char *, unsigned long, int); + + if ( !long_mode ) + eip += __vmread(GUEST_CS_BASE); + memset(inst, 0, MAX_INST_LEN); + if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len ) + { + gdprintk(XENLOG_ERR, "Get guest instruction failed\n"); + domain_crash(current->domain); + return 0; + } + + for ( i = 0; i < inst_len; i++ ) + { + switch ( inst[i] ) + { + case 0xf3: /* REPZ */ + case 0xf2: /* REPNZ */ + case 0xf0: /* LOCK */ + case 0x66: /* data32 */ + case 0x67: /* addr32 */ +#ifdef __x86_64__ + case 0x40 ... 
0x4f: /* REX */ +#endif + continue; + case 0x2e: /* CS */ + seg = seg_cs; + continue; + case 0x36: /* SS */ + seg = seg_ss; + continue; + case 0x26: /* ES */ + seg = seg_es; + continue; + case 0x64: /* FS */ + seg = seg_fs; + continue; + case 0x65: /* GS */ + seg = seg_gs; + continue; + case 0x3e: /* DS */ + seg = seg_ds; + continue; + } + } } - memset(inst, 0, MAX_INST_LEN); - if ( inst_copy_from_guest(inst, eip, inst_len) != inst_len ) + switch ( seg ) { - gdprintk(XENLOG_ERR, "Get guest instruction failed\n"); - domain_crash(current->domain); + case seg_cs: + ar_field = GUEST_CS_AR_BYTES; + base_field = GUEST_CS_BASE; + break; + case seg_ds: + ar_field = GUEST_DS_AR_BYTES; + base_field = GUEST_DS_BASE; + break; + case seg_es: + ar_field = GUEST_ES_AR_BYTES; + base_field = GUEST_ES_BASE; + break; + case seg_fs: + ar_field = GUEST_FS_AR_BYTES; + base_field = GUEST_FS_BASE; + break; + case seg_gs: + ar_field = GUEST_FS_AR_BYTES; + base_field = GUEST_FS_BASE; + break; + case seg_ss: + ar_field = GUEST_GS_AR_BYTES; + base_field = GUEST_GS_BASE; + break; + default: + BUG(); return 0; } - for ( i = 0; i < inst_len; i++ ) - { - switch ( inst[i] ) - { - case 0xf3: /* REPZ */ - case 0xf2: /* REPNZ */ - case 0xf0: /* LOCK */ - case 0x66: /* data32 */ - case 0x67: /* addr32 */ - continue; - case 0x2e: /* CS */ - sel = __vmread(GUEST_CS_SELECTOR); - break; - case 0x36: /* SS */ - sel = __vmread(GUEST_SS_SELECTOR); - break; - case 0x26: /* ES */ - sel = __vmread(GUEST_ES_SELECTOR); - break; - case 0x64: /* FS */ - sel = __vmread(GUEST_FS_SELECTOR); - break; - case 0x65: /* GS */ - sel = __vmread(GUEST_GS_SELECTOR); - break; - case 0x3e: /* DS */ - /* FALLTHROUGH */ - default: - /* DS is the default */ - sel = __vmread(GUEST_DS_SELECTOR); - } - return sel == 0 ? 
1 : 0; - } + if ( !long_mode || seg == seg_fs || seg == seg_gs ) + *base = __vmread(base_field); + ar_bytes = __vmread(ar_field); - return 0; + return !(ar_bytes & 0x10000); } static void vmx_io_instruction(unsigned long exit_qualification, @@ -1013,7 +1058,7 @@ static void vmx_io_instruction(unsigned { struct cpu_user_regs *regs; struct hvm_io_op *pio_opp; - unsigned long port, size; + unsigned int port, size; int dir, df, vm86; pio_opp = ¤t->arch.hvm_vcpu.io_op; @@ -1044,21 +1089,32 @@ static void vmx_io_instruction(unsigned dir = test_bit(3, &exit_qualification); /* direction */ if ( test_bit(4, &exit_qualification) ) { /* string instruction */ - unsigned long addr, count = 1; + unsigned long addr, count = 1, base; + u32 ar_bytes; int sign = regs->eflags & X86_EFLAGS_DF ? -1 : 1; + int long_mode = 0; + ar_bytes = __vmread(GUEST_CS_AR_BYTES); +#ifdef __x86_64__ + if ( vmx_long_mode_enabled(current) && (ar_bytes & (1u<<13)) ) + long_mode = 1; +#endif addr = __vmread(GUEST_LINEAR_ADDRESS); /* * In protected mode, guest linear address is invalid if the * selector is null. */ - if ( !vm86 && check_for_null_selector(regs->eip, inst_len, dir) ) - addr = dir == IOREQ_WRITE ? regs->esi : regs->edi; + if ( !vmx_check_descriptor(long_mode, regs->eip, inst_len, + dir == IOREQ_WRITE ? seg_ds : seg_es, + &base) ) + addr = dir == IOREQ_WRITE ? base + regs->esi : regs->edi; if ( test_bit(5, &exit_qualification) ) { /* "rep" prefix */ pio_opp->flags |= REPZ; - count = vm86 ? 
regs->ecx & 0xFFFF : regs->ecx; + count = regs->ecx; + if ( !long_mode && (vm86 || !(ar_bytes & (1u<<14))) ) + count &= 0xFFFF; } /* Index: 2006-11-27/xen/include/asm-x86/hvm/hvm.h ==================================================================--- 2006-11-27.orig/xen/include/asm-x86/hvm/hvm.h 2006-11-28 09:22:06.000000000 +0100 +++ 2006-11-27/xen/include/asm-x86/hvm/hvm.h 2006-11-28 09:24:15.000000000 +0100 @@ -62,12 +62,13 @@ struct hvm_function_table { struct vcpu *v, struct cpu_user_regs *r); /* * Examine specifics of the guest state: - * 1) determine whether the guest is in real or vm8086 mode, - * 2) determine whether paging is enabled, - * 3) return the current guest control-register value - * 4) return the current guest segment descriptor base + * 1) determine whether paging is enabled, + * 2) determine whether long mode is enabled, + * 3) determine whether PAE paging is enabled, + * 4) determine the mode the guest is running in, + * 5) return the current guest control-register value + * 6) return the current guest segment descriptor base */ - int (*realmode)(struct vcpu *v); int (*paging_enabled)(struct vcpu *v); int (*long_mode_enabled)(struct vcpu *v); int (*pae_enabled)(struct vcpu *v); @@ -128,12 +129,6 @@ hvm_load_cpu_guest_regs(struct vcpu *v, } static inline int -hvm_realmode(struct vcpu *v) -{ - return hvm_funcs.realmode(v); -} - -static inline int hvm_paging_enabled(struct vcpu *v) { return hvm_funcs.paging_enabled(v); @@ -157,7 +152,7 @@ hvm_guest_x86_mode(struct vcpu *v) return hvm_funcs.guest_x86_mode(v); } -int hvm_instruction_length(struct cpu_user_regs *regs, int mode); +int hvm_instruction_length(unsigned long pc, int mode); static inline void hvm_update_host_cr3(struct vcpu *v) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2006-Nov-29 15:19 UTC
Re: [Xen-devel] [PATCH] non-flat protected mode HVM support
>>> "Trolle Selander" <trolle.selander@gmail.com> 29.11.06 15:45 >>> >The assumption of zero base/limit still exists in insn_fetch in >x86_emulate.c. When an EXCEPTION_PF occurs and cs.base is non-zero, >x86_emulate_memop fails because it does not take cs.base into account when >reading the instruction to be decoded/emulated from memory. I had an email >discussion with Mats early last week about this and he made some >suggestions, but I didn't have a chance to really look at it until >yesterday, and by then you had done a lot of work on the zero segment >base/limit assumptions anyway. Looking at it now, I'm not wholly sure what >would be the best way to pass this info to x86_emulate_memop. Maybe extend >the struct x86_emulate_ctxt to include cs base (and limits/attributes)? Or >would that cause problems elsewhere?

Indeed, I didn't look outside the hvm directories. But it's not just the instruction fetching there, there are other segment based operations in there, too. Hence I think adding a callback to obtain the base address would be cleaner; hvm_get_segment_base() could be used in the hvm case, and a simple function should be put in place for pv guest handling (for those, assuming a zero base address seems reasonably safe at this point, since the behavior of them is known; if ever page table updates get done through segments with non-zero base addresses this would need to be revisited). I'll try to take care of this unless you beat me. Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2006-Nov-29 15:38 UTC
Re: [Xen-devel] [PATCH] non-flat protected mode HVM support
On 29/11/06 15:19, "Jan Beulich" <jbeulich@novell.com> wrote:

> Indeed, I didn't look outside the hvm directories. But it's not just the > instruction > fetching there, there are other segment based operations in there, too. Hence > I think adding a callback to obtain the base address would be cleaner; > hvm_get_segment_base() could be used in the hvm case, and a simple function > should be put in place for pv guest handling (for those, assuming a zero base > address seems reasonably safe at this point, since the behavior of them is > known, and if ever page table updates get done through segments with non- > zero base addresses this would need to be revisited. > > I'll try to take care of this unless you beat me.

I have a big patch outstanding on the x86_emulate code to simplify the interface and properly support page-straddling data operands and non-zero segment bases. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Petersson, Mats
2006-Nov-29 15:42 UTC
RE: [Xen-devel] [PATCH] non-flat protected mode HVM support
> -----Original Message----- > From: xen-devel-bounces@lists.xensource.com > [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of > Jan Beulich > Sent: 29 November 2006 15:19 > To: Trolle Selander > Cc: xen-devel@lists.xensource.com > Subject: Re: [Xen-devel] [PATCH] non-flat protected mode HVM support > > >>> "Trolle Selander" <trolle.selander@gmail.com> 29.11.06 15:45 >>> > >The assumption of zero base/limit still exists in insn_fetch in > >x86_emulate.c. When an EXCEPTION_PF occurs and cs.base is non-zero, > >x86_emulate_memop fails because it does not take cs.base > into account when > >reading the instruction to be decoded/emulated from memory. > I had an email > >discussion with Mats early last week about this and he made some > >suggestions, but I didn't have a chance to really look at it until > >yesterday, and by then you had done a lot of work on the zero segment > >base/limit assumptions anyway. Looking at it now, I'm not > wholly sure what > >would be the best way to pass this info to > x86_emulate_memop. Maybe extend > >the struct x86_emulate_ctxt to include cs base (and > limits/attributes)? Or > >would that cause problems elsewhere? > > Indeed, I didn't look outside the hvm directories. But it's > not just the instruction > fetching there, there are other segment based operations in > there, too. Hence > I think adding a callback to obtain the base address would be cleaner; > hvm_get_segment_base() could be used in the hvm case, and a > simple function > should be put in place for pv guest handling (for those, > assuming a zero base > address seems reasonably safe at this point, since the > behavior of them is > known, and if ever page table updates get done through > segments with non- > zero base addresses this would need to be revisited.There should be a per-domain function pointer to get the segment-data - we can either allow this function pointer to be NULL (for non HVM-case), in which case the base is assumed to be zero. 
Or implement a function that just returns zero for non HVM domains. I'm thinking something like this (written as a function for ease of explaining, rather than efficiency - a macro may be better...) unsigned long get_seg_base(int segno) { if (current->get_segment_base) return current->get_segment_base; else return 0; } The if-statement should be leaner than calling a function that returns zero... -- Mats> > I'll try to take care of this unless you beat me. > > Jan > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel > > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2006-Nov-29 15:47 UTC
Re: [Xen-devel] [PATCH] non-flat protected mode HVM support
>I have a big patch outstanding on the x86_emulate code to simplify the >interface and properly support page-straddling data operands and non-zero >segment bases.Great, saves me from doing this. Do you intend to commit this soon? Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2006-Nov-29 16:08 UTC
Re: [Xen-devel] [PATCH] non-flat protected mode HVM support
On 29/11/06 15:47, "Jan Beulich" <jbeulich@novell.com> wrote:>> I have a big patch outstanding on the x86_emulate code to simplify the >> interface and properly support page-straddling data operands and non-zero >> segment bases. > > Great, saves me from doing this. Do you intend to commit this soon?Yeah, tomorrow hopefully. I'm trying to get some big cleanups done before we call a 3.0.4 freeze (which we'll probably do on Friday). :-) -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel