Jan Beulich
2011-Nov-16 14:00 UTC
[Xen-devel] [PATCH 2/2] x86/emulator: generalize movq emulation (SSE2 and AVX variants)
Extend the existing movq emulation to also support its SSE2 and AVX
variants, the latter implying the addition of VEX decoding. Fold the
read and write cases (as most of the logic is identical), and add
movntq and variants (as they're very similar).

Extend the testing code to also exercise these instructions.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -53,11 +55,84 @@ static int cmpxchg(
     return X86EMUL_OKAY;
 }
 
+static int cpuid(
+    unsigned int *eax,
+    unsigned int *ebx,
+    unsigned int *ecx,
+    unsigned int *edx,
+    struct x86_emulate_ctxt *ctxt)
+{
+    asm ("cpuid" : "+a" (*eax), "+c" (*ecx), "=d" (*edx), "=b" (*ebx));
+    return X86EMUL_OKAY;
+}
+
+#define cpu_has_mmx ({ \
+    unsigned int eax = 1, ecx = 0, edx; \
+    cpuid(&eax, &ecx, &ecx, &edx, NULL); \
+    (edx & (1U << 23)) != 0; \
+})
+
+#define cpu_has_sse ({ \
+    unsigned int eax = 1, ecx = 0, edx; \
+    cpuid(&eax, &ecx, &ecx, &edx, NULL); \
+    (edx & (1U << 25)) != 0; \
+})
+
+#define cpu_has_sse2 ({ \
+    unsigned int eax = 1, ecx = 0, edx; \
+    cpuid(&eax, &ecx, &ecx, &edx, NULL); \
+    (edx & (1U << 26)) != 0; \
+})
+
+static inline uint64_t xgetbv(uint32_t xcr)
+{
+    uint64_t res;
+
+    asm ( ".byte 0x0f, 0x01, 0xd0" : "=A" (res) : "c" (xcr) );
+
+    return res;
+}
+
+#define cpu_has_avx ({ \
+    unsigned int eax = 1, ecx = 0, edx; \
+    cpuid(&eax, &edx, &ecx, &edx, NULL); \
+    if ( !(ecx & (1U << 27)) || (xgetbv(0) & 6) != 6 ) \
+        ecx = 0; \
+    (ecx & (1U << 28)) != 0; \
+})
+
+int get_fpu(
+    void (*exception_callback)(void *, struct cpu_user_regs *),
+    void *exception_callback_arg,
+    enum x86_emulate_fpu_type type,
+    struct x86_emulate_ctxt *ctxt)
+{
+    switch ( type )
+    {
+    case X86EMUL_FPU_fpu:
+        break;
+    case X86EMUL_FPU_ymm:
+        if ( cpu_has_avx )
+            break;
+    case X86EMUL_FPU_xmm:
+        if ( cpu_has_sse )
+            break;
+    case X86EMUL_FPU_mmx:
+        if ( cpu_has_mmx )
+            break;
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+    return X86EMUL_OKAY;
+}
+
 static struct x86_emulate_ops emulops = {
     .read       = read,
     .insn_fetch = read,
     .write      = write,
     .cmpxchg    = cmpxchg,
+    .cpuid      = cpuid,
+    .get_fpu    = get_fpu,
 };
 
 int main(int argc, char **argv)
@@ -66,6 +141,8 @@ int main(int argc, char **argv)
     struct cpu_user_regs regs;
     char *instr;
     unsigned int *res, i, j;
+    unsigned long sp;
+    bool stack_exec;
     int rc;
 #ifndef __x86_64__
     unsigned int bcdres_native, bcdres_emul;
@@ -85,6 +162,16 @@ int main(int argc, char **argv)
     }
     instr = (char *)res + 0x100;
 
+#ifdef __x86_64__
+    asm ("movq %%rsp, %0" : "=g" (sp));
+#else
+    asm ("movl %%esp, %0" : "=g" (sp));
+#endif
+    stack_exec = mprotect((void *)(sp & -0x1000L) - (MMAP_SZ - 0x1000),
+                          MMAP_SZ, PROT_READ|PROT_WRITE|PROT_EXEC) == 0;
+    if ( !stack_exec )
+        printf("Warning: Stack could not be made executable (%d).\n", errno);
+
     printf("%-40s", "Testing addl %%ecx,(%%eax)...");
     instr[0] = 0x01; instr[1] = 0x08;
     regs.eflags = 0x200;
@@ -442,6 +529,108 @@ int main(int argc, char **argv)
     printf("skipped\n");
 #endif
 
+    printf("%-40s", "Testing movq %mm3,(%ecx)...");
+    if ( stack_exec && cpu_has_mmx )
+    {
+        extern const unsigned char movq_to_mem[];
+
+        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
+                       ".pushsection .test, \"a\", @progbits\n"
+                       "movq_to_mem: movq %%mm3, (%0)\n"
+                       ".popsection" :: "c" (NULL) );
+
+        memcpy(instr, movq_to_mem, 15);
+        memset(res, 0x33, 64);
+        memset(res + 8, 0xff, 8);
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res, res + 8, 32) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movq (%edx),%mm5...");
+    if ( stack_exec && cpu_has_mmx )
+    {
+        extern const unsigned char movq_from_mem[];
+
+        asm volatile ( "pcmpgtb %%mm5, %%mm5\n"
+                       ".pushsection .test, \"a\", @progbits\n"
+                       "movq_from_mem: movq (%0), %%mm5\n"
+                       ".popsection" :: "d" (NULL) );
+
+        memcpy(instr, movq_from_mem, 15);
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY )
+            goto fail;
+        asm ( "pcmpeqb %%mm3, %%mm3\n\t"
+              "pcmpeqb %%mm5, %%mm3\n\t"
+              "pmovmskb %%mm3, %0" : "=r" (rc) );
+        if ( rc != 0xff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
+    if ( stack_exec && cpu_has_sse2 )
+    {
+        extern const unsigned char movdqu_to_mem[];
+
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       ".pushsection .test, \"a\", @progbits\n"
+                       "movdqu_to_mem: movdqu %%xmm2, (%0)\n"
+                       ".popsection" :: "c" (NULL) );
+
+        memcpy(instr, movdqu_to_mem, 15);
+        memset(res, 0x55, 64);
+        memset(res + 8, 0xff, 16);
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res, res + 8, 32) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movdqu (%edx),%xmm4...");
+    if ( stack_exec && cpu_has_sse2 )
+    {
+        extern const unsigned char movdqu_from_mem[];
+
+        asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n"
+                       ".pushsection .test, \"a\", @progbits\n"
+                       "movdqu_from_mem: movdqu (%0), %%xmm4\n"
+                       ".popsection" :: "d" (NULL) );
+
+        memcpy(instr, movdqu_from_mem, 15);
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY )
+            goto fail;
+        asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+              "pcmpeqb %%xmm4, %%xmm2\n\t"
+              "pmovmskb %%xmm2, %0" : "=r" (rc) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     for ( j = 1; j <= 2; j++ )
     {
 #if defined(__i386__)
--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -9,5 +9,10 @@ typedef bool bool_t;
 
 #define BUG() abort()
 
+#define DEFINE_PER_CPU(type, var) type this_cpu_##var
+#define this_cpu(var) this_cpu_##var
+
+#define vm86_mode(regs) 0
+
 #include "x86_emulate/x86_emulate.h"
 #include "x86_emulate/x86_emulate.c"
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -16,6 +16,7 @@
 #include <xen/paging.h>
 #include <xen/trace.h>
 #include <asm/event.h>
+#include <asm/xstate.h>
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/trace.h>
@@ -905,6 +906,20 @@ static int hvmemul_get_fpu(
         if ( !cpu_has_mmx )
             return X86EMUL_UNHANDLEABLE;
         break;
+    case X86EMUL_FPU_xmm:
+        if ( !cpu_has_xmm ||
+             (curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_EM) ||
+             !(curr->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSFXSR) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+    case X86EMUL_FPU_ymm:
+        if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) ||
+             vm86_mode(ctxt->regs) ||
+             !(curr->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ||
+             !(curr->arch.xcr0 & XSTATE_SSE) ||
+             !(curr->arch.xcr0 & XSTATE_YMM) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
     default:
         return X86EMUL_UNHANDLEABLE;
     }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -253,6 +253,47 @@ static uint8_t twobyte_table[256] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
+#define REX_PREFIX 0x40
+#define REX_B 0x01
+#define REX_X 0x02
+#define REX_R 0x04
+#define REX_W 0x08
+
+#define vex_none 0
+
+enum vex_opcx {
+    vex_0f = vex_none + 1,
+    vex_0f38,
+    vex_0f3a,
+};
+
+enum vex_pfx {
+    vex_66 = vex_none + 1,
+    vex_f3,
+    vex_f2
+};
+
+union vex {
+    uint8_t raw[2];
+    struct {
+        uint8_t opcx:5;
+        uint8_t b:1;
+        uint8_t x:1;
+        uint8_t r:1;
+        uint8_t pfx:2;
+        uint8_t l:1;
+        uint8_t reg:4;
+        uint8_t w:1;
+    };
+};
+
+#define copy_REX_VEX(ptr, rex, vex) do { \
+    if ( (vex).opcx != vex_none ) \
+        ptr[0] = 0xc4, ptr[1] = (vex).raw[0], ptr[2] = (vex).raw[1]; \
+    else if ( mode_64bit() ) \
+        ptr[1] = rex | REX_PREFIX; \
+} while (0)
+
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
     enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
@@ -281,6 +322,18 @@ struct operand {
     };
 };
 
+typedef union {
+    uint64_t mmx;
+    uint64_t __attribute__ ((aligned(16))) xmm[2];
+    uint64_t __attribute__ ((aligned(32))) ymm[4];
+} mmval_t;
+
+/*
+ * While alignment gets specified above, this doesn't get honored by the
+ * compiler for automatic variables. Thus use a per-CPU variable instead.
+ */
+static DEFINE_PER_CPU(mmval_t, mmval);
+
 /* MSRs. */
 #define MSR_TSC          0x00000010
 #define MSR_SYSENTER_CS  0x00000174
@@ -972,9 +1025,12 @@ static bool_t vcpu_has(
     generate_exception_if(!vcpu_has(leaf, subleaf, reg, \
                                     X86_FEATURE_##feature % 32, \
                                     ctxt, ops), EXC_UD, -1)
+#define vcpu_must_have_mmx()  vcpu_must_have(1, 0, EDX, MMX)
+#define vcpu_must_have_sse()  vcpu_must_have(1, 0, EDX, XMM)
 #define vcpu_must_have_sse2() vcpu_must_have(1, 0, EDX, XMM2)
 #define vcpu_must_have_sse3() vcpu_must_have(1, 0, ECX, XMM3)
 #define vcpu_must_have_cx16() vcpu_must_have(1, 0, ECX, CX16)
+#define vcpu_must_have_avx()  vcpu_must_have(1, 0, ECX, AVX)
 
 static int
 in_realmode(
@@ -1255,6 +1311,7 @@ x86_emulate(
 
     uint8_t b, d, sib, sib_index, sib_base, twobyte = 0, rex_prefix = 0;
     uint8_t modrm = 0, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
+    union vex vex = {};
    unsigned int op_bytes, def_op_bytes, ad_bytes, def_ad_bytes;
 #define REPE_PREFIX  1
 #define REPNE_PREFIX 2
@@ -1287,6 +1344,7 @@ x86_emulate(
         {
         case 0x66: /* operand-size override */
             op_bytes = def_op_bytes ^ 6;
+            vex.pfx = vex_66;
             break;
         case 0x67: /* address-size override */
             ad_bytes = def_ad_bytes ^ (mode_64bit() ? 12 : 6);
@@ -1314,9 +1372,11 @@ x86_emulate(
             break;
         case 0xf2: /* REPNE/REPNZ */
             rep_prefix = REPNE_PREFIX;
+            vex.pfx = vex_f2;
             break;
         case 0xf3: /* REP/REPE/REPZ */
             rep_prefix = REPE_PREFIX;
+            vex.pfx = vex_f3;
             break;
         case 0x40 ... 0x4f: /* REX */
             if ( !mode_64bit() )
@@ -1360,6 +1420,70 @@ x86_emulate(
     {
         modrm = insn_fetch_type(uint8_t);
         modrm_mod = (modrm & 0xc0) >> 6;
+
+        if ( !twobyte && (b & ~1) == 0xc4 )
+            switch ( def_ad_bytes )
+            {
+            default:
+                BUG();
+            case 2:
+                if ( in_realmode(ctxt, ops) || vm86_mode(&_regs) )
+                    break;
+                /* fall through */
+            case 4:
+                if ( modrm_mod != 3 )
+                    break;
+                /* fall through */
+            case 8:
+                /* VEX */
+                generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1);
+
+                vex.raw[0] = b;
+                if ( b & 1 )
+                {
+                    vex.raw[1] = b;
+                    vex.opcx = vex_0f;
+                    vex.x = 1;
+                    vex.b = 1;
+                    vex.w = 0;
+                }
+                else
+                {
+                    vex.raw[1] = insn_fetch_type(uint8_t);
+                    if ( mode_64bit() )
+                    {
+                        if ( !vex.b )
+                            rex_prefix |= REX_B;
+                        if ( !vex.x )
+                            rex_prefix |= REX_X;
+                        if ( vex.w )
+                        {
+                            rex_prefix |= REX_W;
+                            op_bytes = 8;
+                        }
+                    }
+                }
+                vex.reg ^= 0xf;
+                if ( !mode_64bit() )
+                    vex.reg &= 0x7;
+                else if ( !vex.r )
+                    rex_prefix |= REX_R;
+
+                fail_if(vex.opcx != vex_0f);
+                twobyte = 1;
+                b = insn_fetch_type(uint8_t);
+                d = twobyte_table[b];
+
+                /* Unrecognised? */
+                if ( d == 0 )
+                    goto cannot_emulate;
+
+                modrm = insn_fetch_type(uint8_t);
+                modrm_mod = (modrm & 0xc0) >> 6;
+
+                break;
+            }
+
         modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
         modrm_rm  = modrm & 0x07;
 
@@ -3917,44 +4041,77 @@ x86_emulate(
             break;
         }
 
-    case 0x6f: /* movq mm/m64,mm */ {
-        uint8_t stub[] = { 0x0f, 0x6f, modrm, 0xc3 };
+    case 0x6f: /* movq mm/m64,mm */
+               /* {,v}movdq{a,u} xmm/m128,xmm */
+               /* vmovdq{a,u} ymm/m256,ymm */
+    case 0x7f: /* movq mm,mm/m64 */
+               /* {,v}movdq{a,u} xmm,xmm/m128 */
+               /* vmovdq{a,u} ymm,ymm/m256 */
+    case 0xe7: /* movntq mm,mm/m64 */
+               /* {,v}movntdq xmm,xmm/m128 */
+               /* vmovntdq{a,u} ymm,ymm/m256 */
+    {
+        uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
         struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
-        uint64_t val;
-        if ( ea.type == OP_MEM )
+
+        if ( vex.opcx == vex_none )
         {
-            unsigned long lval, hval;
-            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
-                                  &lval, 4, ctxt, ops)) ||
-                 (rc = read_ulong(ea.mem.seg, ea.mem.off+4,
-                                  &hval, 4, ctxt, ops)) )
-                goto done;
-            val = ((uint64_t)hval << 32) | (uint32_t)lval;
-            stub[2] = modrm & 0x38; /* movq (%eax),%mmN */
+            switch ( vex.pfx )
+            {
+            case vex_f3:
+                fail_if(b == 0xe7);
+                /* fall through */
+            case vex_66:
+                vcpu_must_have_sse2();
+                stub[0] = 0x66; /* movdqa */
+                get_fpu(X86EMUL_FPU_xmm, &fic);
+                ea.bytes = 16;
+                break;
+            case vex_none:
+                if ( b != 0xe7 )
+                    vcpu_must_have_mmx();
+                else
+                    vcpu_must_have_sse();
+                get_fpu(X86EMUL_FPU_mmx, &fic);
+                ea.bytes = 8;
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
+        else
+        {
+            fail_if(vex.opcx != vex_0f || vex.reg ||
+                    (vex.pfx != vex_66 && (vex.pfx != vex_f3 || b == 0xe7)));
+            vcpu_must_have_avx();
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+            ea.bytes = 16 << vex.l;
         }
-        get_fpu(X86EMUL_FPU_mmx, &fic);
-        asm volatile ( "call *%0" : : "r" (stub), "a" (&val) : "memory" );
-        put_fpu(&fic);
-        break;
-    }
-
-    case 0x7f: /* movq mm,mm/m64 */ {
-        uint8_t stub[] = { 0x0f, 0x7f, modrm, 0xc3 };
-        struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
-        uint64_t val;
-        if ( ea.type == OP_MEM )
-            stub[2] = modrm & 0x38; /* movq %mmN,(%eax) */
-        get_fpu(X86EMUL_FPU_mmx, &fic);
-        asm volatile ( "call *%0" : : "r" (stub), "a" (&val) : "memory" );
-        put_fpu(&fic);
         if ( ea.type == OP_MEM )
         {
-            unsigned long lval = (uint32_t)val, hval = (uint32_t)(val >> 32);
-            if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
-                 (rc = ops->write(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
-                goto done;
+            /* XXX
+            generate_exception_if(vex.pfx == vex_66 &&
+                                  (ops->ea(ea.mem.seg, ea.mem.off)
+                                   & (ea.bytes - 1)), EXC_GP, 0); */
+            if ( b == 0x6f )
+                rc = ops->read(ea.mem.seg, ea.mem.off+0, &this_cpu(mmval),
+                               ea.bytes, ctxt);
+            /* convert memory operand to (%rAX) */
+            rex_prefix &= ~REX_B;
+            vex.b = 1;
+            stub[4] &= 0x38;
+        }
+        if ( !rc )
+        {
+            copy_REX_VEX(stub, rex_prefix, vex);
+            asm volatile ( "call *%0" : : "r" (stub), "a" (&this_cpu(mmval))
+                           : "memory" );
        }
-        break;
+        put_fpu(&fic);
+        if ( b != 0x6f && ea.type == OP_MEM )
+            rc = ops->write(ea.mem.seg, ea.mem.off, &this_cpu(mmval),
+                            ea.bytes, ctxt);
+        goto done;
     }
 
     case 0x80 ... 0x8f: /* jcc (near) */ {
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -99,7 +99,9 @@ struct segment_register {
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
     X86EMUL_FPU_fpu, /* Standard FPU coprocessor instruction set */
-    X86EMUL_FPU_mmx  /* MMX instruction set (%mm0-%mm7) */
+    X86EMUL_FPU_mmx, /* MMX instruction set (%mm0-%mm7) */
+    X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
+    X86EMUL_FPU_ymm  /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
 };
 
 /*
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -10,9 +10,11 @@
  */
 
 #include <asm/cpufeature.h>
+#include <asm/processor.h>
 #include <asm/x86_emulate.h>
 
 /* Avoid namespace pollution. */
 #undef cmpxchg
+#undef cpuid
 
 #include "x86_emulate/x86_emulate.c"
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -218,7 +218,7 @@
 #define cpu_has_x2apic          boot_cpu_has(X86_FEATURE_X2APIC)
 
 #define cpu_has_xsave           boot_cpu_has(X86_FEATURE_XSAVE)
-
+#define cpu_has_avx             boot_cpu_has(X86_FEATURE_AVX)
 #define cpu_has_lwp             boot_cpu_has(X86_FEATURE_LWP)
 
 #define cpu_has_arch_perfmon    boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
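A note on the VEX decoding above: the union vex maps directly onto the two
payload bytes of a three-byte (0xc4) VEX prefix -- raw[0] carries the
opcode-map selector plus the inverted R/X/B bits, raw[1] the W bit, the
inverted vvvv register specifier, the vector length L, and the implied
prefix pp. For the two-byte (0xc5) form the decoder synthesizes the missing
fields by hand. The following standalone sketch is not part of the patch; it
assumes GCC's little-endian bitfield layout on x86 (as the patch itself
does) and shows how the fields come apart for vmovdqa %ymm0,(%rax), encoded
c4 e1 7d 7f 00:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    union vex {                  /* same layout as in the patch */
        uint8_t raw[2];
        struct {
            uint8_t opcx:5;      /* mmmmm: opcode map (1 => 0f) */
            uint8_t b:1;         /* inverted REX.B */
            uint8_t x:1;         /* inverted REX.X */
            uint8_t r:1;         /* inverted REX.R */
            uint8_t pfx:2;       /* pp: 0=none, 1=66, 2=f3, 3=f2 */
            uint8_t l:1;         /* L: 0=128-bit, 1=256-bit */
            uint8_t reg:4;       /* inverted vvvv */
            uint8_t w:1;         /* W */
        };
    };

    int main(void)
    {
        /* Payload bytes of c4 e1 7d 7f 00 (vmovdqa %ymm0,(%rax)). */
        const uint8_t payload[2] = { 0xe1, 0x7d };
        union vex vex;

        memcpy(vex.raw, payload, sizeof(payload));
        printf("map=%d pp=%d L=%d vvvv=%d W=%d\n",
               vex.opcx, vex.pfx, vex.l, vex.reg ^ 0xf, vex.w);
        /* Prints "map=1 pp=1 L=1 vvvv=0 W=0": 0f opcode map, implied
         * 66 prefix, 256-bit operation, no second source register. */
        return 0;
    }

The b, x, r, and reg fields hold the inverted encodings straight from the
instruction stream, which is why the decode loop tests !vex.b and friends
and applies vex.reg ^= 0xf before use.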
Keir Fraser
2011-Nov-16 16:25 UTC
Re: [Xen-devel] [PATCH 2/2] x86/emulator: generalize movq emulation (SSE2 and AVX variants)
On 16/11/2011 14:00, "Jan Beulich" <JBeulich@suse.com> wrote:

> Extend the existing movq emulation to also support its SSE2 and AVX
> variants, the latter implying the addition of VEX decoding. Fold the
> read and write cases (as most of the logic is identical), and add
> movntq and variants (as they're very similar).
>
> Extend the testing code to also exercise these instructions.

I checked in your other patches, although I split them up and revised
them in some cases. This one is broadly okay too, but:

1. Don't import vm86_mode(). x86_emulate already does eflags&EFLG_VM in
some places. And that's fairly self-documenting, so just carry on with
that.

2. Don't import DEFINE_PER_CPU/this_cpu. I understand it works around a
critical issue but it's *so* nasty. I would rather define a nasty
private macro for declaring aligned space on the stack, like, for
example:

    char __mmval[64], *mmval = (__mmval + 31) & ~32;

(suitably cleaned up, macroised, and made compilable of course ;-)

3. There's a XXX'ed chunk of code in the middle of the patch. No
explanation. Remove it, or comment it, or something.

Note that I changed the vcpu_must_have stuff when I checked it in, so
those bits will need fixup in this patch too. In particular, I don't
bother importing cpufeature.h -- the leaf/reg are already open-coded
with no macro abstraction, so I see no harm in open-coding the bit
number either. They won't change, and the vcpu_must_have_xxx macro name
is sufficient documentation in itself.

 -- Keir

> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> [...]
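For reference, the stack-based alternative Keir sketches under point 2
could be made compilable roughly as follows. This is only an illustration
of the idea -- DECLARE_ALIGNED and the double-underscored backing array are
made-up names, not what was eventually committed:

    /* Over-allocate by align-1 bytes and round the pointer up, so the
     * object is suitably aligned no matter where the compiler places
     * the backing array on the stack. */
    #define DECLARE_ALIGNED(type, var, align)                             \
        char __##var[sizeof(type) + (align) - 1];                         \
        type *const var = (type *)(((unsigned long)__##var + (align) - 1) \
                                   & ~(unsigned long)((align) - 1))

    /* Usage, replacing the per-CPU mmval in the patch:
     *     DECLARE_ALIGNED(mmval_t, mmval, 32);
     *     ...
     *     asm volatile ( "call *%0" : : "r" (stub), "a" (mmval)
     *                    : "memory" );
     */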