Jan Beulich
2013-Aug-30 09:55 UTC
[PATCH] x86/xsave: fix migration from xsave-capable to xsave-incapable host
With CPUID features suitably masked this is supposed to work, but was completely broken (i.e. the case wasn't even considered when the original xsave save/restore code was written). First of all, xsave_enabled() wrongly returned the value of cpu_has_xsave, i.e. not even taking into consideration attributes of the vCPU in question. Instead this function ought to check whether the guest ever enabled xsave support (by writing a [non-zero] value to XCR0). As a result of this, a vCPU's xcr0 and xcr0_accum must no longer be initialized to XSTATE_FP_SSE (since that's a valid value a guest could write to XCR0), and the xsave/xrstor as well as the context switch code need to suitably account for this (by always enforcing at least this part of the state to be saved/loaded). This involves undoing large parts of c/s 22945:13a7d1f7f62c ("x86: add strictly sanity check for XSAVE/XRSTOR") - we need to cleanly distinguish between hardware capabilities and vCPU used features. Next both HVM and PV save code needed tweaking to not always save the full state supported by the underlying hardware, but just the parts that the guest actually used. Similarly the restore code should bail not just on state being restored that the hardware cannot handle, but also on inconsistent save state (inconsistent XCR0 settings or size of saved state not in line with XCR0). And finally the PV extended context get/set code needs to use slightly different logic than the HVM one, as here we can't just key off of xsave_enabled() (i.e. avoid doing anything if a guest doesn't use xsave) because the tools use this function to determine host capabilities as well as read/write vCPU state. The set operation in particular needs to be capable of cleanly dealing with input that consists of only the xcr0 and xcr0_accum values (if they're both zero then no further data is required). 
While for things to work correctly both sides (saving _and_ restoring host) need to run with the fixed code, afaict no breakage should occur if either side isn''t up to date (other than the breakage that this patch attempts to fix). Signed-off-by: Jan Beulich <jbeulich@suse.com> --- Please note that the development and testing of this was done on 4.1.x, as that''s where to issue was found. The xsave code being quite different between then and now, the porting was not really straight forward, but in the end the biggest difference was that the 4.1.x code needs more changes than presented here, due to there FPU context and XSAVE area not sharing the same memory block. Hence I think/hope I did all the porting over correctly, but I had no setup available where I could have myself fully tested all the scenarios. --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -618,7 +618,7 @@ unsigned long pv_guest_cr4_fixup(const s hv_cr4_mask &= ~X86_CR4_DE; if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) ) hv_cr4_mask &= ~X86_CR4_FSGSBASE; - if ( xsave_enabled(v) ) + if ( cpu_has_xsave ) hv_cr4_mask &= ~X86_CR4_OSXSAVE; if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) ) @@ -1351,9 +1351,13 @@ static void __context_switch(void) if ( !is_idle_vcpu(n) ) { memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES); - if ( xsave_enabled(n) && n->arch.xcr0 != get_xcr0() && - !set_xcr0(n->arch.xcr0) ) - BUG(); + if ( cpu_has_xsave ) + { + u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE; + + if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) ) + BUG(); + } vcpu_restore_fpu_eager(n); n->arch.ctxt_switch_to(n); } --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -1047,11 +1047,8 @@ long arch_do_domctl( struct xen_domctl_vcpuextstate *evc; struct vcpu *v; uint32_t offset = 0; - uint64_t _xfeature_mask = 0; - uint64_t _xcr0, _xcr0_accum; - void *receive_buf = NULL, *_xsave_area; -#define PV_XSAVE_SIZE (2 * sizeof(uint64_t) + xsave_cntxt_size) +#define PV_XSAVE_SIZE(xcr0) 
(2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0)) evc = &domctl->u.vcpuextstate; @@ -1062,15 +1059,16 @@ long arch_do_domctl( if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate ) { + unsigned int size = PV_XSAVE_SIZE(v->arch.xcr0_accum); + if ( !evc->size && !evc->xfeature_mask ) { evc->xfeature_mask = xfeature_mask; - evc->size = PV_XSAVE_SIZE; + evc->size = size; ret = 0; goto vcpuextstate_out; } - if ( evc->size != PV_XSAVE_SIZE || - evc->xfeature_mask != xfeature_mask ) + if ( evc->size != size || evc->xfeature_mask != xfeature_mask ) { ret = -EINVAL; goto vcpuextstate_out; @@ -1093,7 +1091,7 @@ long arch_do_domctl( offset += sizeof(v->arch.xcr0_accum); if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer, offset, (void *)v->arch.xsave_area, - xsave_cntxt_size) ) + size - 2 * sizeof(uint64_t)) ) { ret = -EFAULT; goto vcpuextstate_out; @@ -1101,13 +1099,14 @@ long arch_do_domctl( } else { - ret = -EINVAL; + void *receive_buf; + uint64_t _xcr0, _xcr0_accum; + const struct xsave_struct *_xsave_area; - _xfeature_mask = evc->xfeature_mask; - /* xsave context must be restored on compatible target CPUs */ - if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask ) - goto vcpuextstate_out; - if ( evc->size > PV_XSAVE_SIZE || evc->size < 2 * sizeof(uint64_t) ) + ret = -EINVAL; + if ( evc->size < 2 * sizeof(uint64_t) || + evc->size > 2 * sizeof(uint64_t) + + xstate_ctxt_size(xfeature_mask) ) goto vcpuextstate_out; receive_buf = xmalloc_bytes(evc->size); @@ -1128,20 +1127,30 @@ long arch_do_domctl( _xcr0_accum = *(uint64_t *)(receive_buf + sizeof(uint64_t)); _xsave_area = receive_buf + 2 * sizeof(uint64_t); - if ( !(_xcr0 & XSTATE_FP) || _xcr0 & ~xfeature_mask ) + if ( _xcr0_accum ) { - xfree(receive_buf); - goto vcpuextstate_out; + if ( evc->size >= 2 * sizeof(uint64_t) + XSTATE_AREA_MIN_SIZE ) + ret = validate_xstate(_xcr0, _xcr0_accum, + _xsave_area->xsave_hdr.xstate_bv, + evc->xfeature_mask); } - if ( (_xcr0 & _xcr0_accum) != _xcr0 ) + else if ( !_xcr0 ) + ret = 0; + if 
( ret ) { xfree(receive_buf); goto vcpuextstate_out; } - v->arch.xcr0 = _xcr0; - v->arch.xcr0_accum = _xcr0_accum; - memcpy(v->arch.xsave_area, _xsave_area, evc->size - 2 * sizeof(uint64_t) ); + if ( evc->size <= PV_XSAVE_SIZE(_xcr0_accum) ) + { + v->arch.xcr0 = _xcr0; + v->arch.xcr0_accum = _xcr0_accum; + memcpy(v->arch.xsave_area, _xsave_area, + evc->size - 2 * sizeof(uint64_t)); + } + else + ret = -EINVAL; xfree(receive_buf); } --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -906,14 +906,12 @@ static int hvm_load_cpu_ctxt(struct doma hvm_set_segment_register(v, x86_seg_ldtr, &seg); /* In case xsave-absent save file is restored on a xsave-capable host */ - if ( xsave_enabled(v) ) + if ( cpu_has_xsave && !xsave_enabled(v) ) { struct xsave_struct *xsave_area = v->arch.xsave_area; memcpy(v->arch.xsave_area, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; - v->arch.xcr0_accum = XSTATE_FP_SSE; - v->arch.xcr0 = XSTATE_FP_SSE; } else memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); @@ -957,7 +955,9 @@ static int hvm_load_cpu_ctxt(struct doma HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt, 1, HVMSR_PER_VCPU); -#define HVM_CPU_XSAVE_SIZE (3 * sizeof(uint64_t) + xsave_cntxt_size) +#define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \ + save_area) + \ + xstate_ctxt_size(xcr0)) static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) { @@ -969,20 +969,20 @@ static int hvm_save_cpu_xsave_states(str for_each_vcpu ( d, v ) { + unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum); + if ( !xsave_enabled(v) ) continue; - if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, HVM_CPU_XSAVE_SIZE) ) + if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) ) return 1; ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; - h->cur += HVM_CPU_XSAVE_SIZE; - memset(ctxt, 0, HVM_CPU_XSAVE_SIZE); + h->cur += size; ctxt->xfeature_mask = xfeature_mask; ctxt->xcr0 
= v->arch.xcr0; ctxt->xcr0_accum = v->arch.xcr0_accum; - if ( v->fpu_initialised ) - memcpy(&ctxt->save_area, - v->arch.xsave_area, xsave_cntxt_size); + memcpy(&ctxt->save_area, v->arch.xsave_area, + size - offsetof(struct hvm_hw_cpu_xsave, save_area)); } return 0; @@ -990,11 +990,11 @@ static int hvm_save_cpu_xsave_states(str static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) { - int vcpuid; + unsigned int vcpuid, size; + int err; struct vcpu *v; struct hvm_hw_cpu_xsave *ctxt; struct hvm_save_descriptor *desc; - uint64_t _xfeature_mask; /* Which vcpu is this? */ vcpuid = hvm_load_instance(h); @@ -1006,47 +1006,74 @@ static int hvm_load_cpu_xsave_states(str } /* Fails since we can''t restore an img saved on xsave-capable host. */ - if ( !xsave_enabled(v) ) - return -EINVAL; + if ( !cpu_has_xsave ) + return -EOPNOTSUPP; /* Customized checking for entry since our entry is of variable length */ desc = (struct hvm_save_descriptor *)&h->data[h->cur]; if ( sizeof (*desc) > h->size - h->cur) { printk(XENLOG_G_WARNING - "HVM%d restore: not enough data left to read descriptor" - "for type %u\n", d->domain_id, CPU_XSAVE_CODE); - return -1; + "HVM%d.%d restore: not enough data left to read xsave descriptor\n", + d->domain_id, vcpuid); + return -ENODATA; } if ( desc->length + sizeof (*desc) > h->size - h->cur) { printk(XENLOG_G_WARNING - "HVM%d restore: not enough data left to read %u bytes " - "for type %u\n", d->domain_id, desc->length, CPU_XSAVE_CODE); - return -1; + "HVM%d.%d restore: not enough data left to read %u xsave bytes\n", + d->domain_id, vcpuid, desc->length); + return -ENODATA; + } + if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) + + XSTATE_AREA_MIN_SIZE ) + { + printk(XENLOG_G_WARNING + "HVM%d.%d restore mismatch: xsave length %u < %zu\n", + d->domain_id, vcpuid, desc->length, + offsetof(struct hvm_hw_cpu_xsave, + save_area) + XSTATE_AREA_MIN_SIZE); + return -EINVAL; } - if ( CPU_XSAVE_CODE != desc->typecode || 
(desc->length > HVM_CPU_XSAVE_SIZE) ) + size = HVM_CPU_XSAVE_SIZE(xfeature_mask); + if ( desc->length > size ) { printk(XENLOG_G_WARNING - "HVM%d restore mismatch: expected type %u with max length %u, " - "saw type %u length %u\n", d->domain_id, CPU_XSAVE_CODE, - (unsigned int)HVM_CPU_XSAVE_SIZE, - desc->typecode, desc->length); - return -1; + "HVM%d.%d restore mismatch: xsave length %u > %u\n", + d->domain_id, vcpuid, desc->length, size); + return -EOPNOTSUPP; } h->cur += sizeof (*desc); - /* Checking finished */ ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; h->cur += desc->length; - _xfeature_mask = ctxt->xfeature_mask; - if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask ) - return -EINVAL; + err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum, + ctxt->save_area.xsave_hdr.xstate_bv, + ctxt->xfeature_mask); + if ( err ) + { + printk(XENLOG_G_WARNING + "HVM%d.%d restore: inconsistent xsave state (feat=%#"PRIx64 + " accum=%#"PRIx64" xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n", + d->domain_id, vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum, + ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err); + return err; + } + size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum); + if ( desc->length > size ) + { + printk(XENLOG_G_WARNING + "HVM%d.%d restore mismatch: xsave length %u > %u\n", + d->domain_id, vcpuid, desc->length, size); + return -EOPNOTSUPP; + } + /* Checking finished */ v->arch.xcr0 = ctxt->xcr0; v->arch.xcr0_accum = ctxt->xcr0_accum; - memcpy(v->arch.xsave_area, &ctxt->save_area, xsave_cntxt_size); + memcpy(v->arch.xsave_area, &ctxt->save_area, + desc->length - offsetof(struct hvm_hw_cpu_xsave, save_area)); return 0; } @@ -1060,7 +1087,8 @@ static int __init __hvm_register_CPU_XSA "CPU_XSAVE", hvm_save_cpu_xsave_states, hvm_load_cpu_xsave_states, - HVM_CPU_XSAVE_SIZE + sizeof (struct hvm_save_descriptor), + HVM_CPU_XSAVE_SIZE(xfeature_mask) + + sizeof(struct hvm_save_descriptor), HVMSR_PER_VCPU); return 0; } @@ -2767,7 +2795,7 @@ void hvm_cpuid(unsigned int 
input, unsig __clear_bit(X86_FEATURE_APIC & 31, edx); /* Fix up OSXSAVE. */ - if ( xsave_enabled(v) ) + if ( cpu_has_xsave ) *ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ? cpufeat_mask(X86_FEATURE_OSXSAVE) : 0; --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -944,8 +944,7 @@ static int construct_vmcs(struct vcpu *v /* Host control registers. */ v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); - __vmwrite(HOST_CR4, - mmu_cr4_features | (xsave_enabled(v) ? X86_CR4_OSXSAVE : 0)); + __vmwrite(HOST_CR4, mmu_cr4_features); /* Host CS:RIP. */ __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); --- a/xen/arch/x86/i387.c +++ b/xen/arch/x86/i387.c @@ -38,14 +38,15 @@ static inline void fpu_xrstor(struct vcp { bool_t ok; + ASSERT(v->arch.xsave_area); /* * XCR0 normally represents what guest OS set. In case of Xen itself, - * we set all supported feature mask before doing save/restore. + * we set the accumulated feature mask before doing save/restore. */ - ok = set_xcr0(v->arch.xcr0_accum); + ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); ASSERT(ok); xrstor(v, mask); - ok = set_xcr0(v->arch.xcr0); + ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); ASSERT(ok); } @@ -137,13 +138,15 @@ static inline void fpu_xsave(struct vcpu { bool_t ok; - /* XCR0 normally represents what guest OS set. In case of Xen itself, - * we set all accumulated feature mask before doing save/restore. + ASSERT(v->arch.xsave_area); + /* + * XCR0 normally represents what guest OS set. In case of Xen itself, + * we set the accumulated feature mask before doing save/restore. */ - ok = set_xcr0(v->arch.xcr0_accum); + ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); ASSERT(ok); xsave(v, v->arch.nonlazy_xstate_used ? 
XSTATE_ALL : XSTATE_LAZY); - ok = set_xcr0(v->arch.xcr0); + ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); ASSERT(ok); } @@ -232,7 +235,7 @@ void vcpu_restore_fpu_lazy(struct vcpu * if ( v->fpu_dirtied ) return; - if ( xsave_enabled(v) ) + if ( cpu_has_xsave ) fpu_xrstor(v, XSTATE_LAZY); else if ( v->fpu_initialised ) { @@ -262,7 +265,7 @@ void vcpu_save_fpu(struct vcpu *v) /* This can happen, if a paravirtualised guest OS has set its CR0.TS. */ clts(); - if ( xsave_enabled(v) ) + if ( cpu_has_xsave ) fpu_xsave(v); else if ( cpu_has_fxsr ) fpu_fxsave(v); --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -817,7 +817,7 @@ static void pv_cpuid(struct cpu_user_reg __clear_bit(X86_FEATURE_PDCM % 32, &c); __clear_bit(X86_FEATURE_PCID % 32, &c); __clear_bit(X86_FEATURE_DCA % 32, &c); - if ( !xsave_enabled(current) ) + if ( !cpu_has_xsave ) { __clear_bit(X86_FEATURE_XSAVE % 32, &c); __clear_bit(X86_FEATURE_AVX % 32, &c); @@ -842,7 +842,7 @@ static void pv_cpuid(struct cpu_user_reg break; case 0x0000000d: /* XSAVE */ - if ( !xsave_enabled(current) ) + if ( !cpu_has_xsave ) goto unsupported; break; --- a/xen/arch/x86/xstate.c +++ b/xen/arch/x86/xstate.c @@ -21,7 +21,7 @@ bool_t __read_mostly cpu_has_xsaveopt; * the supported and enabled features on the processor, including the * XSAVE.HEADER. We only enable XCNTXT_MASK that we have known. */ -u32 xsave_cntxt_size; +static u32 __read_mostly xsave_cntxt_size; /* A 64-bit bitmask of the XSAVE/XRSTOR features supported by processor. 
*/ u64 xfeature_mask; @@ -206,13 +206,13 @@ void xrstor(struct vcpu *v, uint64_t mas bool_t xsave_enabled(const struct vcpu *v) { - if ( cpu_has_xsave ) - { - ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); - ASSERT(v->arch.xsave_area); - } + if ( !cpu_has_xsave ) + return 0; - return cpu_has_xsave; + ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); + ASSERT(v->arch.xsave_area); + + return !!v->arch.xcr0_accum; } int xstate_alloc_save_area(struct vcpu *v) @@ -238,8 +238,8 @@ int xstate_alloc_save_area(struct vcpu * save_area->fpu_sse.mxcsr = MXCSR_DEFAULT; v->arch.xsave_area = save_area; - v->arch.xcr0 = XSTATE_FP_SSE; - v->arch.xcr0_accum = XSTATE_FP_SSE; + v->arch.xcr0 = 0; + v->arch.xcr0_accum = 0; return 0; } @@ -257,7 +257,11 @@ void xstate_init(bool_t bsp) u64 feature_mask; if ( boot_cpu_data.cpuid_level < XSTATE_CPUID ) + { + BUG_ON(!bsp); + setup_clear_cpu_cap(X86_FEATURE_XSAVE); return; + } cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); @@ -277,7 +281,6 @@ void xstate_init(bool_t bsp) set_in_cr4(X86_CR4_OSXSAVE); if ( !set_xcr0(feature_mask) ) BUG(); - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); if ( bsp ) { @@ -286,14 +289,14 @@ void xstate_init(bool_t bsp) * xsave_cntxt_size is the max size required by enabled features. * We know FP/SSE and YMM about eax, and nothing about edx at present. */ - xsave_cntxt_size = ebx; + xsave_cntxt_size = xstate_ctxt_size(feature_mask); printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n", __func__, xsave_cntxt_size, xfeature_mask); } else { BUG_ON(xfeature_mask != feature_mask); - BUG_ON(xsave_cntxt_size != ebx); + BUG_ON(xsave_cntxt_size != xstate_ctxt_size(feature_mask)); } /* Check XSAVEOPT feature. 
*/ @@ -304,6 +307,42 @@ void xstate_init(bool_t bsp) BUG_ON(!cpu_has_xsaveopt != !(eax & XSTATE_FEATURE_XSAVEOPT)); } +unsigned int xstate_ctxt_size(u64 xcr0) +{ + u32 ebx = 0; + + if ( xcr0 ) + { + u64 act_xcr0 = get_xcr0(); + u32 eax, ecx, edx; + bool_t ok = set_xcr0(xcr0); + + ASSERT(ok); + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + ASSERT(ebx <= ecx); + ok = set_xcr0(act_xcr0); + ASSERT(ok); + } + + return ebx; +} + +int validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask) +{ + if ( (xcr0_accum & ~xfeat_mask) || + (xstate_bv & ~xcr0_accum) || + (xcr0 & ~xcr0_accum) || + !(xcr0 & XSTATE_FP) || + ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) || + ((xcr0_accum & XSTATE_YMM) && !(xcr0_accum & XSTATE_SSE)) ) + return -EINVAL; + + if ( xcr0_accum & ~xfeature_mask ) + return -EOPNOTSUPP; + + return 0; +} + int handle_xsetbv(u32 index, u64 new_bv) { struct vcpu *curr = current; --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -456,9 +456,9 @@ unsigned long pv_guest_cr4_fixup(const s #define pv_guest_cr4_to_real_cr4(v) \ (((v)->arch.pv_vcpu.ctrlreg[4] \ | (mmu_cr4_features \ - & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP)) \ - | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0) \ - | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0)) \ + & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP | \ + X86_CR4_OSXSAVE)) \ + | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \ & ~X86_CR4_DE) #define real_cr4_to_pv_guest_cr4(c) \ ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD \ --- a/xen/include/asm-x86/hvm/hvm.h +++ b/xen/include/asm-x86/hvm/hvm.h @@ -368,7 +368,7 @@ static inline int hvm_event_pending(stru ((nestedhvm_enabled((_v)->domain) && cpu_has_vmx)\ ? X86_CR4_VMXE : 0) | \ (cpu_has_pcid ? X86_CR4_PCIDE : 0) | \ - (xsave_enabled(_v) ? X86_CR4_OSXSAVE : 0)))) + (cpu_has_xsave ? X86_CR4_OSXSAVE : 0)))) /* These exceptions must always be intercepted. 
*/ #define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << TRAP_invalid_op)) --- a/xen/include/asm-x86/xstate.h +++ b/xen/include/asm-x86/xstate.h @@ -34,7 +34,6 @@ #define XSTATE_NONLAZY (XSTATE_LWP) #define XSTATE_LAZY (XSTATE_ALL & ~XSTATE_NONLAZY) -extern unsigned int xsave_cntxt_size; extern u64 xfeature_mask; /* extended state save area */ @@ -77,11 +76,14 @@ uint64_t get_xcr0(void); void xsave(struct vcpu *v, uint64_t mask); void xrstor(struct vcpu *v, uint64_t mask); bool_t xsave_enabled(const struct vcpu *v); +int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, + u64 xfeat_mask); int __must_check handle_xsetbv(u32 index, u64 new_bv); /* extended state init and cleanup functions */ void xstate_free_save_area(struct vcpu *v); int xstate_alloc_save_area(struct vcpu *v); void xstate_init(bool_t bsp); +unsigned int xstate_ctxt_size(u64 xcr0); #endif /* __ASM_XSTATE_H */
Jan Beulich
2013-Aug-30 10:11 UTC
XSAVE save/restore shortcomings (was: [PATCH] x86/xsave: fix migration from xsave-capable to xsave-incapable host)
>>> On 30.08.13 at 11:55, "Jan Beulich" <JBeulich@suse.com> wrote: > Next both HVM and PV save code needed tweaking to not always save the > full state supported by the underlying hardware, but just the parts > that the guest actually used. Similarly the restore code should bail > not just on state being restored that the hardware cannot handle, but > also on inconsistent save state (inconsistent XCR0 settings or size of > saved state not in line with XCR0). > > And finally the PV extended context get/set code needs to use slightly > different logic than the HVM one, as here we can't just key off of > xsave_enabled() (i.e. avoid doing anything if a guest doesn't use > xsave) because the tools use this function to determine host > capabilities as well as read/write vCPU state. The set operation in > particular needs to be capable of cleanly dealing with input that > consists of only the xcr0 and xcr0_accum values (if they're both zero > then no further data is required). I'd like to make clear that the change presented is going to handle only the most trivial cases (where any new xsave state addition adds to the end of the save area). This is an effect of a more fundamental flaw in the original design (which the patch doesn't try to revise, as it's not clear to me yet what the best approach here is): While the XSAVE hardware specification allows for each piece to be individually savable/restorable, both PV and HVM save/restore assume a single monolithic blob. Which is already going to be a problem: AVX-512 as well as MPX conflict with LWP. And obviously it can't be excluded that we'll see CPUs supporting AVX-512 but not MPX as well as guests using the former but not the latter, and neither can be dealt with under the current design. I therefore think that we'll need to start over from scratch with how save/restore is to be implemented, breaking up the current monolithic block into individual pieces. 
But before working on a proposal, I'd first like to hear whether others can see better (and namely less intrusive) ways of dealing with the problem. Jan
On 30/08/13 11:11, Jan Beulich wrote:>>>> On 30.08.13 at 11:55, "Jan Beulich" <JBeulich@suse.com> wrote: >> Next both HVM and PV save code needed tweaking to not always save the >> full state supported by the underlying hardware, but just the parts >> that the guest actually used. Similarly the restore code should bail >> not just on state being restored that the hardware cannot handle, but >> also on inconsistent save state (inconsistent XCR0 settings or size of >> saved state not in line with XCR0). >> >> And finally the PV extended context get/set code needs to use slightly >> different logic than the HVM one, as here we can''t just key off of >> xsave_enabled() (i.e. avoid doing anything if a guest doesn''t use >> xsave) because the tools use this function to determine host >> capabilities as well as read/write vCPU state. The set operation in >> particular needs to be capable of cleanly dealing with input that >> consists of only the xcr0 and xcr0_accum values (if they''re both zero >> then no further data is required). > I''d like to make clear that the change presented is going to handle > only the most trivial cases (where any new xsave state addition > adds to the end of the save area). This is an effect of a more > fundamental flaw in the original design (which the patch doesn''t try > to revise, as it''s not clear to me yet what the best approach here is): > While the XSAVE hardware specification allows for each piece to be > individually savable/restorable, both PV and HVM save/restore > assume a single monolithic blob. Which is already going to be a > problem: AVX-512 as well as MPX conflict with LWP. And obviously > it can''t be excluded that we''ll see CPUs supporting AVX-512 but not > MPX as well as guests using the former but not the latter, and > neither can be dealt with under the current design. 
> > I therefore think that we''ll need to start over from scratch with > how save/restore is to be implemented, breaking up the current > monolithic block into individual pieces. But before working on a > proposal, I''d first like to hear whether others can see better (and > namely less intrusive) ways of dealing with the problem. > > JanYikes that''s a big patch. I think I need to read it a few more times. XenServer has xsave disabled by default, not least because support in Xen was buggy until very recently. Getting it working is on my todo list (behind some other integration issues), but I admit I had not realised that migration was this broken. I will see about testing the patch, particularly to do with migrating between one Xen with the fix and one without, but I cant guarantee to get around to it any time soon. I think that breaking the save/restore it into pieces is the only tenable solution going forward. I cant spot a less intrusive method, but even if there is, hacking around this broken design now might be the easy solution but will be more work in the future as new extensions appear in the future. ~Andrew
Zhang, Yang Z
2013-Sep-03 05:47 UTC
Re: [PATCH] x86/xsave: fix migration from xsave-capable to xsave-incapable host
Jan Beulich wrote on 2013-08-30:> With CPUID features suitably masked this is supposed to work, but was > completely broken (i.e. the case wasn''t even considered when the > original xsave save/restore code was written). > > First of all, xsave_enabled() wrongly returned the value of > cpu_has_xsave, i.e. not even taking into consideration attributes of > the vCPU in question. Instead this function ought to check whether the > guest ever enabled xsave support (by writing a [non-zero] value to > XCR0). As a result of this, a vCPU''s xcr0 and xcr0_accum must no longer > be initialized to XSTATE_FP_SSE (since that''s a valid value a guest > could write to XCR0), and the xsave/xrstor as well as the context > switch code need to suitably account for this (by always enforcing at > least this part of the state to be saved/loaded). > > This involves undoing large parts of c/s 22945:13a7d1f7f62c ("x86: add > strictly sanity check for XSAVE/XRSTOR") - we need to cleanly > distinguish between hardware capabilities and vCPU used features. > > Next both HVM and PV save code needed tweaking to not always save the > full state supported by the underlying hardware, but just the parts > that the guest actually used. Similarly the restore code should bail > not just on state being restored that the hardware cannot handle, but > also on inconsistent save state (inconsistent XCR0 settings or size of > saved state not in line with XCR0). > > And finally the PV extended context get/set code needs to use slightly > different logic than the HVM one, as here we can''t just key off of > xsave_enabled() (i.e. avoid doing anything if a guest doesn''t use > xsave) because the tools use this function to determine host > capabilities as well as read/write vCPU state. The set operation in > particular needs to be capable of cleanly dealing with input that > consists of only the xcr0 and xcr0_accum values (if they''re both zero > then no further data is required). 
> > While for things to work correctly both sides (saving _and_ restoring > host) need to run with the fixed code, afaict no breakage should occur > if either side isn''t up to date (other than the breakage that this > patch attempts to fix).It looks good. But the title confused me before I saw your reply. Reviewed-by: Yang Zhang <yang.z.zhang@intel.com>> > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > Please note that the development and testing of this was done on 4.1.x, > as that''s where to issue was found. The xsave code being quite > different between then and now, the porting was not really straight > forward, but in the end the biggest difference was that the 4.1.x code > needs more changes than presented here, due to there FPU context and > XSAVE area not sharing the same memory block. Hence I think/hope I did > all the porting over correctly, but I had no setup available where I > could have myself fully tested all the scenarios. > > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -618,7 +618,7 @@ unsigned long pv_guest_cr4_fixup(const s > hv_cr4_mask &= ~X86_CR4_DE; if ( cpu_has_fsgsbase && > !is_pv_32bit_domain(v->domain) ) hv_cr4_mask &> ~X86_CR4_FSGSBASE; > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave ) > hv_cr4_mask &= ~X86_CR4_OSXSAVE; > if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) ) @@ > -1351,9 +1351,13 @@ static void __context_switch(void) if ( > !is_idle_vcpu(n) ) { > memcpy(stack_regs, &n->arch.user_regs, > CTXT_SWITCH_STACK_BYTES); > - if ( xsave_enabled(n) && n->arch.xcr0 != get_xcr0() && > - !set_xcr0(n->arch.xcr0) ) > - BUG(); > + if ( cpu_has_xsave ) > + { > + u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE; > + > + if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) ) > + BUG(); > + } > vcpu_restore_fpu_eager(n); > n->arch.ctxt_switch_to(n); > } > --- a/xen/arch/x86/domctl.c > +++ b/xen/arch/x86/domctl.c > @@ -1047,11 +1047,8 @@ long arch_do_domctl( > struct xen_domctl_vcpuextstate *evc; > struct vcpu *v; > 
uint32_t offset = 0; > - uint64_t _xfeature_mask = 0; > - uint64_t _xcr0, _xcr0_accum; > - void *receive_buf = NULL, *_xsave_area; > > -#define PV_XSAVE_SIZE (2 * sizeof(uint64_t) + xsave_cntxt_size) > +#define PV_XSAVE_SIZE(xcr0) (2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0)) > > evc = &domctl->u.vcpuextstate; > @@ -1062,15 +1059,16 @@ long arch_do_domctl( > > if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate ) > { > + unsigned int size = PV_XSAVE_SIZE(v->arch.xcr0_accum); > + > if ( !evc->size && !evc->xfeature_mask ) > { > evc->xfeature_mask = xfeature_mask; > - evc->size = PV_XSAVE_SIZE; > + evc->size = size; > ret = 0; > goto vcpuextstate_out; > } > - if ( evc->size != PV_XSAVE_SIZE || > - evc->xfeature_mask != xfeature_mask ) > + if ( evc->size != size || evc->xfeature_mask != xfeature_mask ) > { > ret = -EINVAL; > goto vcpuextstate_out; > @@ -1093,7 +1091,7 @@ long arch_do_domctl( > offset += sizeof(v->arch.xcr0_accum); > if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer, > offset, (void > *)v->arch.xsave_area, > - xsave_cntxt_size) ) > + size - 2 * sizeof(uint64_t)) ) > { > ret = -EFAULT; > goto vcpuextstate_out; > @@ -1101,13 +1099,14 @@ long arch_do_domctl( > } > else > { > - ret = -EINVAL; > + void *receive_buf; > + uint64_t _xcr0, _xcr0_accum; > + const struct xsave_struct *_xsave_area; > > - _xfeature_mask = evc->xfeature_mask; - /* xsave > context must be restored on compatible target CPUs */ - if ( > (_xfeature_mask & xfeature_mask) != _xfeature_mask ) - > goto vcpuextstate_out; - if ( evc->size > PV_XSAVE_SIZE || > evc->size < 2 * sizeof(uint64_t) ) + ret = -EINVAL; + > if ( evc->size < 2 * sizeof(uint64_t) || + evc->size > > 2 * sizeof(uint64_t) + + > xstate_ctxt_size(xfeature_mask) ) > goto vcpuextstate_out; > receive_buf = xmalloc_bytes(evc->size); @@ -1128,20 > +1127,30 @@ long arch_do_domctl( _xcr0_accum = *(uint64_t > *)(receive_buf + sizeof(uint64_t)); _xsave_area > receive_buf + 2 * sizeof(uint64_t); > - if ( !(_xcr0 & XSTATE_FP) || 
_xcr0 & ~xfeature_mask ) > + if ( _xcr0_accum ) > { > - xfree(receive_buf); > - goto vcpuextstate_out; > + if ( evc->size >= 2 * sizeof(uint64_t) + > XSTATE_AREA_MIN_SIZE ) > + ret = validate_xstate(_xcr0, _xcr0_accum, > + > _xsave_area->xsave_hdr.xstate_bv, > + evc->xfeature_mask); > } > - if ( (_xcr0 & _xcr0_accum) != _xcr0 ) > + else if ( !_xcr0 ) > + ret = 0; > + if ( ret ) > { > xfree(receive_buf); > goto vcpuextstate_out; > } > - v->arch.xcr0 = _xcr0; > - v->arch.xcr0_accum = _xcr0_accum; > - memcpy(v->arch.xsave_area, _xsave_area, evc->size - 2 * > sizeof(uint64_t) ); > + if ( evc->size <= PV_XSAVE_SIZE(_xcr0_accum) ) > + { > + v->arch.xcr0 = _xcr0; > + v->arch.xcr0_accum = _xcr0_accum; > + memcpy(v->arch.xsave_area, _xsave_area, > + evc->size - 2 * sizeof(uint64_t)); > + } > + else > + ret = -EINVAL; > > xfree(receive_buf); > } > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -906,14 +906,12 @@ static int hvm_load_cpu_ctxt(struct doma > hvm_set_segment_register(v, x86_seg_ldtr, &seg); > > /* In case xsave-absent save file is restored on a xsave-capable host */ > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave && !xsave_enabled(v) ) > { > struct xsave_struct *xsave_area = v->arch.xsave_area; > > memcpy(v->arch.xsave_area, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); > xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; > - v->arch.xcr0_accum = XSTATE_FP_SSE; > - v->arch.xcr0 = XSTATE_FP_SSE; > } > else > memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); > @@ -957,7 +955,9 @@ static int hvm_load_cpu_ctxt(struct doma > HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, > hvm_load_cpu_ctxt, > 1, HVMSR_PER_VCPU); > -#define HVM_CPU_XSAVE_SIZE (3 * sizeof(uint64_t) + xsave_cntxt_size) > +#define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \ > + save_area) + \ > + xstate_ctxt_size(xcr0)) > > static int hvm_save_cpu_xsave_states(struct domain *d, > hvm_domain_context_t *h) { > @@ -969,20 +969,20 @@ static int 
hvm_save_cpu_xsave_states(str > > for_each_vcpu ( d, v ) > { > + unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum); > + > if ( !xsave_enabled(v) ) > continue; > - if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, > HVM_CPU_XSAVE_SIZE) ) > + if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) ) > return 1; > ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; > - h->cur += HVM_CPU_XSAVE_SIZE; > - memset(ctxt, 0, HVM_CPU_XSAVE_SIZE); > + h->cur += size; > > ctxt->xfeature_mask = xfeature_mask; > ctxt->xcr0 = v->arch.xcr0; > ctxt->xcr0_accum = v->arch.xcr0_accum; > - if ( v->fpu_initialised ) > - memcpy(&ctxt->save_area, > - v->arch.xsave_area, xsave_cntxt_size); > + memcpy(&ctxt->save_area, v->arch.xsave_area, > + size - offsetof(struct hvm_hw_cpu_xsave, save_area)); > } > > return 0; > @@ -990,11 +990,11 @@ static int hvm_save_cpu_xsave_states(str > > static int hvm_load_cpu_xsave_states(struct domain *d, > hvm_domain_context_t *h) { > - int vcpuid; > + unsigned int vcpuid, size; > + int err; > struct vcpu *v; > struct hvm_hw_cpu_xsave *ctxt; > struct hvm_save_descriptor *desc; > - uint64_t _xfeature_mask; > > /* Which vcpu is this? */ vcpuid = hvm_load_instance(h); @@ > -1006,47 +1006,74 @@ static int hvm_load_cpu_xsave_states(str } > > /* Fails since we can''t restore an img saved on xsave-capable host. 
*/ > - if ( !xsave_enabled(v) ) > - return -EINVAL; > + if ( !cpu_has_xsave ) > + return -EOPNOTSUPP; > > /* Customized checking for entry since our entry is of variable length */ > desc = (struct hvm_save_descriptor *)&h->data[h->cur]; > if ( sizeof (*desc) > h->size - h->cur) > { > printk(XENLOG_G_WARNING > - "HVM%d restore: not enough data left to read descriptor" > - "for type %u\n", d->domain_id, CPU_XSAVE_CODE); - > return -1; + "HVM%d.%d restore: not enough data left to > read xsave descriptor\n", + d->domain_id, vcpuid); + > return -ENODATA; > } > if ( desc->length + sizeof (*desc) > h->size - h->cur) > { > printk(XENLOG_G_WARNING > - "HVM%d restore: not enough data left to read %u bytes " > - "for type %u\n", d->domain_id, desc->length, > CPU_XSAVE_CODE); - return -1; + "HVM%d.%d restore: > not enough data left to read %u xsave bytes\n", + > d->domain_id, vcpuid, desc->length); + return -ENODATA; + } + > if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) + + > XSTATE_AREA_MIN_SIZE ) + { + > printk(XENLOG_G_WARNING + "HVM%d.%d restore mismatch: > xsave length %u < %zu\n", + d->domain_id, vcpuid, > desc->length, + offsetof(struct hvm_hw_cpu_xsave, + > save_area) + XSTATE_AREA_MIN_SIZE); + return > -EINVAL; > } > - if ( CPU_XSAVE_CODE != desc->typecode || (desc->length > > HVM_CPU_XSAVE_SIZE) ) > + size = HVM_CPU_XSAVE_SIZE(xfeature_mask); > + if ( desc->length > size ) > { > printk(XENLOG_G_WARNING > - "HVM%d restore mismatch: expected type %u with max > length %u, " - "saw type %u length %u\n", d->domain_id, > CPU_XSAVE_CODE, - (unsigned int)HVM_CPU_XSAVE_SIZE, - > desc->typecode, desc->length); - return -1; + > "HVM%d.%d restore mismatch: xsave length %u > %u\n", + > d->domain_id, vcpuid, desc->length, size); + return -EOPNOTSUPP; > } > h->cur += sizeof (*desc); > - /* Checking finished */ > > ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; > h->cur += desc->length; > - _xfeature_mask = ctxt->xfeature_mask; - if ( (_xfeature_mask & > 
xfeature_mask) != _xfeature_mask ) - return -EINVAL; + err > validate_xstate(ctxt->xcr0, ctxt->xcr0_accum, + > ctxt->save_area.xsave_hdr.xstate_bv, + > ctxt->xfeature_mask); + if ( err ) + { + > printk(XENLOG_G_WARNING + "HVM%d.%d restore: inconsistent > xsave state (feat=%#"PRIx64 + " accum=%#"PRIx64" > xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n", + d->domain_id, > vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum, + > ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err); + return > err; + } + size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum); + if ( > desc->length > size ) + { + printk(XENLOG_G_WARNING + > "HVM%d.%d restore mismatch: xsave length %u > %u\n", + > d->domain_id, vcpuid, desc->length, size); + return -EOPNOTSUPP; > + } + /* Checking finished */ > > v->arch.xcr0 = ctxt->xcr0; > v->arch.xcr0_accum = ctxt->xcr0_accum; > - memcpy(v->arch.xsave_area, &ctxt->save_area, xsave_cntxt_size); > + memcpy(v->arch.xsave_area, &ctxt->save_area, > + desc->length - offsetof(struct hvm_hw_cpu_xsave, save_area)); > > return 0; > } > @@ -1060,7 +1087,8 @@ static int __init __hvm_register_CPU_XSA > "CPU_XSAVE", > hvm_save_cpu_xsave_states, > hvm_load_cpu_xsave_states, > - HVM_CPU_XSAVE_SIZE + sizeof (struct > hvm_save_descriptor), + > HVM_CPU_XSAVE_SIZE(xfeature_mask) + + > sizeof(struct hvm_save_descriptor), > HVMSR_PER_VCPU); > return 0; > } > @@ -2767,7 +2795,7 @@ void hvm_cpuid(unsigned int input, unsig > __clear_bit(X86_FEATURE_APIC & 31, edx); > /* Fix up OSXSAVE. */ > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave ) > *ecx |= (v->arch.hvm_vcpu.guest_cr[4] & > X86_CR4_OSXSAVE) ? > cpufeat_mask(X86_FEATURE_OSXSAVE) : 0; > --- a/xen/arch/x86/hvm/vmx/vmcs.c > +++ b/xen/arch/x86/hvm/vmx/vmcs.c > @@ -944,8 +944,7 @@ static int construct_vmcs(struct vcpu *v > /* Host control registers. */ > v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; > __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); > - __vmwrite(HOST_CR4, - mmu_cr4_features | > (xsave_enabled(v) ? 
X86_CR4_OSXSAVE : 0)); + __vmwrite(HOST_CR4, > mmu_cr4_features); > > /* Host CS:RIP. */ > __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); > --- a/xen/arch/x86/i387.c > +++ b/xen/arch/x86/i387.c > @@ -38,14 +38,15 @@ static inline void fpu_xrstor(struct vcp > { > bool_t ok; > + ASSERT(v->arch.xsave_area); > /* > * XCR0 normally represents what guest OS set. In case of Xen itself, > - * we set all supported feature mask before doing save/restore. > + * we set the accumulated feature mask before doing save/restore. > */ > - ok = set_xcr0(v->arch.xcr0_accum); > + ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); > ASSERT(ok); > xrstor(v, mask); > - ok = set_xcr0(v->arch.xcr0); > + ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); > ASSERT(ok); > } > @@ -137,13 +138,15 @@ static inline void fpu_xsave(struct vcpu > { > bool_t ok; > - /* XCR0 normally represents what guest OS set. In case of Xen itself, > - * we set all accumulated feature mask before doing save/restore. > + ASSERT(v->arch.xsave_area); > + /* > + * XCR0 normally represents what guest OS set. In case of Xen itself, > + * we set the accumulated feature mask before doing save/restore. > */ > - ok = set_xcr0(v->arch.xcr0_accum); > + ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); > ASSERT(ok); > xsave(v, v->arch.nonlazy_xstate_used ? XSTATE_ALL : XSTATE_LAZY); > - ok = set_xcr0(v->arch.xcr0); > + ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); > ASSERT(ok); > } > @@ -232,7 +235,7 @@ void vcpu_restore_fpu_lazy(struct vcpu * > if ( v->fpu_dirtied ) > return; > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave ) > fpu_xrstor(v, XSTATE_LAZY); > else if ( v->fpu_initialised ) { @@ -262,7 +265,7 @@ void > vcpu_save_fpu(struct vcpu *v) /* This can happen, if a > paravirtualised guest OS has set its CR0.TS. 
*/ clts(); > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave ) > fpu_xsave(v); else if ( cpu_has_fxsr ) fpu_fxsave(v); > --- a/xen/arch/x86/traps.c > +++ b/xen/arch/x86/traps.c > @@ -817,7 +817,7 @@ static void pv_cpuid(struct cpu_user_reg > __clear_bit(X86_FEATURE_PDCM % 32, &c); > __clear_bit(X86_FEATURE_PCID % 32, &c); > __clear_bit(X86_FEATURE_DCA % 32, &c); > - if ( !xsave_enabled(current) ) > + if ( !cpu_has_xsave ) > { > __clear_bit(X86_FEATURE_XSAVE % 32, &c); > __clear_bit(X86_FEATURE_AVX % 32, &c); > @@ -842,7 +842,7 @@ static void pv_cpuid(struct cpu_user_reg > break; > case 0x0000000d: /* XSAVE */ > - if ( !xsave_enabled(current) ) > + if ( !cpu_has_xsave ) > goto unsupported; > break; > --- a/xen/arch/x86/xstate.c > +++ b/xen/arch/x86/xstate.c > @@ -21,7 +21,7 @@ bool_t __read_mostly cpu_has_xsaveopt; > * the supported and enabled features on the processor, including the > * XSAVE.HEADER. We only enable XCNTXT_MASK that we have known. > */ > -u32 xsave_cntxt_size; > +static u32 __read_mostly xsave_cntxt_size; > > /* A 64-bit bitmask of the XSAVE/XRSTOR features supported by processor. 
*/ > u64 xfeature_mask; > @@ -206,13 +206,13 @@ void xrstor(struct vcpu *v, uint64_t mas > > bool_t xsave_enabled(const struct vcpu *v) > { > - if ( cpu_has_xsave ) > - { > - ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); > - ASSERT(v->arch.xsave_area); > - } > + if ( !cpu_has_xsave ) > + return 0; > > - return cpu_has_xsave; > + ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); > + ASSERT(v->arch.xsave_area); > + > + return !!v->arch.xcr0_accum; > } > > int xstate_alloc_save_area(struct vcpu *v) > @@ -238,8 +238,8 @@ int xstate_alloc_save_area(struct vcpu * > save_area->fpu_sse.mxcsr = MXCSR_DEFAULT; > > v->arch.xsave_area = save_area; > - v->arch.xcr0 = XSTATE_FP_SSE; > - v->arch.xcr0_accum = XSTATE_FP_SSE; > + v->arch.xcr0 = 0; > + v->arch.xcr0_accum = 0; > > return 0; > } > @@ -257,7 +257,11 @@ void xstate_init(bool_t bsp) > u64 feature_mask; > > if ( boot_cpu_data.cpuid_level < XSTATE_CPUID ) > + { > + BUG_ON(!bsp); > + setup_clear_cpu_cap(X86_FEATURE_XSAVE); > return; > + } > > cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); > @@ -277,7 +281,6 @@ void xstate_init(bool_t bsp) > set_in_cr4(X86_CR4_OSXSAVE); > if ( !set_xcr0(feature_mask) ) > BUG(); > - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); > > if ( bsp ) > { > @@ -286,14 +289,14 @@ void xstate_init(bool_t bsp) > * xsave_cntxt_size is the max size required by enabled > features. * We know FP/SSE and YMM about eax, and nothing > about edx at present. */ > - xsave_cntxt_size = ebx; > + xsave_cntxt_size = xstate_ctxt_size(feature_mask); > printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n", > __func__, xsave_cntxt_size, xfeature_mask); > } > else > { > BUG_ON(xfeature_mask != feature_mask); > - BUG_ON(xsave_cntxt_size != ebx); > + BUG_ON(xsave_cntxt_size != xstate_ctxt_size(feature_mask)); > } > > /* Check XSAVEOPT feature. 
*/ > @@ -304,6 +307,42 @@ void xstate_init(bool_t bsp) > BUG_ON(!cpu_has_xsaveopt != !(eax & > XSTATE_FEATURE_XSAVEOPT)); > } > +unsigned int xstate_ctxt_size(u64 xcr0) +{ + u32 ebx = 0; + + if > ( xcr0 ) + { + u64 act_xcr0 = get_xcr0(); + u32 eax, > ecx, edx; + bool_t ok = set_xcr0(xcr0); + + ASSERT(ok); + > cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + > ASSERT(ebx <= ecx); + ok = set_xcr0(act_xcr0); + > ASSERT(ok); + } + + return ebx; +} + +int validate_xstate(u64 > xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask) +{ + if ( > (xcr0_accum & ~xfeat_mask) || + (xstate_bv & ~xcr0_accum) || + > (xcr0 & ~xcr0_accum) || + !(xcr0 & XSTATE_FP) || + > ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) || + ((xcr0_accum > & XSTATE_YMM) && !(xcr0_accum & XSTATE_SSE)) ) + return -EINVAL; > + + if ( xcr0_accum & ~xfeature_mask ) + return -EOPNOTSUPP; + > + return 0; +} + > int handle_xsetbv(u32 index, u64 new_bv) > { > struct vcpu *curr = current; > --- a/xen/include/asm-x86/domain.h > +++ b/xen/include/asm-x86/domain.h > @@ -456,9 +456,9 @@ unsigned long pv_guest_cr4_fixup(const s > #define pv_guest_cr4_to_real_cr4(v) \ > (((v)->arch.pv_vcpu.ctrlreg[4] \ > | (mmu_cr4_features \ > - & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP)) \ > - | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0) \ > - | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0)) \ > + & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP | \ > + X86_CR4_OSXSAVE)) \ > + | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \ > & ~X86_CR4_DE) > #define real_cr4_to_pv_guest_cr4(c) \ > ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD \ > --- a/xen/include/asm-x86/hvm/hvm.h > +++ b/xen/include/asm-x86/hvm/hvm.h > @@ -368,7 +368,7 @@ static inline int hvm_event_pending(stru > ((nestedhvm_enabled((_v)->domain) && cpu_has_vmx)\ > ? X86_CR4_VMXE : 0) | \ > (cpu_has_pcid ? X86_CR4_PCIDE : 0) | \ > - (xsave_enabled(_v) ? X86_CR4_OSXSAVE : 0)))) > + (cpu_has_xsave ? X86_CR4_OSXSAVE : 0)))) > > /* These exceptions must always be intercepted. 
*/ > #define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << > TRAP_invalid_op)) > --- a/xen/include/asm-x86/xstate.h > +++ b/xen/include/asm-x86/xstate.h > @@ -34,7 +34,6 @@ > #define XSTATE_NONLAZY (XSTATE_LWP) > #define XSTATE_LAZY (XSTATE_ALL & ~XSTATE_NONLAZY) > -extern unsigned int xsave_cntxt_size; > extern u64 xfeature_mask; > > /* extended state save area */ @@ -77,11 +76,14 @@ uint64_t > get_xcr0(void); void xsave(struct vcpu *v, uint64_t mask); void > xrstor(struct vcpu *v, uint64_t mask); bool_t xsave_enabled(const > struct vcpu *v); > +int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, > + u64 xfeat_mask); > int __must_check handle_xsetbv(u32 index, u64 new_bv); > > /* extended state init and cleanup functions */ > void xstate_free_save_area(struct vcpu *v); > int xstate_alloc_save_area(struct vcpu *v); > void xstate_init(bool_t bsp); > +unsigned int xstate_ctxt_size(u64 xcr0); > > #endif /* __ASM_XSTATE_H */ > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-develBest regards, Yang
Il 30/08/2013 12:11, Jan Beulich ha scritto:> I''d like to make clear that the change presented is going to handle > only the most trivial cases (where any new xsave state addition > adds to the end of the save area). This is an effect of a more > fundamental flaw in the original design (which the patch doesn''t try > to revise, as it''s not clear to me yet what the best approach here is): > While the XSAVE hardware specification allows for each piece to be > individually savable/restorable, both PV and HVM save/restore > assume a single monolithic blob. Which is already going to be a > problem: AVX-512 as well as MPX conflict with LWP. And obviously > it can''t be excluded that we''ll see CPUs supporting AVX-512 but not > MPX as well as guests using the former but not the latter, and > neither can be dealt with under the current design.This should not be a problem, the manual says "The layout of the XSAVE/XRSTOR save area is fixed and may contain non-contiguous individual save areas. The XSAVE/XRSTOR save area is not compacted if some features are not saved or are not supported by the processor and/or by system software". Note "by the processor": the way I read this, size may vary (which is why CPUID.0Dh exists, basically), but offsets are guaranteed to be constant. Thus the only problem is LWP. Given AMD''s current non-involvement in x86, it may be simpler to avoid the problem completely by not implementing virtual LWP... Paolo> I therefore think that we''ll need to start over from scratch with > how save/restore is to be implemented, breaking up the current > monolithic block into individual pieces. But before working on a > proposal, I''d first like to hear whether others can see better (and > namely less intrusive) ways of dealing with the problem. > > Jan >
>>> On 05.09.13 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote: > Il 30/08/2013 12:11, Jan Beulich ha scritto: >> I''d like to make clear that the change presented is going to handle >> only the most trivial cases (where any new xsave state addition >> adds to the end of the save area). This is an effect of a more >> fundamental flaw in the original design (which the patch doesn''t try >> to revise, as it''s not clear to me yet what the best approach here is): >> While the XSAVE hardware specification allows for each piece to be >> individually savable/restorable, both PV and HVM save/restore >> assume a single monolithic blob. Which is already going to be a >> problem: AVX-512 as well as MPX conflict with LWP. And obviously >> it can''t be excluded that we''ll see CPUs supporting AVX-512 but not >> MPX as well as guests using the former but not the latter, and >> neither can be dealt with under the current design. > > This should not be a problem, the manual says "The layout of the > XSAVE/XRSTOR save area is fixed and may contain non-contiguous > individual save areas. The XSAVE/XRSTOR save area is not compacted if > some features are not saved or are not supported by the processor and/or > by system software". Note "by the processor": the way I read this, size > may vary (which is why CPUID.0Dh exists, basically), but offsets are > guaranteed to be constant.Then why would there be a way to retrieve these offsets via CPUID?> Thus the only problem is LWP. Given AMD''s current non-involvement in > x86, it may be simpler to avoid the problem completely by not > implementing virtual LWP...For which it is too late: Both 4.2 and 4.3 already have LWP support. Jan
Il 05/09/2013 14:01, Jan Beulich ha scritto:>>>> On 05.09.13 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote: >> Il 30/08/2013 12:11, Jan Beulich ha scritto: >>> I''d like to make clear that the change presented is going to handle >>> only the most trivial cases (where any new xsave state addition >>> adds to the end of the save area). This is an effect of a more >>> fundamental flaw in the original design (which the patch doesn''t try >>> to revise, as it''s not clear to me yet what the best approach here is): >>> While the XSAVE hardware specification allows for each piece to be >>> individually savable/restorable, both PV and HVM save/restore >>> assume a single monolithic blob. Which is already going to be a >>> problem: AVX-512 as well as MPX conflict with LWP. And obviously >>> it can''t be excluded that we''ll see CPUs supporting AVX-512 but not >>> MPX as well as guests using the former but not the latter, and >>> neither can be dealt with under the current design. >> >> This should not be a problem, the manual says "The layout of the >> XSAVE/XRSTOR save area is fixed and may contain non-contiguous >> individual save areas. The XSAVE/XRSTOR save area is not compacted if >> some features are not saved or are not supported by the processor and/or >> by system software". Note "by the processor": the way I read this, size >> may vary (which is why CPUID.0Dh exists, basically), but offsets are >> guaranteed to be constant. > > Then why would there be a way to retrieve these offsets via CPUID?Perhaps Intel envisioned that the OS could blindly do XSAVEOPT on all detected elements, including yet-unknown features.>> Thus the only problem is LWP. Given AMD''s current non-involvement in >> x86, it may be simpler to avoid the problem completely by not >> implementing virtual LWP... > > For which it is too late: Both 4.2 and 4.3 already have LWP support.Which guests are actually using it? 
But in the end it doesn't really matter that AMD and Intel have incompatible XSAVE formats, as long as each vendor remains compatible with itself. Again, AMD is not doing new x86 development; hence, all that you'd lose is AMD->Intel migration the day Intel implements LWP, but only on guests that use LWP (Intel->AMD migration would probably be broken anyway due to the lack of AVX-512 on AMD). Paolo
>>> On 05.09.13 at 15:58, Paolo Bonzini <pbonzini@redhat.com> wrote: > Il 05/09/2013 14:01, Jan Beulich ha scritto: >>>>> On 05.09.13 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote: >>> Il 30/08/2013 12:11, Jan Beulich ha scritto: >>>> I''d like to make clear that the change presented is going to handle >>>> only the most trivial cases (where any new xsave state addition >>>> adds to the end of the save area). This is an effect of a more >>>> fundamental flaw in the original design (which the patch doesn''t try >>>> to revise, as it''s not clear to me yet what the best approach here is): >>>> While the XSAVE hardware specification allows for each piece to be >>>> individually savable/restorable, both PV and HVM save/restore >>>> assume a single monolithic blob. Which is already going to be a >>>> problem: AVX-512 as well as MPX conflict with LWP. And obviously >>>> it can''t be excluded that we''ll see CPUs supporting AVX-512 but not >>>> MPX as well as guests using the former but not the latter, and >>>> neither can be dealt with under the current design. >>> >>> This should not be a problem, the manual says "The layout of the >>> XSAVE/XRSTOR save area is fixed and may contain non-contiguous >>> individual save areas. The XSAVE/XRSTOR save area is not compacted if >>> some features are not saved or are not supported by the processor and/or >>> by system software". Note "by the processor": the way I read this, size >>> may vary (which is why CPUID.0Dh exists, basically), but offsets are >>> guaranteed to be constant. >> >> Then why would there be a way to retrieve these offsets via CPUID? > > Perhaps Intel envisioned that the OS could blindly do XSAVEOPT on all > detected elements, including yet-unknown features.For that all you''d need would be the total size, not the individual offsets.>>> Thus the only problem is LWP. Given AMD''s current non-involvement in >>> x86, it may be simpler to avoid the problem completely by not >>> implementing virtual LWP... 
>> >> For which it is too late: Both 4.2 and 4.3 already have LWP support. > > Which guests are actually using it?Who knows what guests (particularly HVM ones) use?> But in the end it doesn''t really matter that AMD and Intel have > incompatible XSAVE formats, as long as each vendor remains compatible > with itself. Again AMD is not doing new x86 development; hence, all > that you''d lose is AMD->Intel migration the day Intel implements LWP, > but only on guests that use LWP (Intel->AMD probably would be broken > anyway due to lack of AVX-512 on AMD).Yes, if Intel is going to confirm that the layout is going to be fixed now and forever, I''ll tell them that''s an awful design (along the lines of them requiring a 32-bit OS to enable XCR0[7] in order to use AVX-512, no matter that the guest can''t possibly make use of these registers, and thus the tail half or so of the save area will remain entirely unused, but needs to be allocated), but we might indeed get along without redesign as long as we e.g. don''t care about (or allow) migration between LWP-capable and LWP-incapable hosts. Jan
Jan Beulich wrote on 2013-09-05:>>>> On 05.09.13 at 15:58, Paolo Bonzini <pbonzini@redhat.com> wrote: >> Il 05/09/2013 14:01, Jan Beulich ha scritto: >>>>>> On 05.09.13 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote: >>>> Il 30/08/2013 12:11, Jan Beulich ha scritto: >>>>> I''d like to make clear that the change presented is going to >>>>> handle only the most trivial cases (where any new xsave state >>>>> addition adds to the end of the save area). This is an effect of >>>>> a more fundamental flaw in the original design (which the patch >>>>> doesn''t try to revise, as it''s not clear to me yet what the best approach here is): >>>>> While the XSAVE hardware specification allows for each piece to >>>>> be individually savable/restorable, both PV and HVM save/restore >>>>> assume a single monolithic blob. Which is already going to be a >>>>> problem: AVX-512 as well as MPX conflict with LWP. And obviously >>>>> it can''t be excluded that we''ll see CPUs supporting AVX-512 but >>>>> not MPX as well as guests using the former but not the latter, >>>>> and neither can be dealt with under the current design. >>>> >>>> This should not be a problem, the manual says "The layout of the >>>> XSAVE/XRSTOR save area is fixed and may contain non-contiguous >>>> individual save areas. The XSAVE/XRSTOR save area is not >>>> compacted if some features are not saved or are not supported by >>>> the processor and/or by system software". Note "by the >>>> processor": the way I read this, size may vary (which is why >>>> CPUID.0Dh exists, basically), but offsets are guaranteed to be constant. >>> >>> Then why would there be a way to retrieve these offsets via CPUID? >> >> Perhaps Intel envisioned that the OS could blindly do XSAVEOPT on >> all detected elements, including yet-unknown features. > > For that all you''d need would be the total size, not the individual offsets.Agree. 
If the offset is fixed (it is true currently), then CPU don''t need to expose the offset through CPUID. It may vary for future feature.> >>>> Thus the only problem is LWP. Given AMD''s current non-involvement >>>> in x86, it may be simpler to avoid the problem completely by not >>>> implementing virtual LWP... >>> >>> For which it is too late: Both 4.2 and 4.3 already have LWP support. >> >> Which guests are actually using it? > > Who knows what guests (particularly HVM ones) use? > >> But in the end it doesn''t really matter that AMD and Intel have >> incompatible XSAVE formats, as long as each vendor remains >> compatible with itself. Again AMD is not doing new x86 development; >> hence, all that you''d lose is AMD->Intel migration the day Intel >> implements LWP, but only on guests that use LWP (Intel->AMD probably >> would be broken anyway due to lack of AVX-512 on AMD). > > Yes, if Intel is going to confirm that the layout is going to be fixed > now and forever, I''ll tell them that''s an awful design (along the > lines of them requiring a 32-bit OS to enable XCR0[7] in order to use > AVX-512, no matter that the guest can''t possibly make use of these > registers, and thus the tail half or so of the save area will remain > entirely unused, but needs to be allocated), but we might indeed get > along without redesign as long as we e.g. don''t care about (or allow) migration between LWP-capable and LWP-incapable hosts. > > Jan > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-develBest regards, Yang
>>> On 06.09.13 at 05:03, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote: > Jan Beulich wrote on 2013-09-05: >>>>> On 05.09.13 at 15:58, Paolo Bonzini <pbonzini@redhat.com> wrote: >>> Il 05/09/2013 14:01, Jan Beulich ha scritto: >>>>>>> On 05.09.13 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote: >>>>> Il 30/08/2013 12:11, Jan Beulich ha scritto: >>>>>> I''d like to make clear that the change presented is going to >>>>>> handle only the most trivial cases (where any new xsave state >>>>>> addition adds to the end of the save area). This is an effect of >>>>>> a more fundamental flaw in the original design (which the patch >>>>>> doesn''t try to revise, as it''s not clear to me yet what the best approach > here is): >>>>>> While the XSAVE hardware specification allows for each piece to >>>>>> be individually savable/restorable, both PV and HVM save/restore >>>>>> assume a single monolithic blob. Which is already going to be a >>>>>> problem: AVX-512 as well as MPX conflict with LWP. And obviously >>>>>> it can''t be excluded that we''ll see CPUs supporting AVX-512 but >>>>>> not MPX as well as guests using the former but not the latter, >>>>>> and neither can be dealt with under the current design. >>>>> >>>>> This should not be a problem, the manual says "The layout of the >>>>> XSAVE/XRSTOR save area is fixed and may contain non-contiguous >>>>> individual save areas. The XSAVE/XRSTOR save area is not >>>>> compacted if some features are not saved or are not supported by >>>>> the processor and/or by system software". Note "by the >>>>> processor": the way I read this, size may vary (which is why >>>>> CPUID.0Dh exists, basically), but offsets are guaranteed to be constant. >>>> >>>> Then why would there be a way to retrieve these offsets via CPUID? >>> >>> Perhaps Intel envisioned that the OS could blindly do XSAVEOPT on >>> all detected elements, including yet-unknown features. 
>> >> For that all you''d need would be the total size, not the individual offsets. > Agree. If the offset is fixed (it is true currently), then CPU don''t need to > expose the offset through CPUID. It may vary for future feature.So here we are: Paolo quotes the doc stating that the layout is fixed (and I agree with his interpretation of that statement), and an Intel representative states that "it is true currently". And the fact that the offsets are to be retrieved also suggests that they''re not fixed now and forever (supported by Vol 3 section 13.6 reading "The number of save areas, the offset and the size of each save area is enumerated by CPUID leaf function 0DH"). Looks like the documentation isn''t really consistent in itself... Jan
Jan Beulich wrote on 2013-09-06:>>>> On 06.09.13 at 05:03, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote: >> Jan Beulich wrote on 2013-09-05: >>>>>> On 05.09.13 at 15:58, Paolo Bonzini <pbonzini@redhat.com> wrote: >>>> Il 05/09/2013 14:01, Jan Beulich ha scritto: >>>>>>>> On 05.09.13 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote: >>>>>> Il 30/08/2013 12:11, Jan Beulich ha scritto: >>>>>>> I''d like to make clear that the change presented is going to >>>>>>> handle only the most trivial cases (where any new xsave state >>>>>>> addition adds to the end of the save area). This is an effect of a >>>>>>> more fundamental flaw in the original design (which the patch >>>>>>> doesn''t try to revise, as it''s not clear to me yet what the best >>>>>>> approach here is): While the XSAVE hardware specification allows >>>>>>> for each piece to be individually savable/restorable, both PV and >>>>>>> HVM save/restore assume a single monolithic blob. Which is already >>>>>>> going to be a problem: AVX-512 as well as MPX conflict with LWP. >>>>>>> And obviously it can''t be excluded that we''ll see CPUs supporting >>>>>>> AVX-512 but not MPX as well as guests using the former but not the >>>>>>> latter, and neither can be dealt with under the current design. >>>>>> >>>>>> This should not be a problem, the manual says "The layout of the >>>>>> XSAVE/XRSTOR save area is fixed and may contain non-contiguous >>>>>> individual save areas. The XSAVE/XRSTOR save area is not >>>>>> compacted if some features are not saved or are not supported by >>>>>> the processor and/or by system software". Note "by the >>>>>> processor": the way I read this, size may vary (which is why >>>>>> CPUID.0Dh exists, basically), but offsets are guaranteed to be constant. >>>>> >>>>> Then why would there be a way to retrieve these offsets via CPUID? >>>> >>>> Perhaps Intel envisioned that the OS could blindly do XSAVEOPT on >>>> all detected elements, including yet-unknown features. 
>>> >>> For that all you''d need would be the total size, not the individual offsets. >> Agree. If the offset is fixed (it is true currently), then CPU don''t >> need to expose the offset through CPUID. It may vary for future feature. > > So here we are: Paolo quotes the doc stating that the layout is fixedLayout is not equal to the offset.> (and I agree with his interpretation of that statement), and an Intel > representative states that "it is true currently". And the fact that > the offsets are to be retrieved also suggests that they''re not fixed > now and forever (supported by Vol 3 section 13.6 reading "The number > of save areas, the offset and the size of each save area is enumerated by CPUID leaf function 0DH"). > > Looks like the documentation isn''t really consistent in itself...> JanBest regards, Yang
>>> On 06.09.13 at 09:20, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote: > Jan Beulich wrote on 2013-09-06: >> So here we are: Paolo quotes the doc stating that the layout is fixed > Layout is not equal to the offset. Sorry, I don't follow. What does "The layout of the XSAVE/XRSTOR save area is fixed" mean to you then? Jan
Jan Beulich wrote on 2013-09-06:>>>> On 06.09.13 at 09:20, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote: >> Jan Beulich wrote on 2013-09-06: >>> So here we are: Paolo quotes the doc stating that the layout is >>> fixed >> Layout is not equal to the offset. > > Sorry, I don't follow. What does "The layout of the XSAVE/XRSTOR save > area is fixed" mean to you then? My understanding is that "fixed" means "the first area always is FPU/SSE (offset 0, size 512), the second area always is the header (offset 512, size 64), and for other areas, you must check the CPUID to know the details". It does not mean the offset of each feature is fixed. See Table 4-20 in Intel SDM volume 2; it even puts area 3 behind area 4.> > Jan Best regards, Yang
[adding H. Peter Anvin... the context is whether the layout of the XSAVE/XRSTOR area is fixed, including the offset of each separate Ext_SAVE_Area]. Il 06/09/2013 09:45, Zhang, Yang Z ha scritto:>>>> >>> So here we are: Paolo quotes the doc stating that the layout is >>>> >>> fixed >>> >> Layout is not equal to the offset. >> > >> > Sorry, I don't follow. What does "The layout of the XSAVE/XRSTOR save >> > area is fixed" mean to you then? > My understanding is that "fixed" means "the first area always is > FPU/SSE (offset 0, size 512), the second area always is the header (offset > 512, size 64) and for other areas, you must check the CPUID to know the > details". Not the offset of each feature is fixed.

The sentence is "The XSAVE/XRSTOR save area is not compacted if some features are not saved or are not supported by the processor and/or by system software". This means four things:

- the XSAVE/XRSTOR save area is not compacted if some features are not saved by the processor (my guess: this is for XSAVEOPT)
- the XSAVE/XRSTOR save area is not compacted if some features are not saved by system software (my guess: bits are not set in EDX:EAX)
- the XSAVE/XRSTOR save area is not compacted if some features are not supported by system software (my guess: bits are not set in XCR0)
- the XSAVE/XRSTOR save area is not compacted if some features are not supported by the processor (???)

I cannot give any sensible interpretation to the last case except "the offsets are fixed". In theory the lengths may vary, I suppose, though that would not make much sense. BTW, in my copy of the manual (325383-045US, January 2013) the Ext_SaveAreas are in consecutive order in "Figure 13-2. Future Layout of XSAVE/XRSTOR Area and XSTATE_BV with Five Sets of Processor State Extensions". And in the AVX-512/MPX manual, the vector states in Ext_SAVE_Area_2 to 7 have documented (thus fixed) offsets, while the MPX states in Ext_SAVE_Area_3 and 4 do not have documented offsets. 
And MPX is exactly where documented offsets would be most handy, since BNDCFGU and BNDSTATUS are only accessible via XSAVE/XRSTOR. Linux publishes the XSAVE blob to userspace as an NT_X86_XSTATE note in core dumps, but does not include CPUID data. If offsets change, it will not be possible to examine a core dump without knowing the processor on which it was produced. For our beloved hypervisors, if offsets can change it will be _impossible_ to migrate from a machine to another if they have different offsets. It will be impossible (lest you disable XSAVE and thus all processor features starting with AVX) to have clusters of heterogeneous virtualization hosts. This is because (a) the guest might have stored XSAVE areas at arbitrary places in the source, and the destination will not be able to restore them; (b) there are no vmexits for XSAVE/XSAVEOPT/XRSTOR, and anyway they would be too slow. So please Intel, pretty please do not modify the XSAVE offsets, and clarify this as soon as possible. Paolo> See the Table 4-20 in Intel SDM volume 2, it even puts the area3 behind area4. >
>>> On 06.09.13 at 13:57, Paolo Bonzini <pbonzini@redhat.com> wrote: > Linux publishes the XSAVE blob to userspace as an NT_X86_XSTATE note in > core dumps, but does not include CPUID data. If offsets change, it will > not be possible to examine a core dump without knowing the processor on > which it was produced.Design mistake.> For our beloved hypervisors, if offsets can change it will be > _impossible_ to migrate from a machine to another if they have different > offsets. It will be impossible (lest you disable XSAVE and thus all > processor features starting with AVX) to have clusters of heterogeneous > virtualization hosts. This is because (a) the guest might have stored > XSAVE areas at arbitrary places in the source, and the destination will > not be able to restore them; (b) there are no vmexits for > XSAVE/XSAVEOPT/XRSTOR, and anyway they would be too slow.No, this is precisely what this thread is about: Migration can work with the offsets varying, but only if the there''s no attempt to save the whole blob in one go. There needs to be a save record per state component, and that save record needs to include (implicitly or explicitly) include the size of that blob; the offset within the save area is of no interest then - the consuming system simply needs to put it at the offset within its save area that corresponds to the respective state component. The fact that we''re currently dependent on the offsets is again a design flaw.> So please Intel, pretty please do not modify the XSAVE offsets, and > clarify this as soon as possible.I''d like to ask for the exact opposite: Drop any mention of specific numbers from the documentation, except when used as an example for a particular implementation (it''s that, btw, how I interpret the numbers given in the AVX-512 spec). 
Not allowing compaction is going to be harmful sooner or later (as processors show up that implement new state, but may not implement all previous features, namely if some of them turn out to be relatively useless: AMD's LWP appears to be an example of such a feature, at least judging by - as you alluded to - no OS actually making use of it by now, and at least some people seem to think of MPX as being pretty useless too). Jan
Il 06/09/2013 14:35, Jan Beulich ha scritto:>>>> On 06.09.13 at 13:57, Paolo Bonzini <pbonzini@redhat.com> wrote: >> Linux publishes the XSAVE blob to userspace as an NT_X86_XSTATE note in >> core dumps, but does not include CPUID data. If offsets change, it will >> not be possible to examine a core dump without knowing the processor on >> which it was produced. > > Design mistake.Perhaps, but...>> For our beloved hypervisors, if offsets can change it will be >> _impossible_ to migrate from a machine to another if they have different >> offsets. It will be impossible (lest you disable XSAVE and thus all >> processor features starting with AVX) to have clusters of heterogeneous >> virtualization hosts. This is because (a) the guest might have stored >> XSAVE areas at arbitrary places in the source, and the destination will >> not be able to restore them; (b) there are no vmexits for >> XSAVE/XSAVEOPT/XRSTOR, and anyway they would be too slow. > > No, this is precisely what this thread is about: Migration can work > with the offsets varying, but only if the there''s no attempt to save > the whole blob in one go. There needs to be a save record per > state component, and that save record needs to include (implicitly > or explicitly) include the size of that blob; the offset within the > save area is of no interest then - the consuming system simply > needs to put it at the offset within its save area that corresponds > to the respective state component.... this is not about migration of the current XSAVE state. The guest is storing XSAVE data in the internal structures for its processes. If offsets change, it won''t be able to XRSTOR it on the destination. Paolo> The fact that we''re currently dependent on the offsets is again a > design flaw. > >> So please Intel, pretty please do not modify the XSAVE offsets, and >> clarify this as soon as possible. 
> > I''d like to ask for the exact opposite: Drop any mention of specific > numbers from the documentation, except when used as an > example for a particular implementation (it''s that, btw, how I > interpret the numbers given in the AVX-512 spec). Not allowing > compaction is going to be harmful sooner or later (as processors > show up that implement new state, but may not implement all > previous features, namely if some of them turn out to be relatively > useless: AMD''s LWP appears to be an example of such a feature, > at least judging by - as you alluded to - no OS actually making use > of it by now, and at least some people seem to think of MPX as > being pretty useless too). > > Jan >
On 06/09/13 13:35, Jan Beulich wrote:>>>> On 06.09.13 at 13:57, Paolo Bonzini <pbonzini@redhat.com> wrote: >> Linux publishes the XSAVE blob to userspace as an NT_X86_XSTATE note in >> core dumps, but does not include CPUID data. If offsets change, it will >> not be possible to examine a core dump without knowing the processor on >> which it was produced. > Design mistake. > >> For our beloved hypervisors, if offsets can change it will be >> _impossible_ to migrate from a machine to another if they have different >> offsets. It will be impossible (lest you disable XSAVE and thus all >> processor features starting with AVX) to have clusters of heterogeneous >> virtualization hosts. This is because (a) the guest might have stored >> XSAVE areas at arbitrary places in the source, and the destination will >> not be able to restore them; (b) there are no vmexits for >> XSAVE/XSAVEOPT/XRSTOR, and anyway they would be too slow. > No, this is precisely what this thread is about: Migration can work > with the offsets varying, but only if the there''s no attempt to save > the whole blob in one go. There needs to be a save record per > state component, and that save record needs to include (implicitly > or explicitly) include the size of that blob; the offset within the > save area is of no interest then - the consuming system simply > needs to put it at the offset within its save area that corresponds > to the respective state component. > > The fact that we''re currently dependent on the offsets is again a > design flaw.There is a further complication - There would need to be full guest OS support for the offsets possibly changing across migrate. Simply having Xen able to fix up the vcpu state doesn''t prevent the guest OS from getting it wrong. ~Andrew> >> So please Intel, pretty please do not modify the XSAVE offsets, and >> clarify this as soon as possible. 
> I''d like to ask for the exact opposite: Drop any mention of specific > numbers from the documentation, except when used as an > example for a particular implementation (it''s that, btw, how I > interpret the numbers given in the AVX-512 spec). Not allowing > compaction is going to be harmful sooner or later (as processors > show up that implement new state, but may not implement all > previous features, namely if some of them turn out to be relatively > useless: AMD''s LWP appears to be an example of such a feature, > at least judging by - as you alluded to - no OS actually making use > of it by now, and at least some people seem to think of MPX as > being pretty useless too). > > Jan > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
On 09/06/2013 04:57 AM, Paolo Bonzini wrote:> [adding H. Peter Anvin... the context is whether the layout of the > XSAVE/XRSTOR area is fixed, including the offset of each separate > Ext_SAVE_Area].It is.> So please Intel, pretty please do not modify the XSAVE offsets, and > clarify this as soon as possible.They will not change. -hpa
Il 06/09/2013 16:44, H. Peter Anvin ha scritto:> On 09/06/2013 04:57 AM, Paolo Bonzini wrote: >> [adding H. Peter Anvin... the context is whether the layout of the >> XSAVE/XRSTOR area is fixed, including the offset of each separate >> Ext_SAVE_Area]. > > It is. > >> So please Intel, pretty please do not modify the XSAVE offsets, and >> clarify this as soon as possible. > > They will not change.

Great, could Intel document the offsets of the MPX save areas too? It's somewhat useful to know them in advance, if one wants to read and set BNDCFGU and BNDSTATUS. Paolo
On 09/06/2013 08:03 AM, Paolo Bonzini wrote:
> > Great, could Intel document the offsets of the MPX save areas too? It's > somewhat useful to know them in advance, if one wants to read and set > BNDCFGU and BNDSTATUS. >

I can push that request to the documentation team. -hpa
Keir Fraser
2013-Sep-09 11:16 UTC
Re: [PATCH] x86/xsave: fix migration from xsave-capable to xsave-incapable host
On 30/08/2013 10:55, "Jan Beulich" <JBeulich@suse.com> wrote:> With CPUID features suitably masked this is supposed to work, but was > completely broken (i.e. the case wasn''t even considered when the > original xsave save/restore code was written). > > First of all, xsave_enabled() wrongly returned the value of > cpu_has_xsave, i.e. not even taking into consideration attributes of > the vCPU in question. Instead this function ought to check whether the > guest ever enabled xsave support (by writing a [non-zero] value to > XCR0). As a result of this, a vCPU''s xcr0 and xcr0_accum must no longer > be initialized to XSTATE_FP_SSE (since that''s a valid value a guest > could write to XCR0), and the xsave/xrstor as well as the context > switch code need to suitably account for this (by always enforcing at > least this part of the state to be saved/loaded). > > This involves undoing large parts of c/s 22945:13a7d1f7f62c ("x86: add > strictly sanity check for XSAVE/XRSTOR") - we need to cleanly > distinguish between hardware capabilities and vCPU used features. > > Next both HVM and PV save code needed tweaking to not always save the > full state supported by the underlying hardware, but just the parts > that the guest actually used. Similarly the restore code should bail > not just on state being restored that the hardware cannot handle, but > also on inconsistent save state (inconsistent XCR0 settings or size of > saved state not in line with XCR0). > > And finally the PV extended context get/set code needs to use slightly > different logic than the HVM one, as here we can''t just key off of > xsave_enabled() (i.e. avoid doing anything if a guest doesn''t use > xsave) because the tools use this function to determine host > capabilities as well as read/write vCPU state. The set operation in > particular needs to be capable of cleanly dealing with input that > consists of only the xcr0 and xcr0_accum values (if they''re both zero > then no further data is required). 
> > While for things to work correctly both sides (saving _and_ restoring > host) need to run with the fixed code, afaict no breakage should occur > if either side isn''t up to date (other than the breakage that this > patch attempts to fix). > > Signed-off-by: Jan Beulich <jbeulich@suse.com>Acked-by: Keir Fraser <keir@xen.org>> --- > Please note that the development and testing of this was done on 4.1.x, > as that''s where to issue was found. The xsave code being quite > different between then and now, the porting was not really straight > forward, but in the end the biggest difference was that the 4.1.x code > needs more changes than presented here, due to there FPU context and > XSAVE area not sharing the same memory block. Hence I think/hope I did > all the porting over correctly, but I had no setup available where I > could have myself fully tested all the scenarios. > > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -618,7 +618,7 @@ unsigned long pv_guest_cr4_fixup(const s > hv_cr4_mask &= ~X86_CR4_DE; > if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) ) > hv_cr4_mask &= ~X86_CR4_FSGSBASE; > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave ) > hv_cr4_mask &= ~X86_CR4_OSXSAVE; > > if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) ) > @@ -1351,9 +1351,13 @@ static void __context_switch(void) > if ( !is_idle_vcpu(n) ) > { > memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES); > - if ( xsave_enabled(n) && n->arch.xcr0 != get_xcr0() && > - !set_xcr0(n->arch.xcr0) ) > - BUG(); > + if ( cpu_has_xsave ) > + { > + u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE; > + > + if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) ) > + BUG(); > + } > vcpu_restore_fpu_eager(n); > n->arch.ctxt_switch_to(n); > } > --- a/xen/arch/x86/domctl.c > +++ b/xen/arch/x86/domctl.c > @@ -1047,11 +1047,8 @@ long arch_do_domctl( > struct xen_domctl_vcpuextstate *evc; > struct vcpu *v; > uint32_t offset = 0; > - uint64_t _xfeature_mask = 0; > - uint64_t _xcr0, 
_xcr0_accum; > - void *receive_buf = NULL, *_xsave_area; > > -#define PV_XSAVE_SIZE (2 * sizeof(uint64_t) + xsave_cntxt_size) > +#define PV_XSAVE_SIZE(xcr0) (2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0)) > > evc = &domctl->u.vcpuextstate; > > @@ -1062,15 +1059,16 @@ long arch_do_domctl( > > if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate ) > { > + unsigned int size = PV_XSAVE_SIZE(v->arch.xcr0_accum); > + > if ( !evc->size && !evc->xfeature_mask ) > { > evc->xfeature_mask = xfeature_mask; > - evc->size = PV_XSAVE_SIZE; > + evc->size = size; > ret = 0; > goto vcpuextstate_out; > } > - if ( evc->size != PV_XSAVE_SIZE || > - evc->xfeature_mask != xfeature_mask ) > + if ( evc->size != size || evc->xfeature_mask != xfeature_mask ) > { > ret = -EINVAL; > goto vcpuextstate_out; > @@ -1093,7 +1091,7 @@ long arch_do_domctl( > offset += sizeof(v->arch.xcr0_accum); > if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer, > offset, (void *)v->arch.xsave_area, > - xsave_cntxt_size) ) > + size - 2 * sizeof(uint64_t)) ) > { > ret = -EFAULT; > goto vcpuextstate_out; > @@ -1101,13 +1099,14 @@ long arch_do_domctl( > } > else > { > - ret = -EINVAL; > + void *receive_buf; > + uint64_t _xcr0, _xcr0_accum; > + const struct xsave_struct *_xsave_area; > > - _xfeature_mask = evc->xfeature_mask; > - /* xsave context must be restored on compatible target CPUs */ > - if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask ) > - goto vcpuextstate_out; > - if ( evc->size > PV_XSAVE_SIZE || evc->size < 2 * > sizeof(uint64_t) ) > + ret = -EINVAL; > + if ( evc->size < 2 * sizeof(uint64_t) || > + evc->size > 2 * sizeof(uint64_t) + > + xstate_ctxt_size(xfeature_mask) ) > goto vcpuextstate_out; > > receive_buf = xmalloc_bytes(evc->size); > @@ -1128,20 +1127,30 @@ long arch_do_domctl( > _xcr0_accum = *(uint64_t *)(receive_buf + sizeof(uint64_t)); > _xsave_area = receive_buf + 2 * sizeof(uint64_t); > > - if ( !(_xcr0 & XSTATE_FP) || _xcr0 & ~xfeature_mask ) > + if ( _xcr0_accum ) > { > - 
xfree(receive_buf); > - goto vcpuextstate_out; > + if ( evc->size >= 2 * sizeof(uint64_t) + XSTATE_AREA_MIN_SIZE > ) > + ret = validate_xstate(_xcr0, _xcr0_accum, > + _xsave_area->xsave_hdr.xstate_bv, > + evc->xfeature_mask); > } > - if ( (_xcr0 & _xcr0_accum) != _xcr0 ) > + else if ( !_xcr0 ) > + ret = 0; > + if ( ret ) > { > xfree(receive_buf); > goto vcpuextstate_out; > } > > - v->arch.xcr0 = _xcr0; > - v->arch.xcr0_accum = _xcr0_accum; > - memcpy(v->arch.xsave_area, _xsave_area, evc->size - 2 * > sizeof(uint64_t) ); > + if ( evc->size <= PV_XSAVE_SIZE(_xcr0_accum) ) > + { > + v->arch.xcr0 = _xcr0; > + v->arch.xcr0_accum = _xcr0_accum; > + memcpy(v->arch.xsave_area, _xsave_area, > + evc->size - 2 * sizeof(uint64_t)); > + } > + else > + ret = -EINVAL; > > xfree(receive_buf); > } > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -906,14 +906,12 @@ static int hvm_load_cpu_ctxt(struct doma > hvm_set_segment_register(v, x86_seg_ldtr, &seg); > > /* In case xsave-absent save file is restored on a xsave-capable host */ > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave && !xsave_enabled(v) ) > { > struct xsave_struct *xsave_area = v->arch.xsave_area; > > memcpy(v->arch.xsave_area, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); > xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; > - v->arch.xcr0_accum = XSTATE_FP_SSE; > - v->arch.xcr0 = XSTATE_FP_SSE; > } > else > memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); > @@ -957,7 +955,9 @@ static int hvm_load_cpu_ctxt(struct doma > HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt, > 1, HVMSR_PER_VCPU); > > -#define HVM_CPU_XSAVE_SIZE (3 * sizeof(uint64_t) + xsave_cntxt_size) > +#define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \ > + save_area) + \ > + xstate_ctxt_size(xcr0)) > > static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t > *h) > { > @@ -969,20 +969,20 @@ static int hvm_save_cpu_xsave_states(str > > for_each_vcpu ( d, v ) > { > + 
unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum); > + > if ( !xsave_enabled(v) ) > continue; > - if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, > HVM_CPU_XSAVE_SIZE) ) > + if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) ) > return 1; > ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; > - h->cur += HVM_CPU_XSAVE_SIZE; > - memset(ctxt, 0, HVM_CPU_XSAVE_SIZE); > + h->cur += size; > > ctxt->xfeature_mask = xfeature_mask; > ctxt->xcr0 = v->arch.xcr0; > ctxt->xcr0_accum = v->arch.xcr0_accum; > - if ( v->fpu_initialised ) > - memcpy(&ctxt->save_area, > - v->arch.xsave_area, xsave_cntxt_size); > + memcpy(&ctxt->save_area, v->arch.xsave_area, > + size - offsetof(struct hvm_hw_cpu_xsave, save_area)); > } > > return 0; > @@ -990,11 +990,11 @@ static int hvm_save_cpu_xsave_states(str > > static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t > *h) > { > - int vcpuid; > + unsigned int vcpuid, size; > + int err; > struct vcpu *v; > struct hvm_hw_cpu_xsave *ctxt; > struct hvm_save_descriptor *desc; > - uint64_t _xfeature_mask; > > /* Which vcpu is this? */ > vcpuid = hvm_load_instance(h); > @@ -1006,47 +1006,74 @@ static int hvm_load_cpu_xsave_states(str > } > > /* Fails since we can''t restore an img saved on xsave-capable host. 
*/ > - if ( !xsave_enabled(v) ) > - return -EINVAL; > + if ( !cpu_has_xsave ) > + return -EOPNOTSUPP; > > /* Customized checking for entry since our entry is of variable length */ > desc = (struct hvm_save_descriptor *)&h->data[h->cur]; > if ( sizeof (*desc) > h->size - h->cur) > { > printk(XENLOG_G_WARNING > - "HVM%d restore: not enough data left to read descriptor" > - "for type %u\n", d->domain_id, CPU_XSAVE_CODE); > - return -1; > + "HVM%d.%d restore: not enough data left to read xsave > descriptor\n", > + d->domain_id, vcpuid); > + return -ENODATA; > } > if ( desc->length + sizeof (*desc) > h->size - h->cur) > { > printk(XENLOG_G_WARNING > - "HVM%d restore: not enough data left to read %u bytes " > - "for type %u\n", d->domain_id, desc->length, CPU_XSAVE_CODE); > - return -1; > + "HVM%d.%d restore: not enough data left to read %u xsave > bytes\n", > + d->domain_id, vcpuid, desc->length); > + return -ENODATA; > + } > + if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) + > + XSTATE_AREA_MIN_SIZE ) > + { > + printk(XENLOG_G_WARNING > + "HVM%d.%d restore mismatch: xsave length %u < %zu\n", > + d->domain_id, vcpuid, desc->length, > + offsetof(struct hvm_hw_cpu_xsave, > + save_area) + XSTATE_AREA_MIN_SIZE); > + return -EINVAL; > } > - if ( CPU_XSAVE_CODE != desc->typecode || (desc->length > > HVM_CPU_XSAVE_SIZE) ) > + size = HVM_CPU_XSAVE_SIZE(xfeature_mask); > + if ( desc->length > size ) > { > printk(XENLOG_G_WARNING > - "HVM%d restore mismatch: expected type %u with max length %u, > " > - "saw type %u length %u\n", d->domain_id, CPU_XSAVE_CODE, > - (unsigned int)HVM_CPU_XSAVE_SIZE, > - desc->typecode, desc->length); > - return -1; > + "HVM%d.%d restore mismatch: xsave length %u > %u\n", > + d->domain_id, vcpuid, desc->length, size); > + return -EOPNOTSUPP; > } > h->cur += sizeof (*desc); > - /* Checking finished */ > > ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; > h->cur += desc->length; > > - _xfeature_mask = ctxt->xfeature_mask; > - if ( 
(_xfeature_mask & xfeature_mask) != _xfeature_mask ) > - return -EINVAL; > + err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum, > + ctxt->save_area.xsave_hdr.xstate_bv, > + ctxt->xfeature_mask); > + if ( err ) > + { > + printk(XENLOG_G_WARNING > + "HVM%d.%d restore: inconsistent xsave state (feat=%#"PRIx64 > + " accum=%#"PRIx64" xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n", > + d->domain_id, vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum, > + ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err); > + return err; > + } > + size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum); > + if ( desc->length > size ) > + { > + printk(XENLOG_G_WARNING > + "HVM%d.%d restore mismatch: xsave length %u > %u\n", > + d->domain_id, vcpuid, desc->length, size); > + return -EOPNOTSUPP; > + } > + /* Checking finished */ > > v->arch.xcr0 = ctxt->xcr0; > v->arch.xcr0_accum = ctxt->xcr0_accum; > - memcpy(v->arch.xsave_area, &ctxt->save_area, xsave_cntxt_size); > + memcpy(v->arch.xsave_area, &ctxt->save_area, > + desc->length - offsetof(struct hvm_hw_cpu_xsave, save_area)); > > return 0; > } > @@ -1060,7 +1087,8 @@ static int __init __hvm_register_CPU_XSA > "CPU_XSAVE", > hvm_save_cpu_xsave_states, > hvm_load_cpu_xsave_states, > - HVM_CPU_XSAVE_SIZE + sizeof (struct > hvm_save_descriptor), > + HVM_CPU_XSAVE_SIZE(xfeature_mask) + > + sizeof(struct hvm_save_descriptor), > HVMSR_PER_VCPU); > return 0; > } > @@ -2767,7 +2795,7 @@ void hvm_cpuid(unsigned int input, unsig > __clear_bit(X86_FEATURE_APIC & 31, edx); > > /* Fix up OSXSAVE. */ > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave ) > *ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ? > cpufeat_mask(X86_FEATURE_OSXSAVE) : 0; > > --- a/xen/arch/x86/hvm/vmx/vmcs.c > +++ b/xen/arch/x86/hvm/vmx/vmcs.c > @@ -944,8 +944,7 @@ static int construct_vmcs(struct vcpu *v > /* Host control registers. 
*/ > v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; > __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); > - __vmwrite(HOST_CR4, > - mmu_cr4_features | (xsave_enabled(v) ? X86_CR4_OSXSAVE : 0)); > + __vmwrite(HOST_CR4, mmu_cr4_features); > > /* Host CS:RIP. */ > __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); > --- a/xen/arch/x86/i387.c > +++ b/xen/arch/x86/i387.c > @@ -38,14 +38,15 @@ static inline void fpu_xrstor(struct vcp > { > bool_t ok; > > + ASSERT(v->arch.xsave_area); > /* > * XCR0 normally represents what guest OS set. In case of Xen itself, > - * we set all supported feature mask before doing save/restore. > + * we set the accumulated feature mask before doing save/restore. > */ > - ok = set_xcr0(v->arch.xcr0_accum); > + ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); > ASSERT(ok); > xrstor(v, mask); > - ok = set_xcr0(v->arch.xcr0); > + ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); > ASSERT(ok); > } > > @@ -137,13 +138,15 @@ static inline void fpu_xsave(struct vcpu > { > bool_t ok; > > - /* XCR0 normally represents what guest OS set. In case of Xen itself, > - * we set all accumulated feature mask before doing save/restore. > + ASSERT(v->arch.xsave_area); > + /* > + * XCR0 normally represents what guest OS set. In case of Xen itself, > + * we set the accumulated feature mask before doing save/restore. > */ > - ok = set_xcr0(v->arch.xcr0_accum); > + ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); > ASSERT(ok); > xsave(v, v->arch.nonlazy_xstate_used ? XSTATE_ALL : XSTATE_LAZY); > - ok = set_xcr0(v->arch.xcr0); > + ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); > ASSERT(ok); > } > > @@ -232,7 +235,7 @@ void vcpu_restore_fpu_lazy(struct vcpu * > if ( v->fpu_dirtied ) > return; > > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave ) > fpu_xrstor(v, XSTATE_LAZY); > else if ( v->fpu_initialised ) > { > @@ -262,7 +265,7 @@ void vcpu_save_fpu(struct vcpu *v) > /* This can happen, if a paravirtualised guest OS has set its CR0.TS. 
*/ > clts(); > > - if ( xsave_enabled(v) ) > + if ( cpu_has_xsave ) > fpu_xsave(v); > else if ( cpu_has_fxsr ) > fpu_fxsave(v); > --- a/xen/arch/x86/traps.c > +++ b/xen/arch/x86/traps.c > @@ -817,7 +817,7 @@ static void pv_cpuid(struct cpu_user_reg > __clear_bit(X86_FEATURE_PDCM % 32, &c); > __clear_bit(X86_FEATURE_PCID % 32, &c); > __clear_bit(X86_FEATURE_DCA % 32, &c); > - if ( !xsave_enabled(current) ) > + if ( !cpu_has_xsave ) > { > __clear_bit(X86_FEATURE_XSAVE % 32, &c); > __clear_bit(X86_FEATURE_AVX % 32, &c); > @@ -842,7 +842,7 @@ static void pv_cpuid(struct cpu_user_reg > break; > > case 0x0000000d: /* XSAVE */ > - if ( !xsave_enabled(current) ) > + if ( !cpu_has_xsave ) > goto unsupported; > break; > > --- a/xen/arch/x86/xstate.c > +++ b/xen/arch/x86/xstate.c > @@ -21,7 +21,7 @@ bool_t __read_mostly cpu_has_xsaveopt; > * the supported and enabled features on the processor, including the > * XSAVE.HEADER. We only enable XCNTXT_MASK that we have known. > */ > -u32 xsave_cntxt_size; > +static u32 __read_mostly xsave_cntxt_size; > > /* A 64-bit bitmask of the XSAVE/XRSTOR features supported by processor. 
*/ > u64 xfeature_mask; > @@ -206,13 +206,13 @@ void xrstor(struct vcpu *v, uint64_t mas > > bool_t xsave_enabled(const struct vcpu *v) > { > - if ( cpu_has_xsave ) > - { > - ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); > - ASSERT(v->arch.xsave_area); > - } > + if ( !cpu_has_xsave ) > + return 0; > > - return cpu_has_xsave; > + ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); > + ASSERT(v->arch.xsave_area); > + > + return !!v->arch.xcr0_accum; > } > > int xstate_alloc_save_area(struct vcpu *v) > @@ -238,8 +238,8 @@ int xstate_alloc_save_area(struct vcpu * > save_area->fpu_sse.mxcsr = MXCSR_DEFAULT; > > v->arch.xsave_area = save_area; > - v->arch.xcr0 = XSTATE_FP_SSE; > - v->arch.xcr0_accum = XSTATE_FP_SSE; > + v->arch.xcr0 = 0; > + v->arch.xcr0_accum = 0; > > return 0; > } > @@ -257,7 +257,11 @@ void xstate_init(bool_t bsp) > u64 feature_mask; > > if ( boot_cpu_data.cpuid_level < XSTATE_CPUID ) > + { > + BUG_ON(!bsp); > + setup_clear_cpu_cap(X86_FEATURE_XSAVE); > return; > + } > > cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); > > @@ -277,7 +281,6 @@ void xstate_init(bool_t bsp) > set_in_cr4(X86_CR4_OSXSAVE); > if ( !set_xcr0(feature_mask) ) > BUG(); > - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); > > if ( bsp ) > { > @@ -286,14 +289,14 @@ void xstate_init(bool_t bsp) > * xsave_cntxt_size is the max size required by enabled features. > * We know FP/SSE and YMM about eax, and nothing about edx at > present. > */ > - xsave_cntxt_size = ebx; > + xsave_cntxt_size = xstate_ctxt_size(feature_mask); > printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n", > __func__, xsave_cntxt_size, xfeature_mask); > } > else > { > BUG_ON(xfeature_mask != feature_mask); > - BUG_ON(xsave_cntxt_size != ebx); > + BUG_ON(xsave_cntxt_size != xstate_ctxt_size(feature_mask)); > } > > /* Check XSAVEOPT feature. 
*/ > @@ -304,6 +307,42 @@ void xstate_init(bool_t bsp) > BUG_ON(!cpu_has_xsaveopt != !(eax & XSTATE_FEATURE_XSAVEOPT)); > } > > +unsigned int xstate_ctxt_size(u64 xcr0) > +{ > + u32 ebx = 0; > + > + if ( xcr0 ) > + { > + u64 act_xcr0 = get_xcr0(); > + u32 eax, ecx, edx; > + bool_t ok = set_xcr0(xcr0); > + > + ASSERT(ok); > + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); > + ASSERT(ebx <= ecx); > + ok = set_xcr0(act_xcr0); > + ASSERT(ok); > + } > + > + return ebx; > +} > + > +int validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask) > +{ > + if ( (xcr0_accum & ~xfeat_mask) || > + (xstate_bv & ~xcr0_accum) || > + (xcr0 & ~xcr0_accum) || > + !(xcr0 & XSTATE_FP) || > + ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) || > + ((xcr0_accum & XSTATE_YMM) && !(xcr0_accum & XSTATE_SSE)) ) > + return -EINVAL; > + > + if ( xcr0_accum & ~xfeature_mask ) > + return -EOPNOTSUPP; > + > + return 0; > +} > + > int handle_xsetbv(u32 index, u64 new_bv) > { > struct vcpu *curr = current; > --- a/xen/include/asm-x86/domain.h > +++ b/xen/include/asm-x86/domain.h > @@ -456,9 +456,9 @@ unsigned long pv_guest_cr4_fixup(const s > #define pv_guest_cr4_to_real_cr4(v) \ > (((v)->arch.pv_vcpu.ctrlreg[4] \ > | (mmu_cr4_features \ > - & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP)) \ > - | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0) \ > - | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0)) \ > + & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP | \ > + X86_CR4_OSXSAVE)) \ > + | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \ > & ~X86_CR4_DE) > #define real_cr4_to_pv_guest_cr4(c) \ > ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD \ > --- a/xen/include/asm-x86/hvm/hvm.h > +++ b/xen/include/asm-x86/hvm/hvm.h > @@ -368,7 +368,7 @@ static inline int hvm_event_pending(stru > ((nestedhvm_enabled((_v)->domain) && cpu_has_vmx)\ > ? X86_CR4_VMXE : 0) | \ > (cpu_has_pcid ? X86_CR4_PCIDE : 0) | \ > - (xsave_enabled(_v) ? X86_CR4_OSXSAVE : 0)))) > + (cpu_has_xsave ? 
X86_CR4_OSXSAVE : 0)))) > > /* These exceptions must always be intercepted. */ > #define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << TRAP_invalid_op)) > --- a/xen/include/asm-x86/xstate.h > +++ b/xen/include/asm-x86/xstate.h > @@ -34,7 +34,6 @@ > #define XSTATE_NONLAZY (XSTATE_LWP) > #define XSTATE_LAZY (XSTATE_ALL & ~XSTATE_NONLAZY) > > -extern unsigned int xsave_cntxt_size; > extern u64 xfeature_mask; > > /* extended state save area */ > @@ -77,11 +76,14 @@ uint64_t get_xcr0(void); > void xsave(struct vcpu *v, uint64_t mask); > void xrstor(struct vcpu *v, uint64_t mask); > bool_t xsave_enabled(const struct vcpu *v); > +int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, > + u64 xfeat_mask); > int __must_check handle_xsetbv(u32 index, u64 new_bv); > > /* extended state init and cleanup functions */ > void xstate_free_save_area(struct vcpu *v); > int xstate_alloc_save_area(struct vcpu *v); > void xstate_init(bool_t bsp); > +unsigned int xstate_ctxt_size(u64 xcr0); > > #endif /* __ASM_XSTATE_H */ > >