From 4ff1e2955f67954e60562b29a00adea89e5b93ae Mon Sep 17 00:00:00 2001
From: Liu Jinsong <jinsong.liu@intel.com>
Date: Thu, 17 Oct 2013 05:49:23 +0800
Subject: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling

This patch solves XSA-60 security hole:
1. For guest w/o VT-d, and for guest with VT-d but snooped, Xen needs to
do nothing, since the hardware snoop mechanism has ensured cache coherency.

2. For guest with VT-d but non-snooped, cache coherency cannot be
guaranteed by h/w snoop, therefore Xen needs to emulate UC type to guest:
2.1). if it works w/ Intel EPT, set guest IA32_PAT fields as UC so that
guest memory types are all UC.
2.2). if it works w/ shadow, drop all shadows so that any new ones would
be created on demand w/ UC.

This patch also fixes a bug in the shadow cr0.cd handling. The current
shadow code has a small window between cache flush and TLB invalidation,
resulting in possible cache pollution. This patch pauses vcpus so that no
vcpu context is involved in that window.

Signed-off-by: Liu Jinsong <jinsong.liu@intel.com>
---
 xen/arch/x86/hvm/hvm.c            | 75 ++++++++++++++++++++++--------------
 xen/arch/x86/hvm/vmx/vmcs.c       |  2 +-
 xen/arch/x86/hvm/vmx/vmx.c        | 58 +++++++++++++++++++++++++++-
 xen/common/domain.c               | 10 +++++
 xen/include/asm-x86/hvm/hvm.h     |  1 +
 xen/include/asm-x86/hvm/support.h |  2 +
 xen/include/xen/sched.h           |  1 +
 7 files changed, 117 insertions(+), 32 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 688a943..df021de 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1701,6 +1701,40 @@ int hvm_mov_from_cr(unsigned int cr, unsigned int gpr)
     return X86EMUL_UNHANDLEABLE;
 }
 
+void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value)
+{
+    if ( value & X86_CR0_CD )
+    {
+        /* Entering no fill cache mode. */
+        spin_lock(&v->domain->arch.hvm_domain.uc_lock);
+        v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
+
+        if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
+        {
+            domain_pause_nosync(v->domain);
+
+            /* Flush physical caches. */
+            on_each_cpu(local_flush_cache, NULL, 1);
+            hvm_set_uc_mode(v, 1);
+
+            domain_unpause(v->domain);
+        }
+        spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
+    }
+    else if ( !(value & X86_CR0_CD) &&
+              (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
+    {
+        /* Exit from no fill cache mode. */
+        spin_lock(&v->domain->arch.hvm_domain.uc_lock);
+        v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
+
+        if ( domain_exit_uc_mode(v) )
+            hvm_set_uc_mode(v, 0);
+
+        spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
+    }
+}
+
 int hvm_set_cr0(unsigned long value)
 {
     struct vcpu *v = current;
@@ -1784,35 +1818,18 @@ int hvm_set_cr0(unsigned long value)
         }
     }
 
-    if ( has_arch_mmios(v->domain) )
-    {
-        if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
-        {
-            /* Entering no fill cache mode. */
-            spin_lock(&v->domain->arch.hvm_domain.uc_lock);
-            v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
-
-            if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
-            {
-                /* Flush physical caches. */
-                on_each_cpu(local_flush_cache, NULL, 1);
-                hvm_set_uc_mode(v, 1);
-            }
-            spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
-        }
-        else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
-                  (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
-        {
-            /* Exit from no fill cache mode. */
-            spin_lock(&v->domain->arch.hvm_domain.uc_lock);
-            v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
-
-            if ( domain_exit_uc_mode(v) )
-                hvm_set_uc_mode(v, 0);
-
-            spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
-        }
-    }
+    /*
+     * When cr0.cd is set:
+     * 1. For guest w/o VT-d, and for guest with VT-d but snooped, Xen need
+     * not do anything, since the hardware snoop mechanism ensures coherency;
+     * 2. For guest with VT-d but non-snooped, cache coherency cannot be
+     * guaranteed by h/w snoop, so UC memory type must be emulated for the guest.
+     */
+    if ( ((value ^ old_value) & X86_CR0_CD) &&
+         has_arch_pdevs(v->domain) &&
+         iommu_enabled && !iommu_snoop &&
+         hvm_funcs.handle_cd )
+        hvm_funcs.handle_cd(v, value);
 
     v->arch.hvm_vcpu.guest_cr[0] = value;
     hvm_update_guest_cr(v, 0);
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 6916c6d..7a01b1f 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -921,7 +921,7 @@ static int construct_vmcs(struct vcpu *v)
         vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, MSR_TYPE_R | MSR_TYPE_W);
         vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, MSR_TYPE_R | MSR_TYPE_W);
         vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, MSR_TYPE_R | MSR_TYPE_W);
-        if ( paging_mode_hap(d) )
+        if ( paging_mode_hap(d) && (!iommu_enabled || iommu_snoop) )
             vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, MSR_TYPE_R | MSR_TYPE_W);
     }
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 6dedb29..d846a9c 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -642,6 +642,10 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
             __invept(INVEPT_SINGLE_CONTEXT, ept_get_eptp(ept_data), 0);
     }
 
+    /* For guest cr0.cd setting, do not use the potentially polluted cache. */
+    if ( unlikely(v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
+        wbinvd();
+
     vmx_restore_guest_msrs(v);
     vmx_restore_dr(v);
 }
@@ -908,7 +912,8 @@ static unsigned long vmx_get_shadow_gs_base(struct vcpu *v)
 
 static int vmx_set_guest_pat(struct vcpu *v, u64 gpat)
 {
-    if ( !paging_mode_hap(v->domain) )
+    if ( !paging_mode_hap(v->domain) ||
+         unlikely(v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
         return 0;
 
     vmx_vmcs_enter(v);
@@ -919,7 +924,8 @@ static int vmx_set_guest_pat(struct vcpu *v, u64 gpat)
 
 static int vmx_get_guest_pat(struct vcpu *v, u64 *gpat)
 {
-    if ( !paging_mode_hap(v->domain) )
+    if ( !paging_mode_hap(v->domain) ||
+         unlikely(v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
         return 0;
 
     vmx_vmcs_enter(v);
@@ -928,6 +934,53 @@ static int vmx_get_guest_pat(struct vcpu *v, u64 *gpat)
     return 1;
 }
 
+static void vmx_handle_cd(struct vcpu *v, unsigned long value)
+{
+    if ( !paging_mode_hap(v->domain) )
+    {
+        /*
+         * For shadow, the 'load IA32_PAT' VM-entry control is 0, so it cannot
+         * set the guest memory type as UC via IA32_PAT. Xen drops all shadows
+         * so that any new ones would be created on demand.
+         */
+        hvm_shadow_handle_cd(v, value);
+    }
+    else
+    {
+        u64 *pat = &v->arch.hvm_vcpu.pat_cr;
+
+        if ( value & X86_CR0_CD )
+        {
+            /*
+             * For EPT, set the guest IA32_PAT fields as UC so that the
+             * guest memory types are all UC.
+             */
+            u64 uc_pat =
+                ((uint64_t)PAT_TYPE_UNCACHABLE)       | /* PAT0 */
+                ((uint64_t)PAT_TYPE_UNCACHABLE << 8)  | /* PAT1 */
+                ((uint64_t)PAT_TYPE_UNCACHABLE << 16) | /* PAT2 */
+                ((uint64_t)PAT_TYPE_UNCACHABLE << 24) | /* PAT3 */
+                ((uint64_t)PAT_TYPE_UNCACHABLE << 32) | /* PAT4 */
+                ((uint64_t)PAT_TYPE_UNCACHABLE << 40) | /* PAT5 */
+                ((uint64_t)PAT_TYPE_UNCACHABLE << 48) | /* PAT6 */
+                ((uint64_t)PAT_TYPE_UNCACHABLE << 56);  /* PAT7 */
+
+            vmx_get_guest_pat(v, pat);
+            vmx_set_guest_pat(v, uc_pat);
+
+            wbinvd();               /* flush possibly polluted cache */
+            hvm_asid_flush_vcpu(v); /* invalidate memory type cached in TLB */
+            v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
+        }
+        else
+        {
+            v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
+            vmx_set_guest_pat(v, *pat);
+            hvm_asid_flush_vcpu(v); /* no need to flush cache */
+        }
+    }
+}
+
 static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
 {
     vmx_vmcs_enter(v);
@@ -1550,6 +1603,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .msr_read_intercept = vmx_msr_read_intercept,
     .msr_write_intercept = vmx_msr_write_intercept,
     .invlpg_intercept = vmx_invlpg_intercept,
+    .handle_cd = vmx_handle_cd,
     .set_info_guest = vmx_set_info_guest,
     .set_rdtsc_exiting = vmx_set_rdtsc_exiting,
     .nhvm_vcpu_initialise = nvmx_vcpu_initialise,
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 5999779..ce20323 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -825,6 +825,16 @@ void domain_pause(struct domain *d)
         vcpu_sleep_sync(v);
 }
 
+void domain_pause_nosync(struct domain *d)
+{
+    struct vcpu *v;
+
+    atomic_inc(&d->pause_count);
+
+    for_each_vcpu( d, v )
+        vcpu_sleep_nosync(v);
+}
+
 void domain_unpause(struct domain *d)
 {
     struct vcpu *v;
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 8dd2b40..c9afb56 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -156,6 +156,7 @@ struct hvm_function_table {
     int (*msr_read_intercept)(unsigned int msr, uint64_t *msr_content);
     int (*msr_write_intercept)(unsigned int msr, uint64_t msr_content);
     void (*invlpg_intercept)(unsigned long vaddr);
+    void (*handle_cd)(struct vcpu *v, unsigned long value);
     void (*set_info_guest)(struct vcpu *v);
     void (*set_rdtsc_exiting)(struct vcpu *v, bool_t);
 
diff --git a/xen/include/asm-x86/hvm/support.h b/xen/include/asm-x86/hvm/support.h
index 7ddc806..52aef1f 100644
--- a/xen/include/asm-x86/hvm/support.h
+++ b/xen/include/asm-x86/hvm/support.h
@@ -130,6 +130,8 @@ void hvm_rdtsc_intercept(struct cpu_user_regs *regs);
 
 int __must_check hvm_handle_xsetbv(u32 index, u64 new_bv);
 
+void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value);
+
 /* These functions all return X86EMUL return codes. */
 int hvm_set_efer(uint64_t value);
 int hvm_set_cr0(unsigned long value);
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 1765e18..82f522e 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -700,6 +700,7 @@ void vcpu_unblock(struct vcpu *v);
 void vcpu_pause(struct vcpu *v);
 void vcpu_pause_nosync(struct vcpu *v);
 void domain_pause(struct domain *d);
+void domain_pause_nosync(struct domain *d);
 void vcpu_unpause(struct vcpu *v);
 void domain_unpause(struct domain *d);
 void domain_pause_by_systemcontroller(struct domain *d);
-- 
1.7.1
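For context, the guest-side sequence that ends up in the hvm_set_cr0()
path above is the SDM's cache-disable recipe (Intel SDM vol. 3, 11.5.3).
A minimal sketch of what a guest kernel does -- illustrative only, not
part of the patch:

#define X86_CR0_CD (1UL << 30)   /* CR0 cache-disable bit */

static inline unsigned long read_cr0(void)
{
    unsigned long cr0;
    asm volatile ( "mov %%cr0, %0" : "=r" (cr0) );
    return cr0;
}

static inline void write_cr0(unsigned long cr0)
{
    /* Under HVM this "mov to cr0" is intercepted and reaches hvm_set_cr0(). */
    asm volatile ( "mov %0, %%cr0" :: "r" (cr0) : "memory" );
}

static void guest_enter_no_fill_mode(void)
{
    write_cr0(read_cr0() | X86_CR0_CD);     /* set CD: enter no-fill mode */
    asm volatile ( "wbinvd" ::: "memory" ); /* flush and invalidate caches */
    /* ... reprogram MTRRs/PAT here, then clear CD again ... */
}

With a non-snooped passthrough device assigned, the first CR0 write above
is exactly what now reaches the new hvm_funcs.handle_cd() hook.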
Jan Beulich
2013-Oct-22 14:55 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
>>> On 21.10.13 at 17:55, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> From 4ff1e2955f67954e60562b29a00adea89e5b93ae Mon Sep 17 00:00:00 2001
> From: Liu Jinsong <jinsong.liu@intel.com>
> Date: Thu, 17 Oct 2013 05:49:23 +0800
> Subject: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
> [...]
> Signed-off-by: Liu Jinsong <jinsong.liu@intel.com>

This looks fine to me now, but will need acks/reviews at least from
- Keir (whose blessing of the pausing construct I'd like to have even
  if this didn't involve changing non-x86 files)
- one of the VMX maintainers
- one or both of Tim and Andrew

And of course I'd really appreciate if Oracle could arrange for
testing this, to confirm their performance problem is also gone with
this.

Thanks,
Jan
At 15:55 +0000 on 21 Oct (1382367312), Liu, Jinsong wrote:
> From 4ff1e2955f67954e60562b29a00adea89e5b93ae Mon Sep 17 00:00:00 2001
> From: Liu Jinsong <jinsong.liu@intel.com>
> Date: Thu, 17 Oct 2013 05:49:23 +0800
> Subject: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
> [...]
> Signed-off-by: Liu Jinsong <jinsong.liu@intel.com>

Reviewed-by: Tim Deegan <tim@xen.org>
DuanZhenzhong
2013-Oct-23 08:48 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
Jan Beulich wrote:
>>>> On 21.10.13 at 17:55, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
>> [...]
>
> This looks fine to me now, but will need acks/reviews at least from
> - Keir (whose blessing of the pausing construct I'd like to have even
>   if this didn't involve changing non-x86 files)
> - one of the VMX maintainers
> - one or both of Tim and Andrew
>
> And of course I'd really appreciate if Oracle could arrange for
> testing this, to confirm their performance problem is also gone with
> this.

I am trying to find an env to test it. I'll reply after the test.

zduan
Andrew Cooper
2013-Oct-23 10:16 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
On 22/10/13 16:26, Tim Deegan wrote:
> At 15:55 +0000 on 21 Oct (1382367312), Liu, Jinsong wrote:
>> [...]
>> Signed-off-by: Liu Jinsong <jinsong.liu@intel.com>
> Reviewed-by: Tim Deegan <tim@xen.org>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Nakajima, Jun
2013-Oct-23 16:29 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
On Tue, Oct 22, 2013 at 7:55 AM, Jan Beulich <JBeulich@suse.com> wrote:
> >>> On 21.10.13 at 17:55, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> > [...]
> > 2. For guest with VT-d but non-snooped, cache coherency cannot be
> > guaranteed by h/w snoop, therefore Xen needs to emulate UC type to guest:
> > 2.1). if it works w/ Intel EPT, set guest IA32_PAT fields as UC so that
> > guest memory types are all UC.

Can you make sure that "setting guest IA32_PAT fields as UC" doesn't have
a conflict with the existing (other) settings done by the guest?

--
Jun
Intel Open Source Technology Center
Jan Beulich
2013-Oct-23 16:38 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
>>> "Nakajima, Jun" <jun.nakajima@intel.com> 10/23/13 6:29 PM >>> >On Tue, Oct 22, 2013 at 7:55 AM, Jan Beulich <JBeulich@suse.com> wrote: >> >>> On 21.10.13 at 17:55, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >> > From 4ff1e2955f67954e60562b29a00adea89e5b93ae Mon Sep 17 00:00:00 2001 >> > From: Liu Jinsong <jinsong.liu@intel.com> >> > Date: Thu, 17 Oct 2013 05:49:23 +0800 >> > Subject: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling >> > >> > This patch solves XSA-60 security hole: >> > 1. For guest w/o VT-d, and for guest with VT-d but snooped, Xen need >> > do nothing, since hardware snoop mechanism has ensured cache coherency. >> > >> > 2. For guest with VT-d but non-snooped, cache coherency can not be >> > guaranteed by h/w snoop, therefore it need emulate UC type to guest: >> > 2.1). if it works w/ Intel EPT, set guest IA32_PAT fields as UC so that >> > guest memory type are all UC. > >Can you make sure that "setting guest IA32_PAT fields as UC" doesn''t have a >conflict with the existing (other) settings done by the guest?I don''t think I understand the question, and I also don''t think I''m the right addressee (I think you meant to send this to Jinsong and only Cc me). Jan
Liu, Jinsong
2013-Oct-24 16:19 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
Jan Beulich wrote:
>>>> "Nakajima, Jun" <jun.nakajima@intel.com> 10/23/13 6:29 PM >>>
>> Can you make sure that "setting guest IA32_PAT fields as UC" doesn't
>> have a conflict with the existing (other) settings done by the guest?
>
> I don't think I understand the question, and I also don't think I'm
> the right addressee (I think you meant to send this to Jinsong and
> only Cc me).
>
> Jan

Maybe Jun's concern is 'guest PAT (the real PAT of the VMCS which takes
effect, not the nominal guest_pat) should be identical among all physical
processors which run vcpus of that guest' -- am I right, Jun?

One thing I'm not sure about: per the Intel SDM (8.7.4 of volume 3), the
PAT MSR settings must be the same for all processors in a system. However,
Xen obviously doesn't satisfy this requirement: the PAT of the cpus running
vmm context (50100070406) is not identical to the PAT of the cpus running
guest context (taking a rhel6.4 guest as an example, it's 7010600070106) --
yet practically it works fine.

Thanks,
Jinsong
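For reference, the two PAT values quoted above decode as follows; a
standalone sketch (plain C, not Xen code) using the architectural PAT
encodings (0=UC, 1=WC, 4=WT, 5=WP, 6=WB, 7=UC-):

#include <stdio.h>

int main(void)
{
    static const char *type[8] =
        { "UC", "WC", "rsvd", "rsvd", "WT", "WP", "WB", "UC-" };
    unsigned long long pats[2] = {
        0x050100070406ULL,      /* PAT of the cpus running vmm context */
        0x0007010600070106ULL,  /* PAT of e.g. a rhel6.4 guest context */
    };

    for ( int i = 0; i < 2; i++ )
    {
        printf("%#llx:", pats[i]);
        for ( int f = 0; f < 8; f++ )   /* one byte per PAT field */
            printf(" PAT%d=%s", f, type[(pats[i] >> (f * 8)) & 7]);
        printf("\n");
    }
    return 0;
}

The first decodes to WB/WT/UC-/UC/WC/WP/UC/UC and the second to
WB/WC/UC-/UC repeated, i.e. the low four entries a guest normally
indexes differ only in PAT1 (WT vs. WC).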
Liu, Jinsong
2013-Oct-24 16:39 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
Liu, Jinsong wrote:
> Maybe Jun's concern is 'guest PAT (the real PAT of the VMCS which takes
> effect, not the nominal guest_pat) should be identical among all physical
> processors which run vcpus of that guest' -- am I right, Jun?
>
> One thing I'm not sure about: per the Intel SDM (8.7.4 of volume 3), the
> PAT MSR settings must be the same for all processors in a system.
> [...]

Or, should the PAT requirement under virtualization rather be 'PAT MSR
settings must be the same for all processors of a domain (taking the vmm
as a special domain)'? Otherwise the IA32_PAT field of the VMCS would be
pointless.

Anyway, we'd better change our patch from per-vcpu PAT emulation to
per-domain PAT emulation. Does it make sense, Jun?

Thanks,
Jinsong
Jan Beulich
2013-Oct-28 07:29 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
>>> On 24.10.13 at 18:39, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> Liu, Jinsong wrote:
>> [...]
>
> Or, should the PAT requirement under virtualization rather be 'PAT MSR
> settings must be the same for all processors of a domain (taking the vmm
> as a special domain)'? Otherwise the IA32_PAT field of the VMCS would be
> pointless.
>
> Anyway, we'd better change our patch from per-vcpu PAT emulation to
> per-domain PAT emulation. Does it make sense, Jun?

I don't think that'd be in line with what we currently do, or with how
real hardware works. Unless inconsistencies between PAT settings
can be leveraged to affect the hypervisor or other guests, we
should allow the guest to have them inconsistent (as would be the
natural thing transiently when switching to a new value on all CPUs).
And if inconsistencies can have effects outside the VM, then afaict
we have this issue already without this patch.

While mentally going through this logic again I noticed, however,
that the cache flushing your patch is doing is still insufficient:
doing this just when CD gets set and in the context switch path is not
enough. This needs to be done prior to each VM entry, unless it
can be proven that the hypervisor (or the service domain) didn't
touch guest memory.

Jan
Liu, Jinsong
2013-Oct-28 08:31 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
Jan Beulich wrote:
>>>> On 24.10.13 at 18:39, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
>> [...]
>
> I don't think that'd be in line with what we currently do, or with how
> real hardware works. Unless inconsistencies between PAT settings
> can be leveraged to affect the hypervisor or other guests, we
> should allow the guest to have them inconsistent (as would be the
> natural thing transiently when switching to a new value on all CPUs).
> And if inconsistencies can have effects outside the VM, then afaict
> we have this issue already without this patch.

Agree, let's keep the current per-vcpu PAT emulation.

> While mentally going through this logic again I noticed, however,
> that the cache flushing your patch is doing is still insufficient:
> doing this just when CD gets set and in the context switch path is
> not enough. This needs to be done prior to each VM entry, unless it
> can be proven that the hypervisor (or the service domain) didn't
> touch guest memory.

I think it's safe: it only needs to guarantee that no vcpu guest context
is involved in the small window between the cache flush and the TLB
invalidation -- after that it doesn't matter whether the hypervisor
touches guest memory or not, since the cache is clean and the old memory
type in the TLB is invalidated (and is UC afterwards), so no cache line
will be polluted by guest context any more.

Thanks,
Jinsong
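The window being closed here can be pictured as a timeline; a comment-only
sketch of the shadow path added by the patch:

/*
 * vcpu setting CR0.CD=1               other vcpus / pcpus
 * -----------------------------       ---------------------------------
 * spin_lock(uc_lock)
 * cache_mode = NO_FILL_CACHE_MODE
 * domain_pause_nosync()               pause raised; vcpus stop running
 *                                     guest context
 * on_each_cpu(local_flush_cache)      IPI: every pcpu flushes its caches
 * hvm_set_uc_mode(v, 1)               shadows dropped; new shadows are
 *                                     created on demand w/ UC
 * domain_unpause()                    vcpus resume, now with UC mappings
 * spin_unlock(uc_lock)
 */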
Jan Beulich
2013-Oct-28 09:29 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
>>> On 28.10.13 at 09:31, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>> [...]
>
> I think it's safe: it only needs to guarantee that no vcpu guest context
> is involved in the small window between the cache flush and the TLB
> invalidation -- after that it doesn't matter whether the hypervisor
> touches guest memory or not, since the cache is clean and the old memory
> type in the TLB is invalidated (and is UC afterwards), so no cache line
> will be polluted by guest context any more.

No - consider a VM exit while in this mode where, in order to process
it, the hypervisor or service domain touches guest memory. Such
touching will happen with caches being used, and hence unwritten
data may be left in the caches when exiting back to the guest when
there's no wbinvd on the VM entry path. I don't think anything is
being said explicitly anywhere on whether cache contents are being
taken into consideration when CD=0 but PAT enforces UC.

Jan
Liu, Jinsong
2013-Oct-29 16:52 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
Jan Beulich wrote:
>>>> On 28.10.13 at 09:31, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
>> [...]
>
> No - consider a VM exit while in this mode where, in order to process
> it, the hypervisor or service domain touches guest memory. Such
> touching will happen with caches being used, and hence unwritten
> data may be left in the caches when exiting back to the guest when
> there's no wbinvd on the VM entry path. I don't think anything is
> being said explicitly anywhere on whether cache contents are being
> taken into consideration when CD=0 but PAT enforces UC.

I think we don't need to worry about the hypervisor touching guest memory
while the guest is in UC mode. I drafted 2 tests, monitoring various
events during the guest UC period.

Test 1 adds wbinvd before vmentry. When the guest boots, it hangs 10s ~
60s at the vcpu0 UC stage, with bunches of vmexits caused by a lapic timer
interrupt storm.
Test 2 is our current patch. It shows that during guest UC, vmexits are
only triggered by interrupts / CR accesses / MSR accesses / wbinvd. With
these vmexits the hypervisor will not touch any guest memory.

For test 1 the lapic timer interrupt storm is triggered by the 'wbinvd'
(without it everything works fine; I didn't dig out the reason -- maybe
wbinvd is involved in a tricky vmentry microcode process?), but anyway,
test 2 demonstrates that the hypervisor will not touch guest memory
during the guest UC period.

Tests log attached below:
================================================================
Test 1: add wbinvd before vmentry (at vmx_vmenter_helper())
(XEN) uc_vmenter_count = 10607
(XEN) uc_vmexit_count = 10607
(XEN) EXIT-REASON    COUNT
(XEN)  1             10463    // EXIT_REASON_EXTERNAL_INTERRUPT
(XEN) 28                10    // EXIT_REASON_CR_ACCESS
(XEN) 31               114    // EXIT_REASON_MSR_READ
(XEN) 32                15    // EXIT_REASON_MSR_WRITE
(XEN) 54                 5    // EXIT_REASON_WBINVD
(XEN) TOTAL EXIT-REASON-COUNT = 10607
(XEN)
(XEN) vcpu[0] vmentry count = 10492
(XEN) vcpu[1] vmentry count = 37
(XEN) vcpu[2] vmentry count = 40
(XEN) vcpu[3] vmentry count = 38
(XEN) interrupt vec 0xfa occurs 10450 times    // lapic timer
(XEN) interrupt vec 0xfb occurs 13 times       // call function IPI


Test 2: current patch which didn't add wbinvd before vmentry
(XEN) uc_vmenter_count = 147
(XEN) uc_vmexit_count = 147
(XEN) EXIT-REASON    COUNT
(XEN)  1                 3    // EXIT_REASON_EXTERNAL_INTERRUPT
(XEN) 28                10    // EXIT_REASON_CR_ACCESS
(XEN) 31               114    // EXIT_REASON_MSR_READ
(XEN) 32                15    // EXIT_REASON_MSR_WRITE
(XEN) 54                 5    // EXIT_REASON_WBINVD
(XEN) TOTAL EXIT-REASON-COUNT = 147
(XEN)
(XEN) vcpu[0] vmentry count = 45
(XEN) vcpu[1] vmentry count = 34
(XEN) vcpu[2] vmentry count = 34
(XEN) vcpu[3] vmentry count = 34
(XEN) interrupt vec 0xfa occurs 3 times    // lapic timer
================================================================

Thanks,
Jinsong
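For context, the logs above come from ad-hoc instrumentation; a minimal
sketch of how such counters could be kept (the names uc_vmexit_count and
uc_exit_reason are hypothetical, not part of the patch):

/* Hypothetical debug counters, dumped e.g. via a keyhandler. */
static unsigned long uc_vmexit_count;
static unsigned long uc_exit_reason[64];

/* Called from the VM-exit handler with the exit reason read from the VMCS. */
static void uc_account_vmexit(const struct vcpu *v, unsigned int exit_reason)
{
    if ( v->arch.hvm_vcpu.cache_mode != NO_FILL_CACHE_MODE )
        return;

    uc_vmexit_count++;
    if ( exit_reason < ARRAY_SIZE(uc_exit_reason) )
        uc_exit_reason[exit_reason]++;
}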
Andrew Cooper
2013-Oct-29 17:20 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
On 29/10/13 16:52, Liu, Jinsong wrote:
> Jan Beulich wrote:
>> [...]
>
> I think we don't need to worry about the hypervisor touching guest memory
> while the guest is in UC mode. I drafted 2 tests, monitoring various
> events during the guest UC period.
> [...]
> but anyway, test 2 demonstrates that the hypervisor will not touch guest
> memory during the guest UC period.

How about this:

1) Clear a page with ud2s
2) Use the hypervisor msr to write a new hypercall page over this
   cleared page
3) Immediately try to make a hypercall using this new page

What guarantee is there that Xen writing the hypercall page has made
its way correctly back to RAM by the time domU tries to execute the
hypercall?
~Andrew
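To make the scenario concrete: on Xen the hypercall-page MSR index is
advertised via CPUID leaf 0x40000002 (assuming the default hypervisor
CPUID base), and writing a guest frame number to that MSR makes Xen fill
the page with call stubs -- from hypervisor context, i.e. through Xen's
own cacheable mapping. A guest-side sketch, where hcall_page and
virt_to_gfn() are hypothetical placeholders:

#include <stdint.h>

extern uint8_t hcall_page[4096];        /* page aligned, pre-filled with ud2 */
extern uint64_t virt_to_gfn(void *va);  /* hypothetical va -> gfn helper */

static inline void wrmsr(uint32_t msr, uint64_t val)
{
    asm volatile ( "wrmsr" :: "c" (msr),
                   "a" ((uint32_t)val), "d" ((uint32_t)(val >> 32)) );
}

void provoke_stale_hypercall_page(void)
{
    uint32_t eax, ebx, ecx, edx;

    /* CPUID leaf 0x40000002: ebx = hypercall MSR index (usually 0x40000000). */
    asm volatile ( "cpuid"
                   : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                   : "a" (0x40000002) );

    /* 2) Xen rewrites hcall_page with hypercall stubs. */
    wrmsr(ebx, virt_to_gfn(hcall_page) << 12);

    /* 3) Immediately execute from the page.  If the guest runs UC while
     * Xen's writes still sit in the cache, it may fetch the stale ud2s. */
    ((void (*)(void))hcall_page)();
}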
Jan Beulich
2013-Oct-30 08:05 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
>>> On 29.10.13 at 17:52, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>> [...]
>
> I think we don't need to worry about the hypervisor touching guest memory
> while the guest is in UC mode. I drafted 2 tests, monitoring various
> events during the guest UC period.
>
> Test 1 adds wbinvd before vmentry. When the guest boots, it hangs 10s ~
> 60s at the vcpu0 UC stage, with bunches of vmexits caused by a lapic
> timer interrupt storm.
> Test 2 is our current patch. It shows that during guest UC, vmexits are
> only triggered by interrupts / CR accesses / MSR accesses / wbinvd. With
> these vmexits the hypervisor will not touch any guest memory.
>
> For test 1 the lapic timer interrupt storm is triggered by the 'wbinvd'
> (without it everything works fine; I didn't dig out the reason -- maybe
> wbinvd is involved in a tricky vmentry microcode process?), but anyway,
> test 2 demonstrates that the hypervisor will not touch guest memory
> during the guest UC period.

That storm is a problem, indeed, but what you did isn't meaningful
in any way: you just sampled a specific (presumably Linux) guest
case, but code like this needs to cover the general case (i.e. without
us knowing whether or when the guest might - temporarily or
permanently - suppress caching).

So saying that in this specific case guest memory isn't being touched
doesn't allow us to draw any conclusions other than that perhaps we
need to monitor whether guest memory is being accessed (by setting
a flag in the HVM copy/clear routines), and gate the invocation of
wbinvd based on that. It's clear though that this would still not
cover speculative reads.

Jan
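A minimal sketch of the gating Jan suggests here, assuming a hypothetical
per-vcpu guest_mem_dirty flag raised by the HVM copy/clear routines (as he
notes, this still would not cover speculative reads):

/* Hypothetically set wherever Xen writes guest memory on this vcpu's
 * behalf, e.g. from the HVM copy/clear routines:
 *     v->arch.hvm_vcpu.guest_mem_dirty = 1;
 */

/* Then, on the VM-entry path, flush only when something was written: */
static void uc_mode_flush_on_vmentry(struct vcpu *v)
{
    if ( unlikely(v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) &&
         test_and_clear_bool(v->arch.hvm_vcpu.guest_mem_dirty) )
        wbinvd();   /* write back what Xen may have left in the cache */
}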
Liu, Jinsong
2013-Oct-30 15:21 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
Andrew Cooper wrote:
> On 29/10/13 16:52, Liu, Jinsong wrote:
>> [...]
>
> How about this:
>
> 1) Clear a page with ud2s
> 2) Use the hypervisor msr to write a new hypercall page over this
>    cleared page
> 3) Immediately try to make a hypercall using this new page
>
> What guarantee is there that Xen writing the hypercall page has made
> its way correctly back to RAM by the time domU tries to execute the
> hypercall?
>
> ~Andrew

Sorry, it seems I didn't understand it?
Thanks,
Jinsong
Jan Beulich
2013-Oct-30 15:27 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
>>> On 30.10.13 at 16:21, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> Andrew Cooper wrote:
>> How about this:
>>
>> 1) Clear a page with ud2s
>> 2) Use the hypervisor msr to write a new hypercall page over this
>>    cleared page
>> 3) Immediately try to make a hypercall using this new page
>>
>> What guarantee is there that Xen writing the hypercall page has made
>> its way correctly back to RAM by the time domU tries to execute the
>> hypercall?
>
> Sorry, it seems I didn't understand it?

He just gave you an example for what I was telling you in an
abstract way.

Jan
Liu, Jinsong
2013-Oct-30 15:41 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
Jan Beulich wrote:
>>>> On 29.10.13 at 17:52, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
>> [...]
>
> That storm is a problem, indeed, but what you did isn't meaningful
> in any way: you just sampled a specific (presumably Linux) guest
> case, but code like this needs to cover the general case (i.e. without
> us knowing whether or when the guest might - temporarily or
> permanently - suppress caching).
>
> So saying that in this specific case guest memory isn't being touched
> doesn't allow us to draw any conclusions other than that perhaps we
> need to monitor whether guest memory is being accessed (by setting
> a flag in the HVM copy/clear routines), and gate the invocation of
> wbinvd based on that. It's clear though that this would still not
> cover speculative reads.
>
> Jan

OK, I will send out a protection patch, adding a flag that indicates guest
memory accesses, to prevent the interrupt storm -- though it seems not
elegant enough. Whenever the interrupt storm gets root-caused and fixed,
the protection flag can be removed.

Thanks,
Jinsong
Zhenzhong Duan
2013-Nov-04 08:49 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
On 2013-10-21 23:55, Liu, Jinsong wrote:
> [...]
> +             */
> +            u64 uc_pat =
> +                ((uint64_t)PAT_TYPE_UNCACHABLE)       |       /* PAT0 */
> +                ((uint64_t)PAT_TYPE_UNCACHABLE << 8)  |       /* PAT1 */
> +                ((uint64_t)PAT_TYPE_UNCACHABLE << 16) |       /* PAT2 */
> +                ((uint64_t)PAT_TYPE_UNCACHABLE << 24) |       /* PAT3 */
> +                ((uint64_t)PAT_TYPE_UNCACHABLE << 32) |       /* PAT4 */
> +                ((uint64_t)PAT_TYPE_UNCACHABLE << 40) |       /* PAT5 */
> +                ((uint64_t)PAT_TYPE_UNCACHABLE << 48) |       /* PAT6 */
> +                ((uint64_t)PAT_TYPE_UNCACHABLE << 56);        /* PAT7 */
> +
> +            vmx_get_guest_pat(v, pat);
> +            vmx_set_guest_pat(v, uc_pat);
> +
> +            wbinvd();               /* flush possibly polluted cache */
> +            hvm_asid_flush_vcpu(v); /* invalidate memory type cached in TLB */
> +            v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
> +        }
> +        else
> +        {
> +            v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
> +            vmx_set_guest_pat(v, *pat);
> +            hvm_asid_flush_vcpu(v); /* no need to flush cache */
> +        }
> +    }
> +}
> +
>  static void vmx_set_tsc_offset(struct vcpu *v, u64 offset)
>  {
>      vmx_vmcs_enter(v);
> @@ -1550,6 +1603,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
>      .msr_read_intercept   = vmx_msr_read_intercept,
>      .msr_write_intercept  = vmx_msr_write_intercept,
>      .invlpg_intercept     = vmx_invlpg_intercept,
> +    .handle_cd            = vmx_handle_cd,
>      .set_info_guest       = vmx_set_info_guest,
>      .set_rdtsc_exiting    = vmx_set_rdtsc_exiting,
>      .nhvm_vcpu_initialise = nvmx_vcpu_initialise,
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 5999779..ce20323 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -825,6 +825,16 @@ void domain_pause(struct domain *d)
>          vcpu_sleep_sync(v);
>  }
>
> +void domain_pause_nosync(struct domain *d)
> +{
> +    struct vcpu *v;
> +
> +    atomic_inc(&d->pause_count);
> +
> +    for_each_vcpu( d, v )
> +        vcpu_sleep_nosync(v);
> +}
> +
>  void domain_unpause(struct domain *d)
>  {
>      struct vcpu *v;
> diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
> index 8dd2b40..c9afb56 100644
> --- a/xen/include/asm-x86/hvm/hvm.h
> +++ b/xen/include/asm-x86/hvm/hvm.h
> @@ -156,6 +156,7 @@ struct hvm_function_table {
>      int (*msr_read_intercept)(unsigned int msr, uint64_t *msr_content);
>      int (*msr_write_intercept)(unsigned int msr, uint64_t msr_content);
>      void (*invlpg_intercept)(unsigned long vaddr);
> +    void (*handle_cd)(struct vcpu *v, unsigned long value);
>      void (*set_info_guest)(struct vcpu *v);
>      void (*set_rdtsc_exiting)(struct vcpu *v, bool_t);
>
> diff --git a/xen/include/asm-x86/hvm/support.h b/xen/include/asm-x86/hvm/support.h
> index 7ddc806..52aef1f 100644
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -130,6 +130,8 @@ void hvm_rdtsc_intercept(struct cpu_user_regs *regs);
>
>  int __must_check hvm_handle_xsetbv(u32 index, u64 new_bv);
>
> +void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value);
> +
>  /* These functions all return X86EMUL return codes. */
>  int hvm_set_efer(uint64_t value);
>  int hvm_set_cr0(unsigned long value);
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index 1765e18..82f522e 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -700,6 +700,7 @@ void vcpu_unblock(struct vcpu *v);
>  void vcpu_pause(struct vcpu *v);
>  void vcpu_pause_nosync(struct vcpu *v);
>  void domain_pause(struct domain *d);
> +void domain_pause_nosync(struct domain *d);
>  void vcpu_unpause(struct vcpu *v);
>  void domain_unpause(struct domain *d);
>  void domain_pause_by_systemcontroller(struct domain *d);
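[Editor's note: IA32_PAT holds eight 8-bit memory-type fields, and the hunk above fills every field with UC so that whatever PAT index a guest PTE selects, the effective memory type is uncacheable. A minimal standalone sketch of the same construction - not Xen code, and assuming the standard x86 PAT encoding in which UC is 0x00:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Standard x86 PAT encoding: memory type UC (uncacheable) is 0x00. */
    #define PAT_TYPE_UNCACHABLE 0x00ULL

    /* Replicate one memory type into all eight 8-bit IA32_PAT fields,
       mirroring the per-field ORs in the hunk above. */
    static uint64_t pat_fill(uint64_t type)
    {
        uint64_t pat = 0;

        for ( unsigned int i = 0; i < 8; i++ )
            pat |= type << (8 * i);

        return pat;
    }

    int main(void)
    {
        /* With UC encoded as 0x00 the all-UC value is simply 0; spelling
           out each field, as the patch does, documents the intent. */
        printf("uc_pat = 0x%016" PRIx64 "\n", pat_fill(PAT_TYPE_UNCACHABLE));
        return 0;
    }

Writing each field explicitly also keeps the code correct if a different memory type were ever substituted.]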
We have a problem w/ the new Xen 4.4 hypervisor - dom0 comes up but
hits the following bug and the box reboots. See stack trace below.

...
Starting portmap:                                          [  OK  ]
Starting NFS statd:                                        [  OK  ]
Starting RPC idmapd:                                       [  OK  ]
(XEN) Xen BUG at spinlock.c:48
(XEN) ----[ Xen-4.4-unstable  x86_64  debug=y  Not tainted ]----
(XEN) CPU:    2
(XEN) RIP:    e008:[<ffff82d080127355>] check_lock+0x3d/0x43
(XEN) RFLAGS: 0000000000010046   CONTEXT: hypervisor
(XEN) rax: 0000000000000000   rbx: ffff82d08028ab08   rcx: 0000000000000001
(XEN) rdx: 0000000000000000   rsi: 0000000000000001   rdi: ffff82d08028ab0c
(XEN) rbp: ffff83203fda7c50   rsp: ffff83203fda7c50   r8:  ffff82d0802dfc88
(XEN) r9:  00000000deadbeef   r10: ffff82d08023e120   r11: 0000000000000206
(XEN) r12: ffff83007f481ff0   r13: 0000000000000000   r14: 000ffff82cfffd5d
(XEN) r15: ffff82cfffd5e000   cr0: 0000000080050033   cr4: 00000000000026f0
(XEN) cr3: 000000c083a2a000   cr2: 000000000040e000
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: e010   cs: e008
(XEN) Xen stack trace from rsp=ffff83203fda7c50:
(XEN)    ffff83203fda7c68 ffff82d08012750d ffff83007f74e000 ffff83203fda7d08
(XEN)    ffff82d0801717bf ffff83203fda7ca8 010183203fda7d88 0000000000000163
(XEN)    000000013fda7d88 ffff832000000163 ffff83203fda7d28 0000000000000000
(XEN)    000000000c082433 01ffffffffffffff ffff83007f4839f8 ffff82d08026ff20
(XEN)    ffff83203fdcafd0 0000000000000000 00000000000002a2 ffff82d0802dfc90
(XEN)    0000000000000001 000000c082432000 00000000000002a2 ffff83203fda7d18
(XEN)    ffff82d080171e8a ffff83203fda7d58 ffff82d08019f5f5 0000000000000002
(XEN)    ffff82d0802dfc88 ffff83203fda7db8 00000000ffffffea 0000000000000001
(XEN)    0000000000000000 ffff83203fda7da8 ffff82d080112f8b 0000000000000002
(XEN)    ffff83203fdac6c8 0000000200000005 0000000000000000 0000000000000001
(XEN)    0000000000000000 ffff8807bc799e68 0000000000000246 ffff83203fda7ee8
(XEN)    ffff82d080113cce 0000000000000001 000000c082432000 0000000000000000
(XEN)    000000c085c1b000 0000000000000000 000000c085c1a000 0000000000000000
(XEN)    000000c085c19000 0000000000000000 000000c085c18000 0000000000000000
(XEN)    000000c085eb7000 0000000000000000 000000c085eb6000 0000000000000000
(XEN)    000000c085eb5000 0000000000000000 000000c082433000 000000c085eb4002
(XEN)    0000000008f59690 ffff82d0801274bf ffff83007f2db060 ffff83203fda7e88
(XEN)    ffff82d08018d8ee ffff83203fd9c330 ffff83203fda7f18 ffff83203fda7ef8
(XEN)    ffff82d080220550 0000000000000000 0000000008fff000 0000000000000044
(XEN)    0000000000000000 ffffffff8125bd07 ffff83007f2db000 ffff8807bc684000
(XEN) Xen call trace:
(XEN)    [<ffff82d080127355>] check_lock+0x3d/0x43
(XEN)    [<ffff82d08012750d>] _spin_lock+0x11/0x48
(XEN)    [<ffff82d0801717bf>] map_pages_to_xen+0xcab/0x1052
(XEN)    [<ffff82d080171e8a>] __set_fixmap+0x30/0x32
(XEN)    [<ffff82d08019f5f5>] machine_kexec_load+0x66/0xa1
(XEN)    [<ffff82d080112f8b>] kexec_load_unload_internal+0xb9/0x2cc
(XEN)    [<ffff82d080113cce>] do_kexec_op_internal+0x391/0x4b2
(XEN)    [<ffff82d080113e0d>] do_kexec_op+0xe/0x12
(XEN)    [<ffff82d080225c7b>] syscall_enter+0xeb/0x145
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 2:
(XEN) Xen BUG at spinlock.c:48
(XEN) ****************************************
(XEN)
(XEN) Reboot in five seconds...
(XEN) Resetting with ACPI MEMORY or I/O RESET_REG.
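[Editor's note: for context on the BUG site, check_lock() in Xen's spinlock.c is a debug-build consistency check that catches a lock being taken sometimes with IRQs enabled and sometimes with IRQs disabled - a mix that can deadlock when an interrupt handler spins on a lock already held by the context it interrupted. A rough sketch of that invariant follows; it is a hypothetical reconstruction for illustration, not the actual Xen code:

    #include <assert.h>
    #include <stdbool.h>

    /* Hypothetical reconstruction of the invariant behind the BUG above:
       the first acquisition records whether IRQs were disabled, and every
       later acquisition of the same lock must match. */
    struct lock_debug {
        signed char irq_safe; /* -1: unclassified, 0: IRQs on, 1: IRQs off */
    };

    static void check_lock_sketch(struct lock_debug *d, bool irqs_disabled)
    {
        signed char wants = irqs_disabled ? 1 : 0;

        if ( d->irq_safe == -1 )
            d->irq_safe = wants;       /* first use classifies the lock */

        assert(d->irq_safe == wants);  /* a mismatch is a BUG() in Xen */
    }

    int main(void)
    {
        struct lock_debug d = { .irq_safe = -1 };

        check_lock_sketch(&d, true);   /* classified as IRQ-safe */
        check_lock_sketch(&d, false);  /* inconsistent: assertion fires */
        return 0;
    }

In the trace above, the check fires on a lock taken inside map_pages_to_xen() when reached from the kexec load path.]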
Jan Beulich
2013-Nov-04 09:05 UTC
Re: kexec spin lock issue (was: Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling)
>>> On 04.11.13 at 09:49, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> We have a problem w/ the new Xen 4.4 hypervisor - dom0 comes up but
> hits the following bug and the box reboots. See stack trace below.

I'm sorry, but please put new problems in new threads. I'll see about
taking a look later today.

Also, from the stack trace it's clear that kexec is involved here, yet
you clearly don't need this for the test(s) you want to do in the
original context.

Jan

> [...]
Keir Fraser
2013-Nov-05 21:06 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
On 21/10/2013 16:55, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> [...]

Acked-by: Keir Fraser <keir@xen.org>
Jan Beulich
2013-Nov-06 12:30 UTC
Re: [PATCH 3/3 V3] XSA-60 security hole: cr0.cd handling
>>> On 04.11.13 at 09:49, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> We have a problem w/ the new Xen 4.4 hypervisor - dom0 comes up but
> hits the following bug and the box reboots. See stack trace below.
> [...]
> (XEN) Xen call trace:
> (XEN)    [<ffff82d080127355>] check_lock+0x3d/0x43
> (XEN)    [<ffff82d08012750d>] _spin_lock+0x11/0x48
> (XEN)    [<ffff82d0801717bf>] map_pages_to_xen+0xcab/0x1052
> (XEN)    [<ffff82d080171e8a>] __set_fixmap+0x30/0x32
> (XEN)    [<ffff82d08019f5f5>] machine_kexec_load+0x66/0xa1
> (XEN)    [<ffff82d080112f8b>] kexec_load_unload_internal+0xb9/0x2cc
> (XEN)    [<ffff82d080113cce>] do_kexec_op_internal+0x391/0x4b2
> (XEN)    [<ffff82d080113e0d>] do_kexec_op+0xe/0x12
> (XEN)    [<ffff82d080225c7b>] syscall_enter+0xeb/0x145
> [...]
> (XEN) Panic on CPU 2:
> (XEN) Xen BUG at spinlock.c:48

The patch at
http://lists.xenproject.org/archives/html/xen-devel/2013-11/msg00659.html
should help.

Jan