Boris Ostrovsky
2012-Jan-16 21:11 UTC
[PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
# HG changeset patch # User Boris Ostrovsky <boris.ostrovsky@amd.com> # Date 1326748089 -3600 # Node ID f157d40df95aa3b8becb970968d33f4eca6c7e75 # Parent 5b2676ac13218951698c49fa0350f2ac48220f3d x86/AMD: Add support for AMD''s OSVW feature in guests. In some cases guests should not provide workarounds for errata even when the physical processor is affected. For example, because of erratum 400 on family 10h processors a Linux guest will read an MSR (resulting in VMEXIT) before going to idle in order to avoid getting stuck in a non-C0 state. This is not necessary: HLT and IO instructions are intercepted and therefore there is no reason for erratum 400 workaround in the guest. This patch allows us to present a guest with certain errata as fixed, regardless of the state of actual hardware. Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com> diff -r 5b2676ac1321 -r f157d40df95a tools/libxc/xc_cpuid_x86.c --- a/tools/libxc/xc_cpuid_x86.c Mon Jan 09 16:01:44 2012 +0100 +++ b/tools/libxc/xc_cpuid_x86.c Mon Jan 16 22:08:09 2012 +0100 @@ -108,6 +108,7 @@ static void amd_xc_cpuid_policy( bitmaskof(X86_FEATURE_SSE4A) | bitmaskof(X86_FEATURE_MISALIGNSSE) | bitmaskof(X86_FEATURE_3DNOWPREFETCH) | + bitmaskof(X86_FEATURE_OSVW) | bitmaskof(X86_FEATURE_XOP) | bitmaskof(X86_FEATURE_FMA4) | bitmaskof(X86_FEATURE_TBM) | @@ -524,7 +525,6 @@ static void xc_cpuid_pv_policy( clear_bit(X86_FEATURE_RDTSCP, regs[3]); clear_bit(X86_FEATURE_SVM, regs[2]); - clear_bit(X86_FEATURE_OSVW, regs[2]); clear_bit(X86_FEATURE_IBS, regs[2]); clear_bit(X86_FEATURE_SKINIT, regs[2]); clear_bit(X86_FEATURE_WDT, regs[2]); diff -r 5b2676ac1321 -r f157d40df95a xen/arch/x86/cpu/amd.c --- a/xen/arch/x86/cpu/amd.c Mon Jan 09 16:01:44 2012 +0100 +++ b/xen/arch/x86/cpu/amd.c Mon Jan 16 22:08:09 2012 +0100 @@ -32,6 +32,13 @@ static char opt_famrev[14]; string_param("cpuid_mask_cpu", opt_famrev); +/* + * Set osvw_len to higher number when updated Revision Guides + * are published and we know what the new status 
bits are + */ +static uint64_t osvw_length = 4, osvw_status; +static DEFINE_SPINLOCK(amd_lock); + static inline void wrmsr_amd(unsigned int index, unsigned int lo, unsigned int hi) { @@ -182,6 +189,35 @@ static void __devinit set_cpuidmask(cons } } +static void amd_guest_osvw_init(struct vcpu *vcpu) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + + /* + * Guests should see errata 400 and 415 as fixed (assuming that + * HLT and IO instructions are intercepted). + */ + vcpu->arch.amd.osvw.length = (osvw_length >= 3) ? (osvw_length) : 3; + vcpu->arch.amd.osvw.status = osvw_status & ~(6ULL); + + /* + * By increasing VCPU''s osvw.length to 3 we are telling the guest that + * all osvw.status bits inside that length, including bit 0 (which is + * reserved for erratum 298), are valid. However, if host processor''s + * osvw_len is 0 then osvw_status[0] carries no information. We need to + * be conservative here and therefore we tell the guest that erratum 298 + * is present (because we really don''t know). + */ + if (osvw_length == 0 && boot_cpu_data.x86 == 0x10) + vcpu->arch.amd.osvw.status |= 1; +} + +void amd_vcpu_initialise(struct vcpu *vcpu) +{ + amd_guest_osvw_init(vcpu); +} + /* * Check for the presence of an AMD erratum. Arguments are defined in amd.h * for each known erratum. Return 1 if erratum is found. @@ -512,6 +548,30 @@ static void __devinit init_amd(struct cp set_cpuidmask(c); check_syscfg_dram_mod_en(); + + /* + * Get OSVW bits. If bits are not the same on different processors then + * choose the worst case (i.e. if erratum is present on one processor and + * not on another assume that the erratum is present everywhere). 
+ */ + if (test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability)) { + uint64_t len, status; + + if (rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) || + rdmsr_safe(MSR_AMD_OSVW_STATUS, status)) + len = status = 0; + + spin_lock(&amd_lock); + + if (len < osvw_length) + osvw_length = len; + + osvw_status |= status; + osvw_status &= (1ULL << osvw_length) - 1; + + spin_unlock(&amd_lock); + } else + osvw_length = osvw_status = 0; } static struct cpu_dev amd_cpu_dev __cpuinitdata = { diff -r 5b2676ac1321 -r f157d40df95a xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Mon Jan 09 16:01:44 2012 +0100 +++ b/xen/arch/x86/domain.c Mon Jan 16 22:08:09 2012 +0100 @@ -422,6 +422,10 @@ int vcpu_initialise(struct vcpu *v) if ( (rc = vcpu_init_fpu(v)) != 0 ) return rc; + /* Vendor-specific initialization */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + amd_vcpu_initialise(v); + if ( is_hvm_domain(d) ) { rc = hvm_vcpu_initialise(v); diff -r 5b2676ac1321 -r f157d40df95a xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c Mon Jan 09 16:01:44 2012 +0100 +++ b/xen/arch/x86/hvm/svm/svm.c Mon Jan 16 22:08:09 2012 +0100 @@ -1044,6 +1044,27 @@ static void svm_init_erratum_383(struct } } +static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, uint read) +{ + uint eax, ebx, ecx, edx; + + /* Guest OSVW support */ + hvm_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (!test_bit((X86_FEATURE_OSVW & 31), &ecx)) + return -1; + + if (read) { + if (msr == MSR_AMD_OSVW_ID_LENGTH) + *val = v->arch.amd.osvw.length; + else + *val = v->arch.amd.osvw.status; + } + /* Writes are ignored */ + + return 0; +} + + static int svm_cpu_up(void) { uint64_t msr_content; @@ -1385,6 +1406,13 @@ static int svm_msr_read_intercept(unsign vpmu_do_rdmsr(msr, msr_content); break; + case MSR_AMD_OSVW_ID_LENGTH: + case MSR_AMD_OSVW_STATUS: + ret = svm_handle_osvw(v, msr, msr_content, 1); + if (ret < 0) + goto gpf; + break; + default: ret = nsvm_rdmsr(v, msr, msr_content); if ( ret < 0 ) @@ -1509,6 
+1537,13 @@ static int svm_msr_write_intercept(unsig */ break; + case MSR_AMD_OSVW_ID_LENGTH: + case MSR_AMD_OSVW_STATUS: + ret = svm_handle_osvw(v, msr, &msr_content, 0); + if (ret < 0) + goto gpf; + break; + default: ret = nsvm_wrmsr(v, msr, msr_content); if ( ret < 0 ) diff -r 5b2676ac1321 -r f157d40df95a xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Mon Jan 09 16:01:44 2012 +0100 +++ b/xen/arch/x86/traps.c Mon Jan 16 22:08:09 2012 +0100 @@ -71,6 +71,7 @@ #include <asm/apic.h> #include <asm/mc146818rtc.h> #include <asm/hpet.h> +#include <asm/amd.h> #include <public/arch-x86/cpuid.h> #include <xsm/xsm.h> @@ -889,7 +890,6 @@ static void pv_cpuid(struct cpu_user_reg __clear_bit(X86_FEATURE_SVM % 32, &c); if ( !cpu_has_apic ) __clear_bit(X86_FEATURE_EXTAPIC % 32, &c); - __clear_bit(X86_FEATURE_OSVW % 32, &c); __clear_bit(X86_FEATURE_IBS % 32, &c); __clear_bit(X86_FEATURE_SKINIT % 32, &c); __clear_bit(X86_FEATURE_WDT % 32, &c); @@ -2542,6 +2542,15 @@ static int emulate_privileged_op(struct if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) goto fail; break; + case MSR_AMD_OSVW_ID_LENGTH: + case MSR_AMD_OSVW_STATUS: + if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) { + if (!boot_cpu_has(X86_FEATURE_OSVW)) + goto fail; + else + break; /* Writes are ignored */ + } + /* Fall through to default case */ default: if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) ) break; @@ -2573,6 +2582,7 @@ static int emulate_privileged_op(struct break; case 0x32: /* RDMSR */ + switch ( (u32)regs->ecx ) { #ifdef CONFIG_X86_64 @@ -2632,6 +2642,23 @@ static int emulate_privileged_op(struct regs->eax = (uint32_t)msr_content; regs->edx = (uint32_t)(msr_content >> 32); break; + case MSR_AMD_OSVW_ID_LENGTH: + case MSR_AMD_OSVW_STATUS: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (!boot_cpu_has(X86_FEATURE_OSVW)) + goto fail; + else { + if ((u32)regs->ecx == MSR_AMD_OSVW_ID_LENGTH) + msr_content = v->arch.amd.osvw.length; + else + msr_content = v->arch.amd.osvw.status; + + regs->eax 
= (uint32_t)msr_content; + regs->edx = (uint32_t)(msr_content >> 32); + } + } else + goto rdmsr_normal; + break; default: if ( rdmsr_hypervisor_regs(regs->ecx, &val) ) { diff -r 5b2676ac1321 -r f157d40df95a xen/include/asm-x86/amd.h --- a/xen/include/asm-x86/amd.h Mon Jan 09 16:01:44 2012 +0100 +++ b/xen/include/asm-x86/amd.h Mon Jan 16 22:08:09 2012 +0100 @@ -140,7 +140,17 @@ AMD_MODEL_RANGE(0x11, 0x0, 0x0, 0xff, 0xf), \ AMD_MODEL_RANGE(0x12, 0x0, 0x0, 0xff, 0xf)) +struct vcpu_amd { + + /* OSVW MSRs */ + struct { + u64 length; + u64 status; + } osvw; +}; + struct cpuinfo_x86; +void amd_vcpu_initialise(struct vcpu *); int cpu_has_amd_erratum(const struct cpuinfo_x86 *, int, ...); #ifdef __x86_64__ diff -r 5b2676ac1321 -r f157d40df95a xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Mon Jan 09 16:01:44 2012 +0100 +++ b/xen/include/asm-x86/domain.h Mon Jan 16 22:08:09 2012 +0100 @@ -8,6 +8,7 @@ #include <asm/hvm/domain.h> #include <asm/e820.h> #include <asm/mce.h> +#include <asm/amd.h> #include <public/vcpu.h> #define has_32bit_shinfo(d) ((d)->arch.has_32bit_shinfo) @@ -495,6 +496,11 @@ struct arch_vcpu uint32_t gdbsx_vcpu_event; + /* Vendor-specific data */ + union { + struct vcpu_amd amd; + }; + /* A secondary copy of the vcpu time info. */ XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
Jan Beulich
2012-Jan-17 08:26 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
>>> On 16.01.12 at 22:11, Boris Ostrovsky <boris.ostrovsky@amd.com> wrote: > --- a/tools/libxc/xc_cpuid_x86.c Mon Jan 09 16:01:44 2012 +0100 > +++ b/tools/libxc/xc_cpuid_x86.c Mon Jan 16 22:08:09 2012 +0100 > @@ -108,6 +108,7 @@ static void amd_xc_cpuid_policy( > bitmaskof(X86_FEATURE_SSE4A) | > bitmaskof(X86_FEATURE_MISALIGNSSE) | > bitmaskof(X86_FEATURE_3DNOWPREFETCH) | > + bitmaskof(X86_FEATURE_OSVW) |Indentation. Also, is this really meaningful to PV guests? And valid for pre-Fam10?> bitmaskof(X86_FEATURE_XOP) | > bitmaskof(X86_FEATURE_FMA4) | > bitmaskof(X86_FEATURE_TBM) | > --- a/xen/arch/x86/cpu/amd.c Mon Jan 09 16:01:44 2012 +0100 > +++ b/xen/arch/x86/cpu/amd.c Mon Jan 16 22:08:09 2012 +0100 > @@ -32,6 +32,13 @@ > static char opt_famrev[14]; > string_param("cpuid_mask_cpu", opt_famrev); > > +/* > + * Set osvw_len to higher number when updated Revision Guides > + * are published and we know what the new status bits are > + */This is ugly, as it requires permanent attention. The value to start with should really be 64 (beyond which other code adjustments are going to be necessary anyway).> +static uint64_t osvw_length = 4, osvw_status; > +static DEFINE_SPINLOCK(amd_lock); > + > static inline void wrmsr_amd(unsigned int index, unsigned int lo, > unsigned int hi) > { > @@ -182,6 +189,35 @@ static void __devinit set_cpuidmask(cons > } > } > > +static void amd_guest_osvw_init(struct vcpu *vcpu) > +{ > + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) > + return;Shouldn''t we bail here for pre-Fam10 CPUs too? Further, as asked above already - is all this really meaningful to PV guests? Otherwise the code would better go somewhere under xen/arch/x86/hvm/svm/ ... 
Also, throughout this file indentation needs to be changed to Linux style (tabs instead of spaces), unless you want to contribute a patch to convert the whole file to Xen style (in which case you''d still need to make adjustments here).> + > + /* > + * Guests should see errata 400 and 415 as fixed (assuming that > + * HLT and IO instructions are intercepted). > + */ > + vcpu->arch.amd.osvw.length = (osvw_length >= 3) ? (osvw_length) : 3;An expression consisting of just an identifier for sure doesn''t need parentheses.> + vcpu->arch.amd.osvw.status = osvw_status & ~(6ULL); > + > + /* > + * By increasing VCPU''s osvw.length to 3 we are telling the guest that > + * all osvw.status bits inside that length, including bit 0 (which is > + * reserved for erratum 298), are valid. However, if host processor''s > + * osvw_len is 0 then osvw_status[0] carries no information. We need to > + * be conservative here and therefore we tell the guest that erratum 298 > + * is present (because we really don''t know). > + */ > + if (osvw_length == 0 && boot_cpu_data.x86 == 0x10)Why do you check the family here? If osvw_length can only ever be zero on Fam10, then the first check is sufficient. If osvw_length can be zero on other than Fam10 (no matter whether we bail early older CPUs), then the check is actually wrong.> + vcpu->arch.amd.osvw.status |= 1; > +} > + > +void amd_vcpu_initialise(struct vcpu *vcpu) > +{ > + amd_guest_osvw_init(vcpu); > +} > + > /* > * Check for the presence of an AMD erratum. Arguments are defined in amd.h > > * for each known erratum. Return 1 if erratum is found. > @@ -512,6 +548,30 @@ static void __devinit init_amd(struct cp > set_cpuidmask(c); > > check_syscfg_dram_mod_en(); > + > + /* > + * Get OSVW bits. If bits are not the same on different processors then > + * choose the worst case (i.e. if erratum is present on one processor and > + * not on another assume that the erratum is present everywhere). 
> + */ > + if (test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability)) { > + uint64_t len, status; > + > + if (rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) || > + rdmsr_safe(MSR_AMD_OSVW_STATUS, status)) > + len = status = 0; > + > + spin_lock(&amd_lock);What is the locking here supposed to protect against? The AP call tree is smp_callin() -> identify_cpu() -> init_amd(), which cannot race with anything (as the initiating processor is waiting for cpu_state to change to CPU_STATE_CALLIN).> + > + if (len < osvw_length) > + osvw_length = len; > + > + osvw_status |= status; > + osvw_status &= (1ULL << osvw_length) - 1; > + > + spin_unlock(&amd_lock); > + } else > + osvw_length = osvw_status = 0; > } > > static struct cpu_dev amd_cpu_dev __cpuinitdata = { > --- a/xen/arch/x86/domain.c Mon Jan 09 16:01:44 2012 +0100 > +++ b/xen/arch/x86/domain.c Mon Jan 16 22:08:09 2012 +0100 > @@ -422,6 +422,10 @@ int vcpu_initialise(struct vcpu *v) > if ( (rc = vcpu_init_fpu(v)) != 0 ) > return rc; > > + /* Vendor-specific initialization */ > + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)If you check the vendor here, you don''t need to anywhere in the descendant functions.> + amd_vcpu_initialise(v); > + > if ( is_hvm_domain(d) ) > { > rc = hvm_vcpu_initialise(v); > --- a/xen/arch/x86/hvm/svm/svm.c Mon Jan 09 16:01:44 2012 +0100 > +++ b/xen/arch/x86/hvm/svm/svm.c Mon Jan 16 22:08:09 2012 +0100 > @@ -1044,6 +1044,27 @@ static void svm_init_erratum_383(struct > } > } > > +static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, uint read) > +{ > + uint eax, ebx, ecx, edx; > + > + /* Guest OSVW support */ > + hvm_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); > + if (!test_bit((X86_FEATURE_OSVW & 31), &ecx))Formatting of changes to this file should be changed to Xen style.> + return -1; > + > + if (read) { > + if (msr == MSR_AMD_OSVW_ID_LENGTH) > + *val = v->arch.amd.osvw.length; > + else > + *val = v->arch.amd.osvw.status; > + } > + /* Writes are ignored */ > + > + return 0; > 
+} > + > + > static int svm_cpu_up(void) > { > uint64_t msr_content; > --- a/xen/arch/x86/traps.c Mon Jan 09 16:01:44 2012 +0100 > +++ b/xen/arch/x86/traps.c Mon Jan 16 22:08:09 2012 +0100 > @@ -2542,6 +2542,15 @@ static int emulate_privileged_op(struct > if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) > goto fail; > break; > + case MSR_AMD_OSVW_ID_LENGTH: > + case MSR_AMD_OSVW_STATUS: > + if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) { > + if (!boot_cpu_has(X86_FEATURE_OSVW)) > + goto fail; > + elseBogus "else" after a "goto". And I wonder, provided there is some point in doing all this for PV guests in the first place, whether this shouldn''t be kept getting handled by the default case.> + break; /* Writes are ignored */ > + } > + /* Fall through to default case */ > default: > if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) ) > break; > @@ -2573,6 +2582,7 @@ static int emulate_privileged_op(struct > break; > > case 0x32: /* RDMSR */ > +Stray addition of a newline.> switch ( (u32)regs->ecx ) > { > #ifdef CONFIG_X86_64 > @@ -2632,6 +2642,23 @@ static int emulate_privileged_op(struct > regs->eax = (uint32_t)msr_content; > regs->edx = (uint32_t)(msr_content >> 32); > break; > + case MSR_AMD_OSVW_ID_LENGTH: > + case MSR_AMD_OSVW_STATUS: > + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { > + if (!boot_cpu_has(X86_FEATURE_OSVW)) > + goto fail; > + else {Bogus "else" after a "goto". Jan> + if ((u32)regs->ecx == MSR_AMD_OSVW_ID_LENGTH) > + msr_content = v->arch.amd.osvw.length; > + else > + msr_content = v->arch.amd.osvw.status; > + > + regs->eax = (uint32_t)msr_content; > + regs->edx = (uint32_t)(msr_content >> 32); > + } > + } else > + goto rdmsr_normal; > + break; > default: > if ( rdmsr_hypervisor_regs(regs->ecx, &val) ) > {
Boris Ostrovsky
2012-Jan-17 17:54 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
Responses inline. On 01/17/12 03:26, Jan Beulich wrote:>>>> On 16.01.12 at 22:11, Boris Ostrovsky<boris.ostrovsky@amd.com> wrote: >> --- a/tools/libxc/xc_cpuid_x86.c Mon Jan 09 16:01:44 2012 +0100 >> +++ b/tools/libxc/xc_cpuid_x86.c Mon Jan 16 22:08:09 2012 +0100 >> @@ -108,6 +108,7 @@ static void amd_xc_cpuid_policy( >> bitmaskof(X86_FEATURE_SSE4A) | >> bitmaskof(X86_FEATURE_MISALIGNSSE) | >> bitmaskof(X86_FEATURE_3DNOWPREFETCH) | >> + bitmaskof(X86_FEATURE_OSVW) | > > Indentation. > > Also, is this really meaningful to PV guests?Does amd_xc_cpuid_policy() get called for PV guests? I thought this is HVM path only. xc_cpuid_pv_policy() is where clear_bit(X86_FEATURE_OSVW, regs[2]) was removed to handle PV. I believe this (i.e. OSVW changes) is meaningful for PV. Taking erratum 400 as an example -- we don''t need a Linux PV guest reading an MSR before going to idle (in amd_e400_idle()).> And valid for pre-Fam10?No, it indeed shouldn''t be set in those cases. I could set the bit conditionally, based on host family but the trouble is I don''t necessarily know what family the guest will be. Or is this known at this point?> >> bitmaskof(X86_FEATURE_XOP) | >> bitmaskof(X86_FEATURE_FMA4) | >> bitmaskof(X86_FEATURE_TBM) | >> --- a/xen/arch/x86/cpu/amd.c Mon Jan 09 16:01:44 2012 +0100 >> +++ b/xen/arch/x86/cpu/amd.c Mon Jan 16 22:08:09 2012 +0100 >> @@ -32,6 +32,13 @@ >> static char opt_famrev[14]; >> string_param("cpuid_mask_cpu", opt_famrev); >> >> +/* >> + * Set osvw_len to higher number when updated Revision Guides >> + * are published and we know what the new status bits are >> + */ > > This is ugly, as it requires permanent attention. The value to start > with should really be 64 (beyond which other code adjustments are > going to be necessary anyway).I went back and forth on this. 
The reason I settled on 4 is because we obviously don''t know what errata the higher bits will cover and because it is (at least theoretically) possible that a guest shouldn''t see the same status bits as the host. But maybe code maintenance is more burden than it''s worth?> >> +static uint64_t osvw_length = 4, osvw_status; >> +static DEFINE_SPINLOCK(amd_lock); >> + >> static inline void wrmsr_amd(unsigned int index, unsigned int lo, >> unsigned int hi) >> { >> @@ -182,6 +189,35 @@ static void __devinit set_cpuidmask(cons >> } >> } >> >> +static void amd_guest_osvw_init(struct vcpu *vcpu) >> +{ >> + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) >> + return; > > Shouldn''t we bail here for pre-Fam10 CPUs too?As you noted below the vendor check should be removed anyway since this is being called from under such a check already. As for family --- I think I need to solve how X86_FEATURE_OSVW is set and that will take care of whether anything that gets initialized in this routine is ever used.> > Further, as asked above already - is all this really meaningful to PV > guests? Otherwise the code would better go somewhere under > xen/arch/x86/hvm/svm/ ... > > Also, throughout this file indentation needs to be changed to Linux > style (tabs instead of spaces), unless you want to contribute a patch > to convert the whole file to Xen style (in which case you''d still need > to make adjustments here). > >> + >> + /* >> + * Guests should see errata 400 and 415 as fixed (assuming that >> + * HLT and IO instructions are intercepted). >> + */ >> + vcpu->arch.amd.osvw.length = (osvw_length>= 3) ? (osvw_length) : 3; > > An expression consisting of just an identifier for sure doesn''t need > parentheses. > >> + vcpu->arch.amd.osvw.status = osvw_status& ~(6ULL); >> + >> + /* >> + * By increasing VCPU''s osvw.length to 3 we are telling the guest that >> + * all osvw.status bits inside that length, including bit 0 (which is >> + * reserved for erratum 298), are valid. 
However, if host processor''s >> + * osvw_len is 0 then osvw_status[0] carries no information. We need to >> + * be conservative here and therefore we tell the guest that erratum 298 >> + * is present (because we really don''t know). >> + */ >> + if (osvw_length == 0&& boot_cpu_data.x86 == 0x10) > > Why do you check the family here? If osvw_length can only ever be > zero on Fam10, then the first check is sufficient. If osvw_length can > be zero on other than Fam10 (no matter whether we bail early older > CPUs), then the check is actually wrong.10h is the only family affected by this erratum.> >> + vcpu->arch.amd.osvw.status |= 1; >> +} >> + >> +void amd_vcpu_initialise(struct vcpu *vcpu) >> +{ >> + amd_guest_osvw_init(vcpu); >> +} >> + >> /* >> * Check for the presence of an AMD erratum. Arguments are defined in amd.h >> >> * for each known erratum. Return 1 if erratum is found. >> @@ -512,6 +548,30 @@ static void __devinit init_amd(struct cp >> set_cpuidmask(c); >> >> check_syscfg_dram_mod_en(); >> + >> + /* >> + * Get OSVW bits. If bits are not the same on different processors then >> + * choose the worst case (i.e. if erratum is present on one processor and >> + * not on another assume that the erratum is present everywhere). >> + */ >> + if (test_bit(X86_FEATURE_OSVW,&boot_cpu_data.x86_capability)) { >> + uint64_t len, status; >> + >> + if (rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) || >> + rdmsr_safe(MSR_AMD_OSVW_STATUS, status)) >> + len = status = 0; >> + >> + spin_lock(&amd_lock); > > What is the locking here supposed to protect against? The AP call tree > is smp_callin() -> identify_cpu() -> init_amd(), which cannot race with > anything (as the initiating processor is waiting for cpu_state to change > to CPU_STATE_CALLIN). 
> >> + >> + if (len< osvw_length) >> + osvw_length = len; >> + >> + osvw_status |= status; >> + osvw_status&= (1ULL<< osvw_length) - 1; >> + >> + spin_unlock(&amd_lock); >> + } else >> + osvw_length = osvw_status = 0; >> } >> >> static struct cpu_dev amd_cpu_dev __cpuinitdata = { >> --- a/xen/arch/x86/domain.c Mon Jan 09 16:01:44 2012 +0100 >> +++ b/xen/arch/x86/domain.c Mon Jan 16 22:08:09 2012 +0100 >> @@ -422,6 +422,10 @@ int vcpu_initialise(struct vcpu *v) >> if ( (rc = vcpu_init_fpu(v)) != 0 ) >> return rc; >> >> + /* Vendor-specific initialization */ >> + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) > > If you check the vendor here, you don''t need to anywhere in the > descendant functions. > >> + amd_vcpu_initialise(v); >> + >> if ( is_hvm_domain(d) ) >> { >> rc = hvm_vcpu_initialise(v); >> --- a/xen/arch/x86/hvm/svm/svm.c Mon Jan 09 16:01:44 2012 +0100 >> +++ b/xen/arch/x86/hvm/svm/svm.c Mon Jan 16 22:08:09 2012 +0100 >> @@ -1044,6 +1044,27 @@ static void svm_init_erratum_383(struct >> } >> } >> >> +static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, uint read) >> +{ >> + uint eax, ebx, ecx, edx; >> + >> + /* Guest OSVW support */ >> + hvm_cpuid(0x80000001,&eax,&ebx,&ecx,&edx); >> + if (!test_bit((X86_FEATURE_OSVW& 31),&ecx)) > > Formatting of changes to this file should be changed to Xen style. 
> >> + return -1; >> + >> + if (read) { >> + if (msr == MSR_AMD_OSVW_ID_LENGTH) >> + *val = v->arch.amd.osvw.length; >> + else >> + *val = v->arch.amd.osvw.status; >> + } >> + /* Writes are ignored */ >> + >> + return 0; >> +} >> + >> + >> static int svm_cpu_up(void) >> { >> uint64_t msr_content; >> --- a/xen/arch/x86/traps.c Mon Jan 09 16:01:44 2012 +0100 >> +++ b/xen/arch/x86/traps.c Mon Jan 16 22:08:09 2012 +0100 >> @@ -2542,6 +2542,15 @@ static int emulate_privileged_op(struct >> if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) >> goto fail; >> break; >> + case MSR_AMD_OSVW_ID_LENGTH: >> + case MSR_AMD_OSVW_STATUS: >> + if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) { >> + if (!boot_cpu_has(X86_FEATURE_OSVW)) >> + goto fail; >> + else > > Bogus "else" after a "goto". And I wonder, provided there is some > point in doing all this for PV guests in the first place, whether this > shouldn''t be kept getting handled by the default case.If we go into default case we will emit "Domain attempted WRMSR ..." message in the log. I was trying to avoid that since writing to these registers is not really an error, it just gets ignored. I''ll fix the rest. Thanks for review. -boris> >> + break; /* Writes are ignored */ >> + } >> + /* Fall through to default case */ >> default: >> if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) ) >> break; >> @@ -2573,6 +2582,7 @@ static int emulate_privileged_op(struct >> break; >> >> case 0x32: /* RDMSR */ >> + > > Stray addition of a newline. > >> switch ( (u32)regs->ecx ) >> { >> #ifdef CONFIG_X86_64 >> @@ -2632,6 +2642,23 @@ static int emulate_privileged_op(struct >> regs->eax = (uint32_t)msr_content; >> regs->edx = (uint32_t)(msr_content>> 32); >> break; >> + case MSR_AMD_OSVW_ID_LENGTH: >> + case MSR_AMD_OSVW_STATUS: >> + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { >> + if (!boot_cpu_has(X86_FEATURE_OSVW)) >> + goto fail; >> + else { > > Bogus "else" after a "goto". 
> > Jan > >> + if ((u32)regs->ecx == MSR_AMD_OSVW_ID_LENGTH) >> + msr_content = v->arch.amd.osvw.length; >> + else >> + msr_content = v->arch.amd.osvw.status; >> + >> + regs->eax = (uint32_t)msr_content; >> + regs->edx = (uint32_t)(msr_content>> 32); >> + } >> + } else >> + goto rdmsr_normal; >> + break; >> default: >> if ( rdmsr_hypervisor_regs(regs->ecx,&val) ) >> { > >
Jan Beulich
2012-Jan-18 09:50 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
>>> On 17.01.12 at 18:54, Boris Ostrovsky <boris.ostrovsky@amd.com> wrote: > On 01/17/12 03:26, Jan Beulich wrote: >>>>> On 16.01.12 at 22:11, Boris Ostrovsky<boris.ostrovsky@amd.com> wrote: >>> --- a/tools/libxc/xc_cpuid_x86.c Mon Jan 09 16:01:44 2012 +0100 >>> +++ b/tools/libxc/xc_cpuid_x86.c Mon Jan 16 22:08:09 2012 +0100 >>> @@ -108,6 +108,7 @@ static void amd_xc_cpuid_policy( >>> bitmaskof(X86_FEATURE_SSE4A) | >>> bitmaskof(X86_FEATURE_MISALIGNSSE) | >>> bitmaskof(X86_FEATURE_3DNOWPREFETCH) | >>> + bitmaskof(X86_FEATURE_OSVW) | >> >> Indentation. >> >> Also, is this really meaningful to PV guests? > > Does amd_xc_cpuid_policy() get called for PV guests? I thought this is > HVM path only.In that case I simply misplaced the comment.> xc_cpuid_pv_policy() is where clear_bit(X86_FEATURE_OSVW, regs[2]) was > removed to handle PV. > > I believe this (i.e. OSVW changes) is meaningful for PV. Taking erratum > 400 as an example -- we don''t need a Linux PV guest reading an MSR > before going to idle (in amd_e400_idle()).It is bogus in the first place if a pv guest does so - after all, not doing stuff like this is the nature of being pv.>> And valid for pre-Fam10? > > No, it indeed shouldn''t be set in those cases. > > I could set the bit conditionally, based on host family but the trouble > is I don''t necessarily know what family the guest will be. Or is this > known at this point?I think basing this on the host family is a good first shot. I would hope others could comment on how to properly deal with a dependency like this...>>> --- a/xen/arch/x86/cpu/amd.c Mon Jan 09 16:01:44 2012 +0100 >>> +++ b/xen/arch/x86/cpu/amd.c Mon Jan 16 22:08:09 2012 +0100 >>> @@ -32,6 +32,13 @@ >>> static char opt_famrev[14]; >>> string_param("cpuid_mask_cpu", opt_famrev); >>> >>> +/* >>> + * Set osvw_len to higher number when updated Revision Guides >>> + * are published and we know what the new status bits are >>> + */ >> >> This is ugly, as it requires permanent attention. 
The value to start >> with should really be 64 (beyond which other code adjustments are >> going to be necessary anyway). > > I went back and forth on this. The reason I settled on 4 is because we > obviously don''t know what errata the higher bits will cover and because > it is (at least theoretically) possible that a guest shouldn''t see the > same status bits as the host.I think allowing the guest to know of (not) worked around errata is even desirable (aiui in the worst case it''ll apply a redundant fix).>>> + vcpu->arch.amd.osvw.status = osvw_status& ~(6ULL); >>> + >>> + /* >>> + * By increasing VCPU''s osvw.length to 3 we are telling the guest that >>> + * all osvw.status bits inside that length, including bit 0 (which is >>> + * reserved for erratum 298), are valid. However, if host processor''s >>> + * osvw_len is 0 then osvw_status[0] carries no information. We need to >>> + * be conservative here and therefore we tell the guest that erratum > 298 >>> + * is present (because we really don''t know). >>> + */ >>> + if (osvw_length == 0&& boot_cpu_data.x86 == 0x10) >> >> Why do you check the family here? If osvw_length can only ever be >> zero on Fam10, then the first check is sufficient. If osvw_length can >> be zero on other than Fam10 (no matter whether we bail early older >> CPUs), then the check is actually wrong. > > 10h is the only family affected by this erratum.What is "this erratum" here? 
My comment was to suggest that either of the two conditions can likely be eliminated for being redundant.>>> --- a/xen/arch/x86/traps.c Mon Jan 09 16:01:44 2012 +0100 >>> +++ b/xen/arch/x86/traps.c Mon Jan 16 22:08:09 2012 +0100 >>> @@ -2542,6 +2542,15 @@ static int emulate_privileged_op(struct >>> if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) >>> goto fail; >>> break; >>> + case MSR_AMD_OSVW_ID_LENGTH: >>> + case MSR_AMD_OSVW_STATUS: >>> + if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) { >>> + if (!boot_cpu_has(X86_FEATURE_OSVW)) >>> + goto fail; >>> + else >> >> Bogus "else" after a "goto". And I wonder, provided there is some >> point in doing all this for PV guests in the first place, whether this >> shouldn''t be kept getting handled by the default case. > > > If we go into default case we will emit "Domain attempted WRMSR ..." > message in the log. I was trying to avoid that since writing to these > registers is not really an error, it just gets ignored.The documentation says that both MSRs are read/write, so issuing a message here appears to be The Right Thing (tm). Kernels shouldn''t be doing this in general, and for PV kernels imo it is definitely a bug. Jan
Boris Ostrovsky
2012-Jan-18 18:26 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
On 01/18/12 04:50, Jan Beulich wrote:>>>> On 17.01.12 at 18:54, Boris Ostrovsky<boris.ostrovsky@amd.com> wrote: >> On 01/17/12 03:26, Jan Beulich wrote: >>>>>> On 16.01.12 at 22:11, Boris Ostrovsky<boris.ostrovsky@amd.com> wrote: >>>> --- a/tools/libxc/xc_cpuid_x86.c Mon Jan 09 16:01:44 2012 +0100 >>>> +++ b/tools/libxc/xc_cpuid_x86.c Mon Jan 16 22:08:09 2012 +0100 >>>> @@ -108,6 +108,7 @@ static void amd_xc_cpuid_policy( >>>> bitmaskof(X86_FEATURE_SSE4A) | >>>> bitmaskof(X86_FEATURE_MISALIGNSSE) | >>>> bitmaskof(X86_FEATURE_3DNOWPREFETCH) | >>>> + bitmaskof(X86_FEATURE_OSVW) | >>> >>> Indentation. >>> >>> Also, is this really meaningful to PV guests? >> >> Does amd_xc_cpuid_policy() get called for PV guests? I thought this is >> HVM path only. > > In that case I simply misplaced the comment. > >> xc_cpuid_pv_policy() is where clear_bit(X86_FEATURE_OSVW, regs[2]) was >> removed to handle PV. >> >> I believe this (i.e. OSVW changes) is meaningful for PV. Taking erratum >> 400 as an example -- we don''t need a Linux PV guest reading an MSR >> before going to idle (in amd_e400_idle()). > > It is bogus in the first place if a pv guest does so - after all, not doing > stuff like this is the nature of being pv.It was actually a bad example --- the guest is not using amd_e400_idle() and so there are no extra MSR accesses. However, without this change OSVW bit will not show up in the guest''s CPUID and I think guests should be able to see it. One could argue whether or not we should mask off status bits for the two errata (400 and 415) since they are not currently used; I''d mask them off just to be consistent with HVM, it won''t affect anything.>>>> + vcpu->arch.amd.osvw.length = (osvw_length >= 3) ? 
(osvw_length) : 3 >>>> + vcpu->arch.amd.osvw.status = osvw_status& ~(6ULL); >>>> + >>>> + /* >>>> + * By increasing VCPU''s osvw.length to 3 we are telling the guest that >>>> + * all osvw.status bits inside that length, including bit 0 (which is >>>> + * reserved for erratum 298), are valid. However, if host processor''s >>>> + * osvw_len is 0 then osvw_status[0] carries no information. We need to >>>> + * be conservative here and therefore we tell the guest that erratum >> 298 >>>> + * is present (because we really don''t know). >>>> + */ >>>> + if (osvw_length == 0&& boot_cpu_data.x86 == 0x10) >>> >>> Why do you check the family here? If osvw_length can only ever be >>> zero on Fam10, then the first check is sufficient. If osvw_length can >>> be zero on other than Fam10 (no matter whether we bail early older >>> CPUs), then the check is actually wrong. >> >> 10h is the only family affected by this erratum. > > What is "this erratum" here? My comment was to suggest that either > of the two conditions can likely be eliminated for being redundant.("This erratum" is erratum 298, which is bit 0) If osvw_length!=0 then we don''t need to do anything since bit 0 is already valid. If osvw_length==0 then -- since we just made the guest''s version of this register 3 and thus turned bit 0 from being invalid to valid -- we need to do something about bit 0. But the bit can only be set on family 10h, thus both checks. Thanks. -boris
Jan Beulich
2012-Jan-19 08:10 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
>>> On 18.01.12 at 19:26, Boris Ostrovsky <boris.ostrovsky@amd.com> wrote: > On 01/18/12 04:50, Jan Beulich wrote: >>>>> On 17.01.12 at 18:54, Boris Ostrovsky<boris.ostrovsky@amd.com> wrote: >>> I believe this (i.e. OSVW changes) is meaningful for PV. Taking erratum >>> 400 as an example -- we don''t need a Linux PV guest reading an MSR >>> before going to idle (in amd_e400_idle()). >> >> It is bogus in the first place if a pv guest does so - after all, not doing >> stuff like this is the nature of being pv. > > It was actually a bad example --- the guest is not using amd_e400_idle() > and so there are no extra MSR accesses. > > However, without this change OSVW bit will not show up in the guest''s > CPUID and I think guests should be able to see it. One could argue > whether or not we should mask off status bits for the two errata (400 > and 415) since they are not currently used; I''d mask them off just to be > consistent with HVM, it won''t affect anything.I continue to think otherwise - knowing of and dealing with this is supposed to be entirely hidden from PV guests, unless and until you can provide a counter example. Therefore I am likely to nak this part of future revisions of the patch (which Keir could certainly override), up to and including ripping out the PV part (and adjusting the rest accordingly) if I would go for committing it.>>>>> + if (osvw_length == 0&& boot_cpu_data.x86 == 0x10) >>>> >>>> Why do you check the family here? If osvw_length can only ever be >>>> zero on Fam10, then the first check is sufficient. If osvw_length can >>>> be zero on other than Fam10 (no matter whether we bail early older >>>> CPUs), then the check is actually wrong. >>> >>> 10h is the only family affected by this erratum. >> >> What is "this erratum" here? My comment was to suggest that either >> of the two conditions can likely be eliminated for being redundant. 
> > ("This erratum" is erratum 298, which is bit 0) > > If osvw_length!=0 then we don''t need to do anything since bit 0 is > already valid. > > If osvw_length==0 then -- since we just made the guest''s version of this > register 3 and thus turned bit 0 from being invalid to valid -- we need > to do something about bit 0. But the bit can only be set on family 10h, > thus both checks.Okay, got you. However, the whole thing will become superfluous anyway if you bail on family less than 0x10 right at the start of the function. Btw., one more comment on the change to init_amd(): You will likely need to distinguish BP and AP cases here - on the BP you want to plainly write the values read from the MSRs to the global variables, but on APs (with possibly different settings) you need to work towards a setting of the global variables that apply to all of the CPUs. This is not just for the (theoretical only?) hotplug case, but particularly the one of a soft-offlined CPU that had its microcode updated already in a way affecting the OSVW bits and is subsequently being brought back online. Which reminds you and me that the patch is missing integration with the microcode update loader (as that one may alter the OSVW bits iiuc). Jan
Keir Fraser
2012-Jan-19 08:46 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
On 19/01/2012 08:10, "Jan Beulich" <JBeulich@suse.com> wrote:>>>> On 18.01.12 at 19:26, Boris Ostrovsky <boris.ostrovsky@amd.com> wrote: >> On 01/18/12 04:50, Jan Beulich wrote: >>>>>> On 17.01.12 at 18:54, Boris Ostrovsky<boris.ostrovsky@amd.com> wrote: >>>> I believe this (i.e. OSVW changes) is meaningful for PV. Taking erratum >>>> 400 as an example -- we don''t need a Linux PV guest reading an MSR >>>> before going to idle (in amd_e400_idle()). >>> >>> It is bogus in the first place if a pv guest does so - after all, not doing >>> stuff like this is the nature of being pv. >> >> It was actually a bad example --- the guest is not using amd_e400_idle() >> and so there are no extra MSR accesses. >> >> However, without this change OSVW bit will not show up in the guest''s >> CPUID and I think guests should be able to see it. One could argue >> whether or not we should mask off status bits for the two errata (400 >> and 415) since they are not currently used; I''d mask them off just to be >> consistent with HVM, it won''t affect anything. > > I continue to think otherwise - knowing of and dealing with this is > supposed to be entirely hidden from PV guests, unless and until > you can provide a counter example. Therefore I am likely to nak > this part of future revisions of the patch (which Keir could > certainly override), up to and including ripping out the PV part (and > adjusting the rest accordingly) if I would go for committing it.Well, the general principle of exposing OSVW to PV guests doesn''t seem terrible. It''s just the current specific motivation for exposing to HVM guests does not apply to PV guests. Are *any* of the current OSVW defined bits at all useful or applicable to PV guests? -- Keir>>>>>> + if (osvw_length == 0&& boot_cpu_data.x86 == 0x10) >>>>> >>>>> Why do you check the family here? If osvw_length can only ever be >>>>> zero on Fam10, then the first check is sufficient. 
If osvw_length can >>>>> be zero on other than Fam10 (no matter whether we bail early older >>>>> CPUs), then the check is actually wrong. >>>> >>>> 10h is the only family affected by this erratum. >>> >>> What is "this erratum" here? My comment was to suggest that either >>> of the two conditions can likely be eliminated for being redundant. >> >> ("This erratum" is erratum 298, which is bit 0) >> >> If osvw_length!=0 then we don''t need to do anything since bit 0 is >> already valid. >> >> If osvw_length==0 then -- since we just made the guest''s version of this >> register 3 and thus turned bit 0 from being invalid to valid -- we need >> to do something about bit 0. But the bit can only be set on family 10h, >> thus both checks. > > Okay, got you. However, the whole thing will become superfluous > anyway if you bail on family less than 0x10 right at the start of the > function. > > Btw., one more comment on the change to init_amd(): You will likely > need to distinguish BP and AP cases here - on the BP you want to > plainly write the values read from the MSRs to the global variables, > but on APs (with possibly different settings) you need to work > towards a setting of the global variables that apply to all of the > CPUs. This is not just for the (theoretical only?) hotplug case, but > particularly the one of a soft-offlined CPU that had its microcode > updated already in a way affecting the OSVW bits and is > subsequently being brought back online. > > Which reminds you and me that the patch is missing integration with > the microcode update loader (as that one may alter the OSVW bits > iiuc). > > Jan > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel
Boris Ostrovsky
2012-Jan-19 15:22 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
On 01/19/12 03:10, Jan Beulich wrote:> > Btw., one more comment on the change to init_amd(): You will likely > need to distinguish BP and AP cases here - on the BP you want to > plainly write the values read from the MSRs to the global variables, > but on APs (with possibly different settings) you need to work > towards a setting of the global variables that apply to all of the > CPUs. This is not just for the (theoretical only?) hotplug case, but > particularly the one of a soft-offlined CPU that had its microcode > updated already in a way affecting the OSVW bits and is > subsequently being brought back online.Not sure I follow you. There is no difference in how BIOS/microcode sets OSVW bits on AP or BP, that''s based on silicon version (i.e. stepping) and not where that silicon is plugged in. If a core is offlined that will presumably happen after it has already gone through init_amd() and therefore global versions of the registers already accounted for that processor''s values. (The downside is that if the offlined processors has more bugs than the rest then everyone is considered as bad even though the bad processor is out of the action. But I think the positives of updating global variables after a cpu is offlined are not worth the complexity.)> > Which reminds you and me that the patch is missing integration with > the microcode update loader (as that one may alter the OSVW bits > iiuc).That''s a good point --- I should re-initialize the global variables after a patch is loaded. Thanks. -boris
Boris Ostrovsky
2012-Jan-19 15:57 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
On 01/19/12 03:46, Keir Fraser wrote:> On 19/01/2012 08:10, "Jan Beulich"<JBeulich@suse.com> wrote: > >>>>> On 18.01.12 at 19:26, Boris Ostrovsky<boris.ostrovsky@amd.com> wrote: >>> On 01/18/12 04:50, Jan Beulich wrote: >>>>>>> On 17.01.12 at 18:54, Boris Ostrovsky<boris.ostrovsky@amd.com> wrote: >>>>> I believe this (i.e. OSVW changes) is meaningful for PV. Taking erratum >>>>> 400 as an example -- we don''t need a Linux PV guest reading an MSR >>>>> before going to idle (in amd_e400_idle()). >>>> >>>> It is bogus in the first place if a pv guest does so - after all, not doing >>>> stuff like this is the nature of being pv. >>> >>> It was actually a bad example --- the guest is not using amd_e400_idle() >>> and so there are no extra MSR accesses. >>> >>> However, without this change OSVW bit will not show up in the guest''s >>> CPUID and I think guests should be able to see it. One could argue >>> whether or not we should mask off status bits for the two errata (400 >>> and 415) since they are not currently used; I''d mask them off just to be >>> consistent with HVM, it won''t affect anything. >> >> I continue to think otherwise - knowing of and dealing with this is >> supposed to be entirely hidden from PV guests, unless and until >> you can provide a counter example. Therefore I am likely to nak >> this part of future revisions of the patch (which Keir could >> certainly override), up to and including ripping out the PV part (and >> adjusting the rest accordingly) if I would go for committing it. > > Well, the general principle of exposing OSVW to PV guests doesn''t seem > terrible. It''s just the current specific motivation for exposing to HVM > guests does not apply to PV guests. > > Are *any* of the current OSVW defined bits at all useful or applicable to PV > guests? >(Wrong "Reply" button) Probably not with current bits, at least for Linux PV guests. -boris
Jan Beulich
2012-Jan-19 16:29 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
>>> On 19.01.12 at 16:22, Boris Ostrovsky <boris.ostrovsky@amd.com> wrote: > On 01/19/12 03:10, Jan Beulich wrote: > >> >> Btw., one more comment on the change to init_amd(): You will likely >> need to distinguish BP and AP cases here - on the BP you want to >> plainly write the values read from the MSRs to the global variables, >> but on APs (with possibly different settings) you need to work >> towards a setting of the global variables that apply to all of the >> CPUs. This is not just for the (theoretical only?) hotplug case, but >> particularly the one of a soft-offlined CPU that had its microcode >> updated already in a way affecting the OSVW bits and is >> subsequently being brought back online. > > > Not sure I follow you. There is no difference in how BIOS/microcode sets > OSVW bits on AP or BP, that''s based on silicon version (i.e. stepping) > and not where that silicon is plugged in.But mixed stepping systems could have values in an AP that are different from what the BP (and earlier APs) had.> If a core is offlined that will presumably happen after it has already > gone through init_amd() and therefore global versions of the registers > already accounted for that processor''s values. (The downside is that if > the offlined processors has more bugs than the rest then everyone is > considered as bad even though the bad processor is out of the action. > But I think the positives of updating global variables after a cpu is > offlined are not worth the complexity.)Yes, optimizing for this case is very unlikely to be necessary. Jan
Boris Ostrovsky
2012-Jan-30 17:05 UTC
[PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
# HG changeset patch # User Boris Ostrovsky <boris.ostrovsky@amd.com> # Date 1327942850 -3600 # Node ID 1361d9895cb17aa8fce1054120311285144b8399 # Parent e2722b24dc0962de37215320b05d1bb7c4c42864 x86/AMD: Add support for AMD''s OSVW feature in guests. In some cases guests should not provide workarounds for errata even when the physical processor is affected. For example, because of erratum 400 on family 10h processors a Linux guest will read an MSR (resulting in VMEXIT) before going to idle in order to avoid getting stuck in a non-C0 state. This is not necessary: HLT and IO instructions are intercepted and therefore there is no reason for erratum 400 workaround in the guest. This patch allows us to present a guest with certain errata as fixed, regardless of the state of actual hardware. Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com> Acked-by: Christoph Egger <Christoph.Egger@amd.com> diff -r e2722b24dc09 -r 1361d9895cb1 tools/libxc/xc_cpuid_x86.c --- a/tools/libxc/xc_cpuid_x86.c Thu Jan 26 17:43:31 2012 +0000 +++ b/tools/libxc/xc_cpuid_x86.c Mon Jan 30 18:00:50 2012 +0100 @@ -108,6 +108,7 @@ static void amd_xc_cpuid_policy( bitmaskof(X86_FEATURE_SSE4A) | bitmaskof(X86_FEATURE_MISALIGNSSE) | bitmaskof(X86_FEATURE_3DNOWPREFETCH) | + bitmaskof(X86_FEATURE_OSVW) | bitmaskof(X86_FEATURE_XOP) | bitmaskof(X86_FEATURE_FMA4) | bitmaskof(X86_FEATURE_TBM) | diff -r e2722b24dc09 -r 1361d9895cb1 xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c Thu Jan 26 17:43:31 2012 +0000 +++ b/xen/arch/x86/hvm/svm/svm.c Mon Jan 30 18:00:50 2012 +0100 @@ -83,6 +83,9 @@ static DEFINE_PER_CPU_READ_MOSTLY(void * static bool_t amd_erratum383_found __read_mostly; +/* OSVW bits */ +static uint64_t osvw_length, osvw_status; + void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len) { struct vcpu *curr = current; @@ -902,6 +905,60 @@ static void svm_do_resume(struct vcpu *v reset_stack_and_jump(svm_asm_do_resume); } +static void svm_guest_osvw_init(struct vcpu 
*vcpu) +{ + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) + return; + + /* + * Guests should see errata 400 and 415 as fixed (assuming that + * HLT and IO instructions are intercepted). + */ + vcpu->arch.hvm_svm.osvw.length = (osvw_length >= 3) ? osvw_length : 3; + vcpu->arch.hvm_svm.osvw.status = osvw_status & ~(6ULL); + + /* + * By increasing VCPU''s osvw.length to 3 we are telling the guest that + * all osvw.status bits inside that length, including bit 0 (which is + * reserved for erratum 298), are valid. However, if host processor''s + * osvw_len is 0 then osvw_status[0] carries no information. We need to + * be conservative here and therefore we tell the guest that erratum 298 + * is present (because we really don''t know). + */ + if ( osvw_length == 0 && boot_cpu_data.x86 == 0x10 ) + vcpu->arch.hvm_svm.osvw.status |= 1; +} + +void svm_host_osvw_reset() +{ + osvw_length = 64; /* One register (MSRC001_0141) worth of errata */ + osvw_status = 0; +} + +void svm_host_osvw_init() +{ + /* + * Get OSVW bits. If bits are not the same on different processors then + * choose the worst case (i.e. if erratum is present on one processor and + * not on another assume that the erratum is present everywhere). 
+ */ + if ( test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability) ) + { + uint64_t len, status; + + if ( rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) || + rdmsr_safe(MSR_AMD_OSVW_STATUS, status) ) + len = status = 0; + + if (len < osvw_length) + osvw_length = len; + + osvw_status |= status; + osvw_status &= (1ULL << osvw_length) - 1; + } else + osvw_length = osvw_status = 0; +} + static int svm_domain_initialise(struct domain *d) { return 0; @@ -930,6 +987,9 @@ static int svm_vcpu_initialise(struct vc } vpmu_initialise(v); + + svm_guest_osvw_init(v); + return 0; } @@ -1044,6 +1104,27 @@ static void svm_init_erratum_383(struct } } +static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, bool_t read) +{ + uint eax, ebx, ecx, edx; + + /* Guest OSVW support */ + hvm_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if ( !test_bit((X86_FEATURE_OSVW & 31), &ecx) ) + return -1; + + if ( read ) + { + if (msr == MSR_AMD_OSVW_ID_LENGTH) + *val = v->arch.hvm_svm.osvw.length; + else + *val = v->arch.hvm_svm.osvw.status; + } + /* Writes are ignored */ + + return 0; +} + static int svm_cpu_up(void) { uint64_t msr_content; @@ -1094,6 +1175,9 @@ static int svm_cpu_up(void) } #endif + /* Initialize OSVW bits to be used by guests */ + svm_host_osvw_init(); + return 0; } @@ -1104,6 +1188,8 @@ struct hvm_function_table * __init start if ( !test_bit(X86_FEATURE_SVM, &boot_cpu_data.x86_capability) ) return NULL; + svm_host_osvw_reset(); + if ( svm_cpu_up() ) { printk("SVM: failed to initialise.\n"); @@ -1388,6 +1474,13 @@ static int svm_msr_read_intercept(unsign vpmu_do_rdmsr(msr, msr_content); break; + case MSR_AMD_OSVW_ID_LENGTH: + case MSR_AMD_OSVW_STATUS: + ret = svm_handle_osvw(v, msr, msr_content, 1); + if ( ret < 0 ) + goto gpf; + break; + default: ret = nsvm_rdmsr(v, msr, msr_content); if ( ret < 0 ) @@ -1512,6 +1605,13 @@ static int svm_msr_write_intercept(unsig */ break; + case MSR_AMD_OSVW_ID_LENGTH: + case MSR_AMD_OSVW_STATUS: + ret = svm_handle_osvw(v, msr, 
&msr_content, 0); + if ( ret < 0 ) + goto gpf; + break; + default: ret = nsvm_wrmsr(v, msr, msr_content); if ( ret < 0 ) diff -r e2722b24dc09 -r 1361d9895cb1 xen/arch/x86/microcode.c --- a/xen/arch/x86/microcode.c Thu Jan 26 17:43:31 2012 +0000 +++ b/xen/arch/x86/microcode.c Mon Jan 30 18:00:50 2012 +0100 @@ -218,6 +218,16 @@ int microcode_update(XEN_GUEST_HANDLE(co info->error = 0; info->cpu = cpumask_first(&cpu_online_map); + if ( microcode_ops->start_update ) + { + ret = microcode_ops->start_update(); + if ( ret != 0 ) + { + xfree(info); + return ret; + } + } + return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info); } diff -r e2722b24dc09 -r 1361d9895cb1 xen/arch/x86/microcode_amd.c --- a/xen/arch/x86/microcode_amd.c Thu Jan 26 17:43:31 2012 +0000 +++ b/xen/arch/x86/microcode_amd.c Mon Jan 30 18:00:50 2012 +0100 @@ -25,6 +25,7 @@ #include <asm/msr.h> #include <asm/processor.h> #include <asm/microcode.h> +#include <asm/hvm/svm/svm.h> struct equiv_cpu_entry { uint32_t installed_cpu; @@ -287,7 +288,8 @@ static int cpu_request_microcode(int cpu { printk(KERN_ERR "microcode: error! Wrong " "microcode patch file magic\n"); - return -EINVAL; + error = -EINVAL; + goto out; } mc_amd = xmalloc(struct microcode_amd); @@ -295,7 +297,8 @@ static int cpu_request_microcode(int cpu { printk(KERN_ERR "microcode: error! " "Can not allocate memory for microcode patch\n"); - return -ENOMEM; + error = -ENOMEM; + goto out; } error = install_equiv_cpu_table(mc_amd, buf, &offset); @@ -303,7 +306,8 @@ static int cpu_request_microcode(int cpu { xfree(mc_amd); printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); - return -EINVAL; + error = -EINVAL; + goto out; } mc_old = uci->mc.mc_amd; @@ -337,13 +341,18 @@ static int cpu_request_microcode(int cpu /* On success keep the microcode patch for * re-apply on resume. 
*/ - if (error == 1) + if ( error == 1 ) { xfree(mc_old); - return 0; + error = 0; + } else + { + xfree(mc_amd); + uci->mc.mc_amd = mc_old; } - xfree(mc_amd); - uci->mc.mc_amd = mc_old; + + out: + svm_host_osvw_init(); return error; } @@ -395,11 +404,19 @@ err1: return -ENOMEM; } +static int start_update(void) +{ + svm_host_osvw_reset(); + + return 0; +} + static const struct microcode_ops microcode_amd_ops = { .microcode_resume_match = microcode_resume_match, .cpu_request_microcode = cpu_request_microcode, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode, + .start_update = start_update, }; static __init int microcode_init_amd(void) diff -r e2722b24dc09 -r 1361d9895cb1 xen/arch/x86/platform_hypercall.c --- a/xen/arch/x86/platform_hypercall.c Thu Jan 26 17:43:31 2012 +0000 +++ b/xen/arch/x86/platform_hypercall.c Mon Jan 30 18:00:50 2012 +0100 @@ -166,7 +166,17 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe break; guest_from_compat_handle(data, op->u.microcode.data); + + while ( !domctl_lock_acquire() ) + if ( hypercall_preempt_check() ) + { + ret = hypercall_create_continuation( + __HYPERVISOR_platform_op, "h", u_xenpf_op); + goto out; + } + ret = microcode_update(data, op->u.microcode.length); + domctl_lock_release(); } break; diff -r e2722b24dc09 -r 1361d9895cb1 xen/include/asm-x86/hvm/svm/svm.h --- a/xen/include/asm-x86/hvm/svm/svm.h Thu Jan 26 17:43:31 2012 +0000 +++ b/xen/include/asm-x86/hvm/svm/svm.h Mon Jan 30 18:00:50 2012 +0100 @@ -98,4 +98,7 @@ extern u32 svm_feature_flags; ~TSC_RATIO_RSVD_BITS ) #define vcpu_tsc_ratio(v) TSC_RATIO((v)->domain->arch.tsc_khz, cpu_khz) +extern void svm_host_osvw_reset(void); +extern void svm_host_osvw_init(void); + #endif /* __ASM_X86_HVM_SVM_H__ */ diff -r e2722b24dc09 -r 1361d9895cb1 xen/include/asm-x86/hvm/svm/vmcb.h --- a/xen/include/asm-x86/hvm/svm/vmcb.h Thu Jan 26 17:43:31 2012 +0000 +++ b/xen/include/asm-x86/hvm/svm/vmcb.h Mon Jan 30 18:00:50 2012 +0100 @@ -516,6 +516,12 @@ struct arch_svm_struct 
{ /* AMD lightweight profiling MSR */ uint64_t guest_lwp_cfg; + + /* OSVW MSRs */ + struct { + u64 length; + u64 status; + } osvw; }; struct vmcb_struct *alloc_vmcb(void); diff -r e2722b24dc09 -r 1361d9895cb1 xen/include/asm-x86/microcode.h --- a/xen/include/asm-x86/microcode.h Thu Jan 26 17:43:31 2012 +0000 +++ b/xen/include/asm-x86/microcode.h Mon Jan 30 18:00:50 2012 +0100 @@ -11,6 +11,7 @@ struct microcode_ops { int (*cpu_request_microcode)(int cpu, const void *buf, size_t size); int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); int (*apply_microcode)(int cpu); + int (*start_update)(void); }; struct cpu_signature {
Jan Beulich
2012-Jan-31 10:13 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
>>> On 30.01.12 at 18:05, Boris Ostrovsky <boris.ostrovsky@amd.com> wrote: > --- a/xen/arch/x86/microcode.c Thu Jan 26 17:43:31 2012 +0000 > +++ b/xen/arch/x86/microcode.c Mon Jan 30 18:00:50 2012 +0100 > @@ -218,6 +218,16 @@ int microcode_update(XEN_GUEST_HANDLE(co > info->error = 0; > info->cpu = cpumask_first(&cpu_online_map); > > + if ( microcode_ops->start_update ) > + { > + ret = microcode_ops->start_update(); > + if ( ret != 0 ) > + { > + xfree(info); > + return ret; > + }Indentation (no tabs).> + } > + > return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info); > } > > --- a/xen/arch/x86/microcode_amd.c Thu Jan 26 17:43:31 2012 +0000 > +++ b/xen/arch/x86/microcode_amd.c Mon Jan 30 18:00:50 2012 +0100 > @@ -337,13 +341,18 @@ static int cpu_request_microcode(int cpu > /* On success keep the microcode patch for > * re-apply on resume. > */ > - if (error == 1) > + if ( error == 1 ) > { > xfree(mc_old); > - return 0; > + error = 0; > + } else} else> + { > + xfree(mc_amd); > + uci->mc.mc_amd = mc_old; > } > - xfree(mc_amd); > - uci->mc.mc_amd = mc_old; > + > + out: > + svm_host_osvw_init(); > > return error; > } > --- a/xen/arch/x86/platform_hypercall.c Thu Jan 26 17:43:31 2012 +0000 > +++ b/xen/arch/x86/platform_hypercall.c Mon Jan 30 18:00:50 2012 +0100 > @@ -166,7 +166,17 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe > break; > > guest_from_compat_handle(data, op->u.microcode.data); > + > + while ( !domctl_lock_acquire() )It''s not clear to me what you''re trying to protect against here. I don''t think this prevents guests from making progress (and hence possibly seeing the intermediate [and wrong] OSVW state between svm_host_osvw_reset() and svm_host_osvw_init()). Jan> + if ( hypercall_preempt_check() ) > + { > + ret = hypercall_create_continuation( > + __HYPERVISOR_platform_op, "h", u_xenpf_op); > + goto out; > + } > + > ret = microcode_update(data, op->u.microcode.length); > + domctl_lock_release(); > } > break; >
Christoph Egger
2012-Jan-31 10:36 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
On 01/31/12 11:13, Jan Beulich wrote:>> --- a/xen/arch/x86/platform_hypercall.c Thu Jan 26 17:43:31 2012 +0000 >> +++ b/xen/arch/x86/platform_hypercall.c Mon Jan 30 18:00:50 2012 +0100 >> @@ -166,7 +166,17 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe >> break; >> >> guest_from_compat_handle(data, op->u.microcode.data); >> + >> + while ( !domctl_lock_acquire() ) > > It''s not clear to me what you''re trying to protect against here. I don''t > think this prevents guests from making progress (and hence possibly > seeing the intermediate [and wrong] OSVW state between > svm_host_osvw_reset() and svm_host_osvw_init()).The reason is because when we allocated VCPUs (via XEN_DOMCTL_max_vcpus and possibly other controls) we call svm_vcpu_initialise() -> svm_guest_osvw_init(). If we are in the middle of microcode update then osvw_length/osvw_status may be in an inconsistent state. Christoph> Jan > >> + if ( hypercall_preempt_check() ) >> + { >> + ret = hypercall_create_continuation( >> + __HYPERVISOR_platform_op, "h", u_xenpf_op); >> + goto out; >> + } >> + >> ret = microcode_update(data, op->u.microcode.length); >> + domctl_lock_release(); >> } >> break; >> > > >-- ---to satisfy European Law for business letters: Advanced Micro Devices GmbH Einsteinring 24, 85689 Dornach b. Muenchen Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632
Keir Fraser
2012-Jan-31 10:39 UTC
Re: [PATCH] x86/AMD: Add support for AMD's OSVW feature in guests
On 31/01/2012 10:36, "Christoph Egger" <Christoph.Egger@amd.com> wrote:> On 01/31/12 11:13, Jan Beulich wrote: >>> --- a/xen/arch/x86/platform_hypercall.c Thu Jan 26 17:43:31 2012 +0000 >>> +++ b/xen/arch/x86/platform_hypercall.c Mon Jan 30 18:00:50 2012 +0100 >>> @@ -166,7 +166,17 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe >>> break; >>> >>> guest_from_compat_handle(data, op->u.microcode.data); >>> + >>> + while ( !domctl_lock_acquire() ) >> >> It''s not clear to me what you''re trying to protect against here. I don''t >> think this prevents guests from making progress (and hence possibly >> seeing the intermediate [and wrong] OSVW state between >> svm_host_osvw_reset() and svm_host_osvw_init()). > > The reason is because when we allocated VCPUs (via XEN_DOMCTL_max_vcpus > and possibly other controls) we call svm_vcpu_initialise() -> > svm_guest_osvw_init(). If we are in the middle of microcode update then > osvw_length/osvw_status may be in an inconsistent state.Deserves a code comment. Even better, deserves its own synchronisation (e.g., your own specific lock for this purpose). It would be nice to avoid further intricate dependencies on the big domctl lock, so we can consider removing it in future. -- Keir> Christoph > >> Jan >> >>> + if ( hypercall_preempt_check() ) >>> + { >>> + ret = hypercall_create_continuation( >>> + __HYPERVISOR_platform_op, "h", u_xenpf_op); >>> + goto out; >>> + } >>> + >>> ret = microcode_update(data, op->u.microcode.length); >>> + domctl_lock_release(); >>> } >>> break; >>> >> >> >> >