Xen/MCE: adjust for future new vMCE model Recently we designed a new vMCE model, and would implement later. In order not to block live migration from current vMCE to future new vMCE, we do some middle work in this patch, basically it does minimal adjustment, including 1. remove MCG_CTL; 2. stick all 1''s to MCi_CTL; 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP; Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com> diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.c --- a/xen/arch/x86/cpu/mcheck/mce.c Wed Jun 27 09:36:43 2012 +0200 +++ b/xen/arch/x86/cpu/mcheck/mce.c Thu Jul 05 04:28:48 2012 +0800 @@ -843,8 +843,6 @@ mctelem_init(sizeof(struct mc_info)); - vmce_init(c); - /* Turn on MCE now */ set_in_cr4(X86_CR4_MCE); diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.h --- a/xen/arch/x86/cpu/mcheck/mce.h Wed Jun 27 09:36:43 2012 +0200 +++ b/xen/arch/x86/cpu/mcheck/mce.h Thu Jul 05 04:28:48 2012 +0800 @@ -170,8 +170,6 @@ int inject_vmce(struct domain *d); int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global); -extern int vmce_init(struct cpuinfo_x86 *c); - static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) { if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/vmce.c --- a/xen/arch/x86/cpu/mcheck/vmce.c Wed Jun 27 09:36:43 2012 +0200 +++ b/xen/arch/x86/cpu/mcheck/vmce.c Thu Jul 05 04:28:48 2012 +0800 @@ -19,36 +19,42 @@ #include "mce.h" #include "x86_mca.h" +/* + * Emulalte 2 banks for guest + * Bank0: reserved for ''bank0 quirk'' occur at some very old processors: + * 1). Intel cpu whose family-model valuse < 06-1A; + * 2). AMD K7 + * Bank1: used to transfer error info to guest + */ +#define GUEST_BANK_NUM 2 + #define dom_vmce(x) ((x)->arch.vmca_msrs) static uint64_t __read_mostly g_mcg_cap; -/* Real value in physical CTL MSR */ -static uint64_t __read_mostly h_mcg_ctl; -static uint64_t *__read_mostly h_mci_ctrl; - int vmce_init_msr(struct domain *d) { dom_vmce(d) = xmalloc(struct domain_mca_msrs); if ( !dom_vmce(d) ) return -ENOMEM; - dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks); + dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, GUEST_BANK_NUM); if ( !dom_vmce(d)->mci_ctl ) { xfree(dom_vmce(d)); return -ENOMEM; } memset(dom_vmce(d)->mci_ctl, ~0, - nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl)); + GUEST_BANK_NUM * sizeof(*dom_vmce(d)->mci_ctl)); dom_vmce(d)->mcg_status = 0x0; - dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0; dom_vmce(d)->nr_injection = 0; INIT_LIST_HEAD(&dom_vmce(d)->impact_header); spin_lock_init(&dom_vmce(d)->lock); + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; + return 0; } @@ -68,7 +74,7 @@ int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) { dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", @@ -93,9 +99,10 @@ switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { case MSR_IA32_MC0_CTL: - if ( bank < nr_mce_banks ) - *val = vmce->mci_ctl[bank] & - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); + if ( bank < GUEST_BANK_NUM ) { + BUG_ON( vmce->mci_ctl[bank] != ~0UL ); + *val = vmce->mci_ctl[bank]; + } mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", bank, *val); break; @@ -187,11 +194,9 @@ *val); break; case MSR_IA32_MCG_CTL: - /* Always 0 if no CTL support */ - if ( cur->arch.mcg_cap & MCG_CTL_P ) - *val = vmce->mcg_ctl & h_mcg_ctl; - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", - *val); + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); + ret = -1; break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; @@ -220,8 +225,10 @@ switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { case MSR_IA32_MC0_CTL: - if ( bank < nr_mce_banks ) - vmce->mci_ctl[bank] = val; + /* + * if guest crazy clear any bit of MCi_CTL, + * treat it as not implement and ignore write change it. + */ break; case MSR_IA32_MC0_STATUS: if ( entry && (entry->bank == bank) ) @@ -305,7 +312,9 @@ switch ( msr ) { case MSR_IA32_MCG_CTL: - vmce->mcg_ctl = val; + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); + ret = -1; break; case MSR_IA32_MCG_STATUS: vmce->mcg_status = val; @@ -520,52 +529,6 @@ } #endif -int vmce_init(struct cpuinfo_x86 *c) -{ - u64 value; - unsigned int i; - - if ( !h_mci_ctrl ) - { - h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks); - if (!h_mci_ctrl) - { - dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n"); - return -ENOMEM; - } - /* Don''t care banks before firstbank */ - memset(h_mci_ctrl, ~0, - min(firstbank, nr_mce_banks) * sizeof(*h_mci_ctrl)); - for (i = firstbank; i < nr_mce_banks; i++) - rdmsrl(MSR_IA32_MCx_CTL(i), h_mci_ctrl[i]); - } - - rdmsrl(MSR_IA32_MCG_CAP, value); - /* For Guest vMCE usage */ - g_mcg_cap = value & (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | MCG_SER_P); - if (value & MCG_CTL_P) - rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl); - - return 0; -} - -static int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d) -{ - int bank_nr; - - if ( !bank || !d || !h_mci_ctrl ) - return 1; - - /* Will MCE happen in host if If host mcg_ctl is 0? */ - if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl ) - return 1; - - bank_nr = bank->mc_bank; - if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] ) - return 1; - return 0; -} - static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct domain *d) { struct vcpu *v; @@ -619,14 +582,6 @@ if (no_vmce) return 0; - /* Guest has different MCE ctl value setting */ - if (mca_ctl_conflict(bank, d)) - { - dprintk(XENLOG_WARNING, - "No vmce, guest has different mca control setting\n"); - return 0; - } - return 1; } diff -r 4f92bdf3370c xen/include/asm-x86/mce.h --- a/xen/include/asm-x86/mce.h Wed Jun 27 09:36:43 2012 +0200 +++ b/xen/include/asm-x86/mce.h Thu Jul 05 04:28:48 2012 +0800 @@ -16,7 +16,6 @@ struct domain_mca_msrs { /* Guest should not change below values after DOM boot up */ - uint64_t mcg_ctl; uint64_t mcg_status; uint64_t *mci_ctl; uint16_t nr_injection; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Christoph Egger
2012-Jul-04 13:25 UTC
Re: [PATCH] Xen/MCE: adjust for future new vMCE model
On 07/04/12 15:08, Liu, Jinsong wrote:> Xen/MCE: adjust for future new vMCE model > > Recently we designed a new vMCE model, and would implement later. > > In order not to block live migration from current vMCE to future new vMCE, > we do some middle work in this patch, basically it does minimal adjustment, including > 1. remove MCG_CTL; > 2. stick all 1''s to MCi_CTL; > 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP; > > Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com> > > diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.c > --- a/xen/arch/x86/cpu/mcheck/mce.c Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/arch/x86/cpu/mcheck/mce.c Thu Jul 05 04:28:48 2012 +0800 > @@ -843,8 +843,6 @@ > > mctelem_init(sizeof(struct mc_info)); > > - vmce_init(c); > - > /* Turn on MCE now */ > set_in_cr4(X86_CR4_MCE); > > diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.h > --- a/xen/arch/x86/cpu/mcheck/mce.h Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/arch/x86/cpu/mcheck/mce.h Thu Jul 05 04:28:48 2012 +0800 > @@ -170,8 +170,6 @@ > int inject_vmce(struct domain *d); > int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global); > > -extern int vmce_init(struct cpuinfo_x86 *c); > - > static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) > { > if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && > diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/vmce.c > --- a/xen/arch/x86/cpu/mcheck/vmce.c Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/arch/x86/cpu/mcheck/vmce.c Thu Jul 05 04:28:48 2012 +0800 > @@ -19,36 +19,42 @@ > #include "mce.h" > #include "x86_mca.h" > > +/* > + * Emulalte 2 banks for guest > + * Bank0: reserved for ''bank0 quirk'' occur at some very old processors: > + * 1). Intel cpu whose family-model valuse < 06-1A; > + * 2). AMD K7 > + * Bank1: used to transfer error info to guest > + */ > +#define GUEST_BANK_NUM 2 > + > #define dom_vmce(x) ((x)->arch.vmca_msrs) > > static uint64_t __read_mostly g_mcg_cap; > > -/* Real value in physical CTL MSR */ > -static uint64_t __read_mostly h_mcg_ctl; > -static uint64_t *__read_mostly h_mci_ctrl; > - > int vmce_init_msr(struct domain *d) > { > dom_vmce(d) = xmalloc(struct domain_mca_msrs); > if ( !dom_vmce(d) ) > return -ENOMEM; > > - dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks); > + dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, GUEST_BANK_NUM); > if ( !dom_vmce(d)->mci_ctl ) > { > xfree(dom_vmce(d)); > return -ENOMEM; > } > memset(dom_vmce(d)->mci_ctl, ~0, > - nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl)); > + GUEST_BANK_NUM * sizeof(*dom_vmce(d)->mci_ctl)); > > dom_vmce(d)->mcg_status = 0x0; > - dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0; > dom_vmce(d)->nr_injection = 0; > > INIT_LIST_HEAD(&dom_vmce(d)->impact_header); > spin_lock_init(&dom_vmce(d)->lock); > > + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; > +Is MCG_TES_P and MCG_SER_P emulated independent if the host has them or not? For AMD this only works if the answer is yes. (I know in upstream this code path is used by Intel only but I have patches that brings this code path in use by both AMD and Intel.) Christoph> return 0; > } > > @@ -68,7 +74,7 @@ > > int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) > { > - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) > + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) > { > dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" > " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", > @@ -93,9 +99,10 @@ > switch ( msr & (MSR_IA32_MC0_CTL | 3) ) > { > case MSR_IA32_MC0_CTL: > - if ( bank < nr_mce_banks ) > - *val = vmce->mci_ctl[bank] & > - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); > + if ( bank < GUEST_BANK_NUM ) { > + BUG_ON( vmce->mci_ctl[bank] != ~0UL ); > + *val = vmce->mci_ctl[bank]; > + } > mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", > bank, *val); > break; > @@ -187,11 +194,9 @@ > *val); > break; > case MSR_IA32_MCG_CTL: > - /* Always 0 if no CTL support */ > - if ( cur->arch.mcg_cap & MCG_CTL_P ) > - *val = vmce->mcg_ctl & h_mcg_ctl; > - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", > - *val); > + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); > + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); > + ret = -1; > break; > default: > ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; > @@ -220,8 +225,10 @@ > switch ( msr & (MSR_IA32_MC0_CTL | 3) ) > { > case MSR_IA32_MC0_CTL: > - if ( bank < nr_mce_banks ) > - vmce->mci_ctl[bank] = val; > + /* > + * if guest crazy clear any bit of MCi_CTL, > + * treat it as not implement and ignore write change it. > + */ > break; > case MSR_IA32_MC0_STATUS: > if ( entry && (entry->bank == bank) ) > @@ -305,7 +312,9 @@ > switch ( msr ) > { > case MSR_IA32_MCG_CTL: > - vmce->mcg_ctl = val; > + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); > + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); > + ret = -1; > break; > case MSR_IA32_MCG_STATUS: > vmce->mcg_status = val; > @@ -520,52 +529,6 @@ > } > #endif > > -int vmce_init(struct cpuinfo_x86 *c) > -{ > - u64 value; > - unsigned int i; > - > - if ( !h_mci_ctrl ) > - { > - h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks); > - if (!h_mci_ctrl) > - { > - dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n"); > - return -ENOMEM; > - } > - /* Don''t care banks before firstbank */ > - memset(h_mci_ctrl, ~0, > - min(firstbank, nr_mce_banks) * sizeof(*h_mci_ctrl)); > - for (i = firstbank; i < nr_mce_banks; i++) > - rdmsrl(MSR_IA32_MCx_CTL(i), h_mci_ctrl[i]); > - } > - > - rdmsrl(MSR_IA32_MCG_CAP, value); > - /* For Guest vMCE usage */ > - g_mcg_cap = value & (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | MCG_SER_P); > - if (value & MCG_CTL_P) > - rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl); > - > - return 0; > -} > - > -static int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d) > -{ > - int bank_nr; > - > - if ( !bank || !d || !h_mci_ctrl ) > - return 1; > - > - /* Will MCE happen in host if If host mcg_ctl is 0? */ > - if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl ) > - return 1; > - > - bank_nr = bank->mc_bank; > - if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] ) > - return 1; > - return 0; > -} > - > static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct domain *d) > { > struct vcpu *v; > @@ -619,14 +582,6 @@ > if (no_vmce) > return 0; > > - /* Guest has different MCE ctl value setting */ > - if (mca_ctl_conflict(bank, d)) > - { > - dprintk(XENLOG_WARNING, > - "No vmce, guest has different mca control setting\n"); > - return 0; > - } > - > return 1; > } > > diff -r 4f92bdf3370c xen/include/asm-x86/mce.h > --- a/xen/include/asm-x86/mce.h Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/include/asm-x86/mce.h Thu Jul 05 04:28:48 2012 +0800 > @@ -16,7 +16,6 @@ > struct domain_mca_msrs > { > /* Guest should not change below values after DOM boot up */ > - uint64_t mcg_ctl; > uint64_t mcg_status; > uint64_t *mci_ctl; > uint16_t nr_injection;-- ---to satisfy European Law for business letters: Advanced Micro Devices GmbH Einsteinring 24, 85689 Dornach b. Muenchen Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632
On Wed, 2012-07-04 at 14:08 +0100, Liu, Jinsong wrote:> Xen/MCE: adjust for future new vMCE model > > Recently we designed a new vMCE model, and would implement later. > > In order not to block live migration from current vMCE to future new vMCE, > we do some middle work in this patch, basically it does minimal adjustment, including > 1. remove MCG_CTL; > 2. stick all 1''s to MCi_CTL; > 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP;Does this have any impact on the ability to migrate from 4.1 -> 4.2?
>>> On 04.07.12 at 15:08, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > --- a/xen/arch/x86/cpu/mcheck/vmce.c Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/arch/x86/cpu/mcheck/vmce.c Thu Jul 05 04:28:48 2012 +0800 > @@ -19,36 +19,42 @@ > #include "mce.h" > #include "x86_mca.h" > > +/* > + * Emulalte 2 banks for guest > + * Bank0: reserved for ''bank0 quirk'' occur at some very old processors: > + * 1). Intel cpu whose family-model valuse < 06-1A; > + * 2). AMD K7 > + * Bank1: used to transfer error info to guest > + */ > +#define GUEST_BANK_NUM 2 > + > #define dom_vmce(x) ((x)->arch.vmca_msrs) > > static uint64_t __read_mostly g_mcg_cap; > > -/* Real value in physical CTL MSR */ > -static uint64_t __read_mostly h_mcg_ctl; > -static uint64_t *__read_mostly h_mci_ctrl; > - > int vmce_init_msr(struct domain *d) > { > dom_vmce(d) = xmalloc(struct domain_mca_msrs); > if ( !dom_vmce(d) ) > return -ENOMEM; > > - dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks); > + dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, GUEST_BANK_NUM); > if ( !dom_vmce(d)->mci_ctl ) > { > xfree(dom_vmce(d)); > return -ENOMEM; > } > memset(dom_vmce(d)->mci_ctl, ~0, > - nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl)); > + GUEST_BANK_NUM * sizeof(*dom_vmce(d)->mci_ctl)); > > dom_vmce(d)->mcg_status = 0x0; > - dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0; > dom_vmce(d)->nr_injection = 0; > > INIT_LIST_HEAD(&dom_vmce(d)->impact_header); > spin_lock_init(&dom_vmce(d)->lock); > > + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; > + > return 0; > } > > @@ -68,7 +74,7 @@ > > int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) > { > - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) > + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT )Shouldn''t you also check bank count being GUEST_BANK_NUM?> { > dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" > " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", > @@ -93,9 +99,10 @@ > switch ( msr & (MSR_IA32_MC0_CTL | 3) ) > { > case MSR_IA32_MC0_CTL: > - if ( bank < nr_mce_banks ) > - *val = vmce->mci_ctl[bank] & > - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); > + if ( bank < GUEST_BANK_NUM ) {This being false is impossible afaict (provided guest''s MCG_CAP is never permitted to hold other than GUEST_BANK_NUM). However, if the intention is to support an eventual future need to grow that number, than an else clause would be needed setting *val to ~0. But read on...> + BUG_ON( vmce->mci_ctl[bank] != ~0UL ); > + *val = vmce->mci_ctl[bank];Why do you retain the (per-domain!) array if all slots are always expected to be ~0 anyway?> + } > mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", > bank, *val); > break; > @@ -187,11 +194,9 @@ > *val); > break; > case MSR_IA32_MCG_CTL: > - /* Always 0 if no CTL support */ > - if ( cur->arch.mcg_cap & MCG_CTL_P ) > - *val = vmce->mcg_ctl & h_mcg_ctl; > - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", > - *val); > + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); > + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); > + ret = -1; > break; > default: > ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; > @@ -220,8 +225,10 @@ > switch ( msr & (MSR_IA32_MC0_CTL | 3) ) > { > case MSR_IA32_MC0_CTL: > - if ( bank < nr_mce_banks ) > - vmce->mci_ctl[bank] = val; > + /* > + * if guest crazy clear any bit of MCi_CTL, > + * treat it as not implement and ignore write change it. > + */ > break; > case MSR_IA32_MC0_STATUS: > if ( entry && (entry->bank == bank) ) > @@ -305,7 +312,9 @@ > switch ( msr ) > { > case MSR_IA32_MCG_CTL: > - vmce->mcg_ctl = val; > + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); > + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); > + ret = -1; > break; > case MSR_IA32_MCG_STATUS: > vmce->mcg_status = val; > @@ -520,52 +529,6 @@ > } > #endif > > -int vmce_init(struct cpuinfo_x86 *c) > -{ > - u64 value; > - unsigned int i; > - > - if ( !h_mci_ctrl ) > - { > - h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks); > - if (!h_mci_ctrl) > - { > - dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n"); > - return -ENOMEM; > - } > - /* Don''t care banks before firstbank */ > - memset(h_mci_ctrl, ~0, > - min(firstbank, nr_mce_banks) * sizeof(*h_mci_ctrl)); > - for (i = firstbank; i < nr_mce_banks; i++) > - rdmsrl(MSR_IA32_MCx_CTL(i), h_mci_ctrl[i]); > - } > - > - rdmsrl(MSR_IA32_MCG_CAP, value); > - /* For Guest vMCE usage */ > - g_mcg_cap = value & (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | MCG_SER_P); > - if (value & MCG_CTL_P) > - rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl); > - > - return 0; > -} > - > -static int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d) > -{ > - int bank_nr; > - > - if ( !bank || !d || !h_mci_ctrl ) > - return 1; > - > - /* Will MCE happen in host if If host mcg_ctl is 0? */ > - if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl ) > - return 1; > - > - bank_nr = bank->mc_bank; > - if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] ) > - return 1; > - return 0; > -} > - > static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct domain *d) > { > struct vcpu *v; > @@ -619,14 +582,6 @@ > if (no_vmce) > return 0; > > - /* Guest has different MCE ctl value setting */ > - if (mca_ctl_conflict(bank, d)) > - { > - dprintk(XENLOG_WARNING, > - "No vmce, guest has different mca control setting\n"); > - return 0; > - } > - > return 1; > } >Also I seem to recall that there was going to be a need to save/restore MCi_CTL2, but there''s nothing being done in that direction. Jan
>>> On 04.07.12 at 15:37, Ian Campbell <Ian.Campbell@citrix.com> wrote: > On Wed, 2012-07-04 at 14:08 +0100, Liu, Jinsong wrote: >> Xen/MCE: adjust for future new vMCE model >> >> Recently we designed a new vMCE model, and would implement later. >> >> In order not to block live migration from current vMCE to future new vMCE, >> we do some middle work in this patch, basically it does minimal adjustment, > including >> 1. remove MCG_CTL; >> 2. stick all 1''s to MCi_CTL; >> 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP; > > Does this have any impact on the ability to migrate from 4.1 -> 4.2?That migration is broken anyway (as much as 4.1 -> 4.1 is when the hosts have different MCE capabilities). For a 4.1 -> 4.2 migration between equal hosts, I don''t think there''s any problem (as 4.1 doesn''t save any MCE related data). Jan
Christoph Egger wrote:> On 07/04/12 15:08, Liu, Jinsong wrote: > >> Xen/MCE: adjust for future new vMCE model >> >> Recently we designed a new vMCE model, and would implement later. >> >> In order not to block live migration from current vMCE to future new >> vMCE, >> we do some middle work in this patch, basically it does minimal >> adjustment, including >> 1. remove MCG_CTL; >> 2. stick all 1''s to MCi_CTL; >> 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP; >> >> Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com> >> >> diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.c >> --- a/xen/arch/x86/cpu/mcheck/mce.c Wed Jun 27 09:36:43 2012 +0200 >> +++ b/xen/arch/x86/cpu/mcheck/mce.c Thu Jul 05 04:28:48 2012 +0800 >> @@ -843,8 +843,6 @@ >> >> mctelem_init(sizeof(struct mc_info)); >> >> - vmce_init(c); >> - >> /* Turn on MCE now */ >> set_in_cr4(X86_CR4_MCE); >> >> diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.h >> --- a/xen/arch/x86/cpu/mcheck/mce.h Wed Jun 27 09:36:43 2012 +0200 >> +++ b/xen/arch/x86/cpu/mcheck/mce.h Thu Jul 05 04:28:48 2012 +0800 >> @@ -170,8 +170,6 @@ int inject_vmce(struct domain *d); >> int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, >> struct mcinfo_global *global); >> >> -extern int vmce_init(struct cpuinfo_x86 *c); >> - >> static inline int mce_vendor_bank_msr(const struct vcpu *v, >> uint32_t msr) { if ( boot_cpu_data.x86_vendor =>> X86_VENDOR_INTEL && >> diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/vmce.c >> --- a/xen/arch/x86/cpu/mcheck/vmce.c Wed Jun 27 09:36:43 2012 +0200 >> +++ b/xen/arch/x86/cpu/mcheck/vmce.c Thu Jul 05 04:28:48 2012 +0800 >> @@ -19,36 +19,42 @@ #include "mce.h" >> #include "x86_mca.h" >> >> +/* >> + * Emulalte 2 banks for guest >> + * Bank0: reserved for ''bank0 quirk'' occur at some very old >> processors: + * 1). Intel cpu whose family-model valuse < 06-1A; + >> * 2). AMD K7 + * Bank1: used to transfer error info to guest >> + */ >> +#define GUEST_BANK_NUM 2 >> + >> #define dom_vmce(x) ((x)->arch.vmca_msrs) >> >> static uint64_t __read_mostly g_mcg_cap; >> >> -/* Real value in physical CTL MSR */ >> -static uint64_t __read_mostly h_mcg_ctl; >> -static uint64_t *__read_mostly h_mci_ctrl; >> - >> int vmce_init_msr(struct domain *d) >> { >> dom_vmce(d) = xmalloc(struct domain_mca_msrs); if ( >> !dom_vmce(d) ) return -ENOMEM; >> >> - dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks); >> + dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, GUEST_BANK_NUM); >> if ( !dom_vmce(d)->mci_ctl ) >> { >> xfree(dom_vmce(d)); >> return -ENOMEM; >> } >> memset(dom_vmce(d)->mci_ctl, ~0, >> - nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl)); >> + GUEST_BANK_NUM * sizeof(*dom_vmce(d)->mci_ctl)); >> >> dom_vmce(d)->mcg_status = 0x0; >> - dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0; >> dom_vmce(d)->nr_injection = 0; >> >> INIT_LIST_HEAD(&dom_vmce(d)->impact_header); >> spin_lock_init(&dom_vmce(d)->lock); >> >> + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; >> + > > > Is MCG_TES_P and MCG_SER_P emulated independent if the host has them > or not? For AMD this only works if the answer is yes. > > (I know in upstream this code path is used by Intel only but I have > patches that brings this code path in use by both AMD and Intel.) > > ChristophI''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me where can I get AMD''s *latest* open doc (something like amd architecture programmer manual)? If AMD has these 2 bits, it''s safe to set them independent of host capability -- guest will just think it running on a platform w/ some events *possilbe* (though actually may never occur), hypervisor know what actually occur and has the flexibility to decide what it would like to inject to guest. This code is only used by Intel, and it''s only for not blocking future vMCE, so it just do minimal necessary update. Thanks, Jinsong> > >> return 0; >> } >> >> @@ -68,7 +74,7 @@ >> >> int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { >> - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) >> + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) >> { >> dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA >> capabilities" " %#" PRIx64 " for d%d:v%u >> (supported: %#Lx)\n", @@ -93,9 +99,10 @@ switch ( msr & >> (MSR_IA32_MC0_CTL | 3) ) { >> case MSR_IA32_MC0_CTL: >> - if ( bank < nr_mce_banks ) >> - *val = vmce->mci_ctl[bank] & >> - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); >> + if ( bank < GUEST_BANK_NUM ) { >> + BUG_ON( vmce->mci_ctl[bank] != ~0UL ); >> + *val = vmce->mci_ctl[bank]; >> + } >> mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", >> bank, *val); >> break; >> @@ -187,11 +194,9 @@ >> *val); >> break; >> case MSR_IA32_MCG_CTL: >> - /* Always 0 if no CTL support */ >> - if ( cur->arch.mcg_cap & MCG_CTL_P ) >> - *val = vmce->mcg_ctl & h_mcg_ctl; >> - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", >> - *val); >> + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); >> + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); + ret >> -1; break; >> default: >> ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, >> val) : 0; @@ -220,8 +225,10 @@ switch ( msr & (MSR_IA32_MC0_CTL >> | 3) ) { >> case MSR_IA32_MC0_CTL: >> - if ( bank < nr_mce_banks ) >> - vmce->mci_ctl[bank] = val; >> + /* >> + * if guest crazy clear any bit of MCi_CTL, >> + * treat it as not implement and ignore write change it. + >> */ break; >> case MSR_IA32_MC0_STATUS: >> if ( entry && (entry->bank == bank) ) >> @@ -305,7 +312,9 @@ >> switch ( msr ) >> { >> case MSR_IA32_MCG_CTL: >> - vmce->mcg_ctl = val; >> + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); >> + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); + ret >> -1; break; >> case MSR_IA32_MCG_STATUS: >> vmce->mcg_status = val; >> @@ -520,52 +529,6 @@ >> } >> #endif >> >> -int vmce_init(struct cpuinfo_x86 *c) >> -{ >> - u64 value; >> - unsigned int i; >> - >> - if ( !h_mci_ctrl ) >> - { >> - h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks); >> - if (!h_mci_ctrl) >> - { >> - dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n"); >> - return -ENOMEM; >> - } >> - /* Don''t care banks before firstbank */ >> - memset(h_mci_ctrl, ~0, >> - min(firstbank, nr_mce_banks) * sizeof(*h_mci_ctrl)); >> - for (i = firstbank; i < nr_mce_banks; i++) >> - rdmsrl(MSR_IA32_MCx_CTL(i), h_mci_ctrl[i]); >> - } >> - >> - rdmsrl(MSR_IA32_MCG_CAP, value); >> - /* For Guest vMCE usage */ >> - g_mcg_cap = value & (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | >> MCG_SER_P); >> - if (value & MCG_CTL_P) >> - rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl); >> - >> - return 0; >> -} >> - >> -static int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain >> *d) -{ >> - int bank_nr; >> - >> - if ( !bank || !d || !h_mci_ctrl ) >> - return 1; >> - >> - /* Will MCE happen in host if If host mcg_ctl is 0? */ >> - if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl ) >> - return 1; >> - >> - bank_nr = bank->mc_bank; >> - if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] ) >> - return 1; >> - return 0; >> -} >> - >> static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct >> domain *d) { struct vcpu *v; >> @@ -619,14 +582,6 @@ >> if (no_vmce) >> return 0; >> >> - /* Guest has different MCE ctl value setting */ >> - if (mca_ctl_conflict(bank, d)) >> - { >> - dprintk(XENLOG_WARNING, >> - "No vmce, guest has different mca control setting\n"); >> - return 0; >> - } >> - >> return 1; >> } >> >> diff -r 4f92bdf3370c xen/include/asm-x86/mce.h >> --- a/xen/include/asm-x86/mce.h Wed Jun 27 09:36:43 2012 +0200 >> +++ b/xen/include/asm-x86/mce.h Thu Jul 05 04:28:48 2012 +0800 @@ >> -16,7 +16,6 @@ struct domain_mca_msrs >> { >> /* Guest should not change below values after DOM boot up */ - >> uint64_t mcg_ctl; uint64_t mcg_status; >> uint64_t *mci_ctl; >> uint16_t nr_injection;
Jan Beulich wrote:>>>> On 04.07.12 at 15:08, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >> --- a/xen/arch/x86/cpu/mcheck/vmce.c Wed Jun 27 09:36:43 2012 +0200 >> +++ b/xen/arch/x86/cpu/mcheck/vmce.c Thu Jul 05 04:28:48 2012 +0800 >> @@ -19,36 +19,42 @@ #include "mce.h" >> #include "x86_mca.h" >> >> +/* >> + * Emulalte 2 banks for guest >> + * Bank0: reserved for ''bank0 quirk'' occur at some very old >> processors: + * 1). Intel cpu whose family-model valuse < 06-1A; + >> * 2). AMD K7 + * Bank1: used to transfer error info to guest >> + */ >> +#define GUEST_BANK_NUM 2 >> + >> #define dom_vmce(x) ((x)->arch.vmca_msrs) >> >> static uint64_t __read_mostly g_mcg_cap; >> >> -/* Real value in physical CTL MSR */ >> -static uint64_t __read_mostly h_mcg_ctl; >> -static uint64_t *__read_mostly h_mci_ctrl; >> - >> int vmce_init_msr(struct domain *d) >> { >> dom_vmce(d) = xmalloc(struct domain_mca_msrs); if ( >> !dom_vmce(d) ) return -ENOMEM; >> >> - dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks); >> + dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, GUEST_BANK_NUM); >> if ( !dom_vmce(d)->mci_ctl ) >> { >> xfree(dom_vmce(d)); >> return -ENOMEM; >> } >> memset(dom_vmce(d)->mci_ctl, ~0, >> - nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl)); >> + GUEST_BANK_NUM * sizeof(*dom_vmce(d)->mci_ctl)); >> >> dom_vmce(d)->mcg_status = 0x0; >> - dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0; >> dom_vmce(d)->nr_injection = 0; >> >> INIT_LIST_HEAD(&dom_vmce(d)->impact_header); >> spin_lock_init(&dom_vmce(d)->lock); >> >> + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; + >> return 0; >> } >> >> @@ -68,7 +74,7 @@ >> >> int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { >> - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) >> + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) > > Shouldn''t you also check bank count being GUEST_BANK_NUM? > >> { >> dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA >> capabilities" " %#" PRIx64 " for d%d:v%u >> (supported: %#Lx)\n", @@ -93,9 +99,10 @@ switch ( msr & >> (MSR_IA32_MC0_CTL | 3) ) { >> case MSR_IA32_MC0_CTL: >> - if ( bank < nr_mce_banks ) >> - *val = vmce->mci_ctl[bank] & >> - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); >> + if ( bank < GUEST_BANK_NUM ) { > > This being false is impossible afaict (provided guest''s MCG_CAP > is never permitted to hold other than GUEST_BANK_NUM). > > However, if the intention is to support an eventual future need > to grow that number, than an else clause would be needed > setting *val to ~0. But read on... > >> + BUG_ON( vmce->mci_ctl[bank] != ~0UL ); >> + *val = vmce->mci_ctl[bank]; > > Why do you retain the (per-domain!) array if all slots are always > expected to be ~0 anyway? > >> + } >> mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", >> bank, *val); >> break; >> @@ -187,11 +194,9 @@ >> *val); >> break; >> case MSR_IA32_MCG_CTL: >> - /* Always 0 if no CTL support */ >> - if ( cur->arch.mcg_cap & MCG_CTL_P ) >> - *val = vmce->mcg_ctl & h_mcg_ctl; >> - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", >> - *val); >> + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); >> + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); + ret >> -1; break; >> default: >> ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, >> val) : 0; @@ -220,8 +225,10 @@ switch ( msr & (MSR_IA32_MC0_CTL >> | 3) ) { >> case MSR_IA32_MC0_CTL: >> - if ( bank < nr_mce_banks ) >> - vmce->mci_ctl[bank] = val; >> + /* >> + * if guest crazy clear any bit of MCi_CTL, >> + * treat it as not implement and ignore write change it. + >> */ break; >> case MSR_IA32_MC0_STATUS: >> if ( entry && (entry->bank == bank) ) >> @@ -305,7 +312,9 @@ >> switch ( msr ) >> { >> case MSR_IA32_MCG_CTL: >> - vmce->mcg_ctl = val; >> + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); >> + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); + ret >> -1; break; >> case MSR_IA32_MCG_STATUS: >> vmce->mcg_status = val; >> @@ -520,52 +529,6 @@ >> } >> #endif >> >> -int vmce_init(struct cpuinfo_x86 *c) >> -{ >> - u64 value; >> - unsigned int i; >> - >> - if ( !h_mci_ctrl ) >> - { >> - h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks); >> - if (!h_mci_ctrl) >> - { >> - dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n"); >> - return -ENOMEM; >> - } >> - /* Don''t care banks before firstbank */ >> - memset(h_mci_ctrl, ~0, >> - min(firstbank, nr_mce_banks) * sizeof(*h_mci_ctrl)); >> - for (i = firstbank; i < nr_mce_banks; i++) >> - rdmsrl(MSR_IA32_MCx_CTL(i), h_mci_ctrl[i]); >> - } >> - >> - rdmsrl(MSR_IA32_MCG_CAP, value); >> - /* For Guest vMCE usage */ >> - g_mcg_cap = value & (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | >> MCG_SER_P); >> - if (value & MCG_CTL_P) >> - rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl); >> - >> - return 0; >> -} >> - >> -static int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain >> *d) -{ >> - int bank_nr; >> - >> - if ( !bank || !d || !h_mci_ctrl ) >> - return 1; >> - >> - /* Will MCE happen in host if If host mcg_ctl is 0? */ >> - if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl ) >> - return 1; >> - >> - bank_nr = bank->mc_bank; >> - if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] ) >> - return 1; >> - return 0; >> -} >> - >> static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct >> domain *d) { struct vcpu *v; >> @@ -619,14 +582,6 @@ >> if (no_vmce) >> return 0; >> >> - /* Guest has different MCE ctl value setting */ >> - if (mca_ctl_conflict(bank, d)) >> - { >> - dprintk(XENLOG_WARNING, >> - "No vmce, guest has different mca control setting\n"); >> - return 0; >> - } >> - >> return 1; >> } >> > > Also I seem to recall that there was going to be a need to > save/restore MCi_CTL2, but there''s nothing being done in that > direction. > > JanYes, but that would be done at new vMCE. Current vMCE didn''t enable MCG_CMCI_P, and MCi_CTL2 are not used, so we don''t save/restore MCi_CTL2 at this version. Thanks, Jinsong
Jan, I update according to your comments. ====================Xen/MCE: adjust for future new vMCE model Recently we designed a new vMCE model, and would implement later. In order not to block live migration from current vMCE to future new vMCE, we do some middle work in this patch, basically it does minimal adjustment, including 1. remove MCG_CTL; 2. stick all 1''s to MCi_CTL; 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP; Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com> diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.c --- a/xen/arch/x86/cpu/mcheck/mce.c Wed Jun 27 09:36:43 2012 +0200 +++ b/xen/arch/x86/cpu/mcheck/mce.c Thu Jul 05 19:21:16 2012 +0800 @@ -843,8 +843,6 @@ mctelem_init(sizeof(struct mc_info)); - vmce_init(c); - /* Turn on MCE now */ set_in_cr4(X86_CR4_MCE); diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.h --- a/xen/arch/x86/cpu/mcheck/mce.h Wed Jun 27 09:36:43 2012 +0200 +++ b/xen/arch/x86/cpu/mcheck/mce.h Thu Jul 05 19:21:16 2012 +0800 @@ -170,8 +170,6 @@ int inject_vmce(struct domain *d); int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global); -extern int vmce_init(struct cpuinfo_x86 *c); - static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) { if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/vmce.c --- a/xen/arch/x86/cpu/mcheck/vmce.c Wed Jun 27 09:36:43 2012 +0200 +++ b/xen/arch/x86/cpu/mcheck/vmce.c Thu Jul 05 19:21:16 2012 +0800 @@ -19,36 +19,42 @@ #include "mce.h" #include "x86_mca.h" +/* + * Emulalte 2 banks for guest + * Bank0: reserved for ''bank0 quirk'' occur at some very old processors: + * 1). Intel cpu whose family-model value < 06-1A; + * 2). AMD K7 + * Bank1: used to transfer error info to guest + */ +#define GUEST_BANK_NUM 2 + #define dom_vmce(x) ((x)->arch.vmca_msrs) static uint64_t __read_mostly g_mcg_cap; -/* Real value in physical CTL MSR */ -static uint64_t __read_mostly h_mcg_ctl; -static uint64_t *__read_mostly h_mci_ctrl; - int vmce_init_msr(struct domain *d) { dom_vmce(d) = xmalloc(struct domain_mca_msrs); if ( !dom_vmce(d) ) return -ENOMEM; - dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks); + dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, GUEST_BANK_NUM); if ( !dom_vmce(d)->mci_ctl ) { xfree(dom_vmce(d)); return -ENOMEM; } memset(dom_vmce(d)->mci_ctl, ~0, - nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl)); + GUEST_BANK_NUM * sizeof(*dom_vmce(d)->mci_ctl)); dom_vmce(d)->mcg_status = 0x0; - dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0; dom_vmce(d)->nr_injection = 0; INIT_LIST_HEAD(&dom_vmce(d)->impact_header); spin_lock_init(&dom_vmce(d)->lock); + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; + return 0; } @@ -68,7 +74,7 @@ int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) { dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", @@ -77,6 +83,12 @@ return -EPERM; } + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) + { + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); + return -EPERM; + } + v->arch.mcg_cap = caps; return 0; } @@ -93,9 +105,9 @@ switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { case MSR_IA32_MC0_CTL: - if ( bank < nr_mce_banks ) - *val = vmce->mci_ctl[bank] & - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); + BUG_ON( bank >= GUEST_BANK_NUM ); + BUG_ON( vmce->mci_ctl[bank] != ~0UL ); + *val = ~0UL; mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", bank, *val); break; @@ -187,11 +199,9 @@ *val); break; case MSR_IA32_MCG_CTL: - /* Always 0 if no CTL support */ - if ( cur->arch.mcg_cap & MCG_CTL_P ) - *val = vmce->mcg_ctl & h_mcg_ctl; - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", - *val); + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); + ret = -1; break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; @@ -220,8 +230,11 @@ switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { case MSR_IA32_MC0_CTL: - if ( bank < nr_mce_banks ) - vmce->mci_ctl[bank] = val; + BUG_ON( bank >= GUEST_BANK_NUM ); + /* + * if guest crazy clear any bit of MCi_CTL, + * treat it as not implement and ignore write change it. + */ break; case MSR_IA32_MC0_STATUS: if ( entry && (entry->bank == bank) ) @@ -305,7 +318,9 @@ switch ( msr ) { case MSR_IA32_MCG_CTL: - vmce->mcg_ctl = val; + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); + ret = -1; break; case MSR_IA32_MCG_STATUS: vmce->mcg_status = val; @@ -520,52 +535,6 @@ } #endif -int vmce_init(struct cpuinfo_x86 *c) -{ - u64 value; - unsigned int i; - - if ( !h_mci_ctrl ) - { - h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks); - if (!h_mci_ctrl) - { - dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n"); - return -ENOMEM; - } - /* Don''t care banks before firstbank */ - memset(h_mci_ctrl, ~0, - min(firstbank, nr_mce_banks) * sizeof(*h_mci_ctrl)); - for (i = firstbank; i < nr_mce_banks; i++) - rdmsrl(MSR_IA32_MCx_CTL(i), h_mci_ctrl[i]); - } - - rdmsrl(MSR_IA32_MCG_CAP, value); - /* For Guest vMCE usage */ - g_mcg_cap = value & (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | MCG_SER_P); - if (value & MCG_CTL_P) - rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl); - - return 0; -} - -static int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d) -{ - int bank_nr; - - if ( !bank || !d || !h_mci_ctrl ) - return 1; - - /* Will MCE happen in host if If host mcg_ctl is 0? */ - if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl ) - return 1; - - bank_nr = bank->mc_bank; - if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] ) - return 1; - return 0; -} - static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct domain *d) { struct vcpu *v; @@ -619,14 +588,6 @@ if (no_vmce) return 0; - /* Guest has different MCE ctl value setting */ - if (mca_ctl_conflict(bank, d)) - { - dprintk(XENLOG_WARNING, - "No vmce, guest has different mca control setting\n"); - return 0; - } - return 1; } diff -r 4f92bdf3370c xen/include/asm-x86/mce.h --- a/xen/include/asm-x86/mce.h Wed Jun 27 09:36:43 2012 +0200 +++ b/xen/include/asm-x86/mce.h Thu Jul 05 19:21:16 2012 +0800 @@ -16,7 +16,6 @@ struct domain_mca_msrs { /* Guest should not change below values after DOM boot up */ - uint64_t mcg_ctl; uint64_t mcg_status; uint64_t *mci_ctl; uint16_t nr_injection; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > Jan, I update according to your comments.Some of them you did address, but I see that the per-domain mci_ctl[] array is still present. What''s the point? Jan> ====================> Xen/MCE: adjust for future new vMCE model > > Recently we designed a new vMCE model, and would implement later. > > In order not to block live migration from current vMCE to future new vMCE, > we do some middle work in this patch, basically it does minimal adjustment, > including > 1. remove MCG_CTL; > 2. stick all 1''s to MCi_CTL; > 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP; > > Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com> > > diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.c > --- a/xen/arch/x86/cpu/mcheck/mce.c Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/arch/x86/cpu/mcheck/mce.c Thu Jul 05 19:21:16 2012 +0800 > @@ -843,8 +843,6 @@ > > mctelem_init(sizeof(struct mc_info)); > > - vmce_init(c); > - > /* Turn on MCE now */ > set_in_cr4(X86_CR4_MCE); > > diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/mce.h > --- a/xen/arch/x86/cpu/mcheck/mce.h Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/arch/x86/cpu/mcheck/mce.h Thu Jul 05 19:21:16 2012 +0800 > @@ -170,8 +170,6 @@ > int inject_vmce(struct domain *d); > int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct > mcinfo_global *global); > > -extern int vmce_init(struct cpuinfo_x86 *c); > - > static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) > { > if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && > diff -r 4f92bdf3370c xen/arch/x86/cpu/mcheck/vmce.c > --- a/xen/arch/x86/cpu/mcheck/vmce.c Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/arch/x86/cpu/mcheck/vmce.c Thu Jul 05 19:21:16 2012 +0800 > @@ -19,36 +19,42 @@ > #include "mce.h" > #include "x86_mca.h" > > +/* > + * Emulalte 2 banks for guest > + * Bank0: reserved for ''bank0 quirk'' occur at some very old processors: > + * 1). Intel cpu whose family-model value < 06-1A; > + * 2). AMD K7 > + * Bank1: used to transfer error info to guest > + */ > +#define GUEST_BANK_NUM 2 > + > #define dom_vmce(x) ((x)->arch.vmca_msrs) > > static uint64_t __read_mostly g_mcg_cap; > > -/* Real value in physical CTL MSR */ > -static uint64_t __read_mostly h_mcg_ctl; > -static uint64_t *__read_mostly h_mci_ctrl; > - > int vmce_init_msr(struct domain *d) > { > dom_vmce(d) = xmalloc(struct domain_mca_msrs); > if ( !dom_vmce(d) ) > return -ENOMEM; > > - dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks); > + dom_vmce(d)->mci_ctl = xmalloc_array(uint64_t, GUEST_BANK_NUM); > if ( !dom_vmce(d)->mci_ctl ) > { > xfree(dom_vmce(d)); > return -ENOMEM; > } > memset(dom_vmce(d)->mci_ctl, ~0, > - nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl)); > + GUEST_BANK_NUM * sizeof(*dom_vmce(d)->mci_ctl)); > > dom_vmce(d)->mcg_status = 0x0; > - dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0; > dom_vmce(d)->nr_injection = 0; > > INIT_LIST_HEAD(&dom_vmce(d)->impact_header); > spin_lock_init(&dom_vmce(d)->lock); > > + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; > + > return 0; > } > > @@ -68,7 +74,7 @@ > > int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) > { > - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) > + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) > { > dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" > " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", > @@ -77,6 +83,12 @@ > return -EPERM; > } > > + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) > + { > + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); > + return -EPERM; > + } > + > v->arch.mcg_cap = caps; > return 0; > } > @@ -93,9 +105,9 @@ > switch ( msr & (MSR_IA32_MC0_CTL | 3) ) > { > case MSR_IA32_MC0_CTL: > - if ( bank < nr_mce_banks ) > - *val = vmce->mci_ctl[bank] & > - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); > + BUG_ON( bank >= GUEST_BANK_NUM ); > + BUG_ON( vmce->mci_ctl[bank] != ~0UL ); > + *val = ~0UL; > mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", > bank, *val); > break; > @@ -187,11 +199,9 @@ > *val); > break; > case MSR_IA32_MCG_CTL: > - /* Always 0 if no CTL support */ > - if ( cur->arch.mcg_cap & MCG_CTL_P ) > - *val = vmce->mcg_ctl & h_mcg_ctl; > - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", > - *val); > + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); > + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); > + ret = -1; > break; > default: > ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; > @@ -220,8 +230,11 @@ > switch ( msr & (MSR_IA32_MC0_CTL | 3) ) > { > case MSR_IA32_MC0_CTL: > - if ( bank < nr_mce_banks ) > - vmce->mci_ctl[bank] = val; > + BUG_ON( bank >= GUEST_BANK_NUM ); > + /* > + * if guest crazy clear any bit of MCi_CTL, > + * treat it as not implement and ignore write change it. > + */ > break; > case MSR_IA32_MC0_STATUS: > if ( entry && (entry->bank == bank) ) > @@ -305,7 +318,9 @@ > switch ( msr ) > { > case MSR_IA32_MCG_CTL: > - vmce->mcg_ctl = val; > + BUG_ON( cur->arch.mcg_cap & MCG_CTL_P ); > + mce_printk(MCE_QUIET, "MCE: no MCG_CTL\n"); > + ret = -1; > break; > case MSR_IA32_MCG_STATUS: > vmce->mcg_status = val; > @@ -520,52 +535,6 @@ > } > #endif > > -int vmce_init(struct cpuinfo_x86 *c) > -{ > - u64 value; > - unsigned int i; > - > - if ( !h_mci_ctrl ) > - { > - h_mci_ctrl = xmalloc_array(uint64_t, nr_mce_banks); > - if (!h_mci_ctrl) > - { > - dprintk(XENLOG_INFO, "Failed to alloc h_mci_ctrl\n"); > - return -ENOMEM; > - } > - /* Don''t care banks before firstbank */ > - memset(h_mci_ctrl, ~0, > - min(firstbank, nr_mce_banks) * sizeof(*h_mci_ctrl)); > - for (i = firstbank; i < nr_mce_banks; i++) > - rdmsrl(MSR_IA32_MCx_CTL(i), h_mci_ctrl[i]); > - } > - > - rdmsrl(MSR_IA32_MCG_CAP, value); > - /* For Guest vMCE usage */ > - g_mcg_cap = value & (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | MCG_SER_P); > - if (value & MCG_CTL_P) > - rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl); > - > - return 0; > -} > - > -static int mca_ctl_conflict(struct mcinfo_bank *bank, struct domain *d) > -{ > - int bank_nr; > - > - if ( !bank || !d || !h_mci_ctrl ) > - return 1; > - > - /* Will MCE happen in host if If host mcg_ctl is 0? */ > - if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl ) > - return 1; > - > - bank_nr = bank->mc_bank; > - if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] ) > - return 1; > - return 0; > -} > - > static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct domain *d) > { > struct vcpu *v; > @@ -619,14 +588,6 @@ > if (no_vmce) > return 0; > > - /* Guest has different MCE ctl value setting */ > - if (mca_ctl_conflict(bank, d)) > - { > - dprintk(XENLOG_WARNING, > - "No vmce, guest has different mca control setting\n"); > - return 0; > - } > - > return 1; > } > > diff -r 4f92bdf3370c xen/include/asm-x86/mce.h > --- a/xen/include/asm-x86/mce.h Wed Jun 27 09:36:43 2012 +0200 > +++ b/xen/include/asm-x86/mce.h Thu Jul 05 19:21:16 2012 +0800 > @@ -16,7 +16,6 @@ > struct domain_mca_msrs > { > /* Guest should not change below values after DOM boot up */ > - uint64_t mcg_ctl; > uint64_t mcg_status; > uint64_t *mci_ctl; > uint16_t nr_injection;
>>> On 05.07.12 at 05:03, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > Jan Beulich wrote: >> Also I seem to recall that there was going to be a need to >> save/restore MCi_CTL2, but there''s nothing being done in that >> direction. > > Yes, but that would be done at new vMCE. > Current vMCE didn''t enable MCG_CMCI_P, and MCi_CTL2 are not used, so we > don''t save/restore MCi_CTL2 at this version.But _that_ is the major save/restore related issue to be addressed, at least if backward migration (4.2 -> 4.1) is also to be working reasonably. If such compatibility is entirely unneeded, then I agree that it likely doesn''t need any consideration at this point. Jan
On Thu, 2012-07-05 at 07:39 +0100, Jan Beulich wrote:> >>> On 05.07.12 at 05:03, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > > Jan Beulich wrote: > >> Also I seem to recall that there was going to be a need to > >> save/restore MCi_CTL2, but there''s nothing being done in that > >> direction. > > > > Yes, but that would be done at new vMCE. > > Current vMCE didn''t enable MCG_CMCI_P, and MCi_CTL2 are not used, so we > > don''t save/restore MCi_CTL2 at this version. > > But _that_ is the major save/restore related issue to be > addressed, at least if backward migration (4.2 -> 4.1) is also > to be working reasonably. If such compatibility is entirely > unneeded, then I agree that it likely doesn''t need any > consideration at this point.We don''t generally worry about N->N-1 migration, just N->N+1. Ian.
>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > @@ -68,7 +74,7 @@ > > int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) > { > - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) > + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) > { > dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" > " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", > @@ -77,6 +83,12 @@ > return -EPERM; > } > > + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) > + { > + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); > + return -EPERM; > + } > + > v->arch.mcg_cap = caps; > return 0; > }These two changes, as pointed out before, need some further thought and tweaking: As I said earlier, we are already shipping the code in its prior form, so outright rejecting MCG_CTL_P set and non-default bank counts is not a proper solution. Warning about them being in an unsupported state is certainly acceptable. And I think the guest visible MCG_CAP value also shouldn''t change across migration, so tolerating (but ignoring) a higher bank count seems necessary. Not sure what to do when the bank count is lower (0 or 1) - for 1, all communication to the guest should probably go through bank 0, while 0 should probably disable vMCE for that vCPU. Further I just noticed that you don''t touch fill_vmsr_data() at all (sending patches created with diff''s -p option or equivalent helps spotting where individual changes belong), yet that function uses the hardware bank number to fill the struct bank_entry. With the intended concept, the "bank" member of that structure can probably go away altogether. Jan
>>> On 04.07.12 at 18:08, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > Christoph Egger wrote: >> On 07/04/12 15:08, Liu, Jinsong wrote: >>> + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; >>> + >> >> >> Is MCG_TES_P and MCG_SER_P emulated independent if the host has them >> or not? For AMD this only works if the answer is yes. >> >> (I know in upstream this code path is used by Intel only but I have >> patches that brings this code path in use by both AMD and Intel.) >> >> Christoph > > I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me where can > I get AMD''s *latest* open doc (something like amd architecture programmer > manual)? > > If AMD has these 2 bits, it''s safe to set them independent of host > capability -- guest will just think it running on a platform w/ some events > *possilbe* (though actually may never occur), hypervisor know what actually > occur and has the flexibility to decide what it would like to inject to > guest.According to the latest doc I have access to, neither of these two bits are known there (nor are any other of the bits beyond 8). But as we don''t want to surface model specific behavior to guests, having these bits set seems the right thing in any case. The MCi_STATUS values reported to guests just may need some additional massaging. All this is of course assuming that AMD won''t assign these bits _different_ meanings in the future, but as that would be pretty counter productive (requiring more vendor specific code in all OSes) that seems pretty unlikely. Christoph? Jan
On Thu, 2012-07-05 at 07:44 +0100, Ian Campbell wrote:> On Thu, 2012-07-05 at 07:39 +0100, Jan Beulich wrote: > > >>> On 05.07.12 at 05:03, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > > > Jan Beulich wrote: > > >> Also I seem to recall that there was going to be a need to > > >> save/restore MCi_CTL2, but there''s nothing being done in that > > >> direction. > > > > > > Yes, but that would be done at new vMCE. > > > Current vMCE didn''t enable MCG_CMCI_P, and MCi_CTL2 are not used, so we > > > don''t save/restore MCi_CTL2 at this version. > > > > But _that_ is the major save/restore related issue to be > > addressed, at least if backward migration (4.2 -> 4.1) is also > > to be working reasonably. If such compatibility is entirely > > unneeded, then I agree that it likely doesn''t need any > > consideration at this point. > > We don''t generally worry about N->N-1 migration, just N->N+1.If there''s nothing to save in 4.2 (and therefore to restore in 4.3) what is it we are giving a freeze exception to again? If there''s no save/restore can you even migrate a guest with vMCE enabled at all? (these aren''t rhetorical questions, I have absolutely no idea how this stuff works ;-)) Ian.
>>> On 05.07.12 at 09:02, Ian Campbell <Ian.Campbell@citrix.com> wrote: > On Thu, 2012-07-05 at 07:44 +0100, Ian Campbell wrote: >> On Thu, 2012-07-05 at 07:39 +0100, Jan Beulich wrote: >> > >>> On 05.07.12 at 05:03, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >> > > Jan Beulich wrote: >> > >> Also I seem to recall that there was going to be a need to >> > >> save/restore MCi_CTL2, but there''s nothing being done in that >> > >> direction. >> > > >> > > Yes, but that would be done at new vMCE. >> > > Current vMCE didn''t enable MCG_CMCI_P, and MCi_CTL2 are not used, so we >> > > don''t save/restore MCi_CTL2 at this version. >> > >> > But _that_ is the major save/restore related issue to be >> > addressed, at least if backward migration (4.2 -> 4.1) is also >> > to be working reasonably. If such compatibility is entirely >> > unneeded, then I agree that it likely doesn''t need any >> > consideration at this point. >> >> We don''t generally worry about N->N-1 migration, just N->N+1. > > If there''s nothing to save in 4.2 (and therefore to restore in 4.3) what > is it we are giving a freeze exception to again? > > If there''s no save/restore can you even migrate a guest with vMCE > enabled at all?We''re currently saving MCG_CAP, with the bank count in it reflecting the host bank count, and with other bits in there also matching the host''s. That''s what we''d like to simplify, _but_ as said we''re already shipping the current interface, so backward compatibility will be needed anyway. The main issue really was with MCi_CTL, which if not saved by 4.2 wouldn''t be restorable in 4.3. Now that we decided that they will get fixed values anyway, simply enforcing these fixed values (to not surprise the guest) would seem sufficient for 4.2. MCi_CTL2 are different in that afaiu setting them to 0 for a guest with no saved values (i.e. 4.2, where their presence isn''t being advertised, since CMCI doesn''t get surfaced) would be correct. Jan
Jan Beulich wrote:>>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >> Jan, I update according to your comments. > > Some of them you did address, but I see that the per-domain > mci_ctl[] array is still present. What''s the point? > > Jan >I know what you meant now. I''m OK to remove per-domain mci_ctl[]. (In last patch I just want to change code as little as possible) Thanks, Jinsong
>>> On 05.07.12 at 09:28, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > Jan Beulich wrote: >>>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >>> Jan, I update according to your comments. >> >> Some of them you did address, but I see that the per-domain >> mci_ctl[] array is still present. What''s the point? > > I know what you meant now. I''m OK to remove per-domain mci_ctl[]. > (In last patch I just want to change code as little as possible)As you remove mcg_ctl already, doing the same for mci_ctl won''t increase the amount of changes significantly, yet leaving it in place would be confusing. Jan
Jan Beulich wrote:>>>> On 05.07.12 at 09:02, Ian Campbell <Ian.Campbell@citrix.com> wrote: >> On Thu, 2012-07-05 at 07:44 +0100, Ian Campbell wrote: >>> On Thu, 2012-07-05 at 07:39 +0100, Jan Beulich wrote: >>>>>>> On 05.07.12 at 05:03, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>>> wrote: >>>>> Jan Beulich wrote: >>>>>> Also I seem to recall that there was going to be a need to >>>>>> save/restore MCi_CTL2, but there''s nothing being done in that >>>>>> direction. >>>>> >>>>> Yes, but that would be done at new vMCE. >>>>> Current vMCE didn''t enable MCG_CMCI_P, and MCi_CTL2 are not used, >>>>> so we don''t save/restore MCi_CTL2 at this version. >>>> >>>> But _that_ is the major save/restore related issue to be >>>> addressed, at least if backward migration (4.2 -> 4.1) is also >>>> to be working reasonably. If such compatibility is entirely >>>> unneeded, then I agree that it likely doesn''t need any >>>> consideration at this point. >>> >>> We don''t generally worry about N->N-1 migration, just N->N+1. >> >> If there''s nothing to save in 4.2 (and therefore to restore in 4.3) >> what is it we are giving a freeze exception to again? >> >> If there''s no save/restore can you even migrate a guest with vMCE >> enabled at all? > > We''re currently saving MCG_CAP, with the bank count in it > reflecting the host bank count, and with other bits in there also > matching the host''s. That''s what we''d like to simplify, _but_ as > said we''re already shipping the current interface, so backward > compatibility will be needed anyway. > > The main issue really was with MCi_CTL, which if not saved by > 4.2 wouldn''t be restorable in 4.3. Now that we decided that they > will get fixed values anyway, simply enforcing these fixed values > (to not surprise the guest) would seem sufficient for 4.2. MCi_CTL2 > are different in that afaiu setting them to 0 for a guest with no > saved values (i.e. 4.2, where their presence isn''t being advertised, > since CMCI doesn''t get surfaced) would be correct. > > JanSo Jan and Ian, do we really care migrate N -> N-1 ? If yes, I will save/restore MCi_CTL2, but that would involve more code change. Thanks, Jinsong
On 05/07/2012 08:50, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:>> >> We''re currently saving MCG_CAP, with the bank count in it >> reflecting the host bank count, and with other bits in there also >> matching the host''s. That''s what we''d like to simplify, _but_ as >> said we''re already shipping the current interface, so backward >> compatibility will be needed anyway. >> >> The main issue really was with MCi_CTL, which if not saved by >> 4.2 wouldn''t be restorable in 4.3. Now that we decided that they >> will get fixed values anyway, simply enforcing these fixed values >> (to not surprise the guest) would seem sufficient for 4.2. MCi_CTL2 >> are different in that afaiu setting them to 0 for a guest with no >> saved values (i.e. 4.2, where their presence isn''t being advertised, >> since CMCI doesn''t get surfaced) would be correct. >> >> Jan > > So Jan and Ian, do we really care migrate N -> N-1 ? > If yes, I will save/restore MCi_CTL2, but that would involve more code change.We categorically do not care about migration N -> N-1. N-1 -> N is the only thing that really gets much test (N-k -> N should work by design though), and it''s the most useful one, for rolling fairly-seamless upgrades. -- Keir
Christoph Egger
2012-Jul-05 09:18 UTC
Re: [PATCH] Xen/MCE: adjust for future new vMCE model
On 07/05/12 09:00, Jan Beulich wrote:>>>> On 04.07.12 at 18:08, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >> Christoph Egger wrote: >>> On 07/04/12 15:08, Liu, Jinsong wrote: >>>> + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; >>>> + >>> >>> >>> Is MCG_TES_P and MCG_SER_P emulated independent if the host has them >>> or not? For AMD this only works if the answer is yes. >>> >>> (I know in upstream this code path is used by Intel only but I have >>> patches that brings this code path in use by both AMD and Intel.) >>> >>> Christoph >> >> I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me where can >> I get AMD''s *latest* open doc (something like amd architecture programmer >> manual)? >> >> If AMD has these 2 bits, it''s safe to set them independent of host >> capability -- guest will just think it running on a platform w/ some events >> *possilbe* (though actually may never occur), hypervisor know what actually >> occur and has the flexibility to decide what it would like to inject to >> guest. > > According to the latest doc I have access to, neither of these two > bits are known there (nor are any other of the bits beyond 8). But > as we don''t want to surface model specific behavior to guests, > having these bits set seems the right thing in any case. The > MCi_STATUS values reported to guests just may need some > additional massaging. > > All this is of course assuming that AMD won''t assign these bits > _different_ meanings in the future, but as that would be pretty > counter productive (requiring more vendor specific code in all OSes) > that seems pretty unlikely. Christoph?Let me ask back internally. Christoph -- ---to satisfy European Law for business letters: Advanced Micro Devices GmbH Einsteinring 24, 85689 Dornach b. Muenchen Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632
Jan Beulich wrote:>>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> >>>> wrote: @@ -68,7 +74,7 @@ >> >> int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { >> - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) >> + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) >> { >> dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA >> capabilities" " %#" PRIx64 " for d%d:v%u >> (supported: %#Lx)\n", @@ -77,6 +83,12 @@ return -EPERM; >> } >> >> + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) + { >> + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); + >> return -EPERM; + } >> + >> v->arch.mcg_cap = caps; >> return 0; >> } > > These two changes, as pointed out before, need some further > thought and tweaking: As I said earlier, we are already shipping > the code in its prior form, so outright rejecting MCG_CTL_P set > and non-default bank counts is not a proper solution. Warning > about them being in an unsupported state is certainly acceptable. > > And I think the guest visible MCG_CAP value also shouldn''t > change across migration, so tolerating (but ignoring) a higher > bank count seems necessary. Not sure what to do when the > bank count is lower (0 or 1) - for 1, all communication to the > guest should probably go through bank 0, while 0 should > probably disable vMCE for that vCPU. > > Further I just noticed that you don''t touch fill_vmsr_data() at > all (sending patches created with diff''s -p option or equivalent > helps spotting where individual changes belong), yet that > function uses the hardware bank number to fill the struct > bank_entry. With the intended concept, the "bank" member > of that structure can probably go away altogether. > > JanSeems things become a little bit chaos, let''s jump out from details, make agreement first about what''s the general purpose of this middle-work patch. ===========1. current xen vmce status 1.1) current xen vmce has 2 kinds of bugs: * functional bug: it actually doesn''t work correctly for guest; * migration bug: partly solved by c/s 24887; 1.2) c/s 24887 not in formal xen release version, it''s a temporary fix (but unfortunately has been backported to SELS11 sp2) ===========2. target of this middle-work patch 2.1) it''s not used to address functional bug 2.3) it does minimal work, just in order not to bring trouble/dirty to future new vMCE, so it would * remove MCG_CTL --> otherwise new vMCE have to add useless MCG_CTL_P and MCG_CTL, w/ potential model specific issue; * stick all 1''s to MCi_CTL --> to avoid semantic issue; * set MCG_CTL_P=0 and 2 banks at MCG_CAP --> otherwise dirty code at new vMCE, like bank number issue; =========== I think in this middle-work patch, we don''t need constrained by c/s 24887: 1. c/s 24887 not in formal xen release 2. the benefit of keep compatible w/ 24887 is minor: * currently all xen version have migration bug, 3.x -> 4.0 -> 4.1 -> 4.2 * keep compatible w/ 24887 only benefit one case --> migration from SLES11 sp2 to xen 4.2 (for both SLES11 sp2 and xen 4.2, vMCE actually doesn''t functionally work fine to guest) 3. the price/hurt to future vMCE is high (to keep compatible w/ 24887) Considering above, I prefer an outright cleanup in xen4.2, not constrained by c/s 24887 and not bring trouble/dirty to future vMCE. Thanks, Jinsong
Christoph Egger wrote:> On 07/05/12 09:00, Jan Beulich wrote: > >>>>> On 04.07.12 at 18:08, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>> wrote: Christoph Egger wrote: On 07/04/12 15:08, Liu, Jinsong >>>>> wrote: + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; + >>>> >>>> >>>> Is MCG_TES_P and MCG_SER_P emulated independent if the host has >>>> them or not? For AMD this only works if the answer is yes. >>>> >>>> (I know in upstream this code path is used by Intel only but I have >>>> patches that brings this code path in use by both AMD and Intel.) >>>> >>>> Christoph >>> >>> I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me >>> where can I get AMD''s *latest* open doc (something like amd >>> architecture programmer manual)? >>> >>> If AMD has these 2 bits, it''s safe to set them independent of host >>> capability -- guest will just think it running on a platform w/ >>> some events *possilbe* (though actually may never occur), >>> hypervisor know what actually occur and has the flexibility to >>> decide what it would like to inject to guest. >> >> According to the latest doc I have access to, neither of these two >> bits are known there (nor are any other of the bits beyond 8). But >> as we don''t want to surface model specific behavior to guests, >> having these bits set seems the right thing in any case. The >> MCi_STATUS values reported to guests just may need some >> additional massaging. >> >> All this is of course assuming that AMD won''t assign these bits >> _different_ meanings in the future, but as that would be pretty >> counter productive (requiring more vendor specific code in all OSes) >> that seems pretty unlikely. Christoph? > > Let me ask back internally. > > ChristophChristoph, where can I get more AMD mce details (of course open doc)? I read ''amd64 architecture programmer manual (v2).pdf'' (publication number 24593, Rev 3.18, May 2011), but seems it''s quite general. Per that document I cannot answer some of your questions. Are there more deatials in K7/K8/K10 docs (but I fail to find these docs)? Thanks, Jinsong
Christoph Egger
2012-Jul-05 09:53 UTC
Re: [PATCH] Xen/MCE: adjust for future new vMCE model
On 07/05/12 11:36, Liu, Jinsong wrote:> Christoph Egger wrote: >> On 07/05/12 09:00, Jan Beulich wrote: >> >>>>>> On 04.07.12 at 18:08, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>> wrote: Christoph Egger wrote: On 07/04/12 15:08, Liu, Jinsong >>>>>> wrote: + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; + >>>>> >>>>> >>>>> Is MCG_TES_P and MCG_SER_P emulated independent if the host has >>>>> them or not? For AMD this only works if the answer is yes. >>>>> >>>>> (I know in upstream this code path is used by Intel only but I have >>>>> patches that brings this code path in use by both AMD and Intel.) >>>>> >>>>> Christoph >>>> >>>> I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me >>>> where can I get AMD''s *latest* open doc (something like amd >>>> architecture programmer manual)? >>>> >>>> If AMD has these 2 bits, it''s safe to set them independent of host >>>> capability -- guest will just think it running on a platform w/ >>>> some events *possilbe* (though actually may never occur), >>>> hypervisor know what actually occur and has the flexibility to >>>> decide what it would like to inject to guest. >>> >>> According to the latest doc I have access to, neither of these two >>> bits are known there (nor are any other of the bits beyond 8). But >>> as we don''t want to surface model specific behavior to guests, >>> having these bits set seems the right thing in any case. The >>> MCi_STATUS values reported to guests just may need some >>> additional massaging. >>> >>> All this is of course assuming that AMD won''t assign these bits >>> _different_ meanings in the future, but as that would be pretty >>> counter productive (requiring more vendor specific code in all OSes) >>> that seems pretty unlikely. Christoph? >> >> Let me ask back internally. >> >> Christoph > > Christoph, where can I get more AMD mce details (of course open doc)?> I read ''amd64 architecture programmer manual (v2).pdf'' (publication > number 24593, Rev 3.18, May 2011), but seems it''s quite general. Per > that document I cannot answer some of your questions. Are there more > deatials in K7/K8/K10 docs (but I fail to find these docs)?Public documentation is available at http://developer.amd.com/documentation/guides/Pages/default.aspx The latest APM v2 you are asking is http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf CPU family specific information are in the BKDG (BIOS and Kernel Developer Guide). K8 documentation is no longer available. The only difference to Family10h is that Family10h has three extra MC4 MISC MSRs. Christoph -- ---to satisfy European Law for business letters: Advanced Micro Devices GmbH Einsteinring 24, 85689 Dornach b. Muenchen Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632
Christoph Egger
2012-Jul-05 09:58 UTC
Re: [PATCH] Xen/MCE: adjust for future new vMCE model
On 07/05/12 11:36, Liu, Jinsong wrote:> Christoph Egger wrote: >> On 07/05/12 09:00, Jan Beulich wrote: >> >>>>>> On 04.07.12 at 18:08, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>> wrote: Christoph Egger wrote: On 07/04/12 15:08, Liu, Jinsong >>>>>> wrote: + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; + >>>>> >>>>> >>>>> Is MCG_TES_P and MCG_SER_P emulated independent if the host has >>>>> them or not? For AMD this only works if the answer is yes. >>>>> >>>>> (I know in upstream this code path is used by Intel only but I have >>>>> patches that brings this code path in use by both AMD and Intel.) >>>>> >>>>> Christoph >>>> >>>> I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me >>>> where can I get AMD''s *latest* open doc (something like amd >>>> architecture programmer manual)? >>>> >>>> If AMD has these 2 bits, it''s safe to set them independent of host >>>> capability -- guest will just think it running on a platform w/ >>>> some events *possilbe* (though actually may never occur), >>>> hypervisor know what actually occur and has the flexibility to >>>> decide what it would like to inject to guest. >>> >>> According to the latest doc I have access to, neither of these two >>> bits are known there (nor are any other of the bits beyond 8). But >>> as we don''t want to surface model specific behavior to guests, >>> having these bits set seems the right thing in any case. The >>> MCi_STATUS values reported to guests just may need some >>> additional massaging. >>> >>> All this is of course assuming that AMD won''t assign these bits >>> _different_ meanings in the future, but as that would be pretty >>> counter productive (requiring more vendor specific code in all OSes) >>> that seems pretty unlikely. Christoph? >> >> Let me ask back internally. >> >> Christoph > > Christoph, where can I get more AMD mce details (of course open doc)?> I read ''amd64 architecture programmer manual (v2).pdf'' (publication > number 24593, Rev 3.18, May 2011), but seems it''s quite general. Per > that document I cannot answer some of your questions. Are there more > deatials in K7/K8/K10 docs (but I fail to find these docs)?Public documentation is available at http://developer.amd.com/documentation/guides/Pages/default.aspx The latest APM v2 you are asking is http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf CPU family specific information are in the BKDG (BIOS and Kernel Developer Guide). K8 documentation is no longer available. The only difference to Family10h is that Family10h has three exra MC4 MISC MSRs. Christoph -- ---to satisfy European Law for business letters: Advanced Micro Devices GmbH Einsteinring 24, 85689 Dornach b. Muenchen Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632
Christoph Egger wrote:> On 07/05/12 11:36, Liu, Jinsong wrote: > >> Christoph Egger wrote: >>> On 07/05/12 09:00, Jan Beulich wrote: >>> >>>>>>> On 04.07.12 at 18:08, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>>> wrote: Christoph Egger wrote: On 07/04/12 15:08, Liu, Jinsong >>>>>>> wrote: + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; >>>>>>> + >>>>>> >>>>>> >>>>>> Is MCG_TES_P and MCG_SER_P emulated independent if the host has >>>>>> them or not? For AMD this only works if the answer is yes. >>>>>> >>>>>> (I know in upstream this code path is used by Intel only but I >>>>>> have patches that brings this code path in use by both AMD and >>>>>> Intel.) >>>>>> >>>>>> Christoph >>>>> >>>>> I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me >>>>> where can I get AMD''s *latest* open doc (something like amd >>>>> architecture programmer manual)? >>>>> >>>>> If AMD has these 2 bits, it''s safe to set them independent of host >>>>> capability -- guest will just think it running on a platform w/ >>>>> some events *possilbe* (though actually may never occur), >>>>> hypervisor know what actually occur and has the flexibility to >>>>> decide what it would like to inject to guest. >>>> >>>> According to the latest doc I have access to, neither of these two >>>> bits are known there (nor are any other of the bits beyond 8). But >>>> as we don''t want to surface model specific behavior to guests, >>>> having these bits set seems the right thing in any case. The >>>> MCi_STATUS values reported to guests just may need some >>>> additional massaging. >>>> >>>> All this is of course assuming that AMD won''t assign these bits >>>> _different_ meanings in the future, but as that would be pretty >>>> counter productive (requiring more vendor specific code in all >>>> OSes) that seems pretty unlikely. Christoph? >>> >>> Let me ask back internally. >>> >>> Christoph >> >> Christoph, where can I get more AMD mce details (of course open doc)? > >> I read ''amd64 architecture programmer manual (v2).pdf'' (publication >> number 24593, Rev 3.18, May 2011), but seems it''s quite general. Per >> that document I cannot answer some of your questions. Are there more >> deatials in K7/K8/K10 docs (but I fail to find these docs)? > > > > > > Public documentation is available at > http://developer.amd.com/documentation/guides/Pages/default.aspx > > The latest APM v2 you are asking is > http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf > > CPU family specific information are in the BKDG > (BIOS and Kernel Developer Guide). > > K8 documentation is no longer available. The only difference to > Family10h is that Family10h has three exra MC4 MISC MSRs. > > ChristophThanks a lot! Jinsong
On Thu, 2012-07-05 at 10:20 +0100, Liu, Jinsong wrote:> > * currently all xen version have migration bug, 3.x -> 4.0 -> 4.1 -> > 4.2I hope you mean with vMCE specifically and not generally because AFAIK migration from 3.4 -> 4.0 -> 4.1 works and the intention should be for 4.1->4.2 to work fine too. If you know of bugs in this please let us know. Ian.
>>> On 05.07.12 at 11:20, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > Jan Beulich wrote: >>>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>> wrote: @@ -68,7 +74,7 @@ >>> >>> int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { >>> - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) >>> + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) >>> { >>> dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA >>> capabilities" " %#" PRIx64 " for d%d:v%u >>> (supported: %#Lx)\n", @@ -77,6 +83,12 @@ return -EPERM; >>> } >>> >>> + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) + { >>> + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); + >>> return -EPERM; + } >>> + >>> v->arch.mcg_cap = caps; >>> return 0; >>> } >> >> These two changes, as pointed out before, need some further >> thought and tweaking: As I said earlier, we are already shipping >> the code in its prior form, so outright rejecting MCG_CTL_P set >> and non-default bank counts is not a proper solution. Warning >> about them being in an unsupported state is certainly acceptable. >> >> And I think the guest visible MCG_CAP value also shouldn''t >> change across migration, so tolerating (but ignoring) a higher >> bank count seems necessary. Not sure what to do when the >> bank count is lower (0 or 1) - for 1, all communication to the >> guest should probably go through bank 0, while 0 should >> probably disable vMCE for that vCPU. >> >> Further I just noticed that you don''t touch fill_vmsr_data() at >> all (sending patches created with diff''s -p option or equivalent >> helps spotting where individual changes belong), yet that >> function uses the hardware bank number to fill the struct >> bank_entry. With the intended concept, the "bank" member >> of that structure can probably go away altogether. >> >> Jan > > > Seems things become a little bit chaos, let''s jump out from details, make > agreement first about what''s the general purpose of this middle-work patch. > > ===========> 1. current xen vmce status > 1.1) current xen vmce has 2 kinds of bugs: > * functional bug: it actually doesn''t work correctly for guest; > * migration bug: partly solved by c/s 24887; > 1.2) c/s 24887 not in formal xen release version, it''s a temporary fix > (but unfortunately has been backported to SELS11 sp2) > ===========> 2. target of this middle-work patch > 2.1) it''s not used to address functional bug > 2.3) it does minimal work, just in order not to bring trouble/dirty to > future new vMCE, so it would > * remove MCG_CTL --> otherwise new vMCE have to add useless MCG_CTL_P and > MCG_CTL, w/ potential model specific issue; > * stick all 1''s to MCi_CTL --> to avoid semantic issue; > * set MCG_CTL_P=0 and 2 banks at MCG_CAP --> otherwise dirty code at new > vMCE, like bank number issue; > ===========> > I think in this middle-work patch, we don''t need constrained by c/s 24887: > 1. c/s 24887 not in formal xen release > 2. the benefit of keep compatible w/ 24887 is minor: > * currently all xen version have migration bug, 3.x -> 4.0 -> 4.1 -> 4.2 > * keep compatible w/ 24887 only benefit one case --> migration from SLES11 > sp2 to xen 4.2 (for both SLES11 sp2 and xen 4.2, vMCE actually doesn''t > functionally work fine to guest) > 3. the price/hurt to future vMCE is high (to keep compatible w/ 24887)Why do you consider the price high? I think it would require pretty minor changes.> Considering above, I prefer an outright cleanup in xen4.2, not constrained > by c/s 24887 and not bring trouble/dirty to future vMCE.Whether or not is was appropriate for us to backport this change early is unclear, but given that back at that time I had already pointed out the problems and asked for work to be done to clean it up, and given that it has taken over four months to get going with this, I don''t think you would suggest the alternative of us having left the bug unfixed for this entire time period. So unless the price to pay is unreasonably high, I''d favor getting this taken care of rather than ignored. If I can find time to work on your last version of the patch towards the direction I have in mind tomorrow, I''ll do so; I''ll be gone for two weeks afterwards (and be mostly invisible for another one), so wouldn''t be able to look at this again presumably until the beginning of August. Jan
Ian Campbell wrote:> On Thu, 2012-07-05 at 10:20 +0100, Liu, Jinsong wrote: >> >> * currently all xen version have migration bug, 3.x -> 4.0 -> 4.1 >> -> >> 4.2 > > I hope you mean with vMCE specifically and not generally because AFAIK > migration from 3.4 -> 4.0 -> 4.1 works and the intention should be for > 4.1->4.2 to work fine too. > > If you know of bugs in this please let us know. > > Ian.Yes, it''s vMCE specific issue, and only under the case when migrate from platform A w/ bigger bank number to platform B w/ smaller bank number. Under this case, it would block migration from 3.x -> 4.0 -> 4.1 -> 4.2. Sorry not to speak it clear. Thanks, Jinsong
Jan Beulich wrote:>>>> On 05.07.12 at 11:20, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >> Jan Beulich wrote: >>>>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>> wrote: @@ -68,7 +74,7 @@ >>>> >>>> int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { >>>> - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) >>>> + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) >>>> { >>>> dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA >>>> capabilities" " %#" PRIx64 " for d%d:v%u >>>> (supported: %#Lx)\n", @@ -77,6 +83,12 @@ return -EPERM; >>>> } >>>> >>>> + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) + { >>>> + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); + >>>> return -EPERM; + } + >>>> v->arch.mcg_cap = caps; >>>> return 0; >>>> } >>> >>> These two changes, as pointed out before, need some further >>> thought and tweaking: As I said earlier, we are already shipping >>> the code in its prior form, so outright rejecting MCG_CTL_P set >>> and non-default bank counts is not a proper solution. Warning >>> about them being in an unsupported state is certainly acceptable. >>> >>> And I think the guest visible MCG_CAP value also shouldn''t >>> change across migration, so tolerating (but ignoring) a higher >>> bank count seems necessary. Not sure what to do when the >>> bank count is lower (0 or 1) - for 1, all communication to the >>> guest should probably go through bank 0, while 0 should >>> probably disable vMCE for that vCPU. >>> >>> Further I just noticed that you don''t touch fill_vmsr_data() at >>> all (sending patches created with diff''s -p option or equivalent >>> helps spotting where individual changes belong), yet that >>> function uses the hardware bank number to fill the struct >>> bank_entry. With the intended concept, the "bank" member >>> of that structure can probably go away altogether. >>> >>> Jan >> >> >> Seems things become a little bit chaos, let''s jump out from details, >> make agreement first about what''s the general purpose of this >> middle-work patch. >> >> ===========>> 1. current xen vmce status >> 1.1) current xen vmce has 2 kinds of bugs: >> * functional bug: it actually doesn''t work correctly for guest; >> * migration bug: partly solved by c/s 24887; >> 1.2) c/s 24887 not in formal xen release version, it''s a temporary >> fix (but unfortunately has been backported to SELS11 sp2) >> ============ >> 2. target of this middle-work patch >> 2.1) it''s not used to address functional bug >> 2.3) it does minimal work, just in order not to bring >> trouble/dirty to future new vMCE, so it would * remove MCG_CTL >> --> otherwise new vMCE have to add useless MCG_CTL_P and MCG_CTL, w/ >> potential model specific issue; >> * stick all 1''s to MCi_CTL --> to avoid semantic issue; >> * set MCG_CTL_P=0 and 2 banks at MCG_CAP --> otherwise dirty >> code at new vMCE, like bank number issue; ===========>> >> I think in this middle-work patch, we don''t need constrained by c/s >> 24887: >> 1. c/s 24887 not in formal xen release >> 2. the benefit of keep compatible w/ 24887 is minor: >> * currently all xen version have migration bug, 3.x -> 4.0 -> 4.1 >> -> 4.2 >> * keep compatible w/ 24887 only benefit one case --> migration >> from SLES11 sp2 to xen 4.2 (for both SLES11 sp2 and xen 4.2, vMCE >> actually doesn''t functionally work fine to guest) >> 3. the price/hurt to future vMCE is high (to keep compatible w/ >> 24887) > > Why do you consider the price high? I think it would require pretty > minor changes.No, what I meant is not coding price. I mean the price that have to add dirty and useless change to the new vMCE is high. Just curious why we cannot simply get rid of c/s 24887? after all it only benefit SLES11 sp2.> >> Considering above, I prefer an outright cleanup in xen4.2, not >> constrained by c/s 24887 and not bring trouble/dirty to future vMCE. > > Whether or not is was appropriate for us to backport this change > early is unclear, but given that back at that time I had already > pointed out the problems and asked for work to be done to clean > it up, and given that it has taken over four months to get going > with this, I don''t think you would suggest the alternative of us > having left the bug unfixed for this entire time period.Jan, I''m not challenge why you backport to SLES11 sp2. If there is anything wrong, I agree it''s my fault. Currently what I concern is if we can do an outright cleanup at xen4.2 so that future vMCE could be simple and clean.> > So unless the price to pay is unreasonably high, I''d favor getting > this taken care of rather than ignored.If so, why we need this middle-work patch? It could just keep current status at xen 4.2, then start ''dirty'' new vMCE implement at xen4.3 --> and the ''dirty'' inherit from c/s 24887 which from code point of view would be totally removed at new vMCE. Thanks, Jinsong> > If I can find time to work on your last version of the patch towards > the direction I have in mind tomorrow, I''ll do so; I''ll be gone for > two weeks afterwards (and be mostly invisible for another one), > so wouldn''t be able to look at this again presumably until the > beginning of August. > > Jan
>>> On 05.07.12 at 14:46, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > Jan Beulich wrote: >>>>> On 05.07.12 at 11:20, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >>> Jan Beulich wrote: >>>>>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>>> wrote: @@ -68,7 +74,7 @@ >>>>> >>>>> int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { >>>>> - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) >>>>> + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) >>>>> { >>>>> dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA >>>>> capabilities" " %#" PRIx64 " for d%d:v%u >>>>> (supported: %#Lx)\n", @@ -77,6 +83,12 @@ return -EPERM; >>>>> } >>>>> >>>>> + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) + { >>>>> + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); + >>>>> return -EPERM; + } + >>>>> v->arch.mcg_cap = caps; >>>>> return 0; >>>>> } >>>> >>>> These two changes, as pointed out before, need some further >>>> thought and tweaking: As I said earlier, we are already shipping >>>> the code in its prior form, so outright rejecting MCG_CTL_P set >>>> and non-default bank counts is not a proper solution. Warning >>>> about them being in an unsupported state is certainly acceptable. >>>> >>>> And I think the guest visible MCG_CAP value also shouldn''t >>>> change across migration, so tolerating (but ignoring) a higher >>>> bank count seems necessary. Not sure what to do when the >>>> bank count is lower (0 or 1) - for 1, all communication to the >>>> guest should probably go through bank 0, while 0 should >>>> probably disable vMCE for that vCPU. >>>> >>>> Further I just noticed that you don''t touch fill_vmsr_data() at >>>> all (sending patches created with diff''s -p option or equivalent >>>> helps spotting where individual changes belong), yet that >>>> function uses the hardware bank number to fill the struct >>>> bank_entry. With the intended concept, the "bank" member >>>> of that structure can probably go away altogether. >>>> >>>> Jan >>> >>> >>> Seems things become a little bit chaos, let''s jump out from details, >>> make agreement first about what''s the general purpose of this >>> middle-work patch. >>> >>> ===========>>> 1. current xen vmce status >>> 1.1) current xen vmce has 2 kinds of bugs: >>> * functional bug: it actually doesn''t work correctly for guest; >>> * migration bug: partly solved by c/s 24887; >>> 1.2) c/s 24887 not in formal xen release version, it''s a temporary >>> fix (but unfortunately has been backported to SELS11 sp2) >>> ============ >>> 2. target of this middle-work patch >>> 2.1) it''s not used to address functional bug >>> 2.3) it does minimal work, just in order not to bring >>> trouble/dirty to future new vMCE, so it would * remove MCG_CTL >>> --> otherwise new vMCE have to add useless MCG_CTL_P and MCG_CTL, w/ >>> potential model specific issue; >>> * stick all 1''s to MCi_CTL --> to avoid semantic issue; >>> * set MCG_CTL_P=0 and 2 banks at MCG_CAP --> otherwise dirty >>> code at new vMCE, like bank number issue; ===========>>> >>> I think in this middle-work patch, we don''t need constrained by c/s >>> 24887: >>> 1. c/s 24887 not in formal xen release >>> 2. the benefit of keep compatible w/ 24887 is minor: >>> * currently all xen version have migration bug, 3.x -> 4.0 -> 4.1 >>> -> 4.2 >>> * keep compatible w/ 24887 only benefit one case --> migration >>> from SLES11 sp2 to xen 4.2 (for both SLES11 sp2 and xen 4.2, vMCE >>> actually doesn''t functionally work fine to guest) >>> 3. the price/hurt to future vMCE is high (to keep compatible w/ >>> 24887) >> >> Why do you consider the price high? I think it would require pretty >> minor changes. > > No, what I meant is not coding price. I mean the price that have to add > dirty and useless change to the new vMCE is high. Just curious why we cannot > simply get rid of c/s 24887? after all it only benefit SLES11 sp2.This is what save MCG_CAP, and that this is necessary we agreed on iirc. So why would you want to drop that code all of the sudden?>>> Considering above, I prefer an outright cleanup in xen4.2, not >>> constrained by c/s 24887 and not bring trouble/dirty to future vMCE. >> >> Whether or not is was appropriate for us to backport this change >> early is unclear, but given that back at that time I had already >> pointed out the problems and asked for work to be done to clean >> it up, and given that it has taken over four months to get going >> with this, I don''t think you would suggest the alternative of us >> having left the bug unfixed for this entire time period. > > Jan, I''m not challenge why you backport to SLES11 sp2. If there is anything > wrong, I agree it''s my fault. Currently what I concern is if we can do an > outright cleanup at xen4.2 so that future vMCE could be simple and clean.But we can''t tell our customers that migration from, say, SP2 to SP3 won''t be possible.>> So unless the price to pay is unreasonably high, I''d favor getting >> this taken care of rather than ignored. > > If so, why we need this middle-work patch? It could just keep current status > at xen 4.2, then start ''dirty'' new vMCE implement at xen4.3 --> and the ''dirty'' > inherit from c/s 24887 which from code point of view would be totally removed > at new vMCE.No, what we now want is to complete said c/s. While at that time I thought we would need to save MCi_CTL, with the new concept we won''t need to. Instead, we need to enforce it to be all ones universally. The only compatibility thing is the need to deal with higher bank counts perceived to be available by guests, and the possibly previously seen set MCG_CTL_P bit. And yes, if this created a lot of ugly and otherwise unnecessary code, I''d agree not to care for this compatibility. But as I don''t expect this to be the case, I thought I''d still ask for it (since if we don''t do it in -unstable, we''ll have to carry a private patch in SLE to do so, and presumably for much longer than the 4.3 development period). Jan
Christoph Egger
2012-Jul-05 13:39 UTC
Re: [PATCH] Xen/MCE: adjust for future new vMCE model
On 07/05/12 11:20, Liu, Jinsong wrote:> Jan Beulich wrote: >>>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>> wrote: @@ -68,7 +74,7 @@ >>> >>> int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { >>> - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) >>> + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) >>> { >>> dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA >>> capabilities" " %#" PRIx64 " for d%d:v%u >>> (supported: %#Lx)\n", @@ -77,6 +83,12 @@ return -EPERM; >>> } >>> >>> + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) + { >>> + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); + >>> return -EPERM; + } >>> + >>> v->arch.mcg_cap = caps; >>> return 0; >>> } >> >> These two changes, as pointed out before, need some further >> thought and tweaking: As I said earlier, we are already shipping >> the code in its prior form, so outright rejecting MCG_CTL_P set >> and non-default bank counts is not a proper solution. Warning >> about them being in an unsupported state is certainly acceptable. >> >> And I think the guest visible MCG_CAP value also shouldn''t >> change across migration, so tolerating (but ignoring) a higher >> bank count seems necessary. Not sure what to do when the >> bank count is lower (0 or 1) - for 1, all communication to the >> guest should probably go through bank 0, while 0 should >> probably disable vMCE for that vCPU. >> >> Further I just noticed that you don''t touch fill_vmsr_data() at >> all (sending patches created with diff''s -p option or equivalent >> helps spotting where individual changes belong), yet that >> function uses the hardware bank number to fill the struct >> bank_entry. With the intended concept, the "bank" member >> of that structure can probably go away altogether. >> >> Jan > > > Seems things become a little bit chaos, let''s jump out from details,> make agreement first about what''s the general purpose of this > middle-work patch.> > ===========> 1. current xen vmce status > 1.1) current xen vmce has 2 kinds of bugs: > * functional bug: it actually doesn''t work correctly for guest; > * migration bug: partly solved by c/s 24887; > 1.2) c/s 24887 not in formal xen release version, it''s a temporary fix (but unfortunately has been backported to SELS11 sp2) > ===========> 2. target of this middle-work patch > 2.1) it''s not used to address functional bug > 2.3) it does minimal work, just in order not to bring trouble/dirty to future new vMCE, so it would > * remove MCG_CTL --> otherwise new vMCE have to add useless MCG_CTL_P and MCG_CTL, w/ potential model specific issue; > * stick all 1''s to MCi_CTL --> to avoid semantic issue; > * set MCG_CTL_P=0 and 2 banks at MCG_CAP --> otherwise dirty code at new vMCE, like bank number issue; > ===========> > I think in this middle-work patch, we don''t need constrained by c/s 24887: > 1. c/s 24887 not in formal xen release > 2. the benefit of keep compatible w/ 24887 is minor: > * currently all xen version have migration bug, 3.x -> 4.0 -> 4.1 -> 4.2 > * keep compatible w/ 24887 only benefit one case --> migration from SLES11 sp2 to xen 4.2 (for both SLES11 sp2 and xen 4.2, vMCE actually doesn''t functionally work fine to guest) > 3. the price/hurt to future vMCE is high (to keep compatible w/ 24887) > > Considering above, I prefer an outright cleanup in xen4.2,> not constrained by c/s 24887 and not bring trouble/dirty to future > vMCE.>This all is fine from AMD side. The point I want to bring up is this: In xen-unstable vMCE is Intel only *but* I have a set of patches which will unify a lot of logic so that vMCE, page-offlining, etc. is also used on AMD I want to make sure that the vMCE interface works with both Intel and AMD to avoid yet another vMCE interface change (and all discussion around this) after the end of the feature freeze. Another vMCE interface change will affect both Intel and AMD. So I think this is also in Intel''s interest to have a sane vMCE interface that fits for both sides. Christoph -- ---to satisfy European Law for business letters: Advanced Micro Devices GmbH Einsteinring 24, 85689 Dornach b. Muenchen Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632
Jan Beulich wrote:>>>> On 05.07.12 at 14:46, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >> Jan Beulich wrote: >>>>>> On 05.07.12 at 11:20, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>> wrote: >>>> Jan Beulich wrote: >>>>>>>> On 05.07.12 at 05:30, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>>>> wrote: @@ -68,7 +74,7 @@ >>>>>> >>>>>> int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { >>>>>> - if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) >>>>>> + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT ) >>>>>> { >>>>>> dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA >>>>>> capabilities" " %#" PRIx64 " for d%d:v%u >>>>>> (supported: %#Lx)\n", @@ -77,6 +83,12 @@ return -EPERM; >>>>>> } >>>>>> >>>>>> + if ( (caps & MCG_CAP_COUNT) != GUEST_BANK_NUM ) + { >>>>>> + dprintk(XENLOG_G_ERR, "Bank number inconsistency\n"); + >>>>>> return -EPERM; + } + v->arch.mcg_cap = caps; >>>>>> return 0; >>>>>> } >>>>> >>>>> These two changes, as pointed out before, need some further >>>>> thought and tweaking: As I said earlier, we are already shipping >>>>> the code in its prior form, so outright rejecting MCG_CTL_P set >>>>> and non-default bank counts is not a proper solution. Warning >>>>> about them being in an unsupported state is certainly acceptable. >>>>> >>>>> And I think the guest visible MCG_CAP value also shouldn''t >>>>> change across migration, so tolerating (but ignoring) a higher >>>>> bank count seems necessary. Not sure what to do when the >>>>> bank count is lower (0 or 1) - for 1, all communication to the >>>>> guest should probably go through bank 0, while 0 should >>>>> probably disable vMCE for that vCPU. >>>>> >>>>> Further I just noticed that you don''t touch fill_vmsr_data() at >>>>> all (sending patches created with diff''s -p option or equivalent >>>>> helps spotting where individual changes belong), yet that >>>>> function uses the hardware bank number to fill the struct >>>>> bank_entry. With the intended concept, the "bank" member >>>>> of that structure can probably go away altogether. >>>>> >>>>> Jan >>>> >>>> >>>> Seems things become a little bit chaos, let''s jump out from >>>> details, make agreement first about what''s the general purpose of >>>> this middle-work patch. >>>> >>>> ===========>>>> 1. current xen vmce status >>>> 1.1) current xen vmce has 2 kinds of bugs: >>>> * functional bug: it actually doesn''t work correctly for guest; >>>> * migration bug: partly solved by c/s 24887; >>>> 1.2) c/s 24887 not in formal xen release version, it''s a >>>> temporary fix (but unfortunately has been backported to SELS11 >>>> sp2) ============ >>>> 2. target of this middle-work patch >>>> 2.1) it''s not used to address functional bug >>>> 2.3) it does minimal work, just in order not to bring >>>> trouble/dirty to future new vMCE, so it would * remove MCG_CTL >>>> --> otherwise new vMCE have to add useless MCG_CTL_P and MCG_CTL, >>>> w/ potential model specific issue; >>>> * stick all 1''s to MCi_CTL --> to avoid semantic issue; >>>> * set MCG_CTL_P=0 and 2 banks at MCG_CAP --> otherwise dirty >>>> code at new vMCE, like bank number issue; ===========>>>> >>>> I think in this middle-work patch, we don''t need constrained by >>>> c/s 24887: >>>> 1. c/s 24887 not in formal xen release >>>> 2. the benefit of keep compatible w/ 24887 is minor: >>>> * currently all xen version have migration bug, 3.x -> 4.0 -> >>>> 4.1 -> 4.2 >>>> * keep compatible w/ 24887 only benefit one case --> migration >>>> from SLES11 sp2 to xen 4.2 (for both SLES11 sp2 and xen 4.2, vMCE >>>> actually doesn''t functionally work fine to guest) >>>> 3. the price/hurt to future vMCE is high (to keep compatible w/ >>>> 24887) >>> >>> Why do you consider the price high? I think it would require pretty >>> minor changes. >> >> No, what I meant is not coding price. I mean the price that have to >> add >> dirty and useless change to the new vMCE is high. Just curious why >> we cannot simply get rid of c/s 24887? after all it only benefit >> SLES11 sp2. > > This is what save MCG_CAP, and that this is necessary we agreed > on iirc. So why would you want to drop that code all of the sudden?The original idea we buy in save/restore MCG_CAP is for the sake of future capability bits exentsion of MCG_CAP. I didn''t image keep a useless bit in new vMCE.> >>>> Considering above, I prefer an outright cleanup in xen4.2, not >>>> constrained by c/s 24887 and not bring trouble/dirty to future >>>> vMCE. >>> >>> Whether or not is was appropriate for us to backport this change >>> early is unclear, but given that back at that time I had already >>> pointed out the problems and asked for work to be done to clean >>> it up, and given that it has taken over four months to get going >>> with this, I don''t think you would suggest the alternative of us >>> having left the bug unfixed for this entire time period. >> >> Jan, I''m not challenge why you backport to SLES11 sp2. If there is >> anything wrong, I agree it''s my fault. Currently what I concern is >> if we can do an outright cleanup at xen4.2 so that future vMCE could >> be simple and clean. > > But we can''t tell our customers that migration from, say, SP2 to > SP3 won''t be possible.Somehow I agree, though not 100%, say, how can you tell customers that migrate from sp1 to sp2 (and former) would block? AFAICT it only benefit a little earlier to sp2 (in our solution it delay to sp3).> >>> So unless the price to pay is unreasonably high, I''d favor getting >>> this taken care of rather than ignored. >> >> If so, why we need this middle-work patch? It could just keep >> current status at xen 4.2, then start ''dirty'' new vMCE implement at >> xen4.3 --> and the ''dirty'' inherit from c/s 24887 which from code >> point of view would be totally removed at new vMCE. > > No, what we now want is to complete said c/s. While at that time > I thought we would need to save MCi_CTL, with the new concept > we won''t need to. Instead, we need to enforce it to be all ones > universally. > > The only compatibility thing is the need to deal with higher bank > counts perceived to be available by guests, and the possibly > previously seen set MCG_CTL_P bit. > > And yes, if this created a lot of ugly and otherwise unnecessary > code, I''d agree not to care for this compatibility. But as I don''t > expect this to be the case, I thought I''d still ask for it (since if > we don''t do it in -unstable, we''ll have to carry a private patch > in SLE to do so, and presumably for much longer than the 4.3 > development period). > > JanIf so, the only thing we need do in this middle-work patch is to remove mci_ctl bank array and stick all 1''s to MCi_CTL; Thanks, Jinsong
>>> On 05.07.12 at 17:56, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > Jan Beulich wrote: >>>>> On 05.07.12 at 14:46, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >>> No, what I meant is not coding price. I mean the price that have to >>> add >>> dirty and useless change to the new vMCE is high. Just curious why >>> we cannot simply get rid of c/s 24887? after all it only benefit >>> SLES11 sp2. >> >> This is what save MCG_CAP, and that this is necessary we agreed >> on iirc. So why would you want to drop that code all of the sudden? > > The original idea we buy in save/restore MCG_CAP is for the sake of future > capability bits exentsion of MCG_CAP. I didn''t image keep a useless bit in > new vMCE.Which bit is useless?>>> Jan, I''m not challenge why you backport to SLES11 sp2. If there is >>> anything wrong, I agree it''s my fault. Currently what I concern is >>> if we can do an outright cleanup at xen4.2 so that future vMCE could >>> be simple and clean. >> >> But we can''t tell our customers that migration from, say, SP2 to >> SP3 won''t be possible. > > Somehow I agree, though not 100%, say, how can you tell customers that > migrate from sp1 to sp2 (and former) would block? AFAICT it only benefit a > little earlier to sp2 (in our solution it delay to sp3).I''m afraid I don''t understand what you try to tell me here.>>> If so, why we need this middle-work patch? It could just keep >>> current status at xen 4.2, then start ''dirty'' new vMCE implement at >>> xen4.3 --> and the ''dirty'' inherit from c/s 24887 which from code >>> point of view would be totally removed at new vMCE. >> >> No, what we now want is to complete said c/s. While at that time >> I thought we would need to save MCi_CTL, with the new concept >> we won''t need to. Instead, we need to enforce it to be all ones >> universally. >> >> The only compatibility thing is the need to deal with higher bank >> counts perceived to be available by guests, and the possibly >> previously seen set MCG_CTL_P bit. >> >> And yes, if this created a lot of ugly and otherwise unnecessary >> code, I''d agree not to care for this compatibility. But as I don''t >> expect this to be the case, I thought I''d still ask for it (since if >> we don''t do it in -unstable, we''ll have to carry a private patch >> in SLE to do so, and presumably for much longer than the 4.3 >> development period). > > If so, the only thing we need do in this middle-work patch is to remove > mci_ctl bank array and stick all 1''s to MCi_CTL;And removing MCG_CTL at the same time is the logical consequence. If we''re in agreement that with this we can adjust things in 4.3 without breaking 4.2->4.3 migration, then why don''t we just do that? Jan
Jan Beulich wrote:>>>> On 05.07.12 at 17:56, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >> Jan Beulich wrote: >>>>>> On 05.07.12 at 14:46, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>> wrote: >>>> No, what I meant is not coding price. I mean the price that have >>>> to add dirty and useless change to the new vMCE is high. Just >>>> curious why we cannot simply get rid of c/s 24887? after all it >>>> only benefit SLES11 sp2. >>> >>> This is what save MCG_CAP, and that this is necessary we agreed >>> on iirc. So why would you want to drop that code all of the sudden? >> >> The original idea we buy in save/restore MCG_CAP is for the sake of >> future capability bits exentsion of MCG_CAP. I didn''t image keep a >> useless bit in new vMCE. > > Which bit is useless?I mean, MCG_CTL_P bit (and the MCG_CTL reg) are useless for new vMCE. But according to your opinion (if I didn''t misunderstand), we have to enable MCG_CTL_P bit and keep MCG_CTL reg in the future.> >>>> Jan, I''m not challenge why you backport to SLES11 sp2. If there is >>>> anything wrong, I agree it''s my fault. Currently what I concern is >>>> if we can do an outright cleanup at xen4.2 so that future vMCE >>>> could be simple and clean. >>> >>> But we can''t tell our customers that migration from, say, SP2 to >>> SP3 won''t be possible. >> >> Somehow I agree, though not 100%, say, how can you tell customers >> that migrate from sp1 to sp2 (and former) would block? AFAICT it >> only benefit a little earlier to sp2 (in our solution it delay to >> sp3). > > I''m afraid I don''t understand what you try to tell me here.Per my understanding, you just backport c/s 24887 to sp2, so sp0->sp1 and sp1->sp2 migration still block.> >>>> If so, why we need this middle-work patch? It could just keep >>>> current status at xen 4.2, then start ''dirty'' new vMCE implement at >>>> xen4.3 --> and the ''dirty'' inherit from c/s 24887 which from code >>>> point of view would be totally removed at new vMCE. >>> >>> No, what we now want is to complete said c/s. While at that time >>> I thought we would need to save MCi_CTL, with the new concept >>> we won''t need to. Instead, we need to enforce it to be all ones >>> universally. >>> >>> The only compatibility thing is the need to deal with higher bank >>> counts perceived to be available by guests, and the possibly >>> previously seen set MCG_CTL_P bit. >>> >>> And yes, if this created a lot of ugly and otherwise unnecessary >>> code, I''d agree not to care for this compatibility. But as I don''t >>> expect this to be the case, I thought I''d still ask for it (since if >>> we don''t do it in -unstable, we''ll have to carry a private patch >>> in SLE to do so, and presumably for much longer than the 4.3 >>> development period). >> >> If so, the only thing we need do in this middle-work patch is to >> remove mci_ctl bank array and stick all 1''s to MCi_CTL; > > And removing MCG_CTL at the same time is the logical consequence.I again confuse here. You don''t mind I disable MCG_CTL_P bit and remove MCG_CTL reg in this middle-work patch? so which point you against my former patch? 1. remove MCG_CTL; 2. stick all 1''s to MCi_CTL; 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP;> > If we''re in agreement that with this we can adjust things in 4.3 > without breaking 4.2->4.3 migration, then why don''t we just do > that? > > Jan
> I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me where can I get > AMD''s *latest* open doc (something like amd architecture programmer manual)? > > If AMD has these 2 bits, it''s safe to set them independent of host capability -- guest > will just think it running on a platform w/ some events *possilbe* (though actually > may never occur), hypervisor know what actually occur and has the flexibility to > decide what it would like to inject to guest. > > This code is only used by Intel, and it''s only for not blocking future vMCE, so it just do minimal necessary update.I think you should be very wary of creating "Franken-machines" that look half AMD (according to CPUID) and half Intel (according to MCG_CAP). You can look at the Linux code and check whether we always make sensible decisions when presented with such a system ... but you may not have that luxury with other guest operating systems. My general mantra is that untested code paths have bugs. -Tony
> Yes, it''s vMCE specific issue, and only under the case when migrate from platform > A w/ bigger bank number to platform B w/ smaller bank number. Under this case, > it would block migration from 3.x -> 4.0 -> 4.1 -> 4.2.Linux (and probably every other OS) will only look to see how many banks there are once at boot time. From that point on it will scan as many banks as it was told existed for the rest of eternity. So it really doesn''t matter what value you present in MCG_CAP after a migration - the OS will never look at it any way. The important thing is what you do when the OS does: for (i = 0; i < banks; i++) { status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); ... } because you''d better give it something reasonable for each bank that it was told existed. Also if you are going to present a machine check to the guest, then you need to fill in the data for some bank that it knows about (i.e. i < banks ... and possibly i != 0 for certain CPUIDs). -Tony
Luck, Tony wrote:>> I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me >> where can I get >> AMD''s *latest* open doc (something like amd architecture programmer >> manual)? >> >> If AMD has these 2 bits, it''s safe to set them independent of host >> capability -- guest >> will just think it running on a platform w/ some events *possilbe* >> (though actually >> may never occur), hypervisor know what actually occur and has the >> flexibility to >> decide what it would like to inject to guest. >> >> This code is only used by Intel, and it''s only for not blocking >> future vMCE, so it just do minimal necessary update. > > I think you should be very wary of creating "Franken-machines" that > look half AMD (according to CPUID) and half Intel (according to > MCG_CAP). You can look at the Linux code and check whether we always > make sensible decisions when presented with > such a system ... but you may not have that luxury with other guest > operating systems. My general mantra is that untested code paths have > bugs. > > -TonyYes, I indeed concern AMD cpuid vs. Intel MCG_CAP. Do you suggest that we''d better separately provide Intel''s and AMD''s vMCE interface? Thanks, Jinsong
Luck, Tony wrote:>> Yes, it''s vMCE specific issue, and only under the case when migrate >> from platform A w/ bigger bank number to platform B w/ smaller bank >> number. Under this case, it would block migration from 3.x -> 4.0 -> >> 4.1 -> 4.2. > > Linux (and probably every other OS) will only look to see how many > banks there > are once at boot time. From that point on it will scan as many banks > as it was told existed for the rest of eternity. So it really doesn''t > matter what value you present in MCG_CAP after a migration - the OS > will never look at it any way. The important thing is what you do > when the OS does: for (i = 0; i < banks; i++) { > status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); > ... > } > because you''d better give it something reasonable for each bank that > it was > told existed.Hmm, this bug is not worry about MCG_CAP bank field. It in fact caused from current dirty xen vmce code and its data structure ... and because of that, it block migration when migrate from big bank to small bank platform :)> > Also if you are going to present a machine check to the guest, then > you need to fill in the data for some bank that it knows about (i.e. > i < banks ... and possibly i != 0 for certain CPUIDs). > > -Tony
> Yes, I indeed concern AMD cpuid vs. Intel MCG_CAP. Do you suggest that we''d better separately provide Intel''s and AMD''s vMCE interface?It might be worth a little extra coding now to avoid problems in the future if AMD and Intel ever use the same MCG_CAP bit for different purposes. I don''t know if this would ever happen - I''m just issuing a general architectural caution that decisions made now and coded into software live for a very long time. -Tony
>>> On 05.07.12 at 18:42, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: > Jan Beulich wrote: >>>>> On 05.07.12 at 17:56, "Liu, Jinsong" <jinsong.liu@intel.com> wrote: >>> Jan Beulich wrote: >>>>>>> On 05.07.12 at 14:46, "Liu, Jinsong" <jinsong.liu@intel.com> >>>>>>> wrote: >>>>> No, what I meant is not coding price. I mean the price that have >>>>> to add dirty and useless change to the new vMCE is high. Just >>>>> curious why we cannot simply get rid of c/s 24887? after all it >>>>> only benefit SLES11 sp2. >>>> >>>> This is what save MCG_CAP, and that this is necessary we agreed >>>> on iirc. So why would you want to drop that code all of the sudden? >>> >>> The original idea we buy in save/restore MCG_CAP is for the sake of >>> future capability bits exentsion of MCG_CAP. I didn''t image keep a >>> useless bit in new vMCE. >> >> Which bit is useless? > > I mean, MCG_CTL_P bit (and the MCG_CTL reg) are useless for new vMCE. > > But according to your opinion (if I didn''t misunderstand), we have to enable > MCG_CTL_P bit and keep MCG_CTL reg in the future.No - we need to tolerate MCG_CTL_P coming in set from a migration. There''s no reason I can see why MCG_CTL needs to be retained, all that''s needed is that guest accesses don''t fault (returning all 1s is, if I got your earlier mails right, to be tolerated by guests).> Per my understanding, you just backport c/s 24887 to sp2, so sp0->sp1 and > sp1->sp2 migration still block.Yes, but sp2->sp2 migration now works even when the destination host has fewer banks. An sp1->sp2 migration would still underly the restriction on the number of banks, but one could now do a sp1->sp2->sp2 migration to achieve the effect of migrating to a less capable host.>>> If so, the only thing we need do in this middle-work patch is to >>> remove mci_ctl bank array and stick all 1''s to MCi_CTL; >> >> And removing MCG_CTL at the same time is the logical consequence. > > I again confuse here. You don''t mind I disable MCG_CTL_P bit and remove > MCG_CTL reg in this middle-work patch? so which point you against my former > patch? > 1. remove MCG_CTL; > 2. stick all 1''s to MCi_CTL; > 3. set MCG_CTL_P=0 and 2 banks at MCG_CAP;None of these. What I do mind is you disallowing incoming migration with MCG_CTL_P set and/or bank count != 2. Jan
Christoph Egger
2012-Jul-06 09:12 UTC
Re: [PATCH] Xen/MCE: adjust for future new vMCE model
On 07/05/12 20:38, Liu, Jinsong wrote:> Luck, Tony wrote: >>> I''m not sure if AMD has these 2 bits in MCG_CAP. Could you tell me >>> where can I get >>> AMD''s *latest* open doc (something like amd architecture programmer >>> manual)? >>> >>> If AMD has these 2 bits, it''s safe to set them independent of host >>> capability -- guest >>> will just think it running on a platform w/ some events *possilbe* >>> (though actually >>> may never occur), hypervisor know what actually occur and has the >>> flexibility to >>> decide what it would like to inject to guest. >>> >>> This code is only used by Intel, and it''s only for not blocking >>> future vMCE, so it just do minimal necessary update. >> >> I think you should be very wary of creating "Franken-machines" that >> look half AMD (according to CPUID) and half Intel (according to >> MCG_CAP). You can look at the Linux code and check whether we always >> make sensible decisions when presented with >> such a system ... but you may not have that luxury with other guest >> operating systems. My general mantra is that untested code paths have >> bugs. >> >> -Tony > > Yes, I indeed concern AMD cpuid vs. Intel MCG_CAP. Do you suggest> that we''d better separately provide Intel''s and AMD''s vMCE interface?That is no reason to have seperate vmce_intel.c and vmce_amd.c files. From the last patch in function vmce_init_msr(): + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; + I think, this should be changed to: g_mcg_cap = GUEST_BANK_NUM; if (cpu_vendor == X86_VENDOR_INTEL) g_mcg_cap |= MCG_TES_P | MCG_SER_P; Another question: What happens when a guest access the MSRs 0xc0000408, 0xc0000409 and 0xc000040a ? Christoph -- ---to satisfy European Law for business letters: Advanced Micro Devices GmbH Einsteinring 24, 85689 Dornach b. Muenchen Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632
>>> On 06.07.12 at 11:12, Christoph Egger <Christoph.Egger@amd.com> wrote: > On 07/05/12 20:38, Liu, Jinsong wrote: >> Yes, I indeed concern AMD cpuid vs. Intel MCG_CAP. Do you suggest >> that we''d better separately provide Intel''s and AMD''s vMCE interface? > > That is no reason to have seperate vmce_intel.c and vmce_amd.c files. > > From the last patch in function vmce_init_msr(): > > + g_mcg_cap = MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM; > + > > I think, this should be changed to: > > g_mcg_cap = GUEST_BANK_NUM; > if (cpu_vendor == X86_VENDOR_INTEL) > g_mcg_cap |= MCG_TES_P | MCG_SER_P;While I agree in general, that may imply further problems when migrating between different vendor CPUs. On the assumption that OSes look at all this information at boot time only, it might be as simple as adjusting behavior to the guest specified (possibly restored) value (if the two bits have an effect on vMCE behavior at all).> Another question: > > What happens when a guest access the MSRs > 0xc0000408, 0xc0000409 and 0xc000040a ?This should depend on the bank count in the vCPU''s MCG_CAP (#GP if count <= 2, returning sensible values - all 0s or all 1s, depending on the register - if count implies existence of these MSRs). Jan