Xen/MCE: vMCE emulation This patch provides virtual MCE support to guest. It emulates a simple and clean MCE MSRs interface to guest by faking caps to guest if needed and masking caps if unnecessary: 1. Providing a well-defined MCG_CAP to guest, filter out un-necessary caps and provide only guest needed caps; 2. Disabling MCG_CTL to avoid model specific; 3. Sticking all 1''s to MCi_CTL to guest to avoid model specific; 4. Enabling CMCI cap but never really inject to guest to prevent polling periodically; 5. Masking MSCOD field of MCi_STATUS to avoid model specific; 6. Keeping natural semantics by per-vcpu instead of per-domain variables; 7. Using bank1 and reserving bank0 to work around ''bank0 quirk'' of some very old processors; 8. Cleaning some vMCE# injection logic which shared by Intel and AMD but useless under new vMCE implement; 9. Keeping compatilbe w/ old xen version which has been backported to SLES11 SP2, so that old vMCE would not blocked when migrate to new vMCE; Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com> diff -r fbd9e864c047 xen/arch/x86/cpu/mcheck/mce.h --- a/xen/arch/x86/cpu/mcheck/mce.h Mon Sep 17 18:02:59 2012 +0800 +++ b/xen/arch/x86/cpu/mcheck/mce.h Tue Sep 18 22:39:10 2012 +0800 @@ -168,13 +168,12 @@ int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d, uint64_t gstatus); int inject_vmce(struct domain *d); -int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global); static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) { if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && msr >= MSR_IA32_MC0_CTL2 && - msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) + msr < MSR_IA32_MCx_CTL2(v->arch.vmce.mcg_cap & MCG_CAP_COUNT) ) return 1; return 0; } @@ -182,7 +181,7 @@ static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr) { if ( (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(v->arch.mcg_cap & MCG_CAP_COUNT)) || + msr < 
MSR_IA32_MCx_CTL(v->arch.vmce.mcg_cap & MCG_CAP_COUNT)) || mce_vendor_bank_msr(v, msr) ) return 1; return 0; diff -r fbd9e864c047 xen/arch/x86/cpu/mcheck/mce_intel.c --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Mon Sep 17 18:02:59 2012 +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Sep 18 22:39:10 2012 +0800 @@ -1300,14 +1300,15 @@ /* intel specific MCA MSR */ int intel_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) { + unsigned int bank = msr - MSR_IA32_MC0_CTL2; int ret = 0; - if ( msr >= MSR_IA32_MC0_CTL2 && - msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) + if ( bank < GUEST_MC_BANK_NUM ) { - mce_printk(MCE_QUIET, "We have disabled CMCI capability, " - "Guest should not write this MSR!\n"); - ret = 1; + v->arch.vmce.bank[bank].mci_ctl2 = val; + mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %"PRIx64"\n", + bank, val); + ret = 1; } return ret; @@ -1315,13 +1316,14 @@ int intel_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) { + unsigned int bank = msr - MSR_IA32_MC0_CTL2; int ret = 0; - if ( msr >= MSR_IA32_MC0_CTL2 && - msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) + if ( bank < GUEST_MC_BANK_NUM ) { - mce_printk(MCE_QUIET, "We have disabled CMCI capability, " - "Guest should not read this MSR!\n"); + *val = v->arch.vmce.bank[bank].mci_ctl2; + mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL2 0x%"PRIx64"\n", + bank, *val); ret = 1; } diff -r fbd9e864c047 xen/arch/x86/cpu/mcheck/vmce.c --- a/xen/arch/x86/cpu/mcheck/vmce.c Mon Sep 17 18:02:59 2012 +0800 +++ b/xen/arch/x86/cpu/mcheck/vmce.c Tue Sep 18 22:39:10 2012 +0800 @@ -1,5 +1,22 @@ /* - * vmce.c - virtual MCE support + * vmce.c - provide software emulated vMCE support to guest + * + * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com> + * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the 
Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <xen/init.h> @@ -19,67 +36,55 @@ #include "mce.h" #include "x86_mca.h" -/* - * Emulate 2 banks for guest - * Bank0: reserved for ''bank0 quirk'' occur at some very old processors: - * 1). Intel cpu whose family-model value < 06-1A; - * 2). AMD K7 - * Bank1: used to transfer error info to guest - */ -#define GUEST_BANK_NUM 2 -#define GUEST_MCG_CAP (MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM) - -#define dom_vmce(x) ((x)->arch.vmca_msrs) - -int vmce_init_msr(struct domain *d) -{ - dom_vmce(d) = xmalloc(struct domain_mca_msrs); - if ( !dom_vmce(d) ) - return -ENOMEM; - - dom_vmce(d)->mcg_status = 0x0; - dom_vmce(d)->nr_injection = 0; - - INIT_LIST_HEAD(&dom_vmce(d)->impact_header); - spin_lock_init(&dom_vmce(d)->lock); - - return 0; -} - -void vmce_destroy_msr(struct domain *d) -{ - if ( !dom_vmce(d) ) - return; - xfree(dom_vmce(d)); - dom_vmce(d) = NULL; -} - void vmce_init_vcpu(struct vcpu *v) { - v->arch.mcg_cap = GUEST_MCG_CAP; + int i; + + /* global MCA MSRs init */ + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) + v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP; + else + v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP; + + v->arch.vmce.mcg_status = 0; + + /* per-bank MCA MSRs init */ + for ( i = 0; i < GUEST_MC_BANK_NUM; i++ ) + memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank)); + + spin_lock_init(&v->arch.vmce.lock); } int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { - if ( caps & 
~GUEST_MCG_CAP & ~MCG_CAP_COUNT & ~MCG_CTL_P ) + uint64_t guest_mcg_cap; + + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) + guest_mcg_cap = INTEL_GUEST_MCG_CAP; + else + guest_mcg_cap = AMD_GUEST_MCG_CAP; + + if ( caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) { dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", is_hvm_vcpu(v) ? "HVM" : "PV", caps, v->domain->domain_id, - v->vcpu_id, GUEST_MCG_CAP & ~MCG_CAP_COUNT); + v->vcpu_id, guest_mcg_cap & ~MCG_CAP_COUNT); return -EPERM; } - v->arch.mcg_cap = caps; + v->arch.vmce.mcg_cap = caps; return 0; } -static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) +/* + * For historic version reason, bank number may greater than GUEST_MC_BANK_NUM, + * when migratie from old vMCE version to new vMCE. + */ +static int bank_mce_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val) { int ret = 1; unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; - struct domain_mca_msrs *vmce = dom_vmce(v->domain); - struct bank_entry *entry; *val = 0; @@ -92,46 +97,33 @@ bank, *val); break; case MSR_IA32_MC0_STATUS: - /* Only error bank is read. Non-error banks simply return. 
*/ - if ( !list_empty(&vmce->impact_header) ) + if ( bank < GUEST_MC_BANK_NUM ) { - entry = list_entry(vmce->impact_header.next, - struct bank_entry, list); - if ( entry->bank == bank ) - { - *val = entry->mci_status; + *val = v->arch.vmce.bank[bank].mci_status; + if ( *val ) mce_printk(MCE_VERBOSE, - "MCE: rd MC%u_STATUS in vMCE# context " - "value 0x%"PRIx64"\n", bank, *val); - } + "MCE: rdmsr MC%u_STATUS in vMCE# context " + "0x%"PRIx64"\n", bank, *val); } break; case MSR_IA32_MC0_ADDR: - if ( !list_empty(&vmce->impact_header) ) + if ( bank < GUEST_MC_BANK_NUM ) { - entry = list_entry(vmce->impact_header.next, - struct bank_entry, list); - if ( entry->bank == bank ) - { - *val = entry->mci_addr; + *val = v->arch.vmce.bank[bank].mci_addr; + if ( *val ) mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_ADDR in vMCE# context " "0x%"PRIx64"\n", bank, *val); - } } break; case MSR_IA32_MC0_MISC: - if ( !list_empty(&vmce->impact_header) ) + if ( bank < GUEST_MC_BANK_NUM ) { - entry = list_entry(vmce->impact_header.next, - struct bank_entry, list); - if ( entry->bank == bank ) - { - *val = entry->mci_misc; + *val = v->arch.vmce.bank[bank].mci_misc; + if ( *val ) mce_printk(MCE_VERBOSE, - "MCE: rd MC%u_MISC in vMCE# context " + "MCE: rdmsr MC%u_MISC in vMCE# context " "0x%"PRIx64"\n", bank, *val); - } } break; default: @@ -157,56 +149,50 @@ */ int vmce_rdmsr(uint32_t msr, uint64_t *val) { - const struct vcpu *cur = current; - struct domain_mca_msrs *vmce = dom_vmce(cur->domain); + struct vcpu *cur = current; int ret = 1; *val = 0; - spin_lock(&vmce->lock); + spin_lock(&cur->arch.vmce.lock); switch ( msr ) { case MSR_IA32_MCG_STATUS: - *val = vmce->mcg_status; + *val = cur->arch.vmce.mcg_status; if (*val) mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val); break; case MSR_IA32_MCG_CAP: - *val = cur->arch.mcg_cap; - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n", - *val); + *val = cur->arch.vmce.mcg_cap; + mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 
0x%"PRIx64"\n", *val); break; case MSR_IA32_MCG_CTL: - /* Stick all 1''s when CTL support, and 0''s when no CTL support */ - if ( cur->arch.mcg_cap & MCG_CTL_P ) - *val = ~0ULL; - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", *val); + if ( cur->arch.vmce.mcg_cap & MCG_CTL_P ) + { + *val = ~0UL; + mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", *val); + } break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; break; } - spin_unlock(&vmce->lock); + spin_unlock(&cur->arch.vmce.lock); + return ret; } +/* + * For historic version reason, bank number may greater than GUEST_MC_BANK_NUM, + * when migratie from old vMCE version to new vMCE. + */ static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val) { int ret = 1; unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; - struct domain_mca_msrs *vmce = dom_vmce(v->domain); - struct bank_entry *entry = NULL; - - /* Give the first entry of the list, it corresponds to current - * vMCE# injection. When vMCE# is finished processing by the - * the guest, this node will be deleted. - * Only error bank is written. Non-error banks simply return. 
- */ - if ( !list_empty(&vmce->impact_header) ) - entry = list_entry(vmce->impact_header.next, struct bank_entry, list); switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { @@ -217,50 +203,46 @@ */ break; case MSR_IA32_MC0_STATUS: - if ( entry && (entry->bank == bank) ) + if ( val ) { - entry->mci_status = val; - mce_printk(MCE_VERBOSE, - "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n", + mce_printk(MCE_QUIET, + "MCE: wr MC%u_STATUS w/ non-zero cause #GP\n", bank); + ret = -1; + } + if ( bank < GUEST_MC_BANK_NUM ) + { + v->arch.vmce.bank[bank].mci_status = val; + mce_printk(MCE_VERBOSE, "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val); } - else - mce_printk(MCE_VERBOSE, - "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val); break; case MSR_IA32_MC0_ADDR: - if ( !~val ) + if ( val ) { mce_printk(MCE_QUIET, - "MCE: wr MC%u_ADDR with all 1s will cause #GP\n", bank); + "MCE: wr MC%u_ADDR w/ non-zero cause #GP\n", bank); ret = -1; } - else if ( entry && (entry->bank == bank) ) + else if ( bank < GUEST_MC_BANK_NUM ) { - entry->mci_addr = val; - mce_printk(MCE_VERBOSE, - "MCE: wr MC%u_ADDR %"PRIx64" in vMCE#\n", bank, val); - } - else + v->arch.vmce.bank[bank].mci_addr = val; mce_printk(MCE_VERBOSE, "MCE: wr MC%u_ADDR %"PRIx64"\n", bank, val); + } break; case MSR_IA32_MC0_MISC: - if ( !~val ) + if ( val ) { mce_printk(MCE_QUIET, - "MCE: wr MC%u_MISC with all 1s will cause #GP\n", bank); + "MCE: wr MC%u_MISC w/ non-zero cause #GP\n", bank); ret = -1; } - else if ( entry && (entry->bank == bank) ) + else if ( bank < GUEST_MC_BANK_NUM ) { - entry->mci_misc = val; - mce_printk(MCE_VERBOSE, - "MCE: wr MC%u_MISC %"PRIx64" in vMCE#\n", bank, val); - } - else + v->arch.vmce.bank[bank].mci_misc = val; mce_printk(MCE_VERBOSE, "MCE: wr MC%u_MISC %"PRIx64"\n", bank, val); + } break; default: switch ( boot_cpu_data.x86_vendor ) @@ -286,52 +268,33 @@ int vmce_wrmsr(u32 msr, u64 val) { struct vcpu *cur = current; - struct bank_entry *entry = NULL; - struct domain_mca_msrs *vmce = dom_vmce(cur->domain); int ret 
= 1; - spin_lock(&vmce->lock); + spin_lock(&cur->arch.vmce.lock); switch ( msr ) { case MSR_IA32_MCG_CTL: + /* If MCG_CTL exist then stick to all 1''s. If not exist then ignore */ break; case MSR_IA32_MCG_STATUS: - vmce->mcg_status = val; + cur->arch.vmce.mcg_status = val; mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val); - /* For HVM guest, this is the point for deleting vMCE injection node */ - if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) ) - { - vmce->nr_injection--; /* Should be 0 */ - if ( !list_empty(&vmce->impact_header) ) - { - entry = list_entry(vmce->impact_header.next, - struct bank_entry, list); - if ( entry->mci_status & MCi_STATUS_VAL ) - mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have " - "been cleared before write MCG_STATUS MSR\n"); - - mce_printk(MCE_QUIET, "MCE: Delete HVM last injection " - "Node, nr_injection %u\n", - vmce->nr_injection); - list_del(&entry->list); - xfree(entry); - } - else - mce_printk(MCE_QUIET, "MCE: Not found HVM guest" - " last injection Node, something Wrong!\n"); - } break; case MSR_IA32_MCG_CAP: - mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n"); - ret = -1; + /* + * According to Intel SDM, IA32_MCG_CAP is a read-only register, + * the effect of writing to the IA32_MCG_CAP is undefined. Here we + * treat writing as ''write not change''. Guest would not surprise. + */ + mce_printk(MCE_QUIET, "MCE: MCG_CAP is read only and write not change\n"); break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0; break; } - spin_unlock(&vmce->lock); + spin_unlock(&cur->arch.vmce.lock); return ret; } @@ -342,7 +305,7 @@ for_each_vcpu( d, v ) { struct hvm_vmce_vcpu ctxt = { - .caps = v->arch.mcg_cap + .caps = v->arch.vmce.mcg_cap }; err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt); @@ -422,93 +385,38 @@ return 0; } -/* This node list records errors impacting a domain. when one - * MCE# happens, one error bank impacts a domain. 
This error node - * will be inserted to the tail of the per_dom data for vMCE# MSR - * virtualization. When one vMCE# injection is finished processing - * processed by guest, the corresponding node will be deleted. - * This node list is for GUEST vMCE# MSRS virtualization. - */ -static struct bank_entry* alloc_bank_entry(void) +int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d, + uint64_t gstatus) { - struct bank_entry *entry; + struct vcpu *v = d->vcpu[0]; - entry = xzalloc(struct bank_entry); - if ( entry == NULL ) - { - printk(KERN_ERR "MCE: malloc bank_entry failed\n"); - return NULL; - } - - INIT_LIST_HEAD(&entry->list); - return entry; -} - -/* Fill error bank info for #vMCE injection and GUEST vMCE# - * MSR virtualization data - * 1) Log down how many nr_injections of the impacted. - * 2) Copy MCE# error bank to impacted DOM node list, - * for vMCE# MSRs virtualization - */ -int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d, - uint64_t gstatus) { - struct bank_entry *entry; - - /* This error bank impacts one domain, we need to fill domain related - * data for vMCE MSRs virtualization and vMCE# injection */ if ( mc_bank->mc_domid != (uint16_t)~0 ) { - /* For HVM guest, Only when first vMCE is consumed by HVM guest - * successfully, will we generete another node and inject another vMCE. - */ - if ( d->is_hvm && (dom_vmce(d)->nr_injection > 0) ) + if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP ) { - mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous" + mce_printk(MCE_QUIET, "MCE: guest has not handled previous" " vMCE yet!\n"); return -1; } - entry = alloc_bank_entry(); - if ( entry == NULL ) - return -1; + spin_lock(&v->arch.vmce.lock); - entry->mci_status = mc_bank->mc_status; - entry->mci_addr = mc_bank->mc_addr; - entry->mci_misc = mc_bank->mc_misc; - entry->bank = mc_bank->mc_bank; + v->arch.vmce.mcg_status = gstatus; + /* + * 1. Skip bank 0 to avoid ''bank 0 quirk'' of old processors + * 2. 
Filter MCi_STATUS MSCOD model specific error code to guest + */ + v->arch.vmce.bank[1].mci_status = mc_bank->mc_status & + MCi_STATUS_MSCOD_MASK; + v->arch.vmce.bank[1].mci_addr = mc_bank->mc_addr; + v->arch.vmce.bank[1].mci_misc = mc_bank->mc_misc; - spin_lock(&dom_vmce(d)->lock); - /* New error Node, insert to the tail of the per_dom data */ - list_add_tail(&entry->list, &dom_vmce(d)->impact_header); - /* Fill MSR global status */ - dom_vmce(d)->mcg_status = gstatus; - /* New node impact the domain, need another vMCE# injection*/ - dom_vmce(d)->nr_injection++; - spin_unlock(&dom_vmce(d)->lock); - - mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d " - "status %"PRIx64" addr %"PRIx64" domid %d]\n ", - mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr, - mc_bank->mc_domid); + spin_unlock(&v->arch.vmce.lock); } return 0; } -#if 0 /* currently unused */ -int vmce_domain_inject( - struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global) -{ - int ret; - - ret = fill_vmsr_data(bank, d, global->mc_gstatus); - if ( ret < 0 ) - return ret; - - return inject_vmce(d); -} -#endif - static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct domain *d) { struct vcpu *v; diff -r fbd9e864c047 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Mon Sep 17 18:02:59 2012 +0800 +++ b/xen/arch/x86/domain.c Tue Sep 18 22:39:10 2012 +0800 @@ -571,9 +571,6 @@ if ( (rc = iommu_domain_init(d)) != 0 ) goto fail; - - /* For Guest vMCE MSRs virtualization */ - vmce_init_msr(d); } if ( is_hvm_domain(d) ) @@ -600,7 +597,6 @@ fail: d->is_dying = DOMDYING_dead; - vmce_destroy_msr(d); cleanup_domain_irq_mapping(d); free_xenheap_page(d->shared_info); if ( paging_initialised ) @@ -623,7 +619,6 @@ else xfree(d->arch.pv_domain.e820); - vmce_destroy_msr(d); free_domain_pirqs(d); if ( !is_idle_domain(d) ) iommu_domain_destroy(d); diff -r fbd9e864c047 xen/arch/x86/domctl.c --- a/xen/arch/x86/domctl.c Mon Sep 17 18:02:59 2012 +0800 +++ b/xen/arch/x86/domctl.c Tue Sep 18 22:39:10 
2012 +0800 @@ -1024,7 +1024,7 @@ evc->syscall32_callback_eip = 0; evc->syscall32_disables_events = 0; } - evc->mcg_cap = v->arch.mcg_cap; + evc->mcg_cap = v->arch.vmce.mcg_cap; } else { diff -r fbd9e864c047 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Mon Sep 17 18:02:59 2012 +0800 +++ b/xen/arch/x86/traps.c Tue Sep 18 22:39:10 2012 +0800 @@ -3133,50 +3133,6 @@ break; ASSERT(trap <= VCPU_TRAP_LAST); - /* inject vMCE to PV_Guest including DOM0. */ - if ( trap == VCPU_TRAP_MCE ) - { - gdprintk(XENLOG_DEBUG, "MCE: Return from vMCE# trap!\n"); - if ( curr->vcpu_id == 0 ) - { - struct domain *d = curr->domain; - - if ( !d->arch.vmca_msrs->nr_injection ) - { - printk(XENLOG_WARNING "MCE: ret from vMCE#, " - "no injection node\n"); - goto end; - } - - d->arch.vmca_msrs->nr_injection--; - if ( !list_empty(&d->arch.vmca_msrs->impact_header) ) - { - struct bank_entry *entry; - - entry = list_entry(d->arch.vmca_msrs->impact_header.next, - struct bank_entry, list); - gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n"); - list_del(&entry->list); - } - else - printk(XENLOG_ERR "MCE: didn''t found last injection node\n"); - - /* further injection */ - if ( d->arch.vmca_msrs->nr_injection > 0 && - guest_has_trap_callback(d, 0, TRAP_machine_check) && - !test_and_set_bool(curr->mce_pending) ) - { - int cpu = smp_processor_id(); - - cpumask_copy(curr->cpu_affinity_tmp, curr->cpu_affinity); - printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n", - cpu, curr->processor); - vcpu_set_affinity(curr, cpumask_of(cpu)); - } - } - } - -end: /* Restore previous asynchronous exception mask. 
*/ curr->async_exception_mask = curr->async_exception_state(trap).old_mask; } diff -r fbd9e864c047 xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Mon Sep 17 18:02:59 2012 +0800 +++ b/xen/include/asm-x86/domain.h Tue Sep 18 22:39:10 2012 +0800 @@ -296,9 +296,6 @@ struct PITState vpit; - /* For Guest vMCA handling */ - struct domain_mca_msrs *vmca_msrs; - /* TSC management (emulation, pv, scaling, stats) */ int tsc_mode; /* see include/asm-x86/time.h */ bool_t vtsc; /* tsc is emulated (may change after migrate) */ @@ -438,8 +435,8 @@ * and thus should be saved/restored. */ bool_t nonlazy_xstate_used; - uint64_t mcg_cap; - + struct vmce vmce; + struct paging_vcpu paging; uint32_t gdbsx_vcpu_event; diff -r fbd9e864c047 xen/include/asm-x86/mce.h --- a/xen/include/asm-x86/mce.h Mon Sep 17 18:02:59 2012 +0800 +++ b/xen/include/asm-x86/mce.h Tue Sep 18 22:39:10 2012 +0800 @@ -3,28 +3,50 @@ #ifndef _XEN_X86_MCE_H #define _XEN_X86_MCE_H -/* This entry is for recording bank nodes for the impacted domain, - * put into impact_header list. */ -struct bank_entry { - struct list_head list; - uint16_t bank; +/* + * Emulalte 2 banks for guest + * Bank0: reserved for ''bank0 quirk'' occur at some very old processors: + * 1). Intel cpu whose family-model value < 06-1A; + * 2). 
AMD K7 + * Bank1: used to transfer error info to guest + */ +#define GUEST_MC_BANK_NUM 2 + +/* + * MCG_SER_P: software error recovery supported + * MCG_TES_P: to avoid MCi_status bit56:53 model specific + * MCG_CMCI_P: expose CMCI capability but never really inject it to guest, + * for sake of performance since guest not polling periodically + */ +#define INTEL_GUEST_MCG_CAP (MCG_SER_P | \ + MCG_TES_P | \ + MCG_CMCI_P | \ + GUEST_MC_BANK_NUM) + +#define AMD_GUEST_MCG_CAP (MCG_SER_P | \ + GUEST_MC_BANK_NUM) + +/* Filter MSCOD model specific error code to guest */ +#define MCi_STATUS_MSCOD_MASK (~(0xffffULL << 16)) + +/* No mci_ctl since it stick all 1''s */ +struct vmce_bank { uint64_t mci_status; uint64_t mci_addr; uint64_t mci_misc; + uint64_t mci_ctl2; }; -struct domain_mca_msrs -{ - /* Guest should not change below values after DOM boot up */ +/* No mcg_ctl since it not expose to guest */ +struct vmce { + uint64_t mcg_cap; uint64_t mcg_status; - uint16_t nr_injection; - struct list_head impact_header; + struct vmce_bank bank[GUEST_MC_BANK_NUM]; + spinlock_t lock; }; /* Guest vMCE MSRs virtualization */ -extern int vmce_init_msr(struct domain *d); -extern void vmce_destroy_msr(struct domain *d); extern void vmce_init_vcpu(struct vcpu *); extern int vmce_restore_vcpu(struct vcpu *, uint64_t caps); extern int vmce_wrmsr(uint32_t msr, uint64_t val); _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Xen/MCE: vMCE emulation This patch provides virtual MCE support to guest. It emulates a simple and clean MCE MSRs interface to guest by faking caps to guest if needed and masking caps if unnecessary: 1. Providing a well-defined MCG_CAP to guest, filter out un-necessary caps and provide only guest needed caps; 2. Disabling MCG_CTL to avoid model specific; 3. Sticking all 1''s to MCi_CTL to guest to avoid model specific; 4. Enabling CMCI cap but never really inject to guest to prevent polling periodically; 5. Masking MSCOD field of MCi_STATUS to avoid model specific; 6. Keeping natural semantics by per-vcpu instead of per-domain variables; 7. Using bank1 and reserving bank0 to work around ''bank0 quirk'' of some very old processors; 8. Cleaning some vMCE# injection logic which shared by Intel and AMD but useless under new vMCE implement; 9. Keeping compatilbe w/ old xen version which has been backported to SLES11 SP2, so that old vMCE would not blocked when migrate to new vMCE; Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com> diff -r f76cd381459e xen/arch/x86/cpu/mcheck/mce.h --- a/xen/arch/x86/cpu/mcheck/mce.h Wed Sep 19 21:32:15 2012 +0800 +++ b/xen/arch/x86/cpu/mcheck/mce.h Wed Sep 19 23:22:57 2012 +0800 @@ -169,15 +169,13 @@ int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d, uint64_t gstatus); int inject_vmce(struct domain *d); -int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, - struct mcinfo_global *global); static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: if (msr >= MSR_IA32_MC0_CTL2 && - msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) + msr < MSR_IA32_MCx_CTL2(v->arch.vmce.mcg_cap & MCG_CAP_COUNT) ) return 1; break; case X86_VENDOR_AMD: @@ -195,7 +193,7 @@ static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr) { if ( (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(v->arch.mcg_cap & MCG_CAP_COUNT)) || + msr < 
MSR_IA32_MCx_CTL(v->arch.vmce.mcg_cap & MCG_CAP_COUNT)) || mce_vendor_bank_msr(v, msr) ) return 1; return 0; diff -r f76cd381459e xen/arch/x86/cpu/mcheck/mce_intel.c --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Wed Sep 19 21:32:15 2012 +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Wed Sep 19 23:22:57 2012 +0800 @@ -982,14 +982,15 @@ /* intel specific MCA MSR */ int intel_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) { + unsigned int bank = msr - MSR_IA32_MC0_CTL2; int ret = 0; - if ( msr >= MSR_IA32_MC0_CTL2 && - msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) + if ( bank < GUEST_MC_BANK_NUM ) { - mce_printk(MCE_QUIET, "We have disabled CMCI capability, " - "Guest should not write this MSR!\n"); - ret = 1; + v->arch.vmce.bank[bank].mci_ctl2 = val; + mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %"PRIx64"\n", + bank, val); + ret = 1; } return ret; @@ -997,13 +998,14 @@ int intel_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) { + unsigned int bank = msr - MSR_IA32_MC0_CTL2; int ret = 0; - if ( msr >= MSR_IA32_MC0_CTL2 && - msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) + if ( bank < GUEST_MC_BANK_NUM ) { - mce_printk(MCE_QUIET, "We have disabled CMCI capability, " - "Guest should not read this MSR!\n"); + *val = v->arch.vmce.bank[bank].mci_ctl2; + mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL2 0x%"PRIx64"\n", + bank, *val); ret = 1; } diff -r f76cd381459e xen/arch/x86/cpu/mcheck/vmce.c --- a/xen/arch/x86/cpu/mcheck/vmce.c Wed Sep 19 21:32:15 2012 +0800 +++ b/xen/arch/x86/cpu/mcheck/vmce.c Wed Sep 19 23:22:57 2012 +0800 @@ -1,5 +1,22 @@ /* - * vmce.c - virtual MCE support + * vmce.c - provide software emulated vMCE support to guest + * + * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com> + * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free 
Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <xen/init.h> @@ -19,67 +36,55 @@ #include "mce.h" #include "x86_mca.h" -/* - * Emulate 2 banks for guest - * Bank0: reserved for ''bank0 quirk'' occur at some very old processors: - * 1). Intel cpu whose family-model value < 06-1A; - * 2). AMD K7 - * Bank1: used to transfer error info to guest - */ -#define GUEST_BANK_NUM 2 -#define GUEST_MCG_CAP (MCG_TES_P | MCG_SER_P | GUEST_BANK_NUM) - -#define dom_vmce(x) ((x)->arch.vmca_msrs) - -int vmce_init_msr(struct domain *d) -{ - dom_vmce(d) = xmalloc(struct domain_mca_msrs); - if ( !dom_vmce(d) ) - return -ENOMEM; - - dom_vmce(d)->mcg_status = 0x0; - dom_vmce(d)->nr_injection = 0; - - INIT_LIST_HEAD(&dom_vmce(d)->impact_header); - spin_lock_init(&dom_vmce(d)->lock); - - return 0; -} - -void vmce_destroy_msr(struct domain *d) -{ - if ( !dom_vmce(d) ) - return; - xfree(dom_vmce(d)); - dom_vmce(d) = NULL; -} - void vmce_init_vcpu(struct vcpu *v) { - v->arch.mcg_cap = GUEST_MCG_CAP; + int i; + + /* global MCA MSRs init */ + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) + v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP; + else + v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP; + + v->arch.vmce.mcg_status = 0; + + /* per-bank MCA MSRs init */ + for ( i = 0; i < GUEST_MC_BANK_NUM; i++ ) + memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank)); + + spin_lock_init(&v->arch.vmce.lock); } int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) { - if ( caps & 
~GUEST_MCG_CAP & ~MCG_CAP_COUNT & ~MCG_CTL_P ) + uint64_t guest_mcg_cap; + + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) + guest_mcg_cap = INTEL_GUEST_MCG_CAP; + else + guest_mcg_cap = AMD_GUEST_MCG_CAP; + + if ( caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) { dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", is_hvm_vcpu(v) ? "HVM" : "PV", caps, v->domain->domain_id, - v->vcpu_id, GUEST_MCG_CAP & ~MCG_CAP_COUNT); + v->vcpu_id, guest_mcg_cap & ~MCG_CAP_COUNT); return -EPERM; } - v->arch.mcg_cap = caps; + v->arch.vmce.mcg_cap = caps; return 0; } -static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) +/* + * For historic version reason, bank number may greater than GUEST_MC_BANK_NUM, + * when migratie from old vMCE version to new vMCE. + */ +static int bank_mce_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val) { int ret = 1; unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; - struct domain_mca_msrs *vmce = dom_vmce(v->domain); - struct bank_entry *entry; *val = 0; @@ -92,46 +97,33 @@ bank, *val); break; case MSR_IA32_MC0_STATUS: - /* Only error bank is read. Non-error banks simply return. 
*/ - if ( !list_empty(&vmce->impact_header) ) + if ( bank < GUEST_MC_BANK_NUM ) { - entry = list_entry(vmce->impact_header.next, - struct bank_entry, list); - if ( entry->bank == bank ) - { - *val = entry->mci_status; + *val = v->arch.vmce.bank[bank].mci_status; + if ( *val ) mce_printk(MCE_VERBOSE, - "MCE: rd MC%u_STATUS in vMCE# context " - "value 0x%"PRIx64"\n", bank, *val); - } + "MCE: rdmsr MC%u_STATUS in vMCE# context " + "0x%"PRIx64"\n", bank, *val); } break; case MSR_IA32_MC0_ADDR: - if ( !list_empty(&vmce->impact_header) ) + if ( bank < GUEST_MC_BANK_NUM ) { - entry = list_entry(vmce->impact_header.next, - struct bank_entry, list); - if ( entry->bank == bank ) - { - *val = entry->mci_addr; + *val = v->arch.vmce.bank[bank].mci_addr; + if ( *val ) mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_ADDR in vMCE# context " "0x%"PRIx64"\n", bank, *val); - } } break; case MSR_IA32_MC0_MISC: - if ( !list_empty(&vmce->impact_header) ) + if ( bank < GUEST_MC_BANK_NUM ) { - entry = list_entry(vmce->impact_header.next, - struct bank_entry, list); - if ( entry->bank == bank ) - { - *val = entry->mci_misc; + *val = v->arch.vmce.bank[bank].mci_misc; + if ( *val ) mce_printk(MCE_VERBOSE, - "MCE: rd MC%u_MISC in vMCE# context " + "MCE: rdmsr MC%u_MISC in vMCE# context " "0x%"PRIx64"\n", bank, *val); - } } break; default: @@ -157,56 +149,50 @@ */ int vmce_rdmsr(uint32_t msr, uint64_t *val) { - const struct vcpu *cur = current; - struct domain_mca_msrs *vmce = dom_vmce(cur->domain); + struct vcpu *cur = current; int ret = 1; *val = 0; - spin_lock(&vmce->lock); + spin_lock(&cur->arch.vmce.lock); switch ( msr ) { case MSR_IA32_MCG_STATUS: - *val = vmce->mcg_status; + *val = cur->arch.vmce.mcg_status; if (*val) mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val); break; case MSR_IA32_MCG_CAP: - *val = cur->arch.mcg_cap; - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n", - *val); + *val = cur->arch.vmce.mcg_cap; + mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 
0x%"PRIx64"\n", *val); break; case MSR_IA32_MCG_CTL: - /* Stick all 1''s when CTL support, and 0''s when no CTL support */ - if ( cur->arch.mcg_cap & MCG_CTL_P ) - *val = ~0ULL; - mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", *val); + if ( cur->arch.vmce.mcg_cap & MCG_CTL_P ) + { + *val = ~0UL; + mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", *val); + } break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; break; } - spin_unlock(&vmce->lock); + spin_unlock(&cur->arch.vmce.lock); + return ret; } +/* + * For historic version reason, bank number may greater than GUEST_MC_BANK_NUM, + * when migratie from old vMCE version to new vMCE. + */ static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val) { int ret = 1; unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; - struct domain_mca_msrs *vmce = dom_vmce(v->domain); - struct bank_entry *entry = NULL; - - /* Give the first entry of the list, it corresponds to current - * vMCE# injection. When vMCE# is finished processing by the - * the guest, this node will be deleted. - * Only error bank is written. Non-error banks simply return. 
- */ - if ( !list_empty(&vmce->impact_header) ) - entry = list_entry(vmce->impact_header.next, struct bank_entry, list); switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { @@ -217,50 +203,46 @@ */ break; case MSR_IA32_MC0_STATUS: - if ( entry && (entry->bank == bank) ) + if ( val ) { - entry->mci_status = val; - mce_printk(MCE_VERBOSE, - "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n", + mce_printk(MCE_QUIET, + "MCE: wr MC%u_STATUS w/ non-zero cause #GP\n", bank); + ret = -1; + } + if ( bank < GUEST_MC_BANK_NUM ) + { + v->arch.vmce.bank[bank].mci_status = val; + mce_printk(MCE_VERBOSE, "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val); } - else - mce_printk(MCE_VERBOSE, - "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val); break; case MSR_IA32_MC0_ADDR: - if ( !~val ) + if ( val ) { mce_printk(MCE_QUIET, - "MCE: wr MC%u_ADDR with all 1s will cause #GP\n", bank); + "MCE: wr MC%u_ADDR w/ non-zero cause #GP\n", bank); ret = -1; } - else if ( entry && (entry->bank == bank) ) + else if ( bank < GUEST_MC_BANK_NUM ) { - entry->mci_addr = val; - mce_printk(MCE_VERBOSE, - "MCE: wr MC%u_ADDR %"PRIx64" in vMCE#\n", bank, val); - } - else + v->arch.vmce.bank[bank].mci_addr = val; mce_printk(MCE_VERBOSE, "MCE: wr MC%u_ADDR %"PRIx64"\n", bank, val); + } break; case MSR_IA32_MC0_MISC: - if ( !~val ) + if ( val ) { mce_printk(MCE_QUIET, - "MCE: wr MC%u_MISC with all 1s will cause #GP\n", bank); + "MCE: wr MC%u_MISC w/ non-zero cause #GP\n", bank); ret = -1; } - else if ( entry && (entry->bank == bank) ) + else if ( bank < GUEST_MC_BANK_NUM ) { - entry->mci_misc = val; - mce_printk(MCE_VERBOSE, - "MCE: wr MC%u_MISC %"PRIx64" in vMCE#\n", bank, val); - } - else + v->arch.vmce.bank[bank].mci_misc = val; mce_printk(MCE_VERBOSE, "MCE: wr MC%u_MISC %"PRIx64"\n", bank, val); + } break; default: switch ( boot_cpu_data.x86_vendor ) @@ -286,52 +268,33 @@ int vmce_wrmsr(u32 msr, u64 val) { struct vcpu *cur = current; - struct bank_entry *entry = NULL; - struct domain_mca_msrs *vmce = dom_vmce(cur->domain); int ret 
= 1; - spin_lock(&vmce->lock); + spin_lock(&cur->arch.vmce.lock); switch ( msr ) { case MSR_IA32_MCG_CTL: + /* If MCG_CTL exist then stick to all 1''s. If not exist then ignore */ break; case MSR_IA32_MCG_STATUS: - vmce->mcg_status = val; + cur->arch.vmce.mcg_status = val; mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val); - /* For HVM guest, this is the point for deleting vMCE injection node */ - if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) ) - { - vmce->nr_injection--; /* Should be 0 */ - if ( !list_empty(&vmce->impact_header) ) - { - entry = list_entry(vmce->impact_header.next, - struct bank_entry, list); - if ( entry->mci_status & MCi_STATUS_VAL ) - mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have " - "been cleared before write MCG_STATUS MSR\n"); - - mce_printk(MCE_QUIET, "MCE: Delete HVM last injection " - "Node, nr_injection %u\n", - vmce->nr_injection); - list_del(&entry->list); - xfree(entry); - } - else - mce_printk(MCE_QUIET, "MCE: Not found HVM guest" - " last injection Node, something Wrong!\n"); - } break; case MSR_IA32_MCG_CAP: - mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n"); - ret = -1; + /* + * According to Intel SDM, IA32_MCG_CAP is a read-only register, + * the effect of writing to the IA32_MCG_CAP is undefined. Here we + * treat writing as ''write not change''. Guest would not surprise. + */ + mce_printk(MCE_QUIET, "MCE: MCG_CAP is read only and write not change\n"); break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0; break; } - spin_unlock(&vmce->lock); + spin_unlock(&cur->arch.vmce.lock); return ret; } @@ -342,7 +305,7 @@ for_each_vcpu( d, v ) { struct hvm_vmce_vcpu ctxt = { - .caps = v->arch.mcg_cap + .caps = v->arch.vmce.mcg_cap }; err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt); @@ -422,93 +385,38 @@ return 0; } -/* This node list records errors impacting a domain. when one - * MCE# happens, one error bank impacts a domain. 
This error node - * will be inserted to the tail of the per_dom data for vMCE# MSR - * virtualization. When one vMCE# injection is finished processing - * processed by guest, the corresponding node will be deleted. - * This node list is for GUEST vMCE# MSRS virtualization. - */ -static struct bank_entry* alloc_bank_entry(void) +int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d, + uint64_t gstatus) { - struct bank_entry *entry; + struct vcpu *v = d->vcpu[0]; - entry = xzalloc(struct bank_entry); - if ( entry == NULL ) - { - printk(KERN_ERR "MCE: malloc bank_entry failed\n"); - return NULL; - } - - INIT_LIST_HEAD(&entry->list); - return entry; -} - -/* Fill error bank info for #vMCE injection and GUEST vMCE# - * MSR virtualization data - * 1) Log down how many nr_injections of the impacted. - * 2) Copy MCE# error bank to impacted DOM node list, - * for vMCE# MSRs virtualization - */ -int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d, - uint64_t gstatus) { - struct bank_entry *entry; - - /* This error bank impacts one domain, we need to fill domain related - * data for vMCE MSRs virtualization and vMCE# injection */ if ( mc_bank->mc_domid != (uint16_t)~0 ) { - /* For HVM guest, Only when first vMCE is consumed by HVM guest - * successfully, will we generete another node and inject another vMCE. - */ - if ( d->is_hvm && (dom_vmce(d)->nr_injection > 0) ) + if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP ) { - mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous" + mce_printk(MCE_QUIET, "MCE: guest has not handled previous" " vMCE yet!\n"); return -1; } - entry = alloc_bank_entry(); - if ( entry == NULL ) - return -1; + spin_lock(&v->arch.vmce.lock); - entry->mci_status = mc_bank->mc_status; - entry->mci_addr = mc_bank->mc_addr; - entry->mci_misc = mc_bank->mc_misc; - entry->bank = mc_bank->mc_bank; + v->arch.vmce.mcg_status = gstatus; + /* + * 1. Skip bank 0 to avoid ''bank 0 quirk'' of old processors + * 2. 
Filter MCi_STATUS MSCOD model specific error code to guest + */ + v->arch.vmce.bank[1].mci_status = mc_bank->mc_status & + MCi_STATUS_MSCOD_MASK; + v->arch.vmce.bank[1].mci_addr = mc_bank->mc_addr; + v->arch.vmce.bank[1].mci_misc = mc_bank->mc_misc; - spin_lock(&dom_vmce(d)->lock); - /* New error Node, insert to the tail of the per_dom data */ - list_add_tail(&entry->list, &dom_vmce(d)->impact_header); - /* Fill MSR global status */ - dom_vmce(d)->mcg_status = gstatus; - /* New node impact the domain, need another vMCE# injection*/ - dom_vmce(d)->nr_injection++; - spin_unlock(&dom_vmce(d)->lock); - - mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d " - "status %"PRIx64" addr %"PRIx64" domid %d]\n ", - mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr, - mc_bank->mc_domid); + spin_unlock(&v->arch.vmce.lock); } return 0; } -#if 0 /* currently unused */ -int vmce_domain_inject( - struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global) -{ - int ret; - - ret = fill_vmsr_data(bank, d, global->mc_gstatus); - if ( ret < 0 ) - return ret; - - return inject_vmce(d); -} -#endif - static int is_hvm_vmce_ready(struct mcinfo_bank *bank, struct domain *d) { struct vcpu *v; diff -r f76cd381459e xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Wed Sep 19 21:32:15 2012 +0800 +++ b/xen/arch/x86/domain.c Wed Sep 19 23:22:57 2012 +0800 @@ -571,9 +571,6 @@ if ( (rc = iommu_domain_init(d)) != 0 ) goto fail; - - /* For Guest vMCE MSRs virtualization */ - vmce_init_msr(d); } if ( is_hvm_domain(d) ) @@ -600,7 +597,6 @@ fail: d->is_dying = DOMDYING_dead; - vmce_destroy_msr(d); cleanup_domain_irq_mapping(d); free_xenheap_page(d->shared_info); if ( paging_initialised ) @@ -623,7 +619,6 @@ else xfree(d->arch.pv_domain.e820); - vmce_destroy_msr(d); free_domain_pirqs(d); if ( !is_idle_domain(d) ) iommu_domain_destroy(d); diff -r f76cd381459e xen/arch/x86/domctl.c --- a/xen/arch/x86/domctl.c Wed Sep 19 21:32:15 2012 +0800 +++ b/xen/arch/x86/domctl.c Wed Sep 19 23:22:57 
2012 +0800 @@ -1024,7 +1024,7 @@ evc->syscall32_callback_eip = 0; evc->syscall32_disables_events = 0; } - evc->mcg_cap = v->arch.mcg_cap; + evc->mcg_cap = v->arch.vmce.mcg_cap; } else { diff -r f76cd381459e xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Wed Sep 19 21:32:15 2012 +0800 +++ b/xen/arch/x86/traps.c Wed Sep 19 23:22:57 2012 +0800 @@ -3133,50 +3133,6 @@ break; ASSERT(trap <= VCPU_TRAP_LAST); - /* inject vMCE to PV_Guest including DOM0. */ - if ( trap == VCPU_TRAP_MCE ) - { - gdprintk(XENLOG_DEBUG, "MCE: Return from vMCE# trap!\n"); - if ( curr->vcpu_id == 0 ) - { - struct domain *d = curr->domain; - - if ( !d->arch.vmca_msrs->nr_injection ) - { - printk(XENLOG_WARNING "MCE: ret from vMCE#, " - "no injection node\n"); - goto end; - } - - d->arch.vmca_msrs->nr_injection--; - if ( !list_empty(&d->arch.vmca_msrs->impact_header) ) - { - struct bank_entry *entry; - - entry = list_entry(d->arch.vmca_msrs->impact_header.next, - struct bank_entry, list); - gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n"); - list_del(&entry->list); - } - else - printk(XENLOG_ERR "MCE: didn''t found last injection node\n"); - - /* further injection */ - if ( d->arch.vmca_msrs->nr_injection > 0 && - guest_has_trap_callback(d, 0, TRAP_machine_check) && - !test_and_set_bool(curr->mce_pending) ) - { - int cpu = smp_processor_id(); - - cpumask_copy(curr->cpu_affinity_tmp, curr->cpu_affinity); - printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n", - cpu, curr->processor); - vcpu_set_affinity(curr, cpumask_of(cpu)); - } - } - } - -end: /* Restore previous asynchronous exception mask. 
*/ curr->async_exception_mask = curr->async_exception_state(trap).old_mask; } diff -r f76cd381459e xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Wed Sep 19 21:32:15 2012 +0800 +++ b/xen/include/asm-x86/domain.h Wed Sep 19 23:22:57 2012 +0800 @@ -296,9 +296,6 @@ struct PITState vpit; - /* For Guest vMCA handling */ - struct domain_mca_msrs *vmca_msrs; - /* TSC management (emulation, pv, scaling, stats) */ int tsc_mode; /* see include/asm-x86/time.h */ bool_t vtsc; /* tsc is emulated (may change after migrate) */ @@ -438,8 +435,8 @@ * and thus should be saved/restored. */ bool_t nonlazy_xstate_used; - uint64_t mcg_cap; - + struct vmce vmce; + struct paging_vcpu paging; uint32_t gdbsx_vcpu_event; diff -r f76cd381459e xen/include/asm-x86/mce.h --- a/xen/include/asm-x86/mce.h Wed Sep 19 21:32:15 2012 +0800 +++ b/xen/include/asm-x86/mce.h Wed Sep 19 23:22:57 2012 +0800 @@ -3,28 +3,50 @@ #ifndef _XEN_X86_MCE_H #define _XEN_X86_MCE_H -/* This entry is for recording bank nodes for the impacted domain, - * put into impact_header list. */ -struct bank_entry { - struct list_head list; - uint16_t bank; +/* + * Emulate 2 banks for guest + * Bank0: reserved for the 'bank0 quirk' that occurs on some very old processors: + * 1). Intel cpu whose family-model value < 06-1A; + * 2). 
AMD K7 + * Bank1: used to transfer error info to guest + */ +#define GUEST_MC_BANK_NUM 2 + +/* + * MCG_SER_P: software error recovery supported + * MCG_TES_P: to avoid MCi_status bit56:53 model specific + * MCG_CMCI_P: expose CMCI capability but never really inject it to guest, + * for the sake of performance, since the guest does not poll periodically + */ +#define INTEL_GUEST_MCG_CAP (MCG_SER_P | \ + MCG_TES_P | \ + MCG_CMCI_P | \ + GUEST_MC_BANK_NUM) + +#define AMD_GUEST_MCG_CAP (MCG_SER_P | \ + GUEST_MC_BANK_NUM) + +/* Filter MSCOD model specific error code to guest */ +#define MCi_STATUS_MSCOD_MASK (~(0xffffULL << 16)) + +/* No mci_ctl since it sticks to all 1's */ +struct vmce_bank { uint64_t mci_status; uint64_t mci_addr; uint64_t mci_misc; + uint64_t mci_ctl2; }; -struct domain_mca_msrs -{ - /* Guest should not change below values after DOM boot up */ +/* No mcg_ctl since it is not exposed to guest */ +struct vmce { + uint64_t mcg_cap; uint64_t mcg_status; - uint16_t nr_injection; - struct list_head impact_header; + struct vmce_bank bank[GUEST_MC_BANK_NUM]; + spinlock_t lock; }; /* Guest vMCE MSRs virtualization */ -extern int vmce_init_msr(struct domain *d); -extern void vmce_destroy_msr(struct domain *d); extern void vmce_init_vcpu(struct vcpu *); extern int vmce_restore_vcpu(struct vcpu *, uint64_t caps); extern int vmce_wrmsr(uint32_t msr, uint64_t val); _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel