The attached code implements a VMX world switch to vmxassist (a small assist module residing in a VMX enabled partition where it is responsible for emulating real mode) whenever CR0.PE is disabled. The patch temporarily disables the PGE feature flag in cpuid as it is currently broken (try running an unmodified 2.6 kernel that sets PGE in mm/init.c/paging_init()). The patch adds consistency checks before setting the ARCH_VMX_IO_WAIT state to detect race conditions on SMP systems. Signed-Off-By: Leendert van Doorn <leendert@watson.ibm.com> BTW: vmxassist is undergoing cleanups and will be released soon, I'm exploring the other two problems but the current workarounds allow me to make progress on vmxassist. Leendert diff --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet --exclude=PENDING -ru -Bb -N xeno-unstable.bk/xen/arch/x86/vmx.c xeno-unstable.real-mode/xen/arch/x86/vmx.c --- xeno-unstable.bk/xen/arch/x86/vmx.c 2005-04-17 11:47:01.000000000 -0400 +++ xeno-unstable.real-mode/xen/arch/x86/vmx.c 2005-04-17 05:24:37.000000000 -0400 @@ -194,6 +194,7 @@ cpuid(input, &eax, &ebx, &ecx, &edx); if (input == 1) { + clear_bit(X86_FEATURE_PGE, &edx); /* temporarily disabled */ clear_bit(X86_FEATURE_PSE, &edx); clear_bit(X86_FEATURE_PAE, &edx); clear_bit(X86_FEATURE_PSE36, &edx); @@ -381,10 +382,261 @@ do_block(); } -static int -vm86assist(struct exec_domain *d) +enum { COPY_IN = 0, COPY_OUT }; + +static inline int +vmx_copy(void *buf, unsigned long laddr, int size, int dir) { - /* stay tuned ... 
*/ + unsigned char *addr; + unsigned long mfn; + + if ((size + (laddr & (PAGE_SIZE - 1))) >= PAGE_SIZE) { + printf("vmx_copy exceeds page boundary\n"); + return 0; + } + + mfn = phys_to_machine_mapping(gva_to_gpte(laddr) >> PAGE_SHIFT); + addr = map_domain_mem((mfn << PAGE_SHIFT) | (laddr & ~PAGE_MASK)); + + if (dir == COPY_IN) + memcpy(buf, addr, size); + else + memcpy(addr, buf, size); + + unmap_domain_mem(addr); + return 1; +} + +int +vmx_world_save(struct exec_domain *d, struct vmx_assist_context *c) +{ + unsigned long inst_len; + int error = 0; + + error |= __vmread(INSTRUCTION_LEN, &inst_len); + error |= __vmread(GUEST_EIP, &c->eip); + c->eip += inst_len; /* skip transition instruction */ + error |= __vmread(GUEST_ESP, &c->esp); + error |= __vmread(GUEST_EFLAGS, &c->eflags); + + error |= __vmread(CR0_READ_SHADOW, &c->cr0); + c->cr3 = d->arch.arch_vmx.cpu_cr3; + error |= __vmread(CR4_READ_SHADOW, &c->cr4); + + error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit); + error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base); + + error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit); + error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base); + + error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel); + error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit); + error |= __vmread(GUEST_CS_BASE, &c->cs_base); + error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes); + + error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel); + error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit); + error |= __vmread(GUEST_DS_BASE, &c->ds_base); + error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes); + + error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel); + error |= __vmread(GUEST_ES_LIMIT, &c->es_limit); + error |= __vmread(GUEST_ES_BASE, &c->es_base); + error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes); + + error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel); + error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit); + error |= __vmread(GUEST_SS_BASE, &c->ss_base); + error |= __vmread(GUEST_SS_AR_BYTES, 
&c->ss_arbytes.bytes); + + error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel); + error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit); + error |= __vmread(GUEST_FS_BASE, &c->fs_base); + error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes); + + error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel); + error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit); + error |= __vmread(GUEST_GS_BASE, &c->gs_base); + error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes); + + error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel); + error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit); + error |= __vmread(GUEST_TR_BASE, &c->tr_base); + error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes); + + error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel); + error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit); + error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base); + error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes); + + return !error; +} + +int +vmx_world_restore(struct exec_domain *d, struct vmx_assist_context *c) +{ + unsigned long mfn, old_cr4; + int error = 0; + + error |= __vmwrite(GUEST_EIP, c->eip); + error |= __vmwrite(GUEST_ESP, c->esp); + error |= __vmwrite(GUEST_EFLAGS, c->eflags); + + error |= __vmwrite(CR0_READ_SHADOW, c->cr0); + + if (c->cr3 == d->arch.arch_vmx.cpu_cr3) { + /* + * This is simple TLB flush, implying the guest has + * removed some translation or changed page attributes. + * We simply invalidate the shadow. + */ + mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT); + if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table)) { + VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3); + domain_crash_synchronous(); + return 0; + } + shadow_sync_all(d->domain); + } else { + /* + * If different, make a shadow. Check if the PDBR is valid + * first. 
+ */ + VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %lx", c->cr3); + if ((c->cr3 >> PAGE_SHIFT) > d->domain->max_pages) { + VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3); + domain_crash_synchronous(); + return 0; + } + mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT); + d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT); + update_pagetables(d); + /* + * arch.shadow_table should now hold the next CR3 for shadow + */ + d->arch.arch_vmx.cpu_cr3 = c->cr3; + VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", c->cr3); + __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table)); + } + + error |= __vmread(CR4_READ_SHADOW, &old_cr4); + error |= __vmwrite(GUEST_CR4, (c->cr4 | X86_CR4_VMXE)); + error |= __vmwrite(CR4_READ_SHADOW, c->cr4); + + error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit); + error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base); + + error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit); + error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base); + + error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel); + error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit); + error |= __vmwrite(GUEST_CS_BASE, c->cs_base); + error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes); + + error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel); + error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit); + error |= __vmwrite(GUEST_DS_BASE, c->ds_base); + error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes); + + error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel); + error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit); + error |= __vmwrite(GUEST_ES_BASE, c->es_base); + error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes); + + error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel); + error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit); + error |= __vmwrite(GUEST_SS_BASE, c->ss_base); + error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes); + + error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel); + error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit); + error |= __vmwrite(GUEST_FS_BASE, c->fs_base); + 
error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes); + + error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel); + error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit); + error |= __vmwrite(GUEST_GS_BASE, c->gs_base); + error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes); + + error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel); + error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit); + error |= __vmwrite(GUEST_TR_BASE, c->tr_base); + error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes); + + error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel); + error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit); + error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base); + error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes); + + return !error; +} + +enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE }; + +int +vmx_assist(struct exec_domain *d, int mode) +{ + struct vmx_assist_context c; + unsigned long magic, cp; + + /* make sure vmxassist exists (this is not an error) */ + if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), COPY_IN)) + return 0; + if (magic != VMXASSIST_MAGIC) + return 0; + + switch (mode) { + /* + * Transfer control to vmxassist. + * Store the current context in VMXASSIST_OLD_CONTEXT and load + * the new VMXASSIST_NEW_CONTEXT context. This context was created + * by vmxassist and will transfer control to it. + */ + case VMX_ASSIST_INVOKE: + /* save the old context */ + if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN)) + goto error; + if (cp != 0) { + if (!vmx_world_save(d, &c)) + goto error; + if (!vmx_copy(&c, cp, sizeof(c), COPY_OUT)) + goto error; + } + + /* restore the new context, this should activate vmxassist */ + if (!vmx_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), COPY_IN)) + goto error; + if (cp != 0) { + if (!vmx_copy(&c, cp, sizeof(c), COPY_IN)) + goto error; + if (!vmx_world_restore(d, &c)) + goto error; + return 1; + } + break; + + /* + * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE + * above. 
+ */ + case VMX_ASSIST_RESTORE: + /* save the old context */ + if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN)) + goto error; + if (cp != 0) { + if (!vmx_copy(&c, cp, sizeof(c), COPY_IN)) + goto error; + if (!vmx_world_restore(d, &c)) + goto error; + return 1; + } + break; + } + +error: + printf("Failed to transfer to vmxassist\n"); + domain_crash_synchronous(); return 0; } @@ -398,6 +650,7 @@ { unsigned long value; unsigned long old_cr; + unsigned long eip; struct exec_domain *d = current; switch (gp) { @@ -468,15 +721,28 @@ put_page_and_type(&frame_table[old_base_mfn]); } else { if ((value & X86_CR0_PE) == 0) { - unsigned long eip; - __vmread(GUEST_EIP, &eip); VMX_DBG_LOG(DBG_LEVEL_1, "Disabling CR0.PE at %%eip 0x%lx", eip); - if (vm86assist(d)) { + if (vmx_assist(d, VMX_ASSIST_INVOKE)) { + set_bit(VMX_CPU_STATE_ASSIST_ENABLED, + &d->arch.arch_vmx.cpu_state); __vmread(GUEST_EIP, &eip); VMX_DBG_LOG(DBG_LEVEL_1, - "Transfering control to vm86assist %%eip 0x%lx", eip); + "Transfering control to vmxassist %%eip 0x%lx", eip); + return 0; /* do not update eip! */ + } + } else if (test_bit(VMX_CPU_STATE_ASSIST_ENABLED, + &d->arch.arch_vmx.cpu_state)) { + __vmread(GUEST_EIP, &eip); + VMX_DBG_LOG(DBG_LEVEL_1, + "Enabling CR0.PE at %%eip 0x%lx", eip); + if (vmx_assist(d, VMX_ASSIST_RESTORE)) { + clear_bit(VMX_CPU_STATE_ASSIST_ENABLED, + &d->arch.arch_vmx.cpu_state); + __vmread(GUEST_EIP, &eip); + VMX_DBG_LOG(DBG_LEVEL_1, + "Restoring to %%eip 0x%lx", eip); return 0; /* do not update eip! 
*/ } } @@ -548,6 +814,7 @@ */ if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) { vmx_shadow_clear_state(d->domain); + shadow_sync_all(d->domain); } break; default: diff --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet --exclude=PENDING -ru -Bb -N xeno-unstable.bk/xen/arch/x86/vmx_platform.c xeno-unstable.real-mode/xen/arch/x86/vmx_platform.c --- xeno-unstable.bk/xen/arch/x86/vmx_platform.c 2005-04-17 11:47:01.000000000 -0400 +++ xeno-unstable.real-mode/xen/arch/x86/vmx_platform.c 2005-04-17 11:18:32.000000000 -0400 @@ -484,6 +484,11 @@ vm86 = inst_decoder_regs->eflags & X86_EFLAGS_VM; + if (test_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags)) { + printf("VMX I/O has not yet completed\n"); + domain_crash_synchronous(); + } + set_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags); p->dir = dir; p->pdata_valid = pvalid; diff --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet --exclude=PENDING -ru -Bb -N xeno-unstable.bk/xen/include/asm-x86/vmx_vmcs.h xeno-unstable.real-mode/xen/include/asm-x86/vmx_vmcs.h --- xeno-unstable.bk/xen/include/asm-x86/vmx_vmcs.h 2005-04-17 11:47:01.000000000 -0400 +++ xeno-unstable.real-mode/xen/include/asm-x86/vmx_vmcs.h 2005-04-15 13:15:45.000000000 -0400 @@ -22,24 +22,15 @@ #include <asm/config.h> #include <asm/vmx_cpu.h> #include <asm/vmx_platform.h> +#include <public/vmx_assist.h> extern int start_vmx(void); extern void stop_vmx(void); void vmx_enter_scheduler(void); -union vmcs_arbytes { - struct arbyte_fields { - unsigned int - seg_type: 4, s: 1, dpl: 2, p: 1, - reserved0: 4, avl: 1, reserved1: 1, - default_ops_size: 1, g: 1, null_bit: 1, - reserved2: 15; - } __attribute__((packed)) fields; - unsigned int bytes; -}; - #define VMX_CPU_STATE_PG_ENABLED 0 +#define VMX_CPU_STATE_ASSIST_ENABLED 1 #define VMCS_SIZE 0x1000 struct vmcs_struct { diff --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet --exclude=PENDING -ru -Bb -N xeno-unstable.bk/xen/include/public/vmx_assist.h 
xeno-unstable.real-mode/xen/include/public/vmx_assist.h --- xeno-unstable.bk/xen/include/public/vmx_assist.h 1969-12-31 19:00:00.000000000 -0500 +++ xeno-unstable.real-mode/xen/include/public/vmx_assist.h 2005-04-17 11:47:58.000000000 -0400 @@ -0,0 +1,101 @@ +/* + * vmx_assist.h: Context definitions for the VMXASSIST world switch. + * + * Leendert van Doorn, leendert@watson.ibm.com + * Copyright (c) 2005, International Business Machines Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ +#ifndef _VMX_ASSIST_H_ +#define _VMX_ASSIST_H_ + +#define VMXASSIST_BASE 0xE0000 +#define VMXASSIST_MAGIC 0x17101966 +#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8) + +#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12) +#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4) + +#ifndef __ASSEMBLY__ + +union vmcs_arbytes { + struct arbyte_fields { + unsigned int seg_type : 4, + s : 1, + dpl : 2, + p : 1, + reserved0 : 4, + avl : 1, + reserved1 : 1, + default_ops_size: 1, + g : 1, + null_bit : 1, + reserved2 : 15; + } __attribute__((packed)) fields; + unsigned int bytes; +}; + +/* + * World switch state + */ +typedef struct vmx_assist_context { + unsigned long eip; /* execution pointer */ + unsigned long esp; /* stack point */ + unsigned long eflags; /* flags register */ + unsigned long cr0; + unsigned long cr3; /* page table directory */ + unsigned long cr4; + unsigned long idtr_limit; /* idt */ + unsigned long idtr_base; + unsigned long gdtr_limit; /* gdt */ + unsigned long gdtr_base; + unsigned long cs_sel; /* cs selector */ + unsigned long cs_limit; + unsigned long cs_base; + union vmcs_arbytes cs_arbytes; + unsigned long ds_sel; /* ds selector */ + unsigned long ds_limit; + unsigned long ds_base; + union vmcs_arbytes ds_arbytes; + unsigned long es_sel; /* es selector */ + unsigned long es_limit; + unsigned long es_base; + union vmcs_arbytes es_arbytes; + unsigned long ss_sel; /* ss selector */ + unsigned long ss_limit; + unsigned long ss_base; + union vmcs_arbytes ss_arbytes; + unsigned long fs_sel; /* fs selector */ + unsigned long fs_limit; + unsigned long fs_base; + union vmcs_arbytes fs_arbytes; + unsigned long gs_sel; /* gs selector */ + unsigned long gs_limit; + unsigned long gs_base; + union vmcs_arbytes gs_arbytes; + unsigned long tr_sel; /* task selector */ + unsigned long tr_limit; + unsigned long tr_base; + union vmcs_arbytes tr_arbytes; + unsigned long ldtr_sel; /* ldtr selector */ + unsigned long ldtr_limit; + unsigned long 
ldtr_base; + union vmcs_arbytes ldtr_arbytes; +} vmx_assist_context_t; + +#endif /* __ASSEMBLY__ */ + +#endif /* _VMX_ASSIST_H_ */ + _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel