The heart of this patch is the VMX exit handler for PVH guests. It is nicely
isolated in a separate module. A call to it is added to
vmx_vmexit_handler().
Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
diff -r eca698a4e733 -r 0a38c610f26b xen/arch/x86/hvm/vmx/Makefile
--- a/xen/arch/x86/hvm/vmx/Makefile Fri Jan 11 16:32:36 2013 -0800
+++ b/xen/arch/x86/hvm/vmx/Makefile Fri Jan 11 16:34:17 2013 -0800
@@ -5,3 +5,4 @@ obj-y += vmcs.o
obj-y += vmx.o
obj-y += vpmu_core2.o
obj-y += vvmx.o
+obj-y += vmx_pvh.o
diff -r eca698a4e733 -r 0a38c610f26b xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Fri Jan 11 16:32:36 2013 -0800
+++ b/xen/arch/x86/hvm/vmx/vmx.c Fri Jan 11 16:34:17 2013 -0800
@@ -1546,7 +1546,9 @@ static struct hvm_function_table __read_
.nhvm_intr_blocked = nvmx_intr_blocked,
.nhvm_domain_relinquish_resources = nvmx_domain_relinquish_resources,
.update_eoi_exit_bitmap = vmx_update_eoi_exit_bitmap,
- .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled
+ .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled,
+ .pvh_set_vcpu_info = vmx_pvh_set_vcpu_info,
+ .pvh_read_descriptor = vmx_pvh_read_descriptor
};
struct hvm_function_table * __init start_vmx(void)
@@ -2280,6 +2282,14 @@ void vmx_vmexit_handler(struct cpu_user_
perfc_incra(vmexits, exit_reason);
+ if ( is_pvh_vcpu(v) ) {
+ if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
+ return vmx_failed_vmentry(exit_reason, regs);
+
+ vmx_pvh_vmexit_handler(regs);
+ return;
+ }
+
/* Handle the interrupt we missed before allowing any more in. */
switch ( (uint16_t)exit_reason )
{
diff -r eca698a4e733 -r 0a38c610f26b xen/arch/x86/hvm/vmx/vmx_pvh.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c Fri Jan 11 16:34:17 2013 -0800
@@ -0,0 +1,849 @@
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/irq.h>
+#include <xen/softirq.h>
+#include <xen/domain_page.h>
+#include <xen/hypercall.h>
+#include <xen/guest_access.h>
+#include <xen/perfc.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/regs.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/types.h>
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/spinlock.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/traps.h>
+#include <asm/mem_sharing.h>
+#include <asm/hvm/emulate.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/support.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <public/sched.h>
+#include <public/hvm/ioreq.h>
+#include <asm/hvm/vpic.h>
+#include <asm/hvm/vlapic.h>
+#include <asm/x86_emulate.h>
+#include <asm/hvm/vpt.h>
+#include <public/hvm/save.h>
+#include <asm/hvm/trace.h>
+#include <asm/xenoprof.h>
+#include <asm/debugger.h>
+
/*
 * Debug plumbing for the PVH exit path.  pvhdbg selects which of the
 * dbgp1/dbgp2 trace levels actually print; volatile so it can be flipped
 * from a debugger at run time.
 */
volatile int pvhdbg = 0;
/* Unconditional error trace.  (No trailing ';' in the expansion.) */
#define dbgp0(...) dprintk(XENLOG_ERR, __VA_ARGS__)
/* Level-gated traces; do/while(0) makes them safe inside if/else bodies. */
#define dbgp1(...) do { if ( pvhdbg == 1 ) printk(__VA_ARGS__); } while ( 0 )
#define dbgp2(...) do { if ( pvhdbg == 2 ) printk(__VA_ARGS__); } while ( 0 )
+
+
+static void read_vmcs_selectors(struct cpu_user_regs *regs)
+{
+ regs->cs = vmr(GUEST_CS_SELECTOR);
+ regs->ss = vmr(GUEST_SS_SELECTOR);
+ regs->ds = vmr(GUEST_DS_SELECTOR);
+ regs->es = vmr(GUEST_ES_SELECTOR);
+ regs->gs = vmr(GUEST_GS_SELECTOR);
+ regs->fs = vmr(GUEST_FS_SELECTOR);
+}
+
+/* returns : 0 success */
+static noinline int vmxit_msr_read(struct cpu_user_regs *regs)
+{
+ int rc=1;
+
+ u64 msr_content = 0;
+ switch (regs->ecx)
+ {
+ case MSR_IA32_MISC_ENABLE:
+ {
+ rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
+ msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
+ break;
+ }
+ default:
+ {
+ /* fixme: see hvm_msr_read_intercept() */
+ rdmsrl(regs->ecx, msr_content);
+ break;
+ }
+ }
+ regs->eax = (uint32_t)msr_content;
+ regs->edx = (uint32_t)(msr_content >> 32);
+ update_guest_eip();
+ rc = 0;
+
+ dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n",
regs->ecx, regs->eax,
+ regs->edx, vmr(GUEST_RIP), vmr(GUEST_RSP));
+ return rc;
+}
+
+/* returns : 0 success */
+static noinline int vmxit_msr_write(struct cpu_user_regs *regs)
+{
+ uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx
<< 32);
+ int rc=1;
+
+ dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n",
regs->ecx,
+ regs->eax,regs->edx);
+
+ if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) {
+ update_guest_eip();
+ rc = 0;
+ }
+ return rc;
+}
+
+/* Returns: rc == 0: handled the MTF vmexit */
+static noinline int vmxit_mtf(struct cpu_user_regs *regs)
+{
+ struct vcpu *vp = current;
+ int rc=1, ss=vp->arch.hvm_vcpu.single_step;
+
+ dbgp2("\n");
+ vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+ __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
+ vp->arch.hvm_vcpu.single_step = 0;
+
+ if ( vp->domain->debugger_attached && ss ) {
+ domain_pause_for_debugger();
+ rc = 0;
+ }
+ return rc;
+}
+
+static noinline int vmxit_int3(struct cpu_user_regs *regs)
+{
+ int ilen = get_instruction_length();
+ struct vcpu *vp = current;
+ struct hvm_trap trap_info = {
+ .vector = TRAP_int3,
+ .type = X86_EVENTTYPE_SW_EXCEPTION,
+ .error_code = HVM_DELIVER_NO_ERROR_CODE,
+ .insn_len = ilen
+ };
+
+ regs->eip += ilen;
+
+ /* gdbsx or another debugger. Never pause dom0 */
+ if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp,
regs) )
+ {
+ dbgp1("[%d]PVH: domain pause for debugger\n",
smp_processor_id());
+ current->arch.gdbsx_vcpu_event = TRAP_int3;
+ domain_pause_for_debugger();
+ return 0;
+ }
+
+ regs->eip -= ilen;
+ hvm_inject_trap(&trap_info);
+
+ return 0;
+}
+
+/* Returns: rc == 0: handled the exception/NMI */
+static noinline int vmxit_exception(struct cpu_user_regs *regs)
+{
+ unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) &
INTR_INFO_VECTOR_MASK;
+ int rc=1;
+ struct vcpu *vp = current;
+
+ dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector,
vmr(GUEST_CS_SELECTOR),
+ regs->eip);
+
+ if (vector == TRAP_debug) {
+ unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+ write_debugreg(6, exit_qualification | 0xffff0ff0);
+ regs->rip = vmr(GUEST_RIP);
+ regs->rsp = vmr(GUEST_RSP);
+ rc = 0;
+
+ /* gdbsx or another debugger */
+ if ( vp->domain->domain_id != 0 && /* never pause dom0
*/
+ guest_kernel_mode(vp, regs) &&
vp->domain->debugger_attached )
+ {
+ domain_pause_for_debugger();
+ } else {
+ hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
+ }
+ }
+ if (vector == TRAP_int3) {
+ rc = vmxit_int3(regs);
+ }
+
+ if (vector == TRAP_invalid_op) {
+ if ( guest_kernel_mode(vp, regs) || emulate_forced_invalid_op(regs)==0
)
+ {
+ hvm_inject_hw_exception(TRAP_invalid_op,
HVM_DELIVER_NO_ERROR_CODE);
+ rc = 0;
+ }
+
+ }
+ if (vector == TRAP_no_device) {
+ hvm_funcs.fpu_dirty_intercept(); /* calls vmx_fpu_dirty_intercept */
+ rc = 0;
+ }
+
+ if (vector == TRAP_gp_fault) {
+ regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
+ /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */
+ rc = 1;
+ }
+
+ if (vector == TRAP_page_fault) {
+ printk("PVH: Unexpected vector page_fault. IP:%lx\n",
regs->eip);
+ rc = 1;
+ }
+
+ if (rc)
+ printk("PVH: Unhandled trap vector:%d. IP:%lx\n", vector,
regs->eip);
+ return rc;
+}
+
+static noinline int vmxit_invlpg(void)
+{
+ ulong vaddr = __vmread(EXIT_QUALIFICATION);
+
+ update_guest_eip();
+ vpid_sync_vcpu_gva(current, vaddr);
+ return 0;
+}
+
+static noinline int pvh_grant_table_op(
+ unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
+{
+ switch (cmd)
+ {
+ case GNTTABOP_map_grant_ref:
+ case GNTTABOP_unmap_grant_ref:
+ case GNTTABOP_setup_table:
+ case GNTTABOP_copy:
+ case GNTTABOP_query_size:
+ case GNTTABOP_set_version:
+ return do_grant_table_op(cmd, uop, count);
+ }
+ return -ENOSYS;
+}
+
+static long pvh_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+{
+ long rc = -ENOSYS;
+
+ switch ( cmd )
+ {
+ case VCPUOP_register_runstate_memory_area:
+ case VCPUOP_get_runstate_info:
+ case VCPUOP_set_periodic_timer:
+ case VCPUOP_stop_periodic_timer:
+ case VCPUOP_set_singleshot_timer:
+ case VCPUOP_stop_singleshot_timer:
+ case VCPUOP_is_up:
+ case VCPUOP_up:
+ case VCPUOP_initialise:
+ rc = do_vcpu_op(cmd, vcpuid, arg);
+
+ /* pvh boot vcpu setting context for bringing up smp vcpu */
+ if (cmd == VCPUOP_initialise)
+ vmx_vmcs_enter(current);
+ }
+ return rc;
+}
+
+static long pvh_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
+{
+ switch ( cmd )
+ {
+ case PHYSDEVOP_map_pirq:
+ case PHYSDEVOP_unmap_pirq:
+ case PHYSDEVOP_eoi:
+ case PHYSDEVOP_irq_status_query:
+ case PHYSDEVOP_get_free_pirq:
+ return do_physdev_op(cmd, arg);
+
+ default:
+ if ( IS_PRIV(current->domain) )
+ return do_physdev_op(cmd, arg);
+ }
+ return -ENOSYS;
+}
+
+static noinline long do_pvh_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void)
arg)
+{
+ long rc = -EINVAL;
+ struct xen_hvm_param harg;
+ struct domain *d;
+
+ if ( copy_from_guest(&harg, arg, 1) )
+ return -EFAULT;
+
+ rc = rcu_lock_target_domain_by_id(harg.domid, &d);
+ if ( rc != 0 )
+ return rc;
+
+ if (is_hvm_domain(d)) {
+ /* pvh dom0 is building an hvm guest */
+ rcu_unlock_domain(d);
+ return do_hvm_op(op, arg);
+ }
+
+ rc = -ENOSYS;
+ if (op == HVMOP_set_param) {
+ if (harg.index == HVM_PARAM_CALLBACK_IRQ) {
+ struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+ uint64_t via = harg.value;
+ uint8_t via_type = (uint8_t)(via >> 56) + 1;
+
+ if (via_type == HVMIRQ_callback_vector) {
+ hvm_irq->callback_via_type = HVMIRQ_callback_vector;
+ hvm_irq->callback_via.vector = (uint8_t)via;
+ rc = 0;
+ }
+ }
+ }
+ rcu_unlock_domain(d);
+ return rc;
+}
+
+typedef unsigned long pvh_hypercall_t(
+ unsigned long, unsigned long, unsigned long, unsigned long, unsigned long,
+ unsigned long);
+
+int hcall_a[NR_hypercalls];
+
+static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = {
+ [__HYPERVISOR_platform_op] = (pvh_hypercall_t *)do_platform_op,
+ [__HYPERVISOR_memory_op] = (pvh_hypercall_t *)do_memory_op,
+ /* [__HYPERVISOR_set_timer_op] = (pvh_hypercall_t *)do_set_timer_op, */
+ [__HYPERVISOR_xen_version] = (pvh_hypercall_t *)do_xen_version,
+ [__HYPERVISOR_console_io] = (pvh_hypercall_t *)do_console_io,
+ [__HYPERVISOR_grant_table_op] = (pvh_hypercall_t *)pvh_grant_table_op,
+ [__HYPERVISOR_vcpu_op] = (pvh_hypercall_t *)pvh_vcpu_op,
+ [__HYPERVISOR_mmuext_op] = (pvh_hypercall_t *)do_mmuext_op,
+ [__HYPERVISOR_xsm_op] = (pvh_hypercall_t *)do_xsm_op,
+ [__HYPERVISOR_sched_op] = (pvh_hypercall_t *)do_sched_op,
+ [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t *)do_event_channel_op,
+ [__HYPERVISOR_physdev_op] = (pvh_hypercall_t *)pvh_physdev_op,
+ [__HYPERVISOR_hvm_op] = (pvh_hypercall_t *)do_pvh_hvm_op,
+ [__HYPERVISOR_sysctl] = (pvh_hypercall_t *)do_sysctl,
+ [__HYPERVISOR_domctl] = (pvh_hypercall_t *)do_domctl
+};
+
/*
 * fixme: Do we need to worry about this and slow things down in this path?
 * A 64bit linux guest should always run in long mode, with CS.L selecting
 * either 64bit mode or 32bit compat mode, so report it unconditionally.
 */
static int pvh_long_mode_enabled(void)
{
    return 1;
}
+
+/* Check if hypercall is valid
+ * Returns: 0 if hcall is not valid with eax set to the errno to ret to guest
+ */
+static noinline int hcall_valid(struct cpu_user_regs *regs)
+{
+ struct segment_register sreg;
+
+ if (!pvh_long_mode_enabled()) {
+ printk("PVH Error: Expected long mode set\n");
+ return 1;
+ }
+ hvm_get_segment_register(current, x86_seg_ss, &sreg);
+ if ( unlikely(sreg.attr.fields.dpl == 3) ) {
+ regs->eax = -EPERM;
+ return 0;
+ }
+
+ /* domU''s are not allowed following hcalls */
+ if ( !IS_PRIV(current->domain) &&
+ (regs->eax == __HYPERVISOR_xsm_op ||
+ regs->eax == __HYPERVISOR_platform_op ||
+ regs->eax == __HYPERVISOR_domctl) ) { /* for privcmd mmap */
+
+ regs->eax = -EPERM;
+ return 0;
+ }
+ return 1;
+}
+
+static noinline int vmxit_vmcall(struct cpu_user_regs *regs)
+{
+ uint32_t hnum = regs->eax;
+
+ if (hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] ==NULL)
+ {
+ dbgp0("PVH[%d]: UnImplemented HCALL:%ld. Ret -ENOSYS IP:%lx
SP:%lx\n",
+ smp_processor_id(), regs->eax, vmr(GUEST_RIP),
vmr(GUEST_RSP));
+ regs->eax = -ENOSYS;
+ update_guest_eip();
+ return HVM_HCALL_completed;
+ }
+
+ dbgp2("vmxit_vmcall: hcall eax:$%ld\n", regs->eax);
+ if (regs->eax == __HYPERVISOR_sched_op && regs->rdi ==
SCHEDOP_shutdown) {
+ /* PVH fixme: go thru this again to make sure nothing is left out */
+ printk("PVH: FIXME: SCHEDOP_shutdown hcall\n");
+ regs->eax = -ENOSYS;
+ update_guest_eip();
+ domain_crash_synchronous();
+ return HVM_HCALL_completed;
+ }
+
+ if ( !hcall_valid(regs) )
+ return HVM_HCALL_completed;
+
+ /* PVH fixme: search for this and do it. PV method will not work */
+ current->arch.hvm_vcpu.hcall_preempted = 0;
+
+ regs->rax = pvh_hypercall64_table[hnum](regs->rdi, regs->rsi,
regs->rdx,
+ regs->r10, regs->r8,
regs->r9);
+
+ if ( !current->arch.hvm_vcpu.hcall_preempted )
+ update_guest_eip();
+ else
+ printk("PVH: Hcall :%d preempted\n", hnum);
+
+ return HVM_HCALL_completed;
+}
+
+static noinline uint64_t *get_gpr_ptr(struct cpu_user_regs *regs, uint gpr)
+{
+ switch (gpr)
+ {
+ case VMX_CONTROL_REG_ACCESS_GPR_EAX:
+ return ®s->eax;
+ case VMX_CONTROL_REG_ACCESS_GPR_ECX:
+ return ®s->ecx;
+ case VMX_CONTROL_REG_ACCESS_GPR_EDX:
+ return ®s->edx;
+ case VMX_CONTROL_REG_ACCESS_GPR_EBX:
+ return ®s->ebx;
+ case VMX_CONTROL_REG_ACCESS_GPR_ESP:
+ return ®s->esp;
+ case VMX_CONTROL_REG_ACCESS_GPR_EBP:
+ return ®s->ebp;
+ case VMX_CONTROL_REG_ACCESS_GPR_ESI:
+ return ®s->esi;
+ case VMX_CONTROL_REG_ACCESS_GPR_EDI:
+ return ®s->edi;
+ case VMX_CONTROL_REG_ACCESS_GPR_R8:
+ return ®s->r8;
+ case VMX_CONTROL_REG_ACCESS_GPR_R9:
+ return ®s->r9;
+ case VMX_CONTROL_REG_ACCESS_GPR_R10:
+ return ®s->r10;
+ case VMX_CONTROL_REG_ACCESS_GPR_R11:
+ return ®s->r11;
+ case VMX_CONTROL_REG_ACCESS_GPR_R12:
+ return ®s->r12;
+ case VMX_CONTROL_REG_ACCESS_GPR_R13:
+ return ®s->r13;
+ case VMX_CONTROL_REG_ACCESS_GPR_R14:
+ return ®s->r14;
+ case VMX_CONTROL_REG_ACCESS_GPR_R15:
+ return ®s->r15;
+ default:
+ return NULL;
+ }
+}
+/* Returns: rc == 0: success */
+static noinline int access_cr0(struct cpu_user_regs *regs, uint acc_typ,
+ uint64_t *regp)
+{
+ struct vcpu *vp = current;
+
+ if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+ {
+ unsigned long new_cr0 = *regp;
+ unsigned long old_cr0 = __vmread(GUEST_CR0);
+
+ dbgp2("PVH:writing to CR0. RIP:%lx val:0x%lx\n",
vmr(GUEST_RIP),*regp);
+ if ( (u32)new_cr0 != new_cr0 )
+ {
+ HVM_DBG_LOG(DBG_LEVEL_1,
+ "Guest setting upper 32 bits in CR0: %lx",
new_cr0);
+ return 1;
+ }
+
+ new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
+ /* ET is reserved and should be always be 1. */
+ new_cr0 |= X86_CR0_ET;
+
+ /* pvh cannot change to real mode */
+ if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) != (X86_CR0_PG|X86_CR0_PE)
) {
+ printk("PVH domU attempting to turn off PE/PG.
CR0:%lx\n", new_cr0);
+ return 1;
+ }
+ /* TS going from 1 to 0 */
+ if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 &
X86_CR0_TS)==0) )
+ vmx_fpu_enter(vp);
+
+ vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] =
new_cr0;
+ __vmwrite(GUEST_CR0, new_cr0);
+ __vmwrite(CR0_READ_SHADOW, new_cr0);
+ } else {
+ *regp = __vmread(GUEST_CR0);
+ }
+ return 0;
+}
+
+/* Returns: rc == 0: success */
+static noinline int access_cr4(struct cpu_user_regs *regs, uint acc_typ,
+ uint64_t *regp)
+{
+ if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+ {
+ u64 old_cr4 = __vmread(GUEST_CR4);
+
+ if ( (old_cr4 ^ (*regp)) & (X86_CR4_PSE | X86_CR4_PGE |
X86_CR4_PAE) )
+ vpid_sync_all();
+
+ /* pvh_verify_cr4_wr(*regp)); */
+ __vmwrite(GUEST_CR4, *regp);
+ } else {
+ *regp = __vmread(GUEST_CR4);
+ }
+ return 0;
+}
+
+/* Returns: rc == 0: success */
+static noinline int vmxit_cr_access(struct cpu_user_regs *regs)
+{
+ unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+ uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
+ int cr, rc = 1;
+
+ switch ( acc_typ )
+ {
+ case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
+ case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
+ {
+ uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
+ uint64_t *regp = get_gpr_ptr(regs, gpr);
+ cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
+
+ if (regp == NULL)
+ break;
+
+ /* pl don''t embed switch statements */
+ if (cr == 0)
+ rc = access_cr0(regs, acc_typ, regp);
+ else if (cr == 3) {
+ printk("PVH: d%d: unexpected cr3 access vmexit.
rip:%lx\n",
+ current->domain->domain_id, vmr(GUEST_RIP));
+ domain_crash_synchronous();
+ } else if (cr == 4)
+ rc = access_cr4(regs, acc_typ, regp);
+
+ if (rc == 0)
+ update_guest_eip();
+ break;
+ }
+ case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
+ {
+ struct vcpu *vp = current;
+ unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] &
~X86_CR0_TS;
+ vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] =
cr0;
+ vmx_fpu_enter(vp);
+ __vmwrite(GUEST_CR0, cr0);
+ __vmwrite(CR0_READ_SHADOW, cr0);
+ update_guest_eip();
+ rc = 0;
+ }
+ }
+ return rc;
+}
+
+/* NOTE: a PVH sets IOPL natively by setting bits in the eflags and not by
+ * hypercalls used by a PV */
+static int noinline vmxit_io_instr(struct cpu_user_regs *regs)
+{
+ int curr_lvl;
+ int requested = (regs->rflags >> 12) & 3;
+
+ read_vmcs_selectors(regs);
+ curr_lvl = regs->cs & 3;
+
+ if (requested >= curr_lvl && emulate_privileged_op(regs))
+ return 0;
+
+ hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
+ return 0;
+}
+
+static noinline int pvh_ept_handle_violation(unsigned long qualification,
paddr_t gpa)
+{
+ unsigned long gla, gfn = gpa >> PAGE_SHIFT;
+ p2m_type_t p2mt;
+ mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
+
+ gdprintk(XENLOG_ERR, "Dom:%d EPT violation %#lx (%c%c%c/%c%c%c),
"
+ "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
+ current->domain->domain_id, qualification,
+ (qualification & EPT_READ_VIOLATION) ? ''r'' :
''-'',
+ (qualification & EPT_WRITE_VIOLATION) ? ''w''
: ''-'',
+ (qualification & EPT_EXEC_VIOLATION) ? ''x'' :
''-'',
+ (qualification & EPT_EFFECTIVE_READ) ? ''r'' :
''-'',
+ (qualification & EPT_EFFECTIVE_WRITE) ? ''w''
: ''-'',
+ (qualification & EPT_EFFECTIVE_EXEC) ? ''x'' :
''-'',
+ gpa, mfn_x(mfn), p2mt);
+
+ ept_walk_table(current->domain, gfn);
+
+ if ( qualification & EPT_GLA_VALID )
+ {
+ gla = __vmread(GUEST_LINEAR_ADDRESS);
+ gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
+ }
+
+ hvm_inject_hw_exception(TRAP_gp_fault, 0);
+ return 0;
+}
+
+static noinline void pvh_cpuid(struct cpu_user_regs *regs)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ asm volatile ( "cpuid"
+ : "=a" (eax), "=b" (ebx), "=c"
(ecx), "=d" (edx)
+ : "0" (regs->eax), "2" (regs->rcx) );
+
+ regs->rax = eax; regs->rbx = ebx; regs->rcx = ecx; regs->rdx =
edx;
+}
+
+void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
+{
+ unsigned long exit_qualification;
+ unsigned int vector, exit_reason = __vmread(VM_EXIT_REASON);
+ int rc=0, ccpu = smp_processor_id();
+ struct vcpu *vp = current;
+
+ dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx
CR0:%lx\n",
+ ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP), regs->rflags,
+ vmr(GUEST_CR0));
+
+ /* for guest_kernel_mode() */
+ regs->cs = vmr(GUEST_CS_SELECTOR);
+
+ switch ( (uint16_t)exit_reason )
+ {
+ case EXIT_REASON_EXCEPTION_NMI:
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ break;
+ default:
+ local_irq_enable();
+ }
+
+ switch ( (uint16_t)exit_reason )
+ {
+ case EXIT_REASON_EXCEPTION_NMI: /* 0 */
+ rc = vmxit_exception(regs);
+ break;
+
+ case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
+ {
+ vector = __vmread(VM_EXIT_INTR_INFO);
+ vector &= INTR_INFO_VECTOR_MASK;
+ vmx_do_extint(regs);
+ break;
+ }
+
+ case EXIT_REASON_TRIPLE_FAULT: /* 2 */
+ {
+ dbgp0("PVH:Triple Flt:[%d]exitreas:%d RIP:%lx RSP:%lx
EFLAGS:%lx CR3:%lx\n",
+ ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP),
+ regs->rflags, vmr(GUEST_CR3));
+
+ vp->arch.hvm_vcpu.guest_cr[3] = vp->arch.hvm_vcpu.hw_cr[3] +
__vmread(GUEST_CR3);
+ rc = 1;
+ break;
+ }
+ case EXIT_REASON_PENDING_VIRT_INTR: /* 7 */
+ {
+ struct vcpu *v = current;
+ /* Disable the interrupt window. */
+ v->arch.hvm_vmx.exec_control &=
~CPU_BASED_VIRTUAL_INTR_PENDING;
+ __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
v->arch.hvm_vmx.exec_control);
+ break;
+ }
+
+ case EXIT_REASON_CPUID: /* 10 */
+ {
+ if ( guest_kernel_mode(vp, regs) ) {
+ pv_cpuid(regs);
+
+ /* Because we are setting CR4.OSFXSR to 0, we need to disable
+ * this because, during boot, user process "init"
(which doesn''t
+ * do cpuid), will do ''pxor xmm0,xmm0'' and
cause #UD. For now
+ * disable this. HVM doesn''t allow setting of
CR4.OSFXSR.
+ * fixme: this and also look at CR4.OSXSAVE */
+
+ __clear_bit(X86_FEATURE_FXSR, ®s->edx);
+ } else
+ pvh_cpuid(regs);
+
+ /* fixme: investigate and fix the XSAVE/MMX/FPU stuff */
+
+ update_guest_eip();
+ dbgp2("cpuid:%ld RIP:%lx\n", regs->eax,
vmr(GUEST_RIP));
+ break;
+ }
+
+ case EXIT_REASON_HLT: /* 12 */
+ {
+ update_guest_eip();
+ hvm_hlt(regs->eflags);
+ break;
+ }
+
+ case EXIT_REASON_INVLPG: /* 14 */
+ rc = vmxit_invlpg();
+ break;
+
+ case EXIT_REASON_RDTSC: /* 16 */
+ {
+ rc = 1;
+ break;
+ }
+
+ case EXIT_REASON_VMCALL: /* 18 */
+ rc = vmxit_vmcall(regs);
+ break;
+
+ case EXIT_REASON_CR_ACCESS: /* 28 */
+ rc = vmxit_cr_access(regs);
+ break;
+
+ case EXIT_REASON_DR_ACCESS: /* 29 */
+ {
+ exit_qualification = __vmread(EXIT_QUALIFICATION);
+ vmx_dr_access(exit_qualification, regs);
+ break;
+ }
+
+ case EXIT_REASON_IO_INSTRUCTION:
+ vmxit_io_instr(regs);
+ break;
+
+ case EXIT_REASON_MSR_READ: /* 31 */
+ rc = vmxit_msr_read(regs);
+ break;
+
+ case EXIT_REASON_MSR_WRITE: /* 32 */
+ rc = vmxit_msr_write(regs);
+ break;
+
+ case EXIT_REASON_MONITOR_TRAP_FLAG: /* 37 */
+ rc = vmxit_mtf(regs);
+ break;
+
+ case EXIT_REASON_EPT_VIOLATION:
+ {
+ paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+ exit_qualification = __vmread(EXIT_QUALIFICATION);
+ rc = pvh_ept_handle_violation(exit_qualification, gpa);
+ break;
+ }
+ default:
+ rc = 1;
+ printk("PVH: Unexpected exit reason:%d 0x%x\n",
exit_reason,
+ exit_reason);
+ }
+
+ if (rc) {
+ exit_qualification = __vmread(EXIT_QUALIFICATION);
+ printk("PVH: [%d] exit_reas:%d 0x%x qual:%ld 0x%lx
cr0:0x%016lx\n",
+ ccpu, exit_reason, exit_reason, exit_qualification,
+ exit_qualification, vmr(GUEST_CR0));
+ printk("PVH: [%d] RIP:%lx RSP:%lx\n", ccpu,
+ vmr(GUEST_RIP), vmr(GUEST_RSP));
+ domain_crash_synchronous();
+ }
+}
+
+/*
+ * Sets info for non boot vcpu. VCPU 0 context is set by library which needs
+ * to be modified to send
+ * correct selectors and gs_base. For now, we use this for nonboot vcpu
+ * in which case the call somes from the kernel cpu_initialize_context().
+ */
+int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp)
+{
+ if (v->vcpu_id == 0)
+ return 0;
+
+ vmx_vmcs_enter(v);
+ __vmwrite(GUEST_GDTR_BASE, ctxtp->u.pvh.gdtaddr);
+ __vmwrite(GUEST_GDTR_LIMIT, ctxtp->u.pvh.gdtsz);
+ __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user);
+
+ __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs);
+ __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds);
+ __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es);
+ __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss);
+ __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs);
+
+ if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) )
+ return -EINVAL;
+
+ vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel);
+
+ vmx_vmcs_exit(v);
+ return 0;
+}
+
+int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
+ const struct cpu_user_regs *regs,
+ unsigned long *base, unsigned long *limit,
+ unsigned int *ar)
+{
+ unsigned int tmp_ar = 0;
+ BUG_ON(v!=current);
+ BUG_ON(!is_pvh_vcpu(v));
+
+ if (sel == (unsigned int)regs->cs) {
+ *base = vmr(GUEST_CS_BASE);
+ *limit = vmr(GUEST_CS_LIMIT);
+ tmp_ar = vmr(GUEST_CS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->ds) {
+ *base = vmr(GUEST_DS_BASE);
+ *limit = vmr(GUEST_DS_LIMIT);
+ tmp_ar = vmr(GUEST_DS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->ss) {
+ *base = vmr(GUEST_SS_BASE);
+ *limit = vmr(GUEST_SS_LIMIT);
+ tmp_ar = vmr(GUEST_SS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->gs) {
+ *base = vmr(GUEST_GS_BASE);
+ *limit = vmr(GUEST_GS_LIMIT);
+ tmp_ar = vmr(GUEST_GS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->fs) {
+ *base = vmr(GUEST_FS_BASE);
+ *limit = vmr(GUEST_FS_LIMIT);
+ tmp_ar = vmr(GUEST_FS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->es) {
+ *base = vmr(GUEST_ES_BASE);
+ *limit = vmr(GUEST_ES_LIMIT);
+ tmp_ar = vmr(GUEST_ES_AR_BYTES);
+ } else {
+ printk("Unmatched segment selector:%d\n", sel);
+ return 0;
+ }
+
+ if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) { /* x86 mess!! */
+ *base = 0UL;
+ *limit = ~0UL;
+ }
+ /* Fixup ar so that it looks the same as in native mode */
+ *ar = (tmp_ar << 8);
+ return 1;
+}
+
diff -r eca698a4e733 -r 0a38c610f26b xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Fri Jan 11 16:32:36 2013 -0800
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Fri Jan 11 16:34:17 2013 -0800
@@ -156,11 +156,28 @@ void vmx_update_cpu_exec_control(struct
# define VMX_CONTROL_REG_ACCESS_TYPE_LMSW 3
/* 10:8 - general purpose register operand */
#define VMX_CONTROL_REG_ACCESS_GPR(eq) (((eq) >> 8) & 0xf)
+#define VMX_CONTROL_REG_ACCESS_GPR_EAX 0
+#define VMX_CONTROL_REG_ACCESS_GPR_ECX 1
+#define VMX_CONTROL_REG_ACCESS_GPR_EDX 2
+#define VMX_CONTROL_REG_ACCESS_GPR_EBX 3
+#define VMX_CONTROL_REG_ACCESS_GPR_ESP 4
+#define VMX_CONTROL_REG_ACCESS_GPR_EBP 5
+#define VMX_CONTROL_REG_ACCESS_GPR_ESI 6
+#define VMX_CONTROL_REG_ACCESS_GPR_EDI 7
+#define VMX_CONTROL_REG_ACCESS_GPR_R8 8
+#define VMX_CONTROL_REG_ACCESS_GPR_R9 9
+#define VMX_CONTROL_REG_ACCESS_GPR_R10 10
+#define VMX_CONTROL_REG_ACCESS_GPR_R11 11
+#define VMX_CONTROL_REG_ACCESS_GPR_R12 12
+#define VMX_CONTROL_REG_ACCESS_GPR_R13 13
+#define VMX_CONTROL_REG_ACCESS_GPR_R14 14
+#define VMX_CONTROL_REG_ACCESS_GPR_R15 15
/*
* Access Rights
*/
#define X86_SEG_AR_SEG_TYPE 0xf /* 3:0, segment type */
+#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment */
#define X86_SEG_AR_DESC_TYPE (1u << 4) /* 4, descriptor type */
#define X86_SEG_AR_DPL 0x60 /* 6:5, descriptor privilege level
*/
#define X86_SEG_AR_SEG_PRESENT (1u << 7) /* 7, segment present */
@@ -420,6 +437,11 @@ void setup_ept_dump(void);
void update_guest_eip(void);
void vmx_dr_access(unsigned long exit_qualification,struct cpu_user_regs
*regs);
void vmx_do_extint(struct cpu_user_regs *regs);
+void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs);
+int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp);
+int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
+ const struct cpu_user_regs *regs, unsigned long *base,
+ unsigned long *limit, unsigned int *ar);
/* EPT violation qualifications definitions */
#define _EPT_READ_VIOLATION 0