thr3ads.net - Xen devel - [PATCH 10/18 V2]: PVH xen: introduce vmx

If this information is useful, please help other people find it:
Share via:

Mukesh Rathor

2013-Mar-16 00:41 UTC

[PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

The heart of this patch is vmx exit handler for PVH guest. It is nicely
 isolated in a separate module as preferred by most of us. A call to it
 is added to vmx_pvh_vmexit_handler().

Changes in V2:
  - Move non VMX generic code to arch/x86/hvm/pvh.c
  - Remove get_gpr_ptr() and use existing decode_register() instead.
  - Defer call to pvh vmx exit handler until interrupts are enabled. So the
    caller vmx_pvh_vmexit_handler() handles the NMI/EXT-INT/TRIPLE_FAULT now.
  - Fix the CPUID (wrongly) clearing bit 24. No need to do this now, set
    the correct feature bits in CR4 during vmcs creation.
  - Fix few hard tabs.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/hvm/Makefile         |    3 +-
 xen/arch/x86/hvm/pvh.c            |  220 ++++++++++++++
 xen/arch/x86/hvm/vmx/Makefile     |    1 +
 xen/arch/x86/hvm/vmx/vmx.c        |    7 +
 xen/arch/x86/hvm/vmx/vmx_pvh.c    |  587 +++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/hvm/vmx/vmx.h |    7 +-
 xen/include/asm-x86/pvh.h         |    6 +
 7 files changed, 829 insertions(+), 2 deletions(-)
 create mode 100644 xen/arch/x86/hvm/pvh.c
 create mode 100644 xen/arch/x86/hvm/vmx/vmx_pvh.c
 create mode 100644 xen/include/asm-x86/pvh.h

diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile
index eea5555..65ff9f3 100644
--- a/xen/arch/x86/hvm/Makefile
+++ b/xen/arch/x86/hvm/Makefile
@@ -22,4 +22,5 @@ obj-y += vlapic.o
 obj-y += vmsi.o
 obj-y += vpic.o
 obj-y += vpt.o
-obj-y += vpmu.o
\ No newline at end of file
+obj-y += vpmu.o
+obj-y += pvh.o
diff --git a/xen/arch/x86/hvm/pvh.c b/xen/arch/x86/hvm/pvh.c
new file mode 100644
index 0000000..c12c4b7
--- /dev/null
+++ b/xen/arch/x86/hvm/pvh.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2013, Mukesh Rathor, Oracle Corp.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <xen/hypercall.h>
+#include <xen/guest_access.h>
+#include <asm/p2m.h>
+#include <asm/traps.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <public/sched.h>
+
+static int pvh_grant_table_op(
+              unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
+{
+    switch (cmd)
+    {
+        case GNTTABOP_map_grant_ref:
+        case GNTTABOP_unmap_grant_ref:
+        case GNTTABOP_setup_table:
+        case GNTTABOP_copy:
+        case GNTTABOP_query_size:
+        case GNTTABOP_set_version:
+            return do_grant_table_op(cmd, uop, count);
+    }
+    return -ENOSYS;
+}
+
+static long pvh_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+{
+    long rc = -ENOSYS;
+
+    switch ( cmd )
+    {
+        case VCPUOP_register_runstate_memory_area:
+        case VCPUOP_get_runstate_info:
+        case VCPUOP_set_periodic_timer:
+        case VCPUOP_stop_periodic_timer:
+        case VCPUOP_set_singleshot_timer:
+        case VCPUOP_stop_singleshot_timer:
+        case VCPUOP_is_up:
+        case VCPUOP_up:
+        case VCPUOP_initialise:
+            rc = do_vcpu_op(cmd, vcpuid, arg);
+
+            /* pvh boot vcpu setting context for bringing up smp vcpu */
+            if (cmd == VCPUOP_initialise)
+                vmx_vmcs_enter(current);
+    }
+    return rc;
+}
+
+static long pvh_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
+{
+    switch ( cmd )
+    {
+        case PHYSDEVOP_map_pirq:
+        case PHYSDEVOP_unmap_pirq:
+        case PHYSDEVOP_eoi:
+        case PHYSDEVOP_irq_status_query:
+        case PHYSDEVOP_get_free_pirq:
+            return do_physdev_op(cmd, arg);
+
+        default:
+            if ( IS_PRIV(current->domain) )
+                return do_physdev_op(cmd, arg);
+    }
+    return -ENOSYS;
+}
+
+static long do_pvh_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
+{
+    long rc = -EINVAL;
+    struct xen_hvm_param harg;
+    struct domain *d;
+
+    if ( copy_from_guest(&harg, arg, 1) )
+        return -EFAULT;
+
+    rc = rcu_lock_target_domain_by_id(harg.domid, &d);
+    if ( rc != 0 )
+        return rc;
+
+    if (is_hvm_domain(d)) {
+        /* pvh dom0 is building an hvm guest */
+        rcu_unlock_domain(d);
+        return do_hvm_op(op, arg);  
+    }
+
+    rc = -ENOSYS;
+    if (op == HVMOP_set_param) {
+        if (harg.index == HVM_PARAM_CALLBACK_IRQ) {
+            struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+            uint64_t via = harg.value;
+            uint8_t via_type = (uint8_t)(via >> 56) + 1;
+
+            if (via_type == HVMIRQ_callback_vector) {
+                hvm_irq->callback_via_type = HVMIRQ_callback_vector;
+                hvm_irq->callback_via.vector = (uint8_t)via;
+                rc = 0;
+            }
+        }
+    }
+    rcu_unlock_domain(d);
+    return rc;
+}
+
+typedef unsigned long pvh_hypercall_t(
+    unsigned long, unsigned long, unsigned long, unsigned long, unsigned long,
+    unsigned long);
+
+int hcall_a[NR_hypercalls];
+
+static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = {
+    [__HYPERVISOR_platform_op]     = (pvh_hypercall_t *)do_platform_op,
+    [__HYPERVISOR_memory_op]       = (pvh_hypercall_t *)do_memory_op,
+    /* [__HYPERVISOR_set_timer_op]     = (pvh_hypercall_t *)do_set_timer_op, */
+    [__HYPERVISOR_xen_version]     = (pvh_hypercall_t *)do_xen_version,
+    [__HYPERVISOR_console_io]      = (pvh_hypercall_t *)do_console_io,
+    [__HYPERVISOR_grant_table_op]  = (pvh_hypercall_t *)pvh_grant_table_op,
+    [__HYPERVISOR_vcpu_op]         = (pvh_hypercall_t *)pvh_vcpu_op,
+    [__HYPERVISOR_mmuext_op]       = (pvh_hypercall_t *)do_mmuext_op,
+    [__HYPERVISOR_xsm_op]          = (pvh_hypercall_t *)do_xsm_op,
+    [__HYPERVISOR_sched_op]        = (pvh_hypercall_t *)do_sched_op,
+    [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t *)do_event_channel_op,
+    [__HYPERVISOR_physdev_op]      = (pvh_hypercall_t *)pvh_physdev_op,
+    [__HYPERVISOR_hvm_op]          = (pvh_hypercall_t *)do_pvh_hvm_op,
+    [__HYPERVISOR_sysctl]          = (pvh_hypercall_t *)do_sysctl,
+    [__HYPERVISOR_domctl]          = (pvh_hypercall_t *)do_domctl
+};
+
+/* fixme: Do we need to worry about this and slow things down in this path? */
+static int pvh_long_mode_enabled(void)
+{
+    /* A 64bit linux guest should always run in this mode with CS.L selecting
+     * either 64bit mode or 32bit compat mode */
+    return 1;
+}
+
+/* Check if hypercall is valid 
+ * Returns: 0 if hcall is not valid with eax set to the errno to ret to guest
+ */
+static int hcall_valid(struct cpu_user_regs *regs)
+{
+    struct segment_register sreg;
+
+    if (!pvh_long_mode_enabled()) 
+    {
+        gdprintk(XENLOG_ERR, "PVH Error: Expected long mode set\n");
+        return 1;
+    }
+    hvm_get_segment_register(current, x86_seg_ss, &sreg);
+    if ( unlikely(sreg.attr.fields.dpl == 3) ) 
+    {
+        regs->eax = -EPERM;
+        return 0;
+    }
+
+    /* domU''s are not allowed following hcalls */
+    if ( !IS_PRIV(current->domain) &&
+         (regs->eax == __HYPERVISOR_xsm_op ||
+          regs->eax == __HYPERVISOR_platform_op ||
+          regs->eax == __HYPERVISOR_domctl) ) {     /* for privcmd mmap */
+
+        regs->eax = -EPERM;
+        return 0;
+    }
+    return 1;
+}
+
+int pvh_do_hypercall(struct cpu_user_regs *regs)
+{
+    uint32_t hnum = regs->eax;
+
+    if ( hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] == NULL ) 
+    {
+        gdprintk(XENLOG_WARNING, "PVH: Unimplemented HCALL:%d. Returning
"
+                 "-ENOSYS. domid:%d IP:%lx SP:%lx\n", 
+                 hnum, current->domain->domain_id, regs->rip,
regs->rsp);
+        regs->eax = -ENOSYS;
+        vmx_update_guest_eip();
+        return HVM_HCALL_completed;
+    }
+
+    if ( regs->eax == __HYPERVISOR_sched_op && regs->rdi ==
SCHEDOP_shutdown )
+    {
+        regs->eax = -ENOSYS;
+        vmx_update_guest_eip();
+
+        /* PVH fixme: show_guest_stack() from domain crash causes PF */
+        domain_crash_synchronous();
+        return HVM_HCALL_completed;
+    }
+
+    if ( !hcall_valid(regs) )
+        return HVM_HCALL_completed;
+
+    current->arch.hvm_vcpu.hcall_preempted = 0;
+    regs->rax = pvh_hypercall64_table[hnum](regs->rdi, regs->rsi,
regs->rdx,
+                                            regs->r10, regs->r8,
regs->r9);
+
+    if ( current->arch.hvm_vcpu.hcall_preempted )
+        return HVM_HCALL_preempted;
+         
+    return HVM_HCALL_completed;
+}
+
diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile
index 373b3d9..8b71dae 100644
--- a/xen/arch/x86/hvm/vmx/Makefile
+++ b/xen/arch/x86/hvm/vmx/Makefile
@@ -5,3 +5,4 @@ obj-y += vmcs.o
 obj-y += vmx.o
 obj-y += vpmu_core2.o
 obj-y += vvmx.o
+obj-y += vmx_pvh.o
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 194c87b..5503fc9 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1529,6 +1529,8 @@ static struct hvm_function_table __read_mostly
vmx_function_table = {
     .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled,
     .process_isr          = vmx_process_isr,
     .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
+    .pvh_set_vcpu_info    = vmx_pvh_set_vcpu_info,
+    .pvh_read_descriptor  = vmx_pvh_read_descriptor,
 };
 
 struct hvm_function_table * __init start_vmx(void)
@@ -2364,6 +2366,11 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
         return vmx_failed_vmentry(exit_reason, regs);
 
+    if ( is_pvh_vcpu(v) ) {
+        vmx_pvh_vmexit_handler(regs);
+        return;
+    }
+
     if ( v->arch.hvm_vmx.vmx_realmode )
     {
         /* Put RFLAGS back the way the guest wants it */
diff --git a/xen/arch/x86/hvm/vmx/vmx_pvh.c b/xen/arch/x86/hvm/vmx/vmx_pvh.c
new file mode 100644
index 0000000..14ca0f6
--- /dev/null
+++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c
@@ -0,0 +1,587 @@
+/*
+ * Copyright (C) 2013, Mukesh Rathor, Oracle Corp.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <xen/hypercall.h>
+#include <xen/guest_access.h>
+#include <asm/p2m.h>
+#include <asm/traps.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <public/sched.h>
+#include <asm/pvh.h>
+
+volatile int pvhdbg=0;
+#define dbgp1(...) {(pvhdbg==1) ? printk(__VA_ARGS__):0;}
+#define dbgp2(...) {(pvhdbg==2) ? printk(__VA_ARGS__):0;}
+
+
+static void read_vmcs_selectors(struct cpu_user_regs *regs)
+{
+    regs->cs = __vmread(GUEST_CS_SELECTOR);
+    regs->ss = __vmread(GUEST_SS_SELECTOR);
+    regs->ds = __vmread(GUEST_DS_SELECTOR);
+    regs->es = __vmread(GUEST_ES_SELECTOR);
+    regs->gs = __vmread(GUEST_GS_SELECTOR);
+    regs->fs = __vmread(GUEST_FS_SELECTOR);
+}
+
+/* returns : 0 success */
+static int vmxit_msr_read(struct cpu_user_regs *regs)
+{
+    int rc=1;
+
+    u64 msr_content = 0;
+    switch (regs->ecx)
+    {
+        case MSR_IA32_MISC_ENABLE:
+        {
+            rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
+            msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+                           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
+            break;
+        }
+        default:
+        {
+            /* fixme: see hvm_msr_read_intercept() */
+            rdmsrl(regs->ecx, msr_content);  
+            break;
+        }
+    }
+    regs->eax = (uint32_t)msr_content;
+    regs->edx = (uint32_t)(msr_content >> 32);
+    vmx_update_guest_eip();
+    rc = 0;
+
+    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n",
regs->ecx, regs->eax,
+          regs->edx, regs->rip, regs->rsp);
+    return rc;
+}
+
+/* returns : 0 success */
+static int vmxit_msr_write(struct cpu_user_regs *regs)
+{
+    uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx
<< 32);
+    int rc=1;
+
+    dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n",
regs->ecx,
+          regs->eax,regs->edx);
+
+    if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) {
+        vmx_update_guest_eip();
+        rc = 0;
+    }
+    return rc;
+}
+
+/* Returns: rc == 0: handled the MTF vmexit */
+static int vmxit_mtf(struct cpu_user_regs *regs)
+{
+    struct vcpu *vp = current;
+    int rc=1, ss=vp->arch.hvm_vcpu.single_step; 
+
+    vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
+    vp->arch.hvm_vcpu.single_step = 0;
+
+    if ( vp->domain->debugger_attached && ss ) {
+        domain_pause_for_debugger();
+        rc = 0;
+    }
+    return rc;
+}
+
+static int vmxit_int3(struct cpu_user_regs *regs)
+{
+    int ilen = vmx_get_instruction_length();
+    struct vcpu *vp = current;
+    struct hvm_trap trap_info = { 
+                        .vector = TRAP_int3, 
+                        .type = X86_EVENTTYPE_SW_EXCEPTION,
+                        .error_code = HVM_DELIVER_NO_ERROR_CODE, 
+                        .insn_len = ilen
+    };
+
+    regs->eip += ilen;
+
+    /* gdbsx or another debugger. Never pause dom0 */
+    if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp,
regs) )
+    {
+        dbgp1("[%d]PVH: domain pause for debugger\n",
smp_processor_id());
+        current->arch.gdbsx_vcpu_event = TRAP_int3;
+        domain_pause_for_debugger();
+        return 0;
+    }
+
+    regs->eip -= ilen;
+    hvm_inject_trap(&trap_info);
+
+    return 0;
+}
+
+static int vmxit_invalid_op(struct cpu_user_regs *regs)
+{
+    ulong addr=0;
+
+    if ( guest_kernel_mode(current, regs) || 
+         (addr = emulate_forced_invalid_op(regs)) == 0 )
+    {
+        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+        return 0;
+    } 
+
+    if (addr != EXCRET_fault_fixed)
+        hvm_inject_page_fault(0, addr);
+
+    return 0;
+}
+
+/* Returns: rc == 0: handled the exception/NMI */
+static int vmxit_exception(struct cpu_user_regs *regs)
+{
+    unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) &
INTR_INFO_VECTOR_MASK;
+    int rc=1; 
+    struct vcpu *vp = current;
+
+    dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, 
+          __vmread(GUEST_CS_SELECTOR), regs->eip);
+
+    if (vector == TRAP_debug) {
+        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+        write_debugreg(6, exit_qualification | 0xffff0ff0);
+        rc = 0;
+        
+        /* gdbsx or another debugger */
+        if ( vp->domain->domain_id != 0 &&    /* never pause dom0
*/
+             guest_kernel_mode(vp, regs) && 
vp->domain->debugger_attached )
+        {
+            domain_pause_for_debugger();
+        } else {
+            hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
+        }
+    } 
+    if (vector == TRAP_int3) {
+        rc = vmxit_int3(regs);
+
+    }  else if (vector == TRAP_invalid_op) {
+        rc = vmxit_invalid_op(regs);
+
+    } else if (vector == TRAP_no_device) {
+        hvm_funcs.fpu_dirty_intercept();  /* calls vmx_fpu_dirty_intercept */
+        rc = 0;
+
+    } else if (vector == TRAP_gp_fault) {
+        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
+        /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */
+        rc = 1;
+
+    } else if (vector == TRAP_page_fault) {
+        printk("PVH: Unexpected vector page_fault. IP:%lx\n",
regs->eip);
+        rc = 1;
+    }
+    if (rc)
+        printk("PVH: Unhandled trap vector:%d. IP:%lx\n", vector,
regs->eip);
+
+    return rc;
+}
+
+static int vmxit_invlpg(void)
+{
+    ulong vaddr = __vmread(EXIT_QUALIFICATION);
+
+    vmx_update_guest_eip();
+    vpid_sync_vcpu_gva(current, vaddr);
+    return 0;
+}
+
+static int vmxit_vmcall(struct cpu_user_regs *regs)
+{
+    if ( pvh_do_hypercall(regs) != HVM_HCALL_preempted)
+        vmx_update_guest_eip();
+
+    return 0;;
+}
+
+/* Returns: rc == 0: success */
+static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, 
+                      uint64_t *regp)
+{
+    struct vcpu *vp = current;
+
+    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+    {
+        unsigned long new_cr0 = *regp;
+        unsigned long old_cr0 = __vmread(GUEST_CR0);
+
+        dbgp2("PVH:writing to CR0. RIP:%lx val:0x%lx\n",
regs->rip, *regp);
+        if ( (u32)new_cr0 != new_cr0 )
+        {
+            HVM_DBG_LOG(DBG_LEVEL_1, 
+                        "Guest setting upper 32 bits in CR0: %lx",
new_cr0);
+            return 1;
+        }
+
+        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
+        /* ET is reserved and should be always be 1. */
+        new_cr0 |= X86_CR0_ET;
+
+        /* pvh cannot change to real mode */
+        if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) != (X86_CR0_PG|X86_CR0_PE)
) {
+            printk("PVH attempting to turn off PE/PG. CR0:%lx\n",
new_cr0);
+            return 1;
+        }
+        /* TS going from 1 to 0 */
+        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 &
X86_CR0_TS)==0) )
+            vmx_fpu_enter(vp);
+
+        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] =
new_cr0;
+        __vmwrite(GUEST_CR0, new_cr0);
+        __vmwrite(CR0_READ_SHADOW, new_cr0);
+    } else {
+        *regp = __vmread(GUEST_CR0);
+    } 
+    return 0;
+}
+
+/* Returns: rc == 0: success */
+static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, 
+                      uint64_t *regp)
+{
+    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+    {
+        u64 old_cr4 = __vmread(GUEST_CR4);
+
+        if ( (old_cr4 ^ (*regp)) & (X86_CR4_PSE | X86_CR4_PGE |
X86_CR4_PAE) )
+            vpid_sync_all();
+
+        /* pvh_verify_cr4_wr(*regp)); */
+        __vmwrite(GUEST_CR4, *regp);
+    } else {
+        *regp = __vmread(GUEST_CR4);
+    } 
+    return 0;
+}
+
+/* Returns: rc == 0: success */
+static int vmxit_cr_access(struct cpu_user_regs *regs)
+{
+    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
+    int cr, rc = 1;
+
+    switch ( acc_typ )
+    {
+        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
+        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
+        {
+            uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
+            uint64_t *regp = decode_register(gpr, regs, 0);
+            cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
+
+            if (regp == NULL)
+                break;
+
+            /* pl don''t embed switch statements */
+            if (cr == 0)
+                rc = access_cr0(regs, acc_typ, regp);
+            else if (cr == 3) {
+                printk("PVH: d%d: unexpected cr3 access vmexit.
rip:%lx\n",
+                       current->domain->domain_id, regs->rip);
+                domain_crash_synchronous();
+            } else if (cr == 4) 
+                rc = access_cr4(regs, acc_typ, regp);
+
+            if (rc == 0)
+                vmx_update_guest_eip();
+            break;
+        }
+        case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
+        {
+            struct vcpu *vp = current;
+            unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] &
~X86_CR0_TS;
+            vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] =
cr0;
+            vmx_fpu_enter(vp);
+            __vmwrite(GUEST_CR0, cr0);
+            __vmwrite(CR0_READ_SHADOW, cr0);
+            vmx_update_guest_eip();
+            rc = 0;
+        }
+    }
+    return rc;
+}
+
+/* NOTE: a PVH sets IOPL natively by setting bits in the eflags and not by
+ * hypercalls used by a PV */
+static int vmxit_io_instr(struct cpu_user_regs *regs)
+{
+    int curr_lvl;
+    int requested = (regs->rflags >> 12) & 3;
+
+    read_vmcs_selectors(regs);
+    curr_lvl = regs->cs & 3;
+
+    if (requested >= curr_lvl && emulate_privileged_op(regs)) 
+        return 0;
+
+    hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
+    return 0;
+}
+
+static int pvh_ept_handle_violation(unsigned long qualification, 
+                                    paddr_t gpa, struct cpu_user_regs *regs)
+{
+    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
+    p2m_type_t p2mt;
+    mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
+
+    gdprintk(XENLOG_ERR, "Dom:%d EPT violation %#lx (%c%c%c/%c%c%c),
"
+             "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx
RSP:0x%lx\n",
+             current->domain->domain_id, qualification, 
+             (qualification & EPT_READ_VIOLATION) ? ''r'' :
''-'',
+             (qualification & EPT_WRITE_VIOLATION) ? ''w''
: ''-'',
+             (qualification & EPT_EXEC_VIOLATION) ? ''x'' :
''-'',
+             (qualification & EPT_EFFECTIVE_READ) ? ''r'' :
''-'',
+             (qualification & EPT_EFFECTIVE_WRITE) ? ''w''
: ''-'',
+             (qualification & EPT_EFFECTIVE_EXEC) ? ''x'' :
''-'',
+             gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp);
+             
+    ept_walk_table(current->domain, gfn);
+
+    if ( qualification & EPT_GLA_VALID )
+    {
+        gla = __vmread(GUEST_LINEAR_ADDRESS);
+        gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
+    }
+
+    hvm_inject_hw_exception(TRAP_gp_fault, 0);
+    return 0;
+}
+
+static void pvh_user_cpuid(struct cpu_user_regs *regs)
+{
+    unsigned int eax, ebx, ecx, edx;
+
+    asm volatile ( "cpuid"
+              : "=a" (eax), "=b" (ebx), "=c"
(ecx), "=d" (edx)
+              : "0" (regs->eax), "2" (regs->rcx) );
+
+    regs->rax = eax; regs->rbx = ebx; regs->rcx = ecx; regs->rdx =
edx;
+}
+
+/* 
+ * Main exit handler for PVH case. Called from vmx_vmexit_handler(). 
+ * Note: in vmx_asm_vmexit_handler, rip/rsp/eflags are updated in regs{} 
+ */
+void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
+{
+    unsigned long exit_qualification;
+    unsigned int exit_reason = __vmread(VM_EXIT_REASON);
+    int rc=0, ccpu = smp_processor_id();
+    struct vcpu *vp = current;
+
+    dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx
CR0:%lx\n",
+          ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags, 
+          __vmread(GUEST_CR0));
+
+    /* for guest_kernel_mode() */
+    regs->cs = __vmread(GUEST_CS_SELECTOR); 
+
+    switch ( (uint16_t)exit_reason )
+    {
+        case EXIT_REASON_EXCEPTION_NMI:      /* 0 */
+            rc = vmxit_exception(regs);
+            break;
+            
+        case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
+        case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */
+            break;              /* handled in vmx_vmexit_handler() */
+
+        case EXIT_REASON_TRIPLE_FAULT:       /* 2 */
+        {
+            printk("PVH:Triple Flt:[%d] RIP:%lx RSP:%lx EFLAGS:%lx
CR3:%lx\n",
+                   ccpu, regs->rip, regs->rsp, regs->rflags, 
+                   __vmread(GUEST_CR3));
+
+            rc = 1;
+            break;
+        }
+        case EXIT_REASON_PENDING_VIRT_INTR:  /* 7 */
+        {
+            struct vcpu *v = current;
+
+            /* Disable the interrupt window. */
+            v->arch.hvm_vmx.exec_control &=
~CPU_BASED_VIRTUAL_INTR_PENDING;
+            __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
v->arch.hvm_vmx.exec_control);
+            break;
+        }
+
+        case EXIT_REASON_CPUID:              /* 10 */
+        {
+            if ( guest_kernel_mode(vp, regs) ) {
+                pv_cpuid(regs);
+            } else
+                pvh_user_cpuid(regs);
+
+            vmx_update_guest_eip();
+            dbgp2("cpuid:%ld RIP:%lx\n", regs->eax, regs->rip);
+            break;
+        }
+
+        case EXIT_REASON_HLT:             /* 12 */
+        {
+            vmx_update_guest_eip();
+            hvm_hlt(regs->eflags);
+            break;
+        }
+
+        case EXIT_REASON_INVLPG:             /* 14 */
+            rc = vmxit_invlpg();
+            break;
+
+        case EXIT_REASON_RDTSC:              /* 16 */
+            rc = 1;
+            break;
+
+        case EXIT_REASON_VMCALL:             /* 18 */
+            rc = vmxit_vmcall(regs);
+            break;
+
+        case EXIT_REASON_CR_ACCESS:          /* 28 */
+            rc = vmxit_cr_access(regs);
+            break;
+
+        case EXIT_REASON_DR_ACCESS:          /* 29 */
+        {
+            exit_qualification = __vmread(EXIT_QUALIFICATION);
+            vmx_dr_access(exit_qualification, regs);
+            break;
+        }
+
+        case EXIT_REASON_IO_INSTRUCTION:
+            vmxit_io_instr(regs);
+            break;
+
+        case EXIT_REASON_MSR_READ:           /* 31 */
+            rc = vmxit_msr_read(regs);
+            break;
+
+        case EXIT_REASON_MSR_WRITE:          /* 32 */
+            rc = vmxit_msr_write(regs);
+            break;
+
+        case EXIT_REASON_MONITOR_TRAP_FLAG:  /* 37 */
+            rc = vmxit_mtf(regs);
+            break;
+
+        case EXIT_REASON_EPT_VIOLATION:
+        {
+            paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+            exit_qualification = __vmread(EXIT_QUALIFICATION);
+            rc = pvh_ept_handle_violation(exit_qualification, gpa, regs);
+            break;
+        }
+        default: 
+            rc = 1;
+            printk("PVH: Unexpected exit reason:%d 0x%x\n",
exit_reason,
+                   exit_reason);
+    }
+    if (rc) {
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        printk("PVH: [%d] exit_reas:%d 0x%x qual:%ld 0x%lx
cr0:0x%016lx\n",
+             ccpu, exit_reason, exit_reason, exit_qualification,
+             exit_qualification, __vmread(GUEST_CR0));
+        printk("PVH: [%d] RIP:%lx RSP:%lx\n", ccpu, regs->rip,
regs->rsp);
+        domain_crash_synchronous();
+    }
+}
+
+/* 
+ * Sets info for non boot vcpu. VCPU 0 context is set by library.
+ * We use this for nonboot vcpu in which case the call comes from the 
+ * kernel cpu_initialize_context().
+ */
+int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp)
+{
+    if (v->vcpu_id == 0)
+        return 0;
+
+    vmx_vmcs_enter(v);
+    __vmwrite(GUEST_GDTR_BASE, ctxtp->u.pvh.gdtaddr);
+    __vmwrite(GUEST_GDTR_LIMIT, ctxtp->u.pvh.gdtsz);
+    __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user);
+
+    __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs);
+    __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds);
+    __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es);
+    __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss);
+    __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs);
+
+    if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) )
+        return -EINVAL;
+
+    vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel);
+
+    vmx_vmcs_exit(v);
+    return 0;
+}
+
+int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
+                            const struct cpu_user_regs *regs,
+                            unsigned long *base, unsigned long *limit,
+                            unsigned int *ar)
+{
+    unsigned int tmp_ar = 0;
+    BUG_ON(v!=current);
+    BUG_ON(!is_pvh_vcpu(v));
+
+    if (sel == (unsigned int)regs->cs) {
+        *base = __vmread(GUEST_CS_BASE);
+        *limit = __vmread(GUEST_CS_LIMIT);
+        tmp_ar = __vmread(GUEST_CS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->ds) {
+        *base = __vmread(GUEST_DS_BASE);
+        *limit = __vmread(GUEST_DS_LIMIT);
+        tmp_ar = __vmread(GUEST_DS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->ss) {
+        *base = __vmread(GUEST_SS_BASE);
+        *limit = __vmread(GUEST_SS_LIMIT);
+        tmp_ar = __vmread(GUEST_SS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->gs) {
+        *base = __vmread(GUEST_GS_BASE);
+        *limit = __vmread(GUEST_GS_LIMIT);
+        tmp_ar = __vmread(GUEST_GS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->fs) {
+        *base = __vmread(GUEST_FS_BASE);
+        *limit = __vmread(GUEST_FS_LIMIT);
+        tmp_ar = __vmread(GUEST_FS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->es) {
+        *base = __vmread(GUEST_ES_BASE);
+        *limit = __vmread(GUEST_ES_LIMIT);
+        tmp_ar = __vmread(GUEST_ES_AR_BYTES); 
+    } else {
+        printk("Unmatched segment selector:%d\n", sel);
+        return 0;
+    }
+
+    if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) {           /* x86 mess!! */
+        *base = 0UL;
+        *limit = ~0UL;
+    }
+    /* Fixup ar so that it looks the same as in native mode */
+    *ar = (tmp_ar << 8);
+    return 1;
+}
+
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h
b/xen/include/asm-x86/hvm/vmx/vmx.h
index a742e16..5679e8d 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -189,6 +189,7 @@ void vmx_update_secondary_exec_control(struct vcpu *v);
  * Access Rights
  */
 #define X86_SEG_AR_SEG_TYPE     0xf        /* 3:0, segment type */
+#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment */
 #define X86_SEG_AR_DESC_TYPE    (1u << 4)  /* 4, descriptor type */
 #define X86_SEG_AR_DPL          0x60       /* 6:5, descriptor privilege level
*/
 #define X86_SEG_AR_SEG_PRESENT  (1u << 7)  /* 7, segment present */
@@ -442,10 +443,14 @@ void ept_p2m_uninit(struct p2m_domain *p2m);
 
 void ept_walk_table(struct domain *d, unsigned long gfn);
 void setup_ept_dump(void);
-
 void vmx_update_guest_eip(void);
 void vmx_dr_access(unsigned long exit_qualification,struct cpu_user_regs
*regs);
 void vmx_do_extint(struct cpu_user_regs *regs);
+void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs);
+int  vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp);
+int  vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
+                         const struct cpu_user_regs *regs, unsigned long *base,
+                         unsigned long *limit, unsigned int *ar);
 
 int alloc_p2m_hap_data(struct p2m_domain *p2m);
 void free_p2m_hap_data(struct p2m_domain *p2m);
diff --git a/xen/include/asm-x86/pvh.h b/xen/include/asm-x86/pvh.h
new file mode 100644
index 0000000..73e59d3
--- /dev/null
+++ b/xen/include/asm-x86/pvh.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_X86_PVH_H__
+#define __ASM_X86_PVH_H__
+
+int pvh_do_hypercall(struct cpu_user_regs *regs);
+
+#endif  /* __ASM_X86_PVH_H__ */
-- 
1.7.2.3

Jan Beulich

2013-Mar-18 12:16 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

>>> On 16.03.13 at 01:41, Mukesh Rathor
<mukesh.rathor@oracle.com> wrote:
> +static int pvh_grant_table_op(
> +              unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int
count)
> +{
> +    switch (cmd)
> +    {
> +        case GNTTABOP_map_grant_ref:
> +        case GNTTABOP_unmap_grant_ref:
> +        case GNTTABOP_setup_table:
> +        case GNTTABOP_copy:
> +        case GNTTABOP_query_size:
> +        case GNTTABOP_set_version:
> +            return do_grant_table_op(cmd, uop, count);
While for HVM guests such white lists may be appropriate, for PVH
this doesn''t seem to be the case: The code would require updating
whenever a new sub-hypercall gets added to any of the hypercalls
dealt with like this.
> +int hcall_a[NR_hypercalls];
What''s this?
> +static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = {
> +    [__HYPERVISOR_platform_op]     = (pvh_hypercall_t *)do_platform_op,
> +    [__HYPERVISOR_memory_op]       = (pvh_hypercall_t *)do_memory_op,
> +    /* [__HYPERVISOR_set_timer_op]     = (pvh_hypercall_t
*)do_set_timer_op, */
Why is this commented out?
> +    [__HYPERVISOR_xen_version]     = (pvh_hypercall_t *)do_xen_version,
> +    [__HYPERVISOR_console_io]      = (pvh_hypercall_t *)do_console_io,
> +    [__HYPERVISOR_grant_table_op]  = (pvh_hypercall_t
*)pvh_grant_table_op,
> +    [__HYPERVISOR_vcpu_op]         = (pvh_hypercall_t *)pvh_vcpu_op,
> +    [__HYPERVISOR_mmuext_op]       = (pvh_hypercall_t *)do_mmuext_op,
> +    [__HYPERVISOR_xsm_op]          = (pvh_hypercall_t *)do_xsm_op,
> +    [__HYPERVISOR_sched_op]        = (pvh_hypercall_t *)do_sched_op,
> +    [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t
*)do_event_channel_op,
> +    [__HYPERVISOR_physdev_op]      = (pvh_hypercall_t *)pvh_physdev_op,
> +    [__HYPERVISOR_hvm_op]          = (pvh_hypercall_t *)do_pvh_hvm_op,
> +    [__HYPERVISOR_sysctl]          = (pvh_hypercall_t *)do_sysctl,
> +    [__HYPERVISOR_domctl]          = (pvh_hypercall_t *)do_domctl
> +};
> +
> +/* fixme: Do we need to worry about this and slow things down in this
path? */
???
> +static int pvh_long_mode_enabled(void)
> +{
> +    /* A 64bit linux guest should always run in this mode with CS.L
selecting
> +     * either 64bit mode or 32bit compat mode */
> +    return 1;
How about compat mode guests? And then isn''t this equivalent
to !is_pv_32bit_...(), i.e. determined via a global per-domain
flag once an for all?

The way it''s now the function is pretty pointless.
> +}
> +
> +/* Check if hypercall is valid 
> + * Returns: 0 if hcall is not valid with eax set to the errno to ret to
guest
> + */
> +static int hcall_valid(struct cpu_user_regs *regs)
> +{
> +    struct segment_register sreg;
> +
> +    if (!pvh_long_mode_enabled()) 
> +    {
> +        gdprintk(XENLOG_ERR, "PVH Error: Expected long mode
set\n");
> +        return 1;
> +    }
> +    hvm_get_segment_register(current, x86_seg_ss, &sreg);
> +    if ( unlikely(sreg.attr.fields.dpl == 3) ) 
> +    {
> +        regs->eax = -EPERM;
> +        return 0;
> +    }
> +
> +    /* domU''s are not allowed following hcalls */
> +    if ( !IS_PRIV(current->domain) &&
> +         (regs->eax == __HYPERVISOR_xsm_op ||
> +          regs->eax == __HYPERVISOR_platform_op ||
> +          regs->eax == __HYPERVISOR_domctl) ) {     /* for privcmd mmap
*/
Why do you need to do this here? The hypercall handlers should
be checking this as needed, no need to duplicate these checks.
> +
> +        regs->eax = -EPERM;
> +        return 0;
> +    }
> +    return 1;
> +}
> +
> +int pvh_do_hypercall(struct cpu_user_regs *regs)
> +{
> +    uint32_t hnum = regs->eax;
> +
> +    if ( hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] == NULL )
> +    {
> +        gdprintk(XENLOG_WARNING, "PVH: Unimplemented HCALL:%d.
Returning "
> +                 "-ENOSYS. domid:%d IP:%lx SP:%lx\n", 
> +                 hnum, current->domain->domain_id, regs->rip,
regs->rsp);
> +        regs->eax = -ENOSYS;
> +        vmx_update_guest_eip();
> +        return HVM_HCALL_completed;
> +    }
> +
> +    if ( regs->eax == __HYPERVISOR_sched_op && regs->rdi ==
SCHEDOP_shutdown )
> +    {
> +        regs->eax = -ENOSYS;
> +        vmx_update_guest_eip();
> +
> +        /* PVH fixme: show_guest_stack() from domain crash causes PF */
> +        domain_crash_synchronous();
What''s this all about?
> --- /dev/null
> +++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c
The whole file needs to be looked through for formatting issues.
> --- a/xen/include/asm-x86/hvm/vmx/vmx.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
> @@ -189,6 +189,7 @@ void vmx_update_secondary_exec_control(struct vcpu *v);
>   * Access Rights
>   */
>  #define X86_SEG_AR_SEG_TYPE     0xf        /* 3:0, segment type */
> +#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment
*/
Not used anywhere afaics, so what''s the point of adding this in an
already big patch?

Jan

Konrad Rzeszutek Wilk

2013-Mar-18 16:32 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Fri, Mar 15, 2013 at 05:41:45PM -0700, Mukesh Rathor
wrote:> The heart of this patch is vmx exit handler for PVH guest. It is nicely
>  isolated in a separate module as preferred by most of us. A call to it
>  is added to vmx_pvh_vmexit_handler().
> 
> Changes in V2:
>   - Move non VMX generic code to arch/x86/hvm/pvh.c
>   - Remove get_gpr_ptr() and use existing decode_register() instead.
>   - Defer call to pvh vmx exit handler until interrupts are enabled. So the
>     caller vmx_pvh_vmexit_handler() handles the NMI/EXT-INT/TRIPLE_FAULT
now.
>   - Fix the CPUID (wrongly) clearing bit 24. No need to do this now, set
>     the correct feature bits in CR4 during vmcs creation.
>   - Fix few hard tabs.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
> ---
>  xen/arch/x86/hvm/Makefile         |    3 +-
>  xen/arch/x86/hvm/pvh.c            |  220 ++++++++++++++
>  xen/arch/x86/hvm/vmx/Makefile     |    1 +
>  xen/arch/x86/hvm/vmx/vmx.c        |    7 +
>  xen/arch/x86/hvm/vmx/vmx_pvh.c    |  587
+++++++++++++++++++++++++++++++++++++
>  xen/include/asm-x86/hvm/vmx/vmx.h |    7 +-
>  xen/include/asm-x86/pvh.h         |    6 +
>  7 files changed, 829 insertions(+), 2 deletions(-)
>  create mode 100644 xen/arch/x86/hvm/pvh.c
>  create mode 100644 xen/arch/x86/hvm/vmx/vmx_pvh.c
>  create mode 100644 xen/include/asm-x86/pvh.h
> 
> diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile
> index eea5555..65ff9f3 100644
> --- a/xen/arch/x86/hvm/Makefile
> +++ b/xen/arch/x86/hvm/Makefile
> @@ -22,4 +22,5 @@ obj-y += vlapic.o
>  obj-y += vmsi.o
>  obj-y += vpic.o
>  obj-y += vpt.o
> -obj-y += vpmu.o
> \ No newline at end of file
> +obj-y += vpmu.o
> +obj-y += pvh.o
> diff --git a/xen/arch/x86/hvm/pvh.c b/xen/arch/x86/hvm/pvh.c
> new file mode 100644
> index 0000000..c12c4b7
> --- /dev/null
> +++ b/xen/arch/x86/hvm/pvh.c
> @@ -0,0 +1,220 @@
> +/*
> + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License v2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 021110-1307, USA.

Don''t use the address anymore. They might change their location
(if they haven''t already - its a pretty price location).
> + */
> +
> +#include <xen/hypercall.h>
> +#include <xen/guest_access.h>
> +#include <asm/p2m.h>
> +#include <asm/traps.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <public/sched.h>
> +
> +static int pvh_grant_table_op(
> +              unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int
count)
> +{
> +    switch (cmd)
> +    {
> +        case GNTTABOP_map_grant_ref:
> +        case GNTTABOP_unmap_grant_ref:
> +        case GNTTABOP_setup_table:
> +        case GNTTABOP_copy:
> +        case GNTTABOP_query_size:
> +        case GNTTABOP_set_version:
> +            return do_grant_table_op(cmd, uop, count);
> +    }
> +    return -ENOSYS;
> +}
> +
> +static long pvh_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
> +{
> +    long rc = -ENOSYS;
> +
> +    switch ( cmd )
> +    {
> +        case VCPUOP_register_runstate_memory_area:
> +        case VCPUOP_get_runstate_info:
> +        case VCPUOP_set_periodic_timer:
> +        case VCPUOP_stop_periodic_timer:
> +        case VCPUOP_set_singleshot_timer:
> +        case VCPUOP_stop_singleshot_timer:
> +        case VCPUOP_is_up:
> +        case VCPUOP_up:
> +        case VCPUOP_initialise:
> +            rc = do_vcpu_op(cmd, vcpuid, arg);
> +
> +            /* pvh boot vcpu setting context for bringing up smp vcpu */
> +            if (cmd == VCPUOP_initialise)
> +                vmx_vmcs_enter(current);
> +    }
> +    return rc;
> +}
> +
> +static long pvh_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
> +{
> +    switch ( cmd )
> +    {
> +        case PHYSDEVOP_map_pirq:
> +        case PHYSDEVOP_unmap_pirq:
> +        case PHYSDEVOP_eoi:
> +        case PHYSDEVOP_irq_status_query:
> +        case PHYSDEVOP_get_free_pirq:
> +            return do_physdev_op(cmd, arg);
> +
> +        default:
> +            if ( IS_PRIV(current->domain) )
> +                return do_physdev_op(cmd, arg);
> +    }
> +    return -ENOSYS;
> +}
> +
> +static long do_pvh_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
> +{
> +    long rc = -EINVAL;
> +    struct xen_hvm_param harg;
> +    struct domain *d;
> +
> +    if ( copy_from_guest(&harg, arg, 1) )
> +        return -EFAULT;
> +
> +    rc = rcu_lock_target_domain_by_id(harg.domid, &d);
> +    if ( rc != 0 )
> +        return rc;
> +
> +    if (is_hvm_domain(d)) {
Formatting is odd compared to the other ''if ( (something)''
> +        /* pvh dom0 is building an hvm guest */
> +        rcu_unlock_domain(d);
> +        return do_hvm_op(op, arg);  
> +    }
> +
> +    rc = -ENOSYS;
> +    if (op == HVMOP_set_param) {
> +        if (harg.index == HVM_PARAM_CALLBACK_IRQ) {
> +            struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
> +            uint64_t via = harg.value;
> +            uint8_t via_type = (uint8_t)(via >> 56) + 1;
> +
> +            if (via_type == HVMIRQ_callback_vector) {
> +                hvm_irq->callback_via_type = HVMIRQ_callback_vector;
> +                hvm_irq->callback_via.vector = (uint8_t)via;
> +                rc = 0;
> +            }
Perhaps it would make sense to also print out the -ENOSYS
ones for development purposes?

Say:

gdprintk(XENLOG_DEBUG, "d%, %s setting HVMOP_set_param[%d] -
ENOSYS\n", d->domain_id, __func__);

And for the ops case:

gdprintk(XENLOG_DEBUG, "d%, %s - ENOSYS\n", ..)
? 
	> +        }
> +    }
> +    rcu_unlock_domain(d);
> +    return rc;
> +}
> +
> +typedef unsigned long pvh_hypercall_t(
> +    unsigned long, unsigned long, unsigned long, unsigned long, unsigned
long,
> +    unsigned long);
No way to re-use the something standard? I am not sure what is
''PVH'' specific
to it? It looks like a garden variety normal hypercalls.

Perhaps just copy-n-paste the hvm_hypercall_t for right now? And the later they
can be exported in a common header file?
> +
> +int hcall_a[NR_hypercalls];
> +
> +static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = {
> +    [__HYPERVISOR_platform_op]     = (pvh_hypercall_t *)do_platform_op,
> +    [__HYPERVISOR_memory_op]       = (pvh_hypercall_t *)do_memory_op,
Could you include a comment saying why timers are not good?
> +    /* [__HYPERVISOR_set_timer_op]     = (pvh_hypercall_t
*)do_set_timer_op, */
> +    [__HYPERVISOR_xen_version]     = (pvh_hypercall_t *)do_xen_version,
> +    [__HYPERVISOR_console_io]      = (pvh_hypercall_t *)do_console_io,
> +    [__HYPERVISOR_grant_table_op]  = (pvh_hypercall_t
*)pvh_grant_table_op,
> +    [__HYPERVISOR_vcpu_op]         = (pvh_hypercall_t *)pvh_vcpu_op,
> +    [__HYPERVISOR_mmuext_op]       = (pvh_hypercall_t *)do_mmuext_op,
> +    [__HYPERVISOR_xsm_op]          = (pvh_hypercall_t *)do_xsm_op,
> +    [__HYPERVISOR_sched_op]        = (pvh_hypercall_t *)do_sched_op,
> +    [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t
*)do_event_channel_op,
> +    [__HYPERVISOR_physdev_op]      = (pvh_hypercall_t *)pvh_physdev_op,
> +    [__HYPERVISOR_hvm_op]          = (pvh_hypercall_t *)do_pvh_hvm_op,
Most of these follow the ''do_X''. Then for the pvh ones you
have:
''pvh_X'', with one exception: do_pvh_hvm_op ?

Should it be just ''pvh_hvm_op'' instead?

> +    [__HYPERVISOR_sysctl]          = (pvh_hypercall_t *)do_sysctl,
> +    [__HYPERVISOR_domctl]          = (pvh_hypercall_t *)do_domctl
> +};
> +
> +/* fixme: Do we need to worry about this and slow things down in this
path? */
> +static int pvh_long_mode_enabled(void)
> +{
> +    /* A 64bit linux guest should always run in this mode with CS.L
selecting
> +     * either 64bit mode or 32bit compat mode */
I think they are called ''native'' or ''64-bit
mode'' per the AMD spec?
> +    return 1;
Well, it always seems to return 1? So this ends up being mostly a nop when
the compiler is done?

Or did the earlier version have the correct checks to make sure that we
are in Long Mode?

> +}
> +
> +/* Check if hypercall is valid 
> + * Returns: 0 if hcall is not valid with eax set to the errno to ret to
guest

Huh? Return 0 on invalid case? That looks odd. Why not anything but zero?

Or perhaps just make it return a bool? That would be easier
to grok.
> + */
> +static int hcall_valid(struct cpu_user_regs *regs)
> +{
> +    struct segment_register sreg;
> +
> +    if (!pvh_long_mode_enabled()) 
> +    {
> +        gdprintk(XENLOG_ERR, "PVH Error: Expected long mode
set\n");
Shouldn''t this set:
	regs->eax = -ENOSYS?
> +        return 1;
> +    }
> +    hvm_get_segment_register(current, x86_seg_ss, &sreg);
> +    if ( unlikely(sreg.attr.fields.dpl == 3) ) 
> +    {
> +        regs->eax = -EPERM;
> +        return 0;
> +    }
> +
> +    /* domU''s are not allowed following hcalls */
And that is b/c we can''t handle it yet? In which case you should
prefix it with a TODO or XXX to remember this.
> +    if ( !IS_PRIV(current->domain) &&
> +         (regs->eax == __HYPERVISOR_xsm_op ||
> +          regs->eax == __HYPERVISOR_platform_op ||
> +          regs->eax == __HYPERVISOR_domctl) ) {     /* for privcmd mmap
*/
> +
> +        regs->eax = -EPERM;
> +        return 0;
> +    }
> +    return 1;
> +}
> +
> +int pvh_do_hypercall(struct cpu_user_regs *regs)
> +{
> +    uint32_t hnum = regs->eax;
> +
> +    if ( hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] == NULL )
> +    {
> +        gdprintk(XENLOG_WARNING, "PVH: Unimplemented HCALL:%d.
Returning "
> +                 "-ENOSYS. domid:%d IP:%lx SP:%lx\n", 
> +                 hnum, current->domain->domain_id, regs->rip,
regs->rsp);
> +        regs->eax = -ENOSYS;
> +        vmx_update_guest_eip();
> +        return HVM_HCALL_completed;
> +    }
> +
> +    if ( regs->eax == __HYPERVISOR_sched_op && regs->rdi ==
SCHEDOP_shutdown )
Oh? We can''t shutdown the guest? How come? > +    {
> +        regs->eax = -ENOSYS;
> +        vmx_update_guest_eip();
> +
> +        /* PVH fixme: show_guest_stack() from domain crash causes PF */
> +        domain_crash_synchronous();
> +        return HVM_HCALL_completed;
> +    }
> +
> +    if ( !hcall_valid(regs) )
> +        return HVM_HCALL_completed;
> +
> +    current->arch.hvm_vcpu.hcall_preempted = 0;
> +    regs->rax = pvh_hypercall64_table[hnum](regs->rdi, regs->rsi,
regs->rdx,
> +                                            regs->r10, regs->r8,
regs->r9);
> +
> +    if ( current->arch.hvm_vcpu.hcall_preempted )
> +        return HVM_HCALL_preempted;
> +         
> +    return HVM_HCALL_completed;
> +}
> +
> diff --git a/xen/arch/x86/hvm/vmx/Makefile b/xen/arch/x86/hvm/vmx/Makefile
> index 373b3d9..8b71dae 100644
> --- a/xen/arch/x86/hvm/vmx/Makefile
> +++ b/xen/arch/x86/hvm/vmx/Makefile
> @@ -5,3 +5,4 @@ obj-y += vmcs.o
>  obj-y += vmx.o
>  obj-y += vpmu_core2.o
>  obj-y += vvmx.o
> +obj-y += vmx_pvh.o
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 194c87b..5503fc9 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -1529,6 +1529,8 @@ static struct hvm_function_table __read_mostly
vmx_function_table = {
>      .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled,
>      .process_isr          = vmx_process_isr,
>      .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m,
> +    .pvh_set_vcpu_info    = vmx_pvh_set_vcpu_info,
> +    .pvh_read_descriptor  = vmx_pvh_read_descriptor,
>  };
>  
>  struct hvm_function_table * __init start_vmx(void)
> @@ -2364,6 +2366,11 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>      if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
>          return vmx_failed_vmentry(exit_reason, regs);
>  
> +    if ( is_pvh_vcpu(v) ) {
> +        vmx_pvh_vmexit_handler(regs);
> +        return;
> +    }
> +
>      if ( v->arch.hvm_vmx.vmx_realmode )
>      {
>          /* Put RFLAGS back the way the guest wants it */
> diff --git a/xen/arch/x86/hvm/vmx/vmx_pvh.c
b/xen/arch/x86/hvm/vmx/vmx_pvh.c
> new file mode 100644
> index 0000000..14ca0f6
> --- /dev/null
> +++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c
> @@ -0,0 +1,587 @@
> +/*
> + * Copyright (C) 2013, Mukesh Rathor, Oracle Corp.  All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License v2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; if not, write to the
> + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> + * Boston, MA 021110-1307, USA.
Ditto. No address please.
> + */
> +
> +#include <xen/hypercall.h>
> +#include <xen/guest_access.h>
> +#include <asm/p2m.h>
> +#include <asm/traps.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <public/sched.h>
> +#include <asm/pvh.h>
> +
This should probably be #ifdef DEBUG> +volatile int pvhdbg=0;
And I think you can remove the ''volatile'' part?

> +#define dbgp1(...) {(pvhdbg==1) ? printk(__VA_ARGS__):0;}
> +#define dbgp2(...) {(pvhdbg==2) ? printk(__VA_ARGS__):0;}
#endif> +
> +
> +static void read_vmcs_selectors(struct cpu_user_regs *regs)
> +{
> +    regs->cs = __vmread(GUEST_CS_SELECTOR);
> +    regs->ss = __vmread(GUEST_SS_SELECTOR);
> +    regs->ds = __vmread(GUEST_DS_SELECTOR);
> +    regs->es = __vmread(GUEST_ES_SELECTOR);
> +    regs->gs = __vmread(GUEST_GS_SELECTOR);
> +    regs->fs = __vmread(GUEST_FS_SELECTOR);
> +}
> +
> +/* returns : 0 success */
What are the non-success criteria?
> +static int vmxit_msr_read(struct cpu_user_regs *regs)
> +{
> +    int rc=1;
X86EMUL_UNHANDLEABLE ?
> +
> +    u64 msr_content = 0;
> +    switch (regs->ecx)
> +    {
> +        case MSR_IA32_MISC_ENABLE:
> +        {
> +            rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
> +            msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
> +                           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
> +            break;
> +        }
> +        default:
> +        {
> +            /* fixme: see hvm_msr_read_intercept() */
> +            rdmsrl(regs->ecx, msr_content);  
> +            break;
> +        }
> +    }
> +    regs->eax = (uint32_t)msr_content;
> +    regs->edx = (uint32_t)(msr_content >> 32);
> +    vmx_update_guest_eip();
> +    rc = 0;
> +
> +    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n",
regs->ecx, regs->eax,
> +          regs->edx, regs->rip, regs->rsp);
> +    return rc;
This function looks to return 0 (or X86EMUL_OKAY) irregardless of the MSR?
Perhaps just make it return X86EMUL_OKAY without bothering to use
''rc''?
> +}
> +
> +/* returns : 0 success */
> +static int vmxit_msr_write(struct cpu_user_regs *regs)
> +{
> +    uint64_t msr_content = (uint32_t)regs->eax |
((uint64_t)regs->edx << 32);
> +    int rc=1;
X86EMUL_UNHANDLEABLE
> +
> +    dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n",
regs->ecx,
> +          regs->eax,regs->edx);
> +
> +    if ( hvm_msr_write_intercept(regs->ecx, msr_content) ==
X86EMUL_OKAY ) {
> +        vmx_update_guest_eip();
> +        rc = 0;
X86EMUL_OKAY
> +    }
> +    return rc;
> +}
> +
> +/* Returns: rc == 0: handled the MTF vmexit */
> +static int vmxit_mtf(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *vp = current;
> +    int rc=1, ss=vp->arch.hvm_vcpu.single_step; 
> +
> +    vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
> +    __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
vp->arch.hvm_vmx.exec_control);
> +    vp->arch.hvm_vcpu.single_step = 0;
> +
> +    if ( vp->domain->debugger_attached && ss ) {
> +        domain_pause_for_debugger();
> +        rc = 0;
> +    }
> +    return rc;
> +}
> +
> +static int vmxit_int3(struct cpu_user_regs *regs)
> +{
> +    int ilen = vmx_get_instruction_length();
> +    struct vcpu *vp = current;
> +    struct hvm_trap trap_info = { 
> +                        .vector = TRAP_int3, 
> +                        .type = X86_EVENTTYPE_SW_EXCEPTION,
> +                        .error_code = HVM_DELIVER_NO_ERROR_CODE, 
> +                        .insn_len = ilen
> +    };
> +
> +    regs->eip += ilen;
> +
> +    /* gdbsx or another debugger. Never pause dom0 */
> +    if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp,
regs) )
> +    {
> +        dbgp1("[%d]PVH: domain pause for debugger\n",
smp_processor_id());
> +        current->arch.gdbsx_vcpu_event = TRAP_int3;
> +        domain_pause_for_debugger();
> +        return 0;
> +    }
> +
> +    regs->eip -= ilen;
> +    hvm_inject_trap(&trap_info);
> +
> +    return 0;
> +}
> +
> +static int vmxit_invalid_op(struct cpu_user_regs *regs)
> +{
> +    ulong addr=0;
> +
> +    if ( guest_kernel_mode(current, regs) || 
> +         (addr = emulate_forced_invalid_op(regs)) == 0 )
> +    {
> +        hvm_inject_hw_exception(TRAP_invalid_op,
HVM_DELIVER_NO_ERROR_CODE);
> +        return 0;
> +    } 
> +
> +    if (addr != EXCRET_fault_fixed)
> +        hvm_inject_page_fault(0, addr);
> +
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: handled the exception/NMI */
> +static int vmxit_exception(struct cpu_user_regs *regs)
> +{
> +    unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) &
INTR_INFO_VECTOR_MASK;
> +    int rc=1; 
> +    struct vcpu *vp = current;
> +
> +    dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, 
> +          __vmread(GUEST_CS_SELECTOR), regs->eip);
> +
> +    if (vector == TRAP_debug) {
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        write_debugreg(6, exit_qualification | 0xffff0ff0);
> +        rc = 0;
> +        
> +        /* gdbsx or another debugger */
> +        if ( vp->domain->domain_id != 0 &&    /* never pause
dom0 */
> +             guest_kernel_mode(vp, regs) && 
vp->domain->debugger_attached )
> +        {
> +            domain_pause_for_debugger();
> +        } else {
> +            hvm_inject_hw_exception(TRAP_debug,
HVM_DELIVER_NO_ERROR_CODE);
> +        }
> +    } 
> +    if (vector == TRAP_int3) {
> +        rc = vmxit_int3(regs);
> +
> +    }  else if (vector == TRAP_invalid_op) {
> +        rc = vmxit_invalid_op(regs);
> +
> +    } else if (vector == TRAP_no_device) {
> +        hvm_funcs.fpu_dirty_intercept();  /* calls vmx_fpu_dirty_intercept
*/
> +        rc = 0;
> +
> +    } else if (vector == TRAP_gp_fault) {
> +        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
> +        /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */
So how come we don''t inject it in?
> +        rc = 1;
Huh? Not X86_EMUL_OK?
> +
> +    } else if (vector == TRAP_page_fault) {
> +        printk("PVH: Unexpected vector page_fault. IP:%lx\n",
regs->eip);
printk(.. some prefix.
> +        rc = 1;
> +    }
> +    if (rc)
> +        printk("PVH: Unhandled trap vector:%d. IP:%lx\n",
vector, regs->eip);
> +
> +    return rc;
> +}
> +
> +static int vmxit_invlpg(void)
> +{
> +    ulong vaddr = __vmread(EXIT_QUALIFICATION);
> +
> +    vmx_update_guest_eip();
> +    vpid_sync_vcpu_gva(current, vaddr);
> +    return 0;
> +}
> +
> +static int vmxit_vmcall(struct cpu_user_regs *regs)
> +{
> +    if ( pvh_do_hypercall(regs) != HVM_HCALL_preempted)
> +        vmx_update_guest_eip();
> +
> +    return 0;;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, 
> +                      uint64_t *regp)
> +{
> +    struct vcpu *vp = current;
> +
> +    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> +    {
> +        unsigned long new_cr0 = *regp;
> +        unsigned long old_cr0 = __vmread(GUEST_CR0);
> +
> +        dbgp2("PVH:writing to CR0. RIP:%lx val:0x%lx\n",
regs->rip, *regp);
> +        if ( (u32)new_cr0 != new_cr0 )
> +        {
> +            HVM_DBG_LOG(DBG_LEVEL_1, 
> +                        "Guest setting upper 32 bits in CR0:
%lx", new_cr0);
> +            return 1;
> +        }
> +
> +        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
> +        /* ET is reserved and should be always be 1. */
> +        new_cr0 |= X86_CR0_ET;
> +
> +        /* pvh cannot change to real mode */
> +        if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) !=
(X86_CR0_PG|X86_CR0_PE) ) {
> +            printk("PVH attempting to turn off PE/PG.
CR0:%lx\n", new_cr0);
> +            return 1;
> +        }
> +        /* TS going from 1 to 0 */
> +        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 &
X86_CR0_TS)==0) )
> +            vmx_fpu_enter(vp);
Does this really happen? I thought in the PV mode you would be using the
hypercalls
for the fpu swap? Should it be print out an error saying something to the
effect:

	"PVH guest is using cr0 instead of the paravirt lazy FPU switch!" and
	include the EIP?
> +
> +        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] =
new_cr0;
> +        __vmwrite(GUEST_CR0, new_cr0);
> +        __vmwrite(CR0_READ_SHADOW, new_cr0);
> +    } else {
> +        *regp = __vmread(GUEST_CR0);
> +    } 
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, 
> +                      uint64_t *regp)
> +{
> +    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> +    {
> +        u64 old_cr4 = __vmread(GUEST_CR4);
> +
> +        if ( (old_cr4 ^ (*regp)) & (X86_CR4_PSE | X86_CR4_PGE |
X86_CR4_PAE) )
> +            vpid_sync_all();
> +
> +        /* pvh_verify_cr4_wr(*regp)); */
Needed anymore?
> +        __vmwrite(GUEST_CR4, *regp);
> +    } else {
> +        *regp = __vmread(GUEST_CR4);
> +    } 
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success */
> +static int vmxit_cr_access(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
> +    int cr, rc = 1;
> +
> +    switch ( acc_typ )
> +    {
> +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
> +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
> +        {
> +            uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
> +            uint64_t *regp = decode_register(gpr, regs, 0);
> +            cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
> +
> +            if (regp == NULL)
> +                break;
> +
> +            /* pl don''t embed switch statements */
> +            if (cr == 0)
> +                rc = access_cr0(regs, acc_typ, regp);
> +            else if (cr == 3) {
> +                printk("PVH: d%d: unexpected cr3 access vmexit.
rip:%lx\n",
> +                       current->domain->domain_id, regs->rip);
> +                domain_crash_synchronous();
Uh? Why wouldn''t we want to handle it?
> +            } else if (cr == 4) 
> +                rc = access_cr4(regs, acc_typ, regp);
> +
> +            if (rc == 0)
> +                vmx_update_guest_eip();
> +            break;
> +        }
> +        case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
> +        {
> +            struct vcpu *vp = current;
> +            unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] &
~X86_CR0_TS;
> +            vp->arch.hvm_vcpu.hw_cr[0] =
vp->arch.hvm_vcpu.guest_cr[0] = cr0;
> +            vmx_fpu_enter(vp);
> +            __vmwrite(GUEST_CR0, cr0);
> +            __vmwrite(CR0_READ_SHADOW, cr0);
> +            vmx_update_guest_eip();
> +            rc = 0;
> +        }
> +    }
> +    return rc;
> +}
> +
> +/* NOTE: a PVH sets IOPL natively by setting bits in the eflags and not by
> + * hypercalls used by a PV */

Ahh, so there are now five? PV hypercall families that should not be used:

 - PHYSDEVOP_set_iopl (which I think in your earlier patch you did not check
   for?)
 - HYPERVISOR_update_va_mapping (for all the MMU stuff)
 - HYPERVISOR_update_descriptor (segments and such)
 - HYPERVISOR_fpu_taskswitch (you are doing it in the above function)
 - HYPERVISOR_set_trap_table (again, LDT and GDT are now done via HVM)
 - HYPERVISOR_set_segment_base
 - HYPERVISOR_set_gdt
 - HYPERVISOR_tmem
 .. and host of other.

This should be documented somewhere in docs?
Perhaps in docs/misc/pvh.txt and just outline which ones are not to be used
anymore?

> +static int vmxit_io_instr(struct cpu_user_regs *regs)
> +{
> +    int curr_lvl;
> +    int requested = (regs->rflags >> 12) & 3;
> +
> +    read_vmcs_selectors(regs);
> +    curr_lvl = regs->cs & 3;
> +
> +    if (requested >= curr_lvl && emulate_privileged_op(regs)) 
> +        return 0;
> +
> +    hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
> +    return 0;
> +}
> +
> +static int pvh_ept_handle_violation(unsigned long qualification, 
> +                                    paddr_t gpa, struct cpu_user_regs
*regs)
> +{
> +    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
> +    p2m_type_t p2mt;
> +    mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn,
&p2mt);
> +
> +    gdprintk(XENLOG_ERR, "Dom:%d EPT violation %#lx (%c%c%c/%c%c%c),
"
> +             "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx
RSP:0x%lx\n",
> +             current->domain->domain_id, qualification, 
> +             (qualification & EPT_READ_VIOLATION) ?
''r'' : ''-'',
> +             (qualification & EPT_WRITE_VIOLATION) ?
''w'' : ''-'',
> +             (qualification & EPT_EXEC_VIOLATION) ?
''x'' : ''-'',
> +             (qualification & EPT_EFFECTIVE_READ) ?
''r'' : ''-'',
> +             (qualification & EPT_EFFECTIVE_WRITE) ?
''w'' : ''-'',
> +             (qualification & EPT_EFFECTIVE_EXEC) ?
''x'' : ''-'',
> +             gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp);
> +             
> +    ept_walk_table(current->domain, gfn);
> +
> +    if ( qualification & EPT_GLA_VALID )
> +    {
> +        gla = __vmread(GUEST_LINEAR_ADDRESS);
> +        gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
AHA! There are gdprintk in your code! Could you please replace most (all)
of the printk you have with either gdprintk or the printk(XENLOG_ERR?
> +    }
> +
> +    hvm_inject_hw_exception(TRAP_gp_fault, 0);
> +    return 0;
> +}
> +
> +static void pvh_user_cpuid(struct cpu_user_regs *regs)
> +{
> +    unsigned int eax, ebx, ecx, edx;
> +
> +    asm volatile ( "cpuid"
> +              : "=a" (eax), "=b" (ebx), "=c"
(ecx), "=d" (edx)
> +              : "0" (regs->eax), "2" (regs->rcx)
);
> +
Could you use ''cpuid'' macro defined in processor.h?
> +    regs->rax = eax; regs->rbx = ebx; regs->rcx = ecx;
regs->rdx = edx;
That would make this:

	cpuid(regs->eax, &regs->eax, &regs->ebx, &regs->ecx,
&regs->edx) ?


Or is that not an option since you are re-using the eax register? If so, could
you do:

	unsigned int op = regs->eax;
	cpuid(op, &regs->eax, &regs->ebx, &regs->ecx,
&regs->edx) ?
> +}
> +
> +/* 
> + * Main exit handler for PVH case. Called from vmx_vmexit_handler(). 
> + * Note: in vmx_asm_vmexit_handler, rip/rsp/eflags are updated in regs{} 
> + */
> +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification;
> +    unsigned int exit_reason = __vmread(VM_EXIT_REASON);
> +    int rc=0, ccpu = smp_processor_id();
> +    struct vcpu *vp = current;
> +
> +    dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx
CR0:%lx\n",
> +          ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags, 
> +          __vmread(GUEST_CR0));
> +
> +    /* for guest_kernel_mode() */
> +    regs->cs = __vmread(GUEST_CS_SELECTOR); 
> +
> +    switch ( (uint16_t)exit_reason )
Huh? Why the ''uint16_t''? Ah, b/c vmx_vmexit_handler does it
too. I wonder why?
> +    {
> +        case EXIT_REASON_EXCEPTION_NMI:      /* 0 */
> +            rc = vmxit_exception(regs);
> +            break;
> +            
> +        case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
> +        case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */
> +            break;              /* handled in vmx_vmexit_handler() */
> +
> +        case EXIT_REASON_TRIPLE_FAULT:       /* 2 */
> +        {
> +            printk("PVH:Triple Flt:[%d] RIP:%lx RSP:%lx EFLAGS:%lx
CR3:%lx\n",
> +                   ccpu, regs->rip, regs->rsp, regs->rflags, 
> +                   __vmread(GUEST_CR3));
> +
> +            rc = 1;
> +            break;
> +        }
> +        case EXIT_REASON_PENDING_VIRT_INTR:  /* 7 */
> +        {
> +            struct vcpu *v = current;
> +
> +            /* Disable the interrupt window. */
> +            v->arch.hvm_vmx.exec_control &=
~CPU_BASED_VIRTUAL_INTR_PENDING;
> +            __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
v->arch.hvm_vmx.exec_control);
> +            break;
> +        }
> +
> +        case EXIT_REASON_CPUID:              /* 10 */
> +        {
> +            if ( guest_kernel_mode(vp, regs) ) {
> +                pv_cpuid(regs);
> +            } else
> +                pvh_user_cpuid(regs);
> +
> +            vmx_update_guest_eip();
> +            dbgp2("cpuid:%ld RIP:%lx\n", regs->eax,
regs->rip);
> +            break;
> +        }
> +
> +        case EXIT_REASON_HLT:             /* 12 */
> +        {
> +            vmx_update_guest_eip();
> +            hvm_hlt(regs->eflags);
> +            break;
> +        }
> +
> +        case EXIT_REASON_INVLPG:             /* 14 */
> +            rc = vmxit_invlpg();
> +            break;
> +
> +        case EXIT_REASON_RDTSC:              /* 16 */
> +            rc = 1;
> +            break;
> +
> +        case EXIT_REASON_VMCALL:             /* 18 */
> +            rc = vmxit_vmcall(regs);
> +            break;
> +
> +        case EXIT_REASON_CR_ACCESS:          /* 28 */
> +            rc = vmxit_cr_access(regs);
> +            break;
> +
> +        case EXIT_REASON_DR_ACCESS:          /* 29 */
> +        {
> +            exit_qualification = __vmread(EXIT_QUALIFICATION);
> +            vmx_dr_access(exit_qualification, regs);
> +            break;
> +        }
> +
> +        case EXIT_REASON_IO_INSTRUCTION:
> +            vmxit_io_instr(regs);
> +            break;
> +
> +        case EXIT_REASON_MSR_READ:           /* 31 */
> +            rc = vmxit_msr_read(regs);
> +            break;
> +
> +        case EXIT_REASON_MSR_WRITE:          /* 32 */
> +            rc = vmxit_msr_write(regs);
> +            break;
> +
> +        case EXIT_REASON_MONITOR_TRAP_FLAG:  /* 37 */
> +            rc = vmxit_mtf(regs);
> +            break;
> +
> +        case EXIT_REASON_EPT_VIOLATION:
> +        {
> +            paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +            exit_qualification = __vmread(EXIT_QUALIFICATION);
> +            rc = pvh_ept_handle_violation(exit_qualification, gpa, regs);
> +            break;
> +        }
> +        default: 
> +            rc = 1;
> +            printk("PVH: Unexpected exit reason:%d 0x%x\n",
exit_reason,
> +                   exit_reason);
> +    }
> +    if (rc) {
Odd syntax.
> +        exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        printk("PVH: [%d] exit_reas:%d 0x%x qual:%ld 0x%lx
cr0:0x%016lx\n",
> +             ccpu, exit_reason, exit_reason, exit_qualification,
> +             exit_qualification, __vmread(GUEST_CR0));
> +        printk("PVH: [%d] RIP:%lx RSP:%lx\n", ccpu,
regs->rip, regs->rsp);
> +        domain_crash_synchronous();
> +    }
> +}
> +
> +/* 
> + * Sets info for non boot vcpu. VCPU 0 context is set by library.
> + * We use this for nonboot vcpu in which case the call comes from the 
> + * kernel cpu_initialize_context().
> + */
> +int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context
*ctxtp)
> +{
> +    if (v->vcpu_id == 0)
> +        return 0;
> +
> +    vmx_vmcs_enter(v);
> +    __vmwrite(GUEST_GDTR_BASE, ctxtp->u.pvh.gdtaddr);
> +    __vmwrite(GUEST_GDTR_LIMIT, ctxtp->u.pvh.gdtsz);
> +    __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user);
> +
> +    __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs);
> +    __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds);
> +    __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es);
> +    __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss);
> +    __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs);
> +
> +    if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) )
> +        return -EINVAL;
> +
> +    vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel);
> +
> +    vmx_vmcs_exit(v);
> +    return 0;
> +}
> +
> +int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
> +                            const struct cpu_user_regs *regs,
> +                            unsigned long *base, unsigned long *limit,
> +                            unsigned int *ar)
> +{
How come you don''t want to follow the syntax of
vmx_pvh_read_descriptor?
It has a lot less of arguments?
> +    unsigned int tmp_ar = 0;
> +    BUG_ON(v!=current);
> +    BUG_ON(!is_pvh_vcpu(v));
> +
> +    if (sel == (unsigned int)regs->cs) {
> +        *base = __vmread(GUEST_CS_BASE);
> +        *limit = __vmread(GUEST_CS_LIMIT);
> +        tmp_ar = __vmread(GUEST_CS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->ds) {
> +        *base = __vmread(GUEST_DS_BASE);
> +        *limit = __vmread(GUEST_DS_LIMIT);
> +        tmp_ar = __vmread(GUEST_DS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->ss) {
> +        *base = __vmread(GUEST_SS_BASE);
> +        *limit = __vmread(GUEST_SS_LIMIT);
> +        tmp_ar = __vmread(GUEST_SS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->gs) {
> +        *base = __vmread(GUEST_GS_BASE);
> +        *limit = __vmread(GUEST_GS_LIMIT);
> +        tmp_ar = __vmread(GUEST_GS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->fs) {
> +        *base = __vmread(GUEST_FS_BASE);
> +        *limit = __vmread(GUEST_FS_LIMIT);
> +        tmp_ar = __vmread(GUEST_FS_AR_BYTES); 
> +    } else if (sel == (unsigned int)regs->es) {
> +        *base = __vmread(GUEST_ES_BASE);
> +        *limit = __vmread(GUEST_ES_LIMIT);
> +        tmp_ar = __vmread(GUEST_ES_AR_BYTES); 
> +    } else {
> +        printk("Unmatched segment selector:%d\n", sel);
> +        return 0;
> +    }
> +
> +    if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) {           /* x86 mess!! */
> +        *base = 0UL;
> +        *limit = ~0UL;
> +    }
> +    /* Fixup ar so that it looks the same as in native mode */
> +    *ar = (tmp_ar << 8);
I am not sure I follow. Are you doing this to fit in the other bits
of the segment (the upper limit)? Shouldn''t the caller of
vmx_pvh_read_descriptor do this?
> +    return 1;
> +}
> +
> diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h
b/xen/include/asm-x86/hvm/vmx/vmx.h
> index a742e16..5679e8d 100644
> --- a/xen/include/asm-x86/hvm/vmx/vmx.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
> @@ -189,6 +189,7 @@ void vmx_update_secondary_exec_control(struct vcpu *v);
>   * Access Rights
>   */
>  #define X86_SEG_AR_SEG_TYPE     0xf        /* 3:0, segment type */
> +#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment
*/
>  #define X86_SEG_AR_DESC_TYPE    (1u << 4)  /* 4, descriptor type */
>  #define X86_SEG_AR_DPL          0x60       /* 6:5, descriptor privilege
level */
>  #define X86_SEG_AR_SEG_PRESENT  (1u << 7)  /* 7, segment present */
> @@ -442,10 +443,14 @@ void ept_p2m_uninit(struct p2m_domain *p2m);
>  
>  void ept_walk_table(struct domain *d, unsigned long gfn);
>  void setup_ept_dump(void);
> -
Stray change.>  void vmx_update_guest_eip(void);
>  void vmx_dr_access(unsigned long exit_qualification,struct cpu_user_regs
*regs);
>  void vmx_do_extint(struct cpu_user_regs *regs);
> +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs);
> +int  vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context
*ctxtp);
> +int  vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
> +                         const struct cpu_user_regs *regs, unsigned long
*base,
> +                         unsigned long *limit, unsigned int *ar);
>  
>  int alloc_p2m_hap_data(struct p2m_domain *p2m);
>  void free_p2m_hap_data(struct p2m_domain *p2m);
> diff --git a/xen/include/asm-x86/pvh.h b/xen/include/asm-x86/pvh.h
> new file mode 100644
> index 0000000..73e59d3
> --- /dev/null
> +++ b/xen/include/asm-x86/pvh.h
> @@ -0,0 +1,6 @@
> +#ifndef __ASM_X86_PVH_H__
> +#define __ASM_X86_PVH_H__
> +
> +int pvh_do_hypercall(struct cpu_user_regs *regs);
> +
> +#endif  /* __ASM_X86_PVH_H__ */
> -- 
> 1.7.2.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
>

Tim Deegan

2013-Mar-21 16:49 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

> +/* Returns: rc == 0: handled the exception/NMI */
> +static int vmxit_exception(struct cpu_user_regs *regs)
> +{
> +    unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) &
INTR_INFO_VECTOR_MASK;
> +    int rc=1; 
> +    struct vcpu *vp = current;
> +
> +    dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, 
> +          __vmread(GUEST_CS_SELECTOR), regs->eip);
> +
> +    if (vector == TRAP_debug) {
Missing spaces (here and below) 
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        write_debugreg(6, exit_qualification | 0xffff0ff0);
> +        rc = 0;
> +        
> +        /* gdbsx or another debugger */
> +        if ( vp->domain->domain_id != 0 &&    /* never pause
dom0 */
> +             guest_kernel_mode(vp, regs) && 
vp->domain->debugger_attached )
> +        {
> +            domain_pause_for_debugger();
> +        } else {
> +            hvm_inject_hw_exception(TRAP_debug,
HVM_DELIVER_NO_ERROR_CODE);
> +        }
> +    } 
> +    if (vector == TRAP_int3) {
> +        rc = vmxit_int3(regs);
> +
> +    }  else if (vector == TRAP_invalid_op) {
> +        rc = vmxit_invalid_op(regs);
> +
> +    } else if (vector == TRAP_no_device) {
> +        hvm_funcs.fpu_dirty_intercept();  /* calls vmx_fpu_dirty_intercept
*/
> +        rc = 0;
> +
> +    } else if (vector == TRAP_gp_fault) {
> +        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
> +        /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */
???
> +        rc = 1;
> +
> +    } else if (vector == TRAP_page_fault) {
> +        printk("PVH: Unexpected vector page_fault. IP:%lx\n",
regs->eip);
> +        rc = 1;
> +    }
This probably ought to be a switch statement rather than a chain of
''else if''.  Then the single ''default'' case
would catch all unexpected
exits (AFAICS there are three different variations on that here).  And
since you''ve set the exception bitmap in the VMCS, presumably the
correct response to an unexpected trap type is to BUG(). 
> +    if (rc)
> +        printk("PVH: Unhandled trap vector:%d. IP:%lx\n",
vector, regs->eip);
> +
> +    return rc;
> +}
> +
> +static int vmxit_invlpg(void)
> +{
> +    ulong vaddr = __vmread(EXIT_QUALIFICATION);
> +
> +    vmx_update_guest_eip();
> +    vpid_sync_vcpu_gva(current, vaddr);
If there''s no special handling of invlpg, you should just not intercept
it.
> +    return 0;
> +}
>
> +/* Returns: rc == 0: success */
> +static int vmxit_cr_access(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
> +    int cr, rc = 1;
> +
> +    switch ( acc_typ )
> +    {
> +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
> +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
> +        {
> +            uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
> +            uint64_t *regp = decode_register(gpr, regs, 0);
> +            cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
> +
> +            if (regp == NULL)
> +                break;
> +
> +            /* pl don''t embed switch statements */
> +            if (cr == 0)
> +                rc = access_cr0(regs, acc_typ, regp);
> +            else if (cr == 3) {
> +                printk("PVH: d%d: unexpected cr3 access vmexit.
rip:%lx\n",
> +                       current->domain->domain_id, regs->rip);
> +                domain_crash_synchronous();
Again, ITYM BUG().  And again. probably a switch statement.

Tim.
> +            } else if (cr == 4) 
> +                rc = access_cr4(regs, acc_typ, regp);
> +
> +            if (rc == 0)
> +                vmx_update_guest_eip();
> +            break;
> +        }

Jan Beulich

2013-Mar-22 08:32 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

>>> On 21.03.13 at 17:49, Tim Deegan <tim@xen.org> wrote:
>> +            else if (cr == 3) {
>> +                printk("PVH: d%d: unexpected cr3 access vmexit.
rip:%lx\n",
> 
>> +                       current->domain->domain_id,
regs->rip);
>> +                domain_crash_synchronous();
> 
> Again, ITYM BUG().  And again. probably a switch statement.
Actually, in cases where just crashing the guest is safe (i.e. no
other execution state is corrupted), I think avoiding the use of
BUG() is quite desirable. I wouldn''t mind an ASSERT() though, to
catch developers'' attention.

Jan

Tim Deegan

2013-Mar-22 10:06 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

At 08:32 +0000 on 22 Mar (1363941127), Jan Beulich
wrote:> >>> On 21.03.13 at 17:49, Tim Deegan <tim@xen.org> wrote:
> >> +            else if (cr == 3) {
> >> +                printk("PVH: d%d: unexpected cr3 access
vmexit. rip:%lx\n",
> > 
> >> +                       current->domain->domain_id,
regs->rip);
> >> +                domain_crash_synchronous();
> > 
> > Again, ITYM BUG().  And again. probably a switch statement.
> 
> Actually, in cases where just crashing the guest is safe (i.e. no
> other execution state is corrupted), I think avoiding the use of
> BUG() is quite desirable. I wouldn''t mind an ASSERT() though, to
> catch developers'' attention.
Ack, ASSERT() + crash is better. 

Tim.

Mukesh Rathor

2013-Apr-02 00:08 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Mon, 18 Mar 2013 12:16:50 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:
> >>> On 16.03.13 at 01:41, Mukesh Rathor
<mukesh.rathor@oracle.com>
> >>> wrote:
> > +static int pvh_grant_table_op(
> > +              unsigned int cmd, XEN_GUEST_HANDLE(void) uop,
> > unsigned int count) +{
> > +    switch (cmd)
> > +    {
> > +        case GNTTABOP_map_grant_ref:
> > +        case GNTTABOP_unmap_grant_ref:
> > +        case GNTTABOP_setup_table:
> > +        case GNTTABOP_copy:
> > +        case GNTTABOP_query_size:
> > +        case GNTTABOP_set_version:
> > +            return do_grant_table_op(cmd, uop, count);
> 
> While for HVM guests such white lists may be appropriate, for PVH
> this doesn''t seem to be the case: The code would require updating
> whenever a new sub-hypercall gets added to any of the hypercalls
> dealt with like this.
Right, these are verified for PVH. As they are added, they would need
to be verfied to make sure OK for PVH. We can remove this later. I can
ifdef DEBUG if you''d like.

Mukesh Rathor

2013-Apr-02 01:26 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Mon, 18 Mar 2013 12:32:06 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> 
> > +}
> > +
> > +typedef unsigned long pvh_hypercall_t(
> > +    unsigned long, unsigned long, unsigned long, unsigned long,
> > unsigned long,
> > +    unsigned long);
> 
> No way to re-use the something standard? I am not sure what is
''PVH''
> specific to it? It looks like a garden variety normal hypercalls.
It does use standard do_xx calls, PV goes thru it''s own table, HVM thru
its own, and PVH thru its own. This was suggested to me early on, and
it''s easier this way since HVM and PVH are not same hcalls.
> > +    rc = 0;
> > +
> > +    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n",
> > regs->ecx, regs->eax, 
> > +          regs->edx, regs->rip, regs->rsp);
> > +    return rc;
> 
> This function looks to return 0 (or X86EMUL_OKAY) irregardless of the
> MSR? Perhaps just make it return X86EMUL_OKAY without bothering to
> use ''rc''?
> 
> > +}
> > +
> > +/* returns : 0 success */
> > +static int vmxit_msr_write(struct cpu_user_regs *regs)
> > +{
> > +    uint64_t msr_content = (uint32_t)regs->eax |
> > ((uint64_t)regs->edx << 32);
> > +    int rc=1;
> 
> X86EMUL_UNHANDLEABLE
> 
> > +
> > +    dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n",
> > regs->ecx, 
> > +          regs->eax,regs->edx);
> > +
> > +    if ( hvm_msr_write_intercept(regs->ecx, msr_content) =>
> X86EMUL_OKAY ) {
> > +        vmx_update_guest_eip();
> > +        rc = 0;
> 
> X86EMUL_OKAY
No, hide the X96EMUL_* from the caller of this function which deals
with rc or non-zero.
> > +    } else if (vector == TRAP_gp_fault) {
> > +        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
> > +        /* hvm_inject_hw_exception(TRAP_gp_fault,
> > regs->error_code); */
> 
> So how come we don''t inject it in?
Notice the exception bitmap doesn''t allow GP vmexits. But it helps
tremendously with debug to have this here. I often have this enabled
for debug.
> > +        rc = 1;
> 
> Huh? Not X86_EMUL_OK?
No. there''s no x86 emulation going on here??

> > +            if (regp == NULL)
> > +                break;
> > +
> > +            /* pl don''t embed switch statements */
> > +            if (cr == 0)
> > +                rc = access_cr0(regs, acc_typ, regp);
> > +            else if (cr == 3) {
> > +                printk("PVH: d%d: unexpected cr3 access vmexit.
> > rip:%lx\n", 
> > +                       current->domain->domain_id,
regs->rip);
> > +                domain_crash_synchronous();
> 
> Uh? Why wouldn''t we want to handle it?
Guest does it natively. 

> > +}
> > +
> > +/* NOTE: a PVH sets IOPL natively by setting bits in the eflags
> > and not by
> > + * hypercalls used by a PV */
> 
> 
> Ahh, so there are now five? PV hypercall families that should not be
> used:
> 
>  - PHYSDEVOP_set_iopl (which I think in your earlier patch you did
> not check for?)
>  - HYPERVISOR_update_va_mapping (for all the MMU stuff)
>  - HYPERVISOR_update_descriptor (segments and such)
>  - HYPERVISOR_fpu_taskswitch (you are doing it in the above function)
>  - HYPERVISOR_set_trap_table (again, LDT and GDT are now done via HVM)
>  - HYPERVISOR_set_segment_base
>  - HYPERVISOR_set_gdt
>  - HYPERVISOR_tmem
>  .. and host of other.
> 
> This should be documented somewhere in docs?
> Perhaps in docs/misc/pvh.txt and just outline which ones are not to
> be used anymore?
I am keeping track of all doc stuff, lets document in the end when I 
enable PVH. We''ll be changing stuff for a short while.
> Could you use ''cpuid'' macro defined in processor.h?
since we know in this fucntion we are on intel, we can just do cpuid.
the macro has somw quirks for cyrix cpus, and clears rcx. Besides this
is user mode cpuid that will exactly be like this on a pure PV guest.

thanks,
mukesh

Jan Beulich

2013-Apr-02 07:00 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

>>> On 02.04.13 at 02:08, Mukesh Rathor
<mukesh.rathor@oracle.com> wrote:
> On Mon, 18 Mar 2013 12:16:50 +0000
> "Jan Beulich" <JBeulich@suse.com> wrote:
> 
>> >>> On 16.03.13 at 01:41, Mukesh Rathor
<mukesh.rathor@oracle.com>
>> >>> wrote:
>> > +static int pvh_grant_table_op(
>> > +              unsigned int cmd, XEN_GUEST_HANDLE(void) uop,
>> > unsigned int count) +{
>> > +    switch (cmd)
>> > +    {
>> > +        case GNTTABOP_map_grant_ref:
>> > +        case GNTTABOP_unmap_grant_ref:
>> > +        case GNTTABOP_setup_table:
>> > +        case GNTTABOP_copy:
>> > +        case GNTTABOP_query_size:
>> > +        case GNTTABOP_set_version:
>> > +            return do_grant_table_op(cmd, uop, count);
>> 
>> While for HVM guests such white lists may be appropriate, for PVH
>> this doesn''t seem to be the case: The code would require
updating
>> whenever a new sub-hypercall gets added to any of the hypercalls
>> dealt with like this.
> 
> Right, these are verified for PVH. As they are added, they would need
> to be verfied to make sure OK for PVH. We can remove this later. I can
> ifdef DEBUG if you''d like.
Yes, some sort of code mark would be nice - after all, such code
has to go away before PVH can be declared production ready.

Jan

Konrad Rzeszutek Wilk

2013-Apr-02 14:10 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Mon, Apr 01, 2013 at 06:26:45PM -0700, Mukesh Rathor
wrote:> On Mon, 18 Mar 2013 12:32:06 -0400
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> 
> > 
> > > +}
> > > +
> > > +typedef unsigned long pvh_hypercall_t(
> > > +    unsigned long, unsigned long, unsigned long, unsigned long,
> > > unsigned long,
> > > +    unsigned long);
> > 
> > No way to re-use the something standard? I am not sure what is
''PVH''
> > specific to it? It looks like a garden variety normal hypercalls.
> 
> It does use standard do_xx calls, PV goes thru it''s own table, HVM
thru
> its own, and PVH thru its own. This was suggested to me early on, and
> it''s easier this way since HVM and PVH are not same hcalls.
I meant the typedef.
> 
> > > +    rc = 0;
> > > +
> > > +    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx
RSP:%lx\n",
> > > regs->ecx, regs->eax, 
> > > +          regs->edx, regs->rip, regs->rsp);
> > > +    return rc;
> > 
> > This function looks to return 0 (or X86EMUL_OKAY) irregardless of the
> > MSR? Perhaps just make it return X86EMUL_OKAY without bothering to
> > use ''rc''?
> > 
> > > +}
> > > +
> > > +/* returns : 0 success */
> > > +static int vmxit_msr_write(struct cpu_user_regs *regs)
> > > +{
> > > +    uint64_t msr_content = (uint32_t)regs->eax |
> > > ((uint64_t)regs->edx << 32);
> > > +    int rc=1;
> > 
> > X86EMUL_UNHANDLEABLE
> > 
> > > +
> > > +    dbgp1("PVH: msr write:0x%lx. eax:0x%lx
edx:0x%lx\n",
> > > regs->ecx, 
> > > +          regs->eax,regs->edx);
> > > +
> > > +    if ( hvm_msr_write_intercept(regs->ecx, msr_content)
=> > > X86EMUL_OKAY ) {
> > > +        vmx_update_guest_eip();
> > > +        rc = 0;
> > 
> > X86EMUL_OKAY
> 
> No, hide the X96EMUL_* from the caller of this function which deals
> with rc or non-zero.
In that case please make that part of the comment at the start. It
says: 0 success. But nothing about the failure path.
> 
> > > +    } else if (vector == TRAP_gp_fault) {
> > > +        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
> > > +        /* hvm_inject_hw_exception(TRAP_gp_fault,
> > > regs->error_code); */
> > 
> > So how come we don''t inject it in?
> 
> Notice the exception bitmap doesn''t allow GP vmexits. But it helps
> tremendously with debug to have this here. I often have this enabled
> for debug.
So #ifdef DEBUG ?
> 
> > > +        rc = 1;
> > 
> > Huh? Not X86_EMUL_OK?
> 
> No. there''s no x86 emulation going on here??
> 
> 
> > > +            if (regp == NULL)
> > > +                break;
> > > +
> > > +            /* pl don''t embed switch statements */
> > > +            if (cr == 0)
> > > +                rc = access_cr0(regs, acc_typ, regp);
> > > +            else if (cr == 3) {
> > > +                printk("PVH: d%d: unexpected cr3 access
vmexit.
> > > rip:%lx\n", 
> > > +                       current->domain->domain_id,
regs->rip);
> > > +                domain_crash_synchronous();
> > 
> > Uh? Why wouldn''t we want to handle it?
> 
> Guest does it natively. 
Ah, please put that in a comment right above it.
> 
> 
> > > +}
> > > +
> > > +/* NOTE: a PVH sets IOPL natively by setting bits in the eflags
> > > and not by
> > > + * hypercalls used by a PV */
> > 
> > 
> > Ahh, so there are now five? PV hypercall families that should not be
> > used:
> > 
> >  - PHYSDEVOP_set_iopl (which I think in your earlier patch you did
> > not check for?)
> >  - HYPERVISOR_update_va_mapping (for all the MMU stuff)
> >  - HYPERVISOR_update_descriptor (segments and such)
> >  - HYPERVISOR_fpu_taskswitch (you are doing it in the above function)
> >  - HYPERVISOR_set_trap_table (again, LDT and GDT are now done via HVM)
> >  - HYPERVISOR_set_segment_base
> >  - HYPERVISOR_set_gdt
> >  - HYPERVISOR_tmem
> >  .. and host of other.
> > 
> > This should be documented somewhere in docs?
> > Perhaps in docs/misc/pvh.txt and just outline which ones are not to
> > be used anymore?
> 
> I am keeping track of all doc stuff, lets document in the end when I 
> enable PVH. We''ll be changing stuff for a short while.
Why not make this ''doc stuff'' part of the patches? That way
when you
are done with one item you can have a patch to remove it out the TODO
list. Also this way other folks can look at it and if they have time
help you on some of the TODOs.
> 
> > Could you use ''cpuid'' macro defined in processor.h?
> 
> since we know in this fucntion we are on intel, we can just do cpuid.
> the macro has somw quirks for cyrix cpus, and clears rcx. Besides this
> is user mode cpuid that will exactly be like this on a pure PV guest.
I am not following you. Is the reason you are doing this b/c the
macro clears rcx?

> 
> thanks,
> mukesh

Mukesh Rathor

2013-Apr-03 01:29 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Tue, 2 Apr 2013 10:10:12 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> On Mon, Apr 01, 2013 at 06:26:45PM -0700, Mukesh Rathor wrote:
> > On Mon, 18 Mar 2013 12:32:06 -0400
> > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> > 
> > No, hide the X96EMUL_* from the caller of this function which deals
> > with rc or non-zero.
> 
> In that case please make that part of the comment at the start. It
> says: 0 success. But nothing about the failure path.
it''s better to have it say some thing than nothing! failure paths are
many and change over time, but when debugging its good to know right
away if a function succeeded.
> > >  - HYPERVISOR_set_segment_base
> > >  - HYPERVISOR_set_gdt
> > >  - HYPERVISOR_tmem
> > >  .. and host of other.
> > > 
> > > This should be documented somewhere in docs?
> > > Perhaps in docs/misc/pvh.txt and just outline which ones are not
> > > to be used anymore?
> > 
> > I am keeping track of all doc stuff, lets document in the end when
> > I enable PVH. We''ll be changing stuff for a short while.
> 
> Why not make this ''doc stuff'' part of the patches? That
way when you
> are done with one item you can have a patch to remove it out the TODO
> list. Also this way other folks can look at it and if they have time
> help you on some of the TODOs.
Lets keep things manageable. I have fixme''s in the code, and also
documentd
in the cover what needs to be done. When we enable PVH, we''ll know what
got done and what didn''t, and we can document it properly.
> > 
> > > Could you use ''cpuid'' macro defined in
processor.h?
> > 
> > since we know in this fucntion we are on intel, we can just do
> > cpuid. the macro has somw quirks for cyrix cpus, and clears rcx.
> > Besides this is user mode cpuid that will exactly be like this on a
> > pure PV guest.
> 
> I am not following you. Is the reason you are doing this b/c the
> macro clears rcx?
Correct.

Mukesh Rathor

2013-Apr-03 01:37 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Thu, 21 Mar 2013 16:49:12 +0000
Tim Deegan <tim@xen.org> wrote:
> ???
> 
> > +        rc = 1;
> > +
> > +    } else if (vector == TRAP_page_fault) {
> > +        printk("PVH: Unexpected vector page_fault.
IP:%lx\n",
> > regs->eip);
> > +        rc = 1;
> > +    }
> 
> This probably ought to be a switch statement rather than a chain of
> ''else if''.  Then the single ''default''
case would catch all unexpected
> exits (AFAICS there are three different variations on that here).  And
> since you''ve set the exception bitmap in the VMCS, presumably the
> correct response to an unexpected trap type is to BUG(). 
Changed to switch.
> > +    return rc;
> > +}
> > +
> > +static int vmxit_invlpg(void)
> > +{
> > +    ulong vaddr = __vmread(EXIT_QUALIFICATION);
> > +
> > +    vmx_update_guest_eip();
> > +    vpid_sync_vcpu_gva(current, vaddr);
> 
> If there''s no special handling of invlpg, you should just not
> intercept it. 
well, we call vpid_sync_vcpu_gva(), but we can optimize this further
in future.
> > __vmread(EXIT_QUALIFICATION);
> > +    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
> > +    int cr, rc = 1;
> > +
> > +    switch ( acc_typ )
> > +    {
> > +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
> > +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
> > +        {
> > +            uint gpr > >
VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
> > +            uint64_t *regp = decode_register(gpr, regs, 0);
> > +            cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
> > +
> > +            if (regp == NULL)
> > +                break;
> > +
> > +            /* pl don''t embed switch statements */ 
<=======> > +            if (cr == 0)
> > +                rc = access_cr0(regs, acc_typ, regp);
> > +            else if (cr == 3) {
> > +                printk("PVH: d%d: unexpected cr3 access vmexit.
> > rip:%lx\n", 
> > +                       current->domain->domain_id,
regs->rip);
> > +                domain_crash_synchronous();
> 
> Again, ITYM BUG().  And again. probably a switch statement.
No, lets not embed switch statements please, that is obfuscating.

thanks,
Mukesh

Mukesh Rathor

2013-Apr-03 01:42 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Mon, 18 Mar 2013 12:32:06 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> On Fri, Mar 15, 2013 at 05:41:45PM -0700, Mukesh Rathor wrote:
> > +        tmp_ar = __vmread(GUEST_GS_AR_BYTES); 
> > +    } else if (sel == (unsigned int)regs->fs) {
> > +        *base = __vmread(GUEST_FS_BASE);
> > +        *limit = __vmread(GUEST_FS_LIMIT);
> > +        tmp_ar = __vmread(GUEST_FS_AR_BYTES); 
> > +    } else if (sel == (unsigned int)regs->es) {
> > +        *base = __vmread(GUEST_ES_BASE);
> > +        *limit = __vmread(GUEST_ES_LIMIT);
> > +        tmp_ar = __vmread(GUEST_ES_AR_BYTES); 
> > +    } else {
> > +        printk("Unmatched segment selector:%d\n", sel);
> > +        return 0;
> > +    }
> > +
> > +    if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) {           /* x86
> > mess!! */
> > +        *base = 0UL;
> > +        *limit = ~0UL;
> > +    }
> > +    /* Fixup ar so that it looks the same as in native mode */
> > +    *ar = (tmp_ar << 8);
> 
> I am not sure I follow. Are you doing this to fit in the other bits
> of the segment (the upper limit)? Shouldn''t the caller of
> vmx_pvh_read_descriptor do this?
It follows the semantics of the existing read_descriptor() function.


thanks
Mukesh

Mukesh Rathor

2013-Apr-03 01:45 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Tue, 2 Apr 2013 10:10:12 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> On Mon, Apr 01, 2013 at 06:26:45PM -0700, Mukesh Rathor wrote:
> > On Mon, 18 Mar 2013 12:32:06 -0400
> > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> > 
> > > 
> > > > +}
> > > > +
> > > > +typedef unsigned long pvh_hypercall_t(
> > > > +    unsigned long, unsigned long, unsigned long, unsigned
long,
> > > > unsigned long,
> > > > +    unsigned long);
> > > 
> > > No way to re-use the something standard? I am not sure what is
> > > ''PVH'' specific to it? It looks like a garden
variety normal
> > > hypercalls.
> > 
> > It does use standard do_xx calls, PV goes thru it''s own
table, HVM
> > thru its own, and PVH thru its own. This was suggested to me early
> > on, and it''s easier this way since HVM and PVH are not same
hcalls.
> 
> I meant the typedef.
Ok, I could move the hvm_typedef* to header, and use that. no biggie.

Jan Beulich

2013-Apr-03 08:06 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

>>> On 03.04.13 at 03:37, Mukesh Rathor
<mukesh.rathor@oracle.com> wrote:
> On Thu, 21 Mar 2013 16:49:12 +0000 Tim Deegan <tim@xen.org> wrote:
>> > +    uint acc_typ =
VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
>> > +    int cr, rc = 1;
>> > +
>> > +    switch ( acc_typ )
>> > +    {
>> > +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
>> > +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
>> > +        {
>> > +            uint gpr >> >
VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
>> > +            uint64_t *regp = decode_register(gpr, regs, 0);
>> > +            cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
>> > +
>> > +            if (regp == NULL)
>> > +                break;
>> > +
>> > +            /* pl don''t embed switch statements */ 
<=======>> > +            if (cr == 0)
>> > +                rc = access_cr0(regs, acc_typ, regp);
>> > +            else if (cr == 3) {
>> > +                printk("PVH: d%d: unexpected cr3 access
vmexit.
>> > rip:%lx\n", 
>> > +                       current->domain->domain_id,
regs->rip);
>> > +                domain_crash_synchronous();
>> 
>> Again, ITYM BUG().  And again. probably a switch statement.
> 
> No, lets not embed switch statements please, that is obfuscating.
I don''t see a reason not to. Long rows of if/else-if are at least as
obfuscating.

Jan

Konrad Rzeszutek Wilk

2013-Apr-03 14:40 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Tue, Apr 02, 2013 at 06:29:18PM -0700, Mukesh Rathor
wrote:> On Tue, 2 Apr 2013 10:10:12 -0400
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> 
> > On Mon, Apr 01, 2013 at 06:26:45PM -0700, Mukesh Rathor wrote:
> > > On Mon, 18 Mar 2013 12:32:06 -0400
> > > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> > > 
> > > No, hide the X96EMUL_* from the caller of this function which
deals
> > > with rc or non-zero.
> > 
> > In that case please make that part of the comment at the start. It
> > says: 0 success. But nothing about the failure path.
> 
> it''s better to have it say some thing than nothing! failure paths
are
> many and change over time, but when debugging its good to know right
> away if a function succeeded.
Sure. I am not saying that the comment is bad. I am just saying add
also the error conditions.

I don''t think the error return values (whether they are positive or
negative) is going to change over time. But if they are, then the
comments can be updated at that time too.
> 
> > > >  - HYPERVISOR_set_segment_base
> > > >  - HYPERVISOR_set_gdt
> > > >  - HYPERVISOR_tmem
> > > >  .. and host of other.
> > > > 
> > > > This should be documented somewhere in docs?
> > > > Perhaps in docs/misc/pvh.txt and just outline which ones are
not
> > > > to be used anymore?
> > > 
> > > I am keeping track of all doc stuff, lets document in the end
when
> > > I enable PVH. We''ll be changing stuff for a short while.
> > 
> > Why not make this ''doc stuff'' part of the patches?
That way when you
> > are done with one item you can have a patch to remove it out the TODO
> > list. Also this way other folks can look at it and if they have time
> > help you on some of the TODOs.
> 
> Lets keep things manageable. I have fixme''s in the code, and also
documentd
> in the cover what needs to be done. When we enable PVH, we''ll know
what
> got done and what didn''t, and we can document it properly.
Kind off. We forgot about the 32-bit until Jan realized it. That should be
part of a TODO.

I am a big proponent of putting out a list of ''these are things that
are busted''. That way if somebody is trying to get familiar with this
they can work on off the bugs or items without having to stumble trying
to find the TODOs.

In other words - making it easier for other folks to help you with this.
> 
> > > 
> > > > Could you use ''cpuid'' macro defined in
processor.h?
> > > 
> > > since we know in this fucntion we are on intel, we can just do
> > > cpuid. the macro has somw quirks for cyrix cpus, and clears rcx.
> > > Besides this is user mode cpuid that will exactly be like this on
a
> > > pure PV guest.
> > 
> > I am not following you. Is the reason you are doing this b/c the
> > macro clears rcx?
> 
> Correct.
Please state that as a comment in the code then. It helps as in a couple
of years (months?) somebody will notice this and say: Geey, why not
just use the #define cpuid.

This will give them enough information to make the proper decision
(which might be making the macro _not_ clear the rcx).
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
>

Mukesh Rathor

2013-Apr-03 23:38 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Wed, 03 Apr 2013 09:06:18 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:
> >>> On 03.04.13 at 03:37, Mukesh Rathor
<mukesh.rathor@oracle.com>
> >>> wrote:
> > On Thu, 21 Mar 2013 16:49:12 +0000 Tim Deegan <tim@xen.org>
wrote:
> >> > +    uint acc_typ > >> >
VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
> >> > +    int cr, rc = 1;
> >> > +
> >> > +    switch ( acc_typ )
> >> > +    {
> >> > +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
> >> > +        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
> >> > +        {
> >> > +            uint gpr > >> >
VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
> >> > +            uint64_t *regp = decode_register(gpr, regs, 0);
> >> > +            cr =
VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
> >> > +
> >> > +            if (regp == NULL)
> >> > +                break;
> >> > +
> >> > +            /* pl don''t embed switch statements */ 
<=======> >> > +            if (cr == 0)
> >> > +                rc = access_cr0(regs, acc_typ, regp);
> >> > +            else if (cr == 3) {
> >> > +                printk("PVH: d%d: unexpected cr3 access
vmexit.
> >> > rip:%lx\n", 
> >> > +                       current->domain->domain_id,
regs->rip);
> >> > +                domain_crash_synchronous();
> >> 
> >> Again, ITYM BUG().  And again. probably a switch statement.
> > 
> > No, lets not embed switch statements please, that is obfuscating.
> 
> I don''t see a reason not to. Long rows of if/else-if are at least
as
> obfuscating.
Not sure I agree. Long switch statements, as few places in xen code,
make it hard to figure which break goes with which switch, and that
makes code reading take longer.

But whatever. I''ll change to switch. 

Mukesh

Mukesh Rathor

2013-Apr-04 01:01 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Mon, 18 Mar 2013 12:32:06 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> On Fri, Mar 15, 2013 at 05:41:45PM -0700, Mukesh Rathor wrote:
> > +            return 1;
> > +        }
> > +
> > +        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
> > +        /* ET is reserved and should be always be 1. */
> > +        new_cr0 |= X86_CR0_ET;
> > +
> > +        /* pvh cannot change to real mode */
> > +        if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) !> >
(X86_CR0_PG|X86_CR0_PE) ) {
> > +            printk("PVH attempting to turn off PE/PG.
CR0:%lx\n",
> > new_cr0);
> > +            return 1;
> > +        }
> > +        /* TS going from 1 to 0 */
> > +        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 &
> > X86_CR0_TS)==0) )
> > +            vmx_fpu_enter(vp);
> 
> Does this really happen? I thought in the PV mode you would be using
> the hypercalls for the fpu swap? Should it be print out an error
> saying something to the effect:
> 
> 	"PVH guest is using cr0 instead of the paravirt lazy FPU
> switch!" and include the EIP?
Yes. At present in linux PVH, we use native_clts and not xen_clts pv
ops call. But another PVH guest might not want to use hypercalls, right?

thanks,
Mukesh

Jan Beulich

2013-Apr-04 08:23 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

>>> On 04.04.13 at 03:01, Mukesh Rathor
<mukesh.rathor@oracle.com> wrote:
> On Mon, 18 Mar 2013 12:32:06 -0400
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> 
>> On Fri, Mar 15, 2013 at 05:41:45PM -0700, Mukesh Rathor wrote:
>> > +            return 1;
>> > +        }
>> > +
>> > +        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
>> > +        /* ET is reserved and should be always be 1. */
>> > +        new_cr0 |= X86_CR0_ET;
>> > +
>> > +        /* pvh cannot change to real mode */
>> > +        if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) !>>
> (X86_CR0_PG|X86_CR0_PE) ) {
>> > +            printk("PVH attempting to turn off PE/PG.
CR0:%lx\n",
>> > new_cr0);
>> > +            return 1;
>> > +        }
>> > +        /* TS going from 1 to 0 */
>> > +        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0
&
>> > X86_CR0_TS)==0) )
>> > +            vmx_fpu_enter(vp);
>> 
>> Does this really happen? I thought in the PV mode you would be using
>> the hypercalls for the fpu swap? Should it be print out an error
>> saying something to the effect:
>> 
>> 	"PVH guest is using cr0 instead of the paravirt lazy FPU
>> switch!" and include the EIP?
> 
> Yes. At present in linux PVH, we use native_clts and not xen_clts pv
> ops call. But another PVH guest might not want to use hypercalls, right?
Correct. Even more so with there being CR read/write emulation
for PV guests, including for the CLTS instruction. So this case
needs to be handled in any case.

Jan

Tim Deegan

2013-Apr-04 09:12 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

At 18:37 -0700 on 02 Apr (1364927854), Mukesh Rathor
wrote:> On Thu, 21 Mar 2013 16:49:12 +0000 Tim Deegan <tim@xen.org> wrote:
> > > +static int vmxit_invlpg(void)
> > > +{
> > > +    ulong vaddr = __vmread(EXIT_QUALIFICATION);
> > > +
> > > +    vmx_update_guest_eip();
> > > +    vpid_sync_vcpu_gva(current, vaddr);
> > 
> > If there''s no special handling of invlpg, you should just not
> > intercept it. 
> 
> well, we call vpid_sync_vcpu_gva(), but we can optimize this further
> in future.
It would be trivial just to drop the intercept now.  All
vpid_sync_vcpu_gva() does is emulate (potentially rather inefficiently)
the invlpg that''s been intercepted.

Tim.

Mukesh Rathor

2013-Apr-04 23:00 UTC

head link

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

On Thu, 4 Apr 2013 10:12:19 +0100
Tim Deegan <tim@xen.org> wrote:
> At 18:37 -0700 on 02 Apr (1364927854), Mukesh Rathor wrote:
> > On Thu, 21 Mar 2013 16:49:12 +0000 Tim Deegan <tim@xen.org>
wrote:
> > > > +static int vmxit_invlpg(void)
> > > > +{
> > > > +    ulong vaddr = __vmread(EXIT_QUALIFICATION);
> > > > +
> > > > +    vmx_update_guest_eip();
> > > > +    vpid_sync_vcpu_gva(current, vaddr);
> > > 
> > > If there''s no special handling of invlpg, you should
just not
> > > intercept it. 
> > 
> > well, we call vpid_sync_vcpu_gva(), but we can optimize this further
> > in future.
> 
> It would be trivial just to drop the intercept now.  All
> vpid_sync_vcpu_gva() does is emulate (potentially rather
> inefficiently) the invlpg that''s been intercepted.
Ok, got it now. Thanks.

Xen devel - Mar 2013 - [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

[PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c

Re: [PATCH 10/18 V2]: PVH xen: introduce vmx_pvh.c and pvh.c