The heart of this patch is the VMX exit handler for PVH guests. It is
nicely isolated in a separate module; a call to it is added to
vmx_vmexit_handler().

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
diff -r eca698a4e733 -r 0a38c610f26b xen/arch/x86/hvm/vmx/Makefile
--- a/xen/arch/x86/hvm/vmx/Makefile Fri Jan 11 16:32:36 2013 -0800
+++ b/xen/arch/x86/hvm/vmx/Makefile Fri Jan 11 16:34:17 2013 -0800
@@ -5,3 +5,4 @@ obj-y += vmcs.o
obj-y += vmx.o
obj-y += vpmu_core2.o
obj-y += vvmx.o
+obj-y += vmx_pvh.o
diff -r eca698a4e733 -r 0a38c610f26b xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Fri Jan 11 16:32:36 2013 -0800
+++ b/xen/arch/x86/hvm/vmx/vmx.c Fri Jan 11 16:34:17 2013 -0800
@@ -1546,7 +1546,9 @@ static struct hvm_function_table __read_
.nhvm_intr_blocked = nvmx_intr_blocked,
.nhvm_domain_relinquish_resources = nvmx_domain_relinquish_resources,
.update_eoi_exit_bitmap = vmx_update_eoi_exit_bitmap,
- .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled
+ .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled,
+ .pvh_set_vcpu_info = vmx_pvh_set_vcpu_info,
+ .pvh_read_descriptor = vmx_pvh_read_descriptor
};
struct hvm_function_table * __init start_vmx(void)
@@ -2280,6 +2282,14 @@ void vmx_vmexit_handler(struct cpu_user_
perfc_incra(vmexits, exit_reason);
+ if ( is_pvh_vcpu(v) ) {
+ if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
+ return vmx_failed_vmentry(exit_reason, regs);
+
+ vmx_pvh_vmexit_handler(regs);
+ return;
+ }
+
/* Handle the interrupt we missed before allowing any more in. */
switch ( (uint16_t)exit_reason )
{
diff -r eca698a4e733 -r 0a38c610f26b xen/arch/x86/hvm/vmx/vmx_pvh.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c Fri Jan 11 16:34:17 2013 -0800
@@ -0,0 +1,849 @@
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/irq.h>
+#include <xen/softirq.h>
+#include <xen/domain_page.h>
+#include <xen/hypercall.h>
+#include <xen/guest_access.h>
+#include <xen/perfc.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/regs.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/types.h>
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/spinlock.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/traps.h>
+#include <asm/mem_sharing.h>
+#include <asm/hvm/emulate.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/support.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <public/sched.h>
+#include <public/hvm/ioreq.h>
+#include <asm/hvm/vpic.h>
+#include <asm/hvm/vlapic.h>
+#include <asm/x86_emulate.h>
+#include <asm/hvm/vpt.h>
+#include <public/hvm/save.h>
+#include <asm/hvm/trace.h>
+#include <asm/xenoprof.h>
+#include <asm/debugger.h>
+
+volatile int pvhdbg=0;
+#define dbgp0(...) dprintk(XENLOG_ERR, __VA_ARGS__);
+#define dbgp1(...) {(pvhdbg==1) ? printk(__VA_ARGS__):0;}
+#define dbgp2(...) {(pvhdbg==2) ? printk(__VA_ARGS__):0;}
+
+
+static void read_vmcs_selectors(struct cpu_user_regs *regs)
+{
+ regs->cs = vmr(GUEST_CS_SELECTOR);
+ regs->ss = vmr(GUEST_SS_SELECTOR);
+ regs->ds = vmr(GUEST_DS_SELECTOR);
+ regs->es = vmr(GUEST_ES_SELECTOR);
+ regs->gs = vmr(GUEST_GS_SELECTOR);
+ regs->fs = vmr(GUEST_FS_SELECTOR);
+}
+
+/* returns : 0 success */
+static noinline int vmxit_msr_read(struct cpu_user_regs *regs)
+{
+ int rc=1;
+
+ u64 msr_content = 0;
+ switch (regs->ecx)
+ {
+ case MSR_IA32_MISC_ENABLE:
+ {
+ rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
+ msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
+ break;
+ }
+ default:
+ {
+ /* fixme: see hvm_msr_read_intercept() */
+ rdmsrl(regs->ecx, msr_content);
+ break;
+ }
+ }
+ regs->eax = (uint32_t)msr_content;
+ regs->edx = (uint32_t)(msr_content >> 32);
+ update_guest_eip();
+ rc = 0;
+
+    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx, regs->eax,
+          regs->edx, vmr(GUEST_RIP), vmr(GUEST_RSP));
+ return rc;
+}
+
+/* returns : 0 success */
+static noinline int vmxit_msr_write(struct cpu_user_regs *regs)
+{
+    uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
+ int rc=1;
+
+    dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx,
+          regs->eax, regs->edx);
+
+ if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) {
+ update_guest_eip();
+ rc = 0;
+ }
+ return rc;
+}
+
+/* Returns: rc == 0: handled the MTF vmexit */
+static noinline int vmxit_mtf(struct cpu_user_regs *regs)
+{
+ struct vcpu *vp = current;
+ int rc=1, ss=vp->arch.hvm_vcpu.single_step;
+
+ dbgp2("\n");
+ vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+ __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
+ vp->arch.hvm_vcpu.single_step = 0;
+
+ if ( vp->domain->debugger_attached && ss ) {
+ domain_pause_for_debugger();
+ rc = 0;
+ }
+ return rc;
+}
+
+static noinline int vmxit_int3(struct cpu_user_regs *regs)
+{
+ int ilen = get_instruction_length();
+ struct vcpu *vp = current;
+ struct hvm_trap trap_info = {
+ .vector = TRAP_int3,
+ .type = X86_EVENTTYPE_SW_EXCEPTION,
+ .error_code = HVM_DELIVER_NO_ERROR_CODE,
+ .insn_len = ilen
+ };
+
+ regs->eip += ilen;
+
+ /* gdbsx or another debugger. Never pause dom0 */
+    if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp, regs) )
+    {
+        dbgp1("[%d]PVH: domain pause for debugger\n", smp_processor_id());
+ current->arch.gdbsx_vcpu_event = TRAP_int3;
+ domain_pause_for_debugger();
+ return 0;
+ }
+
+ regs->eip -= ilen;
+ hvm_inject_trap(&trap_info);
+
+ return 0;
+}
+
+/* Returns: rc == 0: handled the exception/NMI */
+static noinline int vmxit_exception(struct cpu_user_regs *regs)
+{
+    unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) & INTR_INFO_VECTOR_MASK;
+ int rc=1;
+ struct vcpu *vp = current;
+
+    dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, vmr(GUEST_CS_SELECTOR),
+          regs->eip);
+
+ if (vector == TRAP_debug) {
+ unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+ write_debugreg(6, exit_qualification | 0xffff0ff0);
+ regs->rip = vmr(GUEST_RIP);
+ regs->rsp = vmr(GUEST_RSP);
+ rc = 0;
+
+ /* gdbsx or another debugger */
+        if ( vp->domain->domain_id != 0 &&        /* never pause dom0 */
+             guest_kernel_mode(vp, regs) && vp->domain->debugger_attached )
+ {
+ domain_pause_for_debugger();
+ } else {
+ hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
+ }
+ }
+ if (vector == TRAP_int3) {
+ rc = vmxit_int3(regs);
+ }
+
+ if (vector == TRAP_invalid_op) {
+        if ( guest_kernel_mode(vp, regs) || emulate_forced_invalid_op(regs) == 0 )
+ {
+            hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+ rc = 0;
+ }
+
+ }
+ if (vector == TRAP_no_device) {
+ hvm_funcs.fpu_dirty_intercept(); /* calls vmx_fpu_dirty_intercept */
+ rc = 0;
+ }
+
+ if (vector == TRAP_gp_fault) {
+ regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
+ /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */
+ rc = 1;
+ }
+
+ if (vector == TRAP_page_fault) {
+        printk("PVH: Unexpected vector page_fault. IP:%lx\n", regs->eip);
+ rc = 1;
+ }
+
+ if (rc)
+        printk("PVH: Unhandled trap vector:%d. IP:%lx\n", vector, regs->eip);
+ return rc;
+}
+
+static noinline int vmxit_invlpg(void)
+{
+ ulong vaddr = __vmread(EXIT_QUALIFICATION);
+
+ update_guest_eip();
+ vpid_sync_vcpu_gva(current, vaddr);
+ return 0;
+}
+
+static noinline int pvh_grant_table_op(
+ unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
+{
+ switch (cmd)
+ {
+ case GNTTABOP_map_grant_ref:
+ case GNTTABOP_unmap_grant_ref:
+ case GNTTABOP_setup_table:
+ case GNTTABOP_copy:
+ case GNTTABOP_query_size:
+ case GNTTABOP_set_version:
+ return do_grant_table_op(cmd, uop, count);
+ }
+ return -ENOSYS;
+}
+
+static long pvh_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+{
+ long rc = -ENOSYS;
+
+ switch ( cmd )
+ {
+ case VCPUOP_register_runstate_memory_area:
+ case VCPUOP_get_runstate_info:
+ case VCPUOP_set_periodic_timer:
+ case VCPUOP_stop_periodic_timer:
+ case VCPUOP_set_singleshot_timer:
+ case VCPUOP_stop_singleshot_timer:
+ case VCPUOP_is_up:
+ case VCPUOP_up:
+ case VCPUOP_initialise:
+ rc = do_vcpu_op(cmd, vcpuid, arg);
+
+ /* pvh boot vcpu setting context for bringing up smp vcpu */
+ if (cmd == VCPUOP_initialise)
+ vmx_vmcs_enter(current);
+ }
+ return rc;
+}
+
+static long pvh_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
+{
+ switch ( cmd )
+ {
+ case PHYSDEVOP_map_pirq:
+ case PHYSDEVOP_unmap_pirq:
+ case PHYSDEVOP_eoi:
+ case PHYSDEVOP_irq_status_query:
+ case PHYSDEVOP_get_free_pirq:
+ return do_physdev_op(cmd, arg);
+
+ default:
+ if ( IS_PRIV(current->domain) )
+ return do_physdev_op(cmd, arg);
+ }
+ return -ENOSYS;
+}
+
+static noinline long do_pvh_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
+{
+ long rc = -EINVAL;
+ struct xen_hvm_param harg;
+ struct domain *d;
+
+ if ( copy_from_guest(&harg, arg, 1) )
+ return -EFAULT;
+
+ rc = rcu_lock_target_domain_by_id(harg.domid, &d);
+ if ( rc != 0 )
+ return rc;
+
+ if (is_hvm_domain(d)) {
+ /* pvh dom0 is building an hvm guest */
+ rcu_unlock_domain(d);
+ return do_hvm_op(op, arg);
+ }
+
+ rc = -ENOSYS;
+ if (op == HVMOP_set_param) {
+ if (harg.index == HVM_PARAM_CALLBACK_IRQ) {
+ struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+ uint64_t via = harg.value;
+ uint8_t via_type = (uint8_t)(via >> 56) + 1;
+
+ if (via_type == HVMIRQ_callback_vector) {
+ hvm_irq->callback_via_type = HVMIRQ_callback_vector;
+ hvm_irq->callback_via.vector = (uint8_t)via;
+ rc = 0;
+ }
+ }
+ }
+ rcu_unlock_domain(d);
+ return rc;
+}
+
+typedef unsigned long pvh_hypercall_t(
+ unsigned long, unsigned long, unsigned long, unsigned long, unsigned long,
+ unsigned long);
+
+int hcall_a[NR_hypercalls];
+
+static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = {
+ [__HYPERVISOR_platform_op] = (pvh_hypercall_t *)do_platform_op,
+ [__HYPERVISOR_memory_op] = (pvh_hypercall_t *)do_memory_op,
+ /* [__HYPERVISOR_set_timer_op] = (pvh_hypercall_t *)do_set_timer_op, */
+ [__HYPERVISOR_xen_version] = (pvh_hypercall_t *)do_xen_version,
+ [__HYPERVISOR_console_io] = (pvh_hypercall_t *)do_console_io,
+ [__HYPERVISOR_grant_table_op] = (pvh_hypercall_t *)pvh_grant_table_op,
+ [__HYPERVISOR_vcpu_op] = (pvh_hypercall_t *)pvh_vcpu_op,
+ [__HYPERVISOR_mmuext_op] = (pvh_hypercall_t *)do_mmuext_op,
+ [__HYPERVISOR_xsm_op] = (pvh_hypercall_t *)do_xsm_op,
+ [__HYPERVISOR_sched_op] = (pvh_hypercall_t *)do_sched_op,
+ [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t *)do_event_channel_op,
+ [__HYPERVISOR_physdev_op] = (pvh_hypercall_t *)pvh_physdev_op,
+ [__HYPERVISOR_hvm_op] = (pvh_hypercall_t *)do_pvh_hvm_op,
+ [__HYPERVISOR_sysctl] = (pvh_hypercall_t *)do_sysctl,
+ [__HYPERVISOR_domctl] = (pvh_hypercall_t *)do_domctl
+};
+
+/* fixme: Do we need to worry about this and slow things down in this path? */
+static int pvh_long_mode_enabled(void)
+{
+ /* A 64bit linux guest should always run in this mode with CS.L selecting
+ * either 64bit mode or 32bit compat mode */
+ return 1;
+}
+
+/* Check if hypercall is valid
+ * Returns: 0 if hcall is not valid with eax set to the errno to ret to guest
+ */
+static noinline int hcall_valid(struct cpu_user_regs *regs)
+{
+ struct segment_register sreg;
+
+ if (!pvh_long_mode_enabled()) {
+ printk("PVH Error: Expected long mode set\n");
+ return 1;
+ }
+ hvm_get_segment_register(current, x86_seg_ss, &sreg);
+ if ( unlikely(sreg.attr.fields.dpl == 3) ) {
+ regs->eax = -EPERM;
+ return 0;
+ }
+
+    /* domU's are not allowed the following hcalls */
+ if ( !IS_PRIV(current->domain) &&
+ (regs->eax == __HYPERVISOR_xsm_op ||
+ regs->eax == __HYPERVISOR_platform_op ||
+ regs->eax == __HYPERVISOR_domctl) ) { /* for privcmd mmap */
+
+ regs->eax = -EPERM;
+ return 0;
+ }
+ return 1;
+}
+
+static noinline int vmxit_vmcall(struct cpu_user_regs *regs)
+{
+ uint32_t hnum = regs->eax;
+
+ if (hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] ==NULL)
+ {
+        dbgp0("PVH[%d]: UnImplemented HCALL:%ld. Ret -ENOSYS IP:%lx SP:%lx\n",
+              smp_processor_id(), regs->eax, vmr(GUEST_RIP), vmr(GUEST_RSP));
+ regs->eax = -ENOSYS;
+ update_guest_eip();
+ return HVM_HCALL_completed;
+ }
+
+ dbgp2("vmxit_vmcall: hcall eax:$%ld\n", regs->eax);
+    if (regs->eax == __HYPERVISOR_sched_op && regs->rdi == SCHEDOP_shutdown) {
+ /* PVH fixme: go thru this again to make sure nothing is left out */
+ printk("PVH: FIXME: SCHEDOP_shutdown hcall\n");
+ regs->eax = -ENOSYS;
+ update_guest_eip();
+ domain_crash_synchronous();
+ return HVM_HCALL_completed;
+ }
+
+ if ( !hcall_valid(regs) )
+ return HVM_HCALL_completed;
+
+ /* PVH fixme: search for this and do it. PV method will not work */
+ current->arch.hvm_vcpu.hcall_preempted = 0;
+
+    regs->rax = pvh_hypercall64_table[hnum](regs->rdi, regs->rsi, regs->rdx,
+                                            regs->r10, regs->r8, regs->r9);
+
+ if ( !current->arch.hvm_vcpu.hcall_preempted )
+ update_guest_eip();
+ else
+ printk("PVH: Hcall :%d preempted\n", hnum);
+
+ return HVM_HCALL_completed;
+}
+
+static noinline uint64_t *get_gpr_ptr(struct cpu_user_regs *regs, uint gpr)
+{
+ switch (gpr)
+ {
+    case VMX_CONTROL_REG_ACCESS_GPR_EAX:
+        return &regs->eax;
+    case VMX_CONTROL_REG_ACCESS_GPR_ECX:
+        return &regs->ecx;
+    case VMX_CONTROL_REG_ACCESS_GPR_EDX:
+        return &regs->edx;
+    case VMX_CONTROL_REG_ACCESS_GPR_EBX:
+        return &regs->ebx;
+    case VMX_CONTROL_REG_ACCESS_GPR_ESP:
+        return &regs->esp;
+    case VMX_CONTROL_REG_ACCESS_GPR_EBP:
+        return &regs->ebp;
+    case VMX_CONTROL_REG_ACCESS_GPR_ESI:
+        return &regs->esi;
+    case VMX_CONTROL_REG_ACCESS_GPR_EDI:
+        return &regs->edi;
+    case VMX_CONTROL_REG_ACCESS_GPR_R8:
+        return &regs->r8;
+    case VMX_CONTROL_REG_ACCESS_GPR_R9:
+        return &regs->r9;
+    case VMX_CONTROL_REG_ACCESS_GPR_R10:
+        return &regs->r10;
+    case VMX_CONTROL_REG_ACCESS_GPR_R11:
+        return &regs->r11;
+    case VMX_CONTROL_REG_ACCESS_GPR_R12:
+        return &regs->r12;
+    case VMX_CONTROL_REG_ACCESS_GPR_R13:
+        return &regs->r13;
+    case VMX_CONTROL_REG_ACCESS_GPR_R14:
+        return &regs->r14;
+    case VMX_CONTROL_REG_ACCESS_GPR_R15:
+        return &regs->r15;
+ default:
+ return NULL;
+ }
+}
+/* Returns: rc == 0: success */
+static noinline int access_cr0(struct cpu_user_regs *regs, uint acc_typ,
+ uint64_t *regp)
+{
+ struct vcpu *vp = current;
+
+ if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+ {
+ unsigned long new_cr0 = *regp;
+ unsigned long old_cr0 = __vmread(GUEST_CR0);
+
+        dbgp2("PVH:writing to CR0. RIP:%lx val:0x%lx\n", vmr(GUEST_RIP), *regp);
+ if ( (u32)new_cr0 != new_cr0 )
+ {
+            HVM_DBG_LOG(DBG_LEVEL_1,
+                        "Guest setting upper 32 bits in CR0: %lx", new_cr0);
+ return 1;
+ }
+
+ new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
+ /* ET is reserved and should be always be 1. */
+ new_cr0 |= X86_CR0_ET;
+
+ /* pvh cannot change to real mode */
+        if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) != (X86_CR0_PG|X86_CR0_PE) ) {
+            printk("PVH domU attempting to turn off PE/PG. CR0:%lx\n", new_cr0);
+ return 1;
+ }
+ /* TS going from 1 to 0 */
+        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS) == 0) )
+ vmx_fpu_enter(vp);
+
+        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0;
+ __vmwrite(GUEST_CR0, new_cr0);
+ __vmwrite(CR0_READ_SHADOW, new_cr0);
+ } else {
+ *regp = __vmread(GUEST_CR0);
+ }
+ return 0;
+}
+
+/* Returns: rc == 0: success */
+static noinline int access_cr4(struct cpu_user_regs *regs, uint acc_typ,
+ uint64_t *regp)
+{
+ if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+ {
+ u64 old_cr4 = __vmread(GUEST_CR4);
+
+        if ( (old_cr4 ^ (*regp)) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
+ vpid_sync_all();
+
+ /* pvh_verify_cr4_wr(*regp)); */
+ __vmwrite(GUEST_CR4, *regp);
+ } else {
+ *regp = __vmread(GUEST_CR4);
+ }
+ return 0;
+}
+
+/* Returns: rc == 0: success */
+static noinline int vmxit_cr_access(struct cpu_user_regs *regs)
+{
+ unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+ uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
+ int cr, rc = 1;
+
+ switch ( acc_typ )
+ {
+ case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
+ case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
+ {
+ uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
+ uint64_t *regp = get_gpr_ptr(regs, gpr);
+ cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
+
+ if (regp == NULL)
+ break;
+
+        /* pl don't embed switch statements */
+ if (cr == 0)
+ rc = access_cr0(regs, acc_typ, regp);
+ else if (cr == 3) {
+            printk("PVH: d%d: unexpected cr3 access vmexit. rip:%lx\n",
+                   current->domain->domain_id, vmr(GUEST_RIP));
+ domain_crash_synchronous();
+ } else if (cr == 4)
+ rc = access_cr4(regs, acc_typ, regp);
+
+ if (rc == 0)
+ update_guest_eip();
+ break;
+ }
+ case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
+ {
+ struct vcpu *vp = current;
+        unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS;
+        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0;
+ vmx_fpu_enter(vp);
+ __vmwrite(GUEST_CR0, cr0);
+ __vmwrite(CR0_READ_SHADOW, cr0);
+ update_guest_eip();
+ rc = 0;
+ }
+ }
+ return rc;
+}
+
+/* NOTE: a PVH guest sets IOPL natively by setting bits in eflags, and not
+ * via the hypercalls used by a PV guest */
+static int noinline vmxit_io_instr(struct cpu_user_regs *regs)
+{
+ int curr_lvl;
+ int requested = (regs->rflags >> 12) & 3;
+
+ read_vmcs_selectors(regs);
+ curr_lvl = regs->cs & 3;
+
+ if (requested >= curr_lvl && emulate_privileged_op(regs))
+ return 0;
+
+ hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
+ return 0;
+}
+
+static noinline int pvh_ept_handle_violation(unsigned long qualification,
+                                             paddr_t gpa)
+{
+ unsigned long gla, gfn = gpa >> PAGE_SHIFT;
+ p2m_type_t p2mt;
+ mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
+
+    gdprintk(XENLOG_ERR, "Dom:%d EPT violation %#lx (%c%c%c/%c%c%c), "
+             "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
+             current->domain->domain_id, qualification,
+             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
+             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
+             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
+             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
+             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
+             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
+             gpa, mfn_x(mfn), p2mt);
+
+ ept_walk_table(current->domain, gfn);
+
+ if ( qualification & EPT_GLA_VALID )
+ {
+ gla = __vmread(GUEST_LINEAR_ADDRESS);
+ gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
+ }
+
+ hvm_inject_hw_exception(TRAP_gp_fault, 0);
+ return 0;
+}
+
+static noinline void pvh_cpuid(struct cpu_user_regs *regs)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+    asm volatile ( "cpuid"
+                   : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+                   : "0" (regs->eax), "2" (regs->rcx) );
+
+    regs->rax = eax; regs->rbx = ebx; regs->rcx = ecx; regs->rdx = edx;
+}
+
+void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
+{
+ unsigned long exit_qualification;
+ unsigned int vector, exit_reason = __vmread(VM_EXIT_REASON);
+ int rc=0, ccpu = smp_processor_id();
+ struct vcpu *vp = current;
+
+    dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx CR0:%lx\n",
+          ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP), regs->rflags,
+          vmr(GUEST_CR0));
+
+ /* for guest_kernel_mode() */
+ regs->cs = vmr(GUEST_CS_SELECTOR);
+
+ switch ( (uint16_t)exit_reason )
+ {
+ case EXIT_REASON_EXCEPTION_NMI:
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ break;
+ default:
+ local_irq_enable();
+ }
+
+ switch ( (uint16_t)exit_reason )
+ {
+ case EXIT_REASON_EXCEPTION_NMI: /* 0 */
+ rc = vmxit_exception(regs);
+ break;
+
+ case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
+ {
+ vector = __vmread(VM_EXIT_INTR_INFO);
+ vector &= INTR_INFO_VECTOR_MASK;
+ vmx_do_extint(regs);
+ break;
+ }
+
+ case EXIT_REASON_TRIPLE_FAULT: /* 2 */
+ {
+        dbgp0("PVH:Triple Flt:[%d]exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx CR3:%lx\n",
+              ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP),
+              regs->rflags, vmr(GUEST_CR3));
+
+        vp->arch.hvm_vcpu.guest_cr[3] = vp->arch.hvm_vcpu.hw_cr[3] =
+            __vmread(GUEST_CR3);
+ rc = 1;
+ break;
+ }
+ case EXIT_REASON_PENDING_VIRT_INTR: /* 7 */
+ {
+ struct vcpu *v = current;
+ /* Disable the interrupt window. */
+        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+ break;
+ }
+
+ case EXIT_REASON_CPUID: /* 10 */
+ {
+ if ( guest_kernel_mode(vp, regs) ) {
+ pv_cpuid(regs);
+
+            /* Because we are setting CR4.OSFXSR to 0, we need to disable
+             * this because, during boot, user process "init" (which doesn't
+             * do cpuid), will do 'pxor xmm0,xmm0' and cause #UD. For now
+             * disable this. HVM doesn't allow setting of CR4.OSFXSR.
+             * fixme: this and also look at CR4.OSXSAVE */
+
+            __clear_bit(X86_FEATURE_FXSR, &regs->edx);
+ } else
+ pvh_cpuid(regs);
+
+ /* fixme: investigate and fix the XSAVE/MMX/FPU stuff */
+
+ update_guest_eip();
+        dbgp2("cpuid:%ld RIP:%lx\n", regs->eax, vmr(GUEST_RIP));
+ break;
+ }
+
+ case EXIT_REASON_HLT: /* 12 */
+ {
+ update_guest_eip();
+ hvm_hlt(regs->eflags);
+ break;
+ }
+
+ case EXIT_REASON_INVLPG: /* 14 */
+ rc = vmxit_invlpg();
+ break;
+
+ case EXIT_REASON_RDTSC: /* 16 */
+ {
+ rc = 1;
+ break;
+ }
+
+ case EXIT_REASON_VMCALL: /* 18 */
+ rc = vmxit_vmcall(regs);
+ break;
+
+ case EXIT_REASON_CR_ACCESS: /* 28 */
+ rc = vmxit_cr_access(regs);
+ break;
+
+ case EXIT_REASON_DR_ACCESS: /* 29 */
+ {
+ exit_qualification = __vmread(EXIT_QUALIFICATION);
+ vmx_dr_access(exit_qualification, regs);
+ break;
+ }
+
+ case EXIT_REASON_IO_INSTRUCTION:
+ vmxit_io_instr(regs);
+ break;
+
+ case EXIT_REASON_MSR_READ: /* 31 */
+ rc = vmxit_msr_read(regs);
+ break;
+
+ case EXIT_REASON_MSR_WRITE: /* 32 */
+ rc = vmxit_msr_write(regs);
+ break;
+
+ case EXIT_REASON_MONITOR_TRAP_FLAG: /* 37 */
+ rc = vmxit_mtf(regs);
+ break;
+
+ case EXIT_REASON_EPT_VIOLATION:
+ {
+ paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+ exit_qualification = __vmread(EXIT_QUALIFICATION);
+ rc = pvh_ept_handle_violation(exit_qualification, gpa);
+ break;
+ }
+ default:
+ rc = 1;
+        printk("PVH: Unexpected exit reason:%d 0x%x\n", exit_reason,
+               exit_reason);
+ }
+
+ if (rc) {
+ exit_qualification = __vmread(EXIT_QUALIFICATION);
+        printk("PVH: [%d] exit_reas:%d 0x%x qual:%ld 0x%lx cr0:0x%016lx\n",
+               ccpu, exit_reason, exit_reason, exit_qualification,
+               exit_qualification, vmr(GUEST_CR0));
+        printk("PVH: [%d] RIP:%lx RSP:%lx\n", ccpu,
+               vmr(GUEST_RIP), vmr(GUEST_RSP));
+ domain_crash_synchronous();
+ }
+}
+
+/*
+ * Sets info for a non-boot vcpu. The vcpu 0 context is set by the library,
+ * which needs to be modified to send correct selectors and gs_base. For now
+ * we use this only for non-boot vcpus, in which case the call comes from
+ * the kernel's cpu_initialize_context().
+ */
+int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp)
+{
+ if (v->vcpu_id == 0)
+ return 0;
+
+ vmx_vmcs_enter(v);
+ __vmwrite(GUEST_GDTR_BASE, ctxtp->u.pvh.gdtaddr);
+ __vmwrite(GUEST_GDTR_LIMIT, ctxtp->u.pvh.gdtsz);
+ __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user);
+
+ __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs);
+ __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds);
+ __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es);
+ __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss);
+ __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs);
+
+ if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) )
+ return -EINVAL;
+
+ vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel);
+
+ vmx_vmcs_exit(v);
+ return 0;
+}
+
+int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
+ const struct cpu_user_regs *regs,
+ unsigned long *base, unsigned long *limit,
+ unsigned int *ar)
+{
+ unsigned int tmp_ar = 0;
+ BUG_ON(v!=current);
+ BUG_ON(!is_pvh_vcpu(v));
+
+ if (sel == (unsigned int)regs->cs) {
+ *base = vmr(GUEST_CS_BASE);
+ *limit = vmr(GUEST_CS_LIMIT);
+ tmp_ar = vmr(GUEST_CS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->ds) {
+ *base = vmr(GUEST_DS_BASE);
+ *limit = vmr(GUEST_DS_LIMIT);
+ tmp_ar = vmr(GUEST_DS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->ss) {
+ *base = vmr(GUEST_SS_BASE);
+ *limit = vmr(GUEST_SS_LIMIT);
+ tmp_ar = vmr(GUEST_SS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->gs) {
+ *base = vmr(GUEST_GS_BASE);
+ *limit = vmr(GUEST_GS_LIMIT);
+ tmp_ar = vmr(GUEST_GS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->fs) {
+ *base = vmr(GUEST_FS_BASE);
+ *limit = vmr(GUEST_FS_LIMIT);
+ tmp_ar = vmr(GUEST_FS_AR_BYTES);
+ } else if (sel == (unsigned int)regs->es) {
+ *base = vmr(GUEST_ES_BASE);
+ *limit = vmr(GUEST_ES_LIMIT);
+ tmp_ar = vmr(GUEST_ES_AR_BYTES);
+ } else {
+ printk("Unmatched segment selector:%d\n", sel);
+ return 0;
+ }
+
+ if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) { /* x86 mess!! */
+ *base = 0UL;
+ *limit = ~0UL;
+ }
+ /* Fixup ar so that it looks the same as in native mode */
+ *ar = (tmp_ar << 8);
+ return 1;
+}
+
diff -r eca698a4e733 -r 0a38c610f26b xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Fri Jan 11 16:32:36 2013 -0800
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Fri Jan 11 16:34:17 2013 -0800
@@ -156,11 +156,28 @@ void vmx_update_cpu_exec_control(struct
# define VMX_CONTROL_REG_ACCESS_TYPE_LMSW 3
/* 10:8 - general purpose register operand */
#define VMX_CONTROL_REG_ACCESS_GPR(eq) (((eq) >> 8) & 0xf)
+#define VMX_CONTROL_REG_ACCESS_GPR_EAX 0
+#define VMX_CONTROL_REG_ACCESS_GPR_ECX 1
+#define VMX_CONTROL_REG_ACCESS_GPR_EDX 2
+#define VMX_CONTROL_REG_ACCESS_GPR_EBX 3
+#define VMX_CONTROL_REG_ACCESS_GPR_ESP 4
+#define VMX_CONTROL_REG_ACCESS_GPR_EBP 5
+#define VMX_CONTROL_REG_ACCESS_GPR_ESI 6
+#define VMX_CONTROL_REG_ACCESS_GPR_EDI 7
+#define VMX_CONTROL_REG_ACCESS_GPR_R8 8
+#define VMX_CONTROL_REG_ACCESS_GPR_R9 9
+#define VMX_CONTROL_REG_ACCESS_GPR_R10 10
+#define VMX_CONTROL_REG_ACCESS_GPR_R11 11
+#define VMX_CONTROL_REG_ACCESS_GPR_R12 12
+#define VMX_CONTROL_REG_ACCESS_GPR_R13 13
+#define VMX_CONTROL_REG_ACCESS_GPR_R14 14
+#define VMX_CONTROL_REG_ACCESS_GPR_R15 15
/*
* Access Rights
*/
#define X86_SEG_AR_SEG_TYPE 0xf /* 3:0, segment type */
+#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment */
#define X86_SEG_AR_DESC_TYPE (1u << 4) /* 4, descriptor type */
#define X86_SEG_AR_DPL 0x60 /* 6:5, descriptor privilege level */
#define X86_SEG_AR_SEG_PRESENT (1u << 7) /* 7, segment present */
@@ -420,6 +437,11 @@ void setup_ept_dump(void);
void update_guest_eip(void);
void vmx_dr_access(unsigned long exit_qualification, struct cpu_user_regs *regs);
void vmx_do_extint(struct cpu_user_regs *regs);
+void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs);
+int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp);
+int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
+ const struct cpu_user_regs *regs, unsigned long *base,
+ unsigned long *limit, unsigned int *ar);
/* EPT violation qualifications definitions */
#define _EPT_READ_VIOLATION 0

>>> On 12.01.13 at 03:01, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> The heart of this patch is the VMX exit handler for PVH guests. It is
> nicely isolated in a separate module; a call to it is added to
> vmx_vmexit_handler().

I'm sorry to say that, but this patch doesn't look worth commenting on
in detail: It's completely unsorted (mixing VMX and generic stuff) and
appears heavily redundant with other code. I think this needs to be
sorted out cleanly first.

Jan

On Mon, 14 Jan 2013 11:59:30 +0000 "Jan Beulich" <JBeulich@suse.com> wrote:
> >>> On 12.01.13 at 03:01, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > The heart of this patch is the VMX exit handler for PVH guests. It is
> > nicely isolated in a separate module; a call to it is added to
> > vmx_vmexit_handler().
>
> I'm sorry to say that, but this patch doesn't look worth commenting
> on in detail: It's completely unsorted (mixing VMX and generic stuff)
> and appears heavily redundant with other code. I think this needs
> to be sorted out cleanly first.

Hi Jan,

Not sure what you are referring to when you say generic stuff, but it's
all VMX stuff, mainly the VMX exit handler. We had discussed it during
the hackathon and the Xen summit prior, and we wanted to keep functions
and code for PVH as separate as possible, to avoid filling the existing
HVM code with "if PVH" statements. I can look into moving some stuff to
common code if you have issues with any specific ones. Or do you not
want a separate exit handler for the PVH case at all? I think keeping
it separate is a much better thing to do...

Thanks for looking at the patches.

Mukesh

>>> On 15.01.13 at 01:54, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Mon, 14 Jan 2013 11:59:30 +0000 "Jan Beulich" <JBeulich@suse.com> wrote:
>> >>> On 12.01.13 at 03:01, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
>> > The heart of this patch is the VMX exit handler for PVH guests. It is
>> > nicely isolated in a separate module; a call to it is added to
>> > vmx_vmexit_handler().
>>
>> I'm sorry to say that, but this patch doesn't look worth commenting
>> on in detail: It's completely unsorted (mixing VMX and generic stuff)
>> and appears heavily redundant with other code. I think this needs
>> to be sorted out cleanly first.
>
> Not sure what you are referring to when you say generic stuff, but it's
> all VMX stuff, mainly the VMX exit handler. We had discussed it during
> the hackathon and the Xen summit prior, and we wanted to keep functions
> and code for PVH as separate as possible, to avoid filling the existing
> HVM code with "if PVH" statements. I can look into moving some stuff to
> common code if you have issues with any specific ones. Or do you not
> want a separate exit handler for the PVH case at all? I think keeping
> it separate is a much better thing to do...

The main thing is the hypercall wrappers - they're definitely not
VMX-specific, and hence don't belong in VMX-specific code. Besides
that, the sub-hypercall filtering you do there looks pretty fragile
too. But stuff like get_gpr_ptr() doesn't belong here either (and I
actually doubt the function should be added in the first place - iirc
we already have a function doing just that, and it wasn't that long
ago that I consolidated three(?) incarnations into just one; I guess
you understand that I don't want to see another one appearing without
good reason).

Jan

On Tue, 15 Jan 2013 08:46:35 +0000 "Jan Beulich" <JBeulich@suse.com> wrote:
> >>> On 15.01.13 at 01:54, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > On Mon, 14 Jan 2013 11:59:30 +0000 "Jan Beulich" <JBeulich@suse.com> wrote:
> >> >>> On 12.01.13 at 03:01, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> >> > The heart of this patch is the VMX exit handler for PVH guests. It is
> >> > nicely isolated in a separate module; a call to it is added to
> >> > vmx_vmexit_handler().
> >>
> >> I'm sorry to say that, but this patch doesn't look worth commenting
> >> on in detail: It's completely unsorted (mixing VMX and generic stuff)
> >> and appears heavily redundant with other code. I think this needs
> >> to be sorted out cleanly first.
> >
> > Not sure what you are referring to when you say generic stuff, but it's
> > all VMX stuff, mainly the VMX exit handler. We had discussed it during
> > the hackathon and the Xen summit prior, and we wanted to keep functions
> > and code for PVH as separate as possible, to avoid filling the existing
> > HVM code with "if PVH" statements. I can look into moving some stuff to
> > common code if you have issues with any specific ones. Or do you not
> > want a separate exit handler for the PVH case at all? I think keeping
> > it separate is a much better thing to do...
>
> The main thing is the hypercall wrappers - they're definitely not
> VMX-specific, and hence don't belong in VMX-specific code. Besides

Ah, I see. The HVM hcalls are in hvm.c and not vmx.c. Since PVH needs
slightly different hcalls and restricts certain ones that are OK for
HVM, I really prefer not to pollute hvm_do_hypercall() with "if PVH"
everywhere. I could add a new function to hvm.c, pvh_hvm_do_hypercall(),
or create a new file hvm_pvh.c and add it there. What would you suggest?

> too. But stuff like get_gpr_ptr() doesn't belong here either (and I
> actually doubt the function should be added in the first place - iirc
> we already have a function doing just that, and it wasn't that long

Yup, decode_register() does that, and it's non-static. I missed it
because it was added later, and wasn't in the tree I was using. I'll
use that, less code for me.

thanks,
Mukesh
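
(A minimal sketch of what the decode_register()-based lookup could look
like in vmxit_cr_access(), assuming the existing non-static helper with
the signature void *decode_register(uint8_t modrm_reg,
struct cpu_user_regs *regs, int highbyte_regs); this is an illustration,
not the code that was eventually committed:)

    /* Sketch: replace the home-grown get_gpr_ptr() with the existing
     * helper.  The VMX exit qualification encodes the GPR with the same
     * 0-15 index that decode_register() expects; highbyte_regs is 0
     * because these are whole registers, not AH/CH/DH/BH. */
    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
    uint64_t *regp = decode_register(
        VMX_CONTROL_REG_ACCESS_GPR(exit_qualification), regs, 0);
    if ( regp == NULL )
        return 1;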

>>> On 24.01.13 at 02:59, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Tue, 15 Jan 2013 08:46:35 +0000 "Jan Beulich" <JBeulich@suse.com> wrote:
>> The main thing is the hypercall wrappers - they're definitely not
>> VMX-specific, and hence don't belong in VMX-specific code. Besides
>
> Ah, I see. The HVM hcalls are in hvm.c and not vmx.c. Since PVH needs
> slightly different hcalls and restricts certain ones that are OK for
> HVM, I really prefer not to pollute hvm_do_hypercall() with "if PVH"
> everywhere. I could add a new function to hvm.c, pvh_hvm_do_hypercall(),
> or create a new file hvm_pvh.c and add it there. What would you suggest?

The only mail with that patch that I have definitely has these in
xen/arch/x86/hvm/vmx/vmx_pvh.c. So yes, the correct thing - if
adjustments to the existing ones make the code too ugly - would be for
them to go into xen/arch/x86/hvm/pvh.c.

Jan

At 18:01 -0800 on 11 Jan (1357927270), Mukesh Rathor wrote:
> The heart of this patch is the VMX exit handler for PVH guests. It is
> nicely isolated in a separate module; a call to it is added to
> vmx_vmexit_handler().

Not looking at this in fine detail yet (considering the various
'fixme's still around in it), but one or two things:

> +static noinline int vmxit_exception(struct cpu_user_regs *regs)
> +{
> +    unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) & INTR_INFO_VECTOR_MASK;
> +    int rc=1;
> +    struct vcpu *vp = current;
> +
> +    dbgp2(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector, vmr(GUEST_CS_SELECTOR),
> +          regs->eip);
> +
> +    if (vector == TRAP_debug) {
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        write_debugreg(6, exit_qualification | 0xffff0ff0);
> +        regs->rip = vmr(GUEST_RIP);
> +        regs->rsp = vmr(GUEST_RSP);
> +        rc = 0;
> +
> +        /* gdbsx or another debugger */

This is a hard tab. Actually, the whitespace in this file needs
attention generally.

> +void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification;
> +    unsigned int vector, exit_reason = __vmread(VM_EXIT_REASON);
> +    int rc=0, ccpu = smp_processor_id();
> +    struct vcpu *vp = current;
> +
> +    dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx CR0:%lx\n",
> +          ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP), regs->rflags,
> +          vmr(GUEST_CR0));
> +
> +    /* for guest_kernel_mode() */
> +    regs->cs = vmr(GUEST_CS_SELECTOR);
> +
> +    switch ( (uint16_t)exit_reason )
> +    {
> +    case EXIT_REASON_EXCEPTION_NMI:
> +    case EXIT_REASON_EXTERNAL_INTERRUPT:
> +    case EXIT_REASON_MCE_DURING_VMENTRY:
> +        break;
> +    default:
> +        local_irq_enable();
> +    }

That's a bit risky: EXIT_REASON_EXCEPTION_NMI includes a lot of cases
that might not be safe to handle with interrupts disabled. Also I
think it means there are paths through this function that don't
enable irqs at all.

I think it'd be better to do it the way vmx_vmexit_handler() does:
explicitly sort out the things that _must_ be done with irqs disabled
first, so it's clear which code runs with irqs enabled and which
doesn't.

> +    switch ( (uint16_t)exit_reason )
> +    {
> +    case EXIT_REASON_EXCEPTION_NMI:      /* 0 */
> +        rc = vmxit_exception(regs);
> +        break;
> +
> +    case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
> +    {
> +        vector = __vmread(VM_EXIT_INTR_INFO);
> +        vector &= INTR_INFO_VECTOR_MASK;
> +        vmx_do_extint(regs);
> +        break;
> +    }
> +
> +    case EXIT_REASON_TRIPLE_FAULT:       /* 2 */
> +    {
> +        dbgp0("PVH:Triple Flt:[%d]exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx CR3:%lx\n",
> +              ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP),
> +              regs->rflags, vmr(GUEST_CR3));
> +
> +        vp->arch.hvm_vcpu.guest_cr[3] = vp->arch.hvm_vcpu.hw_cr[3] =
> +            __vmread(GUEST_CR3);
> +        rc = 1;
> +        break;
> +    }
> +    case EXIT_REASON_PENDING_VIRT_INTR:  /* 7 */
> +    {
> +        struct vcpu *v = current;
> +        /* Disable the interrupt window. */
> +        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
> +        break;
> +    }
> +
> +    case EXIT_REASON_CPUID:              /* 10 */
> +    {
> +        if ( guest_kernel_mode(vp, regs) ) {
> +            pv_cpuid(regs);
> +
> +            /* Because we are setting CR4.OSFXSR to 0, we need to disable
> +             * this because, during boot, user process "init" (which doesn't
> +             * do cpuid), will do 'pxor xmm0,xmm0' and cause #UD. For now
> +             * disable this. HVM doesn't allow setting of CR4.OSFXSR.
> +             * fixme: this and also look at CR4.OSXSAVE */
> +
> +            __clear_bit(X86_FEATURE_FXSR, &regs->edx);

Shouldn't this be gated on which leaf the guest asked for?

Tim.

On Thu, 24 Jan 2013 16:31:22 +0000 Tim Deegan <tim@xen.org> wrote:
> At 18:01 -0800 on 11 Jan (1357927270), Mukesh Rathor wrote:
> That's a bit risky: EXIT_REASON_EXCEPTION_NMI includes a lot of cases
> that might not be safe to handle with interrupts disabled. Also I
> think it means there are paths through this function that don't
> enable irqs at all.
>
> I think it'd be better to do it the way vmx_vmexit_handler() does:
> explicitly sort out the things that _must_ be done with irqs disabled
> first, so it's clear which code runs with irqs enabled and which
> doesn't.

Yup, fixed already.

> This is a hard tab. Actually, the whitespace in this file needs
> attention generally.

Rats, I turn tabs on when making Linux changes, and then on the Xen
side sometimes forget to turn them off. Anyways, fixed.

thanks,
mukesh
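
(For reference, a rough sketch of the structure Tim describes, modelled
on vmx_vmexit_handler(); treating machine checks as the work that must
stay ahead of local_irq_enable() is an assumption here, not the author's
posted fix:)

    /* Sketch only: do the irqs-off work first, then enable interrupts. */
    switch ( (uint16_t)exit_reason )
    {
    case EXIT_REASON_EXCEPTION_NMI:
        vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
        if ( vector == TRAP_machine_check )
            do_machine_check(regs);   /* must run before irqs come back on */
        break;
    case EXIT_REASON_MCE_DURING_VMENTRY:
        do_machine_check(regs);
        break;
    }
    local_irq_enable();               /* everything below runs irqs-on */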

On Thu, 24 Jan 2013 16:31:22 +0000 Tim Deegan <tim@xen.org> wrote:
> At 18:01 -0800 on 11 Jan (1357927270), Mukesh Rathor wrote:
> > The heart of this patch is the VMX exit handler for PVH guests. It is
> > nicely isolated in a separate module; a call to it is added to
> > vmx_vmexit_handler().
>
> Not looking at this in fine detail yet (considering the various
> 'fixme's still around in it), but one or two things:

That's where I need suggestions/help ;).

On Thu, 24 Jan 2013 09:21:45 +0000 "Jan Beulich" <JBeulich@suse.com> wrote:
> >>> On 24.01.13 at 02:59, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > On Tue, 15 Jan 2013 08:46:35 +0000 "Jan Beulich" <JBeulich@suse.com> wrote:
> >> The main thing is the hypercall wrappers - they're definitely not
> >> VMX-specific, and hence don't belong in VMX-specific code. Besides
> >
> > Ah, I see. The HVM hcalls are in hvm.c and not vmx.c. Since PVH needs
> > slightly different hcalls and restricts certain ones that are OK for
> > HVM, I really prefer not to pollute hvm_do_hypercall() with "if PVH"
> > everywhere. I could add a new function to hvm.c,
> > pvh_hvm_do_hypercall(), or create a new file hvm_pvh.c and add it
> > there. What would you suggest?
>
> The only mail with that patch that I have definitely has these in
> xen/arch/x86/hvm/vmx/vmx_pvh.c.

I meant that for an HVM guest, the hcalls are in hvm.c and not vmx.c.
I kinda tried to follow that.

> So yes, the correct thing - if adjustments to the existing ones
> make the code too ugly - would be for them to go into
> xen/arch/x86/hvm/pvh.c.

Ok, will do it.

On Thu, 24 Jan 2013 16:31:22 +0000 Tim Deegan <tim@xen.org> wrote:
> At 18:01 -0800 on 11 Jan (1357927270), Mukesh Rathor wrote:
> > +
> > +    case EXIT_REASON_CPUID:              /* 10 */
> > +    {
> > +        if ( guest_kernel_mode(vp, regs) ) {
> > +            pv_cpuid(regs);
> > +
> > +            /* Because we are setting CR4.OSFXSR to 0, we need to disable
> > +             * this because, during boot, user process "init" (which doesn't
> > +             * do cpuid), will do 'pxor xmm0,xmm0' and cause #UD. For now
> > +             * disable this. HVM doesn't allow setting of CR4.OSFXSR.
> > +             * fixme: this and also look at CR4.OSXSAVE */
> > +
> > +            __clear_bit(X86_FEATURE_FXSR, &regs->edx);
>
> Shouldn't this be gated on which leaf the guest asked for?

Yup, looking at it. X86_FEATURE_FXSR is in leaf EAX==1, but it doesn't
work. The user process "init" running nash is executing pxor %xmm0,%xmm0
and taking #UD. Strangely, it works with EAX==0: if I clear the bit for
EAX==0 (changing the Intel vendor string to "ineI"), it works. This user
process doesn't do cpuid, so it must be affected by it some other way.

Pretty hard to debug, since it's in nash user code from the ramdisk and
I am not able to set a breakpoint or put printfs in easily to figure out
why clearing the bit for EAX==0 makes it work, or what's going on for PV
and HVM guests. CR0.EM is 0, so the #UD is coming from CR4.OSFXSR==0.
Reading the SDMs to learn the OSFXSR stuff better....

Will continue investigating.

Thanks,
Mukesh

At 16:05 -0800 on 19 Feb (1361289934), Mukesh Rathor wrote:
> On Thu, 24 Jan 2013 16:31:22 +0000 Tim Deegan <tim@xen.org> wrote:
> > At 18:01 -0800 on 11 Jan (1357927270), Mukesh Rathor wrote:
> > > +
> > > +    case EXIT_REASON_CPUID:              /* 10 */
> > > +    {
> > > +        if ( guest_kernel_mode(vp, regs) ) {
> > > +            pv_cpuid(regs);
> > > +
> > > +            /* Because we are setting CR4.OSFXSR to 0, we need to disable
> > > +             * this because, during boot, user process "init" (which doesn't
> > > +             * do cpuid), will do 'pxor xmm0,xmm0' and cause #UD. For now
> > > +             * disable this. HVM doesn't allow setting of CR4.OSFXSR.
> > > +             * fixme: this and also look at CR4.OSXSAVE */
> > > +
> > > +            __clear_bit(X86_FEATURE_FXSR, &regs->edx);
> >
> > Shouldn't this be gated on which leaf the guest asked for?
>
> Yup, looking at it. X86_FEATURE_FXSR is in leaf EAX==1, but it doesn't
> work. The user process "init" running nash is executing pxor %xmm0,%xmm0
> and taking #UD. Strangely, it works with EAX==0: if I clear the bit for
> EAX==0 (changing the Intel vendor string to "ineI"), it works. This user
> process doesn't do cpuid, so it must be affected by it some other way.
>
> Pretty hard to debug, since it's in nash user code from the ramdisk and
> I am not able to set a breakpoint or put printfs in easily to figure out
> why clearing the bit for EAX==0 makes it work, or what's going on for PV
> and HVM guests. CR0.EM is 0, so the #UD is coming from CR4.OSFXSR==0.
> Reading the SDMs to learn the OSFXSR stuff better....

Perhaps you need to clear the FXSR feature bit in leaf 0x80000001:EDX
as well?

Tim.

On Wed, 20 Feb 2013 09:58:28 +0000 Tim Deegan <tim@xen.org> wrote:
> At 16:05 -0800 on 19 Feb (1361289934), Mukesh Rathor wrote:
> > On Thu, 24 Jan 2013 16:31:22 +0000 Tim Deegan <tim@xen.org> wrote:
> > > At 18:01 -0800 on 11 Jan (1357927270), Mukesh Rathor wrote:
> > > > +    case EXIT_REASON_CPUID:              /* 10 */
> > > > +    {
> > > > +        if ( guest_kernel_mode(vp, regs) ) {
> > > > +            pv_cpuid(regs);
> > > > +
> > > > +            /* Because we are setting CR4.OSFXSR to 0, we need to disable
> > > > +             * this because, during boot, user process "init" (which doesn't
> > > > +             * do cpuid), will do 'pxor xmm0,xmm0' and cause #UD. For now
> > > > +             * disable this. HVM doesn't allow setting of CR4.OSFXSR.
> > > > +             * fixme: this and also look at CR4.OSXSAVE */
> > > > +
> > > > +            __clear_bit(X86_FEATURE_FXSR, &regs->edx);
> > >
> > > Shouldn't this be gated on which leaf the guest asked for?
> >
> > Yup, looking at it. X86_FEATURE_FXSR is in leaf EAX==1, but it doesn't
> > work. The user process "init" running nash is executing pxor %xmm0,%xmm0
> > and taking #UD. Strangely, it works with EAX==0: if I clear the bit for
> > EAX==0 (changing the Intel vendor string to "ineI"), it works. This user
> > process doesn't do cpuid, so it must be affected by it some other way.
> >
> > Pretty hard to debug, since it's in nash user code from the ramdisk and
> > I am not able to set a breakpoint or put printfs in easily to figure out
> > why clearing the bit for EAX==0 makes it work, or what's going on for PV
> > and HVM guests. CR0.EM is 0, so the #UD is coming from CR4.OSFXSR==0.
> > Reading the SDMs to learn the OSFXSR stuff better....
>
> Perhaps you need to clear the FXSR feature bit in leaf 0x80000001:EDX
> as well?

That appears to be AMD only.

At 19:05 -0800 on 20 Feb (1361387119), Mukesh Rathor wrote:
> On Wed, 20 Feb 2013 09:58:28 +0000 Tim Deegan <tim@xen.org> wrote:
> > At 16:05 -0800 on 19 Feb (1361289934), Mukesh Rathor wrote:
> > > On Thu, 24 Jan 2013 16:31:22 +0000 Tim Deegan <tim@xen.org> wrote:
> > > > At 18:01 -0800 on 11 Jan (1357927270), Mukesh Rathor wrote:
> > > > > +    case EXIT_REASON_CPUID:              /* 10 */
> > > > > +    {
> > > > > +        if ( guest_kernel_mode(vp, regs) ) {
> > > > > +            pv_cpuid(regs);
> > > > > +
> > > > > +            /* Because we are setting CR4.OSFXSR to 0, we need to disable
> > > > > +             * this because, during boot, user process "init" (which doesn't
> > > > > +             * do cpuid), will do 'pxor xmm0,xmm0' and cause #UD. For now
> > > > > +             * disable this. HVM doesn't allow setting of CR4.OSFXSR.
> > > > > +             * fixme: this and also look at CR4.OSXSAVE */
> > > > > +
> > > > > +            __clear_bit(X86_FEATURE_FXSR, &regs->edx);
> > > >
> > > > Shouldn't this be gated on which leaf the guest asked for?
> > >
> > > Yup, looking at it. X86_FEATURE_FXSR is in leaf EAX==1, but it doesn't
> > > work. The user process "init" running nash is executing pxor %xmm0,%xmm0
> > > and taking #UD. Strangely, it works with EAX==0: if I clear the bit for
> > > EAX==0 (changing the Intel vendor string to "ineI"), it works. This user
> > > process doesn't do cpuid, so it must be affected by it some other way.
> > >
> > > Pretty hard to debug, since it's in nash user code from the ramdisk and
> > > I am not able to set a breakpoint or put printfs in easily to figure out
> > > why clearing the bit for EAX==0 makes it work, or what's going on for PV
> > > and HVM guests. CR0.EM is 0, so the #UD is coming from CR4.OSFXSR==0.
> > > Reading the SDMs to learn the OSFXSR stuff better....
> >
> > Perhaps you need to clear the FXSR feature bit in leaf 0x80000001:EDX
> > as well?
>
> That appears to be AMD only.

It still needs to be handled. :)  You are testing this stuff on AMD
as well, right?

Tim.
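
(A hedged sketch of the leaf-gated masking under discussion; capturing
the input leaf before pv_cpuid() overwrites regs->eax is an assumption
about the eventual fix, not committed code.  FXSR is bit 24 of EDX in
both leaf 1 and, on AMD, leaf 0x80000001:)

    /* Sketch: mask FXSR only in the leaves that actually report it. */
    unsigned int leaf = regs->eax;      /* input leaf, before pv_cpuid() */
    pv_cpuid(regs);
    if ( leaf == 1 || leaf == 0x80000001 )
        __clear_bit(X86_FEATURE_FXSR, &regs->edx);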

On Thu, 21 Feb 2013 09:10:01 +0000 Tim Deegan <tim@xen.org> wrote:
> At 19:05 -0800 on 20 Feb (1361387119), Mukesh Rathor wrote:
> > On Wed, 20 Feb 2013 09:58:28 +0000 Tim Deegan <tim@xen.org> wrote:
> > > At 16:05 -0800 on 19 Feb (1361289934), Mukesh Rathor wrote:
> > > > On Thu, 24 Jan 2013 16:31:22 +0000 Tim Deegan <tim@xen.org> wrote:
> > > > > At 18:01 -0800 on 11 Jan (1357927270), Mukesh Rathor wrote:
> > > > > > +    case EXIT_REASON_CPUID:              /* 10 */
> > > > > > +    {
> > > > > > +        if ( guest_kernel_mode(vp, regs) ) {
> > > > > > +            pv_cpuid(regs);
> > > > > > +
> > > > > > +            /* Because we are setting CR4.OSFXSR to 0, we need to disable
> > > > > > +             * this because, during boot, user process "init" (which doesn't
> > > > > > +             * do cpuid), will do 'pxor xmm0,xmm0' and cause #UD. For now
> > > > > > +             * disable this. HVM doesn't allow setting of CR4.OSFXSR.
> > > > > > +             * fixme: this and also look at CR4.OSXSAVE */
> > > > > > +
> > > > > > +            __clear_bit(X86_FEATURE_FXSR, &regs->edx);
> > > > >
> > > > > Shouldn't this be gated on which leaf the guest asked for?
> > > >
> > > > Yup, looking at it. X86_FEATURE_FXSR is in leaf EAX==1, but it doesn't
> > > > work. The user process "init" running nash is executing pxor %xmm0,%xmm0
> > > > and taking #UD. Strangely, it works with EAX==0: if I clear the bit for
> > > > EAX==0 (changing the Intel vendor string to "ineI"), it works. This user
> > > > process doesn't do cpuid, so it must be affected by it some other way.
> > > >
> > > > Pretty hard to debug, since it's in nash user code from the ramdisk and
> > > > I am not able to set a breakpoint or put printfs in easily to figure out
> > > > why clearing the bit for EAX==0 makes it work, or what's going on for PV
> > > > and HVM guests. CR0.EM is 0, so the #UD is coming from CR4.OSFXSR==0.
> > > > Reading the SDMs to learn the OSFXSR stuff better....
> > >
> > > Perhaps you need to clear the FXSR feature bit in leaf 0x80000001:EDX
> > > as well?
> >
> > That appears to be AMD only.
>
> It still needs to be handled. :)  You are testing this stuff on AMD
> as well, right?

Right, in pvh_svm.c when it's ported. I had talked to someone from AMD
at the Xen summit who said he'd port it to AMD as soon as the phase I
patch is checked in. It shouldn't be too hard to do that; I estimate a
week or two if I were to do the SVM changes myself.

Right now my hurdle working on version 2 of the patch is all this SSE
stuff that I'm trying to figure out. I was hoping to punt it to phase II
by turning off whatever bits in CPUID, but nothing works.

Thanks,
Mukesh

At 11:20 -0800 on 21 Feb (1361445604), Mukesh Rathor wrote:
> > It still needs to be handled. :)  You are testing this stuff on AMD
> > as well, right?
>
> Right, in pvh_svm.c when it's ported. I had talked to someone from AMD
> at the Xen summit who said he'd port it to AMD as soon as the phase I
> patch is checked in. It shouldn't be too hard to do that; I estimate a
> week or two if I were to do the SVM changes myself.

OK, so long as there's a plan. :)

Cheers,

Tim.