Mukesh Rathor
2013-Mar-16 00:39 UTC
[PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
This patch mainly contains code to create a VMCS for PVH guest, and HVM specific vcpu/domain creation code. Changes in V2: - Avoid call to hvm_do_resume() at call site rather than return in it. - Return for PVH vmx_do_resume prior to intel debugger stuff. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/hvm/hvm.c | 90 ++++++++++++++- xen/arch/x86/hvm/vmx/vmcs.c | 266 ++++++++++++++++++++++++++++++++++++++++++- xen/arch/x86/hvm/vmx/vmx.c | 34 ++++++ 3 files changed, 383 insertions(+), 7 deletions(-) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index ea7adf6..18889ad 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -510,6 +510,29 @@ static int hvm_print_line( return X86EMUL_OKAY; } +static int hvm_pvh_dom_initialise(struct domain *d) +{ + int rc; + + if (!d->arch.hvm_domain.hap_enabled) + return -EINVAL; + + spin_lock_init(&d->arch.hvm_domain.irq_lock); + hvm_init_guest_time(d); + + hvm_init_cacheattr_region_list(d); + + if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 ) + goto fail1; + + if ( (rc = hvm_funcs.domain_initialise(d)) == 0 ) + return 0; + +fail1: + hvm_destroy_cacheattr_region_list(d); + return rc; +} + int hvm_domain_initialise(struct domain *d) { int rc; @@ -520,6 +543,8 @@ int hvm_domain_initialise(struct domain *d) "on a non-VT/AMDV platform.\n"); return -EINVAL; } + if ( is_pvh_domain(d) ) + return hvm_pvh_dom_initialise(d); spin_lock_init(&d->arch.hvm_domain.pbuf_lock); spin_lock_init(&d->arch.hvm_domain.irq_lock); @@ -584,6 +609,11 @@ int hvm_domain_initialise(struct domain *d) void hvm_domain_relinquish_resources(struct domain *d) { + if ( is_pvh_domain(d) ) + { + pit_deinit(d); + return; + } if ( hvm_funcs.nhvm_domain_relinquish_resources ) hvm_funcs.nhvm_domain_relinquish_resources(d); @@ -609,10 +639,14 @@ void hvm_domain_relinquish_resources(struct domain *d) void hvm_domain_destroy(struct domain *d) { hvm_funcs.domain_destroy(d); + hvm_destroy_cacheattr_region_list(d); + + if ( is_pvh_domain(d) ) + return; + rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); - hvm_destroy_cacheattr_region_list(d); } static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h) @@ -1066,14 +1100,47 @@ static int __init __hvm_register_CPU_XSAVE_save_and_restore(void) } __initcall(__hvm_register_CPU_XSAVE_save_and_restore); +static int hvm_pvh_vcpu_initialise(struct vcpu *v) +{ + int rc; + + if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) + return rc; + + softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet, + (void(*)(unsigned long))hvm_assert_evtchn_irq, + (unsigned long)v ); + + v->arch.hvm_vcpu.hcall_64bit = 1; + v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN; + v->arch.user_regs.eflags = 2; + v->arch.hvm_vcpu.inject_trap.vector = -1; + + if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) { + hvm_funcs.vcpu_destroy(v); + return rc; + } + + /* during domain shutdown: pvh_vmx_vmexit_handler->emulate_privileged_op + * -> guest_io_read -> pv_pit_handler -> handle_speaker_io -> _spin_lock + * so we call pit_init to initialize the spin lock */ + if ( v->vcpu_id == 0 ) + pit_init(v, cpu_khz); + + return 0; +} + int hvm_vcpu_initialise(struct vcpu *v) { int rc; struct domain *d = v->domain; - domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; + domid_t dm_domid; hvm_asid_flush_vcpu(v); + if ( is_pvh_vcpu(v) ) + return hvm_pvh_vcpu_initialise(v); + if ( (rc = vlapic_init(v)) != 0 ) goto fail1; @@ -1084,6 +1151,8 @@ int hvm_vcpu_initialise(struct vcpu *v) && (rc = 
nestedhvm_vcpu_initialise(v)) < 0 ) goto fail3; + dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; + /* Create ioreq event channel. */ rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL); if ( rc < 0 ) @@ -1163,7 +1232,10 @@ void hvm_vcpu_destroy(struct vcpu *v) tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet); hvm_vcpu_cacheattr_destroy(v); - vlapic_destroy(v); + + if ( !is_pvh_vcpu(v) ) + vlapic_destroy(v); + hvm_funcs.vcpu_destroy(v); /* Event channel is already freed by evtchn_destroy(). */ @@ -4514,6 +4586,8 @@ static int hvm_memory_event_traps(long p, uint32_t reason, void hvm_memory_event_cr0(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR0], MEM_EVENT_REASON_CR0, @@ -4522,6 +4596,8 @@ void hvm_memory_event_cr0(unsigned long value, unsigned long old) void hvm_memory_event_cr3(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR3], MEM_EVENT_REASON_CR3, @@ -4530,6 +4606,8 @@ void hvm_memory_event_cr3(unsigned long value, unsigned long old) void hvm_memory_event_cr4(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR4], MEM_EVENT_REASON_CR4, @@ -4538,6 +4616,8 @@ void hvm_memory_event_cr4(unsigned long value, unsigned long old) void hvm_memory_event_msr(unsigned long msr, unsigned long value) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_MSR], MEM_EVENT_REASON_MSR, @@ -4550,6 +4630,8 @@ int hvm_memory_event_int3(unsigned long gla) unsigned long gfn; gfn = paging_gva_to_gfn(current, gla, &pfec); + if ( is_pvh_vcpu(current) ) + return 0; return hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_INT3], MEM_EVENT_REASON_INT3, @@ -4562,6 +4644,8 @@ int hvm_memory_event_single_step(unsigned long gla) unsigned long gfn; gfn = paging_gva_to_gfn(current, gla, &pfec); + if ( is_pvh_vcpu(current) ) + return 0; return hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP], MEM_EVENT_REASON_SINGLESTEP, diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index 9926ffb..b0bea9c 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -624,7 +624,7 @@ void vmx_vmcs_exit(struct vcpu *v) { /* Don''t confuse vmx_do_resume (for @v or @current!) */ vmx_clear_vmcs(v); - if ( is_hvm_vcpu(current) ) + if ( is_hvm_or_pvh_vcpu(current) ) vmx_load_vmcs(current); spin_unlock(&v->arch.hvm_vmx.vmcs_lock); @@ -815,6 +815,253 @@ void virtual_vmcs_vmwrite(void *vvmcs, u32 vmcs_encoding, u64 val) virtual_vmcs_exit(vvmcs); } +static int pvh_construct_vmcs(struct vcpu *v) +{ + uint16_t sysenter_cs; + unsigned long sysenter_eip; + struct domain *d = v->domain; + struct p2m_domain *p2m = p2m_get_hostp2m(d); + struct ept_data *ept = &p2m->ept; + u32 vmexit_ctl = vmx_vmexit_control; + u32 vmentry_ctl = vmx_vmentry_control; + u64 required, tmpval = -1; + + if ( !paging_mode_hap(d) ) + { + printk("ERROR: HAP is required to run PV in HVM container\n"); + return -EINVAL; + } + + /* VMCS controls. 
*/ + vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS; + __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); + + v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; + + /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op, + * then pv_vcpu.ctrlreg must be added to pvh struct */ + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING; + + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + + /* I/O access bitmap. */ + __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0)); + __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE)); + + /* MSR access bitmap. */ + if ( cpu_has_vmx_msr_bitmap ) + { + unsigned long *msr_bitmap = alloc_xenheap_page(); + int msr_type = MSR_TYPE_R | MSR_TYPE_W; + + if ( msr_bitmap == NULL ) + return -ENOMEM; + + memset(msr_bitmap, ~0, PAGE_SIZE); + v->arch.hvm_vmx.msr_bitmap = msr_bitmap; + __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap)); + + vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type); + vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type); + vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type); + + /* pure hvm doesn''t do this. safe? see: long_mode_do_msr_write() */ +#if 0 + vmx_disable_intercept_for_msr(v, MSR_STAR); + vmx_disable_intercept_for_msr(v, MSR_LSTAR); + vmx_disable_intercept_for_msr(v, MSR_CSTAR); + vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK); +#endif + } else { + printk("PVH: CPU does NOT have msr bitmap\n"); + return -EINVAL; + } + + if ( !cpu_has_vmx_vpid ) { + printk("PVH: At present VPID support is required to run PVH\n"); + return -EINVAL; + } + + v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; + + if ( cpu_has_vmx_secondary_exec_control ) { + v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */ + v->arch.hvm_vmx.secondary_exec_control |= + SECONDARY_EXEC_PAUSE_LOOP_EXITING; + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID; + + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT; + __vmwrite(SECONDARY_VM_EXEC_CONTROL, + v->arch.hvm_vmx.secondary_exec_control); + } else { + printk("PVH: NO Secondary Exec control\n"); + return -EINVAL; + } + + __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl); + + #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4 + #define VM_ENTRY_LOAD_EFER 0x8000 + vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS; + vmentry_ctl &= ~VM_ENTRY_LOAD_EFER; + vmentry_ctl &= ~VM_ENTRY_SMM; + vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR; + vmentry_ctl |= VM_ENTRY_IA32E_MODE; + __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl); + + /* MSR intercepts. */ + __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); + __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); + __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); + + /* Host data selectors. 
*/ + __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_FS_SELECTOR, 0); + __vmwrite(HOST_GS_SELECTOR, 0); + __vmwrite(HOST_FS_BASE, 0); + __vmwrite(HOST_GS_BASE, 0); + + vmx_set_host_env(v); + + /* Host control registers. */ + v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; + __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); + __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0)); + + /* Host CS:RIP. */ + __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); + __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler); + + /* Host SYSENTER CS:RIP. */ + rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs); + __vmwrite(HOST_SYSENTER_CS, sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip); + __vmwrite(HOST_SYSENTER_EIP, sysenter_eip); + + __vmwrite(VM_ENTRY_INTR_INFO, 0); + + __vmwrite(CR3_TARGET_COUNT, 0); + + __vmwrite(GUEST_ACTIVITY_STATE, 0); + + /* Set default guest context values here. Some of these are then overwritten + * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */ + __vmwrite(GUEST_CS_BASE, 0); + __vmwrite(GUEST_CS_LIMIT, ~0u); + __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */ + __vmwrite(GUEST_CS_SELECTOR, 0x10); + + __vmwrite(GUEST_DS_BASE, 0); + __vmwrite(GUEST_DS_LIMIT, ~0u); + __vmwrite(GUEST_DS_AR_BYTES, 0xc093); + __vmwrite(GUEST_DS_SELECTOR, 0x18); + + __vmwrite(GUEST_SS_BASE, 0); /* use same seg as DS */ + __vmwrite(GUEST_SS_LIMIT, ~0u); + __vmwrite(GUEST_SS_AR_BYTES, 0xc093); + __vmwrite(GUEST_SS_SELECTOR, 0x18); + + __vmwrite(GUEST_ES_SELECTOR, 0); + __vmwrite(GUEST_FS_SELECTOR, 0); + __vmwrite(GUEST_GS_SELECTOR, 0); + + /* Guest segment bases. */ + __vmwrite(GUEST_ES_BASE, 0); + __vmwrite(GUEST_FS_BASE, 0); + __vmwrite(GUEST_GS_BASE, 0); + + /* Guest segment limits. */ + __vmwrite(GUEST_ES_LIMIT, ~0u); + __vmwrite(GUEST_FS_LIMIT, ~0u); + __vmwrite(GUEST_GS_LIMIT, ~0u); + + /* Guest segment AR bytes. */ + __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */ + __vmwrite(GUEST_FS_AR_BYTES, 0xc093); + __vmwrite(GUEST_GS_AR_BYTES, 0xc093); + + /* Guest IDT. */ + __vmwrite(GUEST_GDTR_BASE, 0); + __vmwrite(GUEST_GDTR_LIMIT, 0); + + /* Guest LDT. */ + __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */ + __vmwrite(GUEST_LDTR_SELECTOR, 0); + __vmwrite(GUEST_LDTR_BASE, 0); + __vmwrite(GUEST_LDTR_LIMIT, 0); + + /* Guest TSS. 
*/ + __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */ + __vmwrite(GUEST_TR_BASE, 0); + __vmwrite(GUEST_TR_LIMIT, 0xff); + + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); + __vmwrite(GUEST_DR7, 0); + __vmwrite(VMCS_LINK_POINTER, ~0UL); + + __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); + __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0); + + v->arch.hvm_vmx.exception_bitmap = + HVM_TRAP_MASK | (1 << TRAP_debug) | + (1U << TRAP_int3) | (1U << TRAP_no_device); + __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap); + + __vmwrite(TSC_OFFSET, 0); + + /* Set WP bit so rdonly pages are not written from CPL 0 */ + tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP; + __vmwrite(GUEST_CR0, tmpval); + __vmwrite(CR0_READ_SHADOW, tmpval); + v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval; + + tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features); + required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR; + if ( (tmpval & required) != required ) + { + printk("PVH: required CR4 features not available:%lx\n", required); + return -EINVAL; + } + __vmwrite(GUEST_CR4, tmpval); + __vmwrite(CR4_READ_SHADOW, tmpval); + v->arch.hvm_vcpu.guest_cr[4] = tmpval; + + __vmwrite(CR0_GUEST_HOST_MASK, ~0UL); + __vmwrite(CR4_GUEST_HOST_MASK, ~0UL); + + v->arch.hvm_vmx.vmx_realmode = 0; + + ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m)); + __vmwrite(EPT_POINTER, ept_get_eptp(ept)); + + if ( cpu_has_vmx_pat ) + { + u64 host_pat, guest_pat; + + rdmsrl(MSR_IA32_CR_PAT, host_pat); + guest_pat = MSR_IA32_CR_PAT_RESET; + + __vmwrite(HOST_PAT, host_pat); + __vmwrite(GUEST_PAT, guest_pat); + } + return 0; +} + static int construct_vmcs(struct vcpu *v) { struct domain *d = v->domain; @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v) vmx_vmcs_enter(v); + if ( is_pvh_vcpu(v) ) { + int rc = pvh_construct_vmcs(v); + vmx_vmcs_exit(v); + return rc; + } + /* VMCS controls. 
*/ __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); @@ -1259,8 +1512,10 @@ void vmx_do_resume(struct vcpu *v) vmx_clear_vmcs(v); vmx_load_vmcs(v); - hvm_migrate_timers(v); - hvm_migrate_pirqs(v); + if ( !is_pvh_vcpu(v) ) { + hvm_migrate_timers(v); + hvm_migrate_pirqs(v); + } vmx_set_host_env(v); /* * Both n1 VMCS and n2 VMCS need to update the host environment after @@ -1272,6 +1527,9 @@ void vmx_do_resume(struct vcpu *v) hvm_asid_flush_vcpu(v); } + if ( is_pvh_vcpu(v) ) + reset_stack_and_jump(vmx_asm_do_vmentry); + debug_state = v->domain->debugger_attached || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3] || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP]; @@ -1455,7 +1713,7 @@ static void vmcs_dump(unsigned char ch) for_each_domain ( d ) { - if ( !is_hvm_domain(d) ) + if ( !is_hvm_or_pvh_domain(d) ) continue; printk("\n>>> Domain %d <<<\n", d->domain_id); for_each_vcpu ( d, v ) diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index e64980f..194c87b 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -79,6 +79,9 @@ static int vmx_domain_initialise(struct domain *d) { int rc; + if ( is_pvh_domain(d) ) + return 0; + if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) return rc; @@ -87,6 +90,9 @@ static int vmx_domain_initialise(struct domain *d) static void vmx_domain_destroy(struct domain *d) { + if ( is_pvh_domain(d) ) + return; + vmx_free_vlapic_mapping(d); } @@ -110,6 +116,12 @@ static int vmx_vcpu_initialise(struct vcpu *v) vpmu_initialise(v); + if (is_pvh_vcpu(v) ) + { + /* this for hvm_long_mode_enabled(v) */ + v->arch.hvm_vcpu.guest_efer = EFER_SCE | EFER_LMA | EFER_LME; + return 0; + } vmx_install_vlapic_mapping(v); /* %eax == 1 signals full real-mode support to the guest loader. */ @@ -1033,6 +1045,23 @@ static void vmx_update_host_cr3(struct vcpu *v) vmx_vmcs_exit(v); } +static void vmx_update_pvh_cr(struct vcpu *v, unsigned int cr) +{ + vmx_vmcs_enter(v); + switch ( cr ) + { + case 3: + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.guest_cr[3]); + hvm_asid_flush_vcpu(v); + break; + + default: + printk("PVH: d%d v%d unexpected cr%d update at rip:%lx\n", + v->domain->domain_id, v->vcpu_id, cr, __vmread(GUEST_RIP)); + } + vmx_vmcs_exit(v); +} + void vmx_update_debug_state(struct vcpu *v) { unsigned long mask; @@ -1052,6 +1081,11 @@ void vmx_update_debug_state(struct vcpu *v) static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) { + if ( is_pvh_vcpu(v) ) { + vmx_update_pvh_cr(v, cr); + return; + } + vmx_vmcs_enter(v); switch ( cr ) -- 1.7.2.3
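A note on the access-rights constants used above: values such as 0xa09b,
0xc093, 0x8b and 0x82 follow the VMX segment access-rights encoding from
the Intel SDM (type in bits 0-3, S in bit 4, DPL in bits 5-6, P in bit 7,
AVL in bit 12, L in bit 13, D/B in bit 14, G in bit 15). A standalone C
sketch, not part of the patch, that decodes them:

    #include <stdio.h>

    /* Print the VMX access-rights fields of one AR-bytes value. */
    static void decode_ar(unsigned int ar)
    {
        printf("0x%04x: type=%u S=%u DPL=%u P=%u AVL=%u L=%u D/B=%u G=%u\n",
               ar, ar & 0xfu, (ar >> 4) & 1u, (ar >> 5) & 3u, (ar >> 7) & 1u,
               (ar >> 12) & 1u, (ar >> 13) & 1u, (ar >> 14) & 1u,
               (ar >> 15) & 1u);
    }

    int main(void)
    {
        decode_ar(0xa09b); /* CS: accessed code, P=1, L=1 (64-bit), G=1 */
        decode_ar(0xc093); /* DS/SS/ES/FS/GS: r/w accessed data, D/B=1, G=1 */
        decode_ar(0x008b); /* TR: busy 32-bit TSS */
        decode_ar(0x0082); /* LDTR: LDT */
        return 0;
    }

This makes it easy to see why GUEST_CS_AR_BYTES carries the CS.L (64-bit)
bit while the data-segment AR bytes do not.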
Jan Beulich
2013-Mar-18 12:03 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
>>> On 16.03.13 at 01:39, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> +    /* pure hvm doesn't do this. safe? see: long_mode_do_msr_write() */
> +#if 0
> +    vmx_disable_intercept_for_msr(v, MSR_STAR);
> +    vmx_disable_intercept_for_msr(v, MSR_LSTAR);
> +    vmx_disable_intercept_for_msr(v, MSR_CSTAR);
> +    vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
> +#endif

This is the sort of comment/code that I think should never be in a
non-RFC patch. Either you're convinced not intercepting these MSRs is
correct (and secure) for PVH guests (in which case no more than a
comment saying so is needed), or the code is needed and hence the
conditional should be dropped.

> +    } else {

Formatting (not just here).

> +        printk("PVH: CPU does NOT have msr bitmap\n");
> +        return -EINVAL;
> +    }
> +
> +    if ( !cpu_has_vmx_vpid ) {
> +        printk("PVH: At present VPID support is required to run PVH\n");
> +        return -EINVAL;
> +    }
> +
> +    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
> +
> +    if ( cpu_has_vmx_secondary_exec_control ) {
> +        v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */
> +        v->arch.hvm_vmx.secondary_exec_control |=
> +                                          SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
> +
> +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT;
> +        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
> +                  v->arch.hvm_vmx.secondary_exec_control);
> +    } else {
> +        printk("PVH: NO Secondary Exec control\n");
> +        return -EINVAL;
> +    }
> +
> +    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
> +
> +    #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4
> +    #define VM_ENTRY_LOAD_EFER 0x8000

Please don't indent #defines (at least not the #).

Jan
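For readers following the intercept discussion: the MSR bitmap the patch
installs is a single 4KiB page whose layout is fixed by the Intel SDM
(read bitmap for MSRs 0x0-0x1fff in bytes 0-1023, read bitmap for
0xc0000000-0xc0001fff in bytes 1024-2047, and the two write bitmaps in
bytes 2048-4095). A standalone C sketch of the index math, assuming that
layout; this is illustrative only, not Xen's vmx_disable_intercept_for_msr():

    #include <stdint.h>
    #include <string.h>

    #define VMX_MSR_BITMAP_SIZE 4096

    /* Clear the read-intercept bit for one MSR in a VMX MSR bitmap. */
    static void clear_read_intercept(uint8_t *bm, uint32_t msr)
    {
        if ( msr <= 0x1fff )
            bm[msr / 8] &= ~(1u << (msr % 8));
        else if ( msr >= 0xc0000000u && msr <= 0xc0001fffu )
        {
            uint32_t off = msr - 0xc0000000u;
            bm[1024 + off / 8] &= ~(1u << (off % 8));
        }
    }

    int main(void)
    {
        uint8_t bm[VMX_MSR_BITMAP_SIZE];

        memset(bm, ~0, sizeof(bm));           /* start: intercept everything */
        clear_read_intercept(bm, 0xc0000100); /* MSR_FS_BASE, as in the patch */
        return 0;
    }

Starting from all-ones and then punching holes is why a VM exit is taken
only for the MSRs whose bits are left set in the bitmap.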
Konrad Rzeszutek Wilk
2013-Mar-18 15:28 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote:> This patch mainly contains code to create a VMCS for PVH guest, and HVM > specific vcpu/domain creation code. > > Changes in V2: > - Avoid call to hvm_do_resume() at call site rather than return in it. > - Return for PVH vmx_do_resume prior to intel debugger stuff. > > Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> > --- > xen/arch/x86/hvm/hvm.c | 90 ++++++++++++++- > xen/arch/x86/hvm/vmx/vmcs.c | 266 ++++++++++++++++++++++++++++++++++++++++++- > xen/arch/x86/hvm/vmx/vmx.c | 34 ++++++ > 3 files changed, 383 insertions(+), 7 deletions(-) > > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c > index ea7adf6..18889ad 100644 > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -510,6 +510,29 @@ static int hvm_print_line( > return X86EMUL_OKAY; > } > > +static int hvm_pvh_dom_initialise(struct domain *d) > +{ > + int rc; > + > + if (!d->arch.hvm_domain.hap_enabled) > + return -EINVAL; > + > + spin_lock_init(&d->arch.hvm_domain.irq_lock); > + hvm_init_guest_time(d); > + > + hvm_init_cacheattr_region_list(d); > + > + if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 ) > + goto fail1; > + > + if ( (rc = hvm_funcs.domain_initialise(d)) == 0 ) > + return 0; > + > +fail1:I don''t think you need the label here? You are not doing an goto.> + hvm_destroy_cacheattr_region_list(d); > + return rc; > +} > + > int hvm_domain_initialise(struct domain *d) > { > int rc; > @@ -520,6 +543,8 @@ int hvm_domain_initialise(struct domain *d) > "on a non-VT/AMDV platform.\n"); > return -EINVAL; > } > + if ( is_pvh_domain(d) ) > + return hvm_pvh_dom_initialise(d); > > spin_lock_init(&d->arch.hvm_domain.pbuf_lock); > spin_lock_init(&d->arch.hvm_domain.irq_lock); > @@ -584,6 +609,11 @@ int hvm_domain_initialise(struct domain *d) > > void hvm_domain_relinquish_resources(struct domain *d) > { > + if ( is_pvh_domain(d) ) > + { > + pit_deinit(d); > + return; > + } > if ( hvm_funcs.nhvm_domain_relinquish_resources ) > hvm_funcs.nhvm_domain_relinquish_resources(d); > > @@ -609,10 +639,14 @@ void hvm_domain_relinquish_resources(struct domain *d) > void hvm_domain_destroy(struct domain *d) > { > hvm_funcs.domain_destroy(d); > + hvm_destroy_cacheattr_region_list(d); > + > + if ( is_pvh_domain(d) ) > + return; > + > rtc_deinit(d); > stdvga_deinit(d); > vioapic_deinit(d); > - hvm_destroy_cacheattr_region_list(d); > } > > static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h) > @@ -1066,14 +1100,47 @@ static int __init __hvm_register_CPU_XSAVE_save_and_restore(void) > } > __initcall(__hvm_register_CPU_XSAVE_save_and_restore); > > +static int hvm_pvh_vcpu_initialise(struct vcpu *v) > +{ > + int rc; > + > + if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) > + return rc; > + > + softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet, > + (void(*)(unsigned long))hvm_assert_evtchn_irq, > + (unsigned long)v ); > + > + v->arch.hvm_vcpu.hcall_64bit = 1; > + v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN; > + v->arch.user_regs.eflags = 2;So that sets the Reserved flag. Could you include a comment explaining why.. Ah, is it b/c we later on bit-shift it and use it to figure out whether IOPL needs to be virtualized in arch_set_info_guest? Or is it just b/c this function is based off hvm_vcpu_initialise? 
If so, since you are being called by it, can you skip it?> + v->arch.hvm_vcpu.inject_trap.vector = -1; > + > + if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) {The syntax here is off.> + hvm_funcs.vcpu_destroy(v); > + return rc; > + } > + > + /* during domain shutdown: pvh_vmx_vmexit_handler->emulate_privileged_op > + * -> guest_io_read -> pv_pit_handler -> handle_speaker_io -> _spin_lock > + * so we call pit_init to initialize the spin lock */ > + if ( v->vcpu_id == 0 ) > + pit_init(v, cpu_khz); > + > + return 0; > +} > + > int hvm_vcpu_initialise(struct vcpu *v) > { > int rc; > struct domain *d = v->domain; > - domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; > + domid_t dm_domid;Not sure I follow, why the move of it further down?> > hvm_asid_flush_vcpu(v); > > + if ( is_pvh_vcpu(v) ) > + return hvm_pvh_vcpu_initialise(v); > + > if ( (rc = vlapic_init(v)) != 0 ) > goto fail1; > > @@ -1084,6 +1151,8 @@ int hvm_vcpu_initialise(struct vcpu *v) > && (rc = nestedhvm_vcpu_initialise(v)) < 0 ) > goto fail3; > > + dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; > +?> /* Create ioreq event channel. */ > rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL); > if ( rc < 0 ) > @@ -1163,7 +1232,10 @@ void hvm_vcpu_destroy(struct vcpu *v) > > tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet); > hvm_vcpu_cacheattr_destroy(v); > - vlapic_destroy(v); > + > + if ( !is_pvh_vcpu(v) ) > + vlapic_destroy(v); > + > hvm_funcs.vcpu_destroy(v); > > /* Event channel is already freed by evtchn_destroy(). */ > @@ -4514,6 +4586,8 @@ static int hvm_memory_event_traps(long p, uint32_t reason, > > void hvm_memory_event_cr0(unsigned long value, unsigned long old) > { > + if ( is_pvh_vcpu(current) ) > + return; > hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_CR0], > MEM_EVENT_REASON_CR0, > @@ -4522,6 +4596,8 @@ void hvm_memory_event_cr0(unsigned long value, unsigned long old) > > void hvm_memory_event_cr3(unsigned long value, unsigned long old) > { > + if ( is_pvh_vcpu(current) ) > + return; > hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_CR3], > MEM_EVENT_REASON_CR3, > @@ -4530,6 +4606,8 @@ void hvm_memory_event_cr3(unsigned long value, unsigned long old) > > void hvm_memory_event_cr4(unsigned long value, unsigned long old) > { > + if ( is_pvh_vcpu(current) ) > + return; > hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_CR4], > MEM_EVENT_REASON_CR4, > @@ -4538,6 +4616,8 @@ void hvm_memory_event_cr4(unsigned long value, unsigned long old) > > void hvm_memory_event_msr(unsigned long msr, unsigned long value) > { > + if ( is_pvh_vcpu(current) ) > + return; > hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_MSR], > MEM_EVENT_REASON_MSR, > @@ -4550,6 +4630,8 @@ int hvm_memory_event_int3(unsigned long gla) > unsigned long gfn; > gfn = paging_gva_to_gfn(current, gla, &pfec); > > + if ( is_pvh_vcpu(current) ) > + return 0; > return hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_INT3], > MEM_EVENT_REASON_INT3, > @@ -4562,6 +4644,8 @@ int hvm_memory_event_single_step(unsigned long gla) > unsigned long gfn; > gfn = paging_gva_to_gfn(current, gla, &pfec); > > + if ( is_pvh_vcpu(current) ) > + return 0; > return hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP], > MEM_EVENT_REASON_SINGLESTEP, > diff --git a/xen/arch/x86/hvm/vmx/vmcs.c 
b/xen/arch/x86/hvm/vmx/vmcs.c > index 9926ffb..b0bea9c 100644 > --- a/xen/arch/x86/hvm/vmx/vmcs.c > +++ b/xen/arch/x86/hvm/vmx/vmcs.c > @@ -624,7 +624,7 @@ void vmx_vmcs_exit(struct vcpu *v) > { > /* Don''t confuse vmx_do_resume (for @v or @current!) */ > vmx_clear_vmcs(v); > - if ( is_hvm_vcpu(current) ) > + if ( is_hvm_or_pvh_vcpu(current) ) > vmx_load_vmcs(current); > > spin_unlock(&v->arch.hvm_vmx.vmcs_lock); > @@ -815,6 +815,253 @@ void virtual_vmcs_vmwrite(void *vvmcs, u32 vmcs_encoding, u64 val) > virtual_vmcs_exit(vvmcs); > } > > +static int pvh_construct_vmcs(struct vcpu *v) > +{ > + uint16_t sysenter_cs; > + unsigned long sysenter_eip; > + struct domain *d = v->domain; > + struct p2m_domain *p2m = p2m_get_hostp2m(d); > + struct ept_data *ept = &p2m->ept; > + u32 vmexit_ctl = vmx_vmexit_control; > + u32 vmentry_ctl = vmx_vmentry_control; > + u64 required, tmpval = -1; > + > + if ( !paging_mode_hap(d) ) > + { > + printk("ERROR: HAP is required to run PV in HVM container\n"); > + return -EINVAL; > + } > + > + /* VMCS controls. */ > + vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS; > + __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); > + > + v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; > + > + /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op, > + * then pv_vcpu.ctrlreg must be added to pvh struct */That would be the ''timer_mode'' syntax in the guest config right? Perhaps then a check at the top of the function to see which timer_mode is used and exit out with -ENOSYS?> + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING; > + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING; > + > + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | > + CPU_BASED_CR3_LOAD_EXITING | > + CPU_BASED_CR3_STORE_EXITING); > + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; > + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;Is that b/c the PV code ends up making the SCHED_yield_op hypercall and we don''t need the monitor/mwait capability? If so, could you add that comment in please?> + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP; > + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW; > + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; > + > + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); > + > + /* I/O access bitmap. */ > + __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0)); > + __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE)); > + > + /* MSR access bitmap. 
*/ > + if ( cpu_has_vmx_msr_bitmap ) > + { > + unsigned long *msr_bitmap = alloc_xenheap_page(); > + int msr_type = MSR_TYPE_R | MSR_TYPE_W; > + > + if ( msr_bitmap == NULL ) > + return -ENOMEM; > + > + memset(msr_bitmap, ~0, PAGE_SIZE); > + v->arch.hvm_vmx.msr_bitmap = msr_bitmap; > + __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap)); > + > + vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);So this looks like the one vmcs.c except that one has this extra: 895 if ( cpu_has_vmx_pat && paging_mode_hap(d) ) 896 vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, MSR_TYPE_R | MSR_TYPE_W); 897 } Did you miss that?> + > + /* pure hvm doesn''t do this. safe? see: long_mode_do_msr_write() */ > +#if 0 > + vmx_disable_intercept_for_msr(v, MSR_STAR); > + vmx_disable_intercept_for_msr(v, MSR_LSTAR); > + vmx_disable_intercept_for_msr(v, MSR_CSTAR); > + vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK); > +#endifI would just provide a comment saying: /* * long_mode_do_msr_write() takes care of MSR_[STAR|LSTAR|CSTAR|SYSCALL_MASK] */> + } else { > + printk("PVH: CPU does NOT have msr bitmap\n");Perhaps: printk(XENLOG_G_ERR "%s: ..\n", __func__); ?> + return -EINVAL; > + } > + > + if ( !cpu_has_vmx_vpid ) { > + printk("PVH: At present VPID support is required to run PVH\n");Should you de-allocate msr_bitmap at this point? Or perhaps move this check (and the one below) to the start of the function? So you have: if ( !cpu_has_vmx_vpid ) gdprintk ("%s: VPID required for PVH mode.\n", __func__); if ( !cpu_has_vmx_secondary_exec_control ) .. bla bla> + return -EINVAL; > + } > + > + v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; > + > + if ( cpu_has_vmx_secondary_exec_control ) { > + v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */ > + v->arch.hvm_vmx.secondary_exec_control |= > + SECONDARY_EXEC_PAUSE_LOOP_EXITING; > + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID; > + > + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT; > + __vmwrite(SECONDARY_VM_EXEC_CONTROL, > + v->arch.hvm_vmx.secondary_exec_control); > + } else { > + printk("PVH: NO Secondary Exec control\n"); > + return -EINVAL;Ditto - should you de-allocate msr_bitmap ? Or if you are going to move the check for cpu_has_vmx_secondary_exec_control, then there is no need for this if (.. ) else ..> + } > + > + __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl); > + > + #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4 > + #define VM_ENTRY_LOAD_EFER 0x8000 > + vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS; > + vmentry_ctl &= ~VM_ENTRY_LOAD_EFER; > + vmentry_ctl &= ~VM_ENTRY_SMM; > + vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR; > + vmentry_ctl |= VM_ENTRY_IA32E_MODE; > + __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl); > +From here on, it looks mostly the same as construct_vmcs right? Perhaps you can add a comment saying so - so when a cleanup effort is done later on - these can be candidates for it?> + /* MSR intercepts. */ > + __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); > + __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); > + __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); > + > + /* Host data selectors. 
*/ > + __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS); > + __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS); > + __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS); > + __vmwrite(HOST_FS_SELECTOR, 0); > + __vmwrite(HOST_GS_SELECTOR, 0); > + __vmwrite(HOST_FS_BASE, 0); > + __vmwrite(HOST_GS_BASE, 0); > + > + vmx_set_host_env(v); > + > + /* Host control registers. */ > + v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; > + __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); > + __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0));That formatting looks odd.> + > + /* Host CS:RIP. */ > + __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); > + __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler); > + > + /* Host SYSENTER CS:RIP. */ > + rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs); > + __vmwrite(HOST_SYSENTER_CS, sysenter_cs); > + rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip); > + __vmwrite(HOST_SYSENTER_EIP, sysenter_eip); > + > + __vmwrite(VM_ENTRY_INTR_INFO, 0); > + > + __vmwrite(CR3_TARGET_COUNT, 0); > + > + __vmwrite(GUEST_ACTIVITY_STATE, 0); > + > + /* Set default guest context values here. Some of these are then overwritten > + * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */ > + __vmwrite(GUEST_CS_BASE, 0); > + __vmwrite(GUEST_CS_LIMIT, ~0u); > + __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */ > + __vmwrite(GUEST_CS_SELECTOR, 0x10);0x10. Could you use a #define for it? Somehow I thought it would be running in FLAT_KERNEL_CS but that would be odd. And of course since are booting in the Linux kernel without the PV MMU we would be using its native segments. So this would correspond to GDT_ENTRY_KERNEL_CS right? Might want to mention that so if the Linux kernel alters its GDT page we don''t blow up? Thought I guess it does not matter - this is just the initial bootstrap segments. Presumarily the load_gdt in the Linux kernel later on resets it to whatever the "new" GDT is.> + > + __vmwrite(GUEST_DS_BASE, 0); > + __vmwrite(GUEST_DS_LIMIT, ~0u); > + __vmwrite(GUEST_DS_AR_BYTES, 0xc093); > + __vmwrite(GUEST_DS_SELECTOR, 0x18); > + > + __vmwrite(GUEST_SS_BASE, 0); /* use same seg as DS */ > + __vmwrite(GUEST_SS_LIMIT, ~0u); > + __vmwrite(GUEST_SS_AR_BYTES, 0xc093); > + __vmwrite(GUEST_SS_SELECTOR, 0x18); > + > + __vmwrite(GUEST_ES_SELECTOR, 0); > + __vmwrite(GUEST_FS_SELECTOR, 0); > + __vmwrite(GUEST_GS_SELECTOR, 0); > + > + /* Guest segment bases. */ > + __vmwrite(GUEST_ES_BASE, 0); > + __vmwrite(GUEST_FS_BASE, 0); > + __vmwrite(GUEST_GS_BASE, 0); > + > + /* Guest segment limits. */ > + __vmwrite(GUEST_ES_LIMIT, ~0u); > + __vmwrite(GUEST_FS_LIMIT, ~0u); > + __vmwrite(GUEST_GS_LIMIT, ~0u); > + > + /* Guest segment AR bytes. */ > + __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */ > + __vmwrite(GUEST_FS_AR_BYTES, 0xc093); > + __vmwrite(GUEST_GS_AR_BYTES, 0xc093); > + > + /* Guest IDT. */ > + __vmwrite(GUEST_GDTR_BASE, 0); > + __vmwrite(GUEST_GDTR_LIMIT, 0); > + > + /* Guest LDT. */ > + __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */ > + __vmwrite(GUEST_LDTR_SELECTOR, 0); > + __vmwrite(GUEST_LDTR_BASE, 0); > + __vmwrite(GUEST_LDTR_LIMIT, 0); > + > + /* Guest TSS. */ > + __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */ > + __vmwrite(GUEST_TR_BASE, 0); > + __vmwrite(GUEST_TR_LIMIT, 0xff); > + > + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); > + __vmwrite(GUEST_DR7, 0); > + __vmwrite(VMCS_LINK_POINTER, ~0UL); > + > + __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); > + __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);Weird. 
In the vmcs.c file these are somewhat higher in the code.> + > + v->arch.hvm_vmx.exception_bitmap = > + HVM_TRAP_MASK | (1 << TRAP_debug) | > + (1U << TRAP_int3) | (1U << TRAP_no_device);Odd syntax.> + __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap); > + > + __vmwrite(TSC_OFFSET, 0);Hm, so you did earlier: v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING; so is this neccessary? Or is just that you want it to be set to default baseline state?> + > + /* Set WP bit so rdonly pages are not written from CPL 0 */ > + tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP; > + __vmwrite(GUEST_CR0, tmpval); > + __vmwrite(CR0_READ_SHADOW, tmpval); > + v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval; > + > + tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features); > + required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR; > + if ( (tmpval & required) != required ) > + { > + printk("PVH: required CR4 features not available:%lx\n", required); > + return -EINVAL;You might want to move that to the top of the code. Or if you want it here, then at least free the msr_bitmap> + } > + __vmwrite(GUEST_CR4, tmpval); > + __vmwrite(CR4_READ_SHADOW, tmpval); > + v->arch.hvm_vcpu.guest_cr[4] = tmpval; > + > + __vmwrite(CR0_GUEST_HOST_MASK, ~0UL); > + __vmwrite(CR4_GUEST_HOST_MASK, ~0UL); > + > + v->arch.hvm_vmx.vmx_realmode = 0; > + > + ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m)); > + __vmwrite(EPT_POINTER, ept_get_eptp(ept)); > + > + if ( cpu_has_vmx_pat ) > + { > + u64 host_pat, guest_pat; > + > + rdmsrl(MSR_IA32_CR_PAT, host_pat); > + guest_pat = MSR_IA32_CR_PAT_RESET; > + > + __vmwrite(HOST_PAT, host_pat); > + __vmwrite(GUEST_PAT, guest_pat); > + } > + return 0; > +} > + > static int construct_vmcs(struct vcpu *v) > { > struct domain *d = v->domain; > @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v) > > vmx_vmcs_enter(v); > > + if ( is_pvh_vcpu(v) ) { > + int rc = pvh_construct_vmcs(v); > + vmx_vmcs_exit(v);Do you need to call paging_update_paging_modes as construct_vmcs() does?> + return rc; > + } > + > /* VMCS controls. 
*/ > __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); > > @@ -1259,8 +1512,10 @@ void vmx_do_resume(struct vcpu *v) > > vmx_clear_vmcs(v); > vmx_load_vmcs(v); > - hvm_migrate_timers(v); > - hvm_migrate_pirqs(v); > + if ( !is_pvh_vcpu(v) ) { > + hvm_migrate_timers(v); > + hvm_migrate_pirqs(v); > + } > vmx_set_host_env(v); > /* > * Both n1 VMCS and n2 VMCS need to update the host environment after > @@ -1272,6 +1527,9 @@ void vmx_do_resume(struct vcpu *v) > hvm_asid_flush_vcpu(v); > } > > + if ( is_pvh_vcpu(v) ) > + reset_stack_and_jump(vmx_asm_do_vmentry); > + > debug_state = v->domain->debugger_attached > || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3] > || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP]; > @@ -1455,7 +1713,7 @@ static void vmcs_dump(unsigned char ch) > > for_each_domain ( d ) > { > - if ( !is_hvm_domain(d) ) > + if ( !is_hvm_or_pvh_domain(d) ) > continue; > printk("\n>>> Domain %d <<<\n", d->domain_id); > for_each_vcpu ( d, v ) > diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c > index e64980f..194c87b 100644 > --- a/xen/arch/x86/hvm/vmx/vmx.c > +++ b/xen/arch/x86/hvm/vmx/vmx.c > @@ -79,6 +79,9 @@ static int vmx_domain_initialise(struct domain *d) > { > int rc; > > + if ( is_pvh_domain(d) ) > + return 0; > + > if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) > return rc; > > @@ -87,6 +90,9 @@ static int vmx_domain_initialise(struct domain *d) > > static void vmx_domain_destroy(struct domain *d) > { > + if ( is_pvh_domain(d) ) > + return; > + > vmx_free_vlapic_mapping(d); > } > > @@ -110,6 +116,12 @@ static int vmx_vcpu_initialise(struct vcpu *v) > > vpmu_initialise(v); > > + if (is_pvh_vcpu(v) ) > + { > + /* this for hvm_long_mode_enabled(v) */ > + v->arch.hvm_vcpu.guest_efer = EFER_SCE | EFER_LMA | EFER_LME; > + return 0; > + } > vmx_install_vlapic_mapping(v); > > /* %eax == 1 signals full real-mode support to the guest loader. */ > @@ -1033,6 +1045,23 @@ static void vmx_update_host_cr3(struct vcpu *v) > vmx_vmcs_exit(v); > } > > +static void vmx_update_pvh_cr(struct vcpu *v, unsigned int cr) > +{ > + vmx_vmcs_enter(v); > + switch ( cr ) > + { > + case 3: > + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.guest_cr[3]); > + hvm_asid_flush_vcpu(v); > + break; > + > + default: > + printk("PVH: d%d v%d unexpected cr%d update at rip:%lx\n",You are missing the XENLOG_ERR part.> + v->domain->domain_id, v->vcpu_id, cr, __vmread(GUEST_RIP)); > + } > + vmx_vmcs_exit(v); > +} > + > void vmx_update_debug_state(struct vcpu *v) > { > unsigned long mask; > @@ -1052,6 +1081,11 @@ void vmx_update_debug_state(struct vcpu *v) > > static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) > { > + if ( is_pvh_vcpu(v) ) { > + vmx_update_pvh_cr(v, cr); > + return; > + } > + > vmx_vmcs_enter(v); > > switch ( cr ) > -- > 1.7.2.3 > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel >
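A possible shape for the restructuring Konrad asks for, with all
capability checks hoisted ahead of any allocation so the error paths
cannot leak msr_bitmap. The helper name pvh_check_requirements is
invented here for illustration; this is a sketch against the patch's
context, not a posted change:

    /* Sketch: fail early, before alloc_xenheap_page(), so no cleanup
     * is needed on any of the -EINVAL paths. */
    static int pvh_check_requirements(struct vcpu *v)
    {
        if ( !paging_mode_hap(v->domain) )
            return -EINVAL;    /* PVH needs HAP */
        if ( !cpu_has_vmx_msr_bitmap )
            return -EINVAL;
        if ( !cpu_has_vmx_vpid )
            return -EINVAL;
        if ( !cpu_has_vmx_secondary_exec_control )
            return -EINVAL;
        return 0;
    }

pvh_construct_vmcs() would then call this first and drop the per-site
else branches.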
Mukesh Rathor
2013-Mar-19 01:00 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
On Mon, 18 Mar 2013 11:28:43 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote:
> > This patch mainly contains code to create a VMCS for PVH guest, and
> > HVM specific vcpu/domain creation code.
> >
> > Changes in V2:
> >   - Avoid call to hvm_do_resume() at call site rather than return in it.
> >   - Return for PVH vmx_do_resume prior to intel debugger stuff.
> >
> > Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
> > ---
> >  xen/arch/x86/hvm/hvm.c      |   90 ++++++++++++++-
> >  xen/arch/x86/hvm/vmx/vmcs.c |  266 ++++++++++++++++++++++++++++++++++++++++++-
> >  xen/arch/x86/hvm/vmx/vmx.c  |   34 ++++++
> >  3 files changed, 383 insertions(+), 7 deletions(-)
> >
> > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> > index ea7adf6..18889ad 100644
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -510,6 +510,29 @@ static int hvm_print_line(
> >      return X86EMUL_OKAY;
> >  }
> >
> > +static int hvm_pvh_dom_initialise(struct domain *d)
> > +{
> > +    int rc;
> > +
> > +    if (!d->arch.hvm_domain.hap_enabled)
> > +        return -EINVAL;
> > +
> > +    spin_lock_init(&d->arch.hvm_domain.irq_lock);
> > +    hvm_init_guest_time(d);
> > +
> > +    hvm_init_cacheattr_region_list(d);
> > +
> > +    if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 )
> > +        goto fail1;    <=================================== GOTO
> > +
> > +    if ( (rc = hvm_funcs.domain_initialise(d)) == 0 )
> > +        return 0;
> > +
> > +fail1:
>
> I don't think you need the label here? You are not doing an goto.

Right above.

> > long))hvm_assert_evtchn_irq,
> > +                          (unsigned long)v );
> > +
> > +    v->arch.hvm_vcpu.hcall_64bit = 1;
> > +    v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN;
> > +    v->arch.user_regs.eflags = 2;
>
> So that sets the Reserved flag. Could you include a comment
> explaining why.. Ah, is it b/c we later on bit-shift it and use it to
> figure out whether IOPL needs to be virtualized in
> arch_set_info_guest? Or is it just b/c this function is based off
> hvm_vcpu_initialise? If so, since you are being called by it, can you
> skip it?

That reserved bit is required to be set for bootstrap. It is set in
other places too, e.g. arch_set_info_guest():

    v->arch.user_regs.eflags |= 2;

> > +    v->arch.hvm_vcpu.inject_trap.vector = -1;
> > +
> > +    if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) {
>
> The syntax here is off.

Hmm... the space surrounding "=" in rc=hvm*?

> > int hvm_vcpu_initialise(struct vcpu *v)
> > {
> >     int rc;
> >     struct domain *d = v->domain;
> > -    domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
> > +    domid_t dm_domid;
>
> Not sure I follow, why the move of it further down?

params is not defined/allocated for PVH.

> > +    /* VMCS controls. */
> > +    vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS;
> > +    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
> > +
> > +    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
> > +
> > +    /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op,
> > +     * then pv_vcpu.ctrlreg must be added to pvh struct */
>
> That would be the 'timer_mode' syntax in the guest config right?
> Perhaps then a check at the top of the function to see which
> timer_mode is used and exit out with -ENOSYS?

The vtsc setting. We set it to 0 for PVH guests.

> > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
> > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
> > +
> > +    v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
> > +                                      CPU_BASED_CR3_LOAD_EXITING |
> > +                                      CPU_BASED_CR3_STORE_EXITING);
> > +    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
>
> Is that b/c the PV code ends up making the SCHED_yield_op hypercall
> and we don't need the monitor/mwait capability? If so, could you add
> that comment in please?

No, MTF is a debugging feature, used mostly for single-stepping.

> > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type);
> > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type);
> > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type);
> > +    vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);
>
> So this looks like the one vmcs.c except that one has this extra:
>
> 895     if ( cpu_has_vmx_pat && paging_mode_hap(d) )
> 896         vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, MSR_TYPE_R | MSR_TYPE_W);
> 897 }
>
> Did you miss that?

I'll add it. I guess the default must be disabled.

> > +
> > +    /* pure hvm doesn't do this. safe? see: long_mode_do_msr_write() */
> > +#if 0
> > +    vmx_disable_intercept_for_msr(v, MSR_STAR);
> > +    vmx_disable_intercept_for_msr(v, MSR_LSTAR);
> > +    vmx_disable_intercept_for_msr(v, MSR_CSTAR);
> > +    vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
> > +#endif
>
> I would just provide a comment saying:
>
> /*
>  * long_mode_do_msr_write() takes care of
>  * MSR_[STAR|LSTAR|CSTAR|SYSCALL_MASK]
>  */

Good idea. I left the "#if 0" there for suggestion.

> > +    } else {
> > +        printk("PVH: CPU does NOT have msr bitmap\n");
>
> Perhaps:
>
>     printk(XENLOG_G_ERR "%s: ..\n", __func__);
> ?
>
> > +        return -EINVAL;
> > +    }
> > +
> > +    if ( !cpu_has_vmx_vpid ) {
> > +        printk("PVH: At present VPID support is required to run PVH\n");
>
> Should you de-allocate msr_bitmap at this point?
>
> Or perhaps move this check (and the one below) to the start of the
> function? So you have:
>
>     if ( !cpu_has_vmx_vpid )
>         gdprintk ("%s: VPID required for PVH mode.\n", __func__);
>
>     if ( !cpu_has_vmx_secondary_exec_control )
>         .. bla bla
>
> > +        return -EINVAL;
> > +    }
> > +
> > +    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
> > +
> > +    if ( cpu_has_vmx_secondary_exec_control ) {
> > +        v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */
> > +        v->arch.hvm_vmx.secondary_exec_control |=
> > +                                          SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> > +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
> > +
> > +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT;
> > +        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
> > +                  v->arch.hvm_vmx.secondary_exec_control);
> > +    } else {
> > +        printk("PVH: NO Secondary Exec control\n");
> > +        return -EINVAL;
>
> Ditto - should you de-allocate msr_bitmap ? Or if you are going to
> move the check for cpu_has_vmx_secondary_exec_control, then there is
> no need for this if (.. ) else ..
>
> > +    }
> > +
> > +    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
> > +
> > +    #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4
> > +    #define VM_ENTRY_LOAD_EFER 0x8000
> > +    vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS;
> > +    vmentry_ctl &= ~VM_ENTRY_LOAD_EFER;
> > +    vmentry_ctl &= ~VM_ENTRY_SMM;
> > +    vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR;
> > +    vmentry_ctl |= VM_ENTRY_IA32E_MODE;
> > +    __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
> > +
>
> From here on, it looks mostly the same as construct_vmcs right?
>
> Perhaps you can add a comment saying so - so when a cleanup effort
> is done later on - these can be candidates for it?
>
> > +    /* MSR intercepts. */
> > +    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
> > +    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
> > +    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
> > +
> > +    /* Host data selectors. */
> > +    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
> > +    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
> > +    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
> > +    __vmwrite(HOST_FS_SELECTOR, 0);
> > +    __vmwrite(HOST_GS_SELECTOR, 0);
> > +    __vmwrite(HOST_FS_BASE, 0);
> > +    __vmwrite(HOST_GS_BASE, 0);
> > +
> > +    vmx_set_host_env(v);
> > +
> > +    /* Host control registers. */
> > +    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
> > +    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> > +    __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0));
>
> That formatting looks odd.

Copied from hvm code. What's wrong?

> > +    /* Set default guest context values here. Some of these are then overwritten
> > +     * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */
> > +    __vmwrite(GUEST_CS_BASE, 0);
> > +    __vmwrite(GUEST_CS_LIMIT, ~0u);
> > +    __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */
> > +    __vmwrite(GUEST_CS_SELECTOR, 0x10);
>
> 0x10. Could you use a #define for it? Somehow I thought it would
> be running in FLAT_KERNEL_CS but that would be odd. And of course
> since are booting in the Linux kernel without the PV MMU we would
> be using its native segments. So this would correspond to
> GDT_ENTRY_KERNEL_CS right? Might want to mention that
> so if the Linux kernel alters its GDT page we don't blow up?
>
> Thought I guess it does not matter - this is just the initial
> bootstrap segments. Presumarily the load_gdt in the Linux kernel
> later on resets it to whatever the "new" GDT is.

Correct:

    #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)

And load_gdt loads a new GDT natively.

> > +    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
> > +    __vmwrite(GUEST_DR7, 0);
> > +    __vmwrite(VMCS_LINK_POINTER, ~0UL);
> > +
> > +    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
> > +    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
>
> Weird. In the vmcs.c file these are somewhat higher in the code.

Yes. I didn't just copy the existing function, but created a PVH
function to make it easier for PVH.

> > +
> > +    v->arch.hvm_vmx.exception_bitmap =
> > +                             HVM_TRAP_MASK | (1 << TRAP_debug) |
> > +                             (1U << TRAP_int3) | (1U << TRAP_no_device);
>
> Odd syntax.

Similar to existing hvm code. What's wrong?

> > +    __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap);
> > +
> > +    __vmwrite(TSC_OFFSET, 0);
>
> Hm, so you did earlier:
>
>     v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
>
> so is this neccessary? Or is just that you want it to be set
> to default baseline state?

Not necessary, doesn't hurt either. I can remove it.

> > +
> > +    /* Set WP bit so rdonly pages are not written from CPL 0 */
> > +    tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP;
> > +    __vmwrite(GUEST_CR0, tmpval);
> > +    __vmwrite(CR0_READ_SHADOW, tmpval);
> > +    v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval;
> > +
> > +    tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
> > +    required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR;
> > +    if ( (tmpval & required) != required )
> > +    {
> > +        printk("PVH: required CR4 features not available:%lx\n", required);
> > +        return -EINVAL;
>
> You might want to move that to the top of the code. Or if you want
> it here, then at least free the msr_bitmap

I think I'll just move all the checks to the top of the code.

> > {
> >     struct domain *d = v->domain;
> > @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v)
> >
> >     vmx_vmcs_enter(v);
> >
> > +    if ( is_pvh_vcpu(v) ) {
> > +        int rc = pvh_construct_vmcs(v);
> > +        vmx_vmcs_exit(v);
>
> Do you need to call paging_update_paging_modes as construct_vmcs()
> does?

Nope. We don't need to, as arch_set_info_guest() does it for PVH.

Thanks Konrad.

Mukesh
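On the eflags = 2 point: bit 1 of EFLAGS is the architecturally
reserved, must-be-one bit, so 2 is simply the canonical "empty flags"
value. A minimal standalone C illustration, not Xen code:

    #include <assert.h>

    #define EFLAGS_MBS 0x2u   /* reserved bit 1, always reads as 1 */

    int main(void)
    {
        unsigned long eflags = EFLAGS_MBS; /* bootstrap value, as in the patch */

        eflags |= EFLAGS_MBS;              /* arch_set_info_guest() equivalent */
        assert(eflags & EFLAGS_MBS);
        return 0;
    }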
Jan Beulich
2013-Mar-19 09:19 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
>>> On 19.03.13 at 02:00, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Mon, 18 Mar 2013 11:28:43 -0400
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
>> On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote:
>> > +    __vmwrite(GUEST_CS_SELECTOR, 0x10);
>>
>> 0x10. Could you use a #define for it? Somehow I thought it would
>> be running in FLAT_KERNEL_CS but that would be odd. And of course
>> since are booting in the Linux kernel without the PV MMU we would
>> be using its native segments. So this would correspond to
>> GDT_ENTRY_KERNEL_CS right? Might want to mention that
>> so if the Linux kernel alters its GDT page we don't blow up?
>>
>> Thought I guess it does not matter - this is just the initial
>> bootstrap segments. Presumarily the load_gdt in the Linux kernel
>> later on resets it to whatever the "new" GDT is.
>
> Correct:
>     #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)

What Linux does, or what selector values it uses, doesn't matter at all
here. Whatever value you want to put there, it needs an explanation if
it's not one of the Xen defined ones (in particular if the value was
arbitrarily chosen, as that would leave future readers completely
clueless).

Perhaps to some degree the same also goes for the descriptor attributes
that you store, albeit one can certainly reconstruct why the value was
chosen the way it is.

Jan
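A concrete form of the #define Jan is asking for might look like the
following; the names are invented here for illustration, and only the
0x10/0x18 values and the AR bytes come from the patch:

    /* Sketch: initial PVH boot-time selectors. These need only match the
     * GDT the guest is started on; the guest replaces them via load_gdt
     * once it installs its own GDT. */
    #define PVH_BOOT_CS 0x10   /* pairs with GUEST_CS_AR_BYTES 0xa09b */
    #define PVH_BOOT_DS 0x18   /* pairs with GUEST_DS_AR_BYTES 0xc093 */

    __vmwrite(GUEST_CS_SELECTOR, PVH_BOOT_CS);
    __vmwrite(GUEST_DS_SELECTOR, PVH_BOOT_DS);
    __vmwrite(GUEST_SS_SELECTOR, PVH_BOOT_DS);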
Konrad Rzeszutek Wilk
2013-Mar-19 13:23 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
On Mon, Mar 18, 2013 at 06:00:36PM -0700, Mukesh Rathor wrote:> On Mon, 18 Mar 2013 11:28:43 -0400 > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > > > On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote: > > > This patch mainly contains code to create a VMCS for PVH guest, and > > > HVM specific vcpu/domain creation code. > > > > > > Changes in V2: > > > - Avoid call to hvm_do_resume() at call site rather than return > > > in it. > > > - Return for PVH vmx_do_resume prior to intel debugger stuff. > > > > > > Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> > > > --- > > > xen/arch/x86/hvm/hvm.c | 90 ++++++++++++++- > > > xen/arch/x86/hvm/vmx/vmcs.c | 266 > > > ++++++++++++++++++++++++++++++++++++++++++- > > > xen/arch/x86/hvm/vmx/vmx.c | 34 ++++++ 3 files changed, 383 > > > insertions(+), 7 deletions(-) > > > > > > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c > > > index ea7adf6..18889ad 100644 > > > --- a/xen/arch/x86/hvm/hvm.c > > > +++ b/xen/arch/x86/hvm/hvm.c > > > @@ -510,6 +510,29 @@ static int hvm_print_line( > > > return X86EMUL_OKAY; > > > } > > > > > > +static int hvm_pvh_dom_initialise(struct domain *d) > > > +{ > > > + int rc; > > > + > > > + if (!d->arch.hvm_domain.hap_enabled) > > > + return -EINVAL; > > > + > > > + spin_lock_init(&d->arch.hvm_domain.irq_lock); > > > + hvm_init_guest_time(d); > > > + > > > + hvm_init_cacheattr_region_list(d); > > > + > > > + if ( (rc=paging_enable(d, > > > PG_refcounts|PG_translate|PG_external)) != 0 ) > > > + goto fail1; <=================================== GOTO > > > + > > > + if ( (rc = hvm_funcs.domain_initialise(d)) == 0 ) > > > + return 0; > > > + > > > +fail1: > > > > I don''t think you need the label here? You are not doing an goto. > > Right above.Duh!> > > > long))hvm_assert_evtchn_irq, > > > + (unsigned long)v ); > > > + > > > + v->arch.hvm_vcpu.hcall_64bit = 1; > > > + v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN; > > > + v->arch.user_regs.eflags = 2; > > > > So that sets the Reserved flag. Could you include a comment > > explaining why.. Ah, is it b/c we later on bit-shift it and use it to > > figure out whether IOPL needs to be virtualized in > > arch_set_info_guest? Or is it just b/c this function is based off > > hvm_vcpu_initialise? If so, since you are being called by it, can you > > skip it? > > That resvd bit is required to be set for bootstrap. Set in other places > also, like arch_set_info_guest(): > > v->arch.user_regs.eflags |= 2;OK, but why? I am not seeing that bit defined? Or was it needed to get the machine to boot up.> > > > > > + v->arch.hvm_vcpu.inject_trap.vector = -1; > > > + > > > + if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) { > > > > The syntax here is off. > > Hmm... space surrounding "=" in rc=hvm* ?Yes, like this: 385 if ( (rc = vcpu_init_fpu(v)) != 0 )> > > > int hvm_vcpu_initialise(struct vcpu *v) > > > { > > > int rc; > > > struct domain *d = v->domain; > > > - domid_t dm_domid > > > d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; > > > + domid_t dm_domid; > > > > Not sure I follow, why the move of it further down? > > params is not defined/allocated for PVH.But you are still using it in the code. As in, you are still fetching it (just later).> > > > + /* VMCS controls. 
> > > +    v->arch.hvm_vcpu.inject_trap.vector = -1;
> > > +
> > > +    if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) {
> >
> > The syntax here is off.
>
> Hmm... space surrounding "=" in rc=hvm* ?

Yes, like this:

    385         if ( (rc = vcpu_init_fpu(v)) != 0 )

> > >  int hvm_vcpu_initialise(struct vcpu *v)
> > >  {
> > >      int rc;
> > >      struct domain *d = v->domain;
> > > -    domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
> > > +    domid_t dm_domid;
> >
> > Not sure I follow, why the move of it further down?
>
> params is not defined/allocated for PVH.

But you are still using it in the code. As in, you are still
fetching it (just later).

> > > +    /* VMCS controls. */
> > > +    vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS;
> > > +    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
> > > +
> > > +    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
> > > +
> > > +    /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op,
> > > +     * then pv_vcpu.ctrlreg must be added to pvh struct */
> >
> > That would be the 'timer_mode' syntax in the guest config right?
> > Perhaps then a check at the top of the function to see which
> > timer_mode is used and exit out with -ENOSYS?
>
> The vtsc setting. We set it to 0 for PVH guests.
>

OK, so that is the 'timer_mode' always set to '2' or rather
timer_mode="native" (in the guest config). Which then does
the xc_domain_set_tsc_info hypercall to set the vtsc.
You need to document that please.

> > > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
> > > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
> > > +
> > > +    v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
> > > +                                      CPU_BASED_CR3_LOAD_EXITING |
> > > +                                      CPU_BASED_CR3_STORE_EXITING);
> > > +    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> > > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
> >
> > Is that b/c the PV code ends up making the SCHED_yield_op hypercall
> > and we don't need the monitor/mwait capability? If so, could you add
> > that comment in please?
>
> No, MTF is a debugging feature used mostly for single-step.

> > > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type);
> > > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type);
> > > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type);
> > > +    vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);
> >
> > So this looks like the one in vmcs.c except that one has this extra:
> >
> >  895         if ( cpu_has_vmx_pat && paging_mode_hap(d) )
> >  896             vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT,
> >                      MSR_TYPE_R | MSR_TYPE_W);
> >  897     }
> >
> > Did you miss that?
>
> I'll add it. I guess default must be disabled.

> > > +
> > > +    /* pure hvm doesn't do this. safe? see: long_mode_do_msr_write() */
> > > +#if 0
> > > +    vmx_disable_intercept_for_msr(v, MSR_STAR);
> > > +    vmx_disable_intercept_for_msr(v, MSR_LSTAR);
> > > +    vmx_disable_intercept_for_msr(v, MSR_CSTAR);
> > > +    vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
> > > +#endif
> >
> > I would just provide a comment saying:
> >
> > /*
> >  * long_mode_do_msr_write() takes care of MSR_[STAR|LSTAR|CSTAR|SYSCALL_MASK]
> >  */
>
> Good idea. I left the "#if 0" there for suggestion.

> > > +    } else {
> > > +        printk("PVH: CPU does NOT have msr bitmap\n");
> >
> > Perhaps:
> >
> >     printk(XENLOG_G_ERR "%s: ..\n", __func__);
> > ?
> > > +        return -EINVAL;
> > > +    }
> > > +
> > > +    if ( !cpu_has_vmx_vpid ) {
> > > +        printk("PVH: At present VPID support is required to run PVH\n");
> >
> > Should you de-allocate msr_bitmap at this point?
> >
> > Or perhaps move this check (and the one below) to the start of the
> > function? So you have:
> >
> >     if ( !cpu_has_vmx_vpid )
> >         gdprintk ("%s: VPID required for PVH mode.\n", __func__);
> >
> >     if ( !cpu_has_vmx_secondary_exec_control )
> >         .. bla bla
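Folding those review points together, a minimal sketch of the hoisted
checks; pvh_check_requirements() is a hypothetical helper name and the
message strings are illustrative:

    /* Sketch: do all capability checks up front, before msr_bitmap (or
     * anything else) is allocated, so the error paths free nothing. */
    static int pvh_check_requirements(struct vcpu *v)
    {
        if ( !cpu_has_vmx_msr_bitmap )
        {
            printk(XENLOG_G_ERR "%s: MSR bitmap required for PVH\n", __func__);
            return -EINVAL;
        }

        if ( !cpu_has_vmx_vpid )
        {
            printk(XENLOG_G_ERR "%s: VPID required for PVH\n", __func__);
            return -EINVAL;
        }

        if ( !cpu_has_vmx_secondary_exec_control )
        {
            printk(XENLOG_G_ERR "%s: secondary exec controls required for PVH\n",
                   __func__);
            return -EINVAL;
        }

        return 0;
    }

Called first thing from pvh_construct_vmcs(), this removes the later
-EINVAL branches and the msr_bitmap leak they imply.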
> > > +        return -EINVAL;
> > > +    }
> > > +
> > > +    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
> > > +
> > > +    if ( cpu_has_vmx_secondary_exec_control ) {
> > > +        v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */
> > > +        v->arch.hvm_vmx.secondary_exec_control |=
> > > +                                          SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> > > +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
> > > +
> > > +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT;
> > > +        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
> > > +                  v->arch.hvm_vmx.secondary_exec_control);
> > > +    } else {
> > > +        printk("PVH: NO Secondary Exec control\n");
> > > +        return -EINVAL;
> >
> > Ditto - should you de-allocate msr_bitmap? Or if you are going to
> > move the check for cpu_has_vmx_secondary_exec_control, then there is
> > no need for this if (..) else ..
> >
> > > +    }
> > > +
> > > +    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
> > > +
> > > +    #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4
> > > +    #define VM_ENTRY_LOAD_EFER 0x8000
> > > +    vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS;
> > > +    vmentry_ctl &= ~VM_ENTRY_LOAD_EFER;
> > > +    vmentry_ctl &= ~VM_ENTRY_SMM;
> > > +    vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR;
> > > +    vmentry_ctl |= VM_ENTRY_IA32E_MODE;
> > > +    __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
> > > +
> >
> > From here on, it looks mostly the same as construct_vmcs right?
> >
> > Perhaps you can add a comment saying so - so when a cleanup effort
> > is done later on - these can be candidates for it?
> >
> > > +    /* MSR intercepts. */
> > > +    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
> > > +    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
> > > +    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
> > > +
> > > +    /* Host data selectors. */
> > > +    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
> > > +    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
> > > +    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
> > > +    __vmwrite(HOST_FS_SELECTOR, 0);
> > > +    __vmwrite(HOST_GS_SELECTOR, 0);
> > > +    __vmwrite(HOST_FS_BASE, 0);
> > > +    __vmwrite(HOST_GS_BASE, 0);
> > > +
> > > +    vmx_set_host_env(v);
> > > +
> > > +    /* Host control registers. */
> > > +    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
> > > +    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> > > +    __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0));
> >
> > That formatting looks odd.
>
> Copied from hvm code. whats wrong?

The HVM code has some extra spaces.

> > > +    /* Set default guest context values here. Some of these are then overwritten
> > > +     * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */
> > > +    __vmwrite(GUEST_CS_BASE, 0);
> > > +    __vmwrite(GUEST_CS_LIMIT, ~0u);
> > > +    __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */
> > > +    __vmwrite(GUEST_CS_SELECTOR, 0x10);
> >
> > 0x10. Could you use a #define for it? Somehow I thought it would
> > be running in FLAT_KERNEL_CS but that would be odd. And of course
> > since we are booting the Linux kernel without the PV MMU we would
> > be using its native segments. So this would correspond to
> > GDT_ENTRY_KERNEL_CS, right? Might want to mention that
> > so if the Linux kernel alters its GDT page we don't blow up.
> >
> > Though I guess it does not matter - this is just the initial
> > bootstrap segment. Presumably the load_gdt in the Linux kernel
> > later on resets it to whatever the "new" GDT is.
>
> Correct:
> #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
>
> And load_gdt loads a new GDT natively.
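As for the descriptor attributes Jan mentions elsewhere in the thread,
the 0xa09b AR-bytes value can be reconstructed from the VMX
access-rights encoding; a sketch of the breakdown (field names per the
SDM, values verified to sum to 0xa09b):

    /*
     * GUEST_CS_AR_BYTES = 0xa09b decomposes as:
     *
     *   0x000b  type 11: execute/read code segment, accessed
     *   0x0010  S   = 1: code/data (non-system) segment
     *   0x0000  DPL = 0
     *   0x0080  P   = 1: present
     *   0x2000  L   = 1: 64-bit code (hence the "CS.L == 1" comment);
     *                    D/B (0x4000) must then be 0
     *   0x8000  G   = 1: 4K granularity
     *
     * 0xb | 0x10 | 0x80 | 0x2000 | 0x8000 == 0xa09b
     */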
> > > +    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
> > > +    __vmwrite(GUEST_DR7, 0);
> > > +    __vmwrite(VMCS_LINK_POINTER, ~0UL);
> > > +
> > > +    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
> > > +    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
> >
> > Weird. In the vmcs.c file these are somewhat higher in the code.
>
> Yes. I just didn't copy the existing function, but created a PVH function
> to make it easier for PVH.

Sure, but the next step (not these patches) will be to collapse some of
the redundant logic.

> > > +
> > > +    v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK | (1 << TRAP_debug) |
> > > +                                       (1U << TRAP_int3) | (1U << TRAP_no_device);
> >
> > Odd syntax.
>
> Similar to existing hvm code, whats wrong?

Tabs. The HVM code looks different (I think it uses spaces?).

> > > +    __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap);
> > > +
> > > +    __vmwrite(TSC_OFFSET, 0);
> >
> > Hm, so you did earlier:
> >
> >     v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
> >
> > so is this necessary? Or is it just that you want it to be set
> > to the default baseline state?
>
> Not necessary, doesn't hurt either. I can remove it.
>
> > > +
> > > +    /* Set WP bit so rdonly pages are not written from CPL 0 */
> > > +    tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP;
> > > +    __vmwrite(GUEST_CR0, tmpval);
> > > +    __vmwrite(CR0_READ_SHADOW, tmpval);
> > > +    v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval;
> > > +
> > > +    tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
> > > +    required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR;
> > > +    if ( (tmpval & required) != required )
> > > +    {
> > > +        printk("PVH: required CR4 features not available:%lx\n", required);
> > > +        return -EINVAL;
> >
> > You might want to move that to the top of the code. Or if you want
> > it here, then at least free the msr_bitmap.
>
> I think I'll just move all the checks to the top of the code.
>
> > >  {
> > >      struct domain *d = v->domain;
> > > @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v)
> > >
> > >      vmx_vmcs_enter(v);
> > >
> > > +    if ( is_pvh_vcpu(v) ) {
> > > +        int rc = pvh_construct_vmcs(v);
> > > +        vmx_vmcs_exit(v);
> >
> > Do you need to call paging_update_paging_modes as construct_vmcs()
> > does?
>
> Nope. We don't need to as arch_set_info_guest() does it for PVH.

OK, could you provide a comment please? That way when one looks at this
code and the HVM one it will be clear.

>
> Thanks Konrad.

Of course. Thank you for bearing with this tiring review process.

> Mukesh
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
>
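Regarding the comment requested for the construct_vmcs() hunk above, a
minimal sketch of the wording (the code is as quoted; only the comment
is new):

        if ( is_pvh_vcpu(v) )
        {
            int rc = pvh_construct_vmcs(v);
            vmx_vmcs_exit(v);
            /*
             * Unlike the HVM path below, no paging_update_paging_modes()
             * here: for PVH it is done by arch_set_info_guest() when the
             * vcpu context is first loaded.
             */
            return rc;
        }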
Mukesh Rathor
2013-Mar-26 22:30 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
On Tue, 19 Mar 2013 09:23:30 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Mon, Mar 18, 2013 at 06:00:36PM -0700, Mukesh Rathor wrote:
> > On Mon, 18 Mar 2013 11:28:43 -0400
> > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> >
> > > So that sets the Reserved flag. Could you include a comment
> > > explaining why.. Ah, is it b/c we later on bit-shift it and use
> > > it to figure out whether IOPL needs to be virtualized in
> > > arch_set_info_guest? Or is it just b/c this function is based off
> > > hvm_vcpu_initialise? If so, since you are being called by it, can
> > > you skip it?
> >
> > That resvd bit is required to be set for bootstrap. Set in other
> > places also, like arch_set_info_guest():
> >
> >     v->arch.user_regs.eflags |= 2;
>
> OK, but why? I am not seeing that bit defined? Or was it needed to get
> the machine to boot up.

It's needed to get the machine to boot up. That's the EFLAGS default in
the CPU upon reset or power up. It's a resvd bit.

> > > Not sure I follow, why the move of it further down?
> >
> > params is not defined/allocated for PVH.
>
> But you are still using it in the code. As in, you are still
> fetching it (just later).

No, not really. The function returns early for PVH before params is
used. params is NULL for PVH:

 int hvm_vcpu_initialise(struct vcpu *v)
 {
     int rc;
     struct domain *d = v->domain;
-    domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
+    domid_t dm_domid;

     hvm_asid_flush_vcpu(v);

+    if ( is_pvh_vcpu(v) )
+        return hvm_pvh_vcpu_initialise(v);   <==============
     ...........

> > > > +    /* VMCS controls. */
> > > > +    vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS;
> > > > +    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
> > > > +
> > > > +    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
> > > > +
> > > > +    /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op,
> > > > +     * then pv_vcpu.ctrlreg must be added to pvh struct */
> > >
> > > That would be the 'timer_mode' syntax in the guest config right?
> > > Perhaps then a check at the top of the function to see which
> > > timer_mode is used and exit out with -ENOSYS?
> >
> > The vtsc setting. We set it to 0 for PVH guests.
> >
>
> OK, so that is the 'timer_mode' always set to '2' or rather
> timer_mode="native" (in the guest config). Which then does
> the xc_domain_set_tsc_info hypercall to set the vtsc.
> You need to document that please.

Right. I set vtsc to 0 in tsc_set_info(), as suggested in prior review,
but maybe I should print a warning there that it's defaulted to that. I
also had put in the cover letter to expand this in Phase II or III or
IV, btw. I'll document this in tsc_set_info() also.

thanks,
Mukesh
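A minimal sketch of the warning Mukesh proposes for tsc_set_info(),
assuming the 4.3-era tsc_mode handling and the TSC_MODE_NEVER_EMULATE
constant (which corresponds to tsc_mode="native"); the message text is
illustrative:

    /* In tsc_set_info(): PVH supports only a native, non-emulated TSC. */
    if ( is_pvh_domain(d) && tsc_mode != TSC_MODE_NEVER_EMULATE )
    {
        printk(XENLOG_WARNING
               "PVH currently does not support TSC emulation. Setting "
               "timer_mode = native\n");
        tsc_mode = TSC_MODE_NEVER_EMULATE;
    }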