Mukesh Rathor
2013-Mar-16 00:39 UTC
[PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
This patch mainly contains code to create a VMCS for PVH guest, and HVM specific vcpu/domain creation code. Changes in V2: - Avoid call to hvm_do_resume() at call site rather than return in it. - Return for PVH vmx_do_resume prior to intel debugger stuff. Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> --- xen/arch/x86/hvm/hvm.c | 90 ++++++++++++++- xen/arch/x86/hvm/vmx/vmcs.c | 266 ++++++++++++++++++++++++++++++++++++++++++- xen/arch/x86/hvm/vmx/vmx.c | 34 ++++++ 3 files changed, 383 insertions(+), 7 deletions(-) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index ea7adf6..18889ad 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -510,6 +510,29 @@ static int hvm_print_line( return X86EMUL_OKAY; } +static int hvm_pvh_dom_initialise(struct domain *d) +{ + int rc; + + if (!d->arch.hvm_domain.hap_enabled) + return -EINVAL; + + spin_lock_init(&d->arch.hvm_domain.irq_lock); + hvm_init_guest_time(d); + + hvm_init_cacheattr_region_list(d); + + if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 ) + goto fail1; + + if ( (rc = hvm_funcs.domain_initialise(d)) == 0 ) + return 0; + +fail1: + hvm_destroy_cacheattr_region_list(d); + return rc; +} + int hvm_domain_initialise(struct domain *d) { int rc; @@ -520,6 +543,8 @@ int hvm_domain_initialise(struct domain *d) "on a non-VT/AMDV platform.\n"); return -EINVAL; } + if ( is_pvh_domain(d) ) + return hvm_pvh_dom_initialise(d); spin_lock_init(&d->arch.hvm_domain.pbuf_lock); spin_lock_init(&d->arch.hvm_domain.irq_lock); @@ -584,6 +609,11 @@ int hvm_domain_initialise(struct domain *d) void hvm_domain_relinquish_resources(struct domain *d) { + if ( is_pvh_domain(d) ) + { + pit_deinit(d); + return; + } if ( hvm_funcs.nhvm_domain_relinquish_resources ) hvm_funcs.nhvm_domain_relinquish_resources(d); @@ -609,10 +639,14 @@ void hvm_domain_relinquish_resources(struct domain *d) void hvm_domain_destroy(struct domain *d) { hvm_funcs.domain_destroy(d); + hvm_destroy_cacheattr_region_list(d); + + if ( is_pvh_domain(d) ) + return; + rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); - hvm_destroy_cacheattr_region_list(d); } static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h) @@ -1066,14 +1100,47 @@ static int __init __hvm_register_CPU_XSAVE_save_and_restore(void) } __initcall(__hvm_register_CPU_XSAVE_save_and_restore); +static int hvm_pvh_vcpu_initialise(struct vcpu *v) +{ + int rc; + + if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) + return rc; + + softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet, + (void(*)(unsigned long))hvm_assert_evtchn_irq, + (unsigned long)v ); + + v->arch.hvm_vcpu.hcall_64bit = 1; + v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN; + v->arch.user_regs.eflags = 2; + v->arch.hvm_vcpu.inject_trap.vector = -1; + + if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) { + hvm_funcs.vcpu_destroy(v); + return rc; + } + + /* during domain shutdown: pvh_vmx_vmexit_handler->emulate_privileged_op + * -> guest_io_read -> pv_pit_handler -> handle_speaker_io -> _spin_lock + * so we call pit_init to initialize the spin lock */ + if ( v->vcpu_id == 0 ) + pit_init(v, cpu_khz); + + return 0; +} + int hvm_vcpu_initialise(struct vcpu *v) { int rc; struct domain *d = v->domain; - domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; + domid_t dm_domid; hvm_asid_flush_vcpu(v); + if ( is_pvh_vcpu(v) ) + return hvm_pvh_vcpu_initialise(v); + if ( (rc = vlapic_init(v)) != 0 ) goto fail1; @@ -1084,6 +1151,8 @@ int hvm_vcpu_initialise(struct vcpu *v) && (rc = 
nestedhvm_vcpu_initialise(v)) < 0 ) goto fail3; + dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; + /* Create ioreq event channel. */ rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL); if ( rc < 0 ) @@ -1163,7 +1232,10 @@ void hvm_vcpu_destroy(struct vcpu *v) tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet); hvm_vcpu_cacheattr_destroy(v); - vlapic_destroy(v); + + if ( !is_pvh_vcpu(v) ) + vlapic_destroy(v); + hvm_funcs.vcpu_destroy(v); /* Event channel is already freed by evtchn_destroy(). */ @@ -4514,6 +4586,8 @@ static int hvm_memory_event_traps(long p, uint32_t reason, void hvm_memory_event_cr0(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR0], MEM_EVENT_REASON_CR0, @@ -4522,6 +4596,8 @@ void hvm_memory_event_cr0(unsigned long value, unsigned long old) void hvm_memory_event_cr3(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR3], MEM_EVENT_REASON_CR3, @@ -4530,6 +4606,8 @@ void hvm_memory_event_cr3(unsigned long value, unsigned long old) void hvm_memory_event_cr4(unsigned long value, unsigned long old) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR4], MEM_EVENT_REASON_CR4, @@ -4538,6 +4616,8 @@ void hvm_memory_event_cr4(unsigned long value, unsigned long old) void hvm_memory_event_msr(unsigned long msr, unsigned long value) { + if ( is_pvh_vcpu(current) ) + return; hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_MSR], MEM_EVENT_REASON_MSR, @@ -4550,6 +4630,8 @@ int hvm_memory_event_int3(unsigned long gla) unsigned long gfn; gfn = paging_gva_to_gfn(current, gla, &pfec); + if ( is_pvh_vcpu(current) ) + return 0; return hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_INT3], MEM_EVENT_REASON_INT3, @@ -4562,6 +4644,8 @@ int hvm_memory_event_single_step(unsigned long gla) unsigned long gfn; gfn = paging_gva_to_gfn(current, gla, &pfec); + if ( is_pvh_vcpu(current) ) + return 0; return hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP], MEM_EVENT_REASON_SINGLESTEP, diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index 9926ffb..b0bea9c 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -624,7 +624,7 @@ void vmx_vmcs_exit(struct vcpu *v) { /* Don''t confuse vmx_do_resume (for @v or @current!) */ vmx_clear_vmcs(v); - if ( is_hvm_vcpu(current) ) + if ( is_hvm_or_pvh_vcpu(current) ) vmx_load_vmcs(current); spin_unlock(&v->arch.hvm_vmx.vmcs_lock); @@ -815,6 +815,253 @@ void virtual_vmcs_vmwrite(void *vvmcs, u32 vmcs_encoding, u64 val) virtual_vmcs_exit(vvmcs); } +static int pvh_construct_vmcs(struct vcpu *v) +{ + uint16_t sysenter_cs; + unsigned long sysenter_eip; + struct domain *d = v->domain; + struct p2m_domain *p2m = p2m_get_hostp2m(d); + struct ept_data *ept = &p2m->ept; + u32 vmexit_ctl = vmx_vmexit_control; + u32 vmentry_ctl = vmx_vmentry_control; + u64 required, tmpval = -1; + + if ( !paging_mode_hap(d) ) + { + printk("ERROR: HAP is required to run PV in HVM container\n"); + return -EINVAL; + } + + /* VMCS controls. 
*/ + vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS; + __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); + + v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; + + /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op, + * then pv_vcpu.ctrlreg must be added to pvh struct */ + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING; + + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW; + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + + /* I/O access bitmap. */ + __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0)); + __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE)); + + /* MSR access bitmap. */ + if ( cpu_has_vmx_msr_bitmap ) + { + unsigned long *msr_bitmap = alloc_xenheap_page(); + int msr_type = MSR_TYPE_R | MSR_TYPE_W; + + if ( msr_bitmap == NULL ) + return -ENOMEM; + + memset(msr_bitmap, ~0, PAGE_SIZE); + v->arch.hvm_vmx.msr_bitmap = msr_bitmap; + __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap)); + + vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type); + vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type); + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type); + vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type); + + /* pure hvm doesn''t do this. safe? see: long_mode_do_msr_write() */ +#if 0 + vmx_disable_intercept_for_msr(v, MSR_STAR); + vmx_disable_intercept_for_msr(v, MSR_LSTAR); + vmx_disable_intercept_for_msr(v, MSR_CSTAR); + vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK); +#endif + } else { + printk("PVH: CPU does NOT have msr bitmap\n"); + return -EINVAL; + } + + if ( !cpu_has_vmx_vpid ) { + printk("PVH: At present VPID support is required to run PVH\n"); + return -EINVAL; + } + + v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; + + if ( cpu_has_vmx_secondary_exec_control ) { + v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */ + v->arch.hvm_vmx.secondary_exec_control |= + SECONDARY_EXEC_PAUSE_LOOP_EXITING; + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID; + + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT; + __vmwrite(SECONDARY_VM_EXEC_CONTROL, + v->arch.hvm_vmx.secondary_exec_control); + } else { + printk("PVH: NO Secondary Exec control\n"); + return -EINVAL; + } + + __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl); + + #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4 + #define VM_ENTRY_LOAD_EFER 0x8000 + vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS; + vmentry_ctl &= ~VM_ENTRY_LOAD_EFER; + vmentry_ctl &= ~VM_ENTRY_SMM; + vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR; + vmentry_ctl |= VM_ENTRY_IA32E_MODE; + __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl); + + /* MSR intercepts. */ + __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); + __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); + __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); + + /* Host data selectors. 
*/ + __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS); + __vmwrite(HOST_FS_SELECTOR, 0); + __vmwrite(HOST_GS_SELECTOR, 0); + __vmwrite(HOST_FS_BASE, 0); + __vmwrite(HOST_GS_BASE, 0); + + vmx_set_host_env(v); + + /* Host control registers. */ + v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; + __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); + __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0)); + + /* Host CS:RIP. */ + __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); + __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler); + + /* Host SYSENTER CS:RIP. */ + rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs); + __vmwrite(HOST_SYSENTER_CS, sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip); + __vmwrite(HOST_SYSENTER_EIP, sysenter_eip); + + __vmwrite(VM_ENTRY_INTR_INFO, 0); + + __vmwrite(CR3_TARGET_COUNT, 0); + + __vmwrite(GUEST_ACTIVITY_STATE, 0); + + /* Set default guest context values here. Some of these are then overwritten + * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */ + __vmwrite(GUEST_CS_BASE, 0); + __vmwrite(GUEST_CS_LIMIT, ~0u); + __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */ + __vmwrite(GUEST_CS_SELECTOR, 0x10); + + __vmwrite(GUEST_DS_BASE, 0); + __vmwrite(GUEST_DS_LIMIT, ~0u); + __vmwrite(GUEST_DS_AR_BYTES, 0xc093); + __vmwrite(GUEST_DS_SELECTOR, 0x18); + + __vmwrite(GUEST_SS_BASE, 0); /* use same seg as DS */ + __vmwrite(GUEST_SS_LIMIT, ~0u); + __vmwrite(GUEST_SS_AR_BYTES, 0xc093); + __vmwrite(GUEST_SS_SELECTOR, 0x18); + + __vmwrite(GUEST_ES_SELECTOR, 0); + __vmwrite(GUEST_FS_SELECTOR, 0); + __vmwrite(GUEST_GS_SELECTOR, 0); + + /* Guest segment bases. */ + __vmwrite(GUEST_ES_BASE, 0); + __vmwrite(GUEST_FS_BASE, 0); + __vmwrite(GUEST_GS_BASE, 0); + + /* Guest segment limits. */ + __vmwrite(GUEST_ES_LIMIT, ~0u); + __vmwrite(GUEST_FS_LIMIT, ~0u); + __vmwrite(GUEST_GS_LIMIT, ~0u); + + /* Guest segment AR bytes. */ + __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */ + __vmwrite(GUEST_FS_AR_BYTES, 0xc093); + __vmwrite(GUEST_GS_AR_BYTES, 0xc093); + + /* Guest IDT. */ + __vmwrite(GUEST_GDTR_BASE, 0); + __vmwrite(GUEST_GDTR_LIMIT, 0); + + /* Guest LDT. */ + __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */ + __vmwrite(GUEST_LDTR_SELECTOR, 0); + __vmwrite(GUEST_LDTR_BASE, 0); + __vmwrite(GUEST_LDTR_LIMIT, 0); + + /* Guest TSS. 
*/ + __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */ + __vmwrite(GUEST_TR_BASE, 0); + __vmwrite(GUEST_TR_LIMIT, 0xff); + + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); + __vmwrite(GUEST_DR7, 0); + __vmwrite(VMCS_LINK_POINTER, ~0UL); + + __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); + __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0); + + v->arch.hvm_vmx.exception_bitmap = + HVM_TRAP_MASK | (1 << TRAP_debug) | + (1U << TRAP_int3) | (1U << TRAP_no_device); + __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap); + + __vmwrite(TSC_OFFSET, 0); + + /* Set WP bit so rdonly pages are not written from CPL 0 */ + tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP; + __vmwrite(GUEST_CR0, tmpval); + __vmwrite(CR0_READ_SHADOW, tmpval); + v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval; + + tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features); + required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR; + if ( (tmpval & required) != required ) + { + printk("PVH: required CR4 features not available:%lx\n", required); + return -EINVAL; + } + __vmwrite(GUEST_CR4, tmpval); + __vmwrite(CR4_READ_SHADOW, tmpval); + v->arch.hvm_vcpu.guest_cr[4] = tmpval; + + __vmwrite(CR0_GUEST_HOST_MASK, ~0UL); + __vmwrite(CR4_GUEST_HOST_MASK, ~0UL); + + v->arch.hvm_vmx.vmx_realmode = 0; + + ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m)); + __vmwrite(EPT_POINTER, ept_get_eptp(ept)); + + if ( cpu_has_vmx_pat ) + { + u64 host_pat, guest_pat; + + rdmsrl(MSR_IA32_CR_PAT, host_pat); + guest_pat = MSR_IA32_CR_PAT_RESET; + + __vmwrite(HOST_PAT, host_pat); + __vmwrite(GUEST_PAT, guest_pat); + } + return 0; +} + static int construct_vmcs(struct vcpu *v) { struct domain *d = v->domain; @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v) vmx_vmcs_enter(v); + if ( is_pvh_vcpu(v) ) { + int rc = pvh_construct_vmcs(v); + vmx_vmcs_exit(v); + return rc; + } + /* VMCS controls. 
*/ __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); @@ -1259,8 +1512,10 @@ void vmx_do_resume(struct vcpu *v) vmx_clear_vmcs(v); vmx_load_vmcs(v); - hvm_migrate_timers(v); - hvm_migrate_pirqs(v); + if ( !is_pvh_vcpu(v) ) { + hvm_migrate_timers(v); + hvm_migrate_pirqs(v); + } vmx_set_host_env(v); /* * Both n1 VMCS and n2 VMCS need to update the host environment after @@ -1272,6 +1527,9 @@ void vmx_do_resume(struct vcpu *v) hvm_asid_flush_vcpu(v); } + if ( is_pvh_vcpu(v) ) + reset_stack_and_jump(vmx_asm_do_vmentry); + debug_state = v->domain->debugger_attached || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3] || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP]; @@ -1455,7 +1713,7 @@ static void vmcs_dump(unsigned char ch) for_each_domain ( d ) { - if ( !is_hvm_domain(d) ) + if ( !is_hvm_or_pvh_domain(d) ) continue; printk("\n>>> Domain %d <<<\n", d->domain_id); for_each_vcpu ( d, v ) diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index e64980f..194c87b 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -79,6 +79,9 @@ static int vmx_domain_initialise(struct domain *d) { int rc; + if ( is_pvh_domain(d) ) + return 0; + if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) return rc; @@ -87,6 +90,9 @@ static int vmx_domain_initialise(struct domain *d) static void vmx_domain_destroy(struct domain *d) { + if ( is_pvh_domain(d) ) + return; + vmx_free_vlapic_mapping(d); } @@ -110,6 +116,12 @@ static int vmx_vcpu_initialise(struct vcpu *v) vpmu_initialise(v); + if (is_pvh_vcpu(v) ) + { + /* this for hvm_long_mode_enabled(v) */ + v->arch.hvm_vcpu.guest_efer = EFER_SCE | EFER_LMA | EFER_LME; + return 0; + } vmx_install_vlapic_mapping(v); /* %eax == 1 signals full real-mode support to the guest loader. */ @@ -1033,6 +1045,23 @@ static void vmx_update_host_cr3(struct vcpu *v) vmx_vmcs_exit(v); } +static void vmx_update_pvh_cr(struct vcpu *v, unsigned int cr) +{ + vmx_vmcs_enter(v); + switch ( cr ) + { + case 3: + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.guest_cr[3]); + hvm_asid_flush_vcpu(v); + break; + + default: + printk("PVH: d%d v%d unexpected cr%d update at rip:%lx\n", + v->domain->domain_id, v->vcpu_id, cr, __vmread(GUEST_RIP)); + } + vmx_vmcs_exit(v); +} + void vmx_update_debug_state(struct vcpu *v) { unsigned long mask; @@ -1052,6 +1081,11 @@ void vmx_update_debug_state(struct vcpu *v) static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) { + if ( is_pvh_vcpu(v) ) { + vmx_update_pvh_cr(v, cr); + return; + } + vmx_vmcs_enter(v); switch ( cr ) -- 1.7.2.3
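A note on the access-rights constants used above: values such as 0xa09b,
0xc093, 0x8b and 0x82 follow the VMX segment access-rights encoding from
the Intel SDM (type in bits 0-3, S in bit 4, DPL in bits 5-6, P in bit 7,
AVL in bit 12, L in bit 13, D/B in bit 14, G in bit 15). A standalone C
sketch, not part of the patch, that decodes them:

    #include <stdio.h>

    /* Print the VMX access-rights fields of one AR-bytes value. */
    static void decode_ar(unsigned int ar)
    {
        printf("0x%04x: type=%u S=%u DPL=%u P=%u AVL=%u L=%u D/B=%u G=%u\n",
               ar, ar & 0xfu, (ar >> 4) & 1u, (ar >> 5) & 3u, (ar >> 7) & 1u,
               (ar >> 12) & 1u, (ar >> 13) & 1u, (ar >> 14) & 1u,
               (ar >> 15) & 1u);
    }

    int main(void)
    {
        decode_ar(0xa09b); /* CS: accessed code, P=1, L=1 (64-bit), G=1 */
        decode_ar(0xc093); /* DS/SS/ES/FS/GS: r/w accessed data, D/B=1, G=1 */
        decode_ar(0x008b); /* TR: busy 32-bit TSS */
        decode_ar(0x0082); /* LDTR: LDT */
        return 0;
    }

This makes it easy to see why GUEST_CS_AR_BYTES carries the CS.L (64-bit)
bit while the data-segment AR bytes do not.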
Jan Beulich
2013-Mar-18 12:03 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
>>> On 16.03.13 at 01:39, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> +    /* pure hvm doesn't do this. safe? see: long_mode_do_msr_write() */
> +#if 0
> +    vmx_disable_intercept_for_msr(v, MSR_STAR);
> +    vmx_disable_intercept_for_msr(v, MSR_LSTAR);
> +    vmx_disable_intercept_for_msr(v, MSR_CSTAR);
> +    vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
> +#endif

This is the sort of comment/code that I think should never be in a
non-RFC patch. Either you're convinced not intercepting these MSRs is
correct (and secure) for PVH guests (in which case no more than a
comment saying so is needed), or the code is needed and hence the
conditional should be dropped.

> +    } else {

Formatting (not just here).

> +        printk("PVH: CPU does NOT have msr bitmap\n");
> +        return -EINVAL;
> +    }
> +
> +    if ( !cpu_has_vmx_vpid ) {
> +        printk("PVH: At present VPID support is required to run PVH\n");
> +        return -EINVAL;
> +    }
> +
> +    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
> +
> +    if ( cpu_has_vmx_secondary_exec_control ) {
> +        v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */
> +        v->arch.hvm_vmx.secondary_exec_control |=
> +                                          SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
> +
> +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT;
> +        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
> +                  v->arch.hvm_vmx.secondary_exec_control);
> +    } else {
> +        printk("PVH: NO Secondary Exec control\n");
> +        return -EINVAL;
> +    }
> +
> +    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
> +
> +    #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4
> +    #define VM_ENTRY_LOAD_EFER 0x8000

Please don't indent #defines (at least not the #).

Jan
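For readers following the intercept discussion: the MSR bitmap the patch
installs is a single 4KiB page whose layout is fixed by the Intel SDM
(read bitmap for MSRs 0x0-0x1fff in bytes 0-1023, read bitmap for
0xc0000000-0xc0001fff in bytes 1024-2047, and the two write bitmaps in
bytes 2048-4095). A standalone C sketch of the index math, assuming that
layout; this is illustrative only, not Xen's vmx_disable_intercept_for_msr():

    #include <stdint.h>
    #include <string.h>

    #define VMX_MSR_BITMAP_SIZE 4096

    /* Clear the read-intercept bit for one MSR in a VMX MSR bitmap. */
    static void clear_read_intercept(uint8_t *bm, uint32_t msr)
    {
        if ( msr <= 0x1fff )
            bm[msr / 8] &= ~(1u << (msr % 8));
        else if ( msr >= 0xc0000000u && msr <= 0xc0001fffu )
        {
            uint32_t off = msr - 0xc0000000u;
            bm[1024 + off / 8] &= ~(1u << (off % 8));
        }
    }

    int main(void)
    {
        uint8_t bm[VMX_MSR_BITMAP_SIZE];

        memset(bm, ~0, sizeof(bm));           /* start: intercept everything */
        clear_read_intercept(bm, 0xc0000100); /* MSR_FS_BASE, as in the patch */
        return 0;
    }

Starting from all-ones and then punching holes is why a VM exit is taken
only for the MSRs whose bits are left set in the bitmap.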
Konrad Rzeszutek Wilk
2013-Mar-18 15:28 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote:> This patch mainly contains code to create a VMCS for PVH guest, and HVM > specific vcpu/domain creation code. > > Changes in V2: > - Avoid call to hvm_do_resume() at call site rather than return in it. > - Return for PVH vmx_do_resume prior to intel debugger stuff. > > Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> > --- > xen/arch/x86/hvm/hvm.c | 90 ++++++++++++++- > xen/arch/x86/hvm/vmx/vmcs.c | 266 ++++++++++++++++++++++++++++++++++++++++++- > xen/arch/x86/hvm/vmx/vmx.c | 34 ++++++ > 3 files changed, 383 insertions(+), 7 deletions(-) > > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c > index ea7adf6..18889ad 100644 > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -510,6 +510,29 @@ static int hvm_print_line( > return X86EMUL_OKAY; > } > > +static int hvm_pvh_dom_initialise(struct domain *d) > +{ > + int rc; > + > + if (!d->arch.hvm_domain.hap_enabled) > + return -EINVAL; > + > + spin_lock_init(&d->arch.hvm_domain.irq_lock); > + hvm_init_guest_time(d); > + > + hvm_init_cacheattr_region_list(d); > + > + if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 ) > + goto fail1; > + > + if ( (rc = hvm_funcs.domain_initialise(d)) == 0 ) > + return 0; > + > +fail1:I don''t think you need the label here? You are not doing an goto.> + hvm_destroy_cacheattr_region_list(d); > + return rc; > +} > + > int hvm_domain_initialise(struct domain *d) > { > int rc; > @@ -520,6 +543,8 @@ int hvm_domain_initialise(struct domain *d) > "on a non-VT/AMDV platform.\n"); > return -EINVAL; > } > + if ( is_pvh_domain(d) ) > + return hvm_pvh_dom_initialise(d); > > spin_lock_init(&d->arch.hvm_domain.pbuf_lock); > spin_lock_init(&d->arch.hvm_domain.irq_lock); > @@ -584,6 +609,11 @@ int hvm_domain_initialise(struct domain *d) > > void hvm_domain_relinquish_resources(struct domain *d) > { > + if ( is_pvh_domain(d) ) > + { > + pit_deinit(d); > + return; > + } > if ( hvm_funcs.nhvm_domain_relinquish_resources ) > hvm_funcs.nhvm_domain_relinquish_resources(d); > > @@ -609,10 +639,14 @@ void hvm_domain_relinquish_resources(struct domain *d) > void hvm_domain_destroy(struct domain *d) > { > hvm_funcs.domain_destroy(d); > + hvm_destroy_cacheattr_region_list(d); > + > + if ( is_pvh_domain(d) ) > + return; > + > rtc_deinit(d); > stdvga_deinit(d); > vioapic_deinit(d); > - hvm_destroy_cacheattr_region_list(d); > } > > static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h) > @@ -1066,14 +1100,47 @@ static int __init __hvm_register_CPU_XSAVE_save_and_restore(void) > } > __initcall(__hvm_register_CPU_XSAVE_save_and_restore); > > +static int hvm_pvh_vcpu_initialise(struct vcpu *v) > +{ > + int rc; > + > + if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) > + return rc; > + > + softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet, > + (void(*)(unsigned long))hvm_assert_evtchn_irq, > + (unsigned long)v ); > + > + v->arch.hvm_vcpu.hcall_64bit = 1; > + v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN; > + v->arch.user_regs.eflags = 2;So that sets the Reserved flag. Could you include a comment explaining why.. Ah, is it b/c we later on bit-shift it and use it to figure out whether IOPL needs to be virtualized in arch_set_info_guest? Or is it just b/c this function is based off hvm_vcpu_initialise? 
If so, since you are being called by it, can you skip it?> + v->arch.hvm_vcpu.inject_trap.vector = -1; > + > + if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) {The syntax here is off.> + hvm_funcs.vcpu_destroy(v); > + return rc; > + } > + > + /* during domain shutdown: pvh_vmx_vmexit_handler->emulate_privileged_op > + * -> guest_io_read -> pv_pit_handler -> handle_speaker_io -> _spin_lock > + * so we call pit_init to initialize the spin lock */ > + if ( v->vcpu_id == 0 ) > + pit_init(v, cpu_khz); > + > + return 0; > +} > + > int hvm_vcpu_initialise(struct vcpu *v) > { > int rc; > struct domain *d = v->domain; > - domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; > + domid_t dm_domid;Not sure I follow, why the move of it further down?> > hvm_asid_flush_vcpu(v); > > + if ( is_pvh_vcpu(v) ) > + return hvm_pvh_vcpu_initialise(v); > + > if ( (rc = vlapic_init(v)) != 0 ) > goto fail1; > > @@ -1084,6 +1151,8 @@ int hvm_vcpu_initialise(struct vcpu *v) > && (rc = nestedhvm_vcpu_initialise(v)) < 0 ) > goto fail3; > > + dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; > +?> /* Create ioreq event channel. */ > rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL); > if ( rc < 0 ) > @@ -1163,7 +1232,10 @@ void hvm_vcpu_destroy(struct vcpu *v) > > tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet); > hvm_vcpu_cacheattr_destroy(v); > - vlapic_destroy(v); > + > + if ( !is_pvh_vcpu(v) ) > + vlapic_destroy(v); > + > hvm_funcs.vcpu_destroy(v); > > /* Event channel is already freed by evtchn_destroy(). */ > @@ -4514,6 +4586,8 @@ static int hvm_memory_event_traps(long p, uint32_t reason, > > void hvm_memory_event_cr0(unsigned long value, unsigned long old) > { > + if ( is_pvh_vcpu(current) ) > + return; > hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_CR0], > MEM_EVENT_REASON_CR0, > @@ -4522,6 +4596,8 @@ void hvm_memory_event_cr0(unsigned long value, unsigned long old) > > void hvm_memory_event_cr3(unsigned long value, unsigned long old) > { > + if ( is_pvh_vcpu(current) ) > + return; > hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_CR3], > MEM_EVENT_REASON_CR3, > @@ -4530,6 +4606,8 @@ void hvm_memory_event_cr3(unsigned long value, unsigned long old) > > void hvm_memory_event_cr4(unsigned long value, unsigned long old) > { > + if ( is_pvh_vcpu(current) ) > + return; > hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_CR4], > MEM_EVENT_REASON_CR4, > @@ -4538,6 +4616,8 @@ void hvm_memory_event_cr4(unsigned long value, unsigned long old) > > void hvm_memory_event_msr(unsigned long msr, unsigned long value) > { > + if ( is_pvh_vcpu(current) ) > + return; > hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_MSR], > MEM_EVENT_REASON_MSR, > @@ -4550,6 +4630,8 @@ int hvm_memory_event_int3(unsigned long gla) > unsigned long gfn; > gfn = paging_gva_to_gfn(current, gla, &pfec); > > + if ( is_pvh_vcpu(current) ) > + return 0; > return hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_INT3], > MEM_EVENT_REASON_INT3, > @@ -4562,6 +4644,8 @@ int hvm_memory_event_single_step(unsigned long gla) > unsigned long gfn; > gfn = paging_gva_to_gfn(current, gla, &pfec); > > + if ( is_pvh_vcpu(current) ) > + return 0; > return hvm_memory_event_traps(current->domain->arch.hvm_domain > .params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP], > MEM_EVENT_REASON_SINGLESTEP, > diff --git a/xen/arch/x86/hvm/vmx/vmcs.c 
b/xen/arch/x86/hvm/vmx/vmcs.c > index 9926ffb..b0bea9c 100644 > --- a/xen/arch/x86/hvm/vmx/vmcs.c > +++ b/xen/arch/x86/hvm/vmx/vmcs.c > @@ -624,7 +624,7 @@ void vmx_vmcs_exit(struct vcpu *v) > { > /* Don''t confuse vmx_do_resume (for @v or @current!) */ > vmx_clear_vmcs(v); > - if ( is_hvm_vcpu(current) ) > + if ( is_hvm_or_pvh_vcpu(current) ) > vmx_load_vmcs(current); > > spin_unlock(&v->arch.hvm_vmx.vmcs_lock); > @@ -815,6 +815,253 @@ void virtual_vmcs_vmwrite(void *vvmcs, u32 vmcs_encoding, u64 val) > virtual_vmcs_exit(vvmcs); > } > > +static int pvh_construct_vmcs(struct vcpu *v) > +{ > + uint16_t sysenter_cs; > + unsigned long sysenter_eip; > + struct domain *d = v->domain; > + struct p2m_domain *p2m = p2m_get_hostp2m(d); > + struct ept_data *ept = &p2m->ept; > + u32 vmexit_ctl = vmx_vmexit_control; > + u32 vmentry_ctl = vmx_vmentry_control; > + u64 required, tmpval = -1; > + > + if ( !paging_mode_hap(d) ) > + { > + printk("ERROR: HAP is required to run PV in HVM container\n"); > + return -EINVAL; > + } > + > + /* VMCS controls. */ > + vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS; > + __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); > + > + v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; > + > + /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op, > + * then pv_vcpu.ctrlreg must be added to pvh struct */That would be the ''timer_mode'' syntax in the guest config right? Perhaps then a check at the top of the function to see which timer_mode is used and exit out with -ENOSYS?> + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING; > + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING; > + > + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | > + CPU_BASED_CR3_LOAD_EXITING | > + CPU_BASED_CR3_STORE_EXITING); > + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; > + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;Is that b/c the PV code ends up making the SCHED_yield_op hypercall and we don''t need the monitor/mwait capability? If so, could you add that comment in please?> + v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP; > + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW; > + v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; > + > + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); > + > + /* I/O access bitmap. */ > + __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0)); > + __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE)); > + > + /* MSR access bitmap. 
*/ > + if ( cpu_has_vmx_msr_bitmap ) > + { > + unsigned long *msr_bitmap = alloc_xenheap_page(); > + int msr_type = MSR_TYPE_R | MSR_TYPE_W; > + > + if ( msr_bitmap == NULL ) > + return -ENOMEM; > + > + memset(msr_bitmap, ~0, PAGE_SIZE); > + v->arch.hvm_vmx.msr_bitmap = msr_bitmap; > + __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap)); > + > + vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type); > + vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);So this looks like the one vmcs.c except that one has this extra: 895 if ( cpu_has_vmx_pat && paging_mode_hap(d) ) 896 vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, MSR_TYPE_R | MSR_TYPE_W); 897 } Did you miss that?> + > + /* pure hvm doesn''t do this. safe? see: long_mode_do_msr_write() */ > +#if 0 > + vmx_disable_intercept_for_msr(v, MSR_STAR); > + vmx_disable_intercept_for_msr(v, MSR_LSTAR); > + vmx_disable_intercept_for_msr(v, MSR_CSTAR); > + vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK); > +#endifI would just provide a comment saying: /* * long_mode_do_msr_write() takes care of MSR_[STAR|LSTAR|CSTAR|SYSCALL_MASK] */> + } else { > + printk("PVH: CPU does NOT have msr bitmap\n");Perhaps: printk(XENLOG_G_ERR "%s: ..\n", __func__); ?> + return -EINVAL; > + } > + > + if ( !cpu_has_vmx_vpid ) { > + printk("PVH: At present VPID support is required to run PVH\n");Should you de-allocate msr_bitmap at this point? Or perhaps move this check (and the one below) to the start of the function? So you have: if ( !cpu_has_vmx_vpid ) gdprintk ("%s: VPID required for PVH mode.\n", __func__); if ( !cpu_has_vmx_secondary_exec_control ) .. bla bla> + return -EINVAL; > + } > + > + v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; > + > + if ( cpu_has_vmx_secondary_exec_control ) { > + v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */ > + v->arch.hvm_vmx.secondary_exec_control |= > + SECONDARY_EXEC_PAUSE_LOOP_EXITING; > + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID; > + > + v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT; > + __vmwrite(SECONDARY_VM_EXEC_CONTROL, > + v->arch.hvm_vmx.secondary_exec_control); > + } else { > + printk("PVH: NO Secondary Exec control\n"); > + return -EINVAL;Ditto - should you de-allocate msr_bitmap ? Or if you are going to move the check for cpu_has_vmx_secondary_exec_control, then there is no need for this if (.. ) else ..> + } > + > + __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl); > + > + #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4 > + #define VM_ENTRY_LOAD_EFER 0x8000 > + vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS; > + vmentry_ctl &= ~VM_ENTRY_LOAD_EFER; > + vmentry_ctl &= ~VM_ENTRY_SMM; > + vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR; > + vmentry_ctl |= VM_ENTRY_IA32E_MODE; > + __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl); > +From here on, it looks mostly the same as construct_vmcs right? Perhaps you can add a comment saying so - so when a cleanup effort is done later on - these can be candidates for it?> + /* MSR intercepts. */ > + __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); > + __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); > + __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); > + > + /* Host data selectors. 
*/ > + __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS); > + __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS); > + __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS); > + __vmwrite(HOST_FS_SELECTOR, 0); > + __vmwrite(HOST_GS_SELECTOR, 0); > + __vmwrite(HOST_FS_BASE, 0); > + __vmwrite(HOST_GS_BASE, 0); > + > + vmx_set_host_env(v); > + > + /* Host control registers. */ > + v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; > + __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); > + __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0));That formatting looks odd.> + > + /* Host CS:RIP. */ > + __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); > + __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler); > + > + /* Host SYSENTER CS:RIP. */ > + rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs); > + __vmwrite(HOST_SYSENTER_CS, sysenter_cs); > + rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip); > + __vmwrite(HOST_SYSENTER_EIP, sysenter_eip); > + > + __vmwrite(VM_ENTRY_INTR_INFO, 0); > + > + __vmwrite(CR3_TARGET_COUNT, 0); > + > + __vmwrite(GUEST_ACTIVITY_STATE, 0); > + > + /* Set default guest context values here. Some of these are then overwritten > + * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */ > + __vmwrite(GUEST_CS_BASE, 0); > + __vmwrite(GUEST_CS_LIMIT, ~0u); > + __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */ > + __vmwrite(GUEST_CS_SELECTOR, 0x10);0x10. Could you use a #define for it? Somehow I thought it would be running in FLAT_KERNEL_CS but that would be odd. And of course since are booting in the Linux kernel without the PV MMU we would be using its native segments. So this would correspond to GDT_ENTRY_KERNEL_CS right? Might want to mention that so if the Linux kernel alters its GDT page we don''t blow up? Thought I guess it does not matter - this is just the initial bootstrap segments. Presumarily the load_gdt in the Linux kernel later on resets it to whatever the "new" GDT is.> + > + __vmwrite(GUEST_DS_BASE, 0); > + __vmwrite(GUEST_DS_LIMIT, ~0u); > + __vmwrite(GUEST_DS_AR_BYTES, 0xc093); > + __vmwrite(GUEST_DS_SELECTOR, 0x18); > + > + __vmwrite(GUEST_SS_BASE, 0); /* use same seg as DS */ > + __vmwrite(GUEST_SS_LIMIT, ~0u); > + __vmwrite(GUEST_SS_AR_BYTES, 0xc093); > + __vmwrite(GUEST_SS_SELECTOR, 0x18); > + > + __vmwrite(GUEST_ES_SELECTOR, 0); > + __vmwrite(GUEST_FS_SELECTOR, 0); > + __vmwrite(GUEST_GS_SELECTOR, 0); > + > + /* Guest segment bases. */ > + __vmwrite(GUEST_ES_BASE, 0); > + __vmwrite(GUEST_FS_BASE, 0); > + __vmwrite(GUEST_GS_BASE, 0); > + > + /* Guest segment limits. */ > + __vmwrite(GUEST_ES_LIMIT, ~0u); > + __vmwrite(GUEST_FS_LIMIT, ~0u); > + __vmwrite(GUEST_GS_LIMIT, ~0u); > + > + /* Guest segment AR bytes. */ > + __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */ > + __vmwrite(GUEST_FS_AR_BYTES, 0xc093); > + __vmwrite(GUEST_GS_AR_BYTES, 0xc093); > + > + /* Guest IDT. */ > + __vmwrite(GUEST_GDTR_BASE, 0); > + __vmwrite(GUEST_GDTR_LIMIT, 0); > + > + /* Guest LDT. */ > + __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */ > + __vmwrite(GUEST_LDTR_SELECTOR, 0); > + __vmwrite(GUEST_LDTR_BASE, 0); > + __vmwrite(GUEST_LDTR_LIMIT, 0); > + > + /* Guest TSS. */ > + __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */ > + __vmwrite(GUEST_TR_BASE, 0); > + __vmwrite(GUEST_TR_LIMIT, 0xff); > + > + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); > + __vmwrite(GUEST_DR7, 0); > + __vmwrite(VMCS_LINK_POINTER, ~0UL); > + > + __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); > + __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);Weird. 
In the vmcs.c file these are somewhat higher in the code.> + > + v->arch.hvm_vmx.exception_bitmap = > + HVM_TRAP_MASK | (1 << TRAP_debug) | > + (1U << TRAP_int3) | (1U << TRAP_no_device);Odd syntax.> + __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap); > + > + __vmwrite(TSC_OFFSET, 0);Hm, so you did earlier: v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING; so is this neccessary? Or is just that you want it to be set to default baseline state?> + > + /* Set WP bit so rdonly pages are not written from CPL 0 */ > + tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP; > + __vmwrite(GUEST_CR0, tmpval); > + __vmwrite(CR0_READ_SHADOW, tmpval); > + v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval; > + > + tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features); > + required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR; > + if ( (tmpval & required) != required ) > + { > + printk("PVH: required CR4 features not available:%lx\n", required); > + return -EINVAL;You might want to move that to the top of the code. Or if you want it here, then at least free the msr_bitmap> + } > + __vmwrite(GUEST_CR4, tmpval); > + __vmwrite(CR4_READ_SHADOW, tmpval); > + v->arch.hvm_vcpu.guest_cr[4] = tmpval; > + > + __vmwrite(CR0_GUEST_HOST_MASK, ~0UL); > + __vmwrite(CR4_GUEST_HOST_MASK, ~0UL); > + > + v->arch.hvm_vmx.vmx_realmode = 0; > + > + ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m)); > + __vmwrite(EPT_POINTER, ept_get_eptp(ept)); > + > + if ( cpu_has_vmx_pat ) > + { > + u64 host_pat, guest_pat; > + > + rdmsrl(MSR_IA32_CR_PAT, host_pat); > + guest_pat = MSR_IA32_CR_PAT_RESET; > + > + __vmwrite(HOST_PAT, host_pat); > + __vmwrite(GUEST_PAT, guest_pat); > + } > + return 0; > +} > + > static int construct_vmcs(struct vcpu *v) > { > struct domain *d = v->domain; > @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v) > > vmx_vmcs_enter(v); > > + if ( is_pvh_vcpu(v) ) { > + int rc = pvh_construct_vmcs(v); > + vmx_vmcs_exit(v);Do you need to call paging_update_paging_modes as construct_vmcs() does?> + return rc; > + } > + > /* VMCS controls. 
*/ > __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); > > @@ -1259,8 +1512,10 @@ void vmx_do_resume(struct vcpu *v) > > vmx_clear_vmcs(v); > vmx_load_vmcs(v); > - hvm_migrate_timers(v); > - hvm_migrate_pirqs(v); > + if ( !is_pvh_vcpu(v) ) { > + hvm_migrate_timers(v); > + hvm_migrate_pirqs(v); > + } > vmx_set_host_env(v); > /* > * Both n1 VMCS and n2 VMCS need to update the host environment after > @@ -1272,6 +1527,9 @@ void vmx_do_resume(struct vcpu *v) > hvm_asid_flush_vcpu(v); > } > > + if ( is_pvh_vcpu(v) ) > + reset_stack_and_jump(vmx_asm_do_vmentry); > + > debug_state = v->domain->debugger_attached > || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3] > || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP]; > @@ -1455,7 +1713,7 @@ static void vmcs_dump(unsigned char ch) > > for_each_domain ( d ) > { > - if ( !is_hvm_domain(d) ) > + if ( !is_hvm_or_pvh_domain(d) ) > continue; > printk("\n>>> Domain %d <<<\n", d->domain_id); > for_each_vcpu ( d, v ) > diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c > index e64980f..194c87b 100644 > --- a/xen/arch/x86/hvm/vmx/vmx.c > +++ b/xen/arch/x86/hvm/vmx/vmx.c > @@ -79,6 +79,9 @@ static int vmx_domain_initialise(struct domain *d) > { > int rc; > > + if ( is_pvh_domain(d) ) > + return 0; > + > if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) > return rc; > > @@ -87,6 +90,9 @@ static int vmx_domain_initialise(struct domain *d) > > static void vmx_domain_destroy(struct domain *d) > { > + if ( is_pvh_domain(d) ) > + return; > + > vmx_free_vlapic_mapping(d); > } > > @@ -110,6 +116,12 @@ static int vmx_vcpu_initialise(struct vcpu *v) > > vpmu_initialise(v); > > + if (is_pvh_vcpu(v) ) > + { > + /* this for hvm_long_mode_enabled(v) */ > + v->arch.hvm_vcpu.guest_efer = EFER_SCE | EFER_LMA | EFER_LME; > + return 0; > + } > vmx_install_vlapic_mapping(v); > > /* %eax == 1 signals full real-mode support to the guest loader. */ > @@ -1033,6 +1045,23 @@ static void vmx_update_host_cr3(struct vcpu *v) > vmx_vmcs_exit(v); > } > > +static void vmx_update_pvh_cr(struct vcpu *v, unsigned int cr) > +{ > + vmx_vmcs_enter(v); > + switch ( cr ) > + { > + case 3: > + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.guest_cr[3]); > + hvm_asid_flush_vcpu(v); > + break; > + > + default: > + printk("PVH: d%d v%d unexpected cr%d update at rip:%lx\n",You are missing the XENLOG_ERR part.> + v->domain->domain_id, v->vcpu_id, cr, __vmread(GUEST_RIP)); > + } > + vmx_vmcs_exit(v); > +} > + > void vmx_update_debug_state(struct vcpu *v) > { > unsigned long mask; > @@ -1052,6 +1081,11 @@ void vmx_update_debug_state(struct vcpu *v) > > static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) > { > + if ( is_pvh_vcpu(v) ) { > + vmx_update_pvh_cr(v, cr); > + return; > + } > + > vmx_vmcs_enter(v); > > switch ( cr ) > -- > 1.7.2.3 > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel >
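A possible shape for the restructuring Konrad asks for, with all
capability checks hoisted ahead of any allocation so the error paths
cannot leak msr_bitmap. The helper name pvh_check_requirements is
invented here for illustration; this is a sketch against the patch's
context, not a posted change:

    /* Sketch: fail early, before alloc_xenheap_page(), so no cleanup
     * is needed on any of the -EINVAL paths. */
    static int pvh_check_requirements(struct vcpu *v)
    {
        if ( !paging_mode_hap(v->domain) )
            return -EINVAL;    /* PVH needs HAP */
        if ( !cpu_has_vmx_msr_bitmap )
            return -EINVAL;
        if ( !cpu_has_vmx_vpid )
            return -EINVAL;
        if ( !cpu_has_vmx_secondary_exec_control )
            return -EINVAL;
        return 0;
    }

pvh_construct_vmcs() would then call this first and drop the per-site
else branches.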
Mukesh Rathor
2013-Mar-19 01:00 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
On Mon, 18 Mar 2013 11:28:43 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote:
> > This patch mainly contains code to create a VMCS for PVH guest, and
> > HVM specific vcpu/domain creation code.
> >
> > Changes in V2:
> >   - Avoid call to hvm_do_resume() at call site rather than return in it.
> >   - Return for PVH vmx_do_resume prior to intel debugger stuff.
> >
> > Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
> > ---
> >  xen/arch/x86/hvm/hvm.c      |   90 ++++++++++++++-
> >  xen/arch/x86/hvm/vmx/vmcs.c |  266 ++++++++++++++++++++++++++++++++++++++++++-
> >  xen/arch/x86/hvm/vmx/vmx.c  |   34 ++++++
> >  3 files changed, 383 insertions(+), 7 deletions(-)
> >
> > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> > index ea7adf6..18889ad 100644
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -510,6 +510,29 @@ static int hvm_print_line(
> >      return X86EMUL_OKAY;
> >  }
> >
> > +static int hvm_pvh_dom_initialise(struct domain *d)
> > +{
> > +    int rc;
> > +
> > +    if (!d->arch.hvm_domain.hap_enabled)
> > +        return -EINVAL;
> > +
> > +    spin_lock_init(&d->arch.hvm_domain.irq_lock);
> > +    hvm_init_guest_time(d);
> > +
> > +    hvm_init_cacheattr_region_list(d);
> > +
> > +    if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 )
> > +        goto fail1;    <=================================== GOTO
> > +
> > +    if ( (rc = hvm_funcs.domain_initialise(d)) == 0 )
> > +        return 0;
> > +
> > +fail1:
>
> I don't think you need the label here? You are not doing an goto.

Right above.

> > long))hvm_assert_evtchn_irq,
> > +                          (unsigned long)v );
> > +
> > +    v->arch.hvm_vcpu.hcall_64bit = 1;
> > +    v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN;
> > +    v->arch.user_regs.eflags = 2;
>
> So that sets the Reserved flag. Could you include a comment
> explaining why.. Ah, is it b/c we later on bit-shift it and use it to
> figure out whether IOPL needs to be virtualized in
> arch_set_info_guest? Or is it just b/c this function is based off
> hvm_vcpu_initialise? If so, since you are being called by it, can you
> skip it?

That reserved bit is required to be set for bootstrap. It is set in
other places too, e.g. arch_set_info_guest():

    v->arch.user_regs.eflags |= 2;

> > +    v->arch.hvm_vcpu.inject_trap.vector = -1;
> > +
> > +    if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) {
>
> The syntax here is off.

Hmm... the space surrounding "=" in rc=hvm*?

> > int hvm_vcpu_initialise(struct vcpu *v)
> > {
> >     int rc;
> >     struct domain *d = v->domain;
> > -    domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
> > +    domid_t dm_domid;
>
> Not sure I follow, why the move of it further down?

params is not defined/allocated for PVH.

> > +    /* VMCS controls. */
> > +    vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS;
> > +    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
> > +
> > +    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
> > +
> > +    /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op,
> > +     * then pv_vcpu.ctrlreg must be added to pvh struct */
>
> That would be the 'timer_mode' syntax in the guest config right?
> Perhaps then a check at the top of the function to see which
> timer_mode is used and exit out with -ENOSYS?

The vtsc setting. We set it to 0 for PVH guests.

> > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
> > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
> > +
> > +    v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
> > +                                      CPU_BASED_CR3_LOAD_EXITING |
> > +                                      CPU_BASED_CR3_STORE_EXITING);
> > +    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
>
> Is that b/c the PV code ends up making the SCHED_yield_op hypercall
> and we don't need the monitor/mwait capability? If so, could you add
> that comment in please?

No, MTF is a debugging feature, used mostly for single-stepping.

> > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type);
> > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type);
> > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type);
> > +    vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);
>
> So this looks like the one vmcs.c except that one has this extra:
>
> 895     if ( cpu_has_vmx_pat && paging_mode_hap(d) )
> 896         vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, MSR_TYPE_R | MSR_TYPE_W);
> 897 }
>
> Did you miss that?

I'll add it. I guess the default must be disabled.

> > +
> > +    /* pure hvm doesn't do this. safe? see: long_mode_do_msr_write() */
> > +#if 0
> > +    vmx_disable_intercept_for_msr(v, MSR_STAR);
> > +    vmx_disable_intercept_for_msr(v, MSR_LSTAR);
> > +    vmx_disable_intercept_for_msr(v, MSR_CSTAR);
> > +    vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
> > +#endif
>
> I would just provide a comment saying:
>
> /*
>  * long_mode_do_msr_write() takes care of
>  * MSR_[STAR|LSTAR|CSTAR|SYSCALL_MASK]
>  */

Good idea. I left the "#if 0" there for suggestion.

> > +    } else {
> > +        printk("PVH: CPU does NOT have msr bitmap\n");
>
> Perhaps:
>
>     printk(XENLOG_G_ERR "%s: ..\n", __func__);
> ?
>
> > +        return -EINVAL;
> > +    }
> > +
> > +    if ( !cpu_has_vmx_vpid ) {
> > +        printk("PVH: At present VPID support is required to run PVH\n");
>
> Should you de-allocate msr_bitmap at this point?
>
> Or perhaps move this check (and the one below) to the start of the
> function? So you have:
>
>     if ( !cpu_has_vmx_vpid )
>         gdprintk ("%s: VPID required for PVH mode.\n", __func__);
>
>     if ( !cpu_has_vmx_secondary_exec_control )
>         .. bla bla
>
> > +        return -EINVAL;
> > +    }
> > +
> > +    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
> > +
> > +    if ( cpu_has_vmx_secondary_exec_control ) {
> > +        v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */
> > +        v->arch.hvm_vmx.secondary_exec_control |=
> > +                                          SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> > +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
> > +
> > +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT;
> > +        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
> > +                  v->arch.hvm_vmx.secondary_exec_control);
> > +    } else {
> > +        printk("PVH: NO Secondary Exec control\n");
> > +        return -EINVAL;
>
> Ditto - should you de-allocate msr_bitmap ? Or if you are going to
> move the check for cpu_has_vmx_secondary_exec_control, then there is
> no need for this if (.. ) else ..
>
> > +    }
> > +
> > +    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
> > +
> > +    #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4
> > +    #define VM_ENTRY_LOAD_EFER 0x8000
> > +    vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS;
> > +    vmentry_ctl &= ~VM_ENTRY_LOAD_EFER;
> > +    vmentry_ctl &= ~VM_ENTRY_SMM;
> > +    vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR;
> > +    vmentry_ctl |= VM_ENTRY_IA32E_MODE;
> > +    __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
> > +
>
> From here on, it looks mostly the same as construct_vmcs right?
>
> Perhaps you can add a comment saying so - so when a cleanup effort
> is done later on - these can be candidates for it?
>
> > +    /* MSR intercepts. */
> > +    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
> > +    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
> > +    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
> > +
> > +    /* Host data selectors. */
> > +    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
> > +    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
> > +    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
> > +    __vmwrite(HOST_FS_SELECTOR, 0);
> > +    __vmwrite(HOST_GS_SELECTOR, 0);
> > +    __vmwrite(HOST_FS_BASE, 0);
> > +    __vmwrite(HOST_GS_BASE, 0);
> > +
> > +    vmx_set_host_env(v);
> > +
> > +    /* Host control registers. */
> > +    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
> > +    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> > +    __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0));
>
> That formatting looks odd.

Copied from hvm code. What's wrong?

> > +    /* Set default guest context values here. Some of these are then overwritten
> > +     * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */
> > +    __vmwrite(GUEST_CS_BASE, 0);
> > +    __vmwrite(GUEST_CS_LIMIT, ~0u);
> > +    __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */
> > +    __vmwrite(GUEST_CS_SELECTOR, 0x10);
>
> 0x10. Could you use a #define for it? Somehow I thought it would
> be running in FLAT_KERNEL_CS but that would be odd. And of course
> since are booting in the Linux kernel without the PV MMU we would
> be using its native segments. So this would correspond to
> GDT_ENTRY_KERNEL_CS right? Might want to mention that
> so if the Linux kernel alters its GDT page we don't blow up?
>
> Thought I guess it does not matter - this is just the initial
> bootstrap segments. Presumarily the load_gdt in the Linux kernel
> later on resets it to whatever the "new" GDT is.

Correct:

    #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)

And load_gdt loads a new GDT natively.

> > +    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
> > +    __vmwrite(GUEST_DR7, 0);
> > +    __vmwrite(VMCS_LINK_POINTER, ~0UL);
> > +
> > +    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
> > +    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
>
> Weird. In the vmcs.c file these are somewhat higher in the code.

Yes. I didn't just copy the existing function, but created a PVH
function to make it easier for PVH.

> > +
> > +    v->arch.hvm_vmx.exception_bitmap =
> > +                             HVM_TRAP_MASK | (1 << TRAP_debug) |
> > +                             (1U << TRAP_int3) | (1U << TRAP_no_device);
>
> Odd syntax.

Similar to existing hvm code. What's wrong?

> > +    __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap);
> > +
> > +    __vmwrite(TSC_OFFSET, 0);
>
> Hm, so you did earlier:
>
>     v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
>
> so is this neccessary? Or is just that you want it to be set
> to default baseline state?

Not necessary, doesn't hurt either. I can remove it.

> > +
> > +    /* Set WP bit so rdonly pages are not written from CPL 0 */
> > +    tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP;
> > +    __vmwrite(GUEST_CR0, tmpval);
> > +    __vmwrite(CR0_READ_SHADOW, tmpval);
> > +    v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval;
> > +
> > +    tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
> > +    required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR;
> > +    if ( (tmpval & required) != required )
> > +    {
> > +        printk("PVH: required CR4 features not available:%lx\n", required);
> > +        return -EINVAL;
>
> You might want to move that to the top of the code. Or if you want
> it here, then at least free the msr_bitmap

I think I'll just move all the checks to the top of the code.

> > {
> >     struct domain *d = v->domain;
> > @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v)
> >
> >     vmx_vmcs_enter(v);
> >
> > +    if ( is_pvh_vcpu(v) ) {
> > +        int rc = pvh_construct_vmcs(v);
> > +        vmx_vmcs_exit(v);
>
> Do you need to call paging_update_paging_modes as construct_vmcs()
> does?

Nope. We don't need to, as arch_set_info_guest() does it for PVH.

Thanks Konrad.

Mukesh
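On the eflags = 2 point: bit 1 of EFLAGS is the architecturally
reserved, must-be-one bit, so 2 is simply the canonical "empty flags"
value. A minimal standalone C illustration, not Xen code:

    #include <assert.h>

    #define EFLAGS_MBS 0x2u   /* reserved bit 1, always reads as 1 */

    int main(void)
    {
        unsigned long eflags = EFLAGS_MBS; /* bootstrap value, as in the patch */

        eflags |= EFLAGS_MBS;              /* arch_set_info_guest() equivalent */
        assert(eflags & EFLAGS_MBS);
        return 0;
    }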
Jan Beulich
2013-Mar-19 09:19 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
>>> On 19.03.13 at 02:00, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Mon, 18 Mar 2013 11:28:43 -0400
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
>> On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote:
>> > +    __vmwrite(GUEST_CS_SELECTOR, 0x10);
>>
>> 0x10. Could you use a #define for it? Somehow I thought it would
>> be running in FLAT_KERNEL_CS but that would be odd. And of course
>> since are booting in the Linux kernel without the PV MMU we would
>> be using its native segments. So this would correspond to
>> GDT_ENTRY_KERNEL_CS right? Might want to mention that
>> so if the Linux kernel alters its GDT page we don't blow up?
>>
>> Thought I guess it does not matter - this is just the initial
>> bootstrap segments. Presumarily the load_gdt in the Linux kernel
>> later on resets it to whatever the "new" GDT is.
>
> Correct:
>     #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)

What Linux does, or what selector values it uses, doesn't matter at all
here. Whatever value you want to put there, it needs an explanation if
it's not one of the Xen defined ones (in particular if the value was
arbitrarily chosen, as that would leave future readers completely
clueless).

Perhaps to some degree the same also goes for the descriptor attributes
that you store, albeit one can certainly reconstruct why the value was
chosen the way it is.

Jan
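A concrete form of the #define Jan is asking for might look like the
following; the names are invented here for illustration, and only the
0x10/0x18 values and the AR bytes come from the patch:

    /* Sketch: initial PVH boot-time selectors. These need only match the
     * GDT the guest is started on; the guest replaces them via load_gdt
     * once it installs its own GDT. */
    #define PVH_BOOT_CS 0x10   /* pairs with GUEST_CS_AR_BYTES 0xa09b */
    #define PVH_BOOT_DS 0x18   /* pairs with GUEST_DS_AR_BYTES 0xc093 */

    __vmwrite(GUEST_CS_SELECTOR, PVH_BOOT_CS);
    __vmwrite(GUEST_DS_SELECTOR, PVH_BOOT_DS);
    __vmwrite(GUEST_SS_SELECTOR, PVH_BOOT_DS);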
Konrad Rzeszutek Wilk
2013-Mar-19 13:23 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
On Mon, Mar 18, 2013 at 06:00:36PM -0700, Mukesh Rathor wrote:> On Mon, 18 Mar 2013 11:28:43 -0400 > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > > > On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote: > > > This patch mainly contains code to create a VMCS for PVH guest, and > > > HVM specific vcpu/domain creation code. > > > > > > Changes in V2: > > > - Avoid call to hvm_do_resume() at call site rather than return > > > in it. > > > - Return for PVH vmx_do_resume prior to intel debugger stuff. > > > > > > Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com> > > > --- > > > xen/arch/x86/hvm/hvm.c | 90 ++++++++++++++- > > > xen/arch/x86/hvm/vmx/vmcs.c | 266 > > > ++++++++++++++++++++++++++++++++++++++++++- > > > xen/arch/x86/hvm/vmx/vmx.c | 34 ++++++ 3 files changed, 383 > > > insertions(+), 7 deletions(-) > > > > > > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c > > > index ea7adf6..18889ad 100644 > > > --- a/xen/arch/x86/hvm/hvm.c > > > +++ b/xen/arch/x86/hvm/hvm.c > > > @@ -510,6 +510,29 @@ static int hvm_print_line( > > > return X86EMUL_OKAY; > > > } > > > > > > +static int hvm_pvh_dom_initialise(struct domain *d) > > > +{ > > > + int rc; > > > + > > > + if (!d->arch.hvm_domain.hap_enabled) > > > + return -EINVAL; > > > + > > > + spin_lock_init(&d->arch.hvm_domain.irq_lock); > > > + hvm_init_guest_time(d); > > > + > > > + hvm_init_cacheattr_region_list(d); > > > + > > > + if ( (rc=paging_enable(d, > > > PG_refcounts|PG_translate|PG_external)) != 0 ) > > > + goto fail1; <=================================== GOTO > > > + > > > + if ( (rc = hvm_funcs.domain_initialise(d)) == 0 ) > > > + return 0; > > > + > > > +fail1: > > > > I don''t think you need the label here? You are not doing an goto. > > Right above.Duh!> > > > long))hvm_assert_evtchn_irq, > > > + (unsigned long)v ); > > > + > > > + v->arch.hvm_vcpu.hcall_64bit = 1; > > > + v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN; > > > + v->arch.user_regs.eflags = 2; > > > > So that sets the Reserved flag. Could you include a comment > > explaining why.. Ah, is it b/c we later on bit-shift it and use it to > > figure out whether IOPL needs to be virtualized in > > arch_set_info_guest? Or is it just b/c this function is based off > > hvm_vcpu_initialise? If so, since you are being called by it, can you > > skip it? > > That resvd bit is required to be set for bootstrap. Set in other places > also, like arch_set_info_guest(): > > v->arch.user_regs.eflags |= 2;OK, but why? I am not seeing that bit defined? Or was it needed to get the machine to boot up.> > > > > > + v->arch.hvm_vcpu.inject_trap.vector = -1; > > > + > > > + if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) { > > > > The syntax here is off. > > Hmm... space surrounding "=" in rc=hvm* ?Yes, like this: 385 if ( (rc = vcpu_init_fpu(v)) != 0 )> > > > int hvm_vcpu_initialise(struct vcpu *v) > > > { > > > int rc; > > > struct domain *d = v->domain; > > > - domid_t dm_domid > > > d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; > > > + domid_t dm_domid; > > > > Not sure I follow, why the move of it further down? > > params is not defined/allocated for PVH.But you are still using it in the code. As in, you are still fetching it (just later).> > > > + /* VMCS controls. 
> > > +    v->arch.hvm_vcpu.inject_trap.vector = -1;
> > > +
> > > +    if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) {
> >
> > The syntax here is off.
>
> Hmm... space surrounding "=" in rc=hvm* ?

Yes, like this:

    385         if ( (rc = vcpu_init_fpu(v)) != 0 )

> > >  int hvm_vcpu_initialise(struct vcpu *v)
> > >  {
> > >      int rc;
> > >      struct domain *d = v->domain;
> > > -    domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
> > > +    domid_t dm_domid;
> >
> > Not sure I follow, why the move of it further down?
>
> params is not defined/allocated for PVH.

But you are still using it in the code. As in, you are still
fetching it (just later).

> > > +    /* VMCS controls. */
> > > +    vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS;
> > > +    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
> > > +
> > > +    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
> > > +
> > > +    /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op,
> > > +     * then pv_vcpu.ctrlreg must be added to pvh struct */
> >
> > That would be the 'timer_mode' syntax in the guest config right?
> > Perhaps then a check at the top of the function to see which
> > timer_mode is used and exit out with -ENOSYS?
>
> The vtsc setting. We set it to 0 for PVH guests.
>

OK, so that is the 'timer_mode' always set to '2' or rather
timer_mode="native" (in the guest config). Which then does
the xc_domain_set_tsc_info hypercall to set the vtsc.
You need to document that please.

> > > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
> > > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
> > > +
> > > +    v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
> > > +                                      CPU_BASED_CR3_LOAD_EXITING |
> > > +                                      CPU_BASED_CR3_STORE_EXITING);
> > > +    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> > > +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
> >
> > Is that b/c the PV code ends up making the SCHED_yield_op hypercall
> > and we don't need the monitor/mwait capability? If so, could you add
> > that comment in please?
>
> No, MTF is a debugging feature used mostly for single-step.

> > > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type);
> > > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type);
> > > +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type);
> > > +    vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);
> >
> > So this looks like the one in vmcs.c except that one has this extra:
> >
> >  895         if ( cpu_has_vmx_pat && paging_mode_hap(d) )
> >  896             vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT,
> >                      MSR_TYPE_R | MSR_TYPE_W);
> >  897     }
> >
> > Did you miss that?
>
> I'll add it. I guess default must be disabled.

> > > +
> > > +    /* pure hvm doesn't do this. safe? see: long_mode_do_msr_write() */
> > > +#if 0
> > > +    vmx_disable_intercept_for_msr(v, MSR_STAR);
> > > +    vmx_disable_intercept_for_msr(v, MSR_LSTAR);
> > > +    vmx_disable_intercept_for_msr(v, MSR_CSTAR);
> > > +    vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
> > > +#endif
> >
> > I would just provide a comment saying:
> >
> > /*
> >  * long_mode_do_msr_write() takes care of MSR_[STAR|LSTAR|CSTAR|SYSCALL_MASK]
> >  */
>
> Good idea. I left the "#if 0" there for suggestion.

> > > +    } else {
> > > +        printk("PVH: CPU does NOT have msr bitmap\n");
> >
> > Perhaps:
> >
> >     printk(XENLOG_G_ERR "%s: ..\n", __func__);
> > ?
> > > +        return -EINVAL;
> > > +    }
> > > +
> > > +    if ( !cpu_has_vmx_vpid ) {
> > > +        printk("PVH: At present VPID support is required to run PVH\n");
> >
> > Should you de-allocate msr_bitmap at this point?
> >
> > Or perhaps move this check (and the one below) to the start of the
> > function? So you have:
> >
> >     if ( !cpu_has_vmx_vpid )
> >         gdprintk ("%s: VPID required for PVH mode.\n", __func__);
> >
> >     if ( !cpu_has_vmx_secondary_exec_control )
> >         .. bla bla
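Folding those review points together, a minimal sketch of the hoisted
checks; pvh_check_requirements() is a hypothetical helper name and the
message strings are illustrative:

    /* Sketch: do all capability checks up front, before msr_bitmap (or
     * anything else) is allocated, so the error paths free nothing. */
    static int pvh_check_requirements(struct vcpu *v)
    {
        if ( !cpu_has_vmx_msr_bitmap )
        {
            printk(XENLOG_G_ERR "%s: MSR bitmap required for PVH\n", __func__);
            return -EINVAL;
        }

        if ( !cpu_has_vmx_vpid )
        {
            printk(XENLOG_G_ERR "%s: VPID required for PVH\n", __func__);
            return -EINVAL;
        }

        if ( !cpu_has_vmx_secondary_exec_control )
        {
            printk(XENLOG_G_ERR "%s: secondary exec controls required for PVH\n",
                   __func__);
            return -EINVAL;
        }

        return 0;
    }

Called first thing from pvh_construct_vmcs(), this removes the later
-EINVAL branches and the msr_bitmap leak they imply.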
> > > +        return -EINVAL;
> > > +    }
> > > +
> > > +    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
> > > +
> > > +    if ( cpu_has_vmx_secondary_exec_control ) {
> > > +        v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */
> > > +        v->arch.hvm_vmx.secondary_exec_control |=
> > > +                                          SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> > > +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
> > > +
> > > +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT;
> > > +        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
> > > +                  v->arch.hvm_vmx.secondary_exec_control);
> > > +    } else {
> > > +        printk("PVH: NO Secondary Exec control\n");
> > > +        return -EINVAL;
> >
> > Ditto - should you de-allocate msr_bitmap? Or if you are going to
> > move the check for cpu_has_vmx_secondary_exec_control, then there is
> > no need for this if (..) else ..
> >
> > > +    }
> > > +
> > > +    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
> > > +
> > > +    #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4
> > > +    #define VM_ENTRY_LOAD_EFER 0x8000
> > > +    vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS;
> > > +    vmentry_ctl &= ~VM_ENTRY_LOAD_EFER;
> > > +    vmentry_ctl &= ~VM_ENTRY_SMM;
> > > +    vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR;
> > > +    vmentry_ctl |= VM_ENTRY_IA32E_MODE;
> > > +    __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
> > > +
> >
> > From here on, it looks mostly the same as construct_vmcs right?
> >
> > Perhaps you can add a comment saying so - so when a cleanup effort
> > is done later on - these can be candidates for it?
> >
> > > +    /* MSR intercepts. */
> > > +    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
> > > +    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
> > > +    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
> > > +
> > > +    /* Host data selectors. */
> > > +    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
> > > +    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
> > > +    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
> > > +    __vmwrite(HOST_FS_SELECTOR, 0);
> > > +    __vmwrite(HOST_GS_SELECTOR, 0);
> > > +    __vmwrite(HOST_FS_BASE, 0);
> > > +    __vmwrite(HOST_GS_BASE, 0);
> > > +
> > > +    vmx_set_host_env(v);
> > > +
> > > +    /* Host control registers. */
> > > +    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
> > > +    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> > > +    __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0));
> >
> > That formatting looks odd.
>
> Copied from hvm code. whats wrong?

The HVM code has some extra spaces.

> > > +    /* Set default guest context values here. Some of these are then overwritten
> > > +     * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */
> > > +    __vmwrite(GUEST_CS_BASE, 0);
> > > +    __vmwrite(GUEST_CS_LIMIT, ~0u);
> > > +    __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */
> > > +    __vmwrite(GUEST_CS_SELECTOR, 0x10);
> >
> > 0x10. Could you use a #define for it? Somehow I thought it would
> > be running in FLAT_KERNEL_CS but that would be odd. And of course
> > since we are booting the Linux kernel without the PV MMU we would
> > be using its native segments. So this would correspond to
> > GDT_ENTRY_KERNEL_CS, right? Might want to mention that
> > so if the Linux kernel alters its GDT page we don't blow up.
> >
> > Though I guess it does not matter - this is just the initial
> > bootstrap segment. Presumably the load_gdt in the Linux kernel
> > later on resets it to whatever the "new" GDT is.
>
> Correct:
> #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
>
> And load_gdt loads a new GDT natively.
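As for the descriptor attributes Jan mentions elsewhere in the thread,
the 0xa09b AR-bytes value can be reconstructed from the VMX
access-rights encoding; a sketch of the breakdown (field names per the
SDM, values verified to sum to 0xa09b):

    /*
     * GUEST_CS_AR_BYTES = 0xa09b decomposes as:
     *
     *   0x000b  type 11: execute/read code segment, accessed
     *   0x0010  S   = 1: code/data (non-system) segment
     *   0x0000  DPL = 0
     *   0x0080  P   = 1: present
     *   0x2000  L   = 1: 64-bit code (hence the "CS.L == 1" comment);
     *                    D/B (0x4000) must then be 0
     *   0x8000  G   = 1: 4K granularity
     *
     * 0xb | 0x10 | 0x80 | 0x2000 | 0x8000 == 0xa09b
     */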
> > > +    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
> > > +    __vmwrite(GUEST_DR7, 0);
> > > +    __vmwrite(VMCS_LINK_POINTER, ~0UL);
> > > +
> > > +    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
> > > +    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
> >
> > Weird. In the vmcs.c file these are somewhat higher in the code.
>
> Yes. I just didn't copy the existing function, but created a PVH function
> to make it easier for PVH.

Sure, but the next step (not these patches) will be to collapse some of
the redundant logic.

> > > +
> > > +    v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK | (1 << TRAP_debug) |
> > > +                                       (1U << TRAP_int3) | (1U << TRAP_no_device);
> >
> > Odd syntax.
>
> Similar to existing hvm code, whats wrong?

Tabs. The HVM code looks different (I think it uses spaces?).

> > > +    __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap);
> > > +
> > > +    __vmwrite(TSC_OFFSET, 0);
> >
> > Hm, so you did earlier:
> >
> >     v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
> >
> > so is this necessary? Or is it just that you want it to be set
> > to the default baseline state?
>
> Not necessary, doesn't hurt either. I can remove it.
>
> > > +
> > > +    /* Set WP bit so rdonly pages are not written from CPL 0 */
> > > +    tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP;
> > > +    __vmwrite(GUEST_CR0, tmpval);
> > > +    __vmwrite(CR0_READ_SHADOW, tmpval);
> > > +    v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval;
> > > +
> > > +    tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
> > > +    required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR;
> > > +    if ( (tmpval & required) != required )
> > > +    {
> > > +        printk("PVH: required CR4 features not available:%lx\n", required);
> > > +        return -EINVAL;
> >
> > You might want to move that to the top of the code. Or if you want
> > it here, then at least free the msr_bitmap.
>
> I think I'll just move all the checks to the top of the code.
>
> > >  {
> > >      struct domain *d = v->domain;
> > > @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v)
> > >
> > >      vmx_vmcs_enter(v);
> > >
> > > +    if ( is_pvh_vcpu(v) ) {
> > > +        int rc = pvh_construct_vmcs(v);
> > > +        vmx_vmcs_exit(v);
> >
> > Do you need to call paging_update_paging_modes as construct_vmcs()
> > does?
>
> Nope. We don't need to as arch_set_info_guest() does it for PVH.

OK, could you provide a comment please? That way when one looks at this
code and the HVM one it will be clear.

>
> Thanks Konrad.

Of course. Thank you for bearing with this tiring review process.

> Mukesh
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
>
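Regarding the comment requested for the construct_vmcs() hunk above, a
minimal sketch of the wording (the code is as quoted; only the comment
is new):

        if ( is_pvh_vcpu(v) )
        {
            int rc = pvh_construct_vmcs(v);
            vmx_vmcs_exit(v);
            /*
             * Unlike the HVM path below, no paging_update_paging_modes()
             * here: for PVH it is done by arch_set_info_guest() when the
             * vcpu context is first loaded.
             */
            return rc;
        }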
Mukesh Rathor
2013-Mar-26 22:30 UTC
Re: [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization
On Tue, 19 Mar 2013 09:23:30 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Mon, Mar 18, 2013 at 06:00:36PM -0700, Mukesh Rathor wrote:
> > On Mon, 18 Mar 2013 11:28:43 -0400
> > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> >
> > > So that sets the Reserved flag. Could you include a comment
> > > explaining why.. Ah, is it b/c we later on bit-shift it and use
> > > it to figure out whether IOPL needs to be virtualized in
> > > arch_set_info_guest? Or is it just b/c this function is based off
> > > hvm_vcpu_initialise? If so, since you are being called by it, can
> > > you skip it?
> >
> > That resvd bit is required to be set for bootstrap. Set in other
> > places also, like arch_set_info_guest():
> >
> >     v->arch.user_regs.eflags |= 2;
>
> OK, but why? I am not seeing that bit defined? Or was it needed to get
> the machine to boot up.

It's needed to get the machine to boot up. That's the EFLAGS default in
the CPU upon reset or power up. It's a resvd bit.

> > > Not sure I follow, why the move of it further down?
> >
> > params is not defined/allocated for PVH.
>
> But you are still using it in the code. As in, you are still
> fetching it (just later).

No, not really. The function returns early for PVH before params is
used. params is NULL for PVH:

 int hvm_vcpu_initialise(struct vcpu *v)
 {
     int rc;
     struct domain *d = v->domain;
-    domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
+    domid_t dm_domid;

     hvm_asid_flush_vcpu(v);

+    if ( is_pvh_vcpu(v) )
+        return hvm_pvh_vcpu_initialise(v);   <==============
     ...........

> > > > +    /* VMCS controls. */
> > > > +    vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS;
> > > > +    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
> > > > +
> > > > +    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
> > > > +
> > > > +    /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op,
> > > > +     * then pv_vcpu.ctrlreg must be added to pvh struct */
> > >
> > > That would be the 'timer_mode' syntax in the guest config right?
> > > Perhaps then a check at the top of the function to see which
> > > timer_mode is used and exit out with -ENOSYS?
> >
> > The vtsc setting. We set it to 0 for PVH guests.
> >
>
> OK, so that is the 'timer_mode' always set to '2' or rather
> timer_mode="native" (in the guest config). Which then does
> the xc_domain_set_tsc_info hypercall to set the vtsc.
> You need to document that please.

Right. I set vtsc to 0 in tsc_set_info(), as suggested in prior review,
but maybe I should print a warning there that it's defaulted to that. I
also had put in the cover letter to expand this in Phase II or III or
IV, btw. I'll document this in tsc_set_info() also.

thanks,
Mukesh
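A minimal sketch of the warning Mukesh proposes for tsc_set_info(),
assuming the 4.3-era tsc_mode handling and the TSC_MODE_NEVER_EMULATE
constant (which corresponds to tsc_mode="native"); the message text is
illustrative:

    /* In tsc_set_info(): PVH supports only a native, non-emulated TSC. */
    if ( is_pvh_domain(d) && tsc_mode != TSC_MODE_NEVER_EMULATE )
    {
        printk(XENLOG_WARNING
               "PVH currently does not support TSC emulation. Setting "
               "timer_mode = native\n");
        tsc_mode = TSC_MODE_NEVER_EMULATE;
    }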