This patch series enables the Last Branch Recording feature for the
guest. Instead of trapping each LBR stack MSR access, the MSRs are
passed through to the guest. Those MSRs are switched (i.e. saved and
loaded) on VMExit and VMEntry.

Test:
Try "perf record -b ./test_program" on the guest.

Wei Wang (4):
  KVM/vmx: re-write the msr auto switch feature
  KVM/vmx: auto switch MSR_IA32_DEBUGCTLMSR
  perf/x86: add a function to get the lbr stack
  KVM/vmx: enable lbr for the guest

 arch/x86/events/intel/lbr.c       |  23 +++++++
 arch/x86/include/asm/perf_event.h |  14 ++++
 arch/x86/kvm/vmx.c                | 135 +++++++++++++++++++++++++++++++++-----
 3 files changed, 154 insertions(+), 18 deletions(-)

--
2.7.4
Wei Wang
2017-Sep-25 04:44 UTC
[PATCH v1 1/4] KVM/vmx: re-write the msr auto switch feature
This patch clarifies a vague statement in the SDM: the recommended maximum
number of MSRs that can be automatically switched by the CPU during VMExit
and VMEntry is 512, rather than 512 bytes of MSRs. Depending on the CPU
implementation, more than 512 MSRs may be supported for auto switching;
the limit can be calculated as (MSR_IA32_VMX_MISC[27:25] + 1) * 512.
Accordingly, replace the fixed 8-entry (NR_AUTOLOAD_MSRS) auto switch
arrays with dynamically allocated arrays sized by this limit.

Signed-off-by: Wei Wang <wei.w.wang at intel.com>
---
 arch/x86/kvm/vmx.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0726ca7..8434fc8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -158,6 +158,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 #define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
 #define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
 		INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
+#define KVM_VMX_DEFAULT_MSR_AUTO_LOAD_COUNT 512
 
 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
 module_param(ple_gap, int, S_IRUGO);
@@ -178,9 +179,10 @@ static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, int, S_IRUGO);
 
+static int msr_autoload_count_max = KVM_VMX_DEFAULT_MSR_AUTO_LOAD_COUNT;
+
 extern const ulong vmx_return;
 
-#define NR_AUTOLOAD_MSRS 8
 #define VMCS02_POOL_SIZE 1
 
 struct vmcs {
@@ -588,8 +590,8 @@ struct vcpu_vmx {
 	bool __launched; /* temporary, used in vmx_vcpu_run */
 	struct msr_autoload {
 		unsigned nr;
-		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
-		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+		struct vmx_msr_entry *guest;
+		struct vmx_msr_entry *host;
 	} msr_autoload;
 	struct {
 		int loaded;
@@ -1942,6 +1944,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 	m->host[i] = m->host[m->nr];
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, m->nr);
 }
 
 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -1997,7 +2000,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 		if (m->guest[i].index == msr)
 			break;
 
-	if (i == NR_AUTOLOAD_MSRS) {
+	if (i == msr_autoload_count_max) {
 		printk_once(KERN_WARNING "Not enough msr switch entries. "
 				"Can't add msr %x\n", msr);
 		return;
@@ -2005,6 +2008,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 		++m->nr;
 		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
 		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+		vmcs_write32(VM_EXIT_MSR_STORE_COUNT, m->nr);
 	}
 
 	m->guest[i].index = msr;
@@ -5501,6 +5505,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autoload.guest));
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
@@ -6670,6 +6675,21 @@ static void update_ple_window_actual_max(void)
 					ple_window_grow, INT_MIN);
 }
 
+static void update_msr_autoload_count_max(void)
+{
+	u64 vmx_msr;
+	int n;
+
+	/*
+	 * According to the Intel SDM, if Bits 27:25 of MSR_IA32_VMX_MISC is
+	 * n, then (n + 1) * 512 is the recommended max number of MSRs to be
+	 * included in the VMExit and VMEntry MSR auto switch list.
+	 */
+	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+	n = ((vmx_msr & 0xe000000) >> 25) + 1;
+	msr_autoload_count_max = n * KVM_VMX_DEFAULT_MSR_AUTO_LOAD_COUNT;
+}
+
 /*
  * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
  */
@@ -6837,6 +6857,7 @@ static __init int hardware_setup(void)
 		kvm_disable_tdp();
 
 	update_ple_window_actual_max();
+	update_msr_autoload_count_max();
 
 	/*
 	 * Only enable PML when hardware supports PML feature, and both EPT
@@ -9248,6 +9269,19 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
 	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
 }
 
+/*
+ * Currently, the CPU does not support the auto save of MSRs on VMEntry, so we
+ * save the MSRs for the host before entering the guest.
+ */
+static void vmx_save_host_msrs(struct msr_autoload *m)
+{
+	u32 i;
+
+	for (i = 0; i < m->nr; i++)
+		m->host[i].value = __rdmsr(m->host[i].index);
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -9304,6 +9338,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx_arm_hv_timer(vcpu);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
+
+	vmx_save_host_msrs(&vmx->msr_autoload);
 	asm(
 		/* Store host registers */
 		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9504,6 +9540,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	size_t bytes = msr_autoload_count_max * sizeof(struct vmx_msr_entry);
 
 	if (enable_pml)
 		vmx_destroy_pml_buffer(vmx);
@@ -9512,15 +9549,17 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	vmx_free_vcpu_nested(vcpu);
 	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
+	free_pages_exact(vmx->msr_autoload.host, bytes);
+	free_pages_exact(vmx->msr_autoload.guest, bytes);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
-	int err;
+	int err, cpu;
+	size_t bytes;
 	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-	int cpu;
 
 	if (!vmx)
 		return ERR_PTR(-ENOMEM);
@@ -9559,6 +9598,17 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		goto free_msrs;
 	loaded_vmcs_init(vmx->loaded_vmcs);
 
+	bytes = msr_autoload_count_max * sizeof(struct vmx_msr_entry);
+	vmx->msr_autoload.guest = alloc_pages_exact(bytes,
+						    GFP_KERNEL | __GFP_ZERO);
+	if (!vmx->msr_autoload.guest)
+		goto free_vmcs;
+
+	vmx->msr_autoload.host = alloc_pages_exact(bytes,
+						   GFP_KERNEL | __GFP_ZERO);
+	if (!vmx->msr_autoload.host)
+		goto free_autoload_guest;
+
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx->vcpu.cpu = cpu;
@@ -9566,11 +9616,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	vmx_vcpu_put(&vmx->vcpu);
 	put_cpu();
 	if (err)
-		goto free_vmcs;
+		goto free_autoload_host;
 	if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
 		err = alloc_apic_access_page(kvm);
 		if (err)
-			goto free_vmcs;
+			goto free_autoload_host;
 	}
 
@@ -9579,7 +9629,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			VMX_EPT_IDENTITY_PAGETABLE_ADDR;
 		err = init_rmode_identity_map(kvm);
 		if (err)
-			goto free_vmcs;
+			goto free_autoload_host;
 	}
 
 	if (nested) {
@@ -9594,6 +9644,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	return &vmx->vcpu;
 
+free_autoload_host:
+	free_pages_exact(vmx->msr_autoload.host, bytes);
+free_autoload_guest:
+	free_pages_exact(vmx->msr_autoload.guest, bytes);
 free_vmcs:
 	free_vpid(vmx->nested.vpid02);
 	free_loaded_vmcs(vmx->loaded_vmcs);
--
2.7.4
Passthrough the MSR_IA32_DEBUGCTLMSR to the guest, and take advantage of
the hardware VT-x feature to auto switch the msr upon VMExit and VMEntry.

Signed-off-by: Wei Wang <wei.w.wang at intel.com>
---
 arch/x86/kvm/vmx.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8434fc8..5f5c2f1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5502,13 +5502,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	if (cpu_has_vmx_vmfunc())
 		vmcs_write64(VM_FUNCTION_CONTROL, 0);
 
-	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autoload.guest));
-	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
+	add_atomic_switch_msr(vmx, MSR_IA32_DEBUGCTLMSR, 0, 0);
+
 	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 
@@ -6821,6 +6820,7 @@ static __init int hardware_setup(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_DEBUGCTLMSR, false);
 
 	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -9285,7 +9285,7 @@ static void vmx_save_host_msrs(struct msr_autoload *m)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long debugctlmsr, cr3, cr4;
+	unsigned long cr3, cr4;
 
 	/* Don't enter VMX if guest state is invalid, let the exit handler
 	   start emulation until we arrive back to a valid state */
@@ -9333,7 +9333,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		__write_pkru(vcpu->arch.pkru);
 
 	atomic_switch_perf_msrs(vmx);
-	debugctlmsr = get_debugctlmsr();
 
 	vmx_arm_hv_timer(vcpu);
 
@@ -9445,10 +9444,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 	      );
 
-	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
-	if (debugctlmsr)
-		update_debugctlmsr(debugctlmsr);
-
 #ifndef CONFIG_X86_64
 	/*
 	 * The sysexit path does not restore ds/es, so we must set them to
--
2.7.4
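For context on why this MSR matters to the series: the guest's perf code
turns LBR recording on and off through bit 0 (LBR) of IA32_DEBUGCTL, and
the hardware zeroes IA32_DEBUGCTL on VMExit, which is why the value has to
be auto-switched. Below is a minimal, illustrative guest-side sketch; it
is not code from this series, and guest_enable_lbr() is a hypothetical
helper.

#include <asm/msr.h>		/* rdmsrl/wrmsrl, MSR_IA32_DEBUGCTLMSR */
#include <asm/msr-index.h>	/* DEBUGCTLMSR_LBR */

/*
 * Illustrative guest-side enable: set IA32_DEBUGCTL.LBR (bit 0).  With the
 * MSR passed through, this write goes straight to hardware instead of
 * trapping to KVM.
 */
static void guest_enable_lbr(void)
{
	u64 debugctl;

	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | DEBUGCTLMSR_LBR);
}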
Wei Wang
2017-Sep-25 04:44 UTC
[PATCH v1 3/4] perf/x86: add a function to get the lbr stack
The LBR stack MSRs are architecture specific. The perf subsystem has
already assigned the abstracted MSR values based on the CPU architecture.

This patch enables a caller outside the perf subsystem to get the LBR
stack info. This is useful for hypervisors to prepare the LBR feature
for the guest.

Signed-off-by: Wei Wang <wei.w.wang at intel.com>
---
 arch/x86/events/intel/lbr.c       | 23 +++++++++++++++++++++++
 arch/x86/include/asm/perf_event.h | 14 ++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8a6bbac..ea547ec 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1230,3 +1230,26 @@ void intel_pmu_lbr_init_knl(void)
 	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
 	x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
 }
+
+/**
+ * perf_get_lbr_stack - get the lbr stack related MSRs
+ *
+ * @stack: the caller's memory to get the lbr stack
+ *
+ * Returns: 0 indicates that the lbr stack has been successfully obtained.
+ */
+int perf_get_lbr_stack(struct perf_lbr_stack *stack)
+{
+	stack->lbr_nr = x86_pmu.lbr_nr;
+	stack->lbr_tos = x86_pmu.lbr_tos;
+	stack->lbr_from = x86_pmu.lbr_from;
+	stack->lbr_to = x86_pmu.lbr_to;
+
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+		stack->lbr_info = MSR_LBR_INFO_0;
+	else
+		stack->lbr_info = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_get_lbr_stack);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f353061..c098462 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -266,7 +266,16 @@ struct perf_guest_switch_msr {
 	u64 host, guest;
 };
 
+struct perf_lbr_stack {
+	int		lbr_nr;
+	unsigned long	lbr_tos;
+	unsigned long	lbr_from;
+	unsigned long	lbr_to;
+	unsigned long	lbr_info;
+};
+
 extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+extern int perf_get_lbr_stack(struct perf_lbr_stack *stack);
 extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
 extern void perf_check_microcode(void);
 #else
@@ -276,6 +285,11 @@ static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
 	return NULL;
 }
 
+static inline int perf_get_lbr_stack(struct perf_lbr_stack *stack)
+{
+	return -1;
+}
+
 static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 {
 	memset(cap, 0, sizeof(*cap));
--
2.7.4
Passthrough the LBR stack to the guest, and auto switch the stack MSRs
upon VMEntry and VMExit.

Signed-off-by: Wei Wang <wei.w.wang at intel.com>
---
 arch/x86/kvm/vmx.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5f5c2f1..35e02a7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -107,6 +107,9 @@ static u64 __read_mostly host_xss;
 static bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
+static bool __read_mostly enable_lbrv = 1;
+module_param_named(lbrv, enable_lbrv, bool, 0444);
+
 #define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL
 
 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
@@ -5428,6 +5431,25 @@ static void ept_set_mmio_spte_mask(void)
 				   VMX_EPT_MISCONFIG_WX_VALUE);
 }
 
+static void auto_switch_lbr_msrs(struct vcpu_vmx *vmx)
+{
+	int i;
+	struct perf_lbr_stack lbr_stack;
+
+	perf_get_lbr_stack(&lbr_stack);
+
+	add_atomic_switch_msr(vmx, MSR_LBR_SELECT, 0, 0);
+	add_atomic_switch_msr(vmx, lbr_stack.lbr_tos, 0, 0);
+
+	for (i = 0; i < lbr_stack.lbr_nr; i++) {
+		add_atomic_switch_msr(vmx, lbr_stack.lbr_from + i, 0, 0);
+		add_atomic_switch_msr(vmx, lbr_stack.lbr_to + i, 0, 0);
+		if (lbr_stack.lbr_info)
+			add_atomic_switch_msr(vmx, lbr_stack.lbr_info + i, 0,
+					      0);
+	}
+}
+
 #define VMX_XSS_EXIT_BITMAP 0
 /*
  * Sets up the vmcs for emulated real mode.
@@ -5508,6 +5530,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	add_atomic_switch_msr(vmx, MSR_IA32_DEBUGCTLMSR, 0, 0);
 
+	if (enable_lbrv)
+		auto_switch_lbr_msrs(vmx);
+
 	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 
@@ -6721,6 +6746,28 @@ void vmx_enable_tdp(void)
 	kvm_enable_tdp();
 }
 
+static void vmx_passthrough_lbr_msrs(void)
+{
+	int i;
+	struct perf_lbr_stack lbr_stack;
+
+	if (perf_get_lbr_stack(&lbr_stack) < 0) {
+		enable_lbrv = false;
+		return;
+	}
+
+	vmx_disable_intercept_for_msr(MSR_LBR_SELECT, false);
+	vmx_disable_intercept_for_msr(lbr_stack.lbr_tos, false);
+
+	for (i = 0; i < lbr_stack.lbr_nr; i++) {
+		vmx_disable_intercept_for_msr(lbr_stack.lbr_from + i, false);
+		vmx_disable_intercept_for_msr(lbr_stack.lbr_to + i, false);
+		if (lbr_stack.lbr_info)
+			vmx_disable_intercept_for_msr(lbr_stack.lbr_info + i,
+						      false);
+	}
+}
+
 static __init int hardware_setup(void)
 {
 	int r = -ENOMEM, i, msr;
@@ -6822,6 +6869,9 @@ static __init int hardware_setup(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_DEBUGCTLMSR, false);
 
+	if (enable_lbrv)
+		vmx_passthrough_lbr_msrs();
+
 	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
 	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
--
2.7.4
On 25/09/2017 06:44, Wei Wang wrote:
> Passthrough the LBR stack to the guest, and auto switch the stack MSRs
> upon VMEntry and VMExit.
>
> Signed-off-by: Wei Wang <wei.w.wang at intel.com>

This has to be enabled separately for each guest, because it may prevent
live migration to hosts with a different family/model.

Paolo

> ---
>  arch/x86/kvm/vmx.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 50 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 5f5c2f1..35e02a7 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -107,6 +107,9 @@ static u64 __read_mostly host_xss;
>  static bool __read_mostly enable_pml = 1;
>  module_param_named(pml, enable_pml, bool, S_IRUGO);
>
> +static bool __read_mostly enable_lbrv = 1;
> +module_param_named(lbrv, enable_lbrv, bool, 0444);
> +
>  #define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL
>
>  /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
> @@ -5428,6 +5431,25 @@ static void ept_set_mmio_spte_mask(void)
>  				   VMX_EPT_MISCONFIG_WX_VALUE);
>  }
>
> +static void auto_switch_lbr_msrs(struct vcpu_vmx *vmx)
> +{
> +	int i;
> +	struct perf_lbr_stack lbr_stack;
> +
> +	perf_get_lbr_stack(&lbr_stack);
> +
> +	add_atomic_switch_msr(vmx, MSR_LBR_SELECT, 0, 0);
> +	add_atomic_switch_msr(vmx, lbr_stack.lbr_tos, 0, 0);
> +
> +	for (i = 0; i < lbr_stack.lbr_nr; i++) {
> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_from + i, 0, 0);
> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_to + i, 0, 0);
> +		if (lbr_stack.lbr_info)
> +			add_atomic_switch_msr(vmx, lbr_stack.lbr_info + i, 0,
> +					      0);
> +	}
> +}
> +
>  #define VMX_XSS_EXIT_BITMAP 0
>  /*
>   * Sets up the vmcs for emulated real mode.
> @@ -5508,6 +5530,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>
>  	add_atomic_switch_msr(vmx, MSR_IA32_DEBUGCTLMSR, 0, 0);
>
> +	if (enable_lbrv)
> +		auto_switch_lbr_msrs(vmx);
> +
>  	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
>  		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
>
> @@ -6721,6 +6746,28 @@ void vmx_enable_tdp(void)
>  	kvm_enable_tdp();
>  }
>
> +static void vmx_passthrough_lbr_msrs(void)
> +{
> +	int i;
> +	struct perf_lbr_stack lbr_stack;
> +
> +	if (perf_get_lbr_stack(&lbr_stack) < 0) {
> +		enable_lbrv = false;
> +		return;
> +	}
> +
> +	vmx_disable_intercept_for_msr(MSR_LBR_SELECT, false);
> +	vmx_disable_intercept_for_msr(lbr_stack.lbr_tos, false);
> +
> +	for (i = 0; i < lbr_stack.lbr_nr; i++) {
> +		vmx_disable_intercept_for_msr(lbr_stack.lbr_from + i, false);
> +		vmx_disable_intercept_for_msr(lbr_stack.lbr_to + i, false);
> +		if (lbr_stack.lbr_info)
> +			vmx_disable_intercept_for_msr(lbr_stack.lbr_info + i,
> +						      false);
> +	}
> +}
> +
>  static __init int hardware_setup(void)
>  {
>  	int r = -ENOMEM, i, msr;
> @@ -6822,6 +6869,9 @@ static __init int hardware_setup(void)
>  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
>  	vmx_disable_intercept_for_msr(MSR_IA32_DEBUGCTLMSR, false);
>
> +	if (enable_lbrv)
> +		vmx_passthrough_lbr_msrs();
> +
>  	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
>  			vmx_msr_bitmap_legacy, PAGE_SIZE);
>  	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
>
Paolo Bonzini
2017-Sep-25 11:54 UTC
[PATCH v1 1/4] KVM/vmx: re-write the msr auto switch feature
On 25/09/2017 06:44, Wei Wang wrote:
> +static void update_msr_autoload_count_max(void)
> +{
> +	u64 vmx_msr;
> +	int n;
> +
> +	/*
> +	 * According to the Intel SDM, if Bits 27:25 of MSR_IA32_VMX_MISC is
> +	 * n, then (n + 1) * 512 is the recommended max number of MSRs to be
> +	 * included in the VMExit and VMEntry MSR auto switch list.
> +	 */
> +	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
> +	n = ((vmx_msr & 0xe000000) >> 25) + 1;
> +	msr_autoload_count_max = n * KVM_VMX_DEFAULT_MSR_AUTO_LOAD_COUNT;
> +}
> +

Any reasons to do this if it's unlikely that we'll ever update more than
512 MSRs?

Paolo
Paolo Bonzini
2017-Sep-25 11:57 UTC
[PATCH v1 2/4] KVM/vmx: auto switch MSR_IA32_DEBUGCTLMSR
On 25/09/2017 06:44, Wei Wang wrote:
> Passthrough the MSR_IA32_DEBUGCTLMSR to the guest, and take advantage of
> the hardware VT-x feature to auto switch the msr upon VMExit and VMEntry.

I think most bits in the MSR should not be passed through (for example
FREEZE_WHILE_SMM_EN, FREEZE_LBRS_ON_PMI etc.).  Using auto-switch of
course is fine instead.

Paolo
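One possible way to act on this (a rough sketch only, not code from the
posted series; the helper name and the exact whitelist are assumptions):
keep IA32_DEBUGCTL intercepted and let the guest set only a small set of
bits such as LBR and BTF, rejecting host-only bits like the freeze
controls, while the auto-switch list still saves/restores the value across
VMExit/VMEntry. The sketch assumes vmx.c context for vmcs_write64() and
GUEST_IA32_DEBUGCTL.

/* Hypothetical whitelist of IA32_DEBUGCTL bits a guest may set. */
#define GUEST_DEBUGCTL_ALLOWED	(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)

/*
 * Sketch of a DEBUGCTL write handler: refuse bits outside the whitelist
 * (e.g. FREEZE_LBRS_ON_PMI, FREEZE_WHILE_SMM_EN) instead of passing the
 * whole MSR through to the guest.
 */
static int handle_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 data)
{
	if (data & ~GUEST_DEBUGCTL_ALLOWED)
		return 1;	/* caller would inject #GP into the guest */

	vmcs_write64(GUEST_IA32_DEBUGCTL, data);
	return 0;
}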
> +static void auto_switch_lbr_msrs(struct vcpu_vmx *vmx)
> +{
> +	int i;
> +	struct perf_lbr_stack lbr_stack;
> +
> +	perf_get_lbr_stack(&lbr_stack);
> +
> +	add_atomic_switch_msr(vmx, MSR_LBR_SELECT, 0, 0);
> +	add_atomic_switch_msr(vmx, lbr_stack.lbr_tos, 0, 0);
> +
> +	for (i = 0; i < lbr_stack.lbr_nr; i++) {
> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_from + i, 0, 0);
> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_to + i, 0, 0);
> +		if (lbr_stack.lbr_info)
> +			add_atomic_switch_msr(vmx, lbr_stack.lbr_info + i, 0,
> +					      0);
> +	}

That will be really expensive and add a lot of overhead to every
entry/exit.

perf can already context switch the LBRs on task context switch. With
that you can just switch LBR_SELECT, which is *much* cheaper because
there are far fewer context switches than exits/entries.

It implies that when KVM is running it needs to prevent perf from
enabling LBRs in the context of KVM, but that should be straightforward.

-Andi
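To make the comparison concrete: with a 32-entry LBR stack plus LBR_INFO
(e.g. Skylake), the posted approach auto-switches roughly 32 * 3 + 2 = 98
MSRs on every VMExit/VMEntry. A rough sketch of the direction described
above, reusing add_atomic_switch_msr() from patch 1/4 (the function below
is illustrative only and leaves out the part that keeps perf from using
the LBRs in KVM's own host context):

/*
 * Sketch: let perf save/restore the LBR stack on task context switch and
 * only auto-switch MSR_LBR_SELECT across VMExit/VMEntry, instead of adding
 * the whole from/to/info stack to the autoload lists.
 */
static void auto_switch_lbr_select_only(struct vcpu_vmx *vmx)
{
	add_atomic_switch_msr(vmx, MSR_LBR_SELECT, 0, 0);
}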
On Mon, Sep 25, 2017 at 12:44:52PM +0800, Wei Wang wrote:
> This patch series enables the Last Branch Recording feature for the
> guest. Instead of trapping each LBR stack MSR access, the MSRs are
> passed through to the guest. Those MSRs are switched (i.e. saved and
> loaded) on VMExit and VMEntry.
>
> Test:
> Try "perf record -b ./test_program" on the guest.

I don't see where you expose the PERF capabilities MSR?

That's normally needed for LBR too to report the version number.

-Andi
On 09/25/2017 10:59 PM, Andi Kleen wrote:
> On Mon, Sep 25, 2017 at 12:44:52PM +0800, Wei Wang wrote:
>> This patch series enables the Last Branch Recording feature for the
>> guest. Instead of trapping each LBR stack MSR access, the MSRs are
>> passed through to the guest. Those MSRs are switched (i.e. saved and
>> loaded) on VMExit and VMEntry.
>>
>> Test:
>> Try "perf record -b ./test_program" on the guest.
> I don't see where you expose the PERF capabilities MSR?
>
> That's normally needed for LBR too to report the version number.
>

It was missed, thanks for pointing it out. I also found that KVM/QEMU
doesn't expose CPUID.PDCM, so I will add that too.

Since for now we are enabling LBR, I plan to expose only "PERF_CAP & 0x3f"
to the guest, which reports the LBR format only.

On the other hand, it seems that the (guest) kernel driver also works
without the above being supported. Should we change it to report an error
and stop using the PMU features when the check of the above two fails (at
intel_pmu_init())?

Best,
Wei
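For reference, a minimal sketch of what "expose only PERF_CAP & 0x3f"
could look like on the KVM side. The function name and the way the value
would be surfaced to the guest are assumptions, not code from this thread;
bits 5:0 of IA32_PERF_CAPABILITIES encode the LBR record format, and the
guest also needs CPUID.1:ECX.PDCM set before its perf driver will read the
MSR at all.

/* Bits 5:0 of IA32_PERF_CAPABILITIES: LBR record format. */
#define PERF_CAP_LBR_FMT_MASK	0x3fULL

/*
 * Hypothetical helper: build the PERF_CAPABILITIES value reported to the
 * guest, passing through only the host's LBR format field.
 */
static u64 vmx_guest_perf_capabilities(void)
{
	u64 host_cap;

	rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_cap);
	return host_cap & PERF_CAP_LBR_FMT_MASK;
}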