This patchset includes several VMware guest improvements: Alexey Makhalov (3): x86/vmware: Use tsc_khz value for calibrate_cpu() x86/vmware: Add basic paravirt ops support x86/vmware: Add paravirt sched clock Documentation/kernel-parameters.txt | 4 +++ arch/x86/kernel/cpu/vmware.c | 51 +++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) -- 2.10.1
Alexey Makhalov
2016-Oct-26 05:20 UTC
[PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
After aa297292d708, there are separate native calibrations for cpu_khz and tsc_khz. The code sets x86_platform.calibrate_cpu to native_calibrate_cpu() which looks in cpuid leaf 0x16 or msrs for the cpu frequency. Since we keep the tsc_khz constant (even after vmotion), the cpu_khz and tsc_khz may start diverging. tsc_init() now does cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; We want the cpu_khz and tsc_khz to be in sync even if they diverge by less than 10%. This patch resolves this issue by setting x86_platform.calibrate_cpu to vmware_get_tsc_khz(). Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Acked-by: Alok N Kataria <akataria at vmware.com> --- arch/x86/kernel/cpu/vmware.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 4e34da4b..480790f 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -83,6 +83,7 @@ static void __init vmware_platform_setup(void) vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ -- 2.10.1
Alexey Makhalov
2016-Oct-26 05:26 UTC
[PATCH 2/3] x86/vmware: Add basic paravirt ops support
Add basic paravirt support: 1. set pv_info.name to "VMware" to have proper boot log message Booting paravirtualized kernel on VMware instead of "... on bare hardware" 2. set pv_cpu_ops.io_delay() to empty function - paravirt_nop() to avoid vm-exits on IO delays. Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Acked-by: Alok N Kataria <akataria at vmware.com> --- arch/x86/kernel/cpu/vmware.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 480790f..e3fb320 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -61,6 +61,16 @@ static unsigned long vmware_get_tsc_khz(void) return vmware_tsc_khz; } +#ifdef CONFIG_PARAVIRT +static void __init vmware_paravirt_ops_setup(void) +{ + pv_info.name = "VMware"; + pv_cpu_ops.io_delay = paravirt_nop; +} +#else +#define vmware_paravirt_ops_setup() do {} while (0) +#endif + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -94,6 +104,8 @@ static void __init vmware_platform_setup(void) } else { pr_warn("Failed to get TSC freq from the hypervisor\n"); } + + vmware_paravirt_ops_setup(); } /* -- 2.10.1
Set pv_time_ops.sched_clock to vmware_sched_clock(). It is simplified version of native_sched_clock() without ring buffer of mult/shift/offset triplets and preempt toggling. Since VMware hypervisor provides constant tsc we can use constant mult/shift/offset triplet calculated at boot time. no-vmw-sched-clock kernel parameter is added to switch back to the native_sched_clock() implementation. Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Acked-by: Alok N Kataria <akataria at vmware.com> --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/kernel/cpu/vmware.c | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 37babf9..b3b2ec0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2754,6 +2754,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page fault handling. + no-vmw-sched-clock + [X86,PV_OPS] Disable paravirtualized VMware scheduler + clock and use the default one. + no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting. 
steal time is computed, but won't influence scheduler behaviour diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index e3fb320..6ef22c1 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,10 +24,12 @@ #include <linux/dmi.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/clocksource.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> #include <asm/apic.h> +#include <asm/timer.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -62,10 +64,46 @@ static unsigned long vmware_get_tsc_khz(void) } #ifdef CONFIG_PARAVIRT +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; + +static int vmw_sched_clock __initdata = 1; +static __init int setup_vmw_sched_clock(char *s) +{ + vmw_sched_clock = 0; + return 0; +} +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); + +static unsigned long long vmware_sched_clock(void) +{ + unsigned long long ns; + + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + ns -= vmware_cyc2ns.cyc2ns_offset; + return ns; +} + static void __init vmware_paravirt_ops_setup(void) { pv_info.name = "VMware"; pv_cpu_ops.io_delay = paravirt_nop; + + if (vmware_tsc_khz && vmw_sched_clock) { + unsigned long long tsc_now = rdtsc(); + + clocks_calc_mult_shift(&vmware_cyc2ns.cyc2ns_mul, + &vmware_cyc2ns.cyc2ns_shift, + vmware_tsc_khz, + NSEC_PER_MSEC, 0); + vmware_cyc2ns.cyc2ns_offset = + mul_u64_u32_shr(tsc_now, vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + + pv_time_ops.sched_clock = vmware_sched_clock; + pr_info("vmware: using sched offset of %llu ns\n", + vmware_cyc2ns.cyc2ns_offset); + } } #else #define vmware_paravirt_ops_setup() do {} while (0) -- 2.10.1
On Tue, 25 Oct 2016, Alexey Makhalov wrote: > no-vmw-sched-clock kernel parameter is added to switch back to the > native_sched_clock() implementation. You are not switching back. The parameter is used to disable the paravirt sched clock. > #ifdef CONFIG_PARAVIRT > +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; > + > +static int vmw_sched_clock __initdata = 1; > +static __init int setup_vmw_sched_clock(char *s) Please stop glueing a variable to a function w/o a new line between them. It's just stopping the reading flow. > +{ > + vmw_sched_clock = 0; > + return 0; > +} > +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); > + > +static unsigned long long vmware_sched_clock(void) > +{ > + unsigned long long ns; > + > + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, > + vmware_cyc2ns.cyc2ns_shift); > + ns -= vmware_cyc2ns.cyc2ns_offset; > + return ns; > +} > + > static void __init vmware_paravirt_ops_setup(void) > { > pv_info.name = "VMware"; > pv_cpu_ops.io_delay = paravirt_nop; > + > + if (vmware_tsc_khz && vmw_sched_clock) { > + unsigned long long tsc_now = rdtsc(); > + > + clocks_calc_mult_shift(&vmware_cyc2ns.cyc2ns_mul, > + &vmware_cyc2ns.cyc2ns_shift, > + vmware_tsc_khz, > + NSEC_PER_MSEC, 0); > + vmware_cyc2ns.cyc2ns_offset = > + mul_u64_u32_shr(tsc_now, vmware_cyc2ns.cyc2ns_mul, > + vmware_cyc2ns.cyc2ns_shift); > + > + pv_time_ops.sched_clock = vmware_sched_clock; > + pr_info("vmware: using sched offset of %llu ns\n", Please use pr_fmt instead of adding the prefix to every print. Thanks, tglx
I believe our trademark guidelines say we aren't supposed to use VMware as a noun to mean a product, only to mean the company. So we can say "running on VMware ESXi" or "running in a VMware virtual machine", but "running on VMware" is wrong. There is supposedly some good legal reason for this related to keeping our trademark. On Tue, 25 Oct 2016 22:26:00 -0700, Alexey Makhalov <amakhalov at vmware.com> wrote: > Add basic paravirt support: > 1. set pv_info.name to "VMware" to have proper boot log message > Booting paravirtualized kernel on VMware > instead of "... on bare hardware" > 2. set pv_cpu_ops.io_delay() to empty function - paravirt_nop() to > avoid vm-exits on IO delays. > > Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> > Acked-by: Alok N Kataria <akataria at vmware.com> > --- > arch/x86/kernel/cpu/vmware.c | 12 ++++++++++++ > 1 file changed, 12 insertions(+) > > diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c > index 480790f..e3fb320 100644 > --- a/arch/x86/kernel/cpu/vmware.c > +++ b/arch/x86/kernel/cpu/vmware.c > @@ -61,6 +61,16 @@ static unsigned long vmware_get_tsc_khz(void) > return vmware_tsc_khz; > } > > +#ifdef CONFIG_PARAVIRT > +static void __init vmware_paravirt_ops_setup(void) > +{ > + pv_info.name = "VMware"; > + pv_cpu_ops.io_delay = paravirt_nop; > +} > +#else > +#define vmware_paravirt_ops_setup() do {} while (0) > +#endif > + > static void __init vmware_platform_setup(void) > { > uint32_t eax, ebx, ecx, edx; > @@ -94,6 +104,8 @@ static void __init vmware_platform_setup(void) > } else { > pr_warn("Failed to get TSC freq from the hypervisor\n"); > } > + > + vmware_paravirt_ops_setup(); > } > > /* -- Tim Mann | work: mann at vmware.com home: tim at tim-mann.org VMware Sr. Staff Engineer | http://www.vmware.com http://tim-mann.org
Reasonably Related Threads
- [PATCH 0/3] x86/vmware guest improvements
- [RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
- [RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
- [PATCH v3 0/3] x86/vmware guest improvements
- [PATCH v3 0/3] x86/vmware guest improvements