This patchset includes several VMware guest improvements: Alexey Makhalov (3): x86/vmware: Use tsc_khz value for calibrate_cpu() x86/vmware: Add basic paravirt ops support x86/vmware: Add paravirt sched clock Documentation/kernel-parameters.txt | 4 +++ arch/x86/kernel/cpu/vmware.c | 51 +++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) -- 2.10.1
Alexey Makhalov
2016-Oct-26  05:20 UTC
[PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
After aa297292d708, there are separate native calibrations for cpu_khz and tsc_khz. The code sets x86_platform.calibrate_cpu to native_calibrate_cpu() which looks in cpuid leaf 0x16 or msrs for the cpu frequency. Since we keep the tsc_khz constant (even after vmotion), the cpu_khz and tsc_khz may start diverging. tsc_init() now does cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; We want the cpu_khz and tsc_khz to be sync even if they diverge less then 10%. This patch resolves this issue by setting x86_platform.calibrate_cpu to vmware_get_tsc_khz(). Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Acked-by: Alok N Kataria <akataria at vmware.com> --- arch/x86/kernel/cpu/vmware.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 4e34da4b..480790f 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -83,6 +83,7 @@ static void __init vmware_platform_setup(void) vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ -- 2.10.1
Alexey Makhalov
2016-Oct-26  05:26 UTC
[PATCH 2/3] x86/vmware: Add basic paravirt ops support
Add basic paravirt support:
 1. set pv_info.name to "VMware" to have proper boot log message
	Booting paravirtualized kernel on VMware
    instead of "... on bare hardware"
 2. set pv_cpu_ops.io_delay() to empty function - paravirt_nop() to
    avoid vm-exits on IO delays.
Signed-off-by: Alexey Makhalov <amakhalov at vmware.com>
Acked-by: Alok N Kataria <akataria at vmware.com>
---
 arch/x86/kernel/cpu/vmware.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 480790f..e3fb320 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -61,6 +61,16 @@ static unsigned long vmware_get_tsc_khz(void)
 	return vmware_tsc_khz;
 }
 
+#ifdef CONFIG_PARAVIRT
+static void __init vmware_paravirt_ops_setup(void)
+{
+	pv_info.name = "VMware";
+	pv_cpu_ops.io_delay = paravirt_nop;
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
 static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
@@ -94,6 +104,8 @@ static void __init vmware_platform_setup(void)
 	} else {
 		pr_warn("Failed to get TSC freq from the hypervisor\n");
 	}
+
+	vmware_paravirt_ops_setup();
 }
 
 /*
-- 
2.10.1
Set pv_time_ops.sched_clock to vmware_sched_clock(). It is simplified
version of native_sched_clock() without ring buffer of mult/shift/offset
triplets and preempt toggling.
Since VMware hypervisor provides constant tsc we can use constant
mult/shift/offset triplet calculated at boot time.
no-vmw-sched-clock kernel parameter is added to switch back to the
native_sched_clock() implementation.
Signed-off-by: Alexey Makhalov <amakhalov at vmware.com>
Acked-by: Alok N Kataria <akataria at vmware.com>
---
 Documentation/kernel-parameters.txt |  4 ++++
 arch/x86/kernel/cpu/vmware.c        | 38 +++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)
diff --git a/Documentation/kernel-parameters.txt
b/Documentation/kernel-parameters.txt
index 37babf9..b3b2ec0 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2754,6 +2754,10 @@ bytes respectively. Such letter suffixes can also be
entirely omitted.
 	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
 			fault handling.
 
+	no-vmw-sched-clock
+			[X86,PV_OPS] Disable paravirtualized VMware scheduler
+			clock and use the default one.
+
 	no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
 			steal time is computed, but won't influence scheduler
 			behaviour
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index e3fb320..6ef22c1 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,10 +24,12 @@
 #include <linux/dmi.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/clocksource.h>
 #include <asm/div64.h>
 #include <asm/x86_init.h>
 #include <asm/hypervisor.h>
 #include <asm/apic.h>
+#include <asm/timer.h>
 
 #define CPUID_VMWARE_INFO_LEAF	0x40000000
 #define VMWARE_HYPERVISOR_MAGIC	0x564D5868
@@ -62,10 +64,46 @@ static unsigned long vmware_get_tsc_khz(void)
 }
 
 #ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+
+static int vmw_sched_clock __initdata = 1;
+static __init int setup_vmw_sched_clock(char *s)
+{
+	vmw_sched_clock = 0;
+	return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+static unsigned long long vmware_sched_clock(void)
+{
+	unsigned long long ns;
+
+	ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+			     vmware_cyc2ns.cyc2ns_shift);
+	ns -= vmware_cyc2ns.cyc2ns_offset;
+	return ns;
+}
+
 static void __init vmware_paravirt_ops_setup(void)
 {
 	pv_info.name = "VMware";
 	pv_cpu_ops.io_delay = paravirt_nop;
+
+	if (vmware_tsc_khz && vmw_sched_clock) {
+		unsigned long long tsc_now = rdtsc();
+
+		clocks_calc_mult_shift(&vmware_cyc2ns.cyc2ns_mul,
+				       &vmware_cyc2ns.cyc2ns_shift,
+				       vmware_tsc_khz,
+				       NSEC_PER_MSEC, 0);
+		vmware_cyc2ns.cyc2ns_offset +			mul_u64_u32_shr(tsc_now,
vmware_cyc2ns.cyc2ns_mul,
+					vmware_cyc2ns.cyc2ns_shift);
+
+		pv_time_ops.sched_clock = vmware_sched_clock;
+		pr_info("vmware: using sched offset of %llu ns\n",
+			vmware_cyc2ns.cyc2ns_offset);
+	}
 }
 #else
 #define vmware_paravirt_ops_setup() do {} while (0)
-- 
2.10.1
On Tue, 25 Oct 2016, Alexey Makhalov wrote:> no-vmw-sched-clock kernel parameter is added to switch back to the > native_sched_clock() implementation.You are not switching back. The parameter is used to disable the paravirt sched clock.> #ifdef CONFIG_PARAVIRT > +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; > + > +static int vmw_sched_clock __initdata = 1; > +static __init int setup_vmw_sched_clock(char *s)Please stop glueing a variable to a function w/o a new line between them. It's just stopping the reading flow.> +{ > + vmw_sched_clock = 0; > + return 0; > +} > +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); > + > +static unsigned long long vmware_sched_clock(void) > +{ > + unsigned long long ns; > + > + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, > + vmware_cyc2ns.cyc2ns_shift); > + ns -= vmware_cyc2ns.cyc2ns_offset; > + return ns; > +} > + > static void __init vmware_paravirt_ops_setup(void) > { > pv_info.name = "VMware"; > pv_cpu_ops.io_delay = paravirt_nop; > + > + if (vmware_tsc_khz && vmw_sched_clock) { > + unsigned long long tsc_now = rdtsc(); > + > + clocks_calc_mult_shift(&vmware_cyc2ns.cyc2ns_mul, > + &vmware_cyc2ns.cyc2ns_shift, > + vmware_tsc_khz, > + NSEC_PER_MSEC, 0); > + vmware_cyc2ns.cyc2ns_offset > + mul_u64_u32_shr(tsc_now, vmware_cyc2ns.cyc2ns_mul, > + vmware_cyc2ns.cyc2ns_shift); > + > + pv_time_ops.sched_clock = vmware_sched_clock; > + pr_info("vmware: using sched offset of %llu ns\n",Please use pr_fmt instead of adding the prefix to every print. Thanks, tglx
I believe our trademark guidelines say we aren't supposed to use VMware as a noun to mean a product, only to mean the company. So we can say "running on VMware ESXi" or "running in a VMware virtual machine", but "running on VMware" is wrong. There is supposedly some good legal reason for this related to keeping our trademark. On Tue, 25 Oct 2016 22:26:00 -0700, Alexey Makhalov <amakhalov at vmware.com> wrote:> Add basic paravirt support: > 1. set pv_info.name to "VMware" to have proper boot log message > Booting paravirtualized kernel on VMware > instead of "... on bare hardware" > 2. set pv_cpu_ops.io_delay() to empty function - paravirt_nop() to > avoid vm-exits on IO delays. > > Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> > Acked-by: Alok N Kataria <akataria at vmware.com> > --- > arch/x86/kernel/cpu/vmware.c | 12 ++++++++++++ > 1 file changed, 12 insertions(+) > > diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c > index 480790f..e3fb320 100644 > --- a/arch/x86/kernel/cpu/vmware.c > +++ b/arch/x86/kernel/cpu/vmware.c > @@ -61,6 +61,16 @@ static unsigned long vmware_get_tsc_khz(void) > return vmware_tsc_khz; > } > > +#ifdef CONFIG_PARAVIRT > +static void __init vmware_paravirt_ops_setup(void) > +{ > + pv_info.name = "VMware"; > + pv_cpu_ops.io_delay = paravirt_nop; > +} > +#else > +#define vmware_paravirt_ops_setup() do {} while (0) > +#endif > + > static void __init vmware_platform_setup(void) > { > uint32_t eax, ebx, ecx, edx; > @@ -94,6 +104,8 @@ static void __init vmware_platform_setup(void) > } else { > pr_warn("Failed to get TSC freq from the hypervisor\n"); > } > + > + vmware_paravirt_ops_setup(); > } > > /*-- Tim Mann | work: mann at vmware.com home: tim at tim-mann.org VMware Sr. Staff Engineer | http://www.vmware.com http://tim-mann.org
Possibly Parallel Threads
- [PATCH 2/3] x86/vmware: Add basic paravirt ops support
- [PATCH 2/3] x86/vmware: Add basic paravirt ops support
- [PATCH v3 2/3] x86/vmware: Add basic paravirt ops support
- [PATCH 3/3] x86/vmware: Add paravirt sched clock
- [RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()