Alexey Makhalov
2016-Oct-27  19:44 UTC
[RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
After aa297292d708, there are separate native calibrations for cpu_khz and tsc_khz. The code sets x86_platform.calibrate_cpu to native_calibrate_cpu() which looks in cpuid leaf 0x16 or msrs for the cpu frequency. Since we keep the tsc_khz constant (even after vmotion), the cpu_khz and tsc_khz may start diverging. tsc_init() now does cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; We want the cpu_khz and tsc_khz to be sync even if they diverge less then 10%. This patch resolves this issue by setting x86_platform.calibrate_cpu to vmware_get_tsc_khz(). Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Acked-by: Alok N Kataria <akataria at vmware.com> --- arch/x86/kernel/cpu/vmware.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 4e34da4b..480790f 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -83,6 +83,7 @@ static void __init vmware_platform_setup(void) vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ -- 2.10.1
Alexey Makhalov
2016-Oct-27  19:44 UTC
[RESEND PATCH 2/3] x86/vmware: Add basic paravirt ops support
Add basic paravirt support:
 1. set pv_info.name to "VMware hypervisor" to have proper boot log
message
	Booting paravirtualized kernel on VMware hypervisor
    instead of "... on bare hardware"
 2. set pv_cpu_ops.io_delay() to empty function - paravirt_nop() to
    avoid vm-exits on IO delays.
Signed-off-by: Alexey Makhalov <amakhalov at vmware.com>
Acked-by: Alok N Kataria <akataria at vmware.com>
---
 arch/x86/kernel/cpu/vmware.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 480790f..098a524 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -61,6 +61,16 @@ static unsigned long vmware_get_tsc_khz(void)
 	return vmware_tsc_khz;
 }
 
+#ifdef CONFIG_PARAVIRT
+static void __init vmware_paravirt_ops_setup(void)
+{
+	pv_info.name = "VMware hypervisor";
+	pv_cpu_ops.io_delay = paravirt_nop;
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
 static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
@@ -94,6 +104,8 @@ static void __init vmware_platform_setup(void)
 	} else {
 		pr_warn("Failed to get TSC freq from the hypervisor\n");
 	}
+
+	vmware_paravirt_ops_setup();
 }
 
 /*
-- 
2.10.1
Alexey Makhalov
2016-Oct-27  19:44 UTC
[RESEND PATCH 3/3] x86/vmware: Add paravirt sched clock
Set pv_time_ops.sched_clock to vmware_sched_clock(). It is simplified
version of native_sched_clock() without ring buffer of mult/shift/offset
triplets and preempt toggling.
Since VMware hypervisor provides constant tsc we can use constant
mult/shift/offset triplet calculated at boot time.
no-vmw-sched-clock kernel parameter is added to disable the paravirt
sched clock.
Signed-off-by: Alexey Makhalov <amakhalov at vmware.com>
Acked-by: Alok N Kataria <akataria at vmware.com>
---
 Documentation/kernel-parameters.txt |  4 ++++
 arch/x86/kernel/cpu/vmware.c        | 41 +++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)
diff --git a/Documentation/kernel-parameters.txt
b/Documentation/kernel-parameters.txt
index 37babf9..b3b2ec0 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2754,6 +2754,10 @@ bytes respectively. Such letter suffixes can also be
entirely omitted.
 	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
 			fault handling.
 
+	no-vmw-sched-clock
+			[X86,PV_OPS] Disable paravirtualized VMware scheduler
+			clock and use the default one.
+
 	no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
 			steal time is computed, but won't influence scheduler
 			behaviour
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 098a524..9b29511 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,10 +24,15 @@
 #include <linux/dmi.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/clocksource.h>
 #include <asm/div64.h>
 #include <asm/x86_init.h>
 #include <asm/hypervisor.h>
 #include <asm/apic.h>
+#include <asm/timer.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt)	"vmware: " fmt
 
 #define CPUID_VMWARE_INFO_LEAF	0x40000000
 #define VMWARE_HYPERVISOR_MAGIC	0x564D5868
@@ -62,10 +67,46 @@ static unsigned long vmware_get_tsc_khz(void)
 }
 
 #ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static int vmw_sched_clock __initdata = 1;
+
+static __init int setup_vmw_sched_clock(char *s)
+{
+	vmw_sched_clock = 0;
+	return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+static unsigned long long vmware_sched_clock(void)
+{
+	unsigned long long ns;
+
+	ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+			     vmware_cyc2ns.cyc2ns_shift);
+	ns -= vmware_cyc2ns.cyc2ns_offset;
+	return ns;
+}
+
 static void __init vmware_paravirt_ops_setup(void)
 {
 	pv_info.name = "VMware hypervisor";
 	pv_cpu_ops.io_delay = paravirt_nop;
+
+	if (vmware_tsc_khz && vmw_sched_clock) {
+		unsigned long long tsc_now = rdtsc();
+
+		clocks_calc_mult_shift(&vmware_cyc2ns.cyc2ns_mul,
+				       &vmware_cyc2ns.cyc2ns_shift,
+				       vmware_tsc_khz,
+				       NSEC_PER_MSEC, 0);
+		vmware_cyc2ns.cyc2ns_offset +			mul_u64_u32_shr(tsc_now,
vmware_cyc2ns.cyc2ns_mul,
+					vmware_cyc2ns.cyc2ns_shift);
+
+		pv_time_ops.sched_clock = vmware_sched_clock;
+		pr_info("using sched offset of %llu ns\n",
+			vmware_cyc2ns.cyc2ns_offset);
+	}
 }
 #else
 #define vmware_paravirt_ops_setup() do {} while (0)
-- 
2.10.1
Thomas Gleixner
2016-Oct-27  21:44 UTC
[RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
On Thu, 27 Oct 2016, Alexey Makhalov wrote:> [RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()Please don't do that. RESEND is a keyword, when the same patch (series) is sent again without any modification vs. the first patch (series). A possible reason to do so is when a patch (series) fell through the cracks and the author wants to bring it to attention again for the next merge window. If you send an updated patch (series) then please add Vn after PATCH, where n is the version number of the patch (series). While at it, please add a cover letter [PATCH Vn 0/n] the next time you send a patch series. git send-email supports that.> After aa297292d708, there are separate native calibrations for cpu_khz andWhat is aa297292d708? Please make that: commit aa297292d708 ("x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID") .....> tsc_khz. The code sets x86_platform.calibrate_cpu to native_calibrate_cpu() > which looks in cpuid leaf 0x16 or msrs for the cpu frequency.Which code? And what has the leaf and the msrs to do with your patch?> Since we keep the tsc_khz constant (even after vmotion), the cpu_khz and > tsc_khz may start diverging.Now you talk about vmware related stuff, right?> tsc_init() now does > > cpu_khz = x86_platform.calibrate_cpu(); > tsc_khz = x86_platform.calibrate_tsc(); > if (tsc_khz == 0) > tsc_khz = cpu_khz; > else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) > cpu_khz = tsc_khz;And here you copy from the referenced commit. How is that helpful?> > We want the cpu_khz and tsc_khz to be sync even if they diverge less then > 10%.We? We as kernel developers, users, guest running in vmware ????> This patch resolves this issue by setting x86_platform.calibrate_cpu to"This patch" is just bad. We already know that this is a patch, otherwise you wouldn't have sent it as a patch. "this issue" - which issue? Your explanation above does not tell anything what the issue is.> vmware_get_tsc_khz().Here is an example of a proper changelog for this (except for my potential misinterpretation of the crypto message above): Commit aa297292d708 ("x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID") seperated the calibration mechanisms for cpu_khz and tsc_khz. In a vmware guest this change can lead to divergence between the tsc and the cpu frequency, because <Insert reason> .This causes <INSERT observed malfunction>. Due to the internal design of the vmware hypervisor <Replace that by real reason> it's required to keep tsc and cpu frequency in sync. Solve this by overriding the x86 platform cpu calibration callback with the vmware specific tsc calibration function. So this describes very clearly: 1) The change which causes the problem 2) The problem itself and the resulting malfunction 3) What needs to be achieved to solve the problem 4) A short explanation what the solution is Documentation/SubmittingPatches has further information about proper change logs.> @@ -83,6 +83,7 @@ static void __init vmware_platform_setup(void) > > vmware_tsc_khz = tsc_khz; > x86_platform.calibrate_tsc = vmware_get_tsc_khz;A comment explaining why you set the cpu calibration to vmware_get_tsc_khz might be helpful for casual readers and yourself when you look at that code 5 month from now.> + x86_platform.calibrate_cpu = vmware_get_tsc_khz;Thanks, tglx
Thomas Gleixner
2016-Oct-27  22:10 UTC
[RESEND PATCH 3/3] x86/vmware: Add paravirt sched clock
On Thu, 27 Oct 2016, Alexey Makhalov wrote:> Set pv_time_ops.sched_clock to vmware_sched_clock().Please do not describe WHAT the patch does, describe why. Describe the problem you are solving. I can see from the patch> + pv_time_ops.sched_clock = vmware_sched_clock;that you set pv_time_ops.sched_clock to vmware_sched_clock().> It is simplified > version of native_sched_clock() without ring buffer of mult/shift/offset > triplets and preempt toggling.-ENOPARSE> Since VMware hypervisor provides constant tsc we can use constant > mult/shift/offset triplet calculated at boot time.So now you start to explain something which is understandable> no-vmw-sched-clock kernel parameter is added to disable the paravirt > sched clock.I give you another example: The default sched_clock() implementation is native_sched_clock(). It contains code to handle non constant frequency TSCs, which creates overhead for systems with constant frequency TSCs. The vmware hypervisor guarantees a constant frequency TSC, so native_sched_clock() is not required and slower than a dedicated function which operates with one time calculated conversion factors. Calculate the conversion factors at boot time from the tsc frequency and install an optimized sched_clock() function via paravirt ops. The paravirtualized clock can be disabled on the kernel command line with the new 'no-vmw-sched-clock' option. Can you see the difference and can you spot the structure similar to the example I gave you before?> +static unsigned long long vmware_sched_clock(void) > +{ > + unsigned long long ns; > + > + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, > + vmware_cyc2ns.cyc2ns_shift); > + ns -= vmware_cyc2ns.cyc2ns_offset; > + return ns; > +} > + > static void __init vmware_paravirt_ops_setup(void) > { > pv_info.name = "VMware hypervisor"; > pv_cpu_ops.io_delay = paravirt_nop; > + > + if (vmware_tsc_khz && vmw_sched_clock) { > + unsigned long long tsc_now = rdtsc(); > + > + clocks_calc_mult_shift(&vmware_cyc2ns.cyc2ns_mul, > + &vmware_cyc2ns.cyc2ns_shift, > + vmware_tsc_khz, > + NSEC_PER_MSEC, 0); > + vmware_cyc2ns.cyc2ns_offset > + mul_u64_u32_shr(tsc_now, vmware_cyc2ns.cyc2ns_mul, > + vmware_cyc2ns.cyc2ns_shift); > + > + pv_time_ops.sched_clock = vmware_sched_clock; > + pr_info("using sched offset of %llu ns\n", > + vmware_cyc2ns.cyc2ns_offset);If you either do: if (!vmware_tsc_khz || !vmw_sched_clock) return; or if (vmware_tsc_khz && vmw_sched_clock) setup_sched_clock(); and split out the code into a seperate function then you spare one indentation level and some of these hard to read line breaks. Hint: static void setup_sched_clock(void) { struct cyc2ns_data *d = &vmware_cyc2ns; clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc32ns_shift, vmware_tsc_khz, NSEC_PER_MSEC, 0); reduces the lenght of the arguments significantly and makes this stuff sane to read. Thanks, tglx
Thanks Thomas for the valuable comments. Changelog for the updated patchset: v1->v2 - Update pvinfo.name. v2->v3 - Address comments from Thomas G, * Created separate function: vmware_sched_clock_setup() (patch 3/3) * Updated commit descriptions for 1/3 and 3/3 Alexey Makhalov (3): x86/vmware: Use tsc_khz value for calibrate_cpu() x86/vmware: Add basic paravirt ops support x86/vmware: Add paravirt sched clock Documentation/kernel-parameters.txt | 4 +++ arch/x86/kernel/cpu/vmware.c | 55 +++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) -- 2.10.1
Apparently Analagous Threads
- [RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
- [RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
- [PATCH 0/3] x86/vmware guest improvements
- [PATCH 0/3] x86/vmware guest improvements
- [PATCH v3 0/3] x86/vmware guest improvements