Alexey Makhalov
2020-Feb-12 20:03 UTC
[PATCH 0/5] x86/vmware: Steal time accounting support
Hello, This patchset introduces steal time accounting support for the VMware guest. The idea and implementation of guest steal time support are similar to KVM's and are based on a steal clock. The steal clock is a per-CPU structure in memory shared between the hypervisor and the guest, initialized by each CPU through a hypercall. The steal clock is updated by the hypervisor and read by the guest. The patchset consists of 5 items: 1. x86/vmware: Make vmware_select_hypercall() __init Minor clean up. 2. x86/vmware: Remove vmware_sched_clock_setup() Preparation for the main patch. 3. x86/vmware: Steal time clock for VMware guest Core steal time support functionality. 4. x86/vmware: Enable steal time accounting Support for steal time accounting used by update_rq_clock(). 5. x86/vmware: Use bool type for vmw_sched_clock Minor clean up. Alexey Makhalov (5): x86/vmware: Make vmware_select_hypercall() __init x86/vmware: Remove vmware_sched_clock_setup() x86/vmware: Steal time clock for VMware guest x86/vmware: Enable steal time accounting x86/vmware: Use bool type for vmw_sched_clock Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/x86/kernel/cpu/vmware.c | 227 +++++++++++++++++++++++- 2 files changed, 220 insertions(+), 9 deletions(-) -- 2.14.2
Alexey Makhalov
2020-Feb-12 20:03 UTC
[PATCH 1/5] x86/vmware: Make vmware_select_hypercall() __init
vmware_select_hypercall() is used only by the __init functions, and should be annotated with __init as well. Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Reviewed-by: Thomas Hellstrom <thellstrom at vmware.com> --- arch/x86/kernel/cpu/vmware.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 46d732696c1c..d280560fd75e 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -213,7 +213,7 @@ static void __init vmware_platform_setup(void) vmware_set_capabilities(); } -static u8 vmware_select_hypercall(void) +static u8 __init vmware_select_hypercall(void) { int eax, ebx, ecx, edx; -- 2.14.2
Alexey Makhalov
2020-Feb-12 20:03 UTC
[PATCH 2/5] x86/vmware: Remove vmware_sched_clock_setup()
Move cyc2ns setup logic to a separate function. This separation allows the cyc2ns mult/shift pair to be used not only for the sched_clock but also for other clocks such as steal_clock. Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Reviewed-by: Thomas Hellstrom <thellstrom at vmware.com> --- arch/x86/kernel/cpu/vmware.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index d280560fd75e..efb22fa76ba4 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -122,7 +122,7 @@ static unsigned long long notrace vmware_sched_clock(void) return ns; } -static void __init vmware_sched_clock_setup(void) +static void __init vmware_cyc2ns_setup(void) { struct cyc2ns_data *d = &vmware_cyc2ns; unsigned long long tsc_now = rdtsc(); @@ -132,8 +132,7 @@ static void __init vmware_sched_clock_setup(void) d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, d->cyc2ns_shift); - pv_ops.time.sched_clock = vmware_sched_clock; - pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); + pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); } static void __init vmware_paravirt_ops_setup(void) @@ -141,8 +140,14 @@ static void __init vmware_paravirt_ops_setup(void) pv_info.name = "VMware hypervisor"; pv_ops.cpu.io_delay = paravirt_nop; - if (vmware_tsc_khz && vmw_sched_clock) - vmware_sched_clock_setup(); + if (vmware_tsc_khz == 0) + return; + + vmware_cyc2ns_setup(); + + if (vmw_sched_clock) + pv_ops.time.sched_clock = vmware_sched_clock; + } #else #define vmware_paravirt_ops_setup() do {} while (0) -- 2.14.2
Alexey Makhalov
2020-Feb-12 20:03 UTC
[PATCH 3/5] x86/vmware: Steal time clock for VMware guest
Steal time is the amount of CPU time needed by a guest virtual machine that is not provided by the host. Steal time occurs when the host allocates this CPU time elsewhere: for example, to another guest. Steal time can be enabled by adding VM configuration option stealclock.enable = "TRUE". It is supported by VMs that run hardware version 13 or newer. This change introduces the VMware steal time infrastructure. The high level code (such as enabling, disabling and hot-plug routines) was derived from KVM one. [Tomer: use READ_ONCE macros and 32bit guests support] Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Co-developed-by: Tomer Zeltzer <tomerr90 at gmail.com> Signed-off-by: Tomer Zeltzer <tomerr90 at gmail.com> Reviewed-by: Thomas Hellstrom <thellstrom at vmware.com> --- arch/x86/kernel/cpu/vmware.c | 197 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index efb22fa76ba4..59459992ad47 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -25,6 +25,8 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/clocksource.h> +#include <linux/cpu.h> +#include <linux/reboot.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> @@ -47,6 +49,11 @@ #define VMWARE_CMD_GETVCPU_INFO 68 #define VMWARE_CMD_LEGACY_X2APIC 3 #define VMWARE_CMD_VCPU_RESERVED 31 +#define VMWARE_CMD_STEALCLOCK 91 + +#define STEALCLOCK_NOT_AVAILABLE (-1) +#define STEALCLOCK_DISABLED 0 +#define STEALCLOCK_ENABLED 1 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ __asm__("inl (%%dx), %%eax" : \ @@ -86,6 +93,18 @@ } \ } while (0) +struct vmware_steal_time { + union { + uint64_t clock; /* stolen time counter in units of vtsc */ + struct { + /* only for little-endian */ + uint32_t clock_low; + uint32_t clock_high; + }; + }; + uint64_t reserved[7]; +}; + static unsigned long vmware_tsc_khz __ro_after_init; static u8 
vmware_hypercall_mode __ro_after_init; @@ -104,6 +123,8 @@ static unsigned long vmware_get_tsc_khz(void) #ifdef CONFIG_PARAVIRT static struct cyc2ns_data vmware_cyc2ns __ro_after_init; static int vmw_sched_clock __initdata = 1; +static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, steal_time) __aligned(64); +static bool has_steal_clock; static __init int setup_vmw_sched_clock(char *s) { @@ -135,6 +156,163 @@ static void __init vmware_cyc2ns_setup(void) pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); } +static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) +{ + uint32_t result, info; + + asm volatile (VMWARE_HYPERCALL : + "=a"(result), + "=c"(info) : + "a"(VMWARE_HYPERVISOR_MAGIC), + "b"(0), + "c"(VMWARE_CMD_STEALCLOCK), + "d"(0), + "S"(arg1), + "D"(arg2) : + "memory"); + return result; +} + +static bool stealclock_enable(phys_addr_t pa) +{ + return vmware_cmd_stealclock(upper_32_bits(pa), + lower_32_bits(pa)) == STEALCLOCK_ENABLED; +} + +static int __stealclock_disable(void) +{ + return vmware_cmd_stealclock(0, 1); +} + +static void stealclock_disable(void) +{ + __stealclock_disable(); +} + +static bool vmware_is_stealclock_available(void) +{ + return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE; +} + +/** + * vmware_steal_clock() - read the per-cpu steal clock + * @cpu: the cpu number whose steal clock we want to read + * + * The function reads the steal clock if we are on a 64-bit system, otherwise + * reads it in parts, checking that the high part didn't change in the + * meantime. + * + * Return: + * The steal clock reading in ns. 
+ */ +static uint64_t vmware_steal_clock(int cpu) +{ + struct vmware_steal_time *steal = &per_cpu(steal_time, cpu); + uint64_t clock; + + if (IS_ENABLED(CONFIG_64BIT)) + clock = READ_ONCE(steal->clock); + else { + uint32_t initial_high, low, high; + + do { + initial_high = READ_ONCE(steal->clock_high); + /* Do not reorder initial_high and high readings */ + virt_rmb(); + low = READ_ONCE(steal->clock_low); + /* Keep low reading in between */ + virt_rmb(); + high = READ_ONCE(steal->clock_high); + } while (initial_high != high); + + clock = ((uint64_t)high << 32) | low; + } + + return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); +} + +static void vmware_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct vmware_steal_time *st = &per_cpu(steal_time, cpu); + + if (!has_steal_clock) + return; + + if (!stealclock_enable(slow_virt_to_phys(st))) { + has_steal_clock = false; + return; + } + + pr_info("vmware-stealtime: cpu %d, pa %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); +} + +static void vmware_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + stealclock_disable(); +} + +static void vmware_guest_cpu_init(void) +{ + if (has_steal_clock) + vmware_register_steal_time(); +} + +static void vmware_pv_guest_cpu_reboot(void *unused) +{ + vmware_disable_steal_time(); +} + +static int vmware_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block vmware_pv_reboot_nb = { + .notifier_call = vmware_pv_reboot_notify, +}; + +#ifdef CONFIG_SMP +static void __init vmware_smp_prepare_boot_cpu(void) +{ + vmware_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static int vmware_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + vmware_guest_cpu_init(); + local_irq_enable(); + return 0; +} + +static int 
vmware_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + vmware_disable_steal_time(); + local_irq_enable(); + return 0; +} +#endif + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) + static_key_slow_inc(&paravirt_steal_enabled); + + return 0; +} +arch_initcall(activate_jump_labels); + static void __init vmware_paravirt_ops_setup(void) { pv_info.name = "VMware hypervisor"; @@ -148,6 +326,25 @@ static void __init vmware_paravirt_ops_setup(void) if (vmw_sched_clock) pv_ops.time.sched_clock = vmware_sched_clock; + if (vmware_is_stealclock_available()) { + has_steal_clock = true; + pv_ops.time.steal_clock = vmware_steal_clock; + + /* We use reboot notifier only to disable steal clock */ + register_reboot_notifier(&vmware_pv_reboot_nb); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = + vmware_smp_prepare_boot_cpu; + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/vmware:online", + vmware_cpu_online, + vmware_cpu_down_prepare) < 0) + pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n"); +#else + vmware_guest_cpu_init(); +#endif + } } #else #define vmware_paravirt_ops_setup() do {} while (0) -- 2.14.2
Alexey Makhalov
2020-Feb-12 20:03 UTC
[PATCH 4/5] x86/vmware: Enable steal time accounting
Set paravirt_steal_rq_enabled if steal clock present. paravirt_steal_rq_enabled is used in sched/core.c to adjust task progress by offsetting stolen time. Use 'no-steal-acc' off switch (share same name with KVM) to disable steal time accounting. Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> Reviewed-by: Thomas Hellstrom <thellstrom at vmware.com> --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/x86/kernel/cpu/vmware.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 87176a90e61a..07fbdccdd77c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3152,7 +3152,7 @@ [X86,PV_OPS] Disable paravirtualized VMware scheduler clock and use the default one. - no-steal-acc [X86,KVM,ARM64] Disable paravirtualized steal time + no-steal-acc [X86,PV_OPS,ARM64] Disable paravirtualized steal time accounting. 
steal time is computed, but won't influence scheduler behaviour diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 59459992ad47..0c65d661d88b 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -125,6 +125,7 @@ static struct cyc2ns_data vmware_cyc2ns __ro_after_init; static int vmw_sched_clock __initdata = 1; static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, steal_time) __aligned(64); static bool has_steal_clock; +static bool steal_acc __initdata = true; /* steal time accounting */ static __init int setup_vmw_sched_clock(char *s) { @@ -133,6 +134,13 @@ static __init int setup_vmw_sched_clock(char *s) } early_param("no-vmw-sched-clock", setup_vmw_sched_clock); +static __init int parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} +early_param("no-steal-acc", parse_no_stealacc); + static unsigned long long notrace vmware_sched_clock(void) { unsigned long long ns; @@ -306,8 +314,11 @@ static int vmware_cpu_down_prepare(unsigned int cpu) static __init int activate_jump_labels(void) { - if (has_steal_clock) + if (has_steal_clock) { static_key_slow_inc(&paravirt_steal_enabled); + if (steal_acc) + static_key_slow_inc(&paravirt_steal_rq_enabled); + } return 0; } -- 2.14.2
Alexey Makhalov
2020-Feb-12 20:03 UTC
[PATCH 5/5] x86/vmware: Use bool type for vmw_sched_clock
To be aligned with other bool variables. Signed-off-by: Alexey Makhalov <amakhalov at vmware.com> --- arch/x86/kernel/cpu/vmware.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 0c65d661d88b..54e57931051d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -122,14 +122,14 @@ static unsigned long vmware_get_tsc_khz(void) #ifdef CONFIG_PARAVIRT static struct cyc2ns_data vmware_cyc2ns __ro_after_init; -static int vmw_sched_clock __initdata = 1; +static bool vmw_sched_clock __initdata = true; static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, steal_time) __aligned(64); static bool has_steal_clock; static bool steal_acc __initdata = true; /* steal time accounting */ static __init int setup_vmw_sched_clock(char *s) { - vmw_sched_clock = 0; + vmw_sched_clock = false; return 0; } early_param("no-vmw-sched-clock", setup_vmw_sched_clock); -- 2.14.2
Apparently Analogous Threads
- [PATCH 0/3] x86/vmware guest improvements
- [PATCH 0/3] x86/vmware guest improvements
- [PATCH 3/3] x86/vmware: Add paravirt sched clock
- [RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()
- [RESEND PATCH 1/3] x86/vmware: Use tsc_khz value for calibrate_cpu()