Mark Langsdorf
2007-Aug-29 22:02 UTC
[Xen-devel] [PATCH] 2/2: cpufreq/PowerNow! in Xen: PowerNow! changes
Enable cpufreq support in Xen for AMD Opteron processors by: 1) Allowing the PowerNow! driver in dom0 to write to the PowerNow! MSRs. 2) Adding the cpufreq notifier chain to time-xen.c in dom0. On a frequency change, a platform hypercall is performed to scale the frequency multiplier in the hypervisor. 3) Adding a platform hypercall to the hypervisor to scale the frequency multiplier and reset the time stamps so that the next calibration remains reasonably correct. Patch 1 covers the frequency scaling platform call. Patch 2 covers the changes necessary to the PowerNow! driver to make it correctly associate shared cores under Xen and to write to MSRs. Most of this patch modifies the PowerNow! driver to correctly use the _PSD structure to determine pstate domains; a similar patch is upstream for the Linux kernel. This code can be readily expanded to cover Intel or other non-AMD processors by modifying xen/arch/x86/traps.c to allow the appropriate MSR accesses. Caveat: currently, this code does not support the in-kernel ondemand cpufreq governor. Dom0 must run a userspace daemon to monitor the utilization of the physical cpus with the getcpuinfo sysctl hypercall. Caveat 2: on SMP systems, dom0_vcpus_pin is strongly advised. Caveat 3: Even though the clock multipliers are being scaled and recorded correctly in both dom0 and the hypervisor, time errors appear immediately after a frequency change. They are not more likely when the frequency is constant. 
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com> diff -r 05c22f282023 arch/i386/kernel/cpu/cpufreq/powernow-k8.c --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c Tue Aug 14 16:20:55 2007 +0100 +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c Tue Aug 28 14:55:24 2007 -0500 @@ -738,6 +738,7 @@ static int find_psb_table(struct powerno data->numps = psb->numps; dprintk("numpstates: 0x%x\n", data->numps); + data->starting_core_affinity = cpumask_of_cpu(0); return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid); } /* @@ -758,15 +759,43 @@ static int find_psb_table(struct powerno #ifdef CONFIG_X86_POWERNOW_K8_ACPI static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) + if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE)) return; - data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; + data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK; +} + +static struct acpi_processor_performance *acpi_perf_data[NR_CPUS]; +static int 
preregister_valid = 0; + +static int powernow_k8_cpu_preinit_acpi() +{ + int i; + struct acpi_processor_performance *data; + for_each_possible_cpu(i) { + data = kzalloc(sizeof(struct acpi_processor_performance), + GFP_KERNEL); + if (!data) { + int j; + for_each_possible_cpu(j) { + kfree(acpi_perf_data[j]); + acpi_perf_data[j] = NULL; + } + return -ENODEV; + } + acpi_perf_data[i] = data; + } + + if (acpi_processor_preregister_performance(acpi_perf_data)) + return -ENODEV; + else + preregister_valid = 1; + return 0; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) @@ -774,28 +803,29 @@ static int powernow_k8_cpu_init_acpi(str struct cpufreq_frequency_table *powernow_table; int ret_val; - if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { + data->acpi_data = acpi_perf_data[data->cpu]; + if (acpi_processor_register_performance(data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); return -EIO; } /* verify the data contained in the ACPI structures */ - if (data->acpi_data.state_count <= 1) { + if (data->acpi_data->state_count <= 1) { dprintk("No ACPI P-States\n"); goto err_out; } - if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { dprintk("Invalid control/status registers (%x - %x)\n", - data->acpi_data.control_register.space_id, - data->acpi_data.status_register.space_id); + data->acpi_data->control_register.space_id, + data->acpi_data->status_register.space_id); goto err_out; } /* fill in data->powernow_table */ powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data.state_count + 1)), GFP_KERNEL); + * (data->acpi_data->state_count + 1)), GFP_KERNEL); if (!powernow_table) { 
dprintk("powernow_table memory alloc failure\n"); goto err_out; @@ -808,28 +838,43 @@ static int powernow_k8_cpu_init_acpi(str if (ret_val) goto err_out_mem; - powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; - powernow_table[data->acpi_data.state_count].index = 0; + powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END; + powernow_table[data->acpi_data->state_count].index = 0; data->powernow_table = powernow_table; /* fill in data */ - data->numps = data->acpi_data.state_count; + data->numps = data->acpi_data->state_count; print_basics(data); powernow_k8_acpi_pst_values(data, 0); /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); + /* determine affinity, from ACPI if available */ + if (preregister_valid) { + if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) || + (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY)) + data->starting_core_affinity = data->acpi_data->shared_cpu_map; + else + data->starting_core_affinity = cpumask_of_cpu(data->cpu); + } else { + /* best guess from family if not */ + if (cpu_family == CPU_HW_PSTATE) + data->starting_core_affinity = cpumask_of_cpu(data->cpu); + else + data->starting_core_affinity = cpu_core_map[data->cpu]; + } + return 0; err_out_mem: kfree(powernow_table); err_out: - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); - - /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ - data->acpi_data.state_count = 0; + acpi_processor_unregister_performance(data->acpi_data, data->cpu); + + /* data->acpi_data->state_count informs us at ->exit() whether ACPI was used */ + data->acpi_data->state_count = 0; return -ENODEV; } @@ -838,13 +883,13 @@ static int fill_powernow_table_pstate(st { int i; - for (i = 0; i < data->acpi_data.state_count; i++) { + for (i = 0; i < data->acpi_data->state_count; i++) { u32 index; u32 hi = 0, lo = 0; u32 fid; u32 did; - index = data->acpi_data.states[i].control & 
HW_PSTATE_MASK; + index = data->acpi_data->states[i].control & HW_PSTATE_MASK; if (index > MAX_HW_PSTATE) { printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); @@ -865,10 +910,10 @@ static int fill_powernow_table_pstate(st powernow_table[i].frequency = find_khz_freq_from_fiddid(fid, did); - if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { + if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", powernow_table[i].frequency, - (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); + (unsigned int) (data->acpi_data->states[i].core_frequency * 1000)); powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; continue; } @@ -880,16 +925,16 @@ static int fill_powernow_table_fidvid(st { int i; int cntlofreq = 0; - for (i = 0; i < data->acpi_data.state_count; i++) { + for (i = 0; i < data->acpi_data->state_count; i++) { u32 fid; u32 vid; if (data->exttype) { - fid = data->acpi_data.states[i].status & EXT_FID_MASK; - vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; + fid = data->acpi_data->states[i].status & EXT_FID_MASK; + vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK; } else { - fid = data->acpi_data.states[i].control & FID_MASK; - vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; + fid = data->acpi_data->states[i].control & FID_MASK; + vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK; } dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); @@ -930,10 +975,10 @@ static int fill_powernow_table_fidvid(st cntlofreq = i; } - if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { + if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries %u kHz vs. 
%u kHz\n", powernow_table[i].frequency, - (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); + (unsigned int) (data->acpi_data->states[i].core_frequency * 1000)); powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; continue; } @@ -943,14 +988,15 @@ static int fill_powernow_table_fidvid(st static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { - if (data->acpi_data.state_count) - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + if (data->acpi_data->state_count) + acpi_processor_unregister_performance(data->acpi_data, data->cpu); } #else static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } +static int powernow_k8_cpu_preinit_acpi() { return -ENODEV; } #endif /* CONFIG_X86_POWERNOW_K8_ACPI */ /* Take a frequency, and issue the fid/vid transition command */ @@ -1164,7 +1210,7 @@ static int __cpuinit powernowk8_cpu_init * an UP version, and is deprecated by AMD. */ if (num_online_cpus() != 1) { - printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n"); + printk(KERN_ERR PFX "Your BIOS does not provide _PSS objects. PowerNow! does not work on SMP systems without _PSS objects. Complain to your BIOS vendor.\n"); kfree(data); return -ENODEV; } @@ -1204,10 +1250,7 @@ static int __cpuinit powernowk8_cpu_init set_cpus_allowed(current, oldmask); pol->governor = CPUFREQ_DEFAULT_GOVERNOR; - if (cpu_family == CPU_HW_PSTATE) - pol->cpus = cpumask_of_cpu(pol->cpu); - else - pol->cpus = cpu_core_map[pol->cpu]; + pol->cpus = data->starting_core_affinity; data->available_cores = &(pol->cpus); /* Take a crude guess here. 
@@ -1323,6 +1366,7 @@ static int __cpuinit powernowk8_init(voi } if (supported_cpus == num_online_cpus()) { + powernow_k8_cpu_preinit_acpi(); printk(KERN_INFO PFX "Found %d %s " "processors (" VERSION ")\n", supported_cpus, boot_cpu_data.x86_model_id); diff -r 05c22f282023 arch/i386/kernel/cpu/cpufreq/powernow-k8.h --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h Tue Aug 14 16:20:55 2007 +0100 +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h Tue Aug 28 14:55:24 2007 -0500 @@ -32,12 +32,13 @@ struct powernow_k8_data { #ifdef CONFIG_X86_POWERNOW_K8_ACPI /* the acpi table needs to be kept. it''s only available if ACPI was * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance acpi_data; + struct acpi_processor_performance *acpi_data; #endif /* we need to keep track of associated cores, but let cpufreq * handle hotplug events - so just point at cpufreq pol->cpus * structure */ cpumask_t *available_cores; + cpumask_t starting_core_affinity; }; diff -r 256160ff19b7 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Thu Aug 16 13:27:59 2007 +0100 +++ b/xen/arch/x86/traps.c Wed Aug 29 17:10:06 2007 -0500 @@ -1724,6 +1724,15 @@ static int emulate_privileged_op(struct v->arch.guest_context.gs_base_user = res; break; #endif + case MSR_K8_FIDVID_STATUS: + case MSR_K8_FIDVID_CTL: + if ( IS_COMPAT(v->domain) ) + goto fail; + if ( wrmsr_safe(regs->ecx, regs->eax, regs->edx) ) + goto fail; + v->arch.guest_context.gs_base_user + ((u64)regs->edx << 32) | regs->eax; + break; default: if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) ) break; @@ -1760,6 +1769,13 @@ static int emulate_privileged_op(struct regs->edx = v->arch.guest_context.gs_base_user >> 32; break; #endif + case MSR_K8_FIDVID_CTL: + case MSR_K8_FIDVID_STATUS: + if ( IS_COMPAT(v->domain) ) + goto fail; + if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) ) + goto fail; + break; case MSR_EFER: if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) ) goto fail; diff -r 256160ff19b7 
xen/include/asm-x86/msr.h --- a/xen/include/asm-x86/msr.h Thu Aug 16 13:27:59 2007 +0100 +++ b/xen/include/asm-x86/msr.h Wed Aug 29 17:10:06 2007 -0500 @@ -357,6 +357,9 @@ static inline void write_efer(__u64 val) #define MSR_K8_VM_CR 0xC0010114 #define MSR_K8_VM_HSAVE_PA 0xC0010117 +#define MSR_K8_FIDVID_CTL 0xC0010041 +#define MSR_K8_FIDVID_STATUS 0xC0010042 + /* MSR_K8_VM_CR bits: */ #define _K8_VMCR_SVME_DISABLE 4 #define K8_VMCR_SVME_DISABLE (1 << _K8_VMCR_SVME_DISABLE) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel