Hi All!

At long last, a respin of the cpuidle vs rcu cleanup patches.

v1: https://lkml.kernel.org/r/20220608142723.103523089 at infradead.org

These here patches clean up the mess that is cpuidle vs rcuidle. At the end of the ride there's only one RCU_NONIDLE() user left:

  arch/arm64/kernel/suspend.c:	RCU_NONIDLE(__cpu_suspend_exit());

and 'one' trace_*_rcuidle() user:

  kernel/trace/trace_preemptirq.c:	trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
  kernel/trace/trace_preemptirq.c:	trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
  kernel/trace/trace_preemptirq.c:	trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
  kernel/trace/trace_preemptirq.c:	trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
  kernel/trace/trace_preemptirq.c:	trace_preempt_enable_rcuidle(a0, a1);
  kernel/trace/trace_preemptirq.c:	trace_preempt_disable_rcuidle(a0, a1);

However, this last is all in deprecated code that should be unused for GENERIC_ENTRY.

I've touched a lot of code that I can't test and I might've broken something by accident. In particular the whole ARM cpuidle stuff was quite involved. Please all; have a look where you haven't already.

New since v1:

 - rebase on top of Frederic's rcu-context-tracking rename fest
 - more omap goodness as per the last discussion (thanks Tony!)
 - removed one more RCU_NONIDLE() from arm64/risc-v perf code
 - ubsan/kasan fixes
 - intel_idle module-param for testing
 - a bunch of extra __always_inline, because compilers are silly

---
 arch/alpha/kernel/process.c               |  1 -
 arch/alpha/kernel/vmlinux.lds.S           |  1 -
 arch/arc/kernel/process.c                 |  3 ++
 arch/arc/kernel/vmlinux.lds.S             |  1 -
 arch/arm/include/asm/vmlinux.lds.h        |  1 -
 arch/arm/kernel/process.c                 |  1 -
 arch/arm/kernel/smp.c                     |  6 +-
 arch/arm/mach-gemini/board-dt.c           |  3 +-
 arch/arm/mach-imx/cpuidle-imx6q.c         |  4 +-
 arch/arm/mach-imx/cpuidle-imx6sx.c        |  5 ++-
 arch/arm/mach-omap2/common.h              |  6 ++-
 arch/arm/mach-omap2/cpuidle34xx.c         | 16 +++++++-
 arch/arm/mach-omap2/cpuidle44xx.c         | 29 +++++++-------
 arch/arm/mach-omap2/omap-mpuss-lowpower.c | 12 +++++-
 arch/arm/mach-omap2/pm.h                  |  2 +-
 arch/arm/mach-omap2/pm24xx.c              | 51 +-----------------------
 arch/arm/mach-omap2/pm34xx.c              | 14 +++++--
 arch/arm/mach-omap2/pm44xx.c              |  2 +-
 arch/arm/mach-omap2/powerdomain.c         | 10 ++---
 arch/arm64/kernel/idle.c                  |  1 -
 arch/arm64/kernel/smp.c                   |  4 +-
 arch/arm64/kernel/vmlinux.lds.S           |  1 -
 arch/csky/kernel/process.c                |  1 -
 arch/csky/kernel/smp.c                    |  2 +-
 arch/csky/kernel/vmlinux.lds.S            |  1 -
 arch/hexagon/kernel/process.c             |  1 -
 arch/hexagon/kernel/vmlinux.lds.S         |  1 -
 arch/ia64/kernel/process.c                |  1 +
 arch/ia64/kernel/vmlinux.lds.S            |  1 -
 arch/loongarch/kernel/idle.c              |  1 +
 arch/loongarch/kernel/vmlinux.lds.S       |  1 -
 arch/m68k/kernel/vmlinux-nommu.lds        |  1 -
 arch/m68k/kernel/vmlinux-std.lds          |  1 -
 arch/m68k/kernel/vmlinux-sun3.lds         |  1 -
 arch/microblaze/kernel/process.c          |  1 -
 arch/microblaze/kernel/vmlinux.lds.S      |  1 -
 arch/mips/kernel/idle.c                   |  8 ++--
 arch/mips/kernel/vmlinux.lds.S            |  1 -
 arch/nios2/kernel/process.c               |  1 -
 arch/nios2/kernel/vmlinux.lds.S           |  1 -
 arch/openrisc/kernel/process.c            |  1 +
 arch/openrisc/kernel/vmlinux.lds.S        |  1 -
 arch/parisc/kernel/process.c              |  2 -
 arch/parisc/kernel/vmlinux.lds.S          |  1 -
 arch/powerpc/kernel/idle.c                |  5 +--
 arch/powerpc/kernel/vmlinux.lds.S         |  1 -
 arch/riscv/kernel/process.c               |  1 -
 arch/riscv/kernel/vmlinux-xip.lds.S       |  1 -
 arch/riscv/kernel/vmlinux.lds.S           |  1 -
 arch/s390/kernel/idle.c                   |  1 -
 arch/s390/kernel/vmlinux.lds.S            |  1 -
 arch/sh/kernel/idle.c                     |  1 +
 arch/sh/kernel/vmlinux.lds.S              |  1 -
 arch/sparc/kernel/leon_pmc.c              |  4 ++
 arch/sparc/kernel/process_32.c            |  1 -
 arch/sparc/kernel/process_64.c            |  3 +-
 arch/sparc/kernel/vmlinux.lds.S           |  1 -
 arch/um/kernel/dyn.lds.S                  |  1 -
 arch/um/kernel/process.c                  |  1 -
 arch/um/kernel/uml.lds.S                  |  1 -
 arch/x86/boot/compressed/vmlinux.lds.S    |  1 +
 arch/x86/coco/tdx/tdcall.S                | 15 +------
 arch/x86/coco/tdx/tdx.c                   | 25 ++++--------
 arch/x86/events/amd/brs.c                 | 13 +++----
 arch/x86/include/asm/fpu/xcr.h            |  4 +-
 arch/x86/include/asm/irqflags.h           | 11 ++----
 arch/x86/include/asm/mwait.h              | 14 +++----
 arch/x86/include/asm/nospec-branch.h      |  2 +-
 arch/x86/include/asm/paravirt.h           |  6 ++-
 arch/x86/include/asm/perf_event.h         |  2 +-
 arch/x86/include/asm/shared/io.h          |  4 +-
 arch/x86/include/asm/shared/tdx.h         |  1 -
 arch/x86/include/asm/special_insns.h      |  8 ++--
 arch/x86/include/asm/xen/hypercall.h      |  2 +-
 arch/x86/kernel/cpu/bugs.c                |  2 +-
 arch/x86/kernel/fpu/core.c                |  4 +-
 arch/x86/kernel/paravirt.c                | 14 ++++++-
 arch/x86/kernel/process.c                 | 65 +++++++++++++++----------------
 arch/x86/kernel/vmlinux.lds.S             |  1 -
 arch/x86/lib/memcpy_64.S                  |  5 +--
 arch/x86/lib/memmove_64.S                 |  4 +-
 arch/x86/lib/memset_64.S                  |  4 +-
 arch/x86/xen/enlighten_pv.c               |  2 +-
 arch/x86/xen/irq.c                        |  2 +-
 arch/xtensa/kernel/process.c              |  1 +
 arch/xtensa/kernel/vmlinux.lds.S          |  1 -
 drivers/acpi/processor_idle.c             | 36 ++++++++++-------
 drivers/base/power/runtime.c              | 24 ++++++------
 drivers/clk/clk.c                         |  8 ++--
 drivers/cpuidle/cpuidle-arm.c             |  1 +
 drivers/cpuidle/cpuidle-big_little.c      |  8 +++-
 drivers/cpuidle/cpuidle-mvebu-v7.c        |  7 ++++
 drivers/cpuidle/cpuidle-psci.c            | 10 +++--
 drivers/cpuidle/cpuidle-qcom-spm.c        |  1 +
 drivers/cpuidle/cpuidle-riscv-sbi.c       | 10 +++--
 drivers/cpuidle/cpuidle-tegra.c           | 21 +++++++---
 drivers/cpuidle/cpuidle.c                 | 21 +++++----
 drivers/cpuidle/dt_idle_states.c          |  2 +-
 drivers/cpuidle/poll_state.c              | 10 ++++-
 drivers/idle/intel_idle.c                 | 19 +++++---
 drivers/perf/arm_pmu.c                    | 11 +----
 drivers/perf/riscv_pmu_sbi.c              |  8 +---
 include/asm-generic/vmlinux.lds.h         |  9 ++---
 include/linux/compiler_types.h            |  8 +++-
 include/linux/cpu.h                       |  3 --
 include/linux/cpuidle.h                   | 34 ++++++++++++++++
 include/linux/cpumask.h                   |  4 +-
 include/linux/percpu-defs.h               |  2 +-
 include/linux/sched/idle.h                | 40 ++++++++++++++-----
 include/linux/thread_info.h               | 18 ++++++++-
 include/linux/tracepoint.h                | 13 ++++++-
 kernel/cpu_pm.c                           |  9 -----
 kernel/printk/printk.c                    |  2 +-
 kernel/sched/idle.c                       | 47 +++++++---------------
 kernel/time/tick-broadcast-hrtimer.c      | 29 ++++++--------
 kernel/time/tick-broadcast.c              |  6 ++-
 kernel/trace/trace.c                      |  3 ++
 lib/ubsan.c                               |  5 ++-
 mm/kasan/kasan.h                          |  4 ++
 mm/kasan/shadow.c                         | 38 ++++++++++++++++++
 tools/objtool/check.c                     | 17 ++++++++
 121 files changed, 511 insertions(+), 420 deletions(-)
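For context: RCU_NONIDLE() is the construct being killed off above. A minimal sketch of its shape, assuming the post-rename (ct_*) context-tracking API that this series is rebased on:

	#define RCU_NONIDLE(a)						\
		do {							\
			ct_irq_enter_irqson();	/* RCU watches again */	\
			do { a; } while (0);				\
			ct_irq_exit_irqson();	/* and stops again */	\
		} while (0)

Each such bracket pays for two context-tracking transitions in the idle path; the CPUIDLE_FLAG_RCU_IDLE conversions below instead keep RCU watching until the driver is genuinely about to idle.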
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 01/44] x86/perf/amd: Remove tracing from perf_lopwr_cb()
The perf_lopwr_cb() is called from the idle routines; there is no RCU there, we must not enter tracing. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/events/amd/brs.c | 13 +++++-------- arch/x86/include/asm/perf_event.h | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -41,18 +41,15 @@ static inline unsigned int brs_to(int id return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; } -static inline void set_debug_extn_cfg(u64 val) +static __always_inline void set_debug_extn_cfg(u64 val) { /* bits[4:3] must always be set to 11b */ - wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); + __wrmsr(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3, val >> 32); } -static inline u64 get_debug_extn_cfg(void) +static __always_inline u64 get_debug_extn_cfg(void) { - u64 val; - - rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); - return val; + return __rdmsr(MSR_AMD_DBG_EXTN_CFG); } static bool __init amd_brs_detect(void) @@ -338,7 +335,7 @@ void amd_pmu_brs_sched_task(struct perf_ * called from ACPI processor_idle.c or acpi_pad.c * with interrupts disabled */ -void perf_amd_brs_lopwr_cb(bool lopwr_in) +void noinstr perf_amd_brs_lopwr_cb(bool lopwr_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); union amd_debug_extn_cfg cfg; --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -554,7 +554,7 @@ extern void perf_amd_brs_lopwr_cb(bool l DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); -static inline void perf_lopwr_cb(bool lopwr_in) +static __always_inline void perf_lopwr_cb(bool lopwr_in) { static_call_mod(perf_lopwr_cb)(lopwr_in); }
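A note on the MSR accessors swapped in above: the wrmsrl()/rdmsrl() wrappers can bounce through paravirt and the MSR tracepoints, which is precisely what an RCU-free idle path cannot tolerate, while __wrmsr()/__rdmsr() are the bare instructions. A sketch of the lo/hi convention __wrmsr() expects (helper name hypothetical):

	static __always_inline void wrmsr64_notrace(u32 msr, u64 val)
	{
		/* __wrmsr() wants the 64-bit value pre-split into EAX/EDX */
		__wrmsr(msr, (u32)val, (u32)(val >> 32));
	}

which is the same split visible in set_debug_extn_cfg() above ("val | 3ULL << 3" for the low half, "val >> 32" for the high half).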
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 02/44] x86/idle: Replace x86_idle with a static_call
Typical boot time setup; no need to suffer an indirect call for that. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reviewed-by: Frederic Weisbecker <frederic at kernel.org> Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki at intel.com> --- arch/x86/kernel/process.c | 50 +++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 22 deletions(-) --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -24,6 +24,7 @@ #include <linux/cpuidle.h> #include <linux/acpi.h> #include <linux/elf-randomize.h> +#include <linux/static_call.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <asm/cpu.h> @@ -692,7 +693,23 @@ void __switch_to_xtra(struct task_struct unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); -static void (*x86_idle)(void); +/* + * We use this if we don't have any better idle routine.. + */ +void __cpuidle default_idle(void) +{ + raw_safe_halt(); +} +#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) +EXPORT_SYMBOL(default_idle); +#endif + +DEFINE_STATIC_CALL_NULL(x86_idle, default_idle); + +static bool x86_idle_set(void) +{ + return !!static_call_query(x86_idle); +} #ifndef CONFIG_SMP static inline void play_dead(void) @@ -715,28 +732,17 @@ void arch_cpu_idle_dead(void) /* * Called from the generic idle code. */ -void arch_cpu_idle(void) -{ - x86_idle(); -} - -/* - * We use this if we don't have any better idle routine.. - */ -void __cpuidle default_idle(void) +void __cpuidle arch_cpu_idle(void) { - raw_safe_halt(); + static_call(x86_idle)(); } -#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) -EXPORT_SYMBOL(default_idle); -#endif #ifdef CONFIG_XEN bool xen_set_default_idle(void) { - bool ret = !!x86_idle; + bool ret = x86_idle_set(); - x86_idle = default_idle; + static_call_update(x86_idle, default_idle); return ret; } @@ -859,20 +865,20 @@ void select_idle_routine(const struct cp if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); #endif - if (x86_idle || boot_option_idle_override == IDLE_POLL) + if (x86_idle_set() || boot_option_idle_override == IDLE_POLL) return; if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { pr_info("using AMD E400 aware idle routine\n"); - x86_idle = amd_e400_idle; + static_call_update(x86_idle, amd_e400_idle); } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); - x86_idle = mwait_idle; + static_call_update(x86_idle, mwait_idle); } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { pr_info("using TDX aware idle routine\n"); - x86_idle = tdx_safe_halt; + static_call_update(x86_idle, tdx_safe_halt); } else - x86_idle = default_idle; + static_call_update(x86_idle, default_idle); } void amd_e400_c1e_apic_setup(void) @@ -925,7 +931,7 @@ static int __init idle_setup(char *str) * To continue to load the CPU idle driver, don't touch * the boot_option_idle_override. */ - x86_idle = default_idle; + static_call_update(x86_idle, default_idle); boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /*
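For readers unfamiliar with the API, a minimal, self-contained sketch of the static_call pattern used above (names hypothetical, not the kernel's actual idle wiring):

	#include <linux/static_call.h>

	static void idle_a(void) { }
	static void idle_b(void) { }

	/* NULL target at boot; idle_a only donates the function type. */
	DEFINE_STATIC_CALL_NULL(my_idle, idle_a);

	static void pick_idle(bool fancy)
	{
		/* One-time selection: patches the call sites in place. */
		static_call_update(my_idle, fancy ? idle_b : idle_a);
	}

	static void do_idle_step(void)
	{
		static_call(my_idle)();	/* direct call, no indirect branch */
	}

static_call_query(), used above in x86_idle_set(), returns the currently installed target, giving a cheap "has anything been installed yet?" test.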
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 03/44] cpuidle/poll: Ensure IRQ state is invariant
cpuidle_state::enter() methods should be IRQ invariant: poll_idle() may re-enable interrupts while polling, but it must return with them disabled again.

Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki at intel.com>
---
 drivers/cpuidle/poll_state.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -17,7 +17,7 @@ static int __cpuidle poll_idle(struct cp
 
 	dev->poll_time_limit = false;
 
-	local_irq_enable();
+	raw_local_irq_enable();
 	if (!current_set_polling_and_test()) {
 		unsigned int loop_count = 0;
 		u64 limit;
@@ -36,6 +36,8 @@ static int __cpuidle poll_idle(struct cp
 			}
 		}
 	}
+	raw_local_irq_disable();
+
 	current_clr_polling();
 
 	return index;
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 04/44] cpuidle: Move IRQ state validation
Make cpuidle_enter_state() consistent with the s2idle variant and verify ->enter() always returns with interrupts disabled. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- drivers/cpuidle/cpuidle.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -236,7 +236,11 @@ int cpuidle_enter_state(struct cpuidle_d stop_critical_timings(); if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) ct_idle_enter(); + entered_state = target_state->enter(dev, drv, index); + if (WARN_ONCE(!irqs_disabled(), "%ps leaked IRQ state", target_state->enter)) + raw_local_irq_disable(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) ct_idle_exit(); start_critical_timings(); @@ -248,12 +252,8 @@ int cpuidle_enter_state(struct cpuidle_d /* The cpu is no longer idle or about to enter idle. */ sched_idle_set_state(NULL); - if (broadcast) { - if (WARN_ON_ONCE(!irqs_disabled())) - local_irq_disable(); - + if (broadcast) tick_broadcast_exit(); - } if (!cpuidle_state_is_coupled(drv, index)) local_irq_enable();
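The contract this verifies, sketched as a hypothetical ->enter() implementation (not a real driver): called with IRQs disabled, free to enable them while idling, but required to return with them disabled:

	static int my_enter(struct cpuidle_device *dev,
			    struct cpuidle_driver *drv, int index)
	{
		raw_local_irq_enable();		/* idle with wakeups allowed */
		while (!need_resched())
			cpu_relax();
		raw_local_irq_disable();	/* re-establish the invariant */
		return index;
	}

A driver that forgets the final disable now gets a named WARN splat pointing at its ->enter() method instead of silently corrupting IRQ state downstream.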
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 05/44] cpuidle,riscv: Push RCU-idle into driver
Doing RCU-idle outside the driver, only to then temporarily enable it again, at least twice, before going idle is daft.

Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
---
 drivers/cpuidle/cpuidle-riscv-sbi.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

--- a/drivers/cpuidle/cpuidle-riscv-sbi.c
+++ b/drivers/cpuidle/cpuidle-riscv-sbi.c
@@ -116,12 +116,12 @@ static int __sbi_enter_domain_idle_state
 		return -1;
 
 	/* Do runtime PM to manage a hierarchical CPU toplogy. */
-	ct_irq_enter_irqson();
 	if (s2idle)
 		dev_pm_genpd_suspend(pd_dev);
 	else
 		pm_runtime_put_sync_suspend(pd_dev);
-	ct_irq_exit_irqson();
+
+	ct_idle_enter();
 
 	if (sbi_is_domain_state_available())
 		state = sbi_get_domain_state();
@@ -130,12 +130,12 @@ static int __sbi_enter_domain_idle_state
 
 	ret = sbi_suspend(state) ? -1 : idx;
 
-	ct_irq_enter_irqson();
+	ct_idle_exit();
+
 	if (s2idle)
 		dev_pm_genpd_resume(pd_dev);
 	else
 		pm_runtime_get_sync(pd_dev);
-	ct_irq_exit_irqson();
 
 	cpu_pm_exit();
 
@@ -246,6 +246,7 @@ static int sbi_dt_cpu_init_topology(stru
 	 * of a shared state for the domain, assumes the domain states are all
 	 * deeper states.
	 */
+	drv->states[state_count - 1].flags |= CPUIDLE_FLAG_RCU_IDLE;
	drv->states[state_count - 1].enter = sbi_enter_domain_idle_state;
	drv->states[state_count - 1].enter_s2idle = sbi_enter_s2idle_domain_idle_state;
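Taken together with the core change in the previous patch, the division of labour is: when a state carries CPUIDLE_FLAG_RCU_IDLE, cpuidle_enter_state() leaves RCU alone and the driver brackets only the genuinely RCU-free window. Recapping the core side:

	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
		ct_idle_enter();

	entered_state = target_state->enter(dev, drv, index);

	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
		ct_idle_exit();

That lets the genpd/runtime-PM work above run with RCU fully watching, instead of repeatedly toggling it back on with ct_irq_enter_irqson()/ct_irq_exit_irqson() pairs.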
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 06/44] cpuidle,tegra: Push RCU-idle into driver
Doing RCU-idle outside the driver, only to then temporarily enable it again, at least twice, before going idle is daft. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- drivers/cpuidle/cpuidle-tegra.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) --- a/drivers/cpuidle/cpuidle-tegra.c +++ b/drivers/cpuidle/cpuidle-tegra.c @@ -180,9 +180,11 @@ static int tegra_cpuidle_state_enter(str } local_fiq_disable(); - RCU_NONIDLE(tegra_pm_set_cpu_in_lp2()); + tegra_pm_set_cpu_in_lp2(); cpu_pm_enter(); + ct_idle_enter(); + switch (index) { case TEGRA_C7: err = tegra_cpuidle_c7_enter(); @@ -197,8 +199,10 @@ static int tegra_cpuidle_state_enter(str break; } + ct_idle_exit(); + cpu_pm_exit(); - RCU_NONIDLE(tegra_pm_clear_cpu_in_lp2()); + tegra_pm_clear_cpu_in_lp2(); local_fiq_enable(); return err ?: index; @@ -226,6 +230,7 @@ static int tegra_cpuidle_enter(struct cp struct cpuidle_driver *drv, int index) { + bool do_rcu = drv->states[index].flags & CPUIDLE_FLAG_RCU_IDLE; unsigned int cpu = cpu_logical_map(dev->cpu); int ret; @@ -233,9 +238,13 @@ static int tegra_cpuidle_enter(struct cp if (dev->states_usage[index].disable) return -1; - if (index == TEGRA_C1) + if (index == TEGRA_C1) { + if (do_rcu) + ct_idle_enter(); ret = arm_cpuidle_simple_enter(dev, drv, index); - else + if (do_rcu) + ct_idle_exit(); + } else ret = tegra_cpuidle_state_enter(dev, index, cpu); if (ret < 0) { @@ -285,7 +294,8 @@ static struct cpuidle_driver tegra_idle_ .exit_latency = 2000, .target_residency = 2200, .power_usage = 100, - .flags = CPUIDLE_FLAG_TIMER_STOP, + .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_RCU_IDLE, .name = "C7", .desc = "CPU core powered off", }, @@ -295,6 +305,7 @@ static struct cpuidle_driver tegra_idle_ .target_residency = 10000, .power_usage = 0, .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_RCU_IDLE | CPUIDLE_FLAG_COUPLED, .name = "CC6", .desc = "CPU cluster powered off",
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 07/44] cpuidle,psci: Push RCU-idle into driver
Doing RCU-idle outside the driver, only to then temporarily enable it again, at least twice, before going idle is daft. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- drivers/cpuidle/cpuidle-psci.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -69,12 +69,12 @@ static int __psci_enter_domain_idle_stat return -1; /* Do runtime PM to manage a hierarchical CPU toplogy. */ - ct_irq_enter_irqson(); if (s2idle) dev_pm_genpd_suspend(pd_dev); else pm_runtime_put_sync_suspend(pd_dev); - ct_irq_exit_irqson(); + + ct_idle_enter(); state = psci_get_domain_state(); if (!state) @@ -82,12 +82,12 @@ static int __psci_enter_domain_idle_stat ret = psci_cpu_suspend_enter(state) ? -1 : idx; - ct_irq_enter_irqson(); + ct_idle_exit(); + if (s2idle) dev_pm_genpd_resume(pd_dev); else pm_runtime_get_sync(pd_dev); - ct_irq_exit_irqson(); cpu_pm_exit(); @@ -240,6 +240,7 @@ static int psci_dt_cpu_init_topology(str * of a shared state for the domain, assumes the domain states are all * deeper states. */ + drv->states[state_count - 1].flags |= CPUIDLE_FLAG_RCU_IDLE; drv->states[state_count - 1].enter = psci_enter_domain_idle_state; drv->states[state_count - 1].enter_s2idle = psci_enter_s2idle_domain_idle_state; psci_cpuidle_use_cpuhp = true;
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 08/44] cpuidle,imx6: Push RCU-idle into driver
Doing RCU-idle outside the driver, only to then temporarily enable it again, at least twice, before going idle is daft. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/arm/mach-imx/cpuidle-imx6sx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) --- a/arch/arm/mach-imx/cpuidle-imx6sx.c +++ b/arch/arm/mach-imx/cpuidle-imx6sx.c @@ -47,7 +47,9 @@ static int imx6sx_enter_wait(struct cpui cpu_pm_enter(); cpu_cluster_pm_enter(); + ct_idle_enter(); cpu_suspend(0, imx6sx_idle_finish); + ct_idle_exit(); cpu_cluster_pm_exit(); cpu_pm_exit(); @@ -87,7 +89,8 @@ static struct cpuidle_driver imx6sx_cpui */ .exit_latency = 300, .target_residency = 500, - .flags = CPUIDLE_FLAG_TIMER_STOP, + .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_RCU_IDLE, .enter = imx6sx_enter_wait, .name = "LOW-POWER-IDLE", .desc = "ARM power off",
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 09/44] cpuidle,omap3: Push RCU-idle into driver
Doing RCU-idle outside the driver, only to then temporarily enable it again before going idle is daft.

Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
Reviewed-by: Tony Lindgren <tony at atomide.com>
Tested-by: Tony Lindgren <tony at atomide.com>
---
 arch/arm/mach-omap2/cpuidle34xx.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -133,7 +133,9 @@ static int omap3_enter_idle(struct cpuid
 	}
 
 	/* Execute ARM wfi */
+	ct_idle_enter();
 	omap_sram_idle();
+	ct_idle_exit();
 
 	/*
 	 * Call idle CPU PM enter notifier chain to restore
@@ -265,6 +267,7 @@ static struct cpuidle_driver omap3_idle_
 	.owner = THIS_MODULE,
 	.states = {
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 2 + 2,
 			.target_residency = 5,
@@ -272,6 +275,7 @@ static struct cpuidle_driver omap3_idle_
 			.desc = "MPU ON + CORE ON",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 10 + 10,
 			.target_residency = 30,
@@ -279,6 +283,7 @@ static struct cpuidle_driver omap3_idle_
 			.desc = "MPU ON + CORE ON",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 50 + 50,
 			.target_residency = 300,
@@ -286,6 +291,7 @@ static struct cpuidle_driver omap3_idle_
 			.desc = "MPU RET + CORE ON",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 1500 + 1800,
 			.target_residency = 4000,
@@ -293,6 +299,7 @@ static struct cpuidle_driver omap3_idle_
 			.desc = "MPU OFF + CORE ON",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 2500 + 7500,
 			.target_residency = 12000,
@@ -300,6 +307,7 @@ static struct cpuidle_driver omap3_idle_
 			.desc = "MPU RET + CORE RET",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 3000 + 8500,
 			.target_residency = 15000,
@@ -307,6 +315,7 @@ static struct cpuidle_driver omap3_idle_
 			.desc = "MPU OFF + CORE RET",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 10000 + 30000,
 			.target_residency = 30000,
@@ -328,6 +337,7 @@ static struct cpuidle_driver omap3430_id
 	.owner = THIS_MODULE,
 	.states = {
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 110 + 162,
 			.target_residency = 5,
@@ -335,6 +345,7 @@ static struct cpuidle_driver omap3430_id
 			.desc = "MPU ON + CORE ON",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 106 + 180,
 			.target_residency = 309,
@@ -342,6 +353,7 @@ static struct cpuidle_driver omap3430_id
 			.desc = "MPU ON + CORE ON",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 107 + 410,
 			.target_residency = 46057,
@@ -349,6 +361,7 @@ static struct cpuidle_driver omap3430_id
 			.desc = "MPU RET + CORE ON",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 121 + 3374,
 			.target_residency = 46057,
@@ -356,6 +369,7 @@ static struct cpuidle_driver omap3430_id
 			.desc = "MPU OFF + CORE ON",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 855 + 1146,
 			.target_residency = 46057,
@@ -363,6 +377,7 @@ static struct cpuidle_driver omap3430_id
 			.desc = "MPU RET + CORE RET",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 7580 + 4134,
 			.target_residency = 484329,
@@ -370,6 +385,7 @@ static struct cpuidle_driver omap3430_id
 			.desc = "MPU OFF + CORE RET",
 		},
 		{
+			.flags = CPUIDLE_FLAG_RCU_IDLE,
 			.enter = omap3_enter_idle_bm,
 			.exit_latency = 7505 + 15274,
 			.target_residency = 484329,
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 10/44] cpuidle,armada: Push RCU-idle into driver
Doing RCU-idle outside the driver, only to then temporarily enable it again before going idle is daft. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- drivers/cpuidle/cpuidle-mvebu-v7.c | 7 +++++++ 1 file changed, 7 insertions(+) --- a/drivers/cpuidle/cpuidle-mvebu-v7.c +++ b/drivers/cpuidle/cpuidle-mvebu-v7.c @@ -36,7 +36,10 @@ static int mvebu_v7_enter_idle(struct cp if (drv->states[index].flags & MVEBU_V7_FLAG_DEEP_IDLE) deepidle = true; + ct_idle_enter(); ret = mvebu_v7_cpu_suspend(deepidle); + ct_idle_exit(); + cpu_pm_exit(); if (ret) @@ -49,6 +52,7 @@ static struct cpuidle_driver armadaxp_id .name = "armada_xp_idle", .states[0] = ARM_CPUIDLE_WFI_STATE, .states[1] = { + .flags = CPUIDLE_FLAG_RCU_IDLE, .enter = mvebu_v7_enter_idle, .exit_latency = 100, .power_usage = 50, @@ -57,6 +61,7 @@ static struct cpuidle_driver armadaxp_id .desc = "CPU power down", }, .states[2] = { + .flags = CPUIDLE_FLAG_RCU_IDLE, .enter = mvebu_v7_enter_idle, .exit_latency = 1000, .power_usage = 5, @@ -72,6 +77,7 @@ static struct cpuidle_driver armada370_i .name = "armada_370_idle", .states[0] = ARM_CPUIDLE_WFI_STATE, .states[1] = { + .flags = CPUIDLE_FLAG_RCU_IDLE, .enter = mvebu_v7_enter_idle, .exit_latency = 100, .power_usage = 5, @@ -87,6 +93,7 @@ static struct cpuidle_driver armada38x_i .name = "armada_38x_idle", .states[0] = ARM_CPUIDLE_WFI_STATE, .states[1] = { + .flags = CPUIDLE_FLAG_RCU_IDLE, .enter = mvebu_v7_enter_idle, .exit_latency = 10, .power_usage = 5,
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 11/44] cpuidle,omap4: Push RCU-idle into driver
Doing RCU-idle outside the driver, only to then temporarily enable it again, some *four* times, before going idle is daft. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reviewed-by: Tony Lindgren <tony at atomide.com> Tested-by: Tony Lindgren <tony at atomide.com> --- arch/arm/mach-omap2/cpuidle44xx.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) --- a/arch/arm/mach-omap2/cpuidle44xx.c +++ b/arch/arm/mach-omap2/cpuidle44xx.c @@ -105,7 +105,9 @@ static int omap_enter_idle_smp(struct cp } raw_spin_unlock_irqrestore(&mpu_lock, flag); + ct_idle_enter(); omap4_enter_lowpower(dev->cpu, cx->cpu_state); + ct_idle_exit(); raw_spin_lock_irqsave(&mpu_lock, flag); if (cx->mpu_state_vote == num_online_cpus()) @@ -151,10 +153,10 @@ static int omap_enter_idle_coupled(struc (cx->mpu_logic_state == PWRDM_POWER_OFF); /* Enter broadcast mode for periodic timers */ - RCU_NONIDLE(tick_broadcast_enable()); + tick_broadcast_enable(); /* Enter broadcast mode for one-shot timers */ - RCU_NONIDLE(tick_broadcast_enter()); + tick_broadcast_enter(); /* * Call idle CPU PM enter notifier chain so that @@ -166,7 +168,7 @@ static int omap_enter_idle_coupled(struc if (dev->cpu == 0) { pwrdm_set_logic_retst(mpu_pd, cx->mpu_logic_state); - RCU_NONIDLE(omap_set_pwrdm_state(mpu_pd, cx->mpu_state)); + omap_set_pwrdm_state(mpu_pd, cx->mpu_state); /* * Call idle CPU cluster PM enter notifier chain @@ -178,14 +180,16 @@ static int omap_enter_idle_coupled(struc index = 0; cx = state_ptr + index; pwrdm_set_logic_retst(mpu_pd, cx->mpu_logic_state); - RCU_NONIDLE(omap_set_pwrdm_state(mpu_pd, cx->mpu_state)); + omap_set_pwrdm_state(mpu_pd, cx->mpu_state); mpuss_can_lose_context = 0; } } } + ct_idle_enter(); omap4_enter_lowpower(dev->cpu, cx->cpu_state); cpu_done[dev->cpu] = true; + ct_idle_exit(); /* Wakeup CPU1 only if it is not offlined */ if (dev->cpu == 0 && cpumask_test_cpu(1, cpu_online_mask)) { @@ -194,9 +198,9 @@ static int omap_enter_idle_coupled(struc mpuss_can_lose_context) gic_dist_disable(); - RCU_NONIDLE(clkdm_deny_idle(cpu_clkdm[1])); - RCU_NONIDLE(omap_set_pwrdm_state(cpu_pd[1], PWRDM_POWER_ON)); - RCU_NONIDLE(clkdm_allow_idle(cpu_clkdm[1])); + clkdm_deny_idle(cpu_clkdm[1]); + omap_set_pwrdm_state(cpu_pd[1], PWRDM_POWER_ON); + clkdm_allow_idle(cpu_clkdm[1]); if (IS_PM44XX_ERRATUM(PM_OMAP4_ROM_SMP_BOOT_ERRATUM_GICD) && mpuss_can_lose_context) { @@ -222,7 +226,7 @@ static int omap_enter_idle_coupled(struc cpu_pm_exit(); cpu_pm_out: - RCU_NONIDLE(tick_broadcast_exit()); + tick_broadcast_exit(); fail: cpuidle_coupled_parallel_barrier(dev, &abort_barrier); @@ -247,7 +251,8 @@ static struct cpuidle_driver omap4_idle_ /* C2 - CPU0 OFF + CPU1 OFF + MPU CSWR */ .exit_latency = 328 + 440, .target_residency = 960, - .flags = CPUIDLE_FLAG_COUPLED, + .flags = CPUIDLE_FLAG_COUPLED | + CPUIDLE_FLAG_RCU_IDLE, .enter = omap_enter_idle_coupled, .name = "C2", .desc = "CPUx OFF, MPUSS CSWR", @@ -256,7 +261,8 @@ static struct cpuidle_driver omap4_idle_ /* C3 - CPU0 OFF + CPU1 OFF + MPU OSWR */ .exit_latency = 460 + 518, .target_residency = 1100, - .flags = CPUIDLE_FLAG_COUPLED, + .flags = CPUIDLE_FLAG_COUPLED | + CPUIDLE_FLAG_RCU_IDLE, .enter = omap_enter_idle_coupled, .name = "C3", .desc = "CPUx OFF, MPUSS OSWR", @@ -282,7 +288,8 @@ static struct cpuidle_driver omap5_idle_ /* C2 - CPU0 RET + CPU1 RET + MPU CSWR */ .exit_latency = 48 + 60, .target_residency = 100, - .flags = CPUIDLE_FLAG_TIMER_STOP, + .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_RCU_IDLE, .enter = 
omap_enter_idle_smp, .name = "C2", .desc = "CPUx CSWR, MPUSS CSWR",
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 12/44] cpuidle,dt: Push RCU-idle into driver
Doing RCU-idle outside the driver, only to then temporarily enable it again before going idle is daft.

Notably: this converts all dt_init_idle_driver() and __CPU_PM_CPU_IDLE_ENTER() users, for they are inextricably intertwined.

Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
---
 arch/arm/mach-omap2/cpuidle34xx.c    | 4 ++--
 drivers/acpi/processor_idle.c        | 2 ++
 drivers/cpuidle/cpuidle-arm.c        | 1 +
 drivers/cpuidle/cpuidle-big_little.c | 8 ++++++--
 drivers/cpuidle/cpuidle-psci.c       | 1 +
 drivers/cpuidle/cpuidle-qcom-spm.c   | 1 +
 drivers/cpuidle/cpuidle-riscv-sbi.c  | 1 +
 drivers/cpuidle/dt_idle_states.c     | 2 +-
 include/linux/cpuidle.h              | 4 ++++
 9 files changed, 19 insertions(+), 5 deletions(-)

--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1200,6 +1200,8 @@ static int acpi_processor_setup_lpi_stat
 		state->target_residency = lpi->min_residency;
 		if (lpi->arch_flags)
 			state->flags |= CPUIDLE_FLAG_TIMER_STOP;
+		if (lpi->entry_method == ACPI_CSTATE_FFH)
+			state->flags |= CPUIDLE_FLAG_RCU_IDLE;
 		state->enter = acpi_idle_lpi_enter;
 		drv->safe_state_index = i;
 	}
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -53,6 +53,7 @@ static struct cpuidle_driver arm_idle_dr
 	 * handler for idle state index 0.
	 */
	.states[0] = {
+		.flags = CPUIDLE_FLAG_RCU_IDLE,
		.enter = arm_enter_idle_state,
		.exit_latency = 1,
		.target_residency = 1,
--- a/drivers/cpuidle/cpuidle-big_little.c
+++ b/drivers/cpuidle/cpuidle-big_little.c
@@ -64,7 +64,8 @@ static struct cpuidle_driver bl_idle_lit
		.enter = bl_enter_powerdown,
		.exit_latency = 700,
		.target_residency = 2500,
-		.flags = CPUIDLE_FLAG_TIMER_STOP,
+		.flags = CPUIDLE_FLAG_TIMER_STOP |
+			 CPUIDLE_FLAG_RCU_IDLE,
		.name = "C1",
		.desc = "ARM little-cluster power down",
	},
@@ -85,7 +86,8 @@ static struct cpuidle_driver bl_idle_big
		.enter = bl_enter_powerdown,
		.exit_latency = 500,
		.target_residency = 2000,
-		.flags = CPUIDLE_FLAG_TIMER_STOP,
+		.flags = CPUIDLE_FLAG_TIMER_STOP |
+			 CPUIDLE_FLAG_RCU_IDLE,
		.name = "C1",
		.desc = "ARM big-cluster power down",
	},
@@ -124,11 +126,13 @@ static int bl_enter_powerdown(struct cpu
				struct cpuidle_driver *drv, int idx)
 {
	cpu_pm_enter();
+	ct_idle_enter();
 
	cpu_suspend(0, bl_powerdown_finisher);
 
	/* signals the MCPM core that CPU is out of low power state */
	mcpm_cpu_powered_up();
+	ct_idle_exit();
 
	cpu_pm_exit();
--- a/drivers/cpuidle/cpuidle-psci.c
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -357,6 +357,7 @@ static int psci_idle_init_cpu(struct dev
	 * PSCI idle states relies on architectural WFI to be represented as
	 * state index 0.
	 */
+	drv->states[0].flags = CPUIDLE_FLAG_RCU_IDLE;
	drv->states[0].enter = psci_enter_idle_state;
	drv->states[0].exit_latency = 1;
	drv->states[0].target_residency = 1;
--- a/drivers/cpuidle/cpuidle-qcom-spm.c
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -72,6 +72,7 @@ static struct cpuidle_driver qcom_spm_id
	.owner = THIS_MODULE,
	.states[0] = {
		.enter = spm_enter_idle_state,
+		.flags = CPUIDLE_FLAG_RCU_IDLE,
		.exit_latency = 1,
		.target_residency = 1,
		.power_usage = UINT_MAX,
--- a/drivers/cpuidle/cpuidle-riscv-sbi.c
+++ b/drivers/cpuidle/cpuidle-riscv-sbi.c
@@ -332,6 +332,7 @@ static int sbi_cpuidle_init_cpu(struct d
	drv->cpumask = (struct cpumask *)cpumask_of(cpu);
 
	/* RISC-V architectural WFI to be represented as state index 0. */
+	drv->states[0].flags = CPUIDLE_FLAG_RCU_IDLE;
	drv->states[0].enter = sbi_cpuidle_enter_state;
	drv->states[0].exit_latency = 1;
	drv->states[0].target_residency = 1;
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -77,7 +77,7 @@ static int init_state_node(struct cpuidl
	if (err)
		desc = state_node->name;
 
-	idle_state->flags = 0;
+	idle_state->flags = CPUIDLE_FLAG_RCU_IDLE;
	if (of_property_read_bool(state_node, "local-timer-stop"))
		idle_state->flags |= CPUIDLE_FLAG_TIMER_STOP;
	/*
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -282,14 +282,18 @@ extern s64 cpuidle_governor_latency_req(
	int __ret = 0;						\
								\
	if (!idx) {						\
+		ct_idle_enter();				\
		cpu_do_idle();					\
+		ct_idle_exit();					\
		return idx;					\
	}							\
								\
	if (!is_retention)					\
		__ret = cpu_pm_enter();				\
	if (!__ret) {						\
+		ct_idle_enter();				\
		__ret = low_level_idle_enter(state);		\
+		ct_idle_exit();					\
		if (!is_retention)				\
			cpu_pm_exit();				\
	}							\
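The __CPU_PM_CPU_IDLE_ENTER() part of this conversion is what covers most ARM and RISC-V firmware-assisted drivers in one go. A hedged sketch of how a driver typically reaches that macro, via the public CPU_PM_CPU_IDLE_ENTER_PARAM() wrapper (driver names and state table hypothetical):

	static u32 my_states[CPUIDLE_STATE_MAX];	/* hypothetical */

	static int my_fw_suspend(unsigned long state);	/* hypothetical firmware call */

	static int my_fw_idle_enter(struct cpuidle_device *dev,
				    struct cpuidle_driver *drv, int idx)
	{
		/*
		 * Expands to __CPU_PM_CPU_IDLE_ENTER(): cpu_pm_enter() first,
		 * then (after this patch) ct_idle_enter() tightly around the
		 * firmware call only.
		 */
		return CPU_PM_CPU_IDLE_ENTER_PARAM(my_fw_suspend, idx, my_states[idx]);
	}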
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 13/44] cpuidle: Fix ct_idle_*() usage
The whole disable-RCU, enable-IRQS dance is very intricate since changing IRQ state is traced, which depends on RCU. Add two helpers for the cpuidle case that mirror the entry code. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/arm/mach-imx/cpuidle-imx6q.c | 4 +-- arch/arm/mach-imx/cpuidle-imx6sx.c | 4 +-- arch/arm/mach-omap2/cpuidle34xx.c | 4 +-- arch/arm/mach-omap2/cpuidle44xx.c | 8 +++--- drivers/acpi/processor_idle.c | 8 ++++-- drivers/cpuidle/cpuidle-big_little.c | 4 +-- drivers/cpuidle/cpuidle-mvebu-v7.c | 4 +-- drivers/cpuidle/cpuidle-psci.c | 4 +-- drivers/cpuidle/cpuidle-riscv-sbi.c | 4 +-- drivers/cpuidle/cpuidle-tegra.c | 8 +++--- drivers/cpuidle/cpuidle.c | 11 ++++---- include/linux/cpuidle.h | 38 ++++++++++++++++++++++++++--- kernel/sched/idle.c | 45 ++++++++++------------------------- kernel/time/tick-broadcast.c | 6 +++- 14 files changed, 86 insertions(+), 66 deletions(-) --- a/arch/arm/mach-imx/cpuidle-imx6q.c +++ b/arch/arm/mach-imx/cpuidle-imx6q.c @@ -25,9 +25,9 @@ static int imx6q_enter_wait(struct cpuid imx6_set_lpm(WAIT_UNCLOCKED); raw_spin_unlock(&cpuidle_lock); - ct_idle_enter(); + ct_cpuidle_enter(); cpu_do_idle(); - ct_idle_exit(); + ct_cpuidle_exit(); raw_spin_lock(&cpuidle_lock); if (num_idle_cpus-- == num_online_cpus()) --- a/arch/arm/mach-imx/cpuidle-imx6sx.c +++ b/arch/arm/mach-imx/cpuidle-imx6sx.c @@ -47,9 +47,9 @@ static int imx6sx_enter_wait(struct cpui cpu_pm_enter(); cpu_cluster_pm_enter(); - ct_idle_enter(); + ct_cpuidle_enter(); cpu_suspend(0, imx6sx_idle_finish); - ct_idle_exit(); + ct_cpuidle_exit(); cpu_cluster_pm_exit(); cpu_pm_exit(); --- a/arch/arm/mach-omap2/cpuidle34xx.c +++ b/arch/arm/mach-omap2/cpuidle34xx.c @@ -133,9 +133,9 @@ static int omap3_enter_idle(struct cpuid } /* Execute ARM wfi */ - ct_idle_enter(); + ct_cpuidle_enter(); omap_sram_idle(); - ct_idle_exit(); + ct_cpuidle_exit(); /* * Call idle CPU PM enter notifier chain to restore --- a/arch/arm/mach-omap2/cpuidle44xx.c +++ b/arch/arm/mach-omap2/cpuidle44xx.c @@ -105,9 +105,9 @@ static int omap_enter_idle_smp(struct cp } raw_spin_unlock_irqrestore(&mpu_lock, flag); - ct_idle_enter(); + ct_cpuidle_enter(); omap4_enter_lowpower(dev->cpu, cx->cpu_state); - ct_idle_exit(); + ct_cpuidle_exit(); raw_spin_lock_irqsave(&mpu_lock, flag); if (cx->mpu_state_vote == num_online_cpus()) @@ -186,10 +186,10 @@ static int omap_enter_idle_coupled(struc } } - ct_idle_enter(); + ct_cpuidle_enter(); omap4_enter_lowpower(dev->cpu, cx->cpu_state); cpu_done[dev->cpu] = true; - ct_idle_exit(); + ct_cpuidle_exit(); /* Wakeup CPU1 only if it is not offlined */ if (dev->cpu == 0 && cpumask_test_cpu(1, cpu_online_mask)) { --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -627,6 +627,8 @@ static int __cpuidle acpi_idle_enter_bm( */ bool dis_bm = pr->flags.bm_control; + instrumentation_begin(); + /* If we can skip BM, demote to a safe state. 
*/ if (!cx->bm_sts_skip && acpi_idle_bm_check()) { dis_bm = false; @@ -648,11 +650,11 @@ static int __cpuidle acpi_idle_enter_bm( raw_spin_unlock(&c3_lock); } - ct_idle_enter(); + ct_cpuidle_enter(); acpi_idle_do_entry(cx); - ct_idle_exit(); + ct_cpuidle_exit(); /* Re-enable bus master arbitration */ if (dis_bm) { @@ -662,6 +664,8 @@ static int __cpuidle acpi_idle_enter_bm( raw_spin_unlock(&c3_lock); } + instrumentation_end(); + return index; } --- a/drivers/cpuidle/cpuidle-big_little.c +++ b/drivers/cpuidle/cpuidle-big_little.c @@ -126,13 +126,13 @@ static int bl_enter_powerdown(struct cpu struct cpuidle_driver *drv, int idx) { cpu_pm_enter(); - ct_idle_enter(); + ct_cpuidle_enter(); cpu_suspend(0, bl_powerdown_finisher); /* signals the MCPM core that CPU is out of low power state */ mcpm_cpu_powered_up(); - ct_idle_exit(); + ct_cpuidle_exit(); cpu_pm_exit(); --- a/drivers/cpuidle/cpuidle-mvebu-v7.c +++ b/drivers/cpuidle/cpuidle-mvebu-v7.c @@ -36,9 +36,9 @@ static int mvebu_v7_enter_idle(struct cp if (drv->states[index].flags & MVEBU_V7_FLAG_DEEP_IDLE) deepidle = true; - ct_idle_enter(); + ct_cpuidle_enter(); ret = mvebu_v7_cpu_suspend(deepidle); - ct_idle_exit(); + ct_cpuidle_exit(); cpu_pm_exit(); --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -74,7 +74,7 @@ static int __psci_enter_domain_idle_stat else pm_runtime_put_sync_suspend(pd_dev); - ct_idle_enter(); + ct_cpuidle_enter(); state = psci_get_domain_state(); if (!state) @@ -82,7 +82,7 @@ static int __psci_enter_domain_idle_stat ret = psci_cpu_suspend_enter(state) ? -1 : idx; - ct_idle_exit(); + ct_cpuidle_exit(); if (s2idle) dev_pm_genpd_resume(pd_dev); --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -121,7 +121,7 @@ static int __sbi_enter_domain_idle_state else pm_runtime_put_sync_suspend(pd_dev); - ct_idle_enter(); + ct_cpuidle_enter(); if (sbi_is_domain_state_available()) state = sbi_get_domain_state(); @@ -130,7 +130,7 @@ static int __sbi_enter_domain_idle_state ret = sbi_suspend(state) ? 
-1 : idx; - ct_idle_exit(); + ct_cpuidle_exit(); if (s2idle) dev_pm_genpd_resume(pd_dev); --- a/drivers/cpuidle/cpuidle-tegra.c +++ b/drivers/cpuidle/cpuidle-tegra.c @@ -183,7 +183,7 @@ static int tegra_cpuidle_state_enter(str tegra_pm_set_cpu_in_lp2(); cpu_pm_enter(); - ct_idle_enter(); + ct_cpuidle_enter(); switch (index) { case TEGRA_C7: @@ -199,7 +199,7 @@ static int tegra_cpuidle_state_enter(str break; } - ct_idle_exit(); + ct_cpuidle_exit(); cpu_pm_exit(); tegra_pm_clear_cpu_in_lp2(); @@ -240,10 +240,10 @@ static int tegra_cpuidle_enter(struct cp if (index == TEGRA_C1) { if (do_rcu) - ct_idle_enter(); + ct_cpuidle_enter(); ret = arm_cpuidle_simple_enter(dev, drv, index); if (do_rcu) - ct_idle_exit(); + ct_cpuidle_exit(); } else ret = tegra_cpuidle_state_enter(dev, index, cpu); --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/sched/idle.h> #include <linux/notifier.h> #include <linux/pm_qos.h> #include <linux/cpu.h> @@ -152,12 +153,12 @@ static void enter_s2idle_proper(struct c */ stop_critical_timings(); if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) - ct_idle_enter(); + ct_cpuidle_enter(); target_state->enter_s2idle(dev, drv, index); if (WARN_ON_ONCE(!irqs_disabled())) - local_irq_disable(); + raw_local_irq_disable(); if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) - ct_idle_exit(); + ct_cpuidle_exit(); tick_unfreeze(); start_critical_timings(); @@ -235,14 +236,14 @@ int cpuidle_enter_state(struct cpuidle_d stop_critical_timings(); if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) - ct_idle_enter(); + ct_cpuidle_enter(); entered_state = target_state->enter(dev, drv, index); if (WARN_ONCE(!irqs_disabled(), "%ps leaked IRQ state", target_state->enter)) raw_local_irq_disable(); if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) - ct_idle_exit(); + ct_cpuidle_exit(); start_critical_timings(); sched_clock_idle_wakeup_event(); --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -14,6 +14,7 @@ #include <linux/percpu.h> #include <linux/list.h> #include <linux/hrtimer.h> +#include <linux/context_tracking.h> #define CPUIDLE_STATE_MAX 10 #define CPUIDLE_NAME_LEN 16 @@ -115,6 +116,35 @@ struct cpuidle_device { DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev); +static __always_inline void ct_cpuidle_enter(void) +{ + lockdep_assert_irqs_disabled(); + /* + * Idle is allowed to (temporary) enable IRQs. It + * will return with IRQs disabled. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * ct_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + ct_idle_enter(); + lockdep_hardirqs_on(_THIS_IP_); +} + +static __always_inline void ct_cpuidle_exit(void) +{ + /* + * Carefully undo the above. 
+ */ + lockdep_hardirqs_off(_THIS_IP_); + ct_idle_exit(); + instrumentation_begin(); +} + /**************************** * CPUIDLE DRIVER INTERFACE * ****************************/ @@ -282,18 +312,18 @@ extern s64 cpuidle_governor_latency_req( int __ret = 0; \ \ if (!idx) { \ - ct_idle_enter(); \ + ct_cpuidle_enter(); \ cpu_do_idle(); \ - ct_idle_exit(); \ + ct_cpuidle_exit(); \ return idx; \ } \ \ if (!is_retention) \ __ret = cpu_pm_enter(); \ if (!__ret) { \ - ct_idle_enter(); \ + ct_cpuidle_enter(); \ __ret = low_level_idle_enter(state); \ - ct_idle_exit(); \ + ct_cpuidle_exit(); \ if (!is_retention) \ cpu_pm_exit(); \ } \ --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -51,18 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { + instrumentation_begin(); trace_cpu_idle(0, smp_processor_id()); stop_critical_timings(); - ct_idle_enter(); - local_irq_enable(); + ct_cpuidle_enter(); + raw_local_irq_enable(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + raw_local_irq_disable(); - ct_idle_exit(); + ct_cpuidle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + local_irq_enable(); + instrumentation_end(); return 1; } @@ -85,44 +89,21 @@ void __weak arch_cpu_idle(void) */ void __cpuidle default_idle_call(void) { - if (current_clr_polling_and_test()) { - local_irq_enable(); - } else { - + instrumentation_begin(); + if (!current_clr_polling_and_test()) { trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); - /* - * arch_cpu_idle() is supposed to enable IRQs, however - * we can't do that because of RCU and tracing. - * - * Trace IRQs enable here, then switch off RCU, and have - * arch_cpu_idle() use raw_local_irq_enable(). Note that - * ct_idle_enter() relies on lockdep IRQ state, so switch that - * last -- this is very similar to the entry code. - */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_idle_enter(); - lockdep_hardirqs_on(_THIS_IP_); - + ct_cpuidle_enter(); arch_cpu_idle(); - - /* - * OK, so IRQs are enabled here, but RCU needs them disabled to - * turn itself back on.. funny thing is that disabling IRQs - * will cause tracing, which needs RCU. Jump through hoops to - * make it 'work'. - */ raw_local_irq_disable(); - lockdep_hardirqs_off(_THIS_IP_); - ct_idle_exit(); - lockdep_hardirqs_on(_THIS_IP_); - raw_local_irq_enable(); + ct_cpuidle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } + local_irq_enable(); + instrumentation_end(); } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -622,9 +622,13 @@ struct cpumask *tick_get_broadcast_onesh * to avoid a deep idle transition as we are about to get the * broadcast IPI right away. */ -int tick_check_broadcast_expired(void) +noinstr int tick_check_broadcast_expired(void) { +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask)); +#else return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +#endif } /*
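The net shape of a converted driver, sketched with a hypothetical suspend finisher: everything that may be traced (notifiers, runtime PM) stays on the instrumented side of the new helpers:

	static int my_idle_finish(unsigned long arg);	/* hypothetical */

	static int my_enter(struct cpuidle_device *dev,
			    struct cpuidle_driver *drv, int idx)
	{
		cpu_pm_enter();		/* RCU still watching: notifiers may trace */

		ct_cpuidle_enter();	/* lockdep IRQs-on, RCU off: noinstr-only */
		cpu_suspend(0, my_idle_finish);
		ct_cpuidle_exit();	/* RCU back on, instrumentation legal again */

		cpu_pm_exit();
		return idx;
	}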
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 14/44] cpuidle, cpu_pm: Remove RCU fiddling from cpu_pm_{enter, exit}()
All callers should still have RCU enabled. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Acked-by: Mark Rutland <mark.rutland at arm.com> --- kernel/cpu_pm.c | 9 --------- 1 file changed, 9 deletions(-) --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_eve { int ret; - /* - * This introduces a RCU read critical section, which could be - * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know - * this. - */ - ct_irq_enter_irqson(); rcu_read_lock(); ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); rcu_read_unlock(); - ct_irq_exit_irqson(); return notifier_to_errno(ret); } @@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu unsigned long flags; int ret; - ct_irq_enter_irqson(); raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); - ct_irq_exit_irqson(); return notifier_to_errno(ret); }
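Why this is now safe, in one picture: after the driver conversions above, cpu_pm_enter()/cpu_pm_exit() always run on the RCU-watching side of the ct_cpuidle_enter()/ct_cpuidle_exit() window, so a plain rcu_read_lock() in the notifier chain suffices:

	cpu_pm_enter();		/* RCU watching: notifier chain runs here */
	ct_cpuidle_enter();	/* RCU stops watching only now */
	/* low-level idle */
	ct_cpuidle_exit();
	cpu_pm_exit();		/* RCU watching again */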
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 15/44] acpi_idle: Remove tracing
All the idle routines are called with RCU disabled, as such there must not be any tracing inside. While there, clean up the io-port idle thing.

Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
---
 drivers/acpi/processor_idle.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -108,8 +108,8 @@ static const struct dmi_system_id proces
 static void __cpuidle acpi_safe_halt(void)
 {
 	if (!tif_need_resched()) {
-		safe_halt();
-		local_irq_disable();
+		raw_safe_halt();
+		raw_local_irq_disable();
 	}
 }
@@ -524,16 +524,21 @@ static int acpi_idle_bm_check(void)
 	return bm_status;
 }
 
-static void wait_for_freeze(void)
+static __cpuidle void io_idle(unsigned long addr)
 {
+	/* IO port based C-state */
+	inb(addr);
+
 #ifdef CONFIG_X86
 	/* No delay is needed if we are in guest */
 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
 		return;
 #endif
-	/* Dummy wait op - must do something useless after P_LVL2 read
-	   because chipsets cannot guarantee that STPCLK# signal
-	   gets asserted in time to freeze execution properly. */
+	/*
+	 * Dummy wait op - must do something useless after P_LVL2 read
+	 * because chipsets cannot guarantee that STPCLK# signal
+	 * gets asserted in time to freeze execution properly.
+	 */
 	inl(acpi_gbl_FADT.xpm_timer_block.address);
 }
@@ -553,9 +558,7 @@ static void __cpuidle acpi_idle_do_entry
 	} else if (cx->entry_method == ACPI_CSTATE_HALT) {
 		acpi_safe_halt();
 	} else {
-		/* IO port based C-state */
-		inb(cx->address);
-		wait_for_freeze();
+		io_idle(cx->address);
 	}
 
 	perf_lopwr_cb(false);
@@ -577,8 +580,7 @@ static int acpi_idle_play_dead(struct cp
 	if (cx->entry_method == ACPI_CSTATE_HALT)
 		safe_halt();
 	else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) {
-		inb(cx->address);
-		wait_for_freeze();
+		io_idle(cx->address);
 	} else
 		return -ENODEV;
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 16/44] cpuidle: Annotate poll_idle()
The __cpuidle functions will become a noinstr class, as such they need explicit annotations. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki at intel.com> --- drivers/cpuidle/poll_state.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -13,7 +13,10 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - u64 time_start = local_clock(); + u64 time_start; + + instrumentation_begin(); + time_start = local_clock(); dev->poll_time_limit = false; @@ -39,6 +42,7 @@ static int __cpuidle poll_idle(struct cp raw_local_irq_disable(); current_clr_polling(); + instrumentation_end(); return index; }
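The rule the annotation encodes, sketched on a hypothetical (x86-flavoured) function: inside a noinstr-class (__cpuidle) function, any call into instrumentable code must sit inside an instrumentation_begin()/instrumentation_end() pair, which objtool (next patch) will verify:

	static void __cpuidle my_idle(void)
	{
		u64 t;

		instrumentation_begin();
		t = local_clock();	/* instrumentable helper: OK here */
		instrumentation_end();

		raw_safe_halt();	/* noinstr-safe */
		(void)t;
	}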
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 17/44] objtool/idle: Validate __cpuidle code as noinstr
Idle code is very like entry code in that RCU isn't available. As such, add a little validation. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Acked-by: Geert Uytterhoeven <geert at linux-m68k.org> --- arch/alpha/kernel/vmlinux.lds.S | 1 - arch/arc/kernel/vmlinux.lds.S | 1 - arch/arm/include/asm/vmlinux.lds.h | 1 - arch/arm64/kernel/vmlinux.lds.S | 1 - arch/csky/kernel/vmlinux.lds.S | 1 - arch/hexagon/kernel/vmlinux.lds.S | 1 - arch/ia64/kernel/vmlinux.lds.S | 1 - arch/loongarch/kernel/vmlinux.lds.S | 1 - arch/m68k/kernel/vmlinux-nommu.lds | 1 - arch/m68k/kernel/vmlinux-std.lds | 1 - arch/m68k/kernel/vmlinux-sun3.lds | 1 - arch/microblaze/kernel/vmlinux.lds.S | 1 - arch/mips/kernel/vmlinux.lds.S | 1 - arch/nios2/kernel/vmlinux.lds.S | 1 - arch/openrisc/kernel/vmlinux.lds.S | 1 - arch/parisc/kernel/vmlinux.lds.S | 1 - arch/powerpc/kernel/vmlinux.lds.S | 1 - arch/riscv/kernel/vmlinux-xip.lds.S | 1 - arch/riscv/kernel/vmlinux.lds.S | 1 - arch/s390/kernel/vmlinux.lds.S | 1 - arch/sh/kernel/vmlinux.lds.S | 1 - arch/sparc/kernel/vmlinux.lds.S | 1 - arch/um/kernel/dyn.lds.S | 1 - arch/um/kernel/uml.lds.S | 1 - arch/x86/include/asm/irqflags.h | 11 ++++------- arch/x86/include/asm/mwait.h | 2 +- arch/x86/kernel/vmlinux.lds.S | 1 - arch/xtensa/kernel/vmlinux.lds.S | 1 - include/asm-generic/vmlinux.lds.h | 9 +++------ include/linux/compiler_types.h | 8 ++++++-- include/linux/cpu.h | 3 --- tools/objtool/check.c | 13 +++++++++++++ 32 files changed, 27 insertions(+), 45 deletions(-) --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -27,7 +27,6 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) --- a/arch/arc/kernel/vmlinux.lds.S +++ b/arch/arc/kernel/vmlinux.lds.S @@ -85,7 +85,6 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -96,7 +96,6 @@ SOFTIRQENTRY_TEXT \ TEXT_TEXT \ SCHED_TEXT \ - CPUIDLE_TEXT \ LOCK_TEXT \ KPROBES_TEXT \ ARM_STUBS_TEXT \ --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -163,7 +163,6 @@ SECTIONS ENTRY_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT HYPERVISOR_TEXT --- a/arch/csky/kernel/vmlinux.lds.S +++ b/arch/csky/kernel/vmlinux.lds.S @@ -38,7 +38,6 @@ SECTIONS SOFTIRQENTRY_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) --- a/arch/hexagon/kernel/vmlinux.lds.S +++ b/arch/hexagon/kernel/vmlinux.lds.S @@ -41,7 +41,6 @@ SECTIONS IRQENTRY_TEXT SOFTIRQENTRY_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -51,7 +51,6 @@ SECTIONS { __end_ivt_text = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/loongarch/kernel/vmlinux.lds.S +++ b/arch/loongarch/kernel/vmlinux.lds.S @@ -41,7 +41,6 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/m68k/kernel/vmlinux-nommu.lds +++ b/arch/m68k/kernel/vmlinux-nommu.lds @@ -48,7 +48,6 @@ SECTIONS { IRQENTRY_TEXT SOFTIRQENTRY_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT *(.fixup) . 
= ALIGN(16); --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -19,7 +19,6 @@ SECTIONS IRQENTRY_TEXT SOFTIRQENTRY_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -19,7 +19,6 @@ SECTIONS IRQENTRY_TEXT SOFTIRQENTRY_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -36,7 +36,6 @@ SECTIONS { EXIT_TEXT EXIT_CALL SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -61,7 +61,6 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/nios2/kernel/vmlinux.lds.S +++ b/arch/nios2/kernel/vmlinux.lds.S @@ -24,7 +24,6 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -52,7 +52,6 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -86,7 +86,6 @@ SECTIONS TEXT_TEXT LOCK_TEXT SCHED_TEXT - CPUIDLE_TEXT KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -107,7 +107,6 @@ SECTIONS #endif NOINSTR_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/riscv/kernel/vmlinux-xip.lds.S +++ b/arch/riscv/kernel/vmlinux-xip.lds.S @@ -39,7 +39,6 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ENTRY_TEXT --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -42,7 +42,6 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ENTRY_TEXT --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -42,7 +42,6 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -29,7 +29,6 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -50,7 +50,6 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -74,7 +74,6 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -35,7 +35,6 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -8,9 +8,6 @@ #include <asm/nospec-branch.h> -/* Provide __cpuidle; we can't safely include <linux/cpu.h> */ -#define __cpuidle __section(".cpuidle.text") - /* * Interrupt control: */ @@ -45,13 +42,13 @@ static __always_inline void native_irq_e asm volatile("sti": : :"memory"); } -static inline __cpuidle void native_safe_halt(void) +static __always_inline void native_safe_halt(void) { mds_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } -static inline __cpuidle void native_halt(void) +static __always_inline void native_halt(void) { mds_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); @@ 
-84,7 +81,7 @@ static __always_inline void arch_local_i * Used in the idle loop; sti takes one instruction cycle * to complete: */ -static inline __cpuidle void arch_safe_halt(void) +static __always_inline void arch_safe_halt(void) { native_safe_halt(); } @@ -93,7 +90,7 @@ static inline __cpuidle void arch_safe_h * Used when interrupts are already enabled or to * shutdown the processor: */ -static inline __cpuidle void halt(void) +static __always_inline void halt(void) { native_halt(); } --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -104,7 +104,7 @@ static inline void __sti_mwait(unsigned * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) { if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) { --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -129,7 +129,6 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ALIGN_ENTRY_TEXT_BEGIN --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -125,7 +125,6 @@ SECTIONS ENTRY_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT *(.fixup) } --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -559,6 +559,9 @@ ALIGN_FUNCTION(); \ __noinstr_text_start = .; \ *(.noinstr.text) \ + __cpuidle_text_start = .; \ + *(.cpuidle.text) \ + __cpuidle_text_end = .; \ __noinstr_text_end = .; /* @@ -600,12 +603,6 @@ *(.spinlock.text) \ __lock_text_end = .; -#define CPUIDLE_TEXT \ - ALIGN_FUNCTION(); \ - __cpuidle_text_start = .; \ - *(.cpuidle.text) \ - __cpuidle_text_end = .; - #define KPROBES_TEXT \ ALIGN_FUNCTION(); \ __kprobes_text_start = .; \ --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -227,10 +227,14 @@ struct ftrace_likely_data { #endif /* Section for code which can't be instrumented at all */ -#define noinstr \ - noinline notrace __attribute((__section__(".noinstr.text"))) \ +#define __noinstr_section(section) \ + noinline notrace __attribute((__section__(section))) \ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage +#define noinstr __noinstr_section(".noinstr.text") + +#define __cpuidle __noinstr_section(".cpuidle.text") + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -176,9 +176,6 @@ void __noreturn cpu_startup_entry(enum c void cpu_idle_poll_ctrl(bool enable); -/* Attach to any functions which should be considered cpuidle. */ -#define __cpuidle __section(".cpuidle.text") - bool cpu_in_idle(unsigned long pc); void arch_cpu_idle(void); --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -377,6 +377,7 @@ static int decode_instructions(struct ob if (!strcmp(sec->name, ".noinstr.text") || !strcmp(sec->name, ".entry.text") || + !strcmp(sec->name, ".cpuidle.text") || !strncmp(sec->name, ".text.__x86.", 12)) sec->noinstr = true; @@ -3187,6 +3188,12 @@ static inline bool noinstr_call_dest(str return true; /* + * If the symbol is a static_call trampoline, we can't tell. + */ + if (func->static_call_tramp) + return true; + + /* * The __ubsan_handle_*() calls are like WARN(), they only happen when * something 'BAD' happened. At the risk of taking the machine down, * let them proceed to get the message out. 
@@ -3932,6 +3939,12 @@ static int validate_noinstr_sections(str if (sec) { warnings += validate_section(file, sec); warnings += validate_unwind_hints(file, sec); + } + + sec = find_section_by_name(file->elf, ".cpuidle.text"); + if (sec) { + warnings += validate_section(file, sec); + warnings += validate_unwind_hints(file, sec); } return warnings;
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 18/44] cpuidle, intel_idle: Fix CPUIDLE_FLAG_IRQ_ENABLE *again*
vmlinux.o: warning: objtool: intel_idle_irq+0x10c: call to trace_hardirqs_off() leaves .noinstr.text section As per commit 32d4fd5751ea ("cpuidle,intel_idle: Fix CPUIDLE_FLAG_IRQ_ENABLE"): "must not have tracing in idle functions" Clearly people can't read and tinker along until the splat disappears. This straight up reverts commit d295ad34f236 ("intel_idle: Fix false positive RCU splats due to incorrect hardirqs state"). It doesn't re-introduce the problem because preceding patches fixed it properly. Fixes: d295ad34f236 ("intel_idle: Fix false positive RCU splats due to incorrect hardirqs state") Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- drivers/idle/intel_idle.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -168,13 +168,7 @@ static __cpuidle int intel_idle_irq(stru raw_local_irq_enable(); ret = __intel_idle(dev, drv, index); - - /* - * The lockdep hardirqs state may be changed to 'on' with timer - * tick interrupt followed by __do_softirq(). Use local_irq_disable() - * to keep the hardirqs state correct. - */ - local_irq_disable(); + raw_local_irq_disable(); return ret; }
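To see why the revert lands on raw_local_irq_disable(): the plain local_irq_*() helpers invoke the lockdep/tracing hooks, which is exactly what may not happen from this __cpuidle context. A stubbed, userspace-compilable sketch -- not kernel code, the names mirror the kernel API but the bodies are illustrative stand-ins:

static int hw_irqs_on = 1;

static void arch_irq_disable(void)
{
	hw_irqs_on = 0;			/* think: cli */
}

static void trace_hardirqs_off(void)
{
	/* tracepoint hook; calling this from noinstr/cpuidle code is the bug */
}

static void raw_local_irq_disable(void)
{
	arch_irq_disable();		/* flips the flag, calls nothing else */
}

static void local_irq_disable(void)
{
	raw_local_irq_disable();
	trace_hardirqs_off();		/* the call objtool complains about */
}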
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 19/44] cpuidle,intel_idle: Fix CPUIDLE_FLAG_INIT_XSTATE
vmlinux.o: warning: objtool: intel_idle_s2idle+0xd5: call to fpu_idle_fpregs() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_xstate+0x11: call to fpu_idle_fpregs() leaves .noinstr.text section vmlinux.o: warning: objtool: fpu_idle_fpregs+0x9: call to xfeatures_in_use() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/include/asm/fpu/xcr.h | 4 ++-- arch/x86/include/asm/special_insns.h | 2 +- arch/x86/kernel/fpu/core.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -5,7 +5,7 @@ #define XCR_XFEATURE_ENABLED_MASK 0x00000000 #define XCR_XFEATURE_IN_USE_MASK 0x00000001 -static inline u64 xgetbv(u32 index) +static __always_inline u64 xgetbv(u32 index) { u32 eax, edx; @@ -27,7 +27,7 @@ static inline void xsetbv(u32 index, u64 * * Callers should check X86_FEATURE_XGETBV1. */ -static inline u64 xfeatures_in_use(void) +static __always_inline u64 xfeatures_in_use(void) { return xgetbv(XCR_XFEATURE_IN_USE_MASK); } --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -295,7 +295,7 @@ static inline int enqcmds(void __iomem * return 0; } -static inline void tile_release(void) +static __always_inline void tile_release(void) { /* * Instruction opcode for TILERELEASE; supported in binutils --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -856,12 +856,12 @@ int fpu__exception_code(struct fpu *fpu, * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ -void fpu_idle_fpregs(void) +noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); - fpregs_deactivate(¤t->thread.fpu); + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } }
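The recurring pattern in this and the following patches: anything reachable from a noinstr function must itself be noinstr, or be forcibly inlined so that no call leaves the section. A reduced sketch of the rule objtool enforces (the real kernel macros additionally add notrace and the sanitizer opt-outs):

#define noinstr __attribute__((noinline, section(".noinstr.text")))
#define __always_inline inline __attribute__((__always_inline__))

static __always_inline unsigned long helper_inlined(void)
{
	return 0;	/* inlined: no call emitted, code stays in .noinstr.text */
}

static unsigned long helper_out_of_line(void)
{
	return 0;	/* emitted out of line, lands in .text */
}

noinstr unsigned long some_idle_path(void)
{
	/* calling helper_out_of_line() here would trigger:
	 *   "call to helper_out_of_line() leaves .noinstr.text section" */
	return helper_inlined();
}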
Peter Zijlstra
2022-Sep-19 09:59 UTC
[PATCH v2 20/44] cpuidle,intel_idle: Fix CPUIDLE_FLAG_IBRS
vmlinux.o: warning: objtool: intel_idle_ibrs+0x17: call to spec_ctrl_current() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_ibrs+0x27: call to wrmsrl.constprop.0() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/kernel/cpu/bugs.c | 2 +- drivers/idle/intel_idle.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -79,7 +79,7 @@ void write_spec_ctrl_current(u64 val, bo wrmsrl(MSR_IA32_SPEC_CTRL, val); } -u64 spec_ctrl_current(void) +noinstr u64 spec_ctrl_current(void) { return this_cpu_read(x86_spec_ctrl_current); } --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -181,12 +181,12 @@ static __cpuidle int intel_idle_ibrs(str int ret; if (smt_active) - wrmsrl(MSR_IA32_SPEC_CTRL, 0); + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); ret = __intel_idle(dev, drv, index); if (smt_active) - wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl); + native_wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl); return ret; }
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 21/44] arch/idle: Change arch_cpu_idle() IRQ behaviour
Current arch_cpu_idle() is called with IRQs disabled, but will return with IRQs enabled. However, the very first thing the generic code does after calling arch_cpu_idle() is raw_local_irq_disable(). This means that architectures that can idle with IRQs disabled end up doing a pointless 'enable-disable' dance. Therefore, push this IRQ disabling into the idle function, meaning that those architectures can avoid the pointless IRQ state flipping. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reviewed-by: Gautham R. Shenoy <gautham.shenoy at amd.com> Acked-by: Mark Rutland <mark.rutland at arm.com> [arm64] Acked-by: Rafael J. Wysocki <rafael.j.wysocki at intel.com> --- arch/alpha/kernel/process.c | 1 - arch/arc/kernel/process.c | 3 +++ arch/arm/kernel/process.c | 1 - arch/arm/mach-gemini/board-dt.c | 3 ++- arch/arm64/kernel/idle.c | 1 - arch/csky/kernel/process.c | 1 - arch/csky/kernel/smp.c | 2 +- arch/hexagon/kernel/process.c | 1 - arch/ia64/kernel/process.c | 1 + arch/loongarch/kernel/idle.c | 1 + arch/microblaze/kernel/process.c | 1 - arch/mips/kernel/idle.c | 8 +++----- arch/nios2/kernel/process.c | 1 - arch/openrisc/kernel/process.c | 1 + arch/parisc/kernel/process.c | 2 -- arch/powerpc/kernel/idle.c | 5 ++--- arch/riscv/kernel/process.c | 1 - arch/s390/kernel/idle.c | 1 - arch/sh/kernel/idle.c | 1 + arch/sparc/kernel/leon_pmc.c | 4 ++++ arch/sparc/kernel/process_32.c | 1 - arch/sparc/kernel/process_64.c | 3 ++- arch/um/kernel/process.c | 1 - arch/x86/coco/tdx/tdx.c | 3 +++ arch/x86/kernel/process.c | 15 ++++----------- arch/xtensa/kernel/process.c | 1 + kernel/sched/idle.c | 2 -- 27 files changed, 29 insertions(+), 37 deletions(-) --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -57,7 +57,6 @@ EXPORT_SYMBOL(pm_power_off); void arch_cpu_idle(void) { wtint(0); - raw_local_irq_enable(); } void arch_cpu_idle_dead(void) --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -114,6 +114,8 @@ void arch_cpu_idle(void) "sleep %0 \n" : :"I"(arg)); /* can't be "r" has to be embedded const */ + + raw_local_irq_disable(); } #else /* ARC700 */ @@ -122,6 +124,7 @@ void arch_cpu_idle(void) { /* sleep, but enable both set E1/E2 (levels of interrupts) before committing */ __asm__ __volatile__("sleep 0x3 \n"); + raw_local_irq_disable(); } #endif --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -78,7 +78,6 @@ void arch_cpu_idle(void) arm_pm_idle(); else cpu_do_idle(); - raw_local_irq_enable(); } void arch_cpu_idle_prepare(void) --- a/arch/arm/mach-gemini/board-dt.c +++ b/arch/arm/mach-gemini/board-dt.c @@ -42,8 +42,9 @@ static void gemini_idle(void) */ /* FIXME: Enabling interrupts here is racy! 
*/ - local_irq_enable(); + raw_local_irq_enable(); cpu_do_idle(); + raw_local_irq_disable(); } static void __init gemini_init_machine(void) --- a/arch/arm64/kernel/idle.c +++ b/arch/arm64/kernel/idle.c @@ -42,5 +42,4 @@ void noinstr arch_cpu_idle(void) * tricks */ cpu_do_idle(); - raw_local_irq_enable(); } --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -100,6 +100,5 @@ void arch_cpu_idle(void) #ifdef CONFIG_CPU_PM_STOP asm volatile("stop\n"); #endif - raw_local_irq_enable(); } #endif --- a/arch/csky/kernel/smp.c +++ b/arch/csky/kernel/smp.c @@ -309,7 +309,7 @@ void arch_cpu_idle_dead(void) while (!secondary_stack) arch_cpu_idle(); - local_irq_disable(); + raw_local_irq_disable(); asm volatile( "mov sp, %0\n" --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -44,7 +44,6 @@ void arch_cpu_idle(void) { __vmwait(); /* interrupts wake us up, but irqs are still disabled */ - raw_local_irq_enable(); } /* --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -242,6 +242,7 @@ void arch_cpu_idle(void) (*mark_idle)(1); raw_safe_halt(); + raw_local_irq_disable(); if (mark_idle) (*mark_idle)(0); --- a/arch/loongarch/kernel/idle.c +++ b/arch/loongarch/kernel/idle.c @@ -13,4 +13,5 @@ void __cpuidle arch_cpu_idle(void) { raw_local_irq_enable(); __arch_cpu_idle(); /* idle instruction needs irq enabled */ + raw_local_irq_disable(); } --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -140,5 +140,4 @@ int dump_fpu(struct pt_regs *regs, elf_f void arch_cpu_idle(void) { - raw_local_irq_enable(); } --- a/arch/mips/kernel/idle.c +++ b/arch/mips/kernel/idle.c @@ -33,13 +33,13 @@ static void __cpuidle r3081_wait(void) { unsigned long cfg = read_c0_conf(); write_c0_conf(cfg | R30XX_CONF_HALT); - raw_local_irq_enable(); } void __cpuidle r4k_wait(void) { raw_local_irq_enable(); __r4k_wait(); + raw_local_irq_disable(); } /* @@ -57,7 +57,6 @@ void __cpuidle r4k_wait_irqoff(void) " .set arch=r4000 \n" " wait \n" " .set pop \n"); - raw_local_irq_enable(); } /* @@ -77,7 +76,6 @@ static void __cpuidle rm7k_wait_irqoff(v " wait \n" " mtc0 $1, $12 # stalls until W stage \n" " .set pop \n"); - raw_local_irq_enable(); } /* @@ -103,6 +101,8 @@ static void __cpuidle au1k_wait(void) " nop \n" " .set pop \n" : : "r" (au1k_wait), "r" (c0status)); + + raw_local_irq_disable(); } static int __initdata nowait; @@ -245,8 +245,6 @@ void arch_cpu_idle(void) { if (cpu_wait) cpu_wait(); - else - raw_local_irq_enable(); } #ifdef CONFIG_CPU_IDLE --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -33,7 +33,6 @@ EXPORT_SYMBOL(pm_power_off); void arch_cpu_idle(void) { - raw_local_irq_enable(); } /* --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -102,6 +102,7 @@ void arch_cpu_idle(void) raw_local_irq_enable(); if (mfspr(SPR_UPR) & SPR_UPR_PMP) mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME); + raw_local_irq_disable(); } void (*pm_power_off)(void) = NULL; --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -187,8 +187,6 @@ void arch_cpu_idle_dead(void) void __cpuidle arch_cpu_idle(void) { - raw_local_irq_enable(); - /* nop on real hardware, qemu will idle sleep. */ asm volatile("or %%r10,%%r10,%%r10\n":::); } --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -51,10 +51,9 @@ void arch_cpu_idle(void) * Some power_save functions return with * interrupts enabled, some don't. 
*/ - if (irqs_disabled()) - raw_local_irq_enable(); + if (!irqs_disabled()) + raw_local_irq_disable(); } else { - raw_local_irq_enable(); /* * Go into low thread priority and possibly * low power mode. --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -39,7 +39,6 @@ extern asmlinkage void ret_from_kernel_t void arch_cpu_idle(void) { cpu_do_idle(); - raw_local_irq_enable(); } void __show_regs(struct pt_regs *regs) --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -66,7 +66,6 @@ void arch_cpu_idle(void) idle->idle_count++; account_idle_time(cputime_to_nsecs(idle_time)); raw_write_seqcount_end(&idle->seqcount); - raw_local_irq_enable(); } static ssize_t show_idle_count(struct device *dev, --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -25,6 +25,7 @@ void default_idle(void) raw_local_irq_enable(); /* Isn't this racy ? */ cpu_sleep(); + raw_local_irq_disable(); clear_bl_bit(); } --- a/arch/sparc/kernel/leon_pmc.c +++ b/arch/sparc/kernel/leon_pmc.c @@ -57,6 +57,8 @@ static void pmc_leon_idle_fixup(void) "lda [%0] %1, %%g0\n" : : "r"(address), "i"(ASI_LEON_BYPASS)); + + raw_local_irq_disable(); } /* @@ -70,6 +72,8 @@ static void pmc_leon_idle(void) /* For systems without power-down, this will be no-op */ __asm__ __volatile__ ("wr %g0, %asr19\n\t"); + + raw_local_irq_disable(); } /* Install LEON Power Down function */ --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -71,7 +71,6 @@ void arch_cpu_idle(void) { if (sparc_idle) (*sparc_idle)(); - raw_local_irq_enable(); } /* XXX cli/sti -> local_irq_xxx here, check this works once SMP is fixed. */ --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -59,7 +59,6 @@ void arch_cpu_idle(void) { if (tlb_type != hypervisor) { touch_nmi_watchdog(); - raw_local_irq_enable(); } else { unsigned long pstate; @@ -90,6 +89,8 @@ void arch_cpu_idle(void) "wrpr %0, %%g0, %%pstate" : "=&r" (pstate) : "i" (PSTATE_IE)); + + raw_local_irq_disable(); } } --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -217,7 +217,6 @@ void arch_cpu_idle(void) { cpu_tasks[current_thread_info()->cpu].pid = os_getpid(); um_idle_sleep(); - raw_local_irq_enable(); } int __cant_sleep(void) { --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -223,6 +223,9 @@ void __cpuidle tdx_safe_halt(void) */ if (__halt(irq_disabled, do_sti)) WARN_ONCE(1, "HLT instruction emulation failed\n"); + + /* XXX I can't make sense of what @do_sti actually does */ + raw_local_irq_disable(); } static int read_msr(struct pt_regs *regs, struct ve_info *ve) --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -701,6 +701,7 @@ EXPORT_SYMBOL(boot_option_idle_override) void __cpuidle default_idle(void) { raw_safe_halt(); + raw_local_irq_disable(); } #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); @@ -806,13 +807,7 @@ static void amd_e400_idle(void) default_idle(); - /* - * The switch back from broadcast mode needs to be called with - * interrupts disabled. 
- */ - raw_local_irq_disable(); tick_broadcast_exit(); - raw_local_irq_enable(); } /* @@ -870,12 +865,10 @@ static __cpuidle void mwait_idle(void) } __monitor((void *)¤t_thread_info()->flags, 0, 0); - if (!need_resched()) + if (!need_resched()) { __sti_mwait(0, 0); - else - raw_local_irq_enable(); - } else { - raw_local_irq_enable(); + raw_local_irq_disable(); + } } __current_clr_polling(); } --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -183,6 +183,7 @@ void coprocessor_flush_release_all(struc void arch_cpu_idle(void) { platform_idle(); + raw_local_irq_disable(); } /* --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -79,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - raw_local_irq_enable(); } /** @@ -96,7 +95,6 @@ void __cpuidle default_idle_call(void) ct_cpuidle_enter(); arch_cpu_idle(); - raw_local_irq_disable(); ct_cpuidle_exit(); start_critical_timings();
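The contract change in a nutshell -- a userspace-compilable sketch with printf stand-ins for the real primitives (illustrative only, not kernel code):

#include <stdio.h>

static void raw_local_irq_enable(void)  { puts("irqs on");  }
static void raw_local_irq_disable(void) { puts("irqs off"); }
static void wait_for_interrupt(void)    { puts("wfi");      }

/* Old contract: enter with IRQs disabled, return with them enabled,
 * forcing the generic loop to immediately disable them again. */
static void arch_cpu_idle_old(void)
{
	raw_local_irq_enable();
	wait_for_interrupt();
}

/* New contract: enter and return with IRQs disabled. Only archs whose
 * idle instruction needs IRQs on pay for the enable/disable pair. */
static void arch_cpu_idle_new(void)
{
	raw_local_irq_enable();
	wait_for_interrupt();
	raw_local_irq_disable();
}

int main(void)
{
	arch_cpu_idle_old();
	raw_local_irq_disable();	/* the pointless flip the patch removes */

	arch_cpu_idle_new();		/* no flip needed afterwards */
	return 0;
}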
Now that arch_cpu_idle() is expected to return with IRQs disabled, avoid the useless STI/CLI dance. Per the specs this is supposed to work, but nobody has yet relied upon this behaviour so broken implementations are possible. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/coco/tdx/tdcall.S | 13 ------------- arch/x86/coco/tdx/tdx.c | 23 ++++------------------- arch/x86/include/asm/shared/tdx.h | 1 - 3 files changed, 4 insertions(+), 33 deletions(-) --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -139,19 +139,6 @@ SYM_FUNC_START(__tdx_hypercall) movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx - /* - * For the idle loop STI needs to be called directly before the TDCALL - * that enters idle (EXIT_REASON_HLT case). STI instruction enables - * interrupts only one instruction later. If there is a window between - * STI and the instruction that emulates the HALT state, there is a - * chance for interrupts to happen in this window, which can delay the - * HLT operation indefinitely. Since this is the not the desired - * result, conditionally call STI before TDCALL. - */ - testq $TDX_HCALL_ISSUE_STI, %rsi - jz .Lskip_sti - sti -.Lskip_sti: tdcall /* --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -169,7 +169,7 @@ static int ve_instr_len(struct ve_info * } } -static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) +static u64 __cpuidle __halt(const bool irq_disabled) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, @@ -189,20 +189,14 @@ static u64 __cpuidle __halt(const bool i * can keep the vCPU in virtual HLT, even if an IRQ is * pending, without hanging/breaking the guest. */ - return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0); + return __tdx_hypercall(&args, 0); } static int handle_halt(struct ve_info *ve) { - /* - * Since non safe halt is mainly used in CPU offlining - * and the guest will always stay in the halt state, don't - * call the STI instruction (set do_sti as false). - */ const bool irq_disabled = irqs_disabled(); - const bool do_sti = false; - if (__halt(irq_disabled, do_sti)) + if (__halt(irq_disabled)) return -EIO; return ve_instr_len(ve); @@ -210,22 +204,13 @@ static int handle_halt(struct ve_info *v void __cpuidle tdx_safe_halt(void) { - /* - * For do_sti=true case, __tdx_hypercall() function enables - * interrupts using the STI instruction before the TDCALL. So - * set irq_disabled as false. - */ const bool irq_disabled = false; - const bool do_sti = true; /* * Use WARN_ONCE() to report the failure. */ - if (__halt(irq_disabled, do_sti)) + if (__halt(irq_disabled)) WARN_ONCE(1, "HLT instruction emulation failed\n"); - - /* XXX I can't make sense of what @do_sti actually does */ - raw_local_irq_disable(); } static int read_msr(struct pt_regs *regs, struct ve_info *ve) --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -8,7 +8,6 @@ #define TDX_HYPERCALL_STANDARD 0 #define TDX_HCALL_HAS_OUTPUT BIT(0) -#define TDX_HCALL_ISSUE_STI BIT(1) #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX "
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 23/44] arm,smp: Remove trace_.*_rcuidle() usage
None of these functions should ever be run with RCU disabled anymore. Specifically, do_handle_IPI() is only called from handle_IPI(), which explicitly does irq_enter()/irq_exit() and thereby ensures RCU is watching. The problem with smp_cross_call() was, per commit 7c64cc0531fa ("arm: Use _rcuidle for smp_cross_call() tracepoints"), that cpuidle_enter_state_coupled() already had RCU disabled, but that's long been fixed by commit 1098582a0f6c ("sched,idle,rcu: Push rcu_idle deeper into the idle path"). Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/arm/kernel/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -639,7 +639,7 @@ static void do_handle_IPI(int ipinr) unsigned int cpu = smp_processor_id(); if ((unsigned)ipinr < NR_IPI) - trace_ipi_entry_rcuidle(ipi_types[ipinr]); + trace_ipi_entry(ipi_types[ipinr]); switch (ipinr) { case IPI_WAKEUP: @@ -686,7 +686,7 @@ static void do_handle_IPI(int ipinr) } if ((unsigned)ipinr < NR_IPI) - trace_ipi_exit_rcuidle(ipi_types[ipinr]); + trace_ipi_exit(ipi_types[ipinr]); } /* Legacy version, should go away once all irqchips have been converted */ @@ -709,7 +709,7 @@ static irqreturn_t ipi_handler(int irq, static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) { - trace_ipi_raise_rcuidle(target, ipi_types[ipinr]); + trace_ipi_raise(target, ipi_types[ipinr]); __ipi_send_mask(ipi_desc[ipinr], target); }
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 24/44] arm64,smp: Remove trace_.*_rcuidle() usage
Ever since commit d3afc7f12987 ("arm64: Allow IPIs to be handled as normal interrupts") this function is called in regular IRQ context. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Acked-by: Mark Rutland <mark.rutland at arm.com> Acked-by: Marc Zyngier <maz at kernel.org> --- arch/arm64/kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -865,7 +865,7 @@ static void do_handle_IPI(int ipinr) unsigned int cpu = smp_processor_id(); if ((unsigned)ipinr < NR_IPI) - trace_ipi_entry_rcuidle(ipi_types[ipinr]); + trace_ipi_entry(ipi_types[ipinr]); switch (ipinr) { case IPI_RESCHEDULE: @@ -914,7 +914,7 @@ static void do_handle_IPI(int ipinr) } if ((unsigned)ipinr < NR_IPI) - trace_ipi_exit_rcuidle(ipi_types[ipinr]); + trace_ipi_exit(ipi_types[ipinr]); } static irqreturn_t ipi_handler(int irq, void *data)
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 25/44] printk: Remove trace_.*_rcuidle() usage
The problem, per commit fc98c3c8c9dc ("printk: use rcuidle console tracepoint"), was printk usage from the cpuidle path where RCU was already disabled. Per the patches earlier in this series, this is no longer the case. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reviewed-by: Sergey Senozhatsky <senozhatsky at chromium.org> Acked-by: Petr Mladek <pmladek at suse.com> --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2238,7 +2238,7 @@ static u16 printk_sprint(char *text, u16 } } - trace_console_rcuidle(text, text_len); + trace_console(text, text_len); return text_len; }
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 26/44] time/tick-broadcast: Remove RCU_NONIDLE usage
No callers left that have already disabled RCU. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Acked-by: Mark Rutland <mark.rutland at arm.com> --- kernel/time/tick-broadcast-hrtimer.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, * hrtimer callback function is currently running, then * hrtimer_start() cannot move it and the timer stays on the CPU on * which it is assigned at the moment. + */ + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. * - * As this can be called from idle code, the hrtimer_start() - * invocation has to be wrapped with RCU_NONIDLE() as - * hrtimer_start() can call into tracing. + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. */ - RCU_NONIDLE( { - hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); - /* - * The core tick broadcast mode expects bc->bound_on to be set - * correctly to prevent a CPU which has the broadcast hrtimer - * armed from going deep idle. - * - * As tick_broadcast_lock is held, nothing can change the cpu - * base which was just established in hrtimer_start() above. So - * the below access is safe even without holding the hrtimer - * base lock. - */ - bc->bound_on = bctimer.base->cpu_base->cpu; - } ); + bc->bound_on = bctimer.base->cpu_base->cpu; + return 0; }
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 27/44] cpuidle, sched: Remove annotations from TIF_{POLLING_NRFLAG, NEED_RESCHED}
vmlinux.o: warning: objtool: mwait_idle+0x5: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_processor_ffh_cstate_enter+0xc5: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: cpu_idle_poll.isra.0+0x73: call to test_ti_thread_flag() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0xbc: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0xea: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0xb4: call to current_set_polling_and_test() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0xa6: call to current_clr_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0xbf: call to current_clr_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0xa1: call to current_clr_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: mwait_idle+0xe: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_processor_ffh_cstate_enter+0xc5: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: cpu_idle_poll.isra.0+0x73: call to test_ti_thread_flag() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0xbc: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0xea: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0xb4: call to __current_set_polling() leaves .noinstr.text section vmlinux.o: warning: objtool: cpu_idle_poll.isra.0+0x73: call to test_ti_thread_flag() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0x73: call to test_ti_thread_flag.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0x91: call to test_ti_thread_flag.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0x78: call to test_ti_thread_flag.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_safe_halt+0xf: call to test_ti_thread_flag.constprop.0() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- include/linux/sched/idle.h | 40 ++++++++++++++++++++++++++++++---------- include/linux/thread_info.h | 18 +++++++++++++++++- 2 files changed, 47 insertions(+), 11 deletions(-) --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -23,12 +23,37 @@ static inline void wake_up_if_idle(int c */ #ifdef TIF_POLLING_NRFLAG -static inline void __current_set_polling(void) +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H + +static __always_inline void __current_set_polling(void) { - set_thread_flag(TIF_POLLING_NRFLAG); + arch_set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); } -static inline bool __must_check current_set_polling_and_test(void) +static __always_inline void __current_clr_polling(void) +{ + arch_clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#else + +static __always_inline void __current_set_polling(void) +{ + set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +static __always_inline void __current_clr_polling(void) +{ + clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#endif /* 
_ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */ + +static __always_inline bool __must_check current_set_polling_and_test(void) { __current_set_polling(); @@ -41,12 +66,7 @@ static inline bool __must_check current_ return unlikely(tif_need_resched()); } -static inline void __current_clr_polling(void) -{ - clear_thread_flag(TIF_POLLING_NRFLAG); -} - -static inline bool __must_check current_clr_polling_and_test(void) +static __always_inline bool __must_check current_clr_polling_and_test(void) { __current_clr_polling(); @@ -73,7 +93,7 @@ static inline bool __must_check current_ } #endif -static inline void current_clr_polling(void) +static __always_inline void current_clr_polling(void) { __current_clr_polling(); --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -177,7 +177,23 @@ static __always_inline unsigned long rea clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + +static __always_inline bool tif_need_resched(void) +{ + return arch_test_bit(TIF_NEED_RESCHED, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#else + +static __always_inline bool tif_need_resched(void) +{ + return test_bit(TIF_NEED_RESCHED, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack,
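Why the patch reaches for arch_set_bit() and friends: with KASAN/KCSAN enabled, the generic bitops are wrappers (asm-generic/bitops/instrumented-atomic.h, whose include guard the #ifdef above keys off) that make an out-of-line instrumentation call before the real operation -- which is exactly what may not happen on the noinstr idle path. A reduced illustration with stand-in bodies (not the real implementation):

#include <stddef.h>

static void instrument_atomic_read_write(volatile void *v, size_t size)
{
	/* KASAN/KCSAN checks live here: out-of-line, not noinstr-safe */
}

static void arch_set_bit(long nr, volatile unsigned long *addr)
{
	/* stand-in for the real atomic instruction */
	addr[nr / (8 * sizeof(long))] |= 1UL << (nr % (8 * sizeof(long)));
}

static void set_bit(long nr, volatile unsigned long *addr)
{
	instrument_atomic_read_write(addr + nr / (8 * sizeof(long)), sizeof(long));
	arch_set_bit(nr, addr);		/* the underlying operation */
}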
vmlinux.o: warning: objtool: intel_idle_s2idle+0x6e: call to __monitor.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0x8c: call to __monitor.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0x73: call to __monitor.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: mwait_idle+0x88: call to clflush() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/include/asm/mwait.h | 12 ++++++------ arch/x86/include/asm/special_insns.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -25,7 +25,7 @@ #define TPAUSE_C01_STATE 1 #define TPAUSE_C02_STATE 0 -static inline void __monitor(const void *eax, unsigned long ecx, +static __always_inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { /* "monitor %eax, %ecx, %edx;" */ @@ -33,7 +33,7 @@ static inline void __monitor(const void :: "a" (eax), "c" (ecx), "d"(edx)); } -static inline void __monitorx(const void *eax, unsigned long ecx, +static __always_inline void __monitorx(const void *eax, unsigned long ecx, unsigned long edx) { /* "monitorx %eax, %ecx, %edx;" */ @@ -41,7 +41,7 @@ static inline void __monitorx(const void :: "a" (eax), "c" (ecx), "d"(edx)); } -static inline void __mwait(unsigned long eax, unsigned long ecx) +static __always_inline void __mwait(unsigned long eax, unsigned long ecx) { mds_idle_clear_cpu_buffers(); @@ -76,8 +76,8 @@ static inline void __mwait(unsigned long * EAX (logical) address to monitor * ECX #GP if not zero */ -static inline void __mwaitx(unsigned long eax, unsigned long ebx, - unsigned long ecx) +static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx, + unsigned long ecx) { /* No MDS buffer clear as this is AMD/HYGON only */ @@ -86,7 +86,7 @@ static inline void __mwaitx(unsigned lon :: "a" (eax), "b" (ebx), "c" (ecx)); } -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx) { mds_idle_clear_cpu_buffers(); /* "mwait %eax, %ecx;" */ --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -196,7 +196,7 @@ static inline void load_gs_index(unsigne #endif /* CONFIG_PARAVIRT_XXL */ -static inline void clflush(volatile void *__p) +static __always_inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); }
vmlinux.o: warning: objtool: __halt+0x2c: call to hcall_func.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: __halt+0x3f: call to __tdx_hypercall() leaves .noinstr.text section vmlinux.o: warning: objtool: __tdx_hypercall+0x66: call to __tdx_hypercall_failed() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/boot/compressed/vmlinux.lds.S | 1 + arch/x86/coco/tdx/tdcall.S | 2 ++ arch/x86/coco/tdx/tdx.c | 5 +++-- 3 files changed, 6 insertions(+), 2 deletions(-) --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -34,6 +34,7 @@ SECTIONS _text = .; /* Text */ *(.text) *(.text.*) + *(.noinstr.text) _etext = . ; } .rodata : { --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -31,6 +31,8 @@ TDX_R12 | TDX_R13 | \ TDX_R14 | TDX_R15 ) +.section .noinstr.text, "ax" + /* * __tdx_module_call() - Used by TDX guests to request services from * the TDX module (does not include VMM services) using TDCALL instruction. --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -53,8 +53,9 @@ static inline u64 _tdx_hypercall(u64 fn, } /* Called from __tdx_hypercall() for unrecoverable failure */ -void __tdx_hypercall_failed(void) +noinstr void __tdx_hypercall_failed(void) { + instrumentation_begin(); panic("TDVMCALL failed. TDX module bug?"); } @@ -64,7 +65,7 @@ void __tdx_hypercall_failed(void) * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and * guest sides of these calls. */ -static u64 hcall_func(u64 exit_reason) +static __always_inline u64 hcall_func(u64 exit_reason) { return exit_reason; }
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 30/44] cpuidle,xenpv: Make more PARAVIRT_XXL noinstr clean
vmlinux.o: warning: objtool: acpi_idle_enter_s2idle+0xde: call to wbinvd() leaves .noinstr.text section vmlinux.o: warning: objtool: default_idle+0x4: call to arch_safe_halt() leaves .noinstr.text section vmlinux.o: warning: objtool: xen_safe_halt+0xa: call to HYPERVISOR_sched_op.constprop.0() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reviewed-by: Srivatsa S. Bhat (VMware) <srivatsa at csail.mit.edu> --- arch/x86/include/asm/paravirt.h | 6 ++++-- arch/x86/include/asm/special_insns.h | 4 ++-- arch/x86/include/asm/xen/hypercall.h | 2 +- arch/x86/kernel/paravirt.c | 14 ++++++++++++-- arch/x86/xen/enlighten_pv.c | 2 +- arch/x86/xen/irq.c | 2 +- 6 files changed, 21 insertions(+), 9 deletions(-) --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -168,7 +168,7 @@ static inline void __write_cr4(unsigned PVOP_VCALL1(cpu.write_cr4, x); } -static inline void arch_safe_halt(void) +static __always_inline void arch_safe_halt(void) { PVOP_VCALL0(irq.safe_halt); } @@ -178,7 +178,9 @@ static inline void halt(void) PVOP_VCALL0(irq.halt); } -static inline void wbinvd(void) +extern noinstr void pv_native_wbinvd(void); + +static __always_inline void wbinvd(void) { PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); } --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -115,7 +115,7 @@ static inline void wrpkru(u32 pkru) } #endif -static inline void native_wbinvd(void) +static __always_inline void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } @@ -179,7 +179,7 @@ static inline void __write_cr4(unsigned native_write_cr4(x); } -static inline void wbinvd(void) +static __always_inline void wbinvd(void) { native_wbinvd(); } --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -382,7 +382,7 @@ MULTI_stack_switch(struct multicall_entr } #endif -static inline int +static __always_inline int HYPERVISOR_sched_op(int cmd, void *arg) { return _hypercall2(int, sched_op, cmd, arg); --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -233,6 +233,11 @@ static noinstr void pv_native_set_debugr native_set_debugreg(regno, val); } +noinstr void pv_native_wbinvd(void) +{ + native_wbinvd(); +} + static noinstr void pv_native_irq_enable(void) { native_irq_enable(); @@ -242,6 +247,11 @@ static noinstr void pv_native_irq_disabl { native_irq_disable(); } + +static noinstr void pv_native_safe_halt(void) +{ + native_safe_halt(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -273,7 +283,7 @@ struct paravirt_patch_template pv_ops .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, - .cpu.wbinvd = native_wbinvd, + .cpu.wbinvd = pv_native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, @@ -307,7 +317,7 @@ struct paravirt_patch_template pv_ops .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), - .irq.safe_halt = native_safe_halt, + .irq.safe_halt = pv_native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1019,7 +1019,7 @@ static const typeof(pv_ops) xen_cpu_ops .write_cr4 = xen_write_cr4, - .wbinvd = native_wbinvd, + .wbinvd = pv_native_wbinvd, .read_msr = xen_read_msr, .write_msr = 
xen_write_msr, --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -24,7 +24,7 @@ noinstr void xen_force_evtchn_callback(v (void)HYPERVISOR_xen_version(0, NULL); } -static void xen_safe_halt(void) +static noinstr void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
vmlinux.o: warning: objtool: mwait_idle+0x47: call to mds_idle_clear_cpu_buffers() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_processor_ffh_cstate_enter+0xa2: call to mds_idle_clear_cpu_buffers() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle+0x91: call to mds_idle_clear_cpu_buffers() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_s2idle+0x8c: call to mds_idle_clear_cpu_buffers() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_irq+0xaa: call to mds_idle_clear_cpu_buffers() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/include/asm/nospec-branch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -310,7 +310,7 @@ static __always_inline void mds_user_cle * * Clear CPU buffers if the corresponding static key is enabled */ -static inline void mds_idle_clear_cpu_buffers(void) +static __always_inline void mds_idle_clear_cpu_buffers(void) { if (static_branch_likely(&mds_idle_clear)) mds_clear_cpu_buffers();
vmlinux.o: warning: objtool: io_idle+0xc: call to __inb.isra.0() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_idle_enter+0xfe: call to num_online_cpus() leaves .noinstr.text section vmlinux.o: warning: objtool: acpi_idle_enter+0x115: call to acpi_idle_fallback_to_c1.isra.0() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Acked-by: Rafael J. Wysocki <rafael.j.wysocki at intel.com> --- arch/x86/include/asm/shared/io.h | 4 ++-- drivers/acpi/processor_idle.c | 2 +- include/linux/cpumask.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) --- a/arch/x86/include/asm/shared/io.h +++ b/arch/x86/include/asm/shared/io.h @@ -5,13 +5,13 @@ #include <linux/types.h> #define BUILDIO(bwl, bw, type) \ -static inline void __out##bwl(type value, u16 port) \ +static __always_inline void __out##bwl(type value, u16 port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ -static inline type __in##bwl(u16 port) \ +static __always_inline type __in##bwl(u16 port) \ { \ type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -593,7 +593,7 @@ static int acpi_idle_play_dead(struct cp return 0; } -static bool acpi_idle_fallback_to_c1(struct acpi_processor *pr) +static __always_inline bool acpi_idle_fallback_to_c1(struct acpi_processor *pr) { return IS_ENABLED(CONFIG_HOTPLUG_CPU) && !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED); --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -908,9 +908,9 @@ static inline const struct cpumask *get_ * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. */ -static inline unsigned int num_online_cpus(void) +static __always_inline unsigned int num_online_cpus(void) { - return atomic_read(&__num_online_cpus); + return arch_atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask)
CONFIG_GENERIC_ENTRY disallows any and all tracing when RCU isn't enabled. XXX if s390 (the only other GENERIC_ENTRY user as of this writing) isn't comfortable with this, we could switch to HAVE_NOINSTR_VALIDATION which is x86_64 only atm. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- include/linux/tracepoint.h | 13 ++++++++++++- kernel/trace/trace.c | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -178,6 +178,16 @@ static inline struct tracepoint *tracepo #endif /* CONFIG_HAVE_STATIC_CALL */ /* + * CONFIG_GENERIC_ENTRY archs are expected to have sanitized entry and idle + * code that disallow any/all tracing/instrumentation when RCU isn't watching. + */ +#ifdef CONFIG_GENERIC_ENTRY +#define RCUIDLE_COND(rcuidle) (rcuidle) +#else +#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi()) +#endif + +/* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. */ @@ -189,7 +199,8 @@ static inline struct tracepoint *tracepo return; \ \ /* srcu can't be used from NMI */ \ - WARN_ON_ONCE(rcuidle && in_nmi()); \ + if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \ + return; \ \ /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3104,6 +3104,9 @@ void __trace_stack(struct trace_array *t return; } + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY))) + return; + /* * When an NMI triggers, RCU is enabled via ct_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 34/44] cpuidle,omap3: Use WFI for omap3_pm_idle()
arch_cpu_idle() is a very simple idle interface that exposes only a single idle state and is expected to neither require RCU nor do any tracing/instrumentation. As such, omap_sram_idle() is not a valid implementation. Replace it with the simple (shallow) omap3_do_wfi() call, leaving the more complicated idle states for the cpuidle driver. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Acked-by: Tony Lindgren <tony at atomide.com> --- arch/arm/mach-omap2/pm34xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -294,7 +294,7 @@ static void omap3_pm_idle(void) if (omap_irq_pending()) return; - omap_sram_idle(); + omap3_do_wfi(); } #ifdef CONFIG_SUSPEND
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 35/44] cpuidle,omap3: Push RCU-idle into omap_sram_idle()
OMAP3 uses full SoC suspend modes as idle states; as such, it needs the whole power-domain and clock-domain code from the idle path. All that code is not suitable to run with RCU disabled; as such, push RCU-idle deeper still. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reviewed-by: Tony Lindgren <tony at atomide.com> Tested-by: Tony Lindgren <tony at atomide.com> --- arch/arm/mach-omap2/cpuidle34xx.c | 4 +--- arch/arm/mach-omap2/pm.h | 2 +- arch/arm/mach-omap2/pm34xx.c | 12 ++++++++++-- 3 files changed, 12 insertions(+), 6 deletions(-) --- a/arch/arm/mach-omap2/cpuidle34xx.c +++ b/arch/arm/mach-omap2/cpuidle34xx.c @@ -133,9 +133,7 @@ static int omap3_enter_idle(struct cpuid } /* Execute ARM wfi */ - ct_cpuidle_enter(); - omap_sram_idle(); - ct_cpuidle_exit(); + omap_sram_idle(true); /* * Call idle CPU PM enter notifier chain to restore --- a/arch/arm/mach-omap2/pm.h +++ b/arch/arm/mach-omap2/pm.h @@ -29,7 +29,7 @@ static inline int omap4_idle_init(void) extern void *omap3_secure_ram_storage; extern void omap3_pm_off_mode_enable(int); -extern void omap_sram_idle(void); +extern void omap_sram_idle(bool rcuidle); extern int omap_pm_clkdms_setup(struct clockdomain *clkdm, void *unused); #if defined(CONFIG_PM_OPP) --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -26,6 +26,7 @@ #include <linux/delay.h> #include <linux/slab.h> #include <linux/of.h> +#include <linux/cpuidle.h> #include <trace/events/power.h> @@ -174,7 +175,7 @@ static int omap34xx_do_sram_idle(unsigne return 0; } -void omap_sram_idle(void) +void omap_sram_idle(bool rcuidle) { /* Variable to tell what needs to be saved and restored * in omap_sram_idle*/ @@ -254,11 +255,18 @@ void omap_sram_idle(void) */ if (save_state) omap34xx_save_context(omap3_arm_context); + + if (rcuidle) + ct_cpuidle_enter(); + if (save_state == 1 || save_state == 3) cpu_suspend(save_state, omap34xx_do_sram_idle); else omap34xx_do_sram_idle(save_state); + if (rcuidle) + ct_cpuidle_exit(); + /* Restore normal SDRC POWER settings */ if (cpu_is_omap3430() && omap_rev() >= OMAP3430_REV_ES3_0 && (omap_type() == OMAP2_DEVICE_TYPE_EMU || @@ -316,7 +324,7 @@ static int omap3_pm_suspend(void) omap3_intc_suspend(); - omap_sram_idle(); + omap_sram_idle(false); restore: /* Restore next_pwrsts */
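The shape of the transformation, reduced to a sketch (pseudo-kernel C with stubbed helpers, not the actual code): the RCU-not-watching window shrinks from around all of omap_sram_idle() to just the low-level suspend step, so the power-domain/clock-domain bookkeeping on either side runs with RCU watching and may trace. The suspend path passes rcuidle=false since it does not come through the cpuidle core:

#include <stdbool.h>

void ct_cpuidle_enter(void);	/* RCU stops watching */
void ct_cpuidle_exit(void);	/* RCU resumes watching */
void low_level_suspend(void);	/* the only part that must not use RCU */

void omap_sram_idle_sketch(bool rcuidle)
{
	/* pwrdm/clkdm bookkeeping: may trace, needs RCU watching */

	if (rcuidle)
		ct_cpuidle_enter();

	low_level_suspend();

	if (rcuidle)
		ct_cpuidle_exit();

	/* restore path: again free to trace */
}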
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 36/44] cpuidle, omap4: Push RCU-idle into omap4_enter_lowpower()
From: Tony Lindgren <tony at atomide.com> OMAP4 uses full SoC suspend modes as idle states; as such, it needs the whole power-domain and clock-domain code from the idle path. All that code is not suitable to run with RCU disabled; as such, push RCU-idle deeper still. Signed-off-by: Tony Lindgren <tony at atomide.com> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Link: https://lkml.kernel.org/r/Yqcv6crSNKuSWoTu at atomide.com --- arch/arm/mach-omap2/common.h | 6 ++++-- arch/arm/mach-omap2/cpuidle44xx.c | 8 ++------ arch/arm/mach-omap2/omap-mpuss-lowpower.c | 12 +++++++++++- arch/arm/mach-omap2/pm44xx.c | 2 +- 4 files changed, 18 insertions(+), 10 deletions(-) --- a/arch/arm/mach-omap2/common.h +++ b/arch/arm/mach-omap2/common.h @@ -284,11 +284,13 @@ extern u32 omap4_get_cpu1_ns_pa_addr(voi #if defined(CONFIG_SMP) && defined(CONFIG_PM) extern int omap4_mpuss_init(void); -extern int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state); +extern int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state, + bool rcuidle); extern int omap4_hotplug_cpu(unsigned int cpu, unsigned int power_state); #else static inline int omap4_enter_lowpower(unsigned int cpu, - unsigned int power_state) + unsigned int power_state, + bool rcuidle) { cpu_do_idle(); return 0; --- a/arch/arm/mach-omap2/cpuidle44xx.c +++ b/arch/arm/mach-omap2/cpuidle44xx.c @@ -105,9 +105,7 @@ static int omap_enter_idle_smp(struct cp } raw_spin_unlock_irqrestore(&mpu_lock, flag); - ct_cpuidle_enter(); - omap4_enter_lowpower(dev->cpu, cx->cpu_state); - ct_cpuidle_exit(); + omap4_enter_lowpower(dev->cpu, cx->cpu_state, true); raw_spin_lock_irqsave(&mpu_lock, flag); if (cx->mpu_state_vote == num_online_cpus()) @@ -186,10 +184,8 @@ static int omap_enter_idle_coupled(struc } } - ct_cpuidle_enter(); - omap4_enter_lowpower(dev->cpu, cx->cpu_state); + omap4_enter_lowpower(dev->cpu, cx->cpu_state, true); cpu_done[dev->cpu] = true; - ct_cpuidle_exit(); /* Wakeup CPU1 only if it is not offlined */ if (dev->cpu == 0 && cpumask_test_cpu(1, cpu_online_mask)) { --- a/arch/arm/mach-omap2/omap-mpuss-lowpower.c +++ b/arch/arm/mach-omap2/omap-mpuss-lowpower.c @@ -33,6 +33,7 @@ * and first to wake-up when MPUSS low power states are excercised */ +#include <linux/cpuidle.h> #include <linux/kernel.h> #include <linux/io.h> #include <linux/errno.h> @@ -214,6 +215,7 @@ static void __init save_l2x0_context(voi * of OMAP4 MPUSS subsystem * @cpu : CPU ID * @power_state: Low power state.
+ * @rcuidle: RCU needs to be idled * * MPUSS states for the context save: * save_state @@ -222,7 +224,8 @@ static void __init save_l2x0_context(voi * 2 - CPUx L1 and logic lost + GIC lost: MPUSS OSWR * 3 - CPUx L1 and logic lost + GIC + L2 lost: DEVICE OFF */ -int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state) +int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state, + bool rcuidle) { struct omap4_cpu_pm_info *pm_info = &per_cpu(omap4_pm_info, cpu); unsigned int save_state = 0, cpu_logic_state = PWRDM_POWER_RET; @@ -268,6 +271,10 @@ int omap4_enter_lowpower(unsigned int cp cpu_clear_prev_logic_pwrst(cpu); pwrdm_set_next_pwrst(pm_info->pwrdm, power_state); pwrdm_set_logic_retst(pm_info->pwrdm, cpu_logic_state); + + if (rcuidle) + ct_cpuidle_enter(); + set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.resume)); omap_pm_ops.scu_prepare(cpu, power_state); l2x0_pwrst_prepare(cpu, save_state); @@ -283,6 +290,9 @@ int omap4_enter_lowpower(unsigned int cp if (IS_PM44XX_ERRATUM(PM_OMAP4_ROM_SMP_BOOT_ERRATUM_GICD) && cpu) gic_dist_enable(); + if (rcuidle) + ct_cpuidle_exit(); + /* * Restore the CPUx power state to ON otherwise CPUx * power domain can transitions to programmed low power --- a/arch/arm/mach-omap2/pm44xx.c +++ b/arch/arm/mach-omap2/pm44xx.c @@ -76,7 +76,7 @@ static int omap4_pm_suspend(void) * domain CSWR is not supported by hardware. * More details can be found in OMAP4430 TRM section 4.3.4.2. */ - omap4_enter_lowpower(cpu_id, cpu_suspend_state); + omap4_enter_lowpower(cpu_id, cpu_suspend_state, false); /* Restore next powerdomain state */ list_for_each_entry(pwrst, &pwrst_list, node) {
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 37/44] arm,omap2: Use WFI for omap2_pm_idle()
arch_cpu_idle() is a very simple idle interface that exposes only a single idle state and is expected to neither require RCU nor do any tracing/instrumentation. As such, omap2_pm_idle() is not a valid implementation. Replace it with a simple (shallow) omap2_do_wfi() call. OMAP2 doesn't have a cpuidle driver; adding one would be the way to (re)gain the other idle states. Suggested-by: Tony Lindgren <tony at atomide.com> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/arm/mach-omap2/pm24xx.c | 51 +------------------------------------------ 1 file changed, 2 insertions(+), 49 deletions(-) --- a/arch/arm/mach-omap2/pm24xx.c +++ b/arch/arm/mach-omap2/pm24xx.c @@ -116,50 +116,12 @@ static int omap2_enter_full_retention(vo static int sti_console_enabled; -static int omap2_allow_mpu_retention(void) -{ - if (!omap2xxx_cm_mpu_retention_allowed()) - return 0; - if (sti_console_enabled) - return 0; - - return 1; -} - -static void omap2_enter_mpu_retention(void) +static void omap2_do_wfi(void) { const int zero = 0; - /* The peripherals seem not to be able to wake up the MPU when - * it is in retention mode. */ - if (omap2_allow_mpu_retention()) { - /* REVISIT: These write to reserved bits? */ - omap_prm_clear_mod_irqs(CORE_MOD, PM_WKST1, ~0); - omap_prm_clear_mod_irqs(CORE_MOD, OMAP24XX_PM_WKST2, ~0); - omap_prm_clear_mod_irqs(WKUP_MOD, PM_WKST, ~0); - - /* Try to enter MPU retention */ - pwrdm_set_next_pwrst(mpu_pwrdm, PWRDM_POWER_RET); - - } else { - /* Block MPU retention */ - pwrdm_set_next_pwrst(mpu_pwrdm, PWRDM_POWER_ON); - } - /* WFI */ asm("mcr p15, 0, %0, c7, c0, 4" : : "r" (zero) : "memory", "cc"); - - pwrdm_set_next_pwrst(mpu_pwrdm, PWRDM_POWER_ON); -} - -static int omap2_can_sleep(void) -{ - if (omap2xxx_cm_fclks_active()) - return 0; - if (__clk_is_enabled(osc_ck)) - return 0; - - return 1; } static void omap2_pm_idle(void) @@ -169,16 +131,7 @@ if (omap_irq_pending()) return; - error = cpu_cluster_pm_enter(); - if (error || !omap2_can_sleep()) { - omap2_enter_mpu_retention(); - goto out_cpu_cluster_pm; - } - - omap2_enter_full_retention(); - -out_cpu_cluster_pm: - cpu_cluster_pm_exit(); + omap2_do_wfi(); } static void __init prcm_setup_regs(void)
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 38/44] cpuidle,powerdomain: Remove trace_.*_rcuidle()
OMAP was the one and only user. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/arm/mach-omap2/powerdomain.c | 10 +++++----- drivers/base/power/runtime.c | 24 ++++++++++++------------ 2 files changed, 17 insertions(+), 17 deletions(-) --- a/arch/arm/mach-omap2/powerdomain.c +++ b/arch/arm/mach-omap2/powerdomain.c @@ -187,9 +187,9 @@ static int _pwrdm_state_switch(struct po trace_state = (PWRDM_TRACE_STATES_FLAG | ((next & OMAP_POWERSTATE_MASK) << 8) | ((prev & OMAP_POWERSTATE_MASK) << 0)); - trace_power_domain_target_rcuidle(pwrdm->name, - trace_state, - raw_smp_processor_id()); + trace_power_domain_target(pwrdm->name, + trace_state, + raw_smp_processor_id()); } break; default: @@ -541,8 +541,8 @@ int pwrdm_set_next_pwrst(struct powerdom if (arch_pwrdm && arch_pwrdm->pwrdm_set_next_pwrst) { /* Trace the pwrdm desired target state */ - trace_power_domain_target_rcuidle(pwrdm->name, pwrst, - raw_smp_processor_id()); + trace_power_domain_target(pwrdm->name, pwrst, + raw_smp_processor_id()); /* Program the pwrdm desired target state */ ret = arch_pwrdm->pwrdm_set_next_pwrst(pwrdm, pwrst); } --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -442,7 +442,7 @@ static int rpm_idle(struct device *dev, int (*callback)(struct device *); int retval; - trace_rpm_idle_rcuidle(dev, rpmflags); + trace_rpm_idle(dev, rpmflags); retval = rpm_check_suspend_allowed(dev); if (retval < 0) ; /* Conditions are wrong. */ @@ -481,7 +481,7 @@ static int rpm_idle(struct device *dev, dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } - trace_rpm_return_int_rcuidle(dev, _THIS_IP_, 0); + trace_rpm_return_int(dev, _THIS_IP_, 0); return 0; } @@ -493,7 +493,7 @@ static int rpm_idle(struct device *dev, wake_up_all(&dev->power.wait_queue); out: - trace_rpm_return_int_rcuidle(dev, _THIS_IP_, retval); + trace_rpm_return_int(dev, _THIS_IP_, retval); return retval ? 
retval : rpm_suspend(dev, rpmflags | RPM_AUTO); } @@ -557,7 +557,7 @@ static int rpm_suspend(struct device *de struct device *parent = NULL; int retval; - trace_rpm_suspend_rcuidle(dev, rpmflags); + trace_rpm_suspend(dev, rpmflags); repeat: retval = rpm_check_suspend_allowed(dev); @@ -708,7 +708,7 @@ static int rpm_suspend(struct device *de } out: - trace_rpm_return_int_rcuidle(dev, _THIS_IP_, retval); + trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; @@ -760,7 +760,7 @@ static int rpm_resume(struct device *dev struct device *parent = NULL; int retval = 0; - trace_rpm_resume_rcuidle(dev, rpmflags); + trace_rpm_resume(dev, rpmflags); repeat: if (dev->power.runtime_error) { @@ -925,7 +925,7 @@ static int rpm_resume(struct device *dev spin_lock_irq(&dev->power.lock); } - trace_rpm_return_int_rcuidle(dev, _THIS_IP_, retval); + trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; } @@ -1081,7 +1081,7 @@ int __pm_runtime_idle(struct device *dev if (retval < 0) { return retval; } else if (retval > 0) { - trace_rpm_usage_rcuidle(dev, rpmflags); + trace_rpm_usage(dev, rpmflags); return 0; } } @@ -1119,7 +1119,7 @@ int __pm_runtime_suspend(struct device * if (retval < 0) { return retval; } else if (retval > 0) { - trace_rpm_usage_rcuidle(dev, rpmflags); + trace_rpm_usage(dev, rpmflags); return 0; } } @@ -1202,7 +1202,7 @@ int pm_runtime_get_if_active(struct devi } else { retval = atomic_inc_not_zero(&dev->power.usage_count); } - trace_rpm_usage_rcuidle(dev, 0); + trace_rpm_usage(dev, 0); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; @@ -1566,7 +1566,7 @@ void pm_runtime_allow(struct device *dev if (ret == 0) rpm_idle(dev, RPM_AUTO | RPM_ASYNC); else if (ret > 0) - trace_rpm_usage_rcuidle(dev, RPM_AUTO | RPM_ASYNC); + trace_rpm_usage(dev, RPM_AUTO | RPM_ASYNC); out: spin_unlock_irq(&dev->power.lock); @@ -1635,7 +1635,7 @@ static void update_autosuspend(struct de atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); } else { - trace_rpm_usage_rcuidle(dev, 0); + trace_rpm_usage(dev, 0); } }
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 39/44] cpuidle,clk: Remove trace_.*_rcuidle()
OMAP was the one and only user. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- drivers/clk/clk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -978,12 +978,12 @@ static void clk_core_disable(struct clk_ if (--core->enable_count > 0) return; - trace_clk_disable_rcuidle(core); + trace_clk_disable(core); if (core->ops->disable) core->ops->disable(core->hw); - trace_clk_disable_complete_rcuidle(core); + trace_clk_disable_complete(core); clk_core_disable(core->parent); } @@ -1037,12 +1037,12 @@ static int clk_core_enable(struct clk_co if (ret) return ret; - trace_clk_enable_rcuidle(core); + trace_clk_enable(core); if (core->ops->enable) ret = core->ops->enable(core->hw); - trace_clk_enable_complete_rcuidle(core); + trace_clk_enable_complete(core); if (ret) { clk_core_disable(core->parent);
clang-14 allyesconfig gives: vmlinux.o: warning: objtool: emulator_cmpxchg_emulated+0x705: call to __ubsan_handle_load_invalid_value() with UACCESS enabled vmlinux.o: warning: objtool: paging64_update_accessed_dirty_bits+0x39e: call to __ubsan_handle_load_invalid_value() with UACCESS enabled vmlinux.o: warning: objtool: paging32_update_accessed_dirty_bits+0x390: call to __ubsan_handle_load_invalid_value() with UACCESS enabled vmlinux.o: warning: objtool: ept_update_accessed_dirty_bits+0x43f: call to __ubsan_handle_load_invalid_value() with UACCESS enabled Add the required eflags save/restore and whitelist the thing. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- lib/ubsan.c | 5 ++++- tools/objtool/check.c | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -340,9 +340,10 @@ void __ubsan_handle_load_invalid_value(v { struct invalid_value_data *data = _data; char val_str[VALUE_LENGTH]; + unsigned long ua_flags = user_access_save(); if (suppress_report(&data->location)) - return; + goto out; ubsan_prologue(&data->location, "invalid-load"); @@ -352,6 +353,8 @@ void __ubsan_handle_load_invalid_value(v val_str, data->type->type_name); ubsan_epilogue(); +out: + user_access_restore(ua_flags); } EXPORT_SYMBOL(__ubsan_handle_load_invalid_value); --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1068,6 +1068,7 @@ static const char *uaccess_safe_builtin[ "__ubsan_handle_type_mismatch", "__ubsan_handle_type_mismatch_v1", "__ubsan_handle_shift_out_of_bounds", + "__ubsan_handle_load_invalid_value", /* misc */ "csum_partial_copy_generic", "copy_mc_fragile",
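For context on the fix (a hedged outline; the helpers are arch-specific, on x86 they save and clear EFLAGS.AC, the SMAP override set by STAC): a UBSAN handler can fire from inside a user_access_begin() region, so every exit path has to restore the uaccess state it found. The control flow the patch establishes, with hypothetical helper names standing in for the report logic:

unsigned long user_access_save(void);		/* stash uaccess state, disable it */
void user_access_restore(unsigned long flags);	/* re-establish stashed state */
int suppress_report(void *data);		/* stand-in */
void do_report(void *data);			/* stand-in */

void ubsan_handler_shape(void *data)
{
	unsigned long ua_flags = user_access_save();

	if (suppress_report(data))
		goto out;	/* the early return must restore too */

	do_report(data);
out:
	user_access_restore(ua_flags);
}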
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 41/44] intel_idle: Add force_irq_on module param
For testing purposes. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- drivers/idle/intel_idle.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1787,6 +1787,9 @@ static bool __init intel_idle_verify_cst return true; } +static bool force_irq_on __read_mostly; +module_param(force_irq_on, bool, 0444); + static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) { int cstate; @@ -1838,8 +1841,10 @@ static void __init intel_idle_init_cstat /* Structure copy. */ drv->states[drv->state_count] = cpuidle_state_table[cstate]; - if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE) + if ((cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE) || force_irq_on) { + printk("intel_idle: forced intel_idle_irq for state %d\n", cstate); drv->states[drv->state_count].enter = intel_idle_irq; + } if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) && cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) {
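With mode 0444 the parameter is read-only at runtime, so it is
effectively a boot-time knob. Assuming a built-in intel_idle, usage
would presumably look like this on the kernel command line:

	intel_idle.force_irq_on=1

after which the printk above confirms, per C-state, that the
intel_idle_irq enter method was forced.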
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 42/44] entry, kasan, x86: Disallow overriding mem*() functions
KASAN cannot just hijack the mem*() functions, it needs to emit
__asan_mem*() variants if it wants instrumentation (other sanitizers
already do this).

vmlinux.o: warning: objtool: sync_regs+0x24: call to memcpy() leaves .noinstr.text section
vmlinux.o: warning: objtool: vc_switch_off_ist+0xbe: call to memcpy() leaves .noinstr.text section
vmlinux.o: warning: objtool: fixup_bad_iret+0x36: call to memset() leaves .noinstr.text section
vmlinux.o: warning: objtool: __sev_get_ghcb+0xa0: call to memcpy() leaves .noinstr.text section
vmlinux.o: warning: objtool: __sev_put_ghcb+0x35: call to memcpy() leaves .noinstr.text section

Remove the weak aliases to ensure nobody hijacks these functions and add
them to the noinstr section.

Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
---
 arch/x86/lib/memcpy_64.S  |  5 ++---
 arch/x86/lib/memmove_64.S |  4 +++-
 arch/x86/lib/memset_64.S  |  4 +++-
 mm/kasan/kasan.h          |  4 ++++
 mm/kasan/shadow.c         | 38 ++++++++++++++++++++++++++++++++++++++
 tools/objtool/check.c     |  3 +++
 6 files changed, 53 insertions(+), 5 deletions(-)

--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -7,7 +7,7 @@
 #include <asm/alternative.h>
 #include <asm/export.h>

-.pushsection .noinstr.text, "ax"
+.section .noinstr.text, "ax"

 /*
  * We build a jump to memcpy_orig by default which gets NOPped out on
@@ -42,7 +42,7 @@ SYM_FUNC_START(__memcpy)
 SYM_FUNC_END(__memcpy)
 EXPORT_SYMBOL(__memcpy)

-SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+SYM_FUNC_ALIAS(memcpy, __memcpy)
 EXPORT_SYMBOL(memcpy)

 /*
@@ -183,4 +183,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
 	RET
 SYM_FUNC_END(memcpy_orig)

-.popsection
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -13,6 +13,8 @@

 #undef memmove

+.section .noinstr.text, "ax"
+
 /*
  * Implement memmove(). This can handle overlap between src and dst.
  *
@@ -213,5 +215,5 @@ SYM_FUNC_START(__memmove)
 SYM_FUNC_END(__memmove)
 EXPORT_SYMBOL(__memmove)

-SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
+SYM_FUNC_ALIAS(memmove, __memmove)
 EXPORT_SYMBOL(memmove)
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -6,6 +6,8 @@
 #include <asm/alternative.h>
 #include <asm/export.h>

+.section .noinstr.text, "ax"
+
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
@@ -43,7 +45,7 @@ SYM_FUNC_START(__memset)
 SYM_FUNC_END(__memset)
 EXPORT_SYMBOL(__memset)

-SYM_FUNC_ALIAS_WEAK(memset, __memset)
+SYM_FUNC_ALIAS(memset, __memset)
 EXPORT_SYMBOL(memset)

 /*
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -551,6 +551,10 @@ void __asan_set_shadow_f3(const void *ad
 void __asan_set_shadow_f5(const void *addr, size_t size);
 void __asan_set_shadow_f8(const void *addr, size_t size);

+void *__asan_memset(void *addr, int c, size_t len);
+void *__asan_memmove(void *dest, const void *src, size_t len);
+void *__asan_memcpy(void *dest, const void *src, size_t len);
+
 void __hwasan_load1_noabort(unsigned long addr);
 void __hwasan_store1_noabort(unsigned long addr);
 void __hwasan_load2_noabort(unsigned long addr);
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -38,6 +38,12 @@ bool __kasan_check_write(const volatile
 }
 EXPORT_SYMBOL(__kasan_check_write);

+#ifndef CONFIG_GENERIC_ENTRY
+/*
+ * CONFIG_GENERIC_ENTRY relies on compiler emitted mem*() calls to not be
+ * instrumented. KASAN enabled toolchains should emit __asan_mem*() functions
+ * for the sites they want to instrument.
+ */
 #undef memset
 void *memset(void *addr, int c, size_t len)
 {
@@ -68,6 +74,38 @@ void *memcpy(void *dest, const void *src
 	return __memcpy(dest, src, len);
 }
+#endif
+
+void *__asan_memset(void *addr, int c, size_t len)
+{
+	if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+		return NULL;
+
+	return __memset(addr, c, len);
+}
+EXPORT_SYMBOL(__asan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__asan_memmove(void *dest, const void *src, size_t len)
+{
+	if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+	    !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+		return NULL;
+
+	return __memmove(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memmove);
+#endif
+
+void *__asan_memcpy(void *dest, const void *src, size_t len)
+{
+	if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+	    !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+		return NULL;
+
+	return __memcpy(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memcpy);

 void kasan_poison(const void *addr, size_t size, u8 value, bool init)
 {
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -956,6 +956,9 @@ static const char *uaccess_safe_builtin[
 	"__asan_store16_noabort",
 	"__kasan_check_read",
 	"__kasan_check_write",
+	"__asan_memset",
+	"__asan_memmove",
+	"__asan_memcpy",
 	/* KASAN in-line */
 	"__asan_report_load_n_noabort",
 	"__asan_report_load1_noabort",
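The crux is the SYM_FUNC_ALIAS_WEAK -> SYM_FUNC_ALIAS change. In C terms
the old arrangement was roughly the sketch below; a weak alias is
overridable by any strong definition at link time, so KASAN's interceptor
could silently capture even noinstr callers (hedged, not literal kernel
source):

void *__memcpy(void *dest, const void *src, size_t len);	/* real impl */

/* Weak alias: any strong memcpy() elsewhere (e.g. a sanitizer's
 * interceptor) wins at link time, hijacking every caller. */
void *memcpy(void *, const void *, size_t)
	__attribute__((weak, alias("__memcpy")));

With a strong alias a second strong definition becomes a link error
instead of a silent hijack, so KASAN has to rely on the compiler emitting
__asan_mem*() calls at the sites it actually wants instrumented.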
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 43/44] sched: Always inline __this_cpu_preempt_check()
vmlinux.o: warning: objtool: in_entry_stack+0x9: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: default_do_nmi+0x10: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: fpu_idle_fpregs+0x41: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: kvm_read_and_reset_apf_flags+0x1: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: lockdep_hardirqs_on+0xb0: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: lockdep_hardirqs_off+0xae: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: irqentry_nmi_enter+0x69: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: irqentry_nmi_exit+0x32: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: acpi_processor_ffh_cstate_enter+0x9: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: acpi_idle_enter+0x43: call to __this_cpu_preempt_check() leaves .noinstr.text section
vmlinux.o: warning: objtool: acpi_idle_enter_s2idle+0x45: call to __this_cpu_preempt_check() leaves .noinstr.text section

Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
---
 include/linux/percpu-defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -310,7 +310,7 @@ extern void __bad_size_call_parameter(vo
 #ifdef CONFIG_DEBUG_PREEMPT
 extern void __this_cpu_preempt_check(const char *op);
 #else
-static inline void __this_cpu_preempt_check(const char *op) { }
+static __always_inline void __this_cpu_preempt_check(const char *op) { }
 #endif

 #define __pcpu_size_call_return(stem, variable)			\
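The changelog is terse, so to spell out the reasoning: plain `static
inline` is only a hint, and under some configurations (sanitizers, low
optimization levels) the compiler may emit an out-of-line copy in plain
.text; a noinstr caller then calls out of .noinstr.text, which is exactly
what objtool flags above. A hedged sketch of the failure mode (not kernel
source):

static inline void checker(void) { }		/* may be outlined into .text */
static __always_inline void checker_fixed(void) { }	/* never outlined */

static noinstr void entry_path(void)
{
	checker();		/* if outlined: "call to checker() leaves .noinstr.text" */
	checker_fixed();	/* always folded into the caller; no warning */
}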
Peter Zijlstra
2022-Sep-19 10:00 UTC
[PATCH v2 44/44] arm64,riscv,perf: Remove RCU_NONIDLE() usage
The PM notifiers are no longer run with RCU disabled (per the previous
patches), so this hack is no longer required either.

Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
---
 drivers/perf/arm_pmu.c       | 11 +----------
 drivers/perf/riscv_pmu_sbi.c |  8 +-------
 2 files changed, 2 insertions(+), 17 deletions(-)

--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -762,17 +762,8 @@ static void cpu_pm_pmu_setup(struct arm_
 	case CPU_PM_ENTER_FAILED:
 		/*
 		 * Restore and enable the counter.
-		 * armpmu_start() indirectly calls
-		 *
-		 * perf_event_update_userpage()
-		 *
-		 * that requires RCU read locking to be functional,
-		 * wrap the call within RCU_NONIDLE to make the
-		 * RCU subsystem aware this cpu is not idle from
-		 * an RCU perspective for the armpmu_start() call
-		 * duration.
 		 */
-		RCU_NONIDLE(armpmu_start(event, PERF_EF_RELOAD));
+		armpmu_start(event, PERF_EF_RELOAD);
 		break;
 	default:
 		break;
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -747,14 +747,8 @@ static int riscv_pm_pmu_notify(struct no
 	case CPU_PM_ENTER_FAILED:
 		/*
 		 * Restore and enable the counter.
-		 *
-		 * Requires RCU read locking to be functional,
-		 * wrap the call within RCU_NONIDLE to make the
-		 * RCU subsystem aware this cpu is not idle from
-		 * an RCU perspective for the riscv_pmu_start() call
-		 * duration.
 		 */
-		RCU_NONIDLE(riscv_pmu_start(event, PERF_EF_RELOAD));
+		riscv_pmu_start(event, PERF_EF_RELOAD);
 		break;
 	default:
 		break;
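For context, RCU_NONIDLE() is roughly the wrapper below (sketched from
include/linux/rcupdate.h, following the context-tracking renames this
series is based on; not verbatim). Once the cpu_pm notifier chain runs
with RCU watching, the enter/exit pair is pure overhead:

#define RCU_NONIDLE(a)				\
	do {					\
		ct_irq_enter_irqson();	/* make RCU watch */	\
		do { a; } while (0);		\
		ct_irq_exit_irqson();		\
	} while (0)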
On Mon, Sep 19, 2022 at 12:17 PM Peter Zijlstra <peterz at infradead.org> wrote:
>
> Hi All!
>
> At long last, a respin of the cpuidle vs rcu cleanup patches.
>
> [...]

Acked-by: Rafael J. Wysocki <rafael.j.wysocki at intel.com>

for the whole set and let me know if you want me to merge any of these
through cpuidle.

Thanks!
Because Nadav asked about tracing/kprobing idle, I had another go around
and noticed not all functions calling ct_cpuidle_enter are __cpuidle.

Basically all cpuidle_driver::enter functions should be __cpuidle; I'll
do that audit shortly. For now this is ct_cpuidle_enter /
CPU_IDLE_ENTER users.

---
--- a/arch/arm/mach-imx/cpuidle-imx6q.c
+++ b/arch/arm/mach-imx/cpuidle-imx6q.c
@@ -17,8 +17,8 @@
 static int num_idle_cpus = 0;
 static DEFINE_RAW_SPINLOCK(cpuidle_lock);

-static int imx6q_enter_wait(struct cpuidle_device *dev,
-			    struct cpuidle_driver *drv, int index)
+static __cpuidle int imx6q_enter_wait(struct cpuidle_device *dev,
+				      struct cpuidle_driver *drv, int index)
 {
 	raw_spin_lock(&cpuidle_lock);
 	if (++num_idle_cpus == num_online_cpus())
--- a/arch/arm/mach-imx/cpuidle-imx6sx.c
+++ b/arch/arm/mach-imx/cpuidle-imx6sx.c
@@ -30,8 +30,8 @@ static int imx6sx_idle_finish(unsigned l
 	return 0;
 }

-static int imx6sx_enter_wait(struct cpuidle_device *dev,
-			     struct cpuidle_driver *drv, int index)
+static __cpuidle int imx6sx_enter_wait(struct cpuidle_device *dev,
+				       struct cpuidle_driver *drv, int index)
 {
 	imx6_set_lpm(WAIT_UNCLOCKED);
--- a/arch/arm/mach-omap2/omap-mpuss-lowpower.c
+++ b/arch/arm/mach-omap2/omap-mpuss-lowpower.c
@@ -224,8 +224,8 @@ static void __init save_l2x0_context(voi
  * 2 - CPUx L1 and logic lost + GIC lost: MPUSS OSWR
  * 3 - CPUx L1 and logic lost + GIC + L2 lost: DEVICE OFF
  */
-int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state,
-			 bool rcuidle)
+__cpuidle int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state,
+				   bool rcuidle)
 {
 	struct omap4_cpu_pm_info *pm_info = &per_cpu(omap4_pm_info, cpu);
 	unsigned int save_state = 0, cpu_logic_state = PWRDM_POWER_RET;
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -175,7 +175,7 @@ static int omap34xx_do_sram_idle(unsigne
 	return 0;
 }

-void omap_sram_idle(bool rcuidle)
+__cpuidle void omap_sram_idle(bool rcuidle)
 {
 	/* Variable to tell what needs to be saved and restored
 	 * in omap_sram_idle*/
--- a/arch/arm64/kernel/cpuidle.c
+++ b/arch/arm64/kernel/cpuidle.c
@@ -62,7 +62,7 @@ int acpi_processor_ffh_lpi_probe(unsigne
 	return psci_acpi_cpu_init_idle(cpu);
 }

-int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
+__cpuidle int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
 {
 	u32 state = lpi->address;
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -31,8 +31,8 @@
  * Called from the CPUidle framework to program the device to the
  * specified target state selected by the governor.
  */
-static int arm_enter_idle_state(struct cpuidle_device *dev,
-				struct cpuidle_driver *drv, int idx)
+static __cpuidle int arm_enter_idle_state(struct cpuidle_device *dev,
+					  struct cpuidle_driver *drv, int idx)
 {
 	/*
 	 * Pass idle state index to arm_cpuidle_suspend which in turn
--- a/drivers/cpuidle/cpuidle-big_little.c
+++ b/drivers/cpuidle/cpuidle-big_little.c
@@ -122,8 +122,8 @@ static int notrace bl_powerdown_finisher
  * Called from the CPUidle framework to program the device to the
  * specified target state selected by the governor.
  */
-static int bl_enter_powerdown(struct cpuidle_device *dev,
-			      struct cpuidle_driver *drv, int idx)
+static __cpuidle int bl_enter_powerdown(struct cpuidle_device *dev,
+					struct cpuidle_driver *drv, int idx)
 {
 	cpu_pm_enter();
 	ct_cpuidle_enter();
--- a/drivers/cpuidle/cpuidle-mvebu-v7.c
+++ b/drivers/cpuidle/cpuidle-mvebu-v7.c
@@ -25,9 +25,9 @@

 static int (*mvebu_v7_cpu_suspend)(int);

-static int mvebu_v7_enter_idle(struct cpuidle_device *dev,
-			       struct cpuidle_driver *drv,
-			       int index)
+static __cpuidle int mvebu_v7_enter_idle(struct cpuidle_device *dev,
+					 struct cpuidle_driver *drv,
+					 int index)
 {
 	int ret;
 	bool deepidle = false;
--- a/drivers/cpuidle/cpuidle-psci.c
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -49,14 +49,9 @@ static inline u32 psci_get_domain_state(
 	return __this_cpu_read(domain_state);
 }

-static inline int psci_enter_state(int idx, u32 state)
-{
-	return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter, idx, state);
-}
-
-static int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
-					  struct cpuidle_driver *drv, int idx,
-					  bool s2idle)
+static __cpuidle int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
+						    struct cpuidle_driver *drv, int idx,
+						    bool s2idle)
 {
 	struct psci_cpuidle_data *data = this_cpu_ptr(&psci_cpuidle_data);
 	u32 *states = data->psci_states;
@@ -192,12 +187,12 @@ static void psci_idle_init_cpuhp(void)
 		pr_warn("Failed %d while setup cpuhp state\n", err);
 }

-static int psci_enter_idle_state(struct cpuidle_device *dev,
-				 struct cpuidle_driver *drv, int idx)
+static __cpuidle int psci_enter_idle_state(struct cpuidle_device *dev,
+					   struct cpuidle_driver *drv, int idx)
 {
 	u32 *state = __this_cpu_read(psci_cpuidle_data.psci_states);

-	return psci_enter_state(idx, state[idx]);
+	return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter, idx, state[idx]);
 }

 static const struct of_device_id psci_idle_state_match[] = {
--- a/drivers/cpuidle/cpuidle-qcom-spm.c
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -58,8 +58,8 @@ static int qcom_cpu_spc(struct spm_drive
 	return ret;
 }

-static int spm_enter_idle_state(struct cpuidle_device *dev,
-				struct cpuidle_driver *drv, int idx)
+static __cpuidle int spm_enter_idle_state(struct cpuidle_device *dev,
+					  struct cpuidle_driver *drv, int idx)
 {
 	struct cpuidle_qcom_spm_data *data = container_of(drv, struct cpuidle_qcom_spm_data,
 							  cpuidle_driver);
--- a/drivers/cpuidle/cpuidle-riscv-sbi.c
+++ b/drivers/cpuidle/cpuidle-riscv-sbi.c
@@ -93,17 +93,17 @@ static int sbi_suspend(u32 state)
 	return sbi_suspend_finisher(state, 0, 0);
 }

-static int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
-				   struct cpuidle_driver *drv, int idx)
+static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
+					     struct cpuidle_driver *drv, int idx)
 {
 	u32 *states = __this_cpu_read(sbi_cpuidle_data.states);

 	return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, states[idx]);
 }

-static int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
-					 struct cpuidle_driver *drv, int idx,
-					 bool s2idle)
+static __cpuidle int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
+						   struct cpuidle_driver *drv, int idx,
+						   bool s2idle)
 {
 	struct sbi_cpuidle_data *data = this_cpu_ptr(&sbi_cpuidle_data);
 	u32 *states = data->states;
--- a/drivers/cpuidle/cpuidle-tegra.c
+++ b/drivers/cpuidle/cpuidle-tegra.c
@@ -160,8 +160,8 @@ static int tegra_cpuidle_coupled_barrier
 	return 0;
 }

-static int tegra_cpuidle_state_enter(struct cpuidle_device *dev,
-				     int index, unsigned int cpu)
+static __cpuidle int tegra_cpuidle_state_enter(struct cpuidle_device *dev,
+					       int index, unsigned int cpu)
 {
 	int err;

@@ -226,9 +226,9 @@ static int tegra_cpuidle_adjust_state_in
 	return index;
 }

-static int tegra_cpuidle_enter(struct cpuidle_device *dev,
-			       struct cpuidle_driver *drv,
-			       int index)
+static __cpuidle int tegra_cpuidle_enter(struct cpuidle_device *dev,
+					 struct cpuidle_driver *drv,
+					 int index)
 {
 	bool do_rcu = drv->states[index].flags & CPUIDLE_FLAG_RCU_IDLE;
 	unsigned int cpu = cpu_logical_map(dev->cpu);
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -137,11 +137,13 @@ int cpuidle_find_deepest_state(struct cp
 }

 #ifdef CONFIG_SUSPEND
-static void enter_s2idle_proper(struct cpuidle_driver *drv,
-				struct cpuidle_device *dev, int index)
+static __cpuidle void enter_s2idle_proper(struct cpuidle_driver *drv,
+					  struct cpuidle_device *dev, int index)
 {
-	ktime_t time_start, time_end;
 	struct cpuidle_state *target_state = &drv->states[index];
+	ktime_t time_start, time_end;
+
+	instrumentation_begin();

 	time_start = ns_to_ktime(local_clock());

@@ -152,13 +154,18 @@ static void enter_s2idle_proper(struct c
 	 * suspended is generally unsafe.
 	 */
 	stop_critical_timings();
-	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
 		ct_cpuidle_enter();
+		/* Annotate away the indirect call */
+		instrumentation_begin();
+	}
 	target_state->enter_s2idle(dev, drv, index);
 	if (WARN_ON_ONCE(!irqs_disabled()))
 		raw_local_irq_disable();
-	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+		instrumentation_end();
 		ct_cpuidle_exit();
+	}
 	tick_unfreeze();
 	start_critical_timings();

@@ -166,6 +173,7 @@ static void enter_s2idle_proper(struct c
 	dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start);
 	dev->states_usage[index].s2idle_usage++;
+	instrumentation_end();
 }

 /**
@@ -200,8 +208,9 @@ int cpuidle_enter_s2idle(struct cpuidle_
  * @drv: cpuidle driver for this cpu
  * @index: index into the states table in @drv of the state to enter
  */
-int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
-			int index)
+__cpuidle int cpuidle_enter_state(struct cpuidle_device *dev,
+				  struct cpuidle_driver *drv,
+				  int index)
 {
 	int entered_state;

@@ -209,6 +218,8 @@ int cpuidle_enter_state(struct cpuidle_d
 	bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP);
 	ktime_t time_start, time_end;

+	instrumentation_begin();
+
 	/*
	 * Tell the time framework to switch to a broadcast timer because our
	 * local timer will be shut down. If a local timer is used from another
@@ -235,15 +246,21 @@ int cpuidle_enter_state(struct cpuidle_d
 	time_start = ns_to_ktime(local_clock());

 	stop_critical_timings();
-	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
 		ct_cpuidle_enter();
+		/* Annotate away the indirect call */
+		instrumentation_begin();
+	}

 	entered_state = target_state->enter(dev, drv, index);
+
 	if (WARN_ONCE(!irqs_disabled(), "%ps leaked IRQ state", target_state->enter))
 		raw_local_irq_disable();

-	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+		instrumentation_end();
 		ct_cpuidle_exit();
+	}
 	start_critical_timings();

 	sched_clock_idle_wakeup_event();
@@ -306,6 +323,8 @@ int cpuidle_enter_state(struct cpuidle_d
 		dev->states_usage[index].rejected++;
 	}

+	instrumentation_end();
+
 	return entered_state;
 }
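For reference, after this series __cpuidle is no longer a bare section
placement hint but a noinstr-style annotation. Abridged and approximate,
from include/linux/compiler_types.h:

#define __noinstr_section(section)				\
	noinline notrace __section(section)			\
	__no_kcsan __no_sanitize_address

#define noinstr		__noinstr_section(".noinstr.text")
#define __cpuidle	__noinstr_section(".cpuidle.text")

That is why the enter callbacks need the annotation: anything reachable
between ct_cpuidle_enter() and ct_cpuidle_exit() must be invisible to
tracers and kprobes, or instrumentation may run while RCU is not watching.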
On Mon, 19 Sept 2022 at 12:18, Peter Zijlstra <peterz at infradead.org> wrote:
>
> Hi All!
>
> At long last, a respin of the cpuidle vs rcu cleanup patches.
>
> [...]

Thanks for cleaning up the situation! I have applied this on a plain
v6.0 (only one patch had a minor conflict) and tested this on an ARM64
Dragonboard 410c, which uses cpuidle-psci and the cpuidle-psci-domain.

I didn't observe any problems, so feel free to add:

Tested-by: Ulf Hansson <ulf.hansson at linaro.org>

Kind regards
Uffe