0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat. These patches appear to cure this, the ftrace selftest now runs to completion without spamming scary messages to dmesg. Since v1: - fixed recursive RCU splats - fixed psci thingies for arm (null) - improved the tracing WARN (rostedt) - fixed TRACE_PREEMPT_TOGGLE (null) --- arch/x86/include/asm/atomic64_32.h | 44 +++++++++++++++++++------------------- arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++---------------- arch/x86/include/asm/kvmclock.h | 2 +- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/pvclock.h | 3 ++- arch/x86/kernel/cpu/vmware.c | 2 +- arch/x86/kernel/kvmclock.c | 6 +++--- arch/x86/kernel/pvclock.c | 22 +++++++++++++------ arch/x86/kernel/tsc.c | 7 +++--- arch/x86/xen/time.c | 12 +++++++++-- drivers/cpuidle/cpuidle.c | 2 +- drivers/cpuidle/poll_state.c | 2 -- drivers/firmware/psci/psci.c | 31 ++++++++++++++++----------- include/linux/context_tracking.h | 27 +++++++++++++++++++++++ include/linux/math64.h | 4 ++-- include/linux/sched/clock.h | 8 +++---- include/linux/trace_recursion.h | 18 ++++++++++++++++ kernel/locking/lockdep.c | 3 +++ kernel/panic.c | 5 +++++ kernel/sched/clock.c | 27 +++++++++++++++++------ kernel/trace/trace_preemptirq.c | 6 ++---- lib/bug.c | 15 ++++++++++++- 22 files changed, 192 insertions(+), 92 deletions(-)
Peter Zijlstra
2023-Jan-26 15:08 UTC
[PATCH v2 1/9] drivers: firmware: psci: Dont instrument suspend code
From: Mark Rutland <mark.rutland at arm.com> The PSCI suspend code is currently instrumentable, which is not safe as instrumentation (e.g. ftrace) may try to make use of RCU during idle periods when RCU is not watching. To fix this we need to ensure that psci_suspend_finisher() and anything it calls are not instrumented. We can do this fairly simply by marking psci_suspend_finisher() and the psci*_cpu_suspend() functions as noinstr, and the underlying helper functions as __always_inline. When CONFIG_DEBUG_VIRTUAL=y, __pa_symbol() can expand to an out-of-line instrumented function, so we must use __pa_symbol_nodebug() within psci_suspend_finisher(). The raw SMCCC invocation functions are written in assembly, and are not subject to compiler instrumentation. Signed-off-by: Mark Rutland <mark.rutland at arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- drivers/firmware/psci/psci.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -108,9 +108,10 @@ bool psci_power_state_is_valid(u32 state return !(state & ~valid_mask); } -static unsigned long __invoke_psci_fn_hvc(unsigned long function_id, - unsigned long arg0, unsigned long arg1, - unsigned long arg2) +static __always_inline unsigned long +__invoke_psci_fn_hvc(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2) { struct arm_smccc_res res; @@ -118,9 +119,10 @@ static unsigned long __invoke_psci_fn_hv return res.a0; } -static unsigned long __invoke_psci_fn_smc(unsigned long function_id, - unsigned long arg0, unsigned long arg1, - unsigned long arg2) +static __always_inline unsigned long +__invoke_psci_fn_smc(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2) { struct arm_smccc_res res; @@ -128,7 +130,7 @@ static unsigned long __invoke_psci_fn_sm return res.a0; } -static int psci_to_linux_errno(int errno) +static __always_inline int psci_to_linux_errno(int errno) { switch (errno) { case PSCI_RET_SUCCESS: @@ -169,7 +171,8 @@ int psci_set_osi_mode(bool enable) return psci_to_linux_errno(err); } -static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) +static __always_inline int +__psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) { int err; @@ -177,13 +180,15 @@ static int __psci_cpu_suspend(u32 fn, u3 return psci_to_linux_errno(err); } -static int psci_0_1_cpu_suspend(u32 state, unsigned long entry_point) +static __always_inline int +psci_0_1_cpu_suspend(u32 state, unsigned long entry_point) { return __psci_cpu_suspend(psci_0_1_function_ids.cpu_suspend, state, entry_point); } -static int psci_0_2_cpu_suspend(u32 state, unsigned long entry_point) +static __always_inline int +psci_0_2_cpu_suspend(u32 state, unsigned long entry_point) { return __psci_cpu_suspend(PSCI_FN_NATIVE(0_2, CPU_SUSPEND), state, entry_point); @@ -447,10 +452,12 @@ late_initcall(psci_debugfs_init) #endif #ifdef CONFIG_CPU_IDLE -static int psci_suspend_finisher(unsigned long state) +static noinstr int psci_suspend_finisher(unsigned long state) { u32 power_state = state; - phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume); + phys_addr_t pa_cpu_resume; + + pa_cpu_resume = __pa_symbol_nodebug((unsigned long)cpu_resume); return psci_ops.cpu_suspend(power_state, pa_cpu_resume); }
Peter Zijlstra
2023-Jan-26 15:08 UTC
[PATCH v2 2/9] bug: Disable rcu_is_watching() during WARN/BUG
In order to avoid WARN/BUG from generating nested or even recursive warnings, force rcu_is_watching() true during WARN/lockdep_rcu_suspicious(). Notably things like unwinding the stack can trigger rcu_dereference() warnings, which then triggers more unwinding which then triggers more warnings etc.. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- include/linux/context_tracking.h | 27 +++++++++++++++++++++++++++ kernel/locking/lockdep.c | 3 +++ kernel/panic.c | 5 +++++ lib/bug.c | 15 ++++++++++++++- 4 files changed, 49 insertions(+), 1 deletion(-) --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -130,9 +130,36 @@ static __always_inline unsigned long ct_ return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state)); } +static __always_inline bool warn_rcu_enter(void) +{ + bool ret = false; + + /* + * Horrible hack to shut up recursive RCU isn't watching fail since + * lots of the actual reporting also relies on RCU. + */ + preempt_disable_notrace(); + if (rcu_dynticks_curr_cpu_in_eqs()) { + ret = true; + ct_state_inc(RCU_DYNTICKS_IDX); + } + + return ret; +} + +static __always_inline void warn_rcu_exit(bool rcu) +{ + if (rcu) + ct_state_inc(RCU_DYNTICKS_IDX); + preempt_enable_notrace(); +} + #else static inline void ct_idle_enter(void) { } static inline void ct_idle_exit(void) { } + +static __always_inline bool warn_rcu_enter(void) { return false; } +static __always_inline void warn_rcu_exit(bool rcu) { } #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */ #endif --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@ #include <linux/rcupdate.h> #include <linux/kprobes.h> #include <linux/lockdep.h> +#include <linux/context_tracking.h> #include <asm/sections.h> @@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char * { struct task_struct *curr = current; int dl = READ_ONCE(debug_locks); + bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. */ pr_warn("\n"); @@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char * lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ #include <linux/ratelimit.h> #include <linux/debugfs.h> #include <linux/sysfs.h> +#include <linux/context_tracking.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -679,6 +680,7 @@ void __warn(const char *file, int line, void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { + bool rcu = warn_rcu_enter(); struct warn_args args; pr_warn(CUT_HERE); @@ -693,11 +695,13 @@ void warn_slowpath_fmt(const char *file, va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) { + bool rcu = warn_rcu_enter(); va_list args; pr_warn(CUT_HERE); @@ -705,6 +709,7 @@ void __warn_printk(const char *fmt, ...) va_start(args, fmt); vprintk(fmt, args); va_end(args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(__warn_printk); #endif --- a/lib/bug.c +++ b/lib/bug.c @@ -47,6 +47,7 @@ #include <linux/sched.h> #include <linux/rculist.h> #include <linux/ftrace.h> +#include <linux/context_tracking.h> extern struct bug_entry __start___bug_table[], __stop___bug_table[]; @@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long return module_find_bug(bugaddr); } -enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) +static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs) { struct bug_entry *bug; const char *file; @@ -209,6 +210,18 @@ enum bug_trap_type report_bug(unsigned l return BUG_TRAP_TYPE_BUG; } +enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) +{ + enum bug_trap_type ret; + bool rcu = false; + + rcu = warn_rcu_enter(); + ret = __report_bug(bugaddr, regs); + warn_rcu_exit(rcu); + + return ret; +} + static void clear_once_table(struct bug_entry *start, struct bug_entry *end) { struct bug_entry *bug;
Peter Zijlstra
2023-Jan-26 15:08 UTC
[PATCH v2 3/9] tracing: Warn about !rcu_is_watching()
When using noinstr, WARN when tracing hits when RCU is disabled. Suggested-by: Steven Rostedt (Google) <rostedt at goodmis.org> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- include/linux/trace_recursion.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsi # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +# define trace_warn_on_no_rcu(ip) \ + ({ \ + bool __ret = !rcu_is_watching(); \ + if (__ret && !trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \ + trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \ + WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \ + trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \ + } \ + __ret; \ + }) +#else +# define trace_warn_on_no_rcu(ip) false +#endif + /* * Preemption is promised to be disabled when return bit >= 0. */ @@ -144,6 +159,9 @@ static __always_inline int trace_test_an unsigned int val = READ_ONCE(current->trace_recursion); int bit; + if (trace_warn_on_no_rcu(ip)) + return -1; + bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /*
Peter Zijlstra
2023-Jan-26 15:08 UTC
[PATCH v2 4/9] tracing, preempt: Squash _rcuidle tracing
Extend commit 9aedeaed6fc6 ("tracing, hardirq: No moar _rcuidle() tracing") to also cover trace_preempt_{on,off}() which were mysteriously untouched. Fixes: 9aedeaed6fc6 ("tracing, hardirq: No moar _rcuidle() tracing") Reported-by: Mark Rutland <mark.rutland at arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Tested-by: Mark Rutland <mark.rutland at arm.com> Link: https://lkml.kernel.org/r/Y9D5AfnOukWNOZ5q at hirez.programming.kicks-ass.net --- diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index f992444a0b1f..ea96b41c8838 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -100,15 +100,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); void trace_preempt_on(unsigned long a0, unsigned long a1) { - if (!in_nmi()) - trace_preempt_enable_rcuidle(a0, a1); + trace(preempt_enable)(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - if (!in_nmi()) - trace_preempt_disable_rcuidle(a0, a1); + trace(preempt_disable)(a0, a1); tracer_preempt_off(a0, a1); } #endif
As already done for regular arch_atomic, always inline arch_atomic64. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/include/asm/atomic64_32.h | 44 ++++++++++++++++++------------------- arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++--------------- 2 files changed, 40 insertions(+), 40 deletions(-) --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -71,7 +71,7 @@ ATOMIC64_DECL(add_unless); * the old value. */ -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { return arch_cmpxchg64(&v->counter, o, n); } @@ -85,7 +85,7 @@ static inline s64 arch_atomic64_cmpxchg( * Atomically xchgs the value of @v to @n and returns * the old value. */ -static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) +static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { s64 o; unsigned high = (unsigned)(n >> 32); @@ -104,7 +104,7 @@ static inline s64 arch_atomic64_xchg(ato * * Atomically sets the value of @v to @n. */ -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; @@ -119,7 +119,7 @@ static inline void arch_atomic64_set(ato * * Atomically reads the value of @v and returns it. */ -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { s64 r; alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); @@ -133,7 +133,7 @@ static inline s64 arch_atomic64_read(con * * Atomically adds @i to @v and returns @i + *@v */ -static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -145,7 +145,7 @@ static inline s64 arch_atomic64_add_retu /* * Other variants with different arithmetic operators: */ -static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -154,7 +154,7 @@ static inline s64 arch_atomic64_sub_retu } #define arch_atomic64_sub_return arch_atomic64_sub_return -static inline s64 arch_atomic64_inc_return(atomic64_t *v) +static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) { s64 a; alternative_atomic64(inc_return, "=&A" (a), @@ -163,7 +163,7 @@ static inline s64 arch_atomic64_inc_retu } #define arch_atomic64_inc_return arch_atomic64_inc_return -static inline s64 arch_atomic64_dec_return(atomic64_t *v) +static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) { s64 a; alternative_atomic64(dec_return, "=&A" (a), @@ -179,7 +179,7 @@ static inline s64 arch_atomic64_dec_retu * * Atomically adds @i to @v. */ -static inline s64 arch_atomic64_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -194,7 +194,7 @@ static inline s64 arch_atomic64_add(s64 * * Atomically subtracts @i from @v. */ -static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -208,7 +208,7 @@ static inline s64 arch_atomic64_sub(s64 * * Atomically increments @v by 1. */ -static inline void arch_atomic64_inc(atomic64_t *v) +static __always_inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); @@ -221,7 +221,7 @@ static inline void arch_atomic64_inc(ato * * Atomically decrements @v by 1. */ -static inline void arch_atomic64_dec(atomic64_t *v) +static __always_inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); @@ -237,7 +237,7 @@ static inline void arch_atomic64_dec(ato * Atomically adds @a to @v, so long as it was not @u. * Returns non-zero if the add was done, zero otherwise. */ -static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) +static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); @@ -248,7 +248,7 @@ static inline int arch_atomic64_add_unle } #define arch_atomic64_add_unless arch_atomic64_add_unless -static inline int arch_atomic64_inc_not_zero(atomic64_t *v) +static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; alternative_atomic64(inc_not_zero, "=&a" (r), @@ -257,7 +257,7 @@ static inline int arch_atomic64_inc_not_ } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero -static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) +static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { s64 r; alternative_atomic64(dec_if_positive, "=&A" (r), @@ -269,7 +269,7 @@ static inline s64 arch_atomic64_dec_if_p #undef alternative_atomic64 #undef __alternative_atomic64 -static inline void arch_atomic64_and(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -277,7 +277,7 @@ static inline void arch_atomic64_and(s64 c = old; } -static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -288,7 +288,7 @@ static inline s64 arch_atomic64_fetch_an } #define arch_atomic64_fetch_and arch_atomic64_fetch_and -static inline void arch_atomic64_or(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -296,7 +296,7 @@ static inline void arch_atomic64_or(s64 c = old; } -static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -307,7 +307,7 @@ static inline s64 arch_atomic64_fetch_or } #define arch_atomic64_fetch_or arch_atomic64_fetch_or -static inline void arch_atomic64_xor(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -315,7 +315,7 @@ static inline void arch_atomic64_xor(s64 c = old; } -static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -326,7 +326,7 @@ static inline s64 arch_atomic64_fetch_xo } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { s64 old, c = 0; --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -17,7 +17,7 @@ * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return __READ_ONCE((v)->counter); } @@ -29,7 +29,7 @@ static inline s64 arch_atomic64_read(con * * Atomically sets the value of @v to @i. */ -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { __WRITE_ONCE(v->counter, i); } @@ -55,7 +55,7 @@ static __always_inline void arch_atomic6 * * Atomically subtracts @i from @v. */ -static inline void arch_atomic64_sub(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) @@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(s64 * true if the result is zero, or false for all * other cases. */ -static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) +static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } @@ -113,7 +113,7 @@ static __always_inline void arch_atomic6 * returns true if the result is 0, or false for all other * cases. */ -static inline bool arch_atomic64_dec_and_test(atomic64_t *v) +static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } @@ -127,7 +127,7 @@ static inline bool arch_atomic64_dec_and * and returns true if the result is zero, or false for all * other cases. */ -static inline bool arch_atomic64_inc_and_test(atomic64_t *v) +static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } @@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) +static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } @@ -161,25 +161,25 @@ static __always_inline s64 arch_atomic64 } #define arch_atomic64_add_return arch_atomic64_add_return -static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { return arch_atomic64_add_return(-i, v); } #define arch_atomic64_sub_return arch_atomic64_sub_return -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) { return xadd(&v->counter, -i); } #define arch_atomic64_fetch_sub arch_atomic64_fetch_sub -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } @@ -191,13 +191,13 @@ static __always_inline bool arch_atomic6 } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg -static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) +static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, new); } #define arch_atomic64_xchg arch_atomic64_xchg -static inline void arch_atomic64_and(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "andq %1,%0" : "+m" (v->counter) @@ -205,7 +205,7 @@ static inline void arch_atomic64_and(s64 : "memory"); } -static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -215,7 +215,7 @@ static inline s64 arch_atomic64_fetch_an } #define arch_atomic64_fetch_and arch_atomic64_fetch_and -static inline void arch_atomic64_or(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "orq %1,%0" : "+m" (v->counter) @@ -223,7 +223,7 @@ static inline void arch_atomic64_or(s64 : "memory"); } -static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -233,7 +233,7 @@ static inline s64 arch_atomic64_fetch_or } #define arch_atomic64_fetch_or arch_atomic64_fetch_or -static inline void arch_atomic64_xor(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "xorq %1,%0" : "+m" (v->counter) @@ -241,7 +241,7 @@ static inline void arch_atomic64_xor(s64 : "memory"); } -static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v);
Peter Zijlstra
2023-Jan-26 15:08 UTC
[PATCH v2 6/9] x86/pvclock: improve atomic update of last_value in pvclock_clocksource_read
From: Uros Bizjak <ubizjak at gmail.com> Improve atomic update of last_value in pvclock_clocksource_read: - Atomic update can be skipped if the "last_value" is already equal to "ret". - The detection of atomic update failure is not correct. The value, returned by atomic64_cmpxchg should be compared to the old value from the location to be updated. If these two are the same, then atomic update succeeded and "last_value" location is updated to "ret" in an atomic way. Otherwise, the atomic update failed and it should be retried with the value from "last_value" - exactly what atomic64_try_cmpxchg does in a correct and more optimal way. Signed-off-by: Uros Bizjak <ubizjak at gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Link: https://lkml.kernel.org/r/20230118202330.3740-1-ubizjak at gmail.com --- arch/x86/kernel/pvclock.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index eda37df016f0..5a2a517dd61b 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -102,10 +102,9 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) */ last = atomic64_read(&last_value); do { - if (ret < last) + if (ret <= last) return last; - last = atomic64_cmpxchg(&last_value, last, ret); - } while (unlikely(last != ret)); + } while (!atomic64_try_cmpxchg(&last_value, &last, ret)); return ret; } -- 2.39.0
In order to use sched_clock() from noinstr code, mark it and all it's implenentations noinstr. The whole pvclock thing (used by KVM/Xen) is a bit of a pain, since it calls out to watchdogs, create a pvclock_clocksource_read_nowd() variant doesn't do that and can be noinstr. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/include/asm/kvmclock.h | 2 +- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/pvclock.h | 3 ++- arch/x86/kernel/cpu/vmware.c | 2 +- arch/x86/kernel/kvmclock.c | 6 +++--- arch/x86/kernel/pvclock.c | 19 +++++++++++++++---- arch/x86/kernel/tsc.c | 7 +++---- arch/x86/xen/time.c | 12 ++++++++++-- include/linux/math64.h | 4 ++-- 9 files changed, 38 insertions(+), 19 deletions(-) --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -8,7 +8,7 @@ extern struct clocksource kvm_clock; DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); -static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { return &this_cpu_read(hv_clock_per_cpu)->pvti; } --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -26,7 +26,7 @@ DECLARE_STATIC_CALL(pv_sched_clock, dumm void paravirt_set_sched_clock(u64 (*func)(void)); -static inline u64 paravirt_sched_clock(void) +static __always_inline u64 paravirt_sched_clock(void) { return static_call(pv_sched_clock)(); } --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -7,6 +7,7 @@ /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); @@ -39,7 +40,7 @@ bool pvclock_read_retry(const struct pvc * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. */ -static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) { u64 product; #ifdef __i386__ --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -143,7 +143,7 @@ static __init int parse_no_stealacc(char } early_param("no-steal-acc", parse_no_stealacc); -static unsigned long long notrace vmware_sched_clock(void) +static noinstr u64 vmware_sched_clock(void) { unsigned long long ns; --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -71,12 +71,12 @@ static int kvm_set_wallclock(const struc return -ENODEV; } -static u64 kvm_clock_read(void) +static noinstr u64 kvm_clock_read(void) { u64 ret; preempt_disable_notrace(); - ret = pvclock_clocksource_read(this_cpu_pvti()); + ret = pvclock_clocksource_read_nowd(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -86,7 +86,7 @@ static u64 kvm_clock_get_cycles(struct c return kvm_clock_read(); } -static u64 kvm_sched_clock_read(void) +static noinstr u64 kvm_sched_clock_read(void) { return kvm_clock_read() - kvm_sched_clock_offset; } --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -64,7 +64,8 @@ u8 pvclock_read_flags(struct pvclock_vcp return flags & valid_flags; } -u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +static __always_inline +u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd) { unsigned version; u64 ret; @@ -77,7 +78,7 @@ u64 pvclock_clocksource_read(struct pvcl flags = src->flags; } while (pvclock_read_retry(src, version)); - if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { + if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { src->flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); } @@ -100,15 +101,25 @@ u64 pvclock_clocksource_read(struct pvcl * updating at the same time, and one of them could be slightly behind, * making the assumption that last_value always go forward fail to hold. */ - last = atomic64_read(&last_value); + last = arch_atomic64_read(&last_value); do { if (ret <= last) return last; - } while (!atomic64_try_cmpxchg(&last_value, &last, ret)); + } while (!arch_atomic64_try_cmpxchg(&last_value, &last, ret)); return ret; } +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + return __pvclock_clocksource_read(src, true); +} + +noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src) +{ + return __pvclock_clocksource_read(src, false); +} + void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, struct pvclock_vcpu_time_info *vcpu_time, struct timespec64 *ts) --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -215,7 +215,7 @@ static void __init cyc2ns_init_secondary /* * Scheduler clock - returns current time in nanosec units. */ -u64 native_sched_clock(void) +noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); @@ -248,7 +248,7 @@ u64 native_sched_clock_from_tsc(u64 tsc) /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) +noinstr u64 sched_clock(void) { return paravirt_sched_clock(); } @@ -258,8 +258,7 @@ bool using_native_sched_clock(void) return static_call_query(pv_sched_clock) == native_sched_clock; } #else -unsigned long long -sched_clock(void) __attribute__((alias("native_sched_clock"))); +u64 sched_clock(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -60,9 +60,17 @@ static u64 xen_clocksource_get_cycles(st return xen_clocksource_read(); } -static u64 xen_sched_clock(void) +static noinstr u64 xen_sched_clock(void) { - return xen_clocksource_read() - xen_sched_clock_offset; + struct pvclock_vcpu_time_info *src; + u64 ret; + + preempt_disable_notrace(); + src = &__this_cpu_read(xen_vcpu)->time; + ret = pvclock_clocksource_read_nowd(src); + ret -= xen_sched_clock_offset; + preempt_enable_notrace(); + return ret; } static void xen_read_wallclock(struct timespec64 *ts) --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -161,7 +161,7 @@ static inline u64 mul_u32_u32(u32 a, u32 #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr -static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } @@ -177,7 +177,7 @@ static inline u64 mul_u64_u64_shr(u64 a, #else #ifndef mul_u64_u32_shr -static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah, al; u64 ret;
Peter Zijlstra
2023-Jan-26 15:08 UTC
[PATCH v2 8/9] sched/clock: Make local_clock() noinstr
With sched_clock() noinstr, provide a noinstr implementation of local_clock(). Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- include/linux/sched/clock.h | 8 +++----- kernel/sched/clock.c | 27 +++++++++++++++++++++------ 2 files changed, 24 insertions(+), 11 deletions(-) --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -45,7 +45,7 @@ static inline u64 cpu_clock(int cpu) return sched_clock(); } -static inline u64 local_clock(void) +static __always_inline u64 local_clock(void) { return sched_clock(); } @@ -79,10 +79,8 @@ static inline u64 cpu_clock(int cpu) return sched_clock_cpu(cpu); } -static inline u64 local_clock(void) -{ - return sched_clock_cpu(raw_smp_processor_id()); -} +extern u64 local_clock(void); + #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -93,7 +93,7 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -notrace static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } @@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -notrace static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -notrace static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -notrace static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -287,13 +287,28 @@ notrace static u64 sched_clock_local(str clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) + if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -notrace static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock(void) +{ + u64 clock; + + if (static_branch_likely(&__sched_clock_stable)) + return sched_clock() + __sched_clock_offset; + + preempt_disable_notrace(); + clock = sched_clock_local(this_scd()); + preempt_enable_notrace(); + + return clock; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock;
Peter Zijlstra
2023-Jan-26 15:08 UTC
[PATCH v2 9/9] cpuidle: Fix poll_idle() noinstr annotation
The instrumentation_begin()/end() annotations in poll_idle() were complete nonsense. Specifically they caused tracing to happen in the middle of noinstr code, resulting in RCU splats. Now that local_clock() is noinstr, mark up the rest and let it rip. Fixes: 00717eb8c955 ("cpuidle: Annotate poll_idle()") Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reported-by: kernel test robot <oliver.sang at intel.com> Acked-by: Rafael J. Wysocki <rafael.j.wysocki at intel.com> Link: https://lore.kernel.org/oe-lkp/202301192148.58ece903-oliver.sang at intel.com --- drivers/cpuidle/cpuidle.c | 2 +- drivers/cpuidle/poll_state.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -426,7 +426,7 @@ void cpuidle_reflect(struct cpuidle_devi * @dev: the cpuidle device * */ -u64 cpuidle_poll_time(struct cpuidle_driver *drv, +__cpuidle u64 cpuidle_poll_time(struct cpuidle_driver *drv, struct cpuidle_device *dev) { int i; --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -15,7 +15,6 @@ static int __cpuidle poll_idle(struct cp { u64 time_start; - instrumentation_begin(); time_start = local_clock(); dev->poll_time_limit = false; @@ -42,7 +41,6 @@ static int __cpuidle poll_idle(struct cp raw_local_irq_disable(); current_clr_polling(); - instrumentation_end(); return index; }