0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat. These patches appear to cure this, the ftrace selftest now runs to completion without spamming scary messages to dmesg. --- arch/x86/include/asm/atomic64_32.h | 44 +++++++++++++++++++------------------- arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++---------------- arch/x86/include/asm/kvmclock.h | 2 +- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/pvclock.h | 3 ++- arch/x86/kernel/cpu/vmware.c | 2 +- arch/x86/kernel/ftrace.c | 3 +++ arch/x86/kernel/kvmclock.c | 6 +++--- arch/x86/kernel/pvclock.c | 22 +++++++++++++------ arch/x86/kernel/tsc.c | 7 +++--- arch/x86/xen/time.c | 12 +++++++++-- drivers/cpuidle/cpuidle.c | 2 +- drivers/cpuidle/poll_state.c | 2 -- include/linux/math64.h | 4 ++-- include/linux/sched/clock.h | 8 +++---- kernel/sched/clock.c | 27 +++++++++++++++++------ 16 files changed, 107 insertions(+), 75 deletions(-)
As already done for regular arch_atomic, always inline arch_atomic64. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/include/asm/atomic64_32.h | 44 ++++++++++++++++++------------------- arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++--------------- 2 files changed, 40 insertions(+), 40 deletions(-) --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -71,7 +71,7 @@ ATOMIC64_DECL(add_unless); * the old value. */ -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { return arch_cmpxchg64(&v->counter, o, n); } @@ -85,7 +85,7 @@ static inline s64 arch_atomic64_cmpxchg( * Atomically xchgs the value of @v to @n and returns * the old value. */ -static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) +static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { s64 o; unsigned high = (unsigned)(n >> 32); @@ -104,7 +104,7 @@ static inline s64 arch_atomic64_xchg(ato * * Atomically sets the value of @v to @n. */ -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; @@ -119,7 +119,7 @@ static inline void arch_atomic64_set(ato * * Atomically reads the value of @v and returns it. */ -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { s64 r; alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); @@ -133,7 +133,7 @@ static inline s64 arch_atomic64_read(con * * Atomically adds @i to @v and returns @i + *@v */ -static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -145,7 +145,7 @@ static inline s64 arch_atomic64_add_retu /* * Other variants with different arithmetic operators: */ -static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -154,7 +154,7 @@ static inline s64 arch_atomic64_sub_retu } #define arch_atomic64_sub_return arch_atomic64_sub_return -static inline s64 arch_atomic64_inc_return(atomic64_t *v) +static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) { s64 a; alternative_atomic64(inc_return, "=&A" (a), @@ -163,7 +163,7 @@ static inline s64 arch_atomic64_inc_retu } #define arch_atomic64_inc_return arch_atomic64_inc_return -static inline s64 arch_atomic64_dec_return(atomic64_t *v) +static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) { s64 a; alternative_atomic64(dec_return, "=&A" (a), @@ -179,7 +179,7 @@ static inline s64 arch_atomic64_dec_retu * * Atomically adds @i to @v. */ -static inline s64 arch_atomic64_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -194,7 +194,7 @@ static inline s64 arch_atomic64_add(s64 * * Atomically subtracts @i from @v. */ -static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -208,7 +208,7 @@ static inline s64 arch_atomic64_sub(s64 * * Atomically increments @v by 1. */ -static inline void arch_atomic64_inc(atomic64_t *v) +static __always_inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); @@ -221,7 +221,7 @@ static inline void arch_atomic64_inc(ato * * Atomically decrements @v by 1. */ -static inline void arch_atomic64_dec(atomic64_t *v) +static __always_inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); @@ -237,7 +237,7 @@ static inline void arch_atomic64_dec(ato * Atomically adds @a to @v, so long as it was not @u. * Returns non-zero if the add was done, zero otherwise. */ -static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) +static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); @@ -248,7 +248,7 @@ static inline int arch_atomic64_add_unle } #define arch_atomic64_add_unless arch_atomic64_add_unless -static inline int arch_atomic64_inc_not_zero(atomic64_t *v) +static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; alternative_atomic64(inc_not_zero, "=&a" (r), @@ -257,7 +257,7 @@ static inline int arch_atomic64_inc_not_ } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero -static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) +static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { s64 r; alternative_atomic64(dec_if_positive, "=&A" (r), @@ -269,7 +269,7 @@ static inline s64 arch_atomic64_dec_if_p #undef alternative_atomic64 #undef __alternative_atomic64 -static inline void arch_atomic64_and(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -277,7 +277,7 @@ static inline void arch_atomic64_and(s64 c = old; } -static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -288,7 +288,7 @@ static inline s64 arch_atomic64_fetch_an } #define arch_atomic64_fetch_and arch_atomic64_fetch_and -static inline void arch_atomic64_or(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -296,7 +296,7 @@ static inline void arch_atomic64_or(s64 c = old; } -static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -307,7 +307,7 @@ static inline s64 arch_atomic64_fetch_or } #define arch_atomic64_fetch_or arch_atomic64_fetch_or -static inline void arch_atomic64_xor(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -315,7 +315,7 @@ static inline void arch_atomic64_xor(s64 c = old; } -static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 old, c = 0; @@ -326,7 +326,7 @@ static inline s64 arch_atomic64_fetch_xo } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { s64 old, c = 0; --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -17,7 +17,7 @@ * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return __READ_ONCE((v)->counter); } @@ -29,7 +29,7 @@ static inline s64 arch_atomic64_read(con * * Atomically sets the value of @v to @i. */ -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { __WRITE_ONCE(v->counter, i); } @@ -55,7 +55,7 @@ static __always_inline void arch_atomic6 * * Atomically subtracts @i from @v. */ -static inline void arch_atomic64_sub(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) @@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(s64 * true if the result is zero, or false for all * other cases. */ -static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) +static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } @@ -113,7 +113,7 @@ static __always_inline void arch_atomic6 * returns true if the result is 0, or false for all other * cases. */ -static inline bool arch_atomic64_dec_and_test(atomic64_t *v) +static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } @@ -127,7 +127,7 @@ static inline bool arch_atomic64_dec_and * and returns true if the result is zero, or false for all * other cases. */ -static inline bool arch_atomic64_inc_and_test(atomic64_t *v) +static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } @@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) +static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } @@ -161,25 +161,25 @@ static __always_inline s64 arch_atomic64 } #define arch_atomic64_add_return arch_atomic64_add_return -static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { return arch_atomic64_add_return(-i, v); } #define arch_atomic64_sub_return arch_atomic64_sub_return -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) { return xadd(&v->counter, -i); } #define arch_atomic64_fetch_sub arch_atomic64_fetch_sub -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } @@ -191,13 +191,13 @@ static __always_inline bool arch_atomic6 } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg -static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) +static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, new); } #define arch_atomic64_xchg arch_atomic64_xchg -static inline void arch_atomic64_and(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "andq %1,%0" : "+m" (v->counter) @@ -205,7 +205,7 @@ static inline void arch_atomic64_and(s64 : "memory"); } -static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -215,7 +215,7 @@ static inline s64 arch_atomic64_fetch_an } #define arch_atomic64_fetch_and arch_atomic64_fetch_and -static inline void arch_atomic64_or(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "orq %1,%0" : "+m" (v->counter) @@ -223,7 +223,7 @@ static inline void arch_atomic64_or(s64 : "memory"); } -static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -233,7 +233,7 @@ static inline s64 arch_atomic64_fetch_or } #define arch_atomic64_fetch_or arch_atomic64_fetch_or -static inline void arch_atomic64_xor(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "xorq %1,%0" : "+m" (v->counter) @@ -241,7 +241,7 @@ static inline void arch_atomic64_xor(s64 : "memory"); } -static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v);
Peter Zijlstra
2023-Jan-23 20:50 UTC
[PATCH 2/6] x86/pvclock: improve atomic update of last_value in pvclock_clocksource_read
From: Uros Bizjak <ubizjak at gmail.com> Improve atomic update of last_value in pvclock_clocksource_read: - Atomic update can be skipped if the "last_value" is already equal to "ret". - The detection of atomic update failure is not correct. The value, returned by atomic64_cmpxchg should be compared to the old value from the location to be updated. If these two are the same, then atomic update succeeded and "last_value" location is updated to "ret" in an atomic way. Otherwise, the atomic update failed and it should be retried with the value from "last_value" - exactly what atomic64_try_cmpxchg does in a correct and more optimal way. Signed-off-by: Uros Bizjak <ubizjak at gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Link: https://lkml.kernel.org/r/20230118202330.3740-1-ubizjak at gmail.com --- arch/x86/kernel/pvclock.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index eda37df016f0..5a2a517dd61b 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -102,10 +102,9 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) */ last = atomic64_read(&last_value); do { - if (ret < last) + if (ret <= last) return last; - last = atomic64_cmpxchg(&last_value, last, ret); - } while (unlikely(last != ret)); + } while (!atomic64_try_cmpxchg(&last_value, &last, ret)); return ret; } -- 2.39.0
Peter Zijlstra
2023-Jan-23 20:50 UTC
[PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
All RCU disabled code should be noinstr and hence we should never get here -- when we do, WARN about it and make sure to not actually do tracing. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/kernel/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -646,6 +646,9 @@ void prepare_ftrace_return(unsigned long if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; + if (WARN_ONCE(!rcu_is_watching(), "RCU not on for: %pS\n", (void *)ip)) + return; + bit = ftrace_test_recursion_trylock(ip, *parent); if (bit < 0) return;
In order to use sched_clock() from noinstr code, mark it and all it's implenentations noinstr. The whole pvclock thing (used by KVM/Xen) is a bit of a pain, since it calls out to watchdogs, create a pvclock_clocksource_read_nowd() variant doesn't do that and can be noinstr. Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- arch/x86/include/asm/kvmclock.h | 2 +- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/pvclock.h | 3 ++- arch/x86/kernel/cpu/vmware.c | 2 +- arch/x86/kernel/kvmclock.c | 6 +++--- arch/x86/kernel/pvclock.c | 19 +++++++++++++++---- arch/x86/kernel/tsc.c | 7 +++---- arch/x86/xen/time.c | 12 ++++++++++-- include/linux/math64.h | 4 ++-- 9 files changed, 38 insertions(+), 19 deletions(-) --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -8,7 +8,7 @@ extern struct clocksource kvm_clock; DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); -static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { return &this_cpu_read(hv_clock_per_cpu)->pvti; } --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -26,7 +26,7 @@ DECLARE_STATIC_CALL(pv_sched_clock, dumm void paravirt_set_sched_clock(u64 (*func)(void)); -static inline u64 paravirt_sched_clock(void) +static __always_inline u64 paravirt_sched_clock(void) { return static_call(pv_sched_clock)(); } --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -7,6 +7,7 @@ /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); @@ -39,7 +40,7 @@ bool pvclock_read_retry(const struct pvc * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. */ -static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) { u64 product; #ifdef __i386__ --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -143,7 +143,7 @@ static __init int parse_no_stealacc(char } early_param("no-steal-acc", parse_no_stealacc); -static unsigned long long notrace vmware_sched_clock(void) +static noinstr u64 vmware_sched_clock(void) { unsigned long long ns; --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -71,12 +71,12 @@ static int kvm_set_wallclock(const struc return -ENODEV; } -static u64 kvm_clock_read(void) +static noinstr u64 kvm_clock_read(void) { u64 ret; preempt_disable_notrace(); - ret = pvclock_clocksource_read(this_cpu_pvti()); + ret = pvclock_clocksource_read_nowd(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -86,7 +86,7 @@ static u64 kvm_clock_get_cycles(struct c return kvm_clock_read(); } -static u64 kvm_sched_clock_read(void) +static noinstr u64 kvm_sched_clock_read(void) { return kvm_clock_read() - kvm_sched_clock_offset; } --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -64,7 +64,8 @@ u8 pvclock_read_flags(struct pvclock_vcp return flags & valid_flags; } -u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +static __always_inline +u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd) { unsigned version; u64 ret; @@ -77,7 +78,7 @@ u64 pvclock_clocksource_read(struct pvcl flags = src->flags; } while (pvclock_read_retry(src, version)); - if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { + if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { src->flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); } @@ -100,15 +101,25 @@ u64 pvclock_clocksource_read(struct pvcl * updating at the same time, and one of them could be slightly behind, * making the assumption that last_value always go forward fail to hold. */ - last = atomic64_read(&last_value); + last = arch_atomic64_read(&last_value); do { if (ret <= last) return last; - } while (!atomic64_try_cmpxchg(&last_value, &last, ret)); + } while (!arch_atomic64_try_cmpxchg(&last_value, &last, ret)); return ret; } +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + return __pvclock_clocksource_read(src, true); +} + +noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src) +{ + return __pvclock_clocksource_read(src, false); +} + void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, struct pvclock_vcpu_time_info *vcpu_time, struct timespec64 *ts) --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -215,7 +215,7 @@ static void __init cyc2ns_init_secondary /* * Scheduler clock - returns current time in nanosec units. */ -u64 native_sched_clock(void) +noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); @@ -248,7 +248,7 @@ u64 native_sched_clock_from_tsc(u64 tsc) /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) +noinstr u64 sched_clock(void) { return paravirt_sched_clock(); } @@ -258,8 +258,7 @@ bool using_native_sched_clock(void) return static_call_query(pv_sched_clock) == native_sched_clock; } #else -unsigned long long -sched_clock(void) __attribute__((alias("native_sched_clock"))); +u64 sched_clock(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -60,9 +60,17 @@ static u64 xen_clocksource_get_cycles(st return xen_clocksource_read(); } -static u64 xen_sched_clock(void) +static noinstr u64 xen_sched_clock(void) { - return xen_clocksource_read() - xen_sched_clock_offset; + struct pvclock_vcpu_time_info *src; + u64 ret; + + preempt_disable_notrace(); + src = &__this_cpu_read(xen_vcpu)->time; + ret = pvclock_clocksource_read_nowd(src); + ret -= xen_sched_clock_offset; + preempt_enable_notrace(); + return ret; } static void xen_read_wallclock(struct timespec64 *ts) --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -161,7 +161,7 @@ static inline u64 mul_u32_u32(u32 a, u32 #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr -static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } @@ -177,7 +177,7 @@ static inline u64 mul_u64_u64_shr(u64 a, #else #ifndef mul_u64_u32_shr -static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah, al; u64 ret;
With sched_clock() noinstr, provide a noinstr implementation of local_clock(). Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> --- include/linux/sched/clock.h | 8 +++----- kernel/sched/clock.c | 27 +++++++++++++++++++++------ 2 files changed, 24 insertions(+), 11 deletions(-) --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -45,7 +45,7 @@ static inline u64 cpu_clock(int cpu) return sched_clock(); } -static inline u64 local_clock(void) +static __always_inline u64 local_clock(void) { return sched_clock(); } @@ -79,10 +79,8 @@ static inline u64 cpu_clock(int cpu) return sched_clock_cpu(cpu); } -static inline u64 local_clock(void) -{ - return sched_clock_cpu(raw_smp_processor_id()); -} +extern u64 local_clock(void); + #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -93,7 +93,7 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -notrace static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } @@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -notrace static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -notrace static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -notrace static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -287,13 +287,28 @@ notrace static u64 sched_clock_local(str clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) + if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -notrace static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock(void) +{ + u64 clock; + + if (static_branch_likely(&__sched_clock_stable)) + return sched_clock() + __sched_clock_offset; + + preempt_disable_notrace(); + clock = sched_clock_local(this_scd()); + preempt_enable_notrace(); + + return clock; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock;
Peter Zijlstra
2023-Jan-23 20:50 UTC
[PATCH 6/6] cpuidle: Fix poll_idle() noinstr annotation
The instrumentation_begin()/end() annotations in poll_idle() were complete nonsense. Specifically they caused tracing to happen in the middle of noinstr code, resulting in RCU splats. Now that local_clock() is noinstr, mark up the rest and let it rip. Fixes: 00717eb8c955 ("cpuidle: Annotate poll_idle()") Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org> Reported-by: kernel test robot <oliver.sang at intel.com> Link: https://lore.kernel.org/oe-lkp/202301192148.58ece903-oliver.sang at intel.com --- drivers/cpuidle/cpuidle.c | 2 +- drivers/cpuidle/poll_state.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -426,7 +426,7 @@ void cpuidle_reflect(struct cpuidle_devi * @dev: the cpuidle device * */ -u64 cpuidle_poll_time(struct cpuidle_driver *drv, +__cpuidle u64 cpuidle_poll_time(struct cpuidle_driver *drv, struct cpuidle_device *dev) { int i; --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -15,7 +15,6 @@ static int __cpuidle poll_idle(struct cp { u64 time_start; - instrumentation_begin(); time_start = local_clock(); dev->poll_time_limit = false; @@ -42,7 +41,6 @@ static int __cpuidle poll_idle(struct cp raw_local_irq_disable(); current_clr_polling(); - instrumentation_end(); return index; }
Hi Peter, On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:> 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.Do you have a link toe the splat somewhere? I'm assuming that this is partially generic, and I'd like to make sure I test the right thing on arm64. I'll throw my usual lockdep options at the ftrace selftests... Thanks, Mark.> > These patches appear to cure this, the ftrace selftest now runs to completion > without spamming scary messages to dmesg. > > --- > arch/x86/include/asm/atomic64_32.h | 44 +++++++++++++++++++------------------- > arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++---------------- > arch/x86/include/asm/kvmclock.h | 2 +- > arch/x86/include/asm/paravirt.h | 2 +- > arch/x86/include/asm/pvclock.h | 3 ++- > arch/x86/kernel/cpu/vmware.c | 2 +- > arch/x86/kernel/ftrace.c | 3 +++ > arch/x86/kernel/kvmclock.c | 6 +++--- > arch/x86/kernel/pvclock.c | 22 +++++++++++++------ > arch/x86/kernel/tsc.c | 7 +++--- > arch/x86/xen/time.c | 12 +++++++++-- > drivers/cpuidle/cpuidle.c | 2 +- > drivers/cpuidle/poll_state.c | 2 -- > include/linux/math64.h | 4 ++-- > include/linux/sched/clock.h | 8 +++---- > kernel/sched/clock.c | 27 +++++++++++++++++------ > 16 files changed, 107 insertions(+), 75 deletions(-) > >
Hi Peter, On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:> 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat. > > These patches appear to cure this, the ftrace selftest now runs to completion > without spamming scary messages to dmesg.In addition to the other bits for arm64, we'll need the following patch. Are you happy to add that to the start of this series? I've tested this on an arm64 Juno board with a full-fat ftrace config, CONFIG_PROVE_LOCKING + CONFIG_DEBUG_LOCKDEP, and CONFIG_DEBUG_VIRTUAL=y, and build tested for 32-bit arm. Thanks, Mark. ---->8---->From 30ab9eba19e952cb51c9f599d2ac9b8a302cb63d Mon Sep 17 00:00:00 2001From: Mark Rutland <mark.rutland at arm.com> Date: Wed, 25 Jan 2023 14:20:49 +0000 Subject: [PATCH] drivers: firmware: psci: don't instrument suspend code The PSCI suspend code is currently instrumentable, which is not safe as instrumentation (e.g. ftrace) may try to make use of RCU during idle periods when RCU is not watching. To fix this we need to ensure that psci_suspend_finisher() and anything it calls are not instrumented. We can do this fairly simply by marking psci_suspend_finisher() and the psci*_cpu_suspend() functions as noinstr, and the underlying helper functions as __always_inline. When CONFIG_DEBUG_VIRTUAL=y, __pa_symbol() can expand to an out-of-line instrumented function, so we must use __pa_symbol_nodebug() within psci_suspend_finisher(). The raw SMCCC invocation functions are written in assembly, and are not subject to compiler instrumentation. Signed-off-by: Mark Rutland <mark.rutland at arm.com> Cc: Lorenzo Pieralisi <lpieralisi at kernel.org> Cc: Peter Zijlstra <peterz at infradead.org> --- drivers/firmware/psci/psci.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index f3a044fa4652a..c12847b4736de 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -108,9 +108,10 @@ bool psci_power_state_is_valid(u32 state) return !(state & ~valid_mask); } -static unsigned long __invoke_psci_fn_hvc(unsigned long function_id, - unsigned long arg0, unsigned long arg1, - unsigned long arg2) +static __always_inline unsigned long +__invoke_psci_fn_hvc(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2) { struct arm_smccc_res res; @@ -118,9 +119,10 @@ static unsigned long __invoke_psci_fn_hvc(unsigned long function_id, return res.a0; } -static unsigned long __invoke_psci_fn_smc(unsigned long function_id, - unsigned long arg0, unsigned long arg1, - unsigned long arg2) +static __always_inline unsigned long +__invoke_psci_fn_smc(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2) { struct arm_smccc_res res; @@ -128,7 +130,7 @@ static unsigned long __invoke_psci_fn_smc(unsigned long function_id, return res.a0; } -static int psci_to_linux_errno(int errno) +static __always_inline int psci_to_linux_errno(int errno) { switch (errno) { case PSCI_RET_SUCCESS: @@ -169,7 +171,8 @@ int psci_set_osi_mode(bool enable) return psci_to_linux_errno(err); } -static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) +static __always_inline int +__psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) { int err; @@ -177,13 +180,15 @@ static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) return psci_to_linux_errno(err); } -static int psci_0_1_cpu_suspend(u32 state, unsigned long entry_point) +static __always_inline int +psci_0_1_cpu_suspend(u32 state, unsigned long entry_point) { return __psci_cpu_suspend(psci_0_1_function_ids.cpu_suspend, state, entry_point); } -static int psci_0_2_cpu_suspend(u32 state, unsigned long entry_point) +static __always_inline int +psci_0_2_cpu_suspend(u32 state, unsigned long entry_point) { return __psci_cpu_suspend(PSCI_FN_NATIVE(0_2, CPU_SUSPEND), state, entry_point); @@ -447,10 +452,12 @@ late_initcall(psci_debugfs_init) #endif #ifdef CONFIG_CPU_IDLE -static int psci_suspend_finisher(unsigned long state) +static noinstr int psci_suspend_finisher(unsigned long state) { u32 power_state = state; - phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume); + phys_addr_t pa_cpu_resume; + + pa_cpu_resume = __pa_symbol_nodebug((unsigned long)cpu_resume); return psci_ops.cpu_suspend(power_state, pa_cpu_resume); } -- 2.30.2