Raghavendra K T
2015-Feb-06 14:49 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
Paravirt spinlock clears slowpath flag after doing unlock. As explained by Linus currently it does: prev = *lock; add_smp(&lock->tickets.head, TICKET_LOCK_INC); /* add_smp() is a full mb() */ if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) __ticket_unlock_slowpath(lock, prev); which is *exactly* the kind of things you cannot do with spinlocks, because after you've done the "add_smp()" and released the spinlock for the fast-path, you can't access the spinlock any more. Exactly because a fast-path lock might come in, and release the whole data structure. Linus suggested that we should not do any writes to lock after unlock(), and we can move slowpath clearing to fastpath lock. However it brings additional case to be handled, viz., slowpath still could be set when somebody does arch_trylock. Handle that too by ignoring slowpath flag during lock availability check. Reported-by: Sasha Levin <sasha.levin at oracle.com> Suggested-by: Linus Torvalds <torvalds at linux-foundation.org> Signed-off-by: Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com> --- arch/x86/include/asm/spinlock.h | 70 ++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 625660f..0829f86 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -49,6 +49,23 @@ static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) set_bit(0, (volatile unsigned long *)&lock->tickets.tail); } +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock) +{ + arch_spinlock_t old, new; + __ticket_t diff; + + old.tickets = READ_ONCE(lock->tickets); + diff = (old.tickets.tail & ~TICKET_SLOWPATH_FLAG) - old.tickets.head; + + /* try to clear slowpath flag when there are no contenders */ + if ((old.tickets.tail & TICKET_SLOWPATH_FLAG) && + (diff == TICKET_LOCK_INC)) { + new = old; + new.tickets.tail &= ~TICKET_SLOWPATH_FLAG; + cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); + } +} + #else /* !CONFIG_PARAVIRT_SPINLOCKS */ static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, __ticket_t ticket) @@ -59,6 +76,10 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock, { } +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock) +{ +} + #endif /* CONFIG_PARAVIRT_SPINLOCKS */ static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) @@ -84,7 +105,7 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; inc = xadd(&lock->tickets, inc); - if (likely(inc.head == inc.tail)) + if (likely(inc.head == (inc.tail & ~TICKET_SLOWPATH_FLAG))) goto out; inc.tail &= ~TICKET_SLOWPATH_FLAG; @@ -98,7 +119,10 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) } while (--count); __ticket_lock_spinning(lock, inc.tail); } -out: barrier(); /* make sure nothing creeps before the lock is taken */ +out: + __ticket_check_and_clear_slowpath(lock); + + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) @@ -115,47 +139,21 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, - arch_spinlock_t old) -{ - arch_spinlock_t new; - - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - - /* Perform the unlock on the "before" copy */ - old.tickets.head += TICKET_LOCK_INC; - - /* Clear the slowpath flag */ - new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); - - /* - * If the lock is uncontended, clear the flag - use cmpxchg in - * case it changes behind our back though. - */ - if (new.tickets.head != new.tickets.tail || - cmpxchg(&lock->head_tail, old.head_tail, - new.head_tail) != old.head_tail) { - /* - * Lock still has someone queued for it, so wake up an - * appropriate waiter. - */ - __ticket_unlock_kick(lock, old.tickets.head); - } -} - static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { if (TICKET_SLOWPATH_FLAG && - static_key_false(¶virt_ticketlocks_enabled)) { - arch_spinlock_t prev; + static_key_false(¶virt_ticketlocks_enabled)) { + __ticket_t prev_head; - prev = *lock; + prev_head = lock->tickets.head; add_smp(&lock->tickets.head, TICKET_LOCK_INC); /* add_smp() is a full mb() */ - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) - __ticket_unlock_slowpath(lock, prev); + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) { + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); + __ticket_unlock_kick(lock, prev_head); + } } else __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } @@ -164,7 +162,7 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return tmp.tail != tmp.head; + return (tmp.tail & ~TICKET_SLOWPATH_FLAG) != tmp.head; } static inline int arch_spin_is_contended(arch_spinlock_t *lock) -- 1.7.11.7
Sasha Levin
2015-Feb-06 15:20 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On 02/06/2015 09:49 AM, Raghavendra K T wrote:> static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) > { > if (TICKET_SLOWPATH_FLAG && > - static_key_false(¶virt_ticketlocks_enabled)) { > - arch_spinlock_t prev; > + static_key_false(¶virt_ticketlocks_enabled)) { > + __ticket_t prev_head; > > - prev = *lock; > + prev_head = lock->tickets.head; > add_smp(&lock->tickets.head, TICKET_LOCK_INC); > > /* add_smp() is a full mb() */ > > - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) > - __ticket_unlock_slowpath(lock, prev); > + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) { > + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); > + __ticket_unlock_kick(lock, prev_head);Can we modify it slightly to avoid potentially accessing invalid memory: diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 5315887..cd22d73 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -144,13 +144,13 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock if (TICKET_SLOWPATH_FLAG && static_key_false(¶virt_ticketlocks_enabled)) { __ticket_t prev_head; - + bool needs_kick = lock->tickets.tail & TICKET_SLOWPATH_FLAG; prev_head = lock->tickets.head; add_smp(&lock->tickets.head, TICKET_LOCK_INC); /* add_smp() is a full mb() */ - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) { + if (unlikely(needs_kick)) { BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); __ticket_unlock_kick(lock, prev_head); } Thanks, Sasha
Linus Torvalds
2015-Feb-06 16:15 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On Fri, Feb 6, 2015 at 7:20 AM, Sasha Levin <sasha.levin at oracle.com> wrote:> > Can we modify it slightly to avoid potentially accessing invalid memory:So I think there's a race with that. And I'll warn you: the kernel does do speculative reads of memory that might be invalid, not just in places like this. See the comment in get_user_huge_page() for example, where we knowingly do speculative reads, but hide it if DEBUG_PAGEALLOC is set. More commonly, CONFIG_DCACHE_WORD_ACCESS is very much about doing speculative reads. Now, that access is hidden inside an asm, so KASan won't see it, but there might well be others. You probably don't see them very much just because they are so rarely a problem, and most of the time it's not to other processes stack but to allocated structures where freeing takes long enough to basically hide any small race.. In other words: I suspect it would be good to instead just teach KASan about "this is a speculative read" and just suppress the warning for those instead. Linus
Linus Torvalds
2015-Feb-06 16:25 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On Fri, Feb 6, 2015 at 6:49 AM, Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com> wrote:> Paravirt spinlock clears slowpath flag after doing unlock.[ fix edited out ] So I'm not going to be applying this for 3.19, because it's much too late and the patch is too scary. Plus the bug probably effectively never shows up in real life (it is probably easy to trigger the speculative *read* but probably never the actual speculative write after dropping the lock last). This will need a lot of testing by the paravirt people - both performance and correctness. So *maybe* for 3.20, but maybe for even later, and then marked for stable, of course. Are there any good paravirt stress-tests that people could run for extended times? Linus
Sasha Levin
2015-Feb-06 18:57 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On 02/06/2015 09:49 AM, Raghavendra K T wrote:> Paravirt spinlock clears slowpath flag after doing unlock. > As explained by Linus currently it does: > prev = *lock; > add_smp(&lock->tickets.head, TICKET_LOCK_INC); > > /* add_smp() is a full mb() */ > > if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) > __ticket_unlock_slowpath(lock, prev); > > > which is *exactly* the kind of things you cannot do with spinlocks, > because after you've done the "add_smp()" and released the spinlock > for the fast-path, you can't access the spinlock any more. Exactly > because a fast-path lock might come in, and release the whole data > structure. > > Linus suggested that we should not do any writes to lock after unlock(), > and we can move slowpath clearing to fastpath lock. > > However it brings additional case to be handled, viz., slowpath still > could be set when somebody does arch_trylock. Handle that too by ignoring > slowpath flag during lock availability check. > > Reported-by: Sasha Levin <sasha.levin at oracle.com> > Suggested-by: Linus Torvalds <torvalds at linux-foundation.org> > Signed-off-by: Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com>With this patch, my VMs lock up quickly after boot with: [ 161.613469] BUG: spinlock lockup suspected on CPU#31, kworker/31:1/5213 [ 161.613469] lock: purge_lock.28981+0x0/0x40, .magic: dead4ead, .owner: kworker/7:1/6400, .owner_cpu: 7 [ 161.613469] CPU: 31 PID: 5213 Comm: kworker/31:1 Not tainted 3.19.0-rc7-next-20150204-sasha-00048-gee3a350 #1869 [ 161.613469] Workqueue: events bpf_prog_free_deferred [ 161.613469] 0000000000000000 00000000f03dd27f ffff88056b227a88 ffffffffa1702276 [ 161.613469] 0000000000000000 ffff88017cf70000 ffff88056b227aa8 ffffffff9e1d009c [ 161.613469] ffffffffa3edae60 0000000086c3f830 ffff88056b227ad8 ffffffff9e1d01d7 [ 161.613469] Call Trace: [ 161.613469] dump_stack (lib/dump_stack.c:52) [ 161.613469] spin_dump (kernel/locking/spinlock_debug.c:68 (discriminator 8)) [ 161.613469] do_raw_spin_lock (include/linux/nmi.h:48 kernel/locking/spinlock_debug.c:119 kernel/locking/spinlock_debug.c:137) [ 161.613469] _raw_spin_lock (kernel/locking/spinlock.c:152) [ 161.613469] ? __purge_vmap_area_lazy (mm/vmalloc.c:615) [ 161.613469] __purge_vmap_area_lazy (mm/vmalloc.c:615) [ 161.613469] ? vm_unmap_aliases (mm/vmalloc.c:1021) [ 161.613469] vm_unmap_aliases (mm/vmalloc.c:1052) [ 161.613469] ? vm_unmap_aliases (mm/vmalloc.c:1021) [ 161.613469] ? __lock_acquire (kernel/locking/lockdep.c:2019 kernel/locking/lockdep.c:3184) [ 161.613469] change_page_attr_set_clr (arch/x86/mm/pageattr.c:1394) [ 161.613469] ? debug_object_deactivate (lib/debugobjects.c:463) [ 161.613469] set_memory_rw (arch/x86/mm/pageattr.c:1662) [ 161.613469] ? __lock_is_held (kernel/locking/lockdep.c:3518) [ 161.613469] bpf_jit_free (include/linux/filter.h:377 arch/x86/net/bpf_jit_comp.c:991) [ 161.613469] bpf_prog_free_deferred (kernel/bpf/core.c:646) [ 161.613469] process_one_work (kernel/workqueue.c:2014 include/linux/jump_label.h:114 include/trace/events/workqueue.h:111 kernel/workqueue.c:2019) [ 161.613469] ? process_one_work (./arch/x86/include/asm/atomic64_64.h:33 include/asm-generic/atomic-long.h:38 kernel/workqueue.c:598 kernel/workqueue.c:625 kernel/workqueue.c:2007) [ 161.613469] worker_thread (include/linux/list.h:189 kernel/workqueue.c:2147) [ 161.613469] ? process_one_work (kernel/workqueue.c:2091) [ 161.613469] kthread (kernel/kthread.c:207) [ 161.613469] ? finish_task_switch (./arch/x86/include/asm/current.h:14 kernel/sched/sched.h:1058 kernel/sched/core.c:2258) [ 161.613469] ? flush_kthread_work (kernel/kthread.c:176) [ 161.613469] ret_from_fork (arch/x86/kernel/entry_64.S:283) [ 161.613469] ? flush_kthread_work (kernel/kthread.c:176) And a few soft lockup messages inside the scheduler after that. Thanks, Sasha
Davidlohr Bueso
2015-Feb-06 19:42 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On Fri, 2015-02-06 at 08:25 -0800, Linus Torvalds wrote:> On Fri, Feb 6, 2015 at 6:49 AM, Raghavendra K T > <raghavendra.kt at linux.vnet.ibm.com> wrote: > > Paravirt spinlock clears slowpath flag after doing unlock. > [ fix edited out ] > > So I'm not going to be applying this for 3.19, because it's much too > late and the patch is too scary. Plus the bug probably effectively > never shows up in real life (it is probably easy to trigger the > speculative *read* but probably never the actual speculative write > after dropping the lock last). > > This will need a lot of testing by the paravirt people - both > performance and correctness. So *maybe* for 3.20, but maybe for even > later, and then marked for stable, of course. > > Are there any good paravirt stress-tests that people could run for > extended times?locktorture inside a VM should give a proper pounding.
Oleg Nesterov
2015-Feb-08 17:14 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On 02/06, Sasha Levin wrote:> > Can we modify it slightly to avoid potentially accessing invalid memory: > > diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h > index 5315887..cd22d73 100644 > --- a/arch/x86/include/asm/spinlock.h > +++ b/arch/x86/include/asm/spinlock.h > @@ -144,13 +144,13 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock > if (TICKET_SLOWPATH_FLAG && > static_key_false(¶virt_ticketlocks_enabled)) { > __ticket_t prev_head; > - > + bool needs_kick = lock->tickets.tail & TICKET_SLOWPATH_FLAG; > prev_head = lock->tickets.head; > add_smp(&lock->tickets.head, TICKET_LOCK_INC); > > /* add_smp() is a full mb() */ > > - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) { > + if (unlikely(needs_kick)) {This doesn't look right too... We need to guarantee that either unlock() sees TICKET_SLOWPATH_FLAG, or the calller of __ticket_enter_slowpath() sees the result of add_smp(). Suppose that kvm_lock_spinning() is called right before add_smp() and it sets SLOWPATH. It will block then because .head != want, and it needs __ticket_unlock_kick(). Oleg.
Raghavendra K T
2015-Feb-08 17:49 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On 02/06/2015 09:55 PM, Linus Torvalds wrote:> On Fri, Feb 6, 2015 at 6:49 AM, Raghavendra K T > <raghavendra.kt at linux.vnet.ibm.com> wrote: >> Paravirt spinlock clears slowpath flag after doing unlock. > [ fix edited out ] > > So I'm not going to be applying this for 3.19, because it's much too > late and the patch is too scary. Plus the bug probably effectively > never shows up in real life (it is probably easy to trigger the > speculative *read* but probably never the actual speculative write > after dropping the lock last). >Understood and agreed.> This will need a lot of testing by the paravirt people - both > performance and correctness. So *maybe* for 3.20, but maybe for even > later, and then marked for stable, of course. > > Are there any good paravirt stress-tests that people could run for > extended times? >I have been running several benchmarks (kern, sys, hack, ebizzy etc in in 1x,2x scenarios. I run them for performance test as well. (In the current patch I did not get kvm hang in normal run, But overcommit reproduced it).
Raghavendra K T
2015-Feb-08 17:57 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On 02/07/2015 12:27 AM, Sasha Levin wrote:> On 02/06/2015 09:49 AM, Raghavendra K T wrote: >> Paravirt spinlock clears slowpath flag after doing unlock. >> As explained by Linus currently it does: >> prev = *lock; >> add_smp(&lock->tickets.head, TICKET_LOCK_INC); >> >> /* add_smp() is a full mb() */ >> >> if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) >> __ticket_unlock_slowpath(lock, prev); >> >> >> which is *exactly* the kind of things you cannot do with spinlocks, >> because after you've done the "add_smp()" and released the spinlock >> for the fast-path, you can't access the spinlock any more. Exactly >> because a fast-path lock might come in, and release the whole data >> structure. >> >> Linus suggested that we should not do any writes to lock after unlock(), >> and we can move slowpath clearing to fastpath lock. >> >> However it brings additional case to be handled, viz., slowpath still >> could be set when somebody does arch_trylock. Handle that too by ignoring >> slowpath flag during lock availability check. >> >> Reported-by: Sasha Levin <sasha.levin at oracle.com> >> Suggested-by: Linus Torvalds <torvalds at linux-foundation.org> >> Signed-off-by: Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com> > > With this patch, my VMs lock up quickly after boot with:Tried to reproduce the hang myself, and there seems to be still a barrier (or logic I miss). Looking closely below, unlock_kick got missed though we see that SLOWPATH_FLAG is still set: /me goes back to look closely (gdb) bt #0 native_halt () at ./arch/x86/include/asm/irqflags.h:55 #1 0xffffffff81037c27 in halt () at ./arch/x86/include/asm/paravirt.h:116 #2 kvm_lock_spinning (lock=0xffff88023ffe8240, want=52504) at arch/x86/kernel/kvm.c:786 #3 0xffffffff81037251 in __raw_callee_save_kvm_lock_spinning () #4 0xffff88023fc0edb0 in ?? () #5 0x0000000000000000 in ?? () (gdb) p *(arch_spinlock_t *)0xffff88023ffe8240 $1 = {{head_tail = 3441806612, tickets = {head = 52500, tail = 52517}}} (gdb) t 2 [Switching to thread 2 (Thread 2)] #0 native_halt () at ./arch/x86/include/asm/irqflags.h:55 55 } (gdb) bt #0 native_halt () at ./arch/x86/include/asm/irqflags.h:55 #1 0xffffffff81037c27 in halt () at ./arch/x86/include/asm/paravirt.h:116 #2 kvm_lock_spinning (lock=0xffff88023ffe8240, want=52502) at arch/x86/kernel/kvm.c:786 #3 0xffffffff81037251 in __raw_callee_save_kvm_lock_spinning () #4 0x0000000000000246 in irq_stack_union () #5 0x0000000000080750 in ?? () #6 0x0000000000020000 in ?? () #7 0x0000000000000004 in irq_stack_union () #8 0x000000000000cd16 in nmi_print_seq () Cannot access memory at address 0xbfc0 (gdb) t 3 [Switching to thread 3 (Thread 3)] #0 native_halt () at ./arch/x86/include/asm/irqflags.h:55 55 } (gdb) bt #0 native_halt () at ./arch/x86/include/asm/irqflags.h:55 #1 0xffffffff81037c27 in halt () at ./arch/x86/include/asm/paravirt.h:116 #2 kvm_lock_spinning (lock=0xffff88023ffe8240, want=52512) at arch/x86/kernel/kvm.c:786 #3 0xffffffff81037251 in __raw_callee_save_kvm_lock_spinning () #4 0xffff88023fc8edb0 in ?? () #5 0x0000000000000000 in ?? () [...] //other threads with similar output (gdb) t 8 [Switching to thread 8 (Thread 8)] #0 native_halt () at ./arch/x86/include/asm/irqflags.h:55 55 } (gdb) bt #0 native_halt () at ./arch/x86/include/asm/irqflags.h:55 #1 0xffffffff81037c27 in halt () at ./arch/x86/include/asm/paravirt.h:116 #2 kvm_lock_spinning (lock=0xffff88023ffe8240, want=52500) at arch/x86/kernel/kvm.c:786 #3 0xffffffff81037251 in __raw_callee_save_kvm_lock_spinning () #4 0xffff88023fdcedb0 in ?? () #5 0x0000000000000000 in ?? ()
Jeremy Fitzhardinge
2015-Feb-08 21:14 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On 02/06/2015 06:49 AM, Raghavendra K T wrote:> Paravirt spinlock clears slowpath flag after doing unlock. > As explained by Linus currently it does: > prev = *lock; > add_smp(&lock->tickets.head, TICKET_LOCK_INC); > > /* add_smp() is a full mb() */ > > if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) > __ticket_unlock_slowpath(lock, prev); > > > which is *exactly* the kind of things you cannot do with spinlocks, > because after you've done the "add_smp()" and released the spinlock > for the fast-path, you can't access the spinlock any more. Exactly > because a fast-path lock might come in, and release the whole data > structure.Yeah, that's an embarrasingly obvious bug in retrospect.> Linus suggested that we should not do any writes to lock after unlock(), > and we can move slowpath clearing to fastpath lock.Yep, that seems like a sound approach.> However it brings additional case to be handled, viz., slowpath still > could be set when somebody does arch_trylock. Handle that too by ignoring > slowpath flag during lock availability check. > > Reported-by: Sasha Levin <sasha.levin at oracle.com> > Suggested-by: Linus Torvalds <torvalds at linux-foundation.org> > Signed-off-by: Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com> > --- > arch/x86/include/asm/spinlock.h | 70 ++++++++++++++++++++--------------------- > 1 file changed, 34 insertions(+), 36 deletions(-) > > diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h > index 625660f..0829f86 100644 > --- a/arch/x86/include/asm/spinlock.h > +++ b/arch/x86/include/asm/spinlock.h > @@ -49,6 +49,23 @@ static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) > set_bit(0, (volatile unsigned long *)&lock->tickets.tail); > } > > +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock) > +{ > + arch_spinlock_t old, new; > + __ticket_t diff; > + > + old.tickets = READ_ONCE(lock->tickets);Couldn't the caller pass in the lock state that it read rather than re-reading it?> + diff = (old.tickets.tail & ~TICKET_SLOWPATH_FLAG) - old.tickets.head; > + > + /* try to clear slowpath flag when there are no contenders */ > + if ((old.tickets.tail & TICKET_SLOWPATH_FLAG) && > + (diff == TICKET_LOCK_INC)) { > + new = old; > + new.tickets.tail &= ~TICKET_SLOWPATH_FLAG; > + cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); > + } > +} > + > #else /* !CONFIG_PARAVIRT_SPINLOCKS */ > static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, > __ticket_t ticket) > @@ -59,6 +76,10 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock, > { > } > > +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock) > +{ > +} > + > #endif /* CONFIG_PARAVIRT_SPINLOCKS */ > > static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) > @@ -84,7 +105,7 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) > register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; > > inc = xadd(&lock->tickets, inc); > - if (likely(inc.head == inc.tail)) > + if (likely(inc.head == (inc.tail & ~TICKET_SLOWPATH_FLAG)))The intent of this conditional was to be the quickest possible path when taking a fastpath lock, with the code below being used for all slowpath locks (free or taken). So I don't think masking out SLOWPATH_FLAG is necessary here.> goto out; > > inc.tail &= ~TICKET_SLOWPATH_FLAG; > @@ -98,7 +119,10 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) > } while (--count); > __ticket_lock_spinning(lock, inc.tail); > } > -out: barrier(); /* make sure nothing creeps before the lock is taken */ > +out: > + __ticket_check_and_clear_slowpath(lock); > + > + barrier(); /* make sure nothing creeps before the lock is taken */Which means that if "goto out" path is only ever used for fastpath locks, you can limit calling __ticket_check_and_clear_slowpath() to the slowpath case.> } > > static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) > @@ -115,47 +139,21 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) > return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; > } > > -static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, > - arch_spinlock_t old) > -{ > - arch_spinlock_t new; > - > - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); > - > - /* Perform the unlock on the "before" copy */ > - old.tickets.head += TICKET_LOCK_INC;NB (see below)> - > - /* Clear the slowpath flag */ > - new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); > - > - /* > - * If the lock is uncontended, clear the flag - use cmpxchg in > - * case it changes behind our back though. > - */ > - if (new.tickets.head != new.tickets.tail || > - cmpxchg(&lock->head_tail, old.head_tail, > - new.head_tail) != old.head_tail) { > - /* > - * Lock still has someone queued for it, so wake up an > - * appropriate waiter. > - */ > - __ticket_unlock_kick(lock, old.tickets.head); > - } > -} > - > static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) > { > if (TICKET_SLOWPATH_FLAG && > - static_key_false(¶virt_ticketlocks_enabled)) { > - arch_spinlock_t prev; > + static_key_false(¶virt_ticketlocks_enabled)) { > + __ticket_t prev_head; > > - prev = *lock; > + prev_head = lock->tickets.head; > add_smp(&lock->tickets.head, TICKET_LOCK_INC); > > /* add_smp() is a full mb() */ > > - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) > - __ticket_unlock_slowpath(lock, prev); > + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) {So we're OK with still having a ("speculative"?) read-after-unlock here? I guess the only way to avoid it is to make the add_smp an xadd, but that's pretty expensive even compared to a locked add (at least last time I checked, which was at least a couple of microarchitectures ago). An unlocked add followed by lfence should also do the trick, but that was also much worse in practice.> + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); > + __ticket_unlock_kick(lock, prev_head);Should be "prev_head + TICKET_LOCK_INC" to match the previous code, otherwise it won't find the CPU waiting for the new head. J> + } > } else > __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); > } > @@ -164,7 +162,7 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) > { > struct __raw_tickets tmp = READ_ONCE(lock->tickets); > > - return tmp.tail != tmp.head; > + return (tmp.tail & ~TICKET_SLOWPATH_FLAG) != tmp.head; > } > > static inline int arch_spin_is_contended(arch_spinlock_t *lock)
Raghavendra K T
2015-Feb-09 09:34 UTC
[PATCH] x86 spinlock: Fix memory corruption on completing completions
On 02/09/2015 02:44 AM, Jeremy Fitzhardinge wrote:> On 02/06/2015 06:49 AM, Raghavendra K T wrote:[...]> >> Linus suggested that we should not do any writes to lock after unlock(), >> and we can move slowpath clearing to fastpath lock. > > Yep, that seems like a sound approach.Current approach seem to be working now. (though we could not avoid read). Related question: Do you think we could avoid SLOWPATH_FLAG itself by checking head and tail difference. or is it costly because it may result in unnecessary unlock_kicks?>> However it brings additional case to be handled, viz., slowpath still >> could be set when somebody does arch_trylock. Handle that too by ignoring >> slowpath flag during lock availability check. >> >> Reported-by: Sasha Levin <sasha.levin at oracle.com> >> Suggested-by: Linus Torvalds <torvalds at linux-foundation.org> >> Signed-off-by: Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com> >> --- >> arch/x86/include/asm/spinlock.h | 70 ++++++++++++++++++++--------------------- >> 1 file changed, 34 insertions(+), 36 deletions(-) >> >> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h >> index 625660f..0829f86 100644 >> --- a/arch/x86/include/asm/spinlock.h >> +++ b/arch/x86/include/asm/spinlock.h >> @@ -49,6 +49,23 @@ static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) >> set_bit(0, (volatile unsigned long *)&lock->tickets.tail); >> } >> >> +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock) >> +{ >> + arch_spinlock_t old, new; >> + __ticket_t diff; >> + >> + old.tickets = READ_ONCE(lock->tickets); > > Couldn't the caller pass in the lock state that it read rather than > re-reading it? >Yes we could. do you mean we could pass additional read value apart from lock, (because lock will be anyway needed for cmpxchg).>> >> +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock) >> +{ >> +} >> + >> #endif /* CONFIG_PARAVIRT_SPINLOCKS */ >> >> static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) >> @@ -84,7 +105,7 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) >> register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; >> >> inc = xadd(&lock->tickets, inc); >> - if (likely(inc.head == inc.tail)) >> + if (likely(inc.head == (inc.tail & ~TICKET_SLOWPATH_FLAG))) >good point, we can get rid of this as well.> The intent of this conditional was to be the quickest possible path when > taking a fastpath lock, with the code below being used for all slowpath > locks (free or taken). So I don't think masking out SLOWPATH_FLAG is > necessary here. > >> goto out; >> >> inc.tail &= ~TICKET_SLOWPATH_FLAG; >> @@ -98,7 +119,10 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) >> } while (--count); >> __ticket_lock_spinning(lock, inc.tail); >> } >> -out: barrier(); /* make sure nothing creeps before the lock is taken */ >> +out: >> + __ticket_check_and_clear_slowpath(lock); >> + >> + barrier(); /* make sure nothing creeps before the lock is taken */ > > Which means that if "goto out" path is only ever used for fastpath > locks, you can limit calling __ticket_check_and_clear_slowpath() to the > slowpath case. >Yes, I ll move that call up.>> } >> >> static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) >> @@ -115,47 +139,21 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) >> return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; >> } >> >> -static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, >> - arch_spinlock_t old) >> -{ >> - arch_spinlock_t new; >> - >> - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); >> - >> - /* Perform the unlock on the "before" copy */ >> - old.tickets.head += TICKET_LOCK_INC; > > NB (see below)Thanks for pointing, this solved the hang issue. I missed this exact addition.> >> - >> - /* Clear the slowpath flag */ >> - new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); >> - >> - /* >> - * If the lock is uncontended, clear the flag - use cmpxchg in >> - * case it changes behind our back though. >> - */ >> - if (new.tickets.head != new.tickets.tail || >> - cmpxchg(&lock->head_tail, old.head_tail, >> - new.head_tail) != old.head_tail) { >> - /* >> - * Lock still has someone queued for it, so wake up an >> - * appropriate waiter. >> - */ >> - __ticket_unlock_kick(lock, old.tickets.head); >> - } >> -} >> - >> static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) >> { >> if (TICKET_SLOWPATH_FLAG && >> - static_key_false(¶virt_ticketlocks_enabled)) { >> - arch_spinlock_t prev; >> + static_key_false(¶virt_ticketlocks_enabled)) { >> + __ticket_t prev_head; >> >> - prev = *lock; >> + prev_head = lock->tickets.head; >> add_smp(&lock->tickets.head, TICKET_LOCK_INC); >> >> /* add_smp() is a full mb() */ >> >> - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) >> - __ticket_unlock_slowpath(lock, prev); >> + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) { > > So we're OK with still having a ("speculative"?) read-after-unlock here? > I guess the only way to avoid it is to make the add_smp an xadd, but > that's pretty expensive even compared to a locked add (at least last > time I checked, which was at least a couple of microarchitectures ago). > An unlocked add followed by lfence should also do the trick, but that > was also much worse in practice.So we have 3 choices, 1. xadd 2. continue with current approach. 3. a read before unlock and also after that.> >> + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); >> + __ticket_unlock_kick(lock, prev_head); > > Should be "prev_head + TICKET_LOCK_INC" to match the previous code, > otherwise it won't find the CPU waiting for the new head.Yes it is :)
Maybe Matching Threads
- [PATCH] x86 spinlock: Fix memory corruption on completing completions
- [PATCH V5] x86 spinlock: Fix memory corruption on completing completions
- [PATCH V5] x86 spinlock: Fix memory corruption on completing completions
- [PATCH V3] x86 spinlock: Fix memory corruption on completing completions
- [PATCH V3] x86 spinlock: Fix memory corruption on completing completions