Nicholas Piggin
2020-Jul-06 04:35 UTC
[PATCH v3 0/6] powerpc: queued spinlocks and rwlocks
v3 is updated to use __pv_queued_spin_unlock, noticed by Waiman (thank you). Thanks, Nick Nicholas Piggin (6): powerpc/powernv: must include hvcall.h to get PAPR defines powerpc/pseries: move some PAPR paravirt functions to their own file powerpc: move spinlock implementation to simple_spinlock powerpc/64s: implement queued spinlocks and rwlocks powerpc/pseries: implement paravirt qspinlocks for SPLPAR powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the lock hint arch/powerpc/Kconfig | 13 + arch/powerpc/include/asm/Kbuild | 2 + arch/powerpc/include/asm/atomic.h | 28 ++ arch/powerpc/include/asm/paravirt.h | 89 +++++ arch/powerpc/include/asm/qspinlock.h | 91 ++++++ arch/powerpc/include/asm/qspinlock_paravirt.h | 7 + arch/powerpc/include/asm/simple_spinlock.h | 292 +++++++++++++++++ .../include/asm/simple_spinlock_types.h | 21 ++ arch/powerpc/include/asm/spinlock.h | 308 +----------------- arch/powerpc/include/asm/spinlock_types.h | 17 +- arch/powerpc/lib/Makefile | 3 + arch/powerpc/lib/locks.c | 12 +- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 + arch/powerpc/platforms/pseries/Kconfig | 5 + arch/powerpc/platforms/pseries/setup.c | 6 +- include/asm-generic/qspinlock.h | 4 + 16 files changed, 577 insertions(+), 322 deletions(-) create mode 100644 arch/powerpc/include/asm/paravirt.h create mode 100644 arch/powerpc/include/asm/qspinlock.h create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h create mode 100644 arch/powerpc/include/asm/simple_spinlock.h create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h -- 2.23.0
Nicholas Piggin
2020-Jul-06 04:35 UTC
[PATCH v3 1/6] powerpc/powernv: must include hvcall.h to get PAPR defines
An include goes away in future patches which breaks compilation without this. Signed-off-by: Nicholas Piggin <npiggin at gmail.com> --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index f923359d8afc..8eba6ece7808 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -15,6 +15,7 @@ #include <asm/iommu.h> #include <asm/tce.h> +#include <asm/hvcall.h> /* share error returns with PAPR */ #include "pci.h" unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) -- 2.23.0
Nicholas Piggin
2020-Jul-06 04:35 UTC
[PATCH v3 2/6] powerpc/pseries: move some PAPR paravirt functions to their own file
Signed-off-by: Nicholas Piggin <npiggin at gmail.com> --- arch/powerpc/include/asm/paravirt.h | 61 +++++++++++++++++++++++++++++ arch/powerpc/include/asm/spinlock.h | 24 +----------- arch/powerpc/lib/locks.c | 12 +++--- 3 files changed, 68 insertions(+), 29 deletions(-) create mode 100644 arch/powerpc/include/asm/paravirt.h diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h new file mode 100644 index 000000000000..7a8546660a63 --- /dev/null +++ b/arch/powerpc/include/asm/paravirt.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_PARAVIRT_H +#define __ASM_PARAVIRT_H +#ifdef __KERNEL__ + +#include <linux/jump_label.h> +#include <asm/smp.h> +#ifdef CONFIG_PPC64 +#include <asm/paca.h> +#include <asm/hvcall.h> +#endif + +#ifdef CONFIG_PPC_SPLPAR +DECLARE_STATIC_KEY_FALSE(shared_processor); + +static inline bool is_shared_processor(void) +{ + return static_branch_unlikely(&shared_processor); +} + +/* If bit 0 is set, the cpu has been preempted */ +static inline u32 yield_count_of(int cpu) +{ + __be32 yield_count = READ_ONCE(lppaca_of(cpu).yield_count); + return be32_to_cpu(yield_count); +} + +static inline void yield_to_preempted(int cpu, u32 yield_count) +{ + plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count); +} +#else +static inline bool is_shared_processor(void) +{ + return false; +} + +static inline u32 yield_count_of(int cpu) +{ + return 0; +} + +extern void ___bad_yield_to_preempted(void); +static inline void yield_to_preempted(int cpu, u32 yield_count) +{ + ___bad_yield_to_preempted(); /* This would be a bug */ +} +#endif + +#define vcpu_is_preempted vcpu_is_preempted +static inline bool vcpu_is_preempted(int cpu) +{ + if (!is_shared_processor()) + return false; + if (yield_count_of(cpu) & 1) + return true; + return false; +} + +#endif /* __KERNEL__ */ +#endif /* __ASM_PARAVIRT_H */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 2d620896cdae..79be9bb10bbb 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -15,11 +15,10 @@ * * (the type definitions are in asm/spinlock_types.h) */ -#include <linux/jump_label.h> #include <linux/irqflags.h> +#include <asm/paravirt.h> #ifdef CONFIG_PPC64 #include <asm/paca.h> -#include <asm/hvcall.h> #endif #include <asm/synch.h> #include <asm/ppc-opcode.h> @@ -35,18 +34,6 @@ #define LOCK_TOKEN 1 #endif -#ifdef CONFIG_PPC_PSERIES -DECLARE_STATIC_KEY_FALSE(shared_processor); - -#define vcpu_is_preempted vcpu_is_preempted -static inline bool vcpu_is_preempted(int cpu) -{ - if (!static_branch_unlikely(&shared_processor)) - return false; - return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1); -} -#endif - static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) { return lock.slock == 0; @@ -110,15 +97,6 @@ static inline void splpar_spin_yield(arch_spinlock_t *lock) {}; static inline void splpar_rw_yield(arch_rwlock_t *lock) {}; #endif -static inline bool is_shared_processor(void) -{ -#ifdef CONFIG_PPC_SPLPAR - return static_branch_unlikely(&shared_processor); -#else - return false; -#endif -} - static inline void spin_yield(arch_spinlock_t *lock) { if (is_shared_processor()) diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 6440d5943c00..04165b7a163f 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -27,14 +27,14 @@ void splpar_spin_yield(arch_spinlock_t *lock) return; holder_cpu = lock_value & 0xffff; BUG_ON(holder_cpu >= NR_CPUS); - yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count); + + yield_count = yield_count_of(holder_cpu); if ((yield_count & 1) == 0) return; /* virtual cpu is currently running */ rmb(); if (lock->slock != lock_value) return; /* something has changed */ - plpar_hcall_norets(H_CONFER, - get_hard_smp_processor_id(holder_cpu), yield_count); + yield_to_preempted(holder_cpu, yield_count); } EXPORT_SYMBOL_GPL(splpar_spin_yield); @@ -53,13 +53,13 @@ void splpar_rw_yield(arch_rwlock_t *rw) return; /* no write lock at present */ holder_cpu = lock_value & 0xffff; BUG_ON(holder_cpu >= NR_CPUS); - yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count); + + yield_count = yield_count_of(holder_cpu); if ((yield_count & 1) == 0) return; /* virtual cpu is currently running */ rmb(); if (rw->lock != lock_value) return; /* something has changed */ - plpar_hcall_norets(H_CONFER, - get_hard_smp_processor_id(holder_cpu), yield_count); + yield_to_preempted(holder_cpu, yield_count); } #endif -- 2.23.0
Nicholas Piggin
2020-Jul-06 04:35 UTC
[PATCH v3 3/6] powerpc: move spinlock implementation to simple_spinlock
To prepare for queued spinlocks. This is a simple rename except to update preprocessor guard name and a file reference. Signed-off-by: Nicholas Piggin <npiggin at gmail.com> --- arch/powerpc/include/asm/simple_spinlock.h | 292 ++++++++++++++++++ .../include/asm/simple_spinlock_types.h | 21 ++ arch/powerpc/include/asm/spinlock.h | 285 +---------------- arch/powerpc/include/asm/spinlock_types.h | 12 +- 4 files changed, 315 insertions(+), 295 deletions(-) create mode 100644 arch/powerpc/include/asm/simple_spinlock.h create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h new file mode 100644 index 000000000000..e048c041c4a9 --- /dev/null +++ b/arch/powerpc/include/asm/simple_spinlock.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_SIMPLE_SPINLOCK_H +#define __ASM_SIMPLE_SPINLOCK_H +#ifdef __KERNEL__ + +/* + * Simple spin lock operations. + * + * Copyright (C) 2001-2004 Paul Mackerras <paulus at au.ibm.com>, IBM + * Copyright (C) 2001 Anton Blanchard <anton at au.ibm.com>, IBM + * Copyright (C) 2002 Dave Engebretsen <engebret at us.ibm.com>, IBM + * Rework to support virtual processors + * + * Type of int is used as a full 64b word is not necessary. + * + * (the type definitions are in asm/simple_spinlock_types.h) + */ +#include <linux/irqflags.h> +#include <asm/paravirt.h> +#ifdef CONFIG_PPC64 +#include <asm/paca.h> +#endif +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +#ifdef CONFIG_PPC64 +/* use 0x800000yy when locked, where yy == CPU number */ +#ifdef __BIG_ENDIAN__ +#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) +#else +#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index)) +#endif +#else +#define LOCK_TOKEN 1 +#endif + +static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + return lock.slock == 0; +} + +static inline int arch_spin_is_locked(arch_spinlock_t *lock) +{ + smp_mb(); + return !arch_spin_value_unlocked(*lock); +} + +/* + * This returns the old value in the lock, so we succeeded + * in getting the lock if the return value is 0. + */ +static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) +{ + unsigned long tmp, token; + + token = LOCK_TOKEN; + __asm__ __volatile__( +"1: " PPC_LWARX(%0,0,%2,1) "\n\ + cmpwi 0,%0,0\n\ + bne- 2f\n\ + stwcx. %1,0,%2\n\ + bne- 1b\n" + PPC_ACQUIRE_BARRIER +"2:" + : "=&r" (tmp) + : "r" (token), "r" (&lock->slock) + : "cr0", "memory"); + + return tmp; +} + +static inline int arch_spin_trylock(arch_spinlock_t *lock) +{ + return __arch_spin_trylock(lock) == 0; +} + +/* + * On a system with shared processors (that is, where a physical + * processor is multiplexed between several virtual processors), + * there is no point spinning on a lock if the holder of the lock + * isn't currently scheduled on a physical processor. Instead + * we detect this situation and ask the hypervisor to give the + * rest of our timeslice to the lock holder. + * + * So that we can tell which virtual processor is holding a lock, + * we put 0x80000000 | smp_processor_id() in the lock when it is + * held. Conveniently, we have a word in the paca that holds this + * value. + */ + +#if defined(CONFIG_PPC_SPLPAR) +/* We only yield to the hypervisor if we are in shared processor mode */ +void splpar_spin_yield(arch_spinlock_t *lock); +void splpar_rw_yield(arch_rwlock_t *lock); +#else /* SPLPAR */ +static inline void splpar_spin_yield(arch_spinlock_t *lock) {}; +static inline void splpar_rw_yield(arch_rwlock_t *lock) {}; +#endif + +static inline void spin_yield(arch_spinlock_t *lock) +{ + if (is_shared_processor()) + splpar_spin_yield(lock); + else + barrier(); +} + +static inline void rw_yield(arch_rwlock_t *lock) +{ + if (is_shared_processor()) + splpar_rw_yield(lock); + else + barrier(); +} + +static inline void arch_spin_lock(arch_spinlock_t *lock) +{ + while (1) { + if (likely(__arch_spin_trylock(lock) == 0)) + break; + do { + HMT_low(); + if (is_shared_processor()) + splpar_spin_yield(lock); + } while (unlikely(lock->slock != 0)); + HMT_medium(); + } +} + +static inline +void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) +{ + unsigned long flags_dis; + + while (1) { + if (likely(__arch_spin_trylock(lock) == 0)) + break; + local_save_flags(flags_dis); + local_irq_restore(flags); + do { + HMT_low(); + if (is_shared_processor()) + splpar_spin_yield(lock); + } while (unlikely(lock->slock != 0)); + HMT_medium(); + local_irq_restore(flags_dis); + } +} +#define arch_spin_lock_flags arch_spin_lock_flags + +static inline void arch_spin_unlock(arch_spinlock_t *lock) +{ + __asm__ __volatile__("# arch_spin_unlock\n\t" + PPC_RELEASE_BARRIER: : :"memory"); + lock->slock = 0; +} + +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! it is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get a + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. + */ + +#ifdef CONFIG_PPC64 +#define __DO_SIGN_EXTEND "extsw %0,%0\n" +#define WRLOCK_TOKEN LOCK_TOKEN /* it's negative */ +#else +#define __DO_SIGN_EXTEND +#define WRLOCK_TOKEN (-1) +#endif + +/* + * This returns the old value in the lock + 1, + * so we got a read lock if the return value is > 0. + */ +static inline long __arch_read_trylock(arch_rwlock_t *rw) +{ + long tmp; + + __asm__ __volatile__( +"1: " PPC_LWARX(%0,0,%1,1) "\n" + __DO_SIGN_EXTEND +" addic. %0,%0,1\n\ + ble- 2f\n" +" stwcx. %0,0,%1\n\ + bne- 1b\n" + PPC_ACQUIRE_BARRIER +"2:" : "=&r" (tmp) + : "r" (&rw->lock) + : "cr0", "xer", "memory"); + + return tmp; +} + +/* + * This returns the old value in the lock, + * so we got the write lock if the return value is 0. + */ +static inline long __arch_write_trylock(arch_rwlock_t *rw) +{ + long tmp, token; + + token = WRLOCK_TOKEN; + __asm__ __volatile__( +"1: " PPC_LWARX(%0,0,%2,1) "\n\ + cmpwi 0,%0,0\n\ + bne- 2f\n" +" stwcx. %1,0,%2\n\ + bne- 1b\n" + PPC_ACQUIRE_BARRIER +"2:" : "=&r" (tmp) + : "r" (token), "r" (&rw->lock) + : "cr0", "memory"); + + return tmp; +} + +static inline void arch_read_lock(arch_rwlock_t *rw) +{ + while (1) { + if (likely(__arch_read_trylock(rw) > 0)) + break; + do { + HMT_low(); + if (is_shared_processor()) + splpar_rw_yield(rw); + } while (unlikely(rw->lock < 0)); + HMT_medium(); + } +} + +static inline void arch_write_lock(arch_rwlock_t *rw) +{ + while (1) { + if (likely(__arch_write_trylock(rw) == 0)) + break; + do { + HMT_low(); + if (is_shared_processor()) + splpar_rw_yield(rw); + } while (unlikely(rw->lock != 0)); + HMT_medium(); + } +} + +static inline int arch_read_trylock(arch_rwlock_t *rw) +{ + return __arch_read_trylock(rw) > 0; +} + +static inline int arch_write_trylock(arch_rwlock_t *rw) +{ + return __arch_write_trylock(rw) == 0; +} + +static inline void arch_read_unlock(arch_rwlock_t *rw) +{ + long tmp; + + __asm__ __volatile__( + "# read_unlock\n\t" + PPC_RELEASE_BARRIER +"1: lwarx %0,0,%1\n\ + addic %0,%0,-1\n" +" stwcx. %0,0,%1\n\ + bne- 1b" + : "=&r"(tmp) + : "r"(&rw->lock) + : "cr0", "xer", "memory"); +} + +static inline void arch_write_unlock(arch_rwlock_t *rw) +{ + __asm__ __volatile__("# write_unlock\n\t" + PPC_RELEASE_BARRIER: : :"memory"); + rw->lock = 0; +} + +#define arch_spin_relax(lock) spin_yield(lock) +#define arch_read_relax(lock) rw_yield(lock) +#define arch_write_relax(lock) rw_yield(lock) + +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() + +#endif /* __KERNEL__ */ +#endif /* __ASM_SIMPLE_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h new file mode 100644 index 000000000000..7c2b48ce62dc --- /dev/null +++ b/arch/powerpc/include/asm/simple_spinlock_types.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H +#define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +# error "please don't include this file directly" +#endif + +typedef struct { + volatile unsigned int slock; +} arch_spinlock_t; + +#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } + +typedef struct { + volatile signed int lock; +} arch_rwlock_t; + +#define __ARCH_RW_LOCK_UNLOCKED { 0 } + +#endif diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 79be9bb10bbb..21357fe05fe0 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -3,290 +3,7 @@ #define __ASM_SPINLOCK_H #ifdef __KERNEL__ -/* - * Simple spin lock operations. - * - * Copyright (C) 2001-2004 Paul Mackerras <paulus at au.ibm.com>, IBM - * Copyright (C) 2001 Anton Blanchard <anton at au.ibm.com>, IBM - * Copyright (C) 2002 Dave Engebretsen <engebret at us.ibm.com>, IBM - * Rework to support virtual processors - * - * Type of int is used as a full 64b word is not necessary. - * - * (the type definitions are in asm/spinlock_types.h) - */ -#include <linux/irqflags.h> -#include <asm/paravirt.h> -#ifdef CONFIG_PPC64 -#include <asm/paca.h> -#endif -#include <asm/synch.h> -#include <asm/ppc-opcode.h> - -#ifdef CONFIG_PPC64 -/* use 0x800000yy when locked, where yy == CPU number */ -#ifdef __BIG_ENDIAN__ -#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) -#else -#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index)) -#endif -#else -#define LOCK_TOKEN 1 -#endif - -static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) -{ - return lock.slock == 0; -} - -static inline int arch_spin_is_locked(arch_spinlock_t *lock) -{ - smp_mb(); - return !arch_spin_value_unlocked(*lock); -} - -/* - * This returns the old value in the lock, so we succeeded - * in getting the lock if the return value is 0. - */ -static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) -{ - unsigned long tmp, token; - - token = LOCK_TOKEN; - __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%2,1) "\n\ - cmpwi 0,%0,0\n\ - bne- 2f\n\ - stwcx. %1,0,%2\n\ - bne- 1b\n" - PPC_ACQUIRE_BARRIER -"2:" - : "=&r" (tmp) - : "r" (token), "r" (&lock->slock) - : "cr0", "memory"); - - return tmp; -} - -static inline int arch_spin_trylock(arch_spinlock_t *lock) -{ - return __arch_spin_trylock(lock) == 0; -} - -/* - * On a system with shared processors (that is, where a physical - * processor is multiplexed between several virtual processors), - * there is no point spinning on a lock if the holder of the lock - * isn't currently scheduled on a physical processor. Instead - * we detect this situation and ask the hypervisor to give the - * rest of our timeslice to the lock holder. - * - * So that we can tell which virtual processor is holding a lock, - * we put 0x80000000 | smp_processor_id() in the lock when it is - * held. Conveniently, we have a word in the paca that holds this - * value. - */ - -#if defined(CONFIG_PPC_SPLPAR) -/* We only yield to the hypervisor if we are in shared processor mode */ -void splpar_spin_yield(arch_spinlock_t *lock); -void splpar_rw_yield(arch_rwlock_t *lock); -#else /* SPLPAR */ -static inline void splpar_spin_yield(arch_spinlock_t *lock) {}; -static inline void splpar_rw_yield(arch_rwlock_t *lock) {}; -#endif - -static inline void spin_yield(arch_spinlock_t *lock) -{ - if (is_shared_processor()) - splpar_spin_yield(lock); - else - barrier(); -} - -static inline void rw_yield(arch_rwlock_t *lock) -{ - if (is_shared_processor()) - splpar_rw_yield(lock); - else - barrier(); -} - -static inline void arch_spin_lock(arch_spinlock_t *lock) -{ - while (1) { - if (likely(__arch_spin_trylock(lock) == 0)) - break; - do { - HMT_low(); - if (is_shared_processor()) - splpar_spin_yield(lock); - } while (unlikely(lock->slock != 0)); - HMT_medium(); - } -} - -static inline -void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - unsigned long flags_dis; - - while (1) { - if (likely(__arch_spin_trylock(lock) == 0)) - break; - local_save_flags(flags_dis); - local_irq_restore(flags); - do { - HMT_low(); - if (is_shared_processor()) - splpar_spin_yield(lock); - } while (unlikely(lock->slock != 0)); - HMT_medium(); - local_irq_restore(flags_dis); - } -} -#define arch_spin_lock_flags arch_spin_lock_flags - -static inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - __asm__ __volatile__("# arch_spin_unlock\n\t" - PPC_RELEASE_BARRIER: : :"memory"); - lock->slock = 0; -} - -/* - * Read-write spinlocks, allowing multiple readers - * but only one writer. - * - * NOTE! it is quite common to have readers in interrupts - * but no interrupt writers. For those circumstances we - * can "mix" irq-safe locks - any writer needs to get a - * irq-safe write-lock, but readers can get non-irqsafe - * read-locks. - */ - -#ifdef CONFIG_PPC64 -#define __DO_SIGN_EXTEND "extsw %0,%0\n" -#define WRLOCK_TOKEN LOCK_TOKEN /* it's negative */ -#else -#define __DO_SIGN_EXTEND -#define WRLOCK_TOKEN (-1) -#endif - -/* - * This returns the old value in the lock + 1, - * so we got a read lock if the return value is > 0. - */ -static inline long __arch_read_trylock(arch_rwlock_t *rw) -{ - long tmp; - - __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%1,1) "\n" - __DO_SIGN_EXTEND -" addic. %0,%0,1\n\ - ble- 2f\n" -" stwcx. %0,0,%1\n\ - bne- 1b\n" - PPC_ACQUIRE_BARRIER -"2:" : "=&r" (tmp) - : "r" (&rw->lock) - : "cr0", "xer", "memory"); - - return tmp; -} - -/* - * This returns the old value in the lock, - * so we got the write lock if the return value is 0. - */ -static inline long __arch_write_trylock(arch_rwlock_t *rw) -{ - long tmp, token; - - token = WRLOCK_TOKEN; - __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%2,1) "\n\ - cmpwi 0,%0,0\n\ - bne- 2f\n" -" stwcx. %1,0,%2\n\ - bne- 1b\n" - PPC_ACQUIRE_BARRIER -"2:" : "=&r" (tmp) - : "r" (token), "r" (&rw->lock) - : "cr0", "memory"); - - return tmp; -} - -static inline void arch_read_lock(arch_rwlock_t *rw) -{ - while (1) { - if (likely(__arch_read_trylock(rw) > 0)) - break; - do { - HMT_low(); - if (is_shared_processor()) - splpar_rw_yield(rw); - } while (unlikely(rw->lock < 0)); - HMT_medium(); - } -} - -static inline void arch_write_lock(arch_rwlock_t *rw) -{ - while (1) { - if (likely(__arch_write_trylock(rw) == 0)) - break; - do { - HMT_low(); - if (is_shared_processor()) - splpar_rw_yield(rw); - } while (unlikely(rw->lock != 0)); - HMT_medium(); - } -} - -static inline int arch_read_trylock(arch_rwlock_t *rw) -{ - return __arch_read_trylock(rw) > 0; -} - -static inline int arch_write_trylock(arch_rwlock_t *rw) -{ - return __arch_write_trylock(rw) == 0; -} - -static inline void arch_read_unlock(arch_rwlock_t *rw) -{ - long tmp; - - __asm__ __volatile__( - "# read_unlock\n\t" - PPC_RELEASE_BARRIER -"1: lwarx %0,0,%1\n\ - addic %0,%0,-1\n" -" stwcx. %0,0,%1\n\ - bne- 1b" - : "=&r"(tmp) - : "r"(&rw->lock) - : "cr0", "xer", "memory"); -} - -static inline void arch_write_unlock(arch_rwlock_t *rw) -{ - __asm__ __volatile__("# write_unlock\n\t" - PPC_RELEASE_BARRIER: : :"memory"); - rw->lock = 0; -} - -#define arch_spin_relax(lock) spin_yield(lock) -#define arch_read_relax(lock) rw_yield(lock) -#define arch_write_relax(lock) rw_yield(lock) - -/* See include/linux/spinlock.h */ -#define smp_mb__after_spinlock() smp_mb() +#include <asm/simple_spinlock.h> #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index 87adaf13b7e8..3906f52dae65 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -6,16 +6,6 @@ # error "please don't include this file directly" #endif -typedef struct { - volatile unsigned int slock; -} arch_spinlock_t; - -#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } - -typedef struct { - volatile signed int lock; -} arch_rwlock_t; - -#define __ARCH_RW_LOCK_UNLOCKED { 0 } +#include <asm/simple_spinlock_types.h> #endif -- 2.23.0
Nicholas Piggin
2020-Jul-06 04:35 UTC
[PATCH v3 4/6] powerpc/64s: implement queued spinlocks and rwlocks
These have shown significantly improved performance and fairness when spinlock contention is moderate to high on very large systems. [ Numbers hopefully forthcoming after more testing, but initial results look good ] Thanks to the fast path, single threaded performance is not noticably hurt. Signed-off-by: Nicholas Piggin <npiggin at gmail.com> --- arch/powerpc/Kconfig | 13 ++++++++++++ arch/powerpc/include/asm/Kbuild | 2 ++ arch/powerpc/include/asm/qspinlock.h | 25 +++++++++++++++++++++++ arch/powerpc/include/asm/spinlock.h | 5 +++++ arch/powerpc/include/asm/spinlock_types.h | 5 +++++ arch/powerpc/lib/Makefile | 3 +++ include/asm-generic/qspinlock.h | 2 ++ 7 files changed, 55 insertions(+) create mode 100644 arch/powerpc/include/asm/qspinlock.h diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 24ac85c868db..17663ea57697 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -146,6 +146,8 @@ config PPC select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS + select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF @@ -492,6 +494,17 @@ config HOTPLUG_CPU Say N if you are unsure. +config PPC_QUEUED_SPINLOCKS + bool "Queued spinlocks" + depends on SMP + default "y" if PPC_BOOK3S_64 + help + Say Y here to use to use queued spinlocks which are more complex + but give better salability and fairness on large SMP and NUMA + systems. + + If unsure, say "Y" if you have lots of cores, otherwise "N". + config ARCH_CPU_PROBE_RELEASE def_bool y depends on HOTPLUG_CPU diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index dadbcf3a0b1e..1dd8b6adff5e 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h generic-y += export.h generic-y += local64.h generic-y += mcs_spinlock.h +generic-y += qrwlock.h +generic-y += qspinlock.h generic-y += vtime.h generic-y += early_ioremap.h diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h new file mode 100644 index 000000000000..c49e33e24edd --- /dev/null +++ b/arch/powerpc/include/asm/qspinlock.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_QSPINLOCK_H +#define _ASM_POWERPC_QSPINLOCK_H + +#include <asm-generic/qspinlock_types.h> + +#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */ + +#define smp_mb__after_spinlock() smp_mb() + +static __always_inline int queued_spin_is_locked(struct qspinlock *lock) +{ + /* + * This barrier was added to simple spinlocks by commit 51d7d5205d338, + * but it should now be possible to remove it, asm arm64 has done with + * commit c6f5d02b6a0f. + */ + smp_mb(); + return atomic_read(&lock->val); +} +#define queued_spin_is_locked queued_spin_is_locked + +#include <asm-generic/qspinlock.h> + +#endif /* _ASM_POWERPC_QSPINLOCK_H */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 21357fe05fe0..434615f1d761 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -3,7 +3,12 @@ #define __ASM_SPINLOCK_H #ifdef __KERNEL__ +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#include <asm/qrwlock.h> +#else #include <asm/simple_spinlock.h> +#endif #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index 3906f52dae65..c5d742f18021 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -6,6 +6,11 @@ # error "please don't include this file directly" #endif +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS +#include <asm-generic/qspinlock_types.h> +#include <asm-generic/qrwlock_types.h> +#else #include <asm/simple_spinlock_types.h> +#endif #endif diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 5e994cda8e40..d66a645503eb 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ memcpy_64.o memcpy_mcsafe_64.o +ifndef CONFIG_PPC_QUEUED_SPINLOCKS obj64-$(CONFIG_SMP) += locks.o +endif + obj64-$(CONFIG_ALTIVEC) += vmx-helper.o obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \ test_emulate_step_exec_instr.o diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index fde943d180e0..fb0a814d4395 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -12,6 +12,7 @@ #include <asm-generic/qspinlock_types.h> +#ifndef queued_spin_is_locked /** * queued_spin_is_locked - is the spinlock locked? * @lock: Pointer to queued spinlock structure @@ -25,6 +26,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock) */ return atomic_read(&lock->val); } +#endif /** * queued_spin_value_unlocked - is the spinlock structure unlocked? -- 2.23.0
Nicholas Piggin
2020-Jul-06 04:35 UTC
[PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
Signed-off-by: Nicholas Piggin <npiggin at gmail.com> --- arch/powerpc/include/asm/paravirt.h | 28 ++++++++ arch/powerpc/include/asm/qspinlock.h | 66 +++++++++++++++++++ arch/powerpc/include/asm/qspinlock_paravirt.h | 7 ++ arch/powerpc/platforms/pseries/Kconfig | 5 ++ arch/powerpc/platforms/pseries/setup.c | 6 +- include/asm-generic/qspinlock.h | 2 + 6 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h index 7a8546660a63..f2d51f929cf5 100644 --- a/arch/powerpc/include/asm/paravirt.h +++ b/arch/powerpc/include/asm/paravirt.h @@ -29,6 +29,16 @@ static inline void yield_to_preempted(int cpu, u32 yield_count) { plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count); } + +static inline void prod_cpu(int cpu) +{ + plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); +} + +static inline void yield_to_any(void) +{ + plpar_hcall_norets(H_CONFER, -1, 0); +} #else static inline bool is_shared_processor(void) { @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count) { ___bad_yield_to_preempted(); /* This would be a bug */ } + +extern void ___bad_yield_to_any(void); +static inline void yield_to_any(void) +{ + ___bad_yield_to_any(); /* This would be a bug */ +} + +extern void ___bad_prod_cpu(void); +static inline void prod_cpu(int cpu) +{ + ___bad_prod_cpu(); /* This would be a bug */ +} + #endif #define vcpu_is_preempted vcpu_is_preempted @@ -57,5 +80,10 @@ static inline bool vcpu_is_preempted(int cpu) return false; } +static inline bool pv_is_native_spin_unlock(void) +{ + return !is_shared_processor(); +} + #endif /* __KERNEL__ */ #endif /* __ASM_PARAVIRT_H */ diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index c49e33e24edd..f5066f00a08c 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -3,9 +3,47 @@ #define _ASM_POWERPC_QSPINLOCK_H #include <asm-generic/qspinlock_types.h> +#include <asm/paravirt.h> #define _Q_PENDING_LOOPS (1 << 9) /* not tuned */ +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_queued_spin_unlock(struct qspinlock *lock); + +static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + if (!is_shared_processor()) + native_queued_spin_lock_slowpath(lock, val); + else + __pv_queued_spin_lock_slowpath(lock, val); +} + +#define queued_spin_unlock queued_spin_unlock +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + if (!is_shared_processor()) + smp_store_release(&lock->locked, 0); + else + __pv_queued_spin_unlock(lock); +} + +#else +extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +#endif + +static __always_inline void queued_spin_lock(struct qspinlock *lock) +{ + u32 val = 0; + + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) + return; + + queued_spin_lock_slowpath(lock, val); +} +#define queued_spin_lock queued_spin_lock + #define smp_mb__after_spinlock() smp_mb() static __always_inline int queued_spin_is_locked(struct qspinlock *lock) @@ -20,6 +58,34 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock) } #define queued_spin_is_locked queued_spin_is_locked +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define SPIN_THRESHOLD (1<<15) /* not tuned */ + +static __always_inline void pv_wait(u8 *ptr, u8 val) +{ + if (*ptr != val) + return; + yield_to_any(); + /* + * We could pass in a CPU here if waiting in the queue and yield to + * the previous CPU in the queue. + */ +} + +static __always_inline void pv_kick(int cpu) +{ + prod_cpu(cpu); +} + +extern void __pv_init_lock_hash(void); + +static inline void pv_spinlocks_init(void) +{ + __pv_init_lock_hash(); +} + +#endif + #include <asm-generic/qspinlock.h> #endif /* _ASM_POWERPC_QSPINLOCK_H */ diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h new file mode 100644 index 000000000000..750d1b5e0202 --- /dev/null +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_QSPINLOCK_PARAVIRT_H +#define __ASM_QSPINLOCK_PARAVIRT_H + +EXPORT_SYMBOL(__pv_queued_spin_unlock); + +#endif /* __ASM_QSPINLOCK_PARAVIRT_H */ diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 24c18362e5ea..756e727b383f 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -25,9 +25,14 @@ config PPC_PSERIES select SWIOTLB default y +config PARAVIRT_SPINLOCKS + bool + default n + config PPC_SPLPAR depends on PPC_PSERIES bool "Support for shared-processor logical partitions" + select PARAVIRT_SPINLOCKS if PPC_QUEUED_SPINLOCKS help Enabling this option will make the kernel run more efficiently on logically-partitioned pSeries systems which use shared diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 2db8469e475f..747a203d9453 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void) if (firmware_has_feature(FW_FEATURE_LPAR)) { vpa_init(boot_cpuid); - if (lppaca_shared_proc(get_lppaca())) + if (lppaca_shared_proc(get_lppaca())) { static_branch_enable(&shared_processor); +#ifdef CONFIG_PARAVIRT_SPINLOCKS + pv_spinlocks_init(); +#endif + } ppc_md.power_save = pseries_lpar_idle; ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index fb0a814d4395..38ca14e79a86 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -69,6 +69,7 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock) extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +#ifndef queued_spin_lock /** * queued_spin_lock - acquire a queued spinlock * @lock: Pointer to queued spinlock structure @@ -82,6 +83,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock) queued_spin_lock_slowpath(lock, val); } +#endif #ifndef queued_spin_unlock /** -- 2.23.0
Nicholas Piggin
2020-Jul-06 04:35 UTC
[PATCH v3 6/6] powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the lock hint
This brings the behaviour of the uncontended fast path back to roughly equivalent to simple spinlocks -- a single atomic op with lock hint. Signed-off-by: Nicholas Piggin <npiggin at gmail.com> --- arch/powerpc/include/asm/atomic.h | 28 ++++++++++++++++++++++++++++ arch/powerpc/include/asm/qspinlock.h | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 498785ffc25f..f6a3d145ffb7 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -193,6 +193,34 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) #define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new)) +/* + * Don't want to override the generic atomic_try_cmpxchg_acquire, because + * we add a lock hint to the lwarx, which may not be wanted for the + * _acquire case (and is not used by the other _acquire variants so it + * would be a surprise). + */ +static __always_inline bool +atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) +{ + int r, o = *old; + + __asm__ __volatile__ ( +"1:\t" PPC_LWARX(%0,0,%2,1) " # atomic_try_cmpxchg_acquire \n" +" cmpw 0,%0,%3 \n" +" bne- 2f \n" +" stwcx. %4,0,%2 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (r), "+m" (v->counter) + : "r" (&v->counter), "r" (o), "r" (new) + : "cr0", "memory"); + + if (unlikely(r != o)) + *old = r; + return likely(r == o); +} + /** * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index f5066f00a08c..b752d34517b3 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -37,7 +37,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock) { u32 val = 0; - if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) + if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL))) return; queued_spin_lock_slowpath(lock, val); -- 2.23.0
On 7/6/20 12:35 AM, Nicholas Piggin wrote:> v3 is updated to use __pv_queued_spin_unlock, noticed by Waiman (thank you). > > Thanks, > Nick > > Nicholas Piggin (6): > powerpc/powernv: must include hvcall.h to get PAPR defines > powerpc/pseries: move some PAPR paravirt functions to their own file > powerpc: move spinlock implementation to simple_spinlock > powerpc/64s: implement queued spinlocks and rwlocks > powerpc/pseries: implement paravirt qspinlocks for SPLPAR > powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the > lock hint > > arch/powerpc/Kconfig | 13 + > arch/powerpc/include/asm/Kbuild | 2 + > arch/powerpc/include/asm/atomic.h | 28 ++ > arch/powerpc/include/asm/paravirt.h | 89 +++++ > arch/powerpc/include/asm/qspinlock.h | 91 ++++++ > arch/powerpc/include/asm/qspinlock_paravirt.h | 7 + > arch/powerpc/include/asm/simple_spinlock.h | 292 +++++++++++++++++ > .../include/asm/simple_spinlock_types.h | 21 ++ > arch/powerpc/include/asm/spinlock.h | 308 +----------------- > arch/powerpc/include/asm/spinlock_types.h | 17 +- > arch/powerpc/lib/Makefile | 3 + > arch/powerpc/lib/locks.c | 12 +- > arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 + > arch/powerpc/platforms/pseries/Kconfig | 5 + > arch/powerpc/platforms/pseries/setup.c | 6 +- > include/asm-generic/qspinlock.h | 4 + > 16 files changed, 577 insertions(+), 322 deletions(-) > create mode 100644 arch/powerpc/include/asm/paravirt.h > create mode 100644 arch/powerpc/include/asm/qspinlock.h > create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h > create mode 100644 arch/powerpc/include/asm/simple_spinlock.h > create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h >This patch looks OK to me. I had run some microbenchmark on powerpc system with or w/o the patch. On a 2-socket 160-thread SMT4 POWER9 system (not virtualized): 5.8.0-rc4 ======== Running locktest with spinlock [runtime = 10s, load = 1] Threads = 160, Min/Mean/Max = 77,665/90,153/106,895 Threads = 160, Total Rate = 1,441,759 op/s; Percpu Rate = 9,011 op/s Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] Threads = 160, Min/Mean/Max = 47,879/53,807/63,689 Threads = 160, Total Rate = 860,192 op/s; Percpu Rate = 5,376 op/s Running locktest with spinlock [runtime = 10s, load = 1] Threads = 80, Min/Mean/Max = 242,907/319,514/463,161 Threads = 80, Total Rate = 2,555 kop/s; Percpu Rate = 32 kop/s Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] Threads = 80, Min/Mean/Max = 146,161/187,474/259,270 Threads = 80, Total Rate = 1,498 kop/s; Percpu Rate = 19 kop/s Running locktest with spinlock [runtime = 10s, load = 1] Threads = 40, Min/Mean/Max = 646,639/1,000,817/1,455,205 Threads = 40, Total Rate = 4,001 kop/s; Percpu Rate = 100 kop/s Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] Threads = 40, Min/Mean/Max = 402,165/597,132/814,555 Threads = 40, Total Rate = 2,388 kop/s; Percpu Rate = 60 kop/s 5.8.0-rc4-qlock+ =============== Running locktest with spinlock [runtime = 10s, load = 1] Threads = 160, Min/Mean/Max = 123,835/124,580/124,587 Threads = 160, Total Rate = 1,992 kop/s; Percpu Rate = 12 kop/s Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] Threads = 160, Min/Mean/Max = 254,210/264,714/276,784 Threads = 160, Total Rate = 4,231 kop/s; Percpu Rate = 26 kop/s Running locktest with spinlock [runtime = 10s, load = 1] Threads = 80, Min/Mean/Max = 599,715/603,397/603,450 Threads = 80, Total Rate = 4,825 kop/s; Percpu Rate = 60 kop/s Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] Threads = 80, Min/Mean/Max = 492,687/525,224/567,456 Threads = 80, Total Rate = 4,199 kop/s; Percpu Rate = 52 kop/s Running locktest with spinlock [runtime = 10s, load = 1] Threads = 40, Min/Mean/Max = 1,325,623/1,325,628/1,325,636 Threads = 40, Total Rate = 5,299 kop/s; Percpu Rate = 132 kop/s Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] Threads = 40, Min/Mean/Max = 1,249,731/1,292,977/1,342,815 Threads = 40, Total Rate = 5,168 kop/s; Percpu Rate = 129 kop/s On systems on large number of cpus, qspinlock lock is faster and more fair. With some tuning, we may be able to squeeze out more performance. Cheers, Longman
Nicholas Piggin
2020-Jul-07 05:57 UTC
[PATCH v3 0/6] powerpc: queued spinlocks and rwlocks
Excerpts from Waiman Long's message of July 7, 2020 4:39 am:> On 7/6/20 12:35 AM, Nicholas Piggin wrote: >> v3 is updated to use __pv_queued_spin_unlock, noticed by Waiman (thank you). >> >> Thanks, >> Nick >> >> Nicholas Piggin (6): >> powerpc/powernv: must include hvcall.h to get PAPR defines >> powerpc/pseries: move some PAPR paravirt functions to their own file >> powerpc: move spinlock implementation to simple_spinlock >> powerpc/64s: implement queued spinlocks and rwlocks >> powerpc/pseries: implement paravirt qspinlocks for SPLPAR >> powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the >> lock hint >> >> arch/powerpc/Kconfig | 13 + >> arch/powerpc/include/asm/Kbuild | 2 + >> arch/powerpc/include/asm/atomic.h | 28 ++ >> arch/powerpc/include/asm/paravirt.h | 89 +++++ >> arch/powerpc/include/asm/qspinlock.h | 91 ++++++ >> arch/powerpc/include/asm/qspinlock_paravirt.h | 7 + >> arch/powerpc/include/asm/simple_spinlock.h | 292 +++++++++++++++++ >> .../include/asm/simple_spinlock_types.h | 21 ++ >> arch/powerpc/include/asm/spinlock.h | 308 +----------------- >> arch/powerpc/include/asm/spinlock_types.h | 17 +- >> arch/powerpc/lib/Makefile | 3 + >> arch/powerpc/lib/locks.c | 12 +- >> arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 + >> arch/powerpc/platforms/pseries/Kconfig | 5 + >> arch/powerpc/platforms/pseries/setup.c | 6 +- >> include/asm-generic/qspinlock.h | 4 + >> 16 files changed, 577 insertions(+), 322 deletions(-) >> create mode 100644 arch/powerpc/include/asm/paravirt.h >> create mode 100644 arch/powerpc/include/asm/qspinlock.h >> create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h >> create mode 100644 arch/powerpc/include/asm/simple_spinlock.h >> create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h >> > This patch looks OK to me.Thanks for reviewing and testing.> I had run some microbenchmark on powerpc system with or w/o the patch. > > On a 2-socket 160-thread SMT4 POWER9 system (not virtualized): > > 5.8.0-rc4 > ========> > Running locktest with spinlock [runtime = 10s, load = 1] > Threads = 160, Min/Mean/Max = 77,665/90,153/106,895 > Threads = 160, Total Rate = 1,441,759 op/s; Percpu Rate = 9,011 op/s > > Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] > Threads = 160, Min/Mean/Max = 47,879/53,807/63,689 > Threads = 160, Total Rate = 860,192 op/s; Percpu Rate = 5,376 op/s > > Running locktest with spinlock [runtime = 10s, load = 1] > Threads = 80, Min/Mean/Max = 242,907/319,514/463,161 > Threads = 80, Total Rate = 2,555 kop/s; Percpu Rate = 32 kop/s > > Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] > Threads = 80, Min/Mean/Max = 146,161/187,474/259,270 > Threads = 80, Total Rate = 1,498 kop/s; Percpu Rate = 19 kop/s > > Running locktest with spinlock [runtime = 10s, load = 1] > Threads = 40, Min/Mean/Max = 646,639/1,000,817/1,455,205 > Threads = 40, Total Rate = 4,001 kop/s; Percpu Rate = 100 kop/s > > Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] > Threads = 40, Min/Mean/Max = 402,165/597,132/814,555 > Threads = 40, Total Rate = 2,388 kop/s; Percpu Rate = 60 kop/s > > 5.8.0-rc4-qlock+ > ===============> > Running locktest with spinlock [runtime = 10s, load = 1] > Threads = 160, Min/Mean/Max = 123,835/124,580/124,587 > Threads = 160, Total Rate = 1,992 kop/s; Percpu Rate = 12 kop/s > > Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] > Threads = 160, Min/Mean/Max = 254,210/264,714/276,784 > Threads = 160, Total Rate = 4,231 kop/s; Percpu Rate = 26 kop/s > > Running locktest with spinlock [runtime = 10s, load = 1] > Threads = 80, Min/Mean/Max = 599,715/603,397/603,450 > Threads = 80, Total Rate = 4,825 kop/s; Percpu Rate = 60 kop/s > > Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] > Threads = 80, Min/Mean/Max = 492,687/525,224/567,456 > Threads = 80, Total Rate = 4,199 kop/s; Percpu Rate = 52 kop/s > > Running locktest with spinlock [runtime = 10s, load = 1] > Threads = 40, Min/Mean/Max = 1,325,623/1,325,628/1,325,636 > Threads = 40, Total Rate = 5,299 kop/s; Percpu Rate = 132 kop/s > > Running locktest with rwlock [runtime = 10s, r% = 50%, load = 1] > Threads = 40, Min/Mean/Max = 1,249,731/1,292,977/1,342,815 > Threads = 40, Total Rate = 5,168 kop/s; Percpu Rate = 129 kop/s > > On systems on large number of cpus, qspinlock lock is faster and more fair. > > With some tuning, we may be able to squeeze out more performance.Yes, powerpc could certainly get more performance out of the slow paths, and then there are a few parameters to tune. We don't have a good alternate patching for function calls yet, but that would be something to do for native vs pv. And then there seem to be one or two tunable parameters we could experiment with. The paravirt locks may need a bit more tuning. Some simple testing under KVM shows we might be a bit slower in some cases. Whether this is fairness or something else I'm not sure. The current simple pv spinlock code can do a directed yield to the lock holder CPU, whereas the pv qspl here just does a general yield. I think we might actually be able to change that to also support directed yield. Though I'm not sure if this is actually the cause of the slowdown yet. Thanks, Nick
Michael Ellerman
2020-Jul-09 10:05 UTC
[PATCH v3 1/6] powerpc/powernv: must include hvcall.h to get PAPR defines
Nicholas Piggin <npiggin at gmail.com> writes:> An include goes away in future patches which breaks compilation > without this. > > Signed-off-by: Nicholas Piggin <npiggin at gmail.com> > --- > arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c > index f923359d8afc..8eba6ece7808 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c > @@ -15,6 +15,7 @@ > > #include <asm/iommu.h> > #include <asm/tce.h> > +#include <asm/hvcall.h> /* share error returns with PAPR */ > #include "pci.h" > > unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) > -- > 2.23.0This isn't needed anymore AFAICS, since: 5f202c1a1d42 ("powerpc/powernv/ioda: Return correct error if TCE level allocation failed") cheers
Michael Ellerman
2020-Jul-09 10:11 UTC
[PATCH v3 2/6] powerpc/pseries: move some PAPR paravirt functions to their own file
Nicholas Piggin <npiggin at gmail.com> writes:>Little bit of changelog would be nice :D> Signed-off-by: Nicholas Piggin <npiggin at gmail.com> > --- > arch/powerpc/include/asm/paravirt.h | 61 +++++++++++++++++++++++++++++ > arch/powerpc/include/asm/spinlock.h | 24 +----------- > arch/powerpc/lib/locks.c | 12 +++--- > 3 files changed, 68 insertions(+), 29 deletions(-) > create mode 100644 arch/powerpc/include/asm/paravirt.h > > diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h > new file mode 100644 > index 000000000000..7a8546660a63 > --- /dev/null > +++ b/arch/powerpc/include/asm/paravirt.h > @@ -0,0 +1,61 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > +#ifndef __ASM_PARAVIRT_H > +#define __ASM_PARAVIRT_HShould be _ASM_POWERPC_PARAVIRT_H> +#ifdef __KERNEL__We shouldn't need __KERNEL__ in here, it's not a uapi header. cheers
Michael Ellerman
2020-Jul-09 10:15 UTC
[PATCH v3 3/6] powerpc: move spinlock implementation to simple_spinlock
Nicholas Piggin <npiggin at gmail.com> writes:> To prepare for queued spinlocks. This is a simple rename except to update > preprocessor guard name and a file reference. > > Signed-off-by: Nicholas Piggin <npiggin at gmail.com> > --- > arch/powerpc/include/asm/simple_spinlock.h | 292 ++++++++++++++++++ > .../include/asm/simple_spinlock_types.h | 21 ++ > arch/powerpc/include/asm/spinlock.h | 285 +---------------- > arch/powerpc/include/asm/spinlock_types.h | 12 +- > 4 files changed, 315 insertions(+), 295 deletions(-) > create mode 100644 arch/powerpc/include/asm/simple_spinlock.h > create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h > > diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h > new file mode 100644 > index 000000000000..e048c041c4a9 > --- /dev/null > +++ b/arch/powerpc/include/asm/simple_spinlock.h > @@ -0,0 +1,292 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > +#ifndef __ASM_SIMPLE_SPINLOCK_H > +#define __ASM_SIMPLE_SPINLOCK_H_ASM_POWERPC_SIMPLE_SPINLOCK_H> +#ifdef __KERNEL__Shouldn't be necessary.> +/* > + * Simple spin lock operations. > + * > + * Copyright (C) 2001-2004 Paul Mackerras <paulus at au.ibm.com>, IBM > + * Copyright (C) 2001 Anton Blanchard <anton at au.ibm.com>, IBM > + * Copyright (C) 2002 Dave Engebretsen <engebret at us.ibm.com>, IBM > + * Rework to support virtual processors > + * > + * Type of int is used as a full 64b word is not necessary. > + * > + * (the type definitions are in asm/simple_spinlock_types.h) > + */ > +#include <linux/irqflags.h> > +#include <asm/paravirt.h> > +#ifdef CONFIG_PPC64 > +#include <asm/paca.h> > +#endifI don't think paca.h needs a CONFIG_PPC64 guard, it contains one. I know you're just moving the code, but still nice to cleanup slightly along the way. cheers
Michael Ellerman
2020-Jul-09 10:20 UTC
[PATCH v3 4/6] powerpc/64s: implement queued spinlocks and rwlocks
Nicholas Piggin <npiggin at gmail.com> writes:> These have shown significantly improved performance and fairness when > spinlock contention is moderate to high on very large systems. > > [ Numbers hopefully forthcoming after more testing, but initial > results look good ]Would be good to have something here, even if it's preliminary.> Thanks to the fast path, single threaded performance is not noticably > hurt. > > Signed-off-by: Nicholas Piggin <npiggin at gmail.com> > --- > arch/powerpc/Kconfig | 13 ++++++++++++ > arch/powerpc/include/asm/Kbuild | 2 ++ > arch/powerpc/include/asm/qspinlock.h | 25 +++++++++++++++++++++++ > arch/powerpc/include/asm/spinlock.h | 5 +++++ > arch/powerpc/include/asm/spinlock_types.h | 5 +++++ > arch/powerpc/lib/Makefile | 3 +++> include/asm-generic/qspinlock.h | 2 ++Who's ack do we need for that part?> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig > index 24ac85c868db..17663ea57697 100644 > --- a/arch/powerpc/Kconfig > +++ b/arch/powerpc/Kconfig > @@ -492,6 +494,17 @@ config HOTPLUG_CPU > > Say N if you are unsure. > > +config PPC_QUEUED_SPINLOCKS > + bool "Queued spinlocks" > + depends on SMP > + default "y" if PPC_BOOK3S_64Not sure about default y? At least until we've got a better idea of the perf impact on a range of small/big new/old systems.> + help > + Say Y here to use to use queued spinlocks which are more complex > + but give better salability and fairness on large SMP and NUMA > + systems. > + > + If unsure, say "Y" if you have lots of cores, otherwise "N".Would be nice if we could give a range for "lots".> diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild > index dadbcf3a0b1e..1dd8b6adff5e 100644 > --- a/arch/powerpc/include/asm/Kbuild > +++ b/arch/powerpc/include/asm/Kbuild > @@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h > generic-y += export.h > generic-y += local64.h > generic-y += mcs_spinlock.h > +generic-y += qrwlock.h > +generic-y += qspinlock.hThe 2nd line spits a warning about a redundant entry. I think you want to just drop it. cheers
Michael Ellerman
2020-Jul-09 10:53 UTC
[PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
Nicholas Piggin <npiggin at gmail.com> writes:> Signed-off-by: Nicholas Piggin <npiggin at gmail.com> > --- > arch/powerpc/include/asm/paravirt.h | 28 ++++++++ > arch/powerpc/include/asm/qspinlock.h | 66 +++++++++++++++++++ > arch/powerpc/include/asm/qspinlock_paravirt.h | 7 ++ > arch/powerpc/platforms/pseries/Kconfig | 5 ++ > arch/powerpc/platforms/pseries/setup.c | 6 +- > include/asm-generic/qspinlock.h | 2 +Another ack?> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h > index 7a8546660a63..f2d51f929cf5 100644 > --- a/arch/powerpc/include/asm/paravirt.h > +++ b/arch/powerpc/include/asm/paravirt.h > @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count) > { > ___bad_yield_to_preempted(); /* This would be a bug */ > } > + > +extern void ___bad_yield_to_any(void); > +static inline void yield_to_any(void) > +{ > + ___bad_yield_to_any(); /* This would be a bug */ > +}Why do we do that rather than just not defining yield_to_any() at all and letting the build fail on that? There's a condition somewhere that we know will false at compile time and drop the call before linking?> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h > new file mode 100644 > index 000000000000..750d1b5e0202 > --- /dev/null > +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h > @@ -0,0 +1,7 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > +#ifndef __ASM_QSPINLOCK_PARAVIRT_H > +#define __ASM_QSPINLOCK_PARAVIRT_H_ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.> + > +EXPORT_SYMBOL(__pv_queued_spin_unlock);Why's that in a header? Should that (eventually) go with the generic implementation?> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig > index 24c18362e5ea..756e727b383f 100644 > --- a/arch/powerpc/platforms/pseries/Kconfig > +++ b/arch/powerpc/platforms/pseries/Kconfig > @@ -25,9 +25,14 @@ config PPC_PSERIES > select SWIOTLB > default y > > +config PARAVIRT_SPINLOCKS > + bool > + default ndefault n is the default.> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c > index 2db8469e475f..747a203d9453 100644 > --- a/arch/powerpc/platforms/pseries/setup.c > +++ b/arch/powerpc/platforms/pseries/setup.c > @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void) > if (firmware_has_feature(FW_FEATURE_LPAR)) { > vpa_init(boot_cpuid); > > - if (lppaca_shared_proc(get_lppaca())) > + if (lppaca_shared_proc(get_lppaca())) { > static_branch_enable(&shared_processor); > +#ifdef CONFIG_PARAVIRT_SPINLOCKS > + pv_spinlocks_init(); > +#endif > + }We could avoid the ifdef with this I think? diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 434615f1d761..6ec72282888d 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -10,5 +10,9 @@ #include <asm/simple_spinlock.h> #endif +#ifndef CONFIG_PARAVIRT_SPINLOCKS +static inline void pv_spinlocks_init(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ cheers
Possibly Parallel Threads
- [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks
- [PATCH v2 0/6] powerpc: queued spinlocks and rwlocks
- [PATCH 0/8] powerpc: queued spinlocks and rwlocks
- [PATCH v4 0/6] powerpc: queued spinlocks and rwlocks
- [PATCH v2 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR