On 2016?06?03? 09:32, Benjamin Herrenschmidt wrote:> On Fri, 2016-06-03 at 11:32 +1000, Benjamin Herrenschmidt wrote: >> On Thu, 2016-06-02 at 17:22 +0800, Pan Xinhui wrote: >>> >>> Base code to enable qspinlock on powerpc. this patch add some >>> #ifdef >>> here and there. Although there is no paravirt related code, we can >>> successfully build a qspinlock kernel after apply this patch. >> This is missing the IO_SYNC stuff ... It means we'll fail to do a >> full >> sync to order vs MMIOs. >> >> You need to add that back in the unlock path. > > Well, and in the lock path as well... >Oh, yes. I missed IO_SYNC stuff. thank you, Ben :)> Cheers, > Ben. > >>> >>> Signed-off-by: Pan Xinhui <xinhui.pan at linux.vnet.ibm.com> >>> --- >>> arch/powerpc/include/asm/qspinlock.h | 26 >>> ++++++++++++++++++++++++++ >>> arch/powerpc/include/asm/spinlock.h | 27 +++++++++++++++---- >>> -------- >>> arch/powerpc/include/asm/spinlock_types.h | 4 ++++ >>> arch/powerpc/lib/locks.c | 4 ++++ >>> 4 files changed, 49 insertions(+), 12 deletions(-) >>> create mode 100644 arch/powerpc/include/asm/qspinlock.h >>> >>> diff --git a/arch/powerpc/include/asm/qspinlock.h >>> b/arch/powerpc/include/asm/qspinlock.h >>> new file mode 100644 >>> index 0000000..fc83cd2 >>> --- /dev/null >>> +++ b/arch/powerpc/include/asm/qspinlock.h >>> @@ -0,0 +1,26 @@ >>> +#ifndef _ASM_POWERPC_QSPINLOCK_H >>> +#define _ASM_POWERPC_QSPINLOCK_H >>> + >>> +#include <asm-generic/qspinlock_types.h> >>> + >>> +#define SPIN_THRESHOLD (1 << 15) >>> +#define queued_spin_unlock queued_spin_unlock >>> + >>> +static inline void native_queued_spin_unlock(struct qspinlock >>> *lock) >>> +{ >>> + u8 *locked = (u8 *)lock; >>> +#ifdef __BIG_ENDIAN >>> + locked += 3; >>> +#endif >>> + /* no load/store can be across the unlock()*/ >>> + smp_store_release(locked, 0); >>> +} >>> + >>> +static inline void queued_spin_unlock(struct qspinlock *lock) >>> +{ >>> + native_queued_spin_unlock(lock); >>> +} >>> + >>> +#include <asm-generic/qspinlock.h> >>> + >>> +#endif /* _ASM_POWERPC_QSPINLOCK_H */ >>> diff --git a/arch/powerpc/include/asm/spinlock.h >>> b/arch/powerpc/include/asm/spinlock.h >>> index 523673d..4359ee6 100644 >>> --- a/arch/powerpc/include/asm/spinlock.h >>> +++ b/arch/powerpc/include/asm/spinlock.h >>> @@ -52,6 +52,20 @@ >>> #define SYNC_IO >>> #endif >>> >>> +#if defined(CONFIG_PPC_SPLPAR) >>> +/* We only yield to the hypervisor if we are in shared processor >>> mode */ >>> +#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca- >>>> >>>> lppaca_ptr)) >>> +extern void __spin_yield(arch_spinlock_t *lock); >>> +extern void __rw_yield(arch_rwlock_t *lock); >>> +#else /* SPLPAR */ >>> +#define __spin_yield(x) barrier() >>> +#define __rw_yield(x) barrier() >>> +#define SHARED_PROCESSOR 0 >>> +#endif >>> + >>> +#ifdef CONFIG_QUEUED_SPINLOCKS >>> +#include <asm/qspinlock.h> >>> +#else >>> static __always_inline int >>> arch_spin_value_unlocked(arch_spinlock_t >>> lock) >>> { >>> return lock.slock == 0; >>> @@ -106,18 +120,6 @@ static inline int >>> arch_spin_trylock(arch_spinlock_t *lock) >>> * held. Conveniently, we have a word in the paca that holds this >>> * value. >>> */ >>> - >>> -#if defined(CONFIG_PPC_SPLPAR) >>> -/* We only yield to the hypervisor if we are in shared processor >>> mode */ >>> -#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca- >>>> >>>> lppaca_ptr)) >>> -extern void __spin_yield(arch_spinlock_t *lock); >>> -extern void __rw_yield(arch_rwlock_t *lock); >>> -#else /* SPLPAR */ >>> -#define __spin_yield(x) barrier() >>> -#define __rw_yield(x) barrier() >>> -#define SHARED_PROCESSOR 0 >>> -#endif >>> - >>> static inline void arch_spin_lock(arch_spinlock_t *lock) >>> { >>> CLEAR_IO_SYNC; >>> @@ -169,6 +171,7 @@ extern void >>> arch_spin_unlock_wait(arch_spinlock_t >>> *lock); >>> do { while (arch_spin_is_locked(lock)) cpu_relax(); } >>> while >>> (0) >>> #endif >>> >>> +#endif /* !CONFIG_QUEUED_SPINLOCKS */ >>> /* >>> * Read-write spinlocks, allowing multiple readers >>> * but only one writer. >>> diff --git a/arch/powerpc/include/asm/spinlock_types.h >>> b/arch/powerpc/include/asm/spinlock_types.h >>> index 2351adc..bd7144e 100644 >>> --- a/arch/powerpc/include/asm/spinlock_types.h >>> +++ b/arch/powerpc/include/asm/spinlock_types.h >>> @@ -5,11 +5,15 @@ >>> # error "please don't include this file directly" >>> #endif >>> >>> +#ifdef CONFIG_QUEUED_SPINLOCKS >>> +#include <asm-generic/qspinlock_types.h> >>> +#else >>> typedef struct { >>> volatile unsigned int slock; >>> } arch_spinlock_t; >>> >>> #define __ARCH_SPIN_LOCK_UNLOCKED { 0 } >>> +#endif >>> >>> typedef struct { >>> volatile signed int lock; >>> diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c >>> index f7deebd..a9ebd71 100644 >>> --- a/arch/powerpc/lib/locks.c >>> +++ b/arch/powerpc/lib/locks.c >>> @@ -23,6 +23,7 @@ >>> #include <asm/hvcall.h> >>> #include <asm/smp.h> >>> >>> +#ifndef CONFIG_QUEUED_SPINLOCKS >>> void __spin_yield(arch_spinlock_t *lock) >>> { >>> unsigned int lock_value, holder_cpu, yield_count; >>> @@ -42,6 +43,7 @@ void __spin_yield(arch_spinlock_t *lock) >>> get_hard_smp_processor_id(holder_cpu), >>> yield_count); >>> } >>> EXPORT_SYMBOL_GPL(__spin_yield); >>> +#endif >>> >>> /* >>> * Waiting for a read lock or a write lock on a rwlock... >>> @@ -69,6 +71,7 @@ void __rw_yield(arch_rwlock_t *rw) >>> } >>> #endif >>> >>> +#ifndef CONFIG_QUEUED_SPINLOCKS >>> void arch_spin_unlock_wait(arch_spinlock_t *lock) >>> { >>> smp_mb(); >>> @@ -84,3 +87,4 @@ void arch_spin_unlock_wait(arch_spinlock_t *lock) >>> } >>> >>> EXPORT_SYMBOL(arch_spin_unlock_wait); >>> +#endif
Benjamin Herrenschmidt
2016-Jun-03 04:33 UTC
[PATCH v5 1/6] qspinlock: powerpc support qspinlock
On Fri, 2016-06-03 at 12:10 +0800, xinhui wrote:> On 2016?06?03? 09:32, Benjamin Herrenschmidt wrote: > > On Fri, 2016-06-03 at 11:32 +1000, Benjamin Herrenschmidt wrote: > >> On Thu, 2016-06-02 at 17:22 +0800, Pan Xinhui wrote: > >>> > >>> Base code to enable qspinlock on powerpc. this patch add some > >>> #ifdef > >>> here and there. Although there is no paravirt related code, we > can > >>> successfully build a qspinlock kernel after apply this patch. > >> This is missing the IO_SYNC stuff ... It means we'll fail to do a > >> full > >> sync to order vs MMIOs. > >> > >> You need to add that back in the unlock path. > > > > Well, and in the lock path as well... > > > Oh, yes. I missed IO_SYNC stuff. > > thank you, Ben :)Ok couple of other things that would be nice from my perspective (and Michael's) if you can produce them: ?- Some benchmarks of the qspinlock alone, without the PV stuff, ? ?so we understand how much of the overhead is inherent to the ? ?qspinlock and how much is introduced by the PV bits. ?- For the above, can you show (or describe) where the qspinlock ? ?improves things compared to our current locks. While there's ? ?theory and to some extent practice on x86, it would be nice to ? ?validate the effects on POWER. ?- Comparative benchmark with the PV stuff in on a bare metal system ? ?to understand the overhead there. ?- Comparative benchmark with the PV stuff under pHyp and KVM Spinlocks are fiddly and a critical piece of infrastructure, it's important we fully understand the performance implications before we decide to switch to a new model. Cheers, Ben.
On 2016?06?03? 12:33, Benjamin Herrenschmidt wrote:> On Fri, 2016-06-03 at 12:10 +0800, xinhui wrote: >> On 2016?06?03? 09:32, Benjamin Herrenschmidt wrote: >>> On Fri, 2016-06-03 at 11:32 +1000, Benjamin Herrenschmidt wrote: >>>> On Thu, 2016-06-02 at 17:22 +0800, Pan Xinhui wrote: >>>>> >>>>> Base code to enable qspinlock on powerpc. this patch add some >>>>> #ifdef >>>>> here and there. Although there is no paravirt related code, we >> can >>>>> successfully build a qspinlock kernel after apply this patch. >>>> This is missing the IO_SYNC stuff ... It means we'll fail to do a >>>> full >>>> sync to order vs MMIOs. >>>> >>>> You need to add that back in the unlock path. >>> >>> Well, and in the lock path as well... >>> >> Oh, yes. I missed IO_SYNC stuff. >> >> thank you, Ben :) > > Ok couple of other things that would be nice from my perspective (and > Michael's) if you can produce them: > > - Some benchmarks of the qspinlock alone, without the PV stuff, > so we understand how much of the overhead is inherent to the > qspinlock and how much is introduced by the PV bits. > > - For the above, can you show (or describe) where the qspinlock > improves things compared to our current locks. While there's > theory and to some extent practice on x86, it would be nice to > validate the effects on POWER. > > - Comparative benchmark with the PV stuff in on a bare metal system > to understand the overhead there. > > - Comparative benchmark with the PV stuff under pHyp and KVM >Will do such benchmark tests in next days. thanks for your kind suggestions. :)> Spinlocks are fiddly and a critical piece of infrastructure, it's > important we fully understand the performance implications before we > decide to switch to a new model. >yes, We really need understand how {pv}qspinlock works in more complex cases. thanks xinhui> Cheers, > Ben. >
On Fri, Jun 03, 2016 at 02:33:47PM +1000, Benjamin Herrenschmidt wrote:> ?- For the above, can you show (or describe) where the qspinlock > ? ?improves things compared to our current locks.So currently PPC has a fairly straight forward test-and-set spinlock IIRC. You have this because LPAR/virt muck and lock holder preemption issues etc.. qspinlock is 1) a fair lock (like ticket locks) and 2) provides out-of-word spinning, reducing cacheline pressure. Esp. on multi-socket x86 we saw the out-of-word spinning being a big win over our ticket locks. And fairness, brought to us by the ticket locks a long time ago, eliminated starvation issues we had, where a spinner local to the holder would 'always' win from a spinner further away. So under heavy enough local contention, the spinners on 'remote' CPUs would 'never' get to own the lock. pv-qspinlock tries to preserve the fairness while allowing limited lock stealing and explicitly managing which vcpus to wake.> While there's > ? ?theory and to some extent practice on x86, it would be nice to > ? ?validate the effects on POWER.Right; so that will have to be from benchmarks which I cannot help you with ;-)
Possibly Parallel Threads
- [PATCH v5 1/6] qspinlock: powerpc support qspinlock
- [PATCH v5 1/6] qspinlock: powerpc support qspinlock
- [PATCH v5 1/6] qspinlock: powerpc support qspinlock
- [PATCH v9 0/6] Implement qspinlock/pv-qspinlock on ppc
- [PATCH v9 0/6] Implement qspinlock/pv-qspinlock on ppc