thr3ads.net - Linux Virtualization - [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks [Jul 2020]

If this information is useful, please help other people find it:
Share via:

Nicholas Piggin

2020-Jul-21 11:08 UTC

[PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

Excerpts from Peter Zijlstra's message of July 8, 2020 6:41
pm:> On Tue, Jul 07, 2020 at 03:57:06PM +1000, Nicholas Piggin wrote:
>> Yes, powerpc could certainly get more performance out of the slow
>> paths, and then there are a few parameters to tune.
> 
Sorry for the delay, got bogged down and distracted by other things :(
> Can you clarify? The slow path is already in use on ARM64 which is weak,
> so I doubt there's superfluous serialization present. And Will spend a
> fair amount of time on making that thing guarantee forward progressm, so
> there just isn't too much room to play.
Sure, the way the pending not-queued slowpath (which I guess is the
medium-path) is implemented is just poorly structured for LL/SC. It
has one more atomic than necessary (queued_fetch_set_pending_acquire),
and a lot of branches in suboptimal order.

Attached patch (completely untested just compiled and looked at asm
so far) is a way we can fix this on powerpc I think. It's actually
very little generic code change which is good, duplicated medium-path
logic unfortunately but that's no worse than something like x86
really.
>> We don't have a good alternate patching for function calls yet, but
>> that would be something to do for native vs pv.
> 
> Going by your jump_label implementation, support for static_call should
> be fairly straight forward too, no?
> 
>   https://lkml.kernel.org/r/20200624153024.794671356 at infradead.org
Nice, yeah it should be. I've wanted this for ages!

powerpc is kind of annoying to implement that with limited call range,
Hmm, not sure if we'd need a new linker feature to support it. We'd
provide call site patch space for indirect branches for those out of
range of direct call, so that should work fine. The trick would be 
patching in the TOC lookup for the function... should be doable somehow.

Thanks,
Nick

---

diff --git a/arch/powerpc/include/asm/qspinlock.h
b/arch/powerpc/include/asm/qspinlock.h
index b752d34517b3..26d8766a1106 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock
*lock)
 
 #else
 extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
 #endif
 
 static __always_inline void queued_spin_lock(struct qspinlock *lock)
 {
-	u32 val = 0;
-
-	if (likely(atomic_try_cmpxchg_lock(&lock->val, &val,
_Q_LOCKED_VAL)))
+	atomic_t *a = &lock->val;
+	u32 val;
+
+again:
+	asm volatile(
+"1:\t"	PPC_LWARX(%0,0,%1,1) "	# queued_spin_lock			\n"
+	: "=&r" (val)
+	: "r" (&a->counter)
+	: "memory");
+
+	if (likely(val == 0)) {
+		asm_volatile_goto(
+	"	stwcx.	%0,0,%1							\n"
+	"	bne-	%l[again]						\n"
+	"\t"	PPC_ACQUIRE_BARRIER "						\n"
+		:
+		: "r"(_Q_LOCKED_VAL), "r" (&a->counter)
+		: "cr0", "memory"
+		: again );
 		return;
-
-	queued_spin_lock_slowpath(lock, val);
+	}
+
+	if (likely(val == _Q_LOCKED_VAL)) {
+		asm_volatile_goto(
+	"	stwcx.	%0,0,%1							\n"
+	"	bne-	%l[again]						\n"
+		:
+		: "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r"
(&a->counter)
+		: "cr0", "memory"
+		: again );
+
+		atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
+//		clear_pending_set_locked(lock);
+		WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+//		lockevent_inc(lock_pending);
+		return;
+	}
+
+	if (val == _Q_PENDING_VAL) {
+		int cnt = _Q_PENDING_LOOPS;
+		val = atomic_cond_read_relaxed(a,
+					       (VAL != _Q_PENDING_VAL) || !cnt--);
+		if (!(val & ~_Q_LOCKED_MASK))
+			goto again;
+        }
+	queued_spin_lock_slowpath_queue(lock);
 }
 #define queued_spin_lock queued_spin_lock
 
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9515fcc9b29..ebcc6f5d99d5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -287,10 +287,14 @@ static __always_inline u32  __pv_wait_head_or_lock(struct
qspinlock *lock,
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath_queue	native_queued_spin_lock_slowpath_queue
 #endif
 
 #endif /* _GEN_PV_LOCK_SLOWPATH */
 
+void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
+static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
+
 /**
  * queued_spin_lock_slowpath - acquire the queued spinlock
  * @lock: Pointer to queued spinlock structure
@@ -314,12 +318,6 @@ static __always_inline u32  __pv_wait_head_or_lock(struct
qspinlock *lock,
  */
 void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
-	struct mcs_spinlock *prev, *next, *node;
-	u32 old, tail;
-	int idx;
-
-	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
-
 	if (pv_enabled())
 		goto pv_queue;
 
@@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32
val)
 queue:
 	lockevent_inc(lock_slowpath);
 pv_queue:
+	__queued_spin_lock_slowpath_queue(lock);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
+{
+	lockevent_inc(lock_slowpath);
+	__queued_spin_lock_slowpath_queue(lock);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
+
+static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
+{
+	struct mcs_spinlock *prev, *next, *node;
+	u32 old, tail;
+	u32 val;
+	int idx;
+
+	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
 	node = this_cpu_ptr(&qnodes[0].mcs);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
@@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32
val)
 	 */
 	__this_cpu_dec(qnodes[0].mcs.count);
 }
-EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
 /*
  * Generate the paravirt code for queued_spin_unlock_slowpath().

Waiman Long

2020-Jul-21 14:36 UTC

head link

[PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

On 7/21/20 7:08 AM, Nicholas Piggin wrote:> diff --git a/arch/powerpc/include/asm/qspinlock.h
b/arch/powerpc/include/asm/qspinlock.h
> index b752d34517b3..26d8766a1106 100644
> --- a/arch/powerpc/include/asm/qspinlock.h
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock
*lock)
>   
>   #else
>   extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
> +extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>   #endif
>   
>   static __always_inline void queued_spin_lock(struct qspinlock *lock)
>   {
> -	u32 val = 0;
> -
> -	if (likely(atomic_try_cmpxchg_lock(&lock->val, &val,
_Q_LOCKED_VAL)))
> +	atomic_t *a = &lock->val;
> +	u32 val;
> +
> +again:
> +	asm volatile(
> +"1:\t"	PPC_LWARX(%0,0,%1,1) "	# queued_spin_lock			\n"
> +	: "=&r" (val)
> +	: "r" (&a->counter)
> +	: "memory");
> +
> +	if (likely(val == 0)) {
> +		asm_volatile_goto(
> +	"	stwcx.	%0,0,%1							\n"
> +	"	bne-	%l[again]						\n"
> +	"\t"	PPC_ACQUIRE_BARRIER "						\n"
> +		:
> +		: "r"(_Q_LOCKED_VAL), "r" (&a->counter)
> +		: "cr0", "memory"
> +		: again );
>   		return;
> -
> -	queued_spin_lock_slowpath(lock, val);
> +	}
> +
> +	if (likely(val == _Q_LOCKED_VAL)) {
> +		asm_volatile_goto(
> +	"	stwcx.	%0,0,%1							\n"
> +	"	bne-	%l[again]						\n"
> +		:
> +		: "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r"
(&a->counter)
> +		: "cr0", "memory"
> +		: again );
> +
> +		atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
> +//		clear_pending_set_locked(lock);
> +		WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
> +//		lockevent_inc(lock_pending);
> +		return;
> +	}
> +
> +	if (val == _Q_PENDING_VAL) {
> +		int cnt = _Q_PENDING_LOOPS;
> +		val = atomic_cond_read_relaxed(a,
> +					       (VAL != _Q_PENDING_VAL) || !cnt--);
> +		if (!(val & ~_Q_LOCKED_MASK))
> +			goto again;
> +        }
> +	queued_spin_lock_slowpath_queue(lock);
>   }
>   #define queued_spin_lock queued_spin_lock
>   
I am fine with the arch code override some part of the generic code.

> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index b9515fcc9b29..ebcc6f5d99d5 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -287,10 +287,14 @@ static __always_inline u32 
__pv_wait_head_or_lock(struct qspinlock *lock,
>   
>   #ifdef CONFIG_PARAVIRT_SPINLOCKS
>   #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
> +#define queued_spin_lock_slowpath_queue
native_queued_spin_lock_slowpath_queue
>   #endif
>   
>   #endif /* _GEN_PV_LOCK_SLOWPATH */
>   
> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
> +
>   /**
>    * queued_spin_lock_slowpath - acquire the queued spinlock
>    * @lock: Pointer to queued spinlock structure
> @@ -314,12 +318,6 @@ static __always_inline u32 
__pv_wait_head_or_lock(struct qspinlock *lock,
>    */
>   void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>   {
> -	struct mcs_spinlock *prev, *next, *node;
> -	u32 old, tail;
> -	int idx;
> -
> -	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
> -
>   	if (pv_enabled())
>   		goto pv_queue;
>   
> @@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock,
u32 val)
>   queue:
>   	lockevent_inc(lock_slowpath);
>   pv_queue:
> +	__queued_spin_lock_slowpath_queue(lock);
> +}
> +EXPORT_SYMBOL(queued_spin_lock_slowpath);
> +
> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
> +{
> +	lockevent_inc(lock_slowpath);
> +	__queued_spin_lock_slowpath_queue(lock);
> +}
> +EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
> +
> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
> +{
> +	struct mcs_spinlock *prev, *next, *node;
> +	u32 old, tail;
> +	u32 val;
> +	int idx;
> +
> +	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
> +
>   	node = this_cpu_ptr(&qnodes[0].mcs);
>   	idx = node->count++;
>   	tail = encode_tail(smp_processor_id(), idx);
> @@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock,
u32 val)
>   	 */
>   	__this_cpu_dec(qnodes[0].mcs.count);
>   }
> -EXPORT_SYMBOL(queued_spin_lock_slowpath);
>   
>   /*
>    * Generate the paravirt code for queued_spin_unlock_slowpath().
>I would prefer to extract out the pending bit handling code out into a 
separate helper function which can be overridden by the arch code 
instead of breaking the slowpath into 2 pieces.

Cheers,
Longman

Nicholas Piggin

2020-Jul-23 13:30 UTC

head link

[PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

Excerpts from Waiman Long's message of July 22, 2020 12:36
am:> On 7/21/20 7:08 AM, Nicholas Piggin wrote:
>> diff --git a/arch/powerpc/include/asm/qspinlock.h
b/arch/powerpc/include/asm/qspinlock.h
>> index b752d34517b3..26d8766a1106 100644
>> --- a/arch/powerpc/include/asm/qspinlock.h
>> +++ b/arch/powerpc/include/asm/qspinlock.h
>> @@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct
qspinlock *lock)
>>   
>>   #else
>>   extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32
val);
>> +extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>>   #endif
>>   
>>   static __always_inline void queued_spin_lock(struct qspinlock *lock)
>>   {
>> -	u32 val = 0;
>> -
>> -	if (likely(atomic_try_cmpxchg_lock(&lock->val, &val,
_Q_LOCKED_VAL)))
>> +	atomic_t *a = &lock->val;
>> +	u32 val;
>> +
>> +again:
>> +	asm volatile(
>> +"1:\t"	PPC_LWARX(%0,0,%1,1) "	# queued_spin_lock		
\n"
>> +	: "=&r" (val)
>> +	: "r" (&a->counter)
>> +	: "memory");
>> +
>> +	if (likely(val == 0)) {
>> +		asm_volatile_goto(
>> +	"	stwcx.	%0,0,%1							\n"
>> +	"	bne-	%l[again]						\n"
>> +	"\t"	PPC_ACQUIRE_BARRIER "						\n"
>> +		:
>> +		: "r"(_Q_LOCKED_VAL), "r" (&a->counter)
>> +		: "cr0", "memory"
>> +		: again );
>>   		return;
>> -
>> -	queued_spin_lock_slowpath(lock, val);
>> +	}
>> +
>> +	if (likely(val == _Q_LOCKED_VAL)) {
>> +		asm_volatile_goto(
>> +	"	stwcx.	%0,0,%1							\n"
>> +	"	bne-	%l[again]						\n"
>> +		:
>> +		: "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r"
(&a->counter)
>> +		: "cr0", "memory"
>> +		: again );
>> +
>> +		atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
>> +//		clear_pending_set_locked(lock);
>> +		WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
>> +//		lockevent_inc(lock_pending);
>> +		return;
>> +	}
>> +
>> +	if (val == _Q_PENDING_VAL) {
>> +		int cnt = _Q_PENDING_LOOPS;
>> +		val = atomic_cond_read_relaxed(a,
>> +					       (VAL != _Q_PENDING_VAL) || !cnt--);
>> +		if (!(val & ~_Q_LOCKED_MASK))
>> +			goto again;
>> +        }
>> +	queued_spin_lock_slowpath_queue(lock);
>>   }
>>   #define queued_spin_lock queued_spin_lock
>>   
> 
> I am fine with the arch code override some part of the generic code.
Cool.
>> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
>> index b9515fcc9b29..ebcc6f5d99d5 100644
>> --- a/kernel/locking/qspinlock.c
>> +++ b/kernel/locking/qspinlock.c
>> @@ -287,10 +287,14 @@ static __always_inline u32 
__pv_wait_head_or_lock(struct qspinlock *lock,
>>   
>>   #ifdef CONFIG_PARAVIRT_SPINLOCKS
>>   #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
>> +#define queued_spin_lock_slowpath_queue
native_queued_spin_lock_slowpath_queue
>>   #endif
>>   
>>   #endif /* _GEN_PV_LOCK_SLOWPATH */
>>   
>> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> +
>>   /**
>>    * queued_spin_lock_slowpath - acquire the queued spinlock
>>    * @lock: Pointer to queued spinlock structure
>> @@ -314,12 +318,6 @@ static __always_inline u32 
__pv_wait_head_or_lock(struct qspinlock *lock,
>>    */
>>   void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>>   {
>> -	struct mcs_spinlock *prev, *next, *node;
>> -	u32 old, tail;
>> -	int idx;
>> -
>> -	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> -
>>   	if (pv_enabled())
>>   		goto pv_queue;
>>   
>> @@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock
*lock, u32 val)
>>   queue:
>>   	lockevent_inc(lock_slowpath);
>>   pv_queue:
>> +	__queued_spin_lock_slowpath_queue(lock);
>> +}
>> +EXPORT_SYMBOL(queued_spin_lock_slowpath);
>> +
>> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
>> +{
>> +	lockevent_inc(lock_slowpath);
>> +	__queued_spin_lock_slowpath_queue(lock);
>> +}
>> +EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
>> +
>> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
>> +{
>> +	struct mcs_spinlock *prev, *next, *node;
>> +	u32 old, tail;
>> +	u32 val;
>> +	int idx;
>> +
>> +	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> +
>>   	node = this_cpu_ptr(&qnodes[0].mcs);
>>   	idx = node->count++;
>>   	tail = encode_tail(smp_processor_id(), idx);
>> @@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock
*lock, u32 val)
>>   	 */
>>   	__this_cpu_dec(qnodes[0].mcs.count);
>>   }
>> -EXPORT_SYMBOL(queued_spin_lock_slowpath);
>>   
>>   /*
>>    * Generate the paravirt code for queued_spin_unlock_slowpath().
>>
> I would prefer to extract out the pending bit handling code out into a 
> separate helper function which can be overridden by the arch code 
> instead of breaking the slowpath into 2 pieces.
You mean have the arch provide a queued_spin_lock_slowpath_pending 
function that the slow path calls?

I would actually prefer the pending handling can be made inline in
the queued_spin_lock function, especially with out-of-line locks it 
makes sense to put it there.

We could ifdef out queued_spin_lock_slowpath_queue if it's not used,
then __queued_spin_lock_slowpath_queue would be inlined into the
caller so there would be no split?

Thanks,
Nick

Reasonably Related Threads

Search for more reasonably related threads

Linux Virtualization - Jul 2020 - [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

[PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

[PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

[PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

Reasonably Related Threads