thr3ads.net - Linux Virtualization - [PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock [Apr 2015]

If this information is useful, please help other people find it:
Share via:

Peter Zijlstra

2015-Apr-09 18:13 UTC

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

On Mon, Apr 06, 2015 at 10:55:44PM -0400, Waiman Long
wrote:> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -0,0 +1,321 @@
> +#ifndef _GEN_PV_LOCK_SLOWPATH
> +#error "do not include this file"
> +#endif
> +
> +/*
> + * Implement paravirt qspinlocks; the general idea is to halt the vcpus
instead
> + * of spinning them.
> + *
> + * This relies on the architecture to provide two paravirt hypercalls:
> + *
> + *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
> + *   pv_kick(cpu)             -- wakes a suspended vcpu
> + *
> + * Using these we implement __pv_queue_spin_lock_slowpath() and
> + * __pv_queue_spin_unlock() to replace native_queue_spin_lock_slowpath()
and
> + * native_queue_spin_unlock().
> + */
> +
> +#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
> +
> +enum vcpu_state {
> +	vcpu_running = 0,
> +	vcpu_halted,
> +};
> +
> +struct pv_node {
> +	struct mcs_spinlock	mcs;
> +	struct mcs_spinlock	__res[3];
> +
> +	int			cpu;
> +	u8			state;
> +};
> +
> +/*
> + * Hash table using open addressing with an LFSR probe sequence.
> + *
> + * Since we should not be holding locks from NMI context (very rare
indeed) the
> + * max load factor is 0.75, which is around the point where open
addressing
> + * breaks down.
> + *
> + * Instead of probing just the immediate bucket we probe all buckets in
the
> + * same cacheline.
> + *
> + * http://en.wikipedia.org/wiki/Hash_table#Open_addressing
> + *
> + * Dynamically allocate a hash table big enough to hold at least 4X the
> + * number of possible cpus in the system. Allocation is done on page
> + * granularity. So the minimum number of hash buckets should be at least
> + * 256 to fully utilize a 4k page.
> + */
> +#define LFSR_MIN_BITS	8
> +#define	LFSR_MAX_BITS	(2 + NR_CPUS_BITS)
> +#if LFSR_MAX_BITS < LFSR_MIN_BITS
> +#undef  LFSR_MAX_BITS
> +#define LFSR_MAX_BITS	LFSR_MIN_BITS
> +#endif
> +
> +struct pv_hash_bucket {
> +	struct qspinlock *lock;
> +	struct pv_node   *node;
> +};
> +#define PV_HB_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_bucket))
> +#define HB_RESERVED	((struct qspinlock *)1)
This is unused.
> +
> +static struct pv_hash_bucket *pv_lock_hash;
> +static unsigned int pv_lock_hash_bits __read_mostly;
static unsigned int pv_taps __read_mostly;
> +
> +#include <linux/hash.h>
> +#include <linux/lfsr.h>
> +#include <linux/bootmem.h>
> +
> +/*
> + * Allocate memory for the PV qspinlock hash buckets
> + *
> + * This function should be called from the paravirt spinlock
initialization
> + * routine.
> + */
> +void __init __pv_init_lock_hash(void)
> +{
> +	int pv_hash_size = 4 * num_possible_cpus();
> +
> +	if (pv_hash_size < (1U << LFSR_MIN_BITS))
> +		pv_hash_size = (1U << LFSR_MIN_BITS);
> +	/*
> +	 * Allocate space from bootmem which should be page-size aligned
> +	 * and hence cacheline aligned.
> +	 */
> +	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
> +					       sizeof(struct pv_hash_bucket),
> +					       pv_hash_size, 0, HASH_EARLY,
> +					       &pv_lock_hash_bits, NULL,
> +					       pv_hash_size, pv_hash_size);
	pv_taps = lfsr_taps(pv_lock_hash_bits);
> +}
> +
> +static inline u32 hash_align(u32 hash)
> +{
> +	return hash & ~(PV_HB_PER_LINE - 1);
> +}
> +
> +static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node
*node)
> +{
> +	unsigned long init_hash, hash = hash_ptr(lock, pv_lock_hash_bits);
> +	struct pv_hash_bucket *hb, *end;
> +
> +	if (!hash)
> +		hash = 1;
> +
> +	init_hash = hash;
> +	hb = &pv_lock_hash[hash_align(hash)];
> +	for (;;) {
> +		for (end = hb + PV_HB_PER_LINE; hb < end; hb++) {
> +			if (!cmpxchg(&hb->lock, NULL, lock)) {
> +				WRITE_ONCE(hb->node, node);
> +				/*
> +				 * We haven't set the _Q_SLOW_VAL yet. So
> +				 * the order of writing doesn't matter.
> +				 */
> +				smp_wmb(); /* matches rmb from pv_hash_find */
This doesn't make sense. Both sites do ->lock first and ->node second.
No amount of ordering can 'fix' that.

I think we can safely remove this wmb and the rmb below, because the
required ordering is already provided by setting/observing l->locked =SLOW.
> +				goto done;
> +			}
> +		}
> +
> +		hash = lfsr(hash, pv_lock_hash_bits, 0);
Since pv_lock_hash_bits is a variable, you end up running through that
massive if() forest to find the corresponding tap every single time. It
cannot compile-time optimize it.

Hence:
		hash = lfsr(hash, pv_taps);

(I don't get the bits argument to the lfsr).

In any case, like I said before, I think we should try a linear probe
sequence first, the lfsr was over engineering from my side.
> +		hb = &pv_lock_hash[hash_align(hash)];
> +		BUG_ON(hash == init_hash);
> +	}
> +
> +done:
> +	return &hb->lock;
> +}
> +
> +static struct pv_node *pv_hash_find(struct qspinlock *lock)
> +{
> +	unsigned long init_hash, hash = hash_ptr(lock, pv_lock_hash_bits);
> +	struct pv_hash_bucket *hb, *end;
> +	struct pv_node *node = NULL;
> +
> +	if (!hash)
> +		hash = 1;
> +
> +	init_hash = hash;
> +	hb = &pv_lock_hash[hash_align(hash)];
> +	for (;;) {
> +		for (end = hb + PV_HB_PER_LINE; hb < end; hb++) {
> +			struct qspinlock *l = READ_ONCE(hb->lock);
> +
> +			if (l == lock) {
> +				smp_rmb(); /* matches wmb from pv_hash() */
per the above this can go, IF we observe SLOW we must also observe a
consistent bucket.
> +				node = READ_ONCE(hb->node);
> +				goto done;
> +			}
> +		}
> +
> +		hash = lfsr(hash, pv_lock_hash_bits, 0);
idem the previous lfsr comment.
> +		hb = &pv_lock_hash[hash_align(hash)];
> +		BUG_ON(hash == init_hash);
> +	}
> +done:
> +	/*
> +	 * Clear the hash bucket
> +	 */
> +	WRITE_ONCE(hb->lock, NULL);
> +	return node;
> +}
> +/*
> + * Wait for l->locked to become clear; halt the vcpu after a short
spin.
> + * __pv_queue_spin_unlock() will wake us.
> + */
> +static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock
*node)
> +{
> +	struct __qspinlock *l = (void *)lock;
> +	struct qspinlock **lp = NULL;
> +	struct pv_node *pn = (struct pv_node *)node;
> +	int slow_set = false;
> +	int loop;
> +
> +	for (;;) {
> +		for (loop = SPIN_THRESHOLD; loop; loop--) {
> +			if (!READ_ONCE(l->locked))
> +				return;
> +
> +			cpu_relax();
> +		}
> +
> +		WRITE_ONCE(pn->state, vcpu_halted);
> +		if (!lp)
> +			lp = pv_hash(lock, pn);
> +		/*
> +		 * lp must be set before setting _Q_SLOW_VAL
> +		 *
> +		 * [S] lp = lock                [RmW] l = l->locked = 0
> +		 *     MB                             MB
> +		 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
> +		 *
> +		 * Matches the cmpxchg() in pv_queue_spin_unlock().
> +		 */
> +		if (!slow_set &&
> +		    !cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
> +			/*
> +			 * The lock is free and _Q_SLOW_VAL has never been
> +			 * set. Need to clear the hash bucket before getting
> +			 * the lock.
> +			 */
> +			WRITE_ONCE(*lp, NULL);
> +			return;
> +		} else if (slow_set && !READ_ONCE(l->locked))
> +			return;
> +		slow_set = true;
I'm somewhat puzzled by the slow_set thing; what is wrong with the thing
I had, namely:

		if (!lp) {
			lp = pv_hash(lock, pn);

			/*
			 * comment
			 */
			lv = cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL);
			if (lv != _Q_LOCKED_VAL) {
				/* we're woken, unhash and return */
				WRITE_ONCE(*lp, NULL);
				return;
			}
		}> +
> +		pv_wait(&l->locked, _Q_SLOW_VAL);

If we get a spurious wakeup (due to device interrupts or random kick)
we'll loop around but ->locked will remain _Q_SLOW_VAL.
> +	}
> +	/*
> +	 * Lock is unlocked now; the caller will acquire it without waiting.
> +	 * As with pv_wait_node() we rely on the caller to do a load-acquire
> +	 * for us.
> +	 */
> +}
> +
> +/*
> + * To be used in stead of queue_spin_unlock() for paravirt locks. Wakes
> + * pv_wait_head() if appropriate.
> + */
> +__visible void __pv_queue_spin_unlock(struct qspinlock *lock)
> +{
> +	struct __qspinlock *l = (void *)lock;
> +	struct pv_node *node;
> +
> +	if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) ==
_Q_LOCKED_VAL))
> +		return;
> +
> +	/*
> +	 * The queue head has been halted. Need to locate it and wake it up.
> +	 */
> +	node = pv_hash_find(lock);
> +	smp_store_release(&l->locked, 0);
Ah yes, clever that.
> +	/*
> +	 * At this point the memory pointed at by lock can be freed/reused,
> +	 * however we can still use the PV node to kick the CPU.
> +	 */
> +	if (READ_ONCE(node->state) == vcpu_halted)
> +		pv_kick(node->cpu);
> +}
> +PV_CALLEE_SAVE_REGS_THUNK(__pv_queue_spin_unlock);
However I feel the PV_CALLEE_SAVE_REGS_THUNK thing belongs in the x86
code.

Peter Zijlstra

2015-Apr-09 18:23 UTC

head link

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

On Thu, Apr 09, 2015 at 08:13:27PM +0200, Peter Zijlstra
wrote:> On Mon, Apr 06, 2015 at 10:55:44PM -0400, Waiman Long wrote:
> > +#define PV_HB_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct
pv_hash_bucket))
> > +static struct qspinlock **pv_hash(struct qspinlock *lock, struct
pv_node *node)
> > +{
> > +	unsigned long init_hash, hash = hash_ptr(lock, pv_lock_hash_bits);
> > +	struct pv_hash_bucket *hb, *end;
> > +
> > +	if (!hash)
> > +		hash = 1;
> > +
> > +	init_hash = hash;
> > +	hb = &pv_lock_hash[hash_align(hash)];
> > +	for (;;) {
> > +		for (end = hb + PV_HB_PER_LINE; hb < end; hb++) {
> > +			if (!cmpxchg(&hb->lock, NULL, lock)) {
> > +				WRITE_ONCE(hb->node, node);
> > +				/*
> > +				 * We haven't set the _Q_SLOW_VAL yet. So
> > +				 * the order of writing doesn't matter.
> > +				 */
> > +				smp_wmb(); /* matches rmb from pv_hash_find */
> > +				goto done;
> > +			}
> > +		}
> > +
> > +		hash = lfsr(hash, pv_lock_hash_bits, 0);
> 
> Since pv_lock_hash_bits is a variable, you end up running through that
> massive if() forest to find the corresponding tap every single time. It
> cannot compile-time optimize it.
> 
> Hence:
> 		hash = lfsr(hash, pv_taps);
> 
> (I don't get the bits argument to the lfsr).
> 
> In any case, like I said before, I think we should try a linear probe
> sequence first, the lfsr was over engineering from my side.
> 
> > +		hb = &pv_lock_hash[hash_align(hash)];
So one thing this does -- and one of the reasons I figured I should
ditch the LFSR instead of fixing it -- is that you end up scanning each
bucket HB_PER_LINE times.

The 'fix' would be to LFSR on cachelines instead of HBs but then
you're
stuck with the 0-th cacheline.
> > +		BUG_ON(hash == init_hash);
> > +	}
> > +
> > +done:
> > +	return &hb->lock;
> > +}

Waiman Long

2015-Apr-09 20:36 UTC

head link

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

On 04/09/2015 02:23 PM, Peter Zijlstra wrote:> On Thu, Apr 09, 2015 at 08:13:27PM +0200, Peter Zijlstra wrote:
>> On Mon, Apr 06, 2015 at 10:55:44PM -0400, Waiman Long wrote:
>>> +#define PV_HB_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct
pv_hash_bucket))
>>> +static struct qspinlock **pv_hash(struct qspinlock *lock, struct
pv_node *node)
>>> +{
>>> +	unsigned long init_hash, hash = hash_ptr(lock,
pv_lock_hash_bits);
>>> +	struct pv_hash_bucket *hb, *end;
>>> +
>>> +	if (!hash)
>>> +		hash = 1;
>>> +
>>> +	init_hash = hash;
>>> +	hb =&pv_lock_hash[hash_align(hash)];
>>> +	for (;;) {
>>> +		for (end = hb + PV_HB_PER_LINE; hb<  end; hb++) {
>>> +			if (!cmpxchg(&hb->lock, NULL, lock)) {
>>> +				WRITE_ONCE(hb->node, node);
>>> +				/*
>>> +				 * We haven't set the _Q_SLOW_VAL yet. So
>>> +				 * the order of writing doesn't matter.
>>> +				 */
>>> +				smp_wmb(); /* matches rmb from pv_hash_find */
>>> +				goto done;
>>> +			}
>>> +		}
>>> +
>>> +		hash = lfsr(hash, pv_lock_hash_bits, 0);
>> Since pv_lock_hash_bits is a variable, you end up running through that
>> massive if() forest to find the corresponding tap every single time. It
>> cannot compile-time optimize it.
>>
>> Hence:
>> 		hash = lfsr(hash, pv_taps);
>>
>> (I don't get the bits argument to the lfsr).
>>
>> In any case, like I said before, I think we should try a linear probe
>> sequence first, the lfsr was over engineering from my side.
>>
>>> +		hb =&pv_lock_hash[hash_align(hash)];
>>>   
> So one thing this does -- and one of the reasons I figured I should
> ditch the LFSR instead of fixing it -- is that you end up scanning each
> bucket HB_PER_LINE times.
I am aware of that when I was trying to add the hash table debug code, 
but I want to get the code out for review and so hasn't made any change 
yet. I have just done testing by adding some debug code to check the 
hashing efficiency. With the kernel build workload, with over 1M calls 
to pv_hash(), all of them get an empty entry on the first try. Maybe the 
minimum hash table size of 256 helps.
>
> The 'fix' would be to LFSR on cachelines instead of HBs but then
you're
> stuck with the 0-th cacheline.
This should not be a big problem. I just need to add a check at the end 
of the for loop that if hash is 0, change it to a certain non-0 value 
instead of calling lfsr().

As for ditching the lfsr idea, I am fine with that. So there will be 4 
entries (1 cacheline) for each hash value. If all the entries are full, 
we proceed to the next cacheline.  Right?

Cheers,
Longman

Waiman Long

2015-Apr-09 21:41 UTC

head link

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

On 04/09/2015 02:13 PM, Peter Zijlstra wrote:> On Mon, Apr 06, 2015 at 10:55:44PM -0400, Waiman Long wrote:
>> +++ b/kernel/locking/qspinlock_paravirt.h
>> @@ -0,0 +1,321 @@
>> +#ifndef _GEN_PV_LOCK_SLOWPATH
>> +#error "do not include this file"
>> +#endif
>> +
>> +/*
>> + * Implement paravirt qspinlocks; the general idea is to halt the
vcpus instead
>> + * of spinning them.
>> + *
>> + * This relies on the architecture to provide two paravirt hypercalls:
>> + *
>> + *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
>> + *   pv_kick(cpu)             -- wakes a suspended vcpu
>> + *
>> + * Using these we implement __pv_queue_spin_lock_slowpath() and
>> + * __pv_queue_spin_unlock() to replace
native_queue_spin_lock_slowpath() and
>> + * native_queue_spin_unlock().
>> + */
>> +
>> +#define _Q_SLOW_VAL	(3U<<  _Q_LOCKED_OFFSET)
>> +
>> +enum vcpu_state {
>> +	vcpu_running = 0,
>> +	vcpu_halted,
>> +};
>> +
>> +struct pv_node {
>> +	struct mcs_spinlock	mcs;
>> +	struct mcs_spinlock	__res[3];
>> +
>> +	int			cpu;
>> +	u8			state;
>> +};
>> +
>> +/*
>> + * Hash table using open addressing with an LFSR probe sequence.
>> + *
>> + * Since we should not be holding locks from NMI context (very rare
indeed) the
>> + * max load factor is 0.75, which is around the point where open
addressing
>> + * breaks down.
>> + *
>> + * Instead of probing just the immediate bucket we probe all buckets
in the
>> + * same cacheline.
>> + *
>> + * http://en.wikipedia.org/wiki/Hash_table#Open_addressing
>> + *
>> + * Dynamically allocate a hash table big enough to hold at least 4X
the
>> + * number of possible cpus in the system. Allocation is done on page
>> + * granularity. So the minimum number of hash buckets should be at
least
>> + * 256 to fully utilize a 4k page.
>> + */
>> +#define LFSR_MIN_BITS	8
>> +#define	LFSR_MAX_BITS	(2 + NR_CPUS_BITS)
>> +#if LFSR_MAX_BITS<  LFSR_MIN_BITS
>> +#undef  LFSR_MAX_BITS
>> +#define LFSR_MAX_BITS	LFSR_MIN_BITS
>> +#endif
>> +
>> +struct pv_hash_bucket {
>> +	struct qspinlock *lock;
>> +	struct pv_node   *node;
>> +};
>> +#define PV_HB_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct
pv_hash_bucket))
>> +#define HB_RESERVED	((struct qspinlock *)1)
> This is unused.
You are right, I will remove that.
>> +
>> +static struct pv_hash_bucket *pv_lock_hash;
>> +static unsigned int pv_lock_hash_bits __read_mostly;
> static unsigned int pv_taps __read_mostly;
It will depend on whether we keep the lfsr code or not.
>> +
>> +#include<linux/hash.h>
>> +#include<linux/lfsr.h>
>> +#include<linux/bootmem.h>
>> +
>> +/*
>> + * Allocate memory for the PV qspinlock hash buckets
>> + *
>> + * This function should be called from the paravirt spinlock
initialization
>> + * routine.
>> + */
>> +void __init __pv_init_lock_hash(void)
>> +{
>> +	int pv_hash_size = 4 * num_possible_cpus();
>> +
>> +	if (pv_hash_size<  (1U<<  LFSR_MIN_BITS))
>> +		pv_hash_size = (1U<<  LFSR_MIN_BITS);
>> +	/*
>> +	 * Allocate space from bootmem which should be page-size aligned
>> +	 * and hence cacheline aligned.
>> +	 */
>> +	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
>> +					       sizeof(struct pv_hash_bucket),
>> +					       pv_hash_size, 0, HASH_EARLY,
>> +					&pv_lock_hash_bits, NULL,
>> +					       pv_hash_size, pv_hash_size);
> 	pv_taps = lfsr_taps(pv_lock_hash_bits);
>
I don't understand what you meant here.
>> +}
>> +
>> +static inline u32 hash_align(u32 hash)
>> +{
>> +	return hash&  ~(PV_HB_PER_LINE - 1);
>> +}
>> +
>> +static struct qspinlock **pv_hash(struct qspinlock *lock, struct
pv_node *node)
>> +{
>> +	unsigned long init_hash, hash = hash_ptr(lock, pv_lock_hash_bits);
>> +	struct pv_hash_bucket *hb, *end;
>> +
>> +	if (!hash)
>> +		hash = 1;
>> +
>> +	init_hash = hash;
>> +	hb =&pv_lock_hash[hash_align(hash)];
>> +	for (;;) {
>> +		for (end = hb + PV_HB_PER_LINE; hb<  end; hb++) {
>> +			if (!cmpxchg(&hb->lock, NULL, lock)) {
>> +				WRITE_ONCE(hb->node, node);
>> +				/*
>> +				 * We haven't set the _Q_SLOW_VAL yet. So
>> +				 * the order of writing doesn't matter.
>> +				 */
>> +				smp_wmb(); /* matches rmb from pv_hash_find */
> This doesn't make sense. Both sites do ->lock first and ->node
second.
> No amount of ordering can 'fix' that.
>
> I think we can safely remove this wmb and the rmb below, because the
> required ordering is already provided by setting/observing l->locked
=> SLOW.
>
Right, I was over-cautious here. Will remove that.
>> +				goto done;
>> +			}
>> +		}
>> +
>> +		hash = lfsr(hash, pv_lock_hash_bits, 0);
> Since pv_lock_hash_bits is a variable, you end up running through that
> massive if() forest to find the corresponding tap every single time. It
> cannot compile-time optimize it.
The minimum bits size is now 8. So unless the system has more than 64 
vCPUs, it will get the right value in the first if statement.
> Hence:
> 		hash = lfsr(hash, pv_taps);
>
> (I don't get the bits argument to the lfsr).
>
> In any case, like I said before, I think we should try a linear probe
> sequence first, the lfsr was over engineering from my side.
As I said before, I am fine with removing the lfsr() code.
>> +		hb =&pv_lock_hash[hash_align(hash)];
>> +		BUG_ON(hash == init_hash);
>> +	}
>> +
>> +done:
>> +	return&hb->lock;
>> +}
>> +
>> +static struct pv_node *pv_hash_find(struct qspinlock *lock)
>> +{
>> +	unsigned long init_hash, hash = hash_ptr(lock, pv_lock_hash_bits);
>> +	struct pv_hash_bucket *hb, *end;
>> +	struct pv_node *node = NULL;
>> +
>> +	if (!hash)
>> +		hash = 1;
>> +
>> +	init_hash = hash;
>> +	hb =&pv_lock_hash[hash_align(hash)];
>> +	for (;;) {
>> +		for (end = hb + PV_HB_PER_LINE; hb<  end; hb++) {
>> +			struct qspinlock *l = READ_ONCE(hb->lock);
>> +
>> +			if (l == lock) {
>> +				smp_rmb(); /* matches wmb from pv_hash() */
> per the above this can go, IF we observe SLOW we must also observe a
> consistent bucket.
Will remove that.
>> +				node = READ_ONCE(hb->node);
>> +				goto done;
>> +			}
>> +		}
>> +
>> +		hash = lfsr(hash, pv_lock_hash_bits, 0);
> idem the previous lfsr comment.
>
>> +		hb =&pv_lock_hash[hash_align(hash)];
>> +		BUG_ON(hash == init_hash);
>> +	}
>> +done:
>> +	/*
>> +	 * Clear the hash bucket
>> +	 */
>> +	WRITE_ONCE(hb->lock, NULL);
>> +	return node;
>> +}
>> +/*
>> + * Wait for l->locked to become clear; halt the vcpu after a short
spin.
>> + * __pv_queue_spin_unlock() will wake us.
>> + */
>> +static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock
*node)
>> +{
>> +	struct __qspinlock *l = (void *)lock;
>> +	struct qspinlock **lp = NULL;
>> +	struct pv_node *pn = (struct pv_node *)node;
>> +	int slow_set = false;
>> +	int loop;
>> +
>> +	for (;;) {
>> +		for (loop = SPIN_THRESHOLD; loop; loop--) {
>> +			if (!READ_ONCE(l->locked))
>> +				return;
>> +
>> +			cpu_relax();
>> +		}
>> +
>> +		WRITE_ONCE(pn->state, vcpu_halted);
>> +		if (!lp)
>> +			lp = pv_hash(lock, pn);
>> +		/*
>> +		 * lp must be set before setting _Q_SLOW_VAL
>> +		 *
>> +		 * [S] lp = lock                [RmW] l = l->locked = 0
>> +		 *     MB                             MB
>> +		 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
>> +		 *
>> +		 * Matches the cmpxchg() in pv_queue_spin_unlock().
>> +		 */
>> +		if (!slow_set&&
>> +		    !cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
>> +			/*
>> +			 * The lock is free and _Q_SLOW_VAL has never been
>> +			 * set. Need to clear the hash bucket before getting
>> +			 * the lock.
>> +			 */
>> +			WRITE_ONCE(*lp, NULL);
>> +			return;
>> +		} else if (slow_set&&  !READ_ONCE(l->locked))
>> +			return;
>> +		slow_set = true;
> I'm somewhat puzzled by the slow_set thing; what is wrong with the
thing
> I had, namely:
>
> 		if (!lp) {
> 			lp = pv_hash(lock, pn);
>
> 			/*
> 			 * comment
> 			 */
> 			lv = cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL);
> 			if (lv != _Q_LOCKED_VAL) {
> 				/* we're woken, unhash and return */
> 				WRITE_ONCE(*lp, NULL);
> 				return;
> 			}
> 		}
>> +
>> +		pv_wait(&l->locked, _Q_SLOW_VAL);
>
> If we get a spurious wakeup (due to device interrupts or random kick)
> we'll loop around but ->locked will remain _Q_SLOW_VAL.
The purpose of the slow_set flag is not about the lock value. It is to 
make sure that pv_hash_find() will always find a match. Consider the 
following scenario:

cpu1            cpu2                    cpu3
----            ----                    ----
pv_wait
spurious wakeup
loop l->locked

                 read _Q_SLOW_VAL
                 pv_hash_find()
                 unlock

                                         pv_hash() <- same entry

cmpxchg(&l->locked)
clear hash (?)

Here, the entry for cpu3 will be removed leading to panic when
pv_hash_find() can find the entry. So the hash entry can only be
removed if the other cpu has no chance to see _Q_SLOW_VAL.
>> +	}
>> +	/*
>> +	 * Lock is unlocked now; the caller will acquire it without waiting.
>> +	 * As with pv_wait_node() we rely on the caller to do a load-acquire
>> +	 * for us.
>> +	 */
>> +}
>> +
>> +/*
>> + * To be used in stead of queue_spin_unlock() for paravirt locks.
Wakes
>> + * pv_wait_head() if appropriate.
>> + */
>> +__visible void __pv_queue_spin_unlock(struct qspinlock *lock)
>> +{
>> +	struct __qspinlock *l = (void *)lock;
>> +	struct pv_node *node;
>> +
>> +	if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) ==
_Q_LOCKED_VAL))
>> +		return;
>> +
>> +	/*
>> +	 * The queue head has been halted. Need to locate it and wake it up.
>> +	 */
>> +	node = pv_hash_find(lock);
>> +	smp_store_release(&l->locked, 0);
> Ah yes, clever that.
>
>> +	/*
>> +	 * At this point the memory pointed at by lock can be freed/reused,
>> +	 * however we can still use the PV node to kick the CPU.
>> +	 */
>> +	if (READ_ONCE(node->state) == vcpu_halted)
>> +		pv_kick(node->cpu);
>> +}
>> +PV_CALLEE_SAVE_REGS_THUNK(__pv_queue_spin_unlock);
> However I feel the PV_CALLEE_SAVE_REGS_THUNK thing belongs in the x86
> code.
That is why I originally put my version of the qspinlock_paravirt.h 
header file under arch/x86/include/asm. Maybe we should move it back 
there. Putting the thunk in arch/x86/kernel/kvm.c didn't work when you 
consider that the Xen code also need that.

Cheers,
Longman

Peter Zijlstra

2015-Apr-13 14:47 UTC

head link

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

On Thu, Apr 09, 2015 at 05:41:44PM -0400, Waiman Long
wrote:> >>+void __init __pv_init_lock_hash(void)
> >>+{
> >>+	int pv_hash_size = 4 * num_possible_cpus();
> >>+
> >>+	if (pv_hash_size<  (1U<<  LFSR_MIN_BITS))
> >>+		pv_hash_size = (1U<<  LFSR_MIN_BITS);
> >>+	/*
> >>+	 * Allocate space from bootmem which should be page-size aligned
> >>+	 * and hence cacheline aligned.
> >>+	 */
> >>+	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
> >>+					       sizeof(struct pv_hash_bucket),
> >>+					       pv_hash_size, 0, HASH_EARLY,
> >>+					&pv_lock_hash_bits, NULL,
> >>+					       pv_hash_size, pv_hash_size);
> >	pv_taps = lfsr_taps(pv_lock_hash_bits);
> >
> 
> I don't understand what you meant here.
Let me explain (even though I propose taking all the LFSR stuff out).

pv_lock_hash_bit is a runtime variable, therefore it cannot compile time
evaluate the forest of if statements required to compute the taps value.

Therefore its best to compute the taps _once_, and this seems like a
good place to do so.
> >>+				goto done;
> >>+			}
> >>+		}
> >>+
> >>+		hash = lfsr(hash, pv_lock_hash_bits, 0);
> >Since pv_lock_hash_bits is a variable, you end up running through that
> >massive if() forest to find the corresponding tap every single time. It
> >cannot compile-time optimize it.
> 
> The minimum bits size is now 8. So unless the system has more than 64
vCPUs,
> it will get the right value in the first if statement.
Still, no reason to not pre-compute the taps value, its simple enough.

Peter Zijlstra

2015-Apr-13 15:08 UTC

head link

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

On Thu, Apr 09, 2015 at 05:41:44PM -0400, Waiman Long wrote:
> >>+static void pv_wait_head(struct qspinlock *lock, struct
mcs_spinlock *node)
> >>+{
> >>+	struct __qspinlock *l = (void *)lock;
> >>+	struct qspinlock **lp = NULL;
> >>+	struct pv_node *pn = (struct pv_node *)node;
> >>+	int slow_set = false;
> >>+	int loop;
> >>+
> >>+	for (;;) {
> >>+		for (loop = SPIN_THRESHOLD; loop; loop--) {
> >>+			if (!READ_ONCE(l->locked))
> >>+				return;
> >>+
> >>+			cpu_relax();
> >>+		}
> >>+
> >>+		WRITE_ONCE(pn->state, vcpu_halted);
> >>+		if (!lp)
> >>+			lp = pv_hash(lock, pn);
> >>+		/*
> >>+		 * lp must be set before setting _Q_SLOW_VAL
> >>+		 *
> >>+		 * [S] lp = lock                [RmW] l = l->locked = 0
> >>+		 *     MB                             MB
> >>+		 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
> >>+		 *
> >>+		 * Matches the cmpxchg() in pv_queue_spin_unlock().
> >>+		 */
> >>+		if (!slow_set&&
> >>+		    !cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
> >>+			/*
> >>+			 * The lock is free and _Q_SLOW_VAL has never been
> >>+			 * set. Need to clear the hash bucket before getting
> >>+			 * the lock.
> >>+			 */
> >>+			WRITE_ONCE(*lp, NULL);
> >>+			return;
> >>+		} else if (slow_set&&  !READ_ONCE(l->locked))
> >>+			return;
> >>+		slow_set = true;
> >I'm somewhat puzzled by the slow_set thing; what is wrong with the
thing
> >I had, namely:
> >
> >		if (!lp) {
> >			lp = pv_hash(lock, pn);
> >
> >			/*
> >			 * comment
> >			 */
> >			lv = cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL);
> >			if (lv != _Q_LOCKED_VAL) {
> >				/* we're woken, unhash and return */
> >				WRITE_ONCE(*lp, NULL);
> >				return;
> >			}
> >		}
> >>+
> >>+		pv_wait(&l->locked, _Q_SLOW_VAL);
> >
> >If we get a spurious wakeup (due to device interrupts or random kick)
> >we'll loop around but ->locked will remain _Q_SLOW_VAL.
> 
> The purpose of the slow_set flag is not about the lock value. It is to make
> sure that pv_hash_find() will always find a match. Consider the following
> scenario:
> 
> cpu1            cpu2                    cpu3
> ----            ----                    ----
> pv_wait
> spurious wakeup
> loop l->locked
> 
>                 read _Q_SLOW_VAL
>                 pv_hash_find()
>                 unlock
> 
>                                         pv_hash() <- same entry
> 
> cmpxchg(&l->locked)
> clear hash (?)
> 
> Here, the entry for cpu3 will be removed leading to panic when
> pv_hash_find() can find the entry. So the hash entry can only be
> removed if the other cpu has no chance to see _Q_SLOW_VAL.
Still confused. Afaict that cannot happen with my code. We only do the
cmpxchg(, _Q_SLOW_VAL) _once_.

Only on the first time around that loop will we hash the lock and set
the slow flag. And cpu3 cannot come in on the same entry because we've
not yet released the lock when we find and unhash.

Peter Zijlstra

2015-Apr-13 15:09 UTC

head link

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

On Thu, Apr 09, 2015 at 05:41:44PM -0400, Waiman Long
wrote:> >>+__visible void __pv_queue_spin_unlock(struct qspinlock *lock)
> >>+{
> >>+	struct __qspinlock *l = (void *)lock;
> >>+	struct pv_node *node;
> >>+
> >>+	if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) ==
_Q_LOCKED_VAL))
> >>+		return;
> >>+
> >>+	/*
> >>+	 * The queue head has been halted. Need to locate it and wake it
up.
> >>+	 */
> >>+	node = pv_hash_find(lock);
> >>+	smp_store_release(&l->locked, 0);
> >Ah yes, clever that.
> >
> >>+	/*
> >>+	 * At this point the memory pointed at by lock can be
freed/reused,
> >>+	 * however we can still use the PV node to kick the CPU.
> >>+	 */
> >>+	if (READ_ONCE(node->state) == vcpu_halted)
> >>+		pv_kick(node->cpu);
> >>+}
> >>+PV_CALLEE_SAVE_REGS_THUNK(__pv_queue_spin_unlock);
> >However I feel the PV_CALLEE_SAVE_REGS_THUNK thing belongs in the x86
> >code.
> 
> That is why I originally put my version of the qspinlock_paravirt.h header
> file under arch/x86/include/asm. Maybe we should move it back there.
Putting
> the thunk in arch/x86/kernel/kvm.c didn't work when you consider that
the
> Xen code also need that.
Well the function is 'generic' and belong here I think. Its just the
PV_CALLEE_SAVE_REGS_THUNK thing that arch specific. Should have live in
arch/x86/kernel/paravirt-spinlocks.c instead?

Possibly Parallel Threads

Search for more reasonably related threads

Linux Virtualization - Apr 2015 - [PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

[PATCH v15 09/15] pvqspinlock: Implement simple paravirt support for the qspinlock

Possibly Parallel Threads