Waiman Long
2014-Oct-27 17:15 UTC
[PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/24/2014 06:04 PM, Peter Zijlstra wrote:> On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote: >> The additional register pressure may just cause a few more register moves >> which should be negligible in the overall performance . The additional >> icache pressure, however, may have some impact on performance. I was trying >> to balance the performance of the pv and non-pv versions so that we won't >> penalize the pv code too much for a bit more performance in the non-pv code. >> Doing it your way will add a lot of function call and register >> saving/restoring to the pv code. > If people care about performance they should not be using virt crap :-) > > I only really care about bare metal.Yes, I am aware of that. However, the whole point of doing PV spinlock is to improve performance in a virtual guest. Anyway, I had done some measurements. In my test system, the queue_spin_lock_slowpath() function has a text size of about 400 bytes without PV, but 1120 bytes with PV. I made some changes to create separate versions of PV and non-PV slowpath functions as shown by the diff below. The text size is now about 430 bytes for the non-PV version and 925 bytes for the PV version. The overall object size increases by a bit more than 200 bytes, but the icache footprint should be reduced no matter which version is used. -Longman ---------------------------------------- diff --git a/arch/x86/include/asm/pvqspinlock.h b/arch/x86/include/asm/pvqspinlo index d424252..241bf30 100644 --- a/arch/x86/include/asm/pvqspinlock.h +++ b/arch/x86/include/asm/pvqspinlock.h @@ -79,9 +79,6 @@ static inline void pv_init_node(struct mcs_spinlock *node) BUILD_BUG_ON(sizeof(struct pv_qnode) > 5*sizeof(struct mcs_spinlock)); - if (!pv_enabled()) - return; - pn->cpustate = PV_CPU_ACTIVE; pn->mayhalt = false; pn->mycpu = smp_processor_id(); @@ -132,9 +129,6 @@ static inline bool pv_link_and_wait_node(u32 old, struct mcs struct pv_qnode *ppn, *pn = (struct pv_qnode *)node; unsigned int count; - if (!pv_enabled()) - return false; - if (!(old & _Q_TAIL_MASK)) { node->locked = true; /* At queue head now */ goto ret; @@ -236,9 +230,6 @@ pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *no { struct pv_qnode *pn = (struct pv_qnode *)node; - if (!pv_enabled()) - return smp_load_acquire(&lock->val.counter); - for (;;) { unsigned int count; s8 oldstate; @@ -328,8 +319,6 @@ static inline void pv_wait_check(struct qspinlock *lock, struct pv_qnode *pnxt = (struct pv_qnode *)next; struct pv_qnode *pcur = (struct pv_qnode *)node; - if (!pv_enabled()) - return; /* * Clear the locked and head values of lock holder */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 1662dbd..05aea57 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -16,6 +16,7 @@ * Authors: Waiman Long <waiman.long at hp.com> * Peter Zijlstra <pzijlstr at redhat.com> */ +#ifndef _GEN_PV_LOCK_SLOWPATH #include <linux/smp.h> #include <linux/bug.h> #include <linux/cpumask.h> @@ -271,19 +272,37 @@ void queue_spin_unlock_slowpath(struct qspinlock *lock) } EXPORT_SYMBOL(queue_spin_unlock_slowpath); -#else +static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val); + +#else /* CONFIG_PARAVIRT_SPINLOCKS */ + +static inline void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) + { } -static inline void pv_init_node(struct mcs_spinlock *node) { } -static inline void pv_wait_check(struct qspinlock *lock, - struct mcs_spinlock *node, - struct mcs_spinlock *next) { } -static inline bool pv_link_and_wait_node(u32 old, struct mcs_spinlock *node) +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Dummy PV functions for bare-metal slowpath code + */ +static inline void nopv_init_node(struct mcs_spinlock *node) { } +static inline void nopv_wait_check(struct qspinlock *lock, + struct mcs_spinlock *node, + struct mcs_spinlock *next) { } +static inline bool nopv_link_and_wait_node(u32 old, struct mcs_spinlock *node) { return false; } -static inline int pv_wait_head(struct qspinlock *lock, +static inline int nopv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { return smp_load_acquire(&lock->val.counter); } +static inline bool return_true(void) { return true; } +static inline bool return_false(void) { return false; } -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ +#define pv_init_node nopv_init_node +#define pv_wait_check nopv_wait_check +#define pv_link_and_wait_node nopv_link_and_wait_node +#define pv_wait_head nopv_wait_head +#define in_pv_code return_false + +#endif /* _GEN_PV_LOCK_SLOWPATH */ /** * queue_spin_lock_slowpath - acquire the queue spinlock @@ -306,7 +325,11 @@ static inline int pv_wait_head(struct qspinlock *lock, * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : * queue : ^--' : */ +#ifdef _GEN_PV_LOCK_SLOWPATH +static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) +#else void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) +#endif { struct mcs_spinlock *prev, *next, *node; u32 new, old, tail; @@ -314,7 +337,12 @@ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 v BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); - if (pv_enabled()) + if (pv_enabled()) { + pv_queue_spin_lock_slowpath(lock, val); + return; + } + + if (in_pv_code()) goto queue; if (virt_queue_spin_lock(lock)) @@ -474,3 +502,23 @@ release: this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queue_spin_lock_slowpath); + +#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS) +/* + * Generate the PV version of the queue_spin_lock_slowpath function + */ +#undef pv_init_node +#undef pv_wait_check +#undef pv_link_and_wait_node +#undef pv_wait_head +#undef EXPORT_SYMBOL +#undef in_pv_code + +#define _GEN_PV_LOCK_SLOWPATH +#define EXPORT_SYMBOL(x) +#define in_pv_code return_true +#define pv_enabled return_false + +#include "qspinlock.c" + +#endif
Peter Zijlstra
2014-Oct-27 17:27 UTC
[PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Mon, Oct 27, 2014 at 01:15:53PM -0400, Waiman Long wrote:> On 10/24/2014 06:04 PM, Peter Zijlstra wrote: > >On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote: > >>The additional register pressure may just cause a few more register moves > >>which should be negligible in the overall performance . The additional > >>icache pressure, however, may have some impact on performance. I was trying > >>to balance the performance of the pv and non-pv versions so that we won't > >>penalize the pv code too much for a bit more performance in the non-pv code. > >>Doing it your way will add a lot of function call and register > >>saving/restoring to the pv code. > >If people care about performance they should not be using virt crap :-) > > > >I only really care about bare metal. > > Yes, I am aware of that. However, the whole point of doing PV spinlock is to > improve performance in a virtual guest.Anything that avoids the lock holder preemption nonsense is a _massive_ win for them, a few function calls should not even register on that scale.> +#ifdef _GEN_PV_LOCK_SLOWPATH > +static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) > +#else > void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) > +#endifIf you have two functions you might as well use the PV stuff to patch in the right function call at the usage sites and avoid:> + if (pv_enabled()) { > + pv_queue_spin_lock_slowpath(lock, val); > + return; > + }this alltogether.> this_cpu_dec(mcs_nodes[0].count); > } > EXPORT_SYMBOL(queue_spin_lock_slowpath); > + > +#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS) > +/* > + * Generate the PV version of the queue_spin_lock_slowpath function > + */ > +#undef pv_init_node > +#undef pv_wait_check > +#undef pv_link_and_wait_node > +#undef pv_wait_head > +#undef EXPORT_SYMBOL > +#undef in_pv_code > + > +#define _GEN_PV_LOCK_SLOWPATH > +#define EXPORT_SYMBOL(x) > +#define in_pv_code return_true > +#define pv_enabled return_false > + > +#include "qspinlock.c" > + > +#endifThat's properly disgusting :-) But a lot better than actually duplicating everything I suppose.
Waiman Long
2014-Oct-27 20:50 UTC
[PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/27/2014 01:27 PM, Peter Zijlstra wrote:> On Mon, Oct 27, 2014 at 01:15:53PM -0400, Waiman Long wrote: >> On 10/24/2014 06:04 PM, Peter Zijlstra wrote: >>> On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote: >>>> The additional register pressure may just cause a few more register moves >>>> which should be negligible in the overall performance . The additional >>>> icache pressure, however, may have some impact on performance. I was trying >>>> to balance the performance of the pv and non-pv versions so that we won't >>>> penalize the pv code too much for a bit more performance in the non-pv code. >>>> Doing it your way will add a lot of function call and register >>>> saving/restoring to the pv code. >>> If people care about performance they should not be using virt crap :-) >>> >>> I only really care about bare metal. >> Yes, I am aware of that. However, the whole point of doing PV spinlock is to >> improve performance in a virtual guest. > Anything that avoids the lock holder preemption nonsense is a _massive_ > win for them, a few function calls should not even register on that > scale.I would say all the PV stuffs are mostly useful for a over-committed guest where a single CPU is shared in more than one guest. When the guests are not overcommitted, the PV code seldom get triggered. In those cases, the overhead of the extra function call and register saving/restoring will show up.>> +#ifdef _GEN_PV_LOCK_SLOWPATH >> +static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) >> +#else >> void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) >> +#endif > If you have two functions you might as well use the PV stuff to patch in > the right function call at the usage sites and avoid: > >> + if (pv_enabled()) { >> + pv_queue_spin_lock_slowpath(lock, val); >> + return; >> + } > this alltogether.Good point! I will do some investigation on how to do this kind of function address patching and eliminate the extra function call overhead.>> this_cpu_dec(mcs_nodes[0].count); >> } >> EXPORT_SYMBOL(queue_spin_lock_slowpath); >> + >> +#if !defined(_GEN_PV_LOCK_SLOWPATH)&& defined(CONFIG_PARAVIRT_SPINLOCKS) >> +/* >> + * Generate the PV version of the queue_spin_lock_slowpath function >> + */ >> +#undef pv_init_node >> +#undef pv_wait_check >> +#undef pv_link_and_wait_node >> +#undef pv_wait_head >> +#undef EXPORT_SYMBOL >> +#undef in_pv_code >> + >> +#define _GEN_PV_LOCK_SLOWPATH >> +#define EXPORT_SYMBOL(x) >> +#define in_pv_code return_true >> +#define pv_enabled return_false >> + >> +#include "qspinlock.c" >> + >> +#endif > That's properly disgusting :-) But a lot better than actually > duplicating everything I suppose.I know you don't like this kind of preprocessor trick, but this is the easiest way that I can think of to generate two separate functions from the same source code. -Longman
Maybe Matching Threads
- [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
- [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
- [PATCH v13 09/11] pvqspinlock, x86: Add para-virtualization support
- [PATCH v13 09/11] pvqspinlock, x86: Add para-virtualization support
- [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support