Subject: mutex: implement adaptive spinning
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon Jan 12 14:01:47 CET 2009

Change mutex contention behaviour such that it will sometimes busy wait on
acquisition - moving its behaviour closer to that of spinlocks.

This concept got ported to mainline from the -rt tree, where it was originally
implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins.

Testing with Ingo's test-mutex application (http://lkml.org/lkml/2006/1/8/50)
gave a 345% boost for VFS scalability on my testbox:

 # ./test-mutex-shm V 16 10 | grep "^avg ops"
 avg ops/sec: 296604

 # ./test-mutex-shm V 16 10 | grep "^avg ops"
 avg ops/sec: 85870

The key criteria for the busy wait is that the lock owner has to be running on
a (different) cpu. The idea is that as long as the owner is running, there is a
fair chance it'll release the lock soon, and thus we'll be better off spinning
instead of blocking/scheduling.

Since regular mutexes (as opposed to rtmutexes) do not atomically track the
owner, we add the owner in a non-atomic fashion and deal with the races in
the slowpath.

Furthermore, to ease the testing of the performance impact of this new code,
there is means to disable this behaviour runtime (without having to reboot
the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y),
by issuing the following command:

 # echo NO_OWNER_SPIN > /debug/sched_features

This command re-enables spinning again (this is also the default):

 # echo OWNER_SPIN > /debug/sched_features

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Changes since -v7:
 - made all the mutex ops fully non-preemptible
   - this fixes the getting preempted with owner=null issues
 - implemented FIFO fairness using the wait_list
 - fixed owner tracking issues

Dmitry noted that since we now have FIFO order, we need not put all
spinners to rest when one falls into schedule, but only those behind it
in the queue -- I haven't yet implemented this, but it should be doable
without a list-walk.

PREEMPT_NONE, PREEMPT_VOLUNTARY work as advertised
PREEMPT=y still seems to madly over-preempt or something.
include/linux/mutex.h | 10 ++- include/linux/sched.h | 3 kernel/mutex-debug.c | 9 -- kernel/mutex-debug.h | 18 +++-- kernel/mutex.c | 154 ++++++++++++++++++++++++++++++++++++++++++------ kernel/mutex.h | 22 ++++++ kernel/sched.c | 82 ++++++++++++++++++++++++- kernel/sched_features.h | 1 8 files changed, 257 insertions(+), 42 deletions(-) Index: linux-2.6/include/linux/mutex.h ==================================================================--- linux-2.6.orig/include/linux/mutex.h +++ linux-2.6/include/linux/mutex.h @@ -50,8 +50,11 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; + int waiters; +#endif +#ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; #endif @@ -67,8 +70,11 @@ struct mutex { struct mutex_waiter { struct list_head list; struct task_struct *task; -#ifdef CONFIG_DEBUG_MUTEXES struct mutex *lock; +#ifdef CONFIG_SMP + int spin; +#endif +#ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif }; Index: linux-2.6/include/linux/sched.h ==================================================================--- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -328,7 +328,10 @@ extern signed long schedule_timeout(sign extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex_waiter *waiter, + struct thread_info *owner); struct nsproxy; struct user_namespace; Index: linux-2.6/kernel/mutex-debug.c ==================================================================--- linux-2.6.orig/kernel/mutex-debug.c +++ linux-2.6/kernel/mutex-debug.c @@ -26,11 +26,6 @@ /* * Must be called with lock->wait_lock held. */ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; - waiter->lock = lock; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lo DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + mutex_clear_owner(lock); } void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->owner = NULL; lock->magic = lock; } Index: linux-2.6/kernel/mutex-debug.h ==================================================================--- linux-2.6.orig/kernel/mutex-debug.h +++ linux-2.6/kernel/mutex-debug.h @@ -13,14 +13,6 @@ /* * This must be called with lock->wait_lock held. 
*/ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mu extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ Index: linux-2.6/kernel/mutex.c ==================================================================--- linux-2.6.orig/kernel/mutex.c +++ linux-2.6/kernel/mutex.c @@ -10,6 +10,11 @@ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and * David Howells for suggestions and improvements. * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * * Also see Documentation/mutex-design.txt. */ #include <linux/mutex.h> @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const c atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); debug_mutex_init(lock, name, key); } @@ -90,7 +96,10 @@ void inline __sched mutex_lock(struct mu * The locking fastpath is the 1->0 transition from * ''unlocked'' into ''locked'' state. */ + preempt_disable(); __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); + preempt_enable(); } EXPORT_SYMBOL(mutex_lock); @@ -115,11 +124,86 @@ void __sched mutex_unlock(struct mutex * * The unlocking fastpath is the 0->1 transition from ''locked'' * into ''unlocked'' state: */ + preempt_disable(); +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); + preempt_enable(); } EXPORT_SYMBOL(mutex_unlock); +#ifdef CONFIG_SMP +static void mutex_spin_or_schedule(struct mutex_waiter *waiter, + unsigned long *flags) +{ + struct mutex *lock = waiter->lock; + + waiter->spin = !lock->waiters; + if (!waiter->spin) + goto sleep; + + spin_unlock_mutex(&lock->wait_lock, *flags); + while (waiter->spin && !lock->waiters) { + struct thread_info *owner; + + /* Busy wait on the owner, if any. */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(waiter, owner)) + break; + + cpu_relax(); + } + spin_lock_mutex(&lock->wait_lock, *flags); + + if (!waiter->spin) { + __set_task_state(waiter->task, TASK_RUNNING); + return; + } + waiter->spin = 0; +sleep: + /* + * Stop all other spinners. 
+ */ + lock->waiters++; + spin_unlock_mutex(&lock->wait_lock, *flags); + + __schedule(); + + spin_lock_mutex(&lock->wait_lock, *flags); + lock->waiters--; +} + +static void mutex_wake_up(struct mutex_waiter *waiter) +{ + if (waiter->spin) + waiter->spin = 0; + else + wake_up_process(waiter->task); +} +#else +static void mutex_spin_or_schedule(struct mutex_waiter *waiter, + unsigned long *flags) +{ + struct mutex *lock = waiter->lock; + + spin_unlock_mutex(&lock->wait_lock, *flags); + __schedule(); + spin_lock_mutex(&lock->wait_lock, *flags); +} + +static void mutex_wake_up(struct mutex_waiter *waiter) +{ + wake_up_process(waiter->task); +} +#endif + /* * Lock a mutex (possibly interruptible), slowpath: */ @@ -129,7 +213,6 @@ __mutex_lock_common(struct mutex *lock, { struct task_struct *task = current; struct mutex_waiter waiter; - unsigned int old_val; unsigned long flags; spin_lock_mutex(&lock->wait_lock, flags); @@ -141,9 +224,9 @@ __mutex_lock_common(struct mutex *lock, /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + waiter.lock = lock; - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) goto done; lock_contended(&lock->dep_map, ip); @@ -158,8 +241,7 @@ __mutex_lock_common(struct mutex *lock, * that when we release the lock, we properly wake up the * other waiters: */ - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) break; /* @@ -178,16 +260,14 @@ __mutex_lock_common(struct mutex *lock, __set_task_state(task, state); /* didnt get the lock, go to sleep: */ - spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); - spin_lock_mutex(&lock->wait_lock, flags); + mutex_spin_or_schedule(&waiter, &flags); } done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! 
*/ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); + mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -205,7 +285,9 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); + preempt_disable(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); + preempt_enable(); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -213,16 +295,28 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested); int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { + int ret; + might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); + preempt_disable(); + ret = __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); + preempt_enable(); + + return ret; } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { + int ret; + might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + preempt_disable(); + ret = __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + preempt_enable(); + + return ret; } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -257,11 +351,9 @@ __mutex_unlock_common_slowpath(atomic_t debug_mutex_wake_waiter(lock, waiter); - wake_up_process(waiter->task); + mutex_wake_up(waiter); } - debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock, flags); } @@ -298,18 +390,34 @@ __mutex_lock_interruptible_slowpath(atom */ int __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + preempt_disable(); + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) + mutex_set_owner(lock); + preempt_enable(); + + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + preempt_disable(); + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + preempt_enable(); + + return ret; } EXPORT_SYMBOL(mutex_lock_killable); @@ -352,9 +460,10 @@ static inline int __mutex_trylock_slowpa prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); + mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); } + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -380,8 +489,15 @@ static inline int __mutex_trylock_slowpa */ int __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); + int ret; + + preempt_disable(); + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + preempt_enable(); + + return ret; } EXPORT_SYMBOL(mutex_trylock); Index: linux-2.6/kernel/mutex.h ==================================================================--- linux-2.6.orig/kernel/mutex.h +++ linux-2.6/kernel/mutex.h @@ -16,8 +16,26 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner) do { } while (0) -#define debug_mutex_clear_owner(lock) do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + 
lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) Index: linux-2.6/kernel/sched.c ==================================================================--- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -4535,15 +4535,13 @@ pick_next_task(struct rq *rq, struct tas /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4600,13 +4598,91 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex_waiter *waiter, struct thread_info *owner) +{ + struct mutex *lock = waiter->lock; + unsigned int cpu; + struct rq *rq; + int ret = 1; + + if (!sched_feat(OWNER_SPIN)) + return 0; + + preempt_disable(); +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + while (waiter->spin && !lock->waiters) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? + */ + if (task_thread_info(rq->curr) != owner) { + ret = 0; + break; + } + + if (need_resched()) { + ret = 0; + break; + } + + cpu_relax(); + } +out: + preempt_enable_no_resched(); + return ret; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption Index: linux-2.6/kernel/sched_features.h ==================================================================--- linux-2.6.orig/kernel/sched_features.h +++ linux-2.6/kernel/sched_features.h @@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Linus Torvalds
2009-Jan-12 16:04 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
On Mon, 12 Jan 2009, Peter Zijlstra wrote:
>
> Change mutex contention behaviour such that it will sometimes busy wait on
> acquisition - moving its behaviour closer to that of spinlocks.

This version seems worse in so many ways.

You made the mutex bigger, for no obvious reason.

You made it back into the locked version.

You have unnecessary preempt_enable/preempt_disable calls in functions
that need to be called (and are called) with preemption already disabled.

Things seem to be going backwards.

		Linus
Peter Zijlstra wrote:
> Subject: mutex: implement adaptive spinning
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Mon Jan 12 14:01:47 CET 2009
>
> Change mutex contention behaviour such that it will sometimes busy wait on
> acquisition - moving its behaviour closer to that of spinlocks.
>
> This concept got ported to mainline from the -rt tree, where it was originally
> implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins.
>
> Testing with Ingo's test-mutex application (http://lkml.org/lkml/2006/1/8/50)
> gave a 345% boost for VFS scalability on my testbox:
>
>  # ./test-mutex-shm V 16 10 | grep "^avg ops"
>  avg ops/sec: 296604
>
>  # ./test-mutex-shm V 16 10 | grep "^avg ops"
>  avg ops/sec: 85870
>
> The key criteria for the busy wait is that the lock owner has to be running on
> a (different) cpu. The idea is that as long as the owner is running, there is a
> fair chance it'll release the lock soon, and thus we'll be better off spinning
> instead of blocking/scheduling.
>
> Since regular mutexes (as opposed to rtmutexes) do not atomically track the
> owner, we add the owner in a non-atomic fashion and deal with the races in
> the slowpath.
>
> Furthermore, to ease the testing of the performance impact of this new code,
> there is means to disable this behaviour runtime (without having to reboot
> the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y),
> by issuing the following command:
>
>  # echo NO_OWNER_SPIN > /debug/sched_features
>
> This command re-enables spinning again (this is also the default):
>
>  # echo OWNER_SPIN > /debug/sched_features
>

One thing that worries me here is that the spinners will spin on a
memory location in struct mutex, which means that the cacheline holding
the mutex (which is likely to be under write activity from the owner)
will be continuously shared by the spinners, slowing the owner down when
it needs to unshare it.  One way out of this is to spin on a location in
struct mutex_waiter, and have the mutex owner touch it when it schedules
out.

So:
- each task_struct has an array of currently owned mutexes, appended to
  by mutex_lock()
- mutex waiters spin on mutex_waiter.wait, which they initialize to zero
- when switching out of a task, walk the mutex list, and for each mutex,
  bump each waiter's wait variable, and clear the owner array
- when unlocking a mutex, bump the nearest waiter's wait variable, and
  remove from the owner array

Something similar might be done to spinlocks to reduce cacheline
contention from spinners and the owner.

--
error compiling committee.c: too many arguments to function
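A minimal sketch of the per-waiter spin idea described above; the wait
field and the helper names are hypothetical, not part of any posted
patch, and barriers are omitted for brevity:

struct mutex_waiter {
	struct list_head	list;
	struct task_struct	*task;
	int			wait;	/* each spinner polls its own field */
};

/* Waiter side: spin on our private flag instead of on struct mutex. */
static void mutex_waiter_spin(struct mutex_waiter *waiter)
{
	while (!ACCESS_ONCE(waiter->wait))
		cpu_relax();
}

/* Owner side: on unlock (or on schedule-out), poke only one waiter. */
static void mutex_waiter_release(struct mutex_waiter *waiter)
{
	ACCESS_ONCE(waiter->wait) = 1;
}

The point being that each spinner's cacheline stays local until the
owner explicitly writes to it, instead of every spinner hammering the
line that holds the mutex itself.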
Linus Torvalds
2009-Jan-12 16:20 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
On Mon, 12 Jan 2009, Linus Torvalds wrote:
>
> You made it back into the locked version.

Btw, even if you probably had some reason for this, one thing to note is
that I think Chris' performance testing showed that the version using a
lock was inferior to his local btrfs hack, while the unlocked version
actually beat his hack.

Maybe I misunderstood his numbers, though. But if I followed that sub-part
of the test right, it really means that the locked version is pointless -
it will never be able to replace peoples local hacks for this same thing,
because it just doesn't give the performance people are looking for.

Since the whole (and _only_) point of this thing is to perform well,
that's a big deal.

		Linus
On Mon, 2009-01-12 at 08:20 -0800, Linus Torvalds wrote:
>
> On Mon, 12 Jan 2009, Linus Torvalds wrote:
> >
> > You made it back into the locked version.
>
> Btw, even if you probably had some reason for this, one thing to note is
> that I think Chris' performance testing showed that the version using a
> lock was inferior to his local btrfs hack, while the unlocked version
> actually beat his hack.
>

The spinning hack was faster than everything before v7 (we'll call it
the Linus-fix), and the v7 code was much faster than my spin.

This is somewhere in between, with slightly better fairness than v7.

                spin          v7            v8
dbench 50       580MB/s       789MB/s       421MB/s
file creates    152 file/s    162 file/s    195 file/s
file stat       3.8s total    2.3s total    5.3s total

(the file stat run is total run time, so lower is better.  The other
numbers are files or MB per second, so higher is better)

For the file create run, v8 had much lower system time than v7,
averaging 1s of sys time per proc instead of 1.6s.

-chris
Peter Zijlstra
2009-Jan-12 16:50 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
On Mon, 2009-01-12 at 11:45 -0500, Chris Mason wrote:
> On Mon, 2009-01-12 at 08:20 -0800, Linus Torvalds wrote:
> >
> > On Mon, 12 Jan 2009, Linus Torvalds wrote:
> > >
> > > You made it back into the locked version.
> >
> > Btw, even if you probably had some reason for this, one thing to note is
> > that I think Chris' performance testing showed that the version using a
> > lock was inferior to his local btrfs hack, while the unlocked version
> > actually beat his hack.
> >
>
> The spinning hack was faster than everything before v7 (we'll call it
> the Linus-fix), and the v7 code was much faster than my spin.
>
> This is somewhere in between, with slightly better fairness than v7.
>
>                 spin          v7            v8
> dbench 50       580MB/s       789MB/s       421MB/s
> file creates    152 file/s    162 file/s    195 file/s
> file stat       3.8s total    2.3s total    5.3s total
>
> (the file stat run is total run time, so lower is better.  The other
> numbers are files or MB per second, so higher is better)
>
> For the file create run, v8 had much lower system time than v7,
> averaging 1s of sys time per proc instead of 1.6s.

Right, how about the spread in completion time, because that is the only
reason I tried this fairness stuff, because you reported massive
differences there.
Peter Zijlstra
2009-Jan-12 17:13 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
On Mon, 2009-01-12 at 18:13 +0200, Avi Kivity wrote:

> One thing that worries me here is that the spinners will spin on a
> memory location in struct mutex, which means that the cacheline holding
> the mutex (which is likely to be under write activity from the owner)
> will be continuously shared by the spinners, slowing the owner down when
> it needs to unshare it.  One way out of this is to spin on a location in
> struct mutex_waiter, and have the mutex owner touch it when it schedules
> out.

Yeah, that is what pure MCS locks do -- however I don't think its a
feasible strategy for this spin/sleep hybrid.

> So:
> - each task_struct has an array of currently owned mutexes, appended to
>   by mutex_lock()

That's not going to fly I think. Lockdep does this but its very
expensive and has some issues. We're currently at 48 max owners, and
still some code paths manage to exceed that.

> - mutex waiters spin on mutex_waiter.wait, which they initialize to zero
> - when switching out of a task, walk the mutex list, and for each mutex,
>   bump each waiter's wait variable, and clear the owner array

Which is O(n).

> - when unlocking a mutex, bump the nearest waiter's wait variable, and
>   remove from the owner array
>
> Something similar might be done to spinlocks to reduce cacheline
> contention from spinners and the owner.

Spinlocks can use 'pure' MCS locks.
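For reference, the basic shape of an MCS lock, where every waiter spins
on its own queue node instead of on the lock word (an illustrative
sketch only, not proposed kernel code; memory barriers omitted):

struct mcs_node {
	struct mcs_node	*next;
	int		 locked;
};

struct mcs_lock {
	struct mcs_node	*tail;
};

static void mcs_lock(struct mcs_lock *lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	node->next = NULL;
	node->locked = 0;

	prev = xchg(&lock->tail, node);		/* atomically queue ourselves */
	if (!prev)
		return;				/* lock was free, we own it */

	ACCESS_ONCE(prev->next) = node;
	while (!ACCESS_ONCE(node->locked))	/* spin on our own cacheline */
		cpu_relax();
}

static void mcs_unlock(struct mcs_lock *lock, struct mcs_node *node)
{
	struct mcs_node *next = ACCESS_ONCE(node->next);

	if (!next) {
		/* No known successor: try to reset the tail. */
		if (cmpxchg(&lock->tail, node, NULL) == node)
			return;
		/* A successor is queueing; wait for it to link in. */
		while (!(next = ACCESS_ONCE(node->next)))
			cpu_relax();
	}
	ACCESS_ONCE(next->locked) = 1;	/* hand the lock to the next spinner */
}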
On Mon, 2009-01-12 at 17:50 +0100, Peter Zijlstra wrote:
> >
> > (the file stat run is total run time, so lower is better.  The other
> > numbers are files or MB per second, so higher is better)
> >
> > For the file create run, v8 had much lower system time than v7,
> > averaging 1s of sys time per proc instead of 1.6s.
>
> Right, how about the spread in completion time, because that is the only
> reason I tried this fairness stuff, because you reported massive
> differences there.
>

I reran the numbers with a slightly different kernel config and they
have changed somewhat. These are just for the 4k file create run, all
numbers in files created per second (and the numbers are stable across
runs)

v8  avg 176.90  median 171.85  std 12.49  high 215.97  low 165.54
v7  avg 169.02  median 163.77  std 16.82  high 267.95  low 157.95

-chris
Peter Zijlstra
2009-Jan-12 17:16 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
On Mon, 2009-01-12 at 08:20 -0800, Linus Torvalds wrote:
>
> On Mon, 12 Jan 2009, Linus Torvalds wrote:
> >
> > You made it back into the locked version.
>
> Btw, even if you probably had some reason for this, one thing to note is
> that I think Chris' performance testing showed that the version using a
> lock was inferior to his local btrfs hack, while the unlocked version
> actually beat his hack.
>
> Maybe I misunderstood his numbers, though. But if I followed that sub-part
> of the test right, it really means that the locked version is pointless -
> it will never be able to replace peoples local hacks for this same thing,
> because it just doesn't give the performance people are looking for.
>
> Since the whole (and _only_) point of this thing is to perform well,
> that's a big deal.

Like said in reply to Chris' email, I just wanted to see if fairness was
worth the effort, because the pure unlocked spin showed significant
unfairness (and I know some people really care about some level of
fairness).

Initial testing with the simple test-mutex thing didn't show too bad
numbers.
Peter Zijlstra wrote:
> On Mon, 2009-01-12 at 18:13 +0200, Avi Kivity wrote:
>
>> One thing that worries me here is that the spinners will spin on a
>> memory location in struct mutex, which means that the cacheline holding
>> the mutex (which is likely to be under write activity from the owner)
>> will be continuously shared by the spinners, slowing the owner down when
>> it needs to unshare it.  One way out of this is to spin on a location in
>> struct mutex_waiter, and have the mutex owner touch it when it schedules
>> out.
>>
>
> Yeah, that is what pure MCS locks do -- however I don't think its a
> feasible strategy for this spin/sleep hybrid.
>

Bummer.

>> So:
>> - each task_struct has an array of currently owned mutexes, appended to
>>   by mutex_lock()
>>
>
> That's not going to fly I think. Lockdep does this but its very
> expensive and has some issues. We're currently at 48 max owners, and
> still some code paths manage to exceed that.
>

Might make it per-cpu instead, and set a bit in the mutex when scheduling
out so we know not to remove it from the list on unlock.

>> - mutex waiters spin on mutex_waiter.wait, which they initialize to zero
>> - when switching out of a task, walk the mutex list, and for each mutex,
>>   bump each waiter's wait variable, and clear the owner array
>>
>
> Which is O(n).
>

It may be better than O(n) cpus banging on the mutex for the lock
duration.  Of course we should avoid walking the part of the list where
non-spinning owners wait (or maybe have a separate list for spinners).

>> - when unlocking a mutex, bump the nearest waiter's wait variable, and
>>   remove from the owner array
>>
>> Something similar might be done to spinlocks to reduce cacheline
>> contention from spinners and the owner.
>>
>
> Spinlocks can use 'pure' MCS locks.
>

I'll read up on those, thanks.

--
error compiling committee.c: too many arguments to function
Peter Zijlstra
2009-Jan-12 17:24 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
On Mon, 2009-01-12 at 12:14 -0500, Chris Mason wrote:
> On Mon, 2009-01-12 at 17:50 +0100, Peter Zijlstra wrote:
> > >
> > > (the file stat run is total run time, so lower is better.  The other
> > > numbers are files or MB per second, so higher is better)
> > >
> > > For the file create run, v8 had much lower system time than v7,
> > > averaging 1s of sys time per proc instead of 1.6s.
> >
> > Right, how about the spread in completion time, because that is the only
> > reason I tried this fairness stuff, because you reported massive
> > differences there.
> >
>
> I reran the numbers with a slightly different kernel config and they
> have changed somewhat. These are just for the 4k file create run, all
> numbers in files created per second (and the numbers are stable across
> runs)
>
> v8  avg 176.90  median 171.85  std 12.49  high 215.97  low 165.54
> v7  avg 169.02  median 163.77  std 16.82  high 267.95  low 157.95

Any opinions on the fairness matter, will -v9 be unlocked and unfair
again?
On Mon, 2009-01-12 at 18:24 +0100, Peter Zijlstra wrote:
> On Mon, 2009-01-12 at 12:14 -0500, Chris Mason wrote:
> > On Mon, 2009-01-12 at 17:50 +0100, Peter Zijlstra wrote:
> > > >
> > > > (the file stat run is total run time, so lower is better.  The other
> > > > numbers are files or MB per second, so higher is better)
> > > >
> > > > For the file create run, v8 had much lower system time than v7,
> > > > averaging 1s of sys time per proc instead of 1.6s.
> > >
> > > Right, how about the spread in completion time, because that is the only
> > > reason I tried this fairness stuff, because you reported massive
> > > differences there.
> > >
> >
> > I reran the numbers with a slightly different kernel config and they
> > have changed somewhat. These are just for the 4k file create run, all
> > numbers in files created per second (and the numbers are stable across
> > runs)
> >
> > v8  avg 176.90  median 171.85  std 12.49  high 215.97  low 165.54
> > v7  avg 169.02  median 163.77  std 16.82  high 267.95  low 157.95
>
> Any opinions on the fairness matter, will -v9 be unlocked and unfair
> again?

I'd rather have it simple than fair. My benchmarks are pretty dumb, I
wouldn't want to add complexity just based on them.

-chris
Peter Zijlstra wrote:
> Spinlocks can use 'pure' MCS locks.
>

How about this, then.  In mutex_lock(), keep wait_lock locked and only
release it when scheduling out.  Waiter spinning naturally follows.  If
spinlocks are cache friendly (are they today?) we inherit that.  If
there is no contention on the mutex, then we don't need to reacquire
the wait_lock on mutex_unlock() (not that the atomic op is that
expensive these days).

--
error compiling committee.c: too many arguments to function
Boaz Harrosh
2009-Jan-12 17:33 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
Peter Zijlstra wrote:
> On Mon, 2009-01-12 at 08:20 -0800, Linus Torvalds wrote:
>> On Mon, 12 Jan 2009, Linus Torvalds wrote:
>>> You made it back into the locked version.
>> Btw, even if you probably had some reason for this, one thing to note is
>> that I think Chris' performance testing showed that the version using a
>> lock was inferior to his local btrfs hack, while the unlocked version
>> actually beat his hack.
>>
>> Maybe I misunderstood his numbers, though. But if I followed that sub-part
>> of the test right, it really means that the locked version is pointless -
>> it will never be able to replace peoples local hacks for this same thing,
>> because it just doesn't give the performance people are looking for.
>>
>> Since the whole (and _only_) point of this thing is to perform well,
>> that's a big deal.
>

Which brings me back to my initial reaction to this work. Do we need
two flavors of Mutex? some program sections need Fairness, some need
performance. Some need low-latency, some need absolute raw CPU power.

Because at the end of the day spinning in a saturated CPU work-load
that does not care about latency, eats away cycles that could be spent
on computation. Think multi-threaded video processing for example.

Thing I would like to measure is
1 - how many times we spin and at the end get a lock
2 - how many times we spin and at the end sleep.
3 - how many times we sleep like before.
vs. In old case CPU spent on scheduling. Just to see if we are actually
loosing cycles at the end.

> Initial testing with the simple test-mutex thing didn't show too bad
> numbers.
>

Boaz
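A hypothetical instrumentation sketch along those lines; the counter
and helper names are invented for illustration and are not part of any
posted patch (the counters could be exported via debugfs or /proc):

/* Counters for the three outcomes listed above. */
static atomic_t mutex_stat_spin_acquired = ATOMIC_INIT(0); /* spun, then got the lock */
static atomic_t mutex_stat_spin_slept    = ATOMIC_INIT(0); /* spun, gave up, then slept */
static atomic_t mutex_stat_slept         = ATOMIC_INIT(0); /* slept without spinning */

/* Called from the lock slowpath once the outcome is known. */
static inline void mutex_stat_account(int spun, int slept)
{
	if (spun && !slept)
		atomic_inc(&mutex_stat_spin_acquired);
	else if (spun && slept)
		atomic_inc(&mutex_stat_spin_slept);
	else
		atomic_inc(&mutex_stat_slept);
}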
Peter Zijlstra
2009-Jan-12 18:07 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
On Mon, 2009-01-12 at 19:33 +0200, Boaz Harrosh wrote:
> Which brings me back to my initial reaction to this work. Do we need
> two flavors of Mutex? some program sections need Fairness, some need
> performance. Some need low-latency, some need absolute raw CPU power.

Thing is, its the kernel, we cannot have such knobs per task. So we
have to pick one and stick with it -- the 'best' we can do is what
PREEMPT_RT does and replace the thing whole sale at build time.

> Because at the end of the day spinning in a saturated CPU work-load
> that does not care about latency, eats away cycles that could be spent
> on computation. Think multi-threaded video processing for example.
> Thing I would like to measure is
> 1 - how many times we spin and at the end get a lock
> 2 - how many times we spin and at the end sleep.
> 3 - how many times we sleep like before.
> vs. In old case CPU spent on scheduling. Just to see if we are actually
> loosing cycles at the end.

Feel free to do so ;-)
Subject: mutex: implement adaptive spinning From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Mon Jan 12 14:01:47 CET 2009 Change mutex contention behaviour such that it will sometimes busy wait on acquisition - moving its behaviour closer to that of spinlocks. This concept got ported to mainline from the -rt tree, where it was originally implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins. Testing with Ingo''s test-mutex application (http://lkml.org/lkml/2006/1/8/50) gave a 304% boost for VFS scalability on my testbox: # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 298932 # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 98287 The key criteria for the busy wait is that the lock owner has to be running on a (different) cpu. The idea is that as long as the owner is running, there is a fair chance it''ll release the lock soon, and thus we''ll be better off spinning instead of blocking/scheduling. Since regular mutexes (as opposed to rtmutexes) do not atomically track the owner, we add the owner in a non-atomic fashion and deal with the races in the slowpath. Furthermore, to ease the testing of the performance impact of this new code, there is means to disable this behaviour runtime (without having to reboot the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y), by issuing the following command: # echo NO_OWNER_SPIN > /debug/sched_features This command re-enables spinning again (this is also the default): # echo OWNER_SPIN > /debug/sched_features Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> --- Chenges since -v8: - dropped the fairness Changes since -v7: - made all the mutex ops fully non-preemptible - implemented FIFO fairness using the wait_list - fixed owner tracking issues include/linux/mutex.h | 5 + include/linux/sched.h | 2 kernel/mutex-debug.c | 9 --- kernel/mutex-debug.h | 18 +++--- kernel/mutex.c | 128 +++++++++++++++++++++++++++++++++++++++++------- kernel/mutex.h | 22 +++++++- kernel/sched.c | 71 +++++++++++++++++++++++++- kernel/sched_features.h | 1 8 files changed, 216 insertions(+), 40 deletions(-) Index: linux-2.6/include/linux/mutex.h ==================================================================--- linux-2.6.orig/include/linux/mutex.h +++ linux-2.6/include/linux/mutex.h @@ -50,8 +50,10 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; +#endif +#ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; #endif @@ -68,7 +70,6 @@ struct mutex_waiter { struct list_head list; struct task_struct *task; #ifdef CONFIG_DEBUG_MUTEXES - struct mutex *lock; void *magic; #endif }; Index: linux-2.6/include/linux/sched.h ==================================================================--- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -328,7 +328,9 @@ extern signed long schedule_timeout(sign extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; Index: linux-2.6/kernel/mutex-debug.c ==================================================================--- linux-2.6.orig/kernel/mutex-debug.c +++ 
linux-2.6/kernel/mutex-debug.c @@ -26,11 +26,6 @@ /* * Must be called with lock->wait_lock held. */ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; - waiter->lock = lock; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lo DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + mutex_clear_owner(lock); } void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->owner = NULL; lock->magic = lock; } Index: linux-2.6/kernel/mutex-debug.h ==================================================================--- linux-2.6.orig/kernel/mutex-debug.h +++ linux-2.6/kernel/mutex-debug.h @@ -13,14 +13,6 @@ /* * This must be called with lock->wait_lock held. */ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mu extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ Index: linux-2.6/kernel/mutex.c ==================================================================--- linux-2.6.orig/kernel/mutex.c +++ linux-2.6/kernel/mutex.c @@ -10,6 +10,11 @@ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and * David Howells for suggestions and improvements. * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * * Also see Documentation/mutex-design.txt. */ #include <linux/mutex.h> @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const c atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); debug_mutex_init(lock, name, key); } @@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mu * ''unlocked'' into ''locked'' state. 
*/ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); } EXPORT_SYMBOL(mutex_lock); @@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex * * The unlocking fastpath is the 0->1 transition from ''locked'' * into ''unlocked'' state: */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); } @@ -129,21 +144,82 @@ __mutex_lock_common(struct mutex *lock, { struct task_struct *task = current; struct mutex_waiter waiter; - unsigned int old_val; unsigned long flags; + preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn''t track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can''t do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + */ + + for (;;) { + struct thread_info *owner; + + /* + * If there are pending waiters, join them. + */ + if (!list_empty(&lock->wait_list)) + break; + + /* + * If there''s an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + /* + * When there''s no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we''re an RT task that will live-lock because we won''t let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + if (atomic_xchg(&lock->count, -1) == 1) { + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + preempt_enable(); + return 0; + } + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don''t need + * memory barriers as we''ll eventually observe the right + * values at the cost of a few extra spins. 
+ */ + cpu_relax(); + } +#endif + spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) goto done; lock_contended(&lock->dep_map, ip); @@ -158,8 +234,7 @@ __mutex_lock_common(struct mutex *lock, * that when we release the lock, we properly wake up the * other waiters: */ - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) break; /* @@ -173,21 +248,22 @@ __mutex_lock_common(struct mutex *lock, spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return -EINTR; } __set_task_state(task, state); /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); + __schedule(); spin_lock_mutex(&lock->wait_lock, flags); } done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -196,6 +272,7 @@ done: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return 0; } @@ -222,7 +299,8 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -260,8 +338,6 @@ __mutex_unlock_common_slowpath(atomic_t wake_up_process(waiter->task); } - debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock, flags); } @@ -298,18 +374,30 @@ __mutex_lock_interruptible_slowpath(atom */ int __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_killable); @@ -352,9 +440,10 @@ static inline int __mutex_trylock_slowpa prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); + mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); } + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -380,8 +469,13 @@ static inline int __mutex_trylock_slowpa */ int __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); + int ret; + + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_trylock); Index: 
linux-2.6/kernel/mutex.h ==================================================================--- linux-2.6.orig/kernel/mutex.h +++ linux-2.6/kernel/mutex.h @@ -16,8 +16,26 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner) do { } while (0) -#define debug_mutex_clear_owner(lock) do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) Index: linux-2.6/kernel/sched.c ==================================================================--- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -4535,15 +4535,13 @@ pick_next_task(struct rq *rq, struct tas /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4600,13 +4598,80 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? 
+ */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption Index: linux-2.6/kernel/sched_features.h ==================================================================--- linux-2.6.orig/kernel/sched_features.h +++ linux-2.6/kernel/sched_features.h @@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Linus Torvalds
2009-Jan-13 16:16 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
On Tue, 13 Jan 2009, Peter Zijlstra wrote:
>
> Change mutex contention behaviour such that it will sometimes busy wait on
> acquisition - moving its behaviour closer to that of spinlocks.

Okey, dokey. Looks reasonable, but I wonder if this part came from v8 and
wasn't intentional:

> +		if (atomic_xchg(&lock->count, -1) == 1) {
> +			lock_acquired(&lock->dep_map, ip);
> +			mutex_set_owner(lock);
> +			preempt_enable();
> +			return 0;
> +		}

Now you're forcing the slow-path on unlock. Maybe it was intentional,
maybe it wasn't. Did you perhaps mean

	if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {

here? I thought we agreed it was safe, if only because it should be
equivalent to just having done "mutex_trylock()" instead of a "real" lock
sequence.

		Linus
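The difference, roughly -- illustrative fragments only, not part of
either posted patch, with the helper names made up for comparison:

/* xchg variant: unconditionally writes -1, so even an uncontended
 * acquisition from the spin loop leaves count at -1 and the following
 * mutex_unlock() is forced through the slowpath. */
static inline int spin_acquire_xchg(struct mutex *lock)
{
	return atomic_xchg(&lock->count, -1) == 1;
}

/* cmpxchg variant: only trades 1 for 0, so an uncontended acquisition
 * leaves count at 0 and mutex_unlock() can still take the 0->1
 * fastpath -- just like a successful mutex_trylock(). */
static inline int spin_acquire_cmpxchg(struct mutex *lock)
{
	return atomic_cmpxchg(&lock->count, 1, 0) == 1;
}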
Peter Zijlstra
2009-Jan-13 16:21 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
On Tue, 2009-01-13 at 08:16 -0800, Linus Torvalds wrote:
>
> On Tue, 13 Jan 2009, Peter Zijlstra wrote:
> >
> > Change mutex contention behaviour such that it will sometimes busy wait on
> > acquisition - moving its behaviour closer to that of spinlocks.
>
> Okey, dokey. Looks reasonable, but I wonder if this part came from v8 and
> wasn't intentional:
>
> > +		if (atomic_xchg(&lock->count, -1) == 1) {
> > +			lock_acquired(&lock->dep_map, ip);
> > +			mutex_set_owner(lock);
> > +			preempt_enable();
> > +			return 0;
> > +		}
>
> Now you're forcing the slow-path on unlock. Maybe it was intentional,
> maybe it wasn't. Did you perhaps mean
>
> 	if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
>
> here? I thought we agreed it was safe, if only because it should be
> equivalent to just having done "mutex_trylock()" instead of a "real" lock
> sequence.

Yes, that was an 'accident' from -v8, yes we did think the cmpxchg was
good, however I did get some spurious lockups on -v7, and I only noticed
the thing after I'd done most of the testing, so I decided to let it be
for now.

Let me put the cmpxchg back in and see if this is all still good (only
3*2*2 configs to test :-).
* Peter Zijlstra <peterz@infradead.org> wrote:

> > Now you're forcing the slow-path on unlock. Maybe it was intentional,
> > maybe it wasn't. Did you perhaps mean
> >
> > 	if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
> >
> > here? I thought we agreed it was safe, if only because it should be
> > equivalent to just having done "mutex_trylock()" instead of a "real"
> > lock sequence.
>
> Yes, that was an 'accident' from -v8, yes we did think the cmpxchg was
> good, however I did get some spurious lockups on -v7, and I only noticed
> the thing after I'd done most of the testing, so I decided to let it be
> for now.
>
> Let me put the cmpxchg back in and see if this is all still good (only
> 3*2*2 configs to test :-).

i saw sporadic lockups with -v7 too, so if you send a -v10 with Linus's
sequence for the unlock it takes about an hour of testing to check
whether it still occurs.

	Ingo
Peter Zijlstra
2009-Jan-13 16:40 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
On Tue, 2009-01-13 at 17:21 +0100, Peter Zijlstra wrote:
> On Tue, 2009-01-13 at 08:16 -0800, Linus Torvalds wrote:
> >
> > On Tue, 13 Jan 2009, Peter Zijlstra wrote:
> > >
> > > Change mutex contention behaviour such that it will sometimes busy wait on
> > > acquisition - moving its behaviour closer to that of spinlocks.
> >
> > Okey, dokey. Looks reasonable, but I wonder if this part came from v8 and
> > wasn't intentional:
> >
> > > +		if (atomic_xchg(&lock->count, -1) == 1) {
> > > +			lock_acquired(&lock->dep_map, ip);
> > > +			mutex_set_owner(lock);
> > > +			preempt_enable();
> > > +			return 0;
> > > +		}
> >
> > Now you're forcing the slow-path on unlock. Maybe it was intentional,
> > maybe it wasn't. Did you perhaps mean
> >
> > 	if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
> >
> > here? I thought we agreed it was safe, if only because it should be
> > equivalent to just having done "mutex_trylock()" instead of a "real" lock
> > sequence.
>
> Yes, that was an 'accident' from -v8, yes we did think the cmpxchg was
> good, however I did get some spurious lockups on -v7, and I only noticed
> the thing after I'd done most of the testing, so I decided to let it be
> for now.
>
> Let me put the cmpxchg back in and see if this is all still good (only
> 3*2*2 configs to test :-).

Ok, tested only 1, but that was the one I remember lockups from -- and
that seems to be good with the cmpxchg.

Do you fancy me sending v10 or will you make that change locally?
Linus Torvalds
2009-Jan-13 16:49 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
On Tue, 13 Jan 2009, Peter Zijlstra wrote:
>
> Ok, tested only 1, but that was the one I remember lockups from -- and
> that seems to be good with the cmpxchg.
>
> Do you fancy me sending v10 or will you make that change locally?

I'd like to get this in, but I'm not going to apply it in any case without
way more testing.

It missed the merge window, but I'm potentially willing to let it slip in -
but only if there are a fair number of people willing to do numbers and
some harsh testing. Preferably people who did see problems earlier, ie at
a minimum Chris and Ingo under the loads they used before and saw issues
with.

So do a v10, and ask people to test.

		Linus
Peter Zijlstra
2009-Jan-13 17:21 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
On Tue, 2009-01-13 at 08:49 -0800, Linus Torvalds wrote:> > So do a v10, and ask people to test.--- Subject: mutex: implement adaptive spinning From: Peter Zijlstra <a.p.zijlstra@chello.nl> Date: Mon Jan 12 14:01:47 CET 2009 Change mutex contention behaviour such that it will sometimes busy wait on acquisition - moving its behaviour closer to that of spinlocks. This concept got ported to mainline from the -rt tree, where it was originally implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins. Testing with Ingo''s test-mutex application (http://lkml.org/lkml/2006/1/8/50) gave a 304% boost for VFS scalability on my testbox: # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 298932 # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 98287 The key criteria for the busy wait is that the lock owner has to be running on a (different) cpu. The idea is that as long as the owner is running, there is a fair chance it''ll release the lock soon, and thus we''ll be better off spinning instead of blocking/scheduling. Since regular mutexes (as opposed to rtmutexes) do not atomically track the owner, we add the owner in a non-atomic fashion and deal with the races in the slowpath. Furthermore, to ease the testing of the performance impact of this new code, there is means to disable this behaviour runtime (without having to reboot the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y), by issuing the following command: # echo NO_OWNER_SPIN > /debug/sched_features This command re-enables spinning again (this is also the default): # echo OWNER_SPIN > /debug/sched_features Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> --- Changes since -v9: - cmpxchg acquire in the spin loop Changes since -v8: - dropped the fairness Changes since -v7: - made all the mutex ops fully non-preemptible - implemented FIFO fairness using the wait_list - fixed owner tracking issues include/linux/mutex.h | 5 + include/linux/sched.h | 2 kernel/mutex-debug.c | 9 --- kernel/mutex-debug.h | 18 +++--- kernel/mutex.c | 128 +++++++++++++++++++++++++++++++++++++++++------- kernel/mutex.h | 22 +++++++- kernel/sched.c | 71 +++++++++++++++++++++++++- kernel/sched_features.h | 1 8 files changed, 216 insertions(+), 40 deletions(-) Index: linux-2.6/include/linux/mutex.h ==================================================================--- linux-2.6.orig/include/linux/mutex.h +++ linux-2.6/include/linux/mutex.h @@ -50,8 +50,10 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; +#endif +#ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; #endif @@ -68,7 +70,6 @@ struct mutex_waiter { struct list_head list; struct task_struct *task; #ifdef CONFIG_DEBUG_MUTEXES - struct mutex *lock; void *magic; #endif }; Index: linux-2.6/include/linux/sched.h ==================================================================--- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -328,7 +328,9 @@ extern signed long schedule_timeout(sign extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; 
Index: linux-2.6/kernel/mutex-debug.c ==================================================================--- linux-2.6.orig/kernel/mutex-debug.c +++ linux-2.6/kernel/mutex-debug.c @@ -26,11 +26,6 @@ /* * Must be called with lock->wait_lock held. */ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; - waiter->lock = lock; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lo DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + mutex_clear_owner(lock); } void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->owner = NULL; lock->magic = lock; } Index: linux-2.6/kernel/mutex-debug.h ==================================================================--- linux-2.6.orig/kernel/mutex-debug.h +++ linux-2.6/kernel/mutex-debug.h @@ -13,14 +13,6 @@ /* * This must be called with lock->wait_lock held. */ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mu extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ Index: linux-2.6/kernel/mutex.c ==================================================================--- linux-2.6.orig/kernel/mutex.c +++ linux-2.6/kernel/mutex.c @@ -10,6 +10,11 @@ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and * David Howells for suggestions and improvements. * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * * Also see Documentation/mutex-design.txt. */ #include <linux/mutex.h> @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const c atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); debug_mutex_init(lock, name, key); } @@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mu * ''unlocked'' into ''locked'' state. 
*/ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); } EXPORT_SYMBOL(mutex_lock); @@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex * * The unlocking fastpath is the 0->1 transition from ''locked'' * into ''unlocked'' state: */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); } @@ -129,21 +144,82 @@ __mutex_lock_common(struct mutex *lock, { struct task_struct *task = current; struct mutex_waiter waiter; - unsigned int old_val; unsigned long flags; + preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn''t track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can''t do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + */ + + for (;;) { + struct thread_info *owner; + + /* + * If there are pending waiters, join them. + */ + if (!list_empty(&lock->wait_list)) + break; + + /* + * If there''s an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + /* + * When there''s no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we''re an RT task that will live-lock because we won''t let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + preempt_enable(); + return 0; + } + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don''t need + * memory barriers as we''ll eventually observe the right + * values at the cost of a few extra spins. 
+ */ + cpu_relax(); + } +#endif + spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) goto done; lock_contended(&lock->dep_map, ip); @@ -158,8 +234,7 @@ __mutex_lock_common(struct mutex *lock, * that when we release the lock, we properly wake up the * other waiters: */ - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) break; /* @@ -173,21 +248,22 @@ __mutex_lock_common(struct mutex *lock, spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return -EINTR; } __set_task_state(task, state); /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); + __schedule(); spin_lock_mutex(&lock->wait_lock, flags); } done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -196,6 +272,7 @@ done: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return 0; } @@ -222,7 +299,8 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -260,8 +338,6 @@ __mutex_unlock_common_slowpath(atomic_t wake_up_process(waiter->task); } - debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock, flags); } @@ -298,18 +374,30 @@ __mutex_lock_interruptible_slowpath(atom */ int __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_killable); @@ -352,9 +440,10 @@ static inline int __mutex_trylock_slowpa prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); + mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); } + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -380,8 +469,13 @@ static inline int __mutex_trylock_slowpa */ int __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); + int ret; + + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_trylock); Index: 
linux-2.6/kernel/mutex.h ==================================================================--- linux-2.6.orig/kernel/mutex.h +++ linux-2.6/kernel/mutex.h @@ -16,8 +16,26 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner) do { } while (0) -#define debug_mutex_clear_owner(lock) do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) Index: linux-2.6/kernel/sched.c ==================================================================--- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -4535,15 +4535,13 @@ pick_next_task(struct rq *rq, struct tas /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4600,13 +4598,80 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? 
+ */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption Index: linux-2.6/kernel/sched_features.h ==================================================================--- linux-2.6.orig/kernel/sched_features.h +++ linux-2.6/kernel/sched_features.h @@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

>
> On Tue, 13 Jan 2009, Peter Zijlstra wrote:
> >
> > Ok, tested only 1, but that was the one I remember lockups from -- and
> > that seems to be good with the cmpxchg.
> >
> > Do you fancy me sending v10 or will you make that change locally?
>
> I'd like to get this in, but I'm not going to apply it in any case
> without way more testing.
>
> It missed the merge window, but I'm potentially willing to let it slip
> in - but only if there are a fair number of people willing to do numbers
> and some harsh testing. Preferably people who did see problems earlier,
> ie at a minimum Chris and Ingo under the loads they used before and saw
> issues with.

I am tracking Peter's code in tip/core/locking:

  c10b491: mutex: implement adaptive spinning, v8
  c356a7c: mutex: implement adaptive spinning, v7
  b89e5d8: mutex: implement adaptive spinning, v6
  237ac94: mutex: implement adaptive spinning

And v8 is rock solid in all my testing - and i'll give v10 a similar
workout as well.

Would you prefer a single commit or is this kind of delta development
history useful, with all the variants (at least the later, more promising
ones) included? The commits are in a tight group with no merges inbetween.

	Ingo
Linus Torvalds
2009-Jan-13 18:21 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
On Tue, 13 Jan 2009, Ingo Molnar wrote:
>
> And v8 is rock solid in all my testing - and i'll give v10 a similar
> workout as well.

The differences between v8 and v10 are very fundamental, since v8 does the
spinning inside the spinlock'ed loop (the spinning itself is not inside the
spinlock, but all the "real action" is). So v8 not showing problems doesn't
really say much about v10 - totally different algorithms that share only
some of the support code.

So even if many lines look the same, those code-lines aren't the really
interesting ones. The only really interesting one is really the
atomic_cmpxchg (outside spinlock) vs atomic_xchg (inside spinlock), and
those are almost diametrically opposite.

> Would you prefer a single commit or is this kind of delta development
> history useful, with all the variants (at least the later, more promising
> ones) included?

I'm not sure it makes sense to show the history here, especially as there
really were two different approaches, and while they share many issues,
they sure aren't equivalent nor are we really talking about any evolution
of the patch except in the sense of one being the kick-starter for the
alternative approach.

What _can_ make sense is to commit some of the infrastructure helper code
separately, ie the lock ownership and preemption changes, since those
really are independent of the spinning code, and at least the preemption
thing is interesting and relevant even without it.

		Linus
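To make that structural difference concrete, the two acquisition paths can
be outlined roughly as follows. This is a condensed sketch based on the v8
delta and the v10 patch in this thread; debug, lockdep and signal handling
are omitted, so it is not a drop-in implementation:

	/* v8-style: queue under wait_lock first, then spin or sleep while
	 * holding a slot on the wait list; acquisition uses xchg(-1), so
	 * the unlock side is always forced into the slowpath. */
	spin_lock_mutex(&lock->wait_lock, flags);
	list_add_tail(&waiter.list, &lock->wait_list);
	while (atomic_xchg(&lock->count, -1) != 1)
		mutex_spin_or_schedule(&waiter, &flags); /* drops/retakes wait_lock */
	mutex_remove_waiter(lock, &waiter, current_thread_info());
	spin_unlock_mutex(&lock->wait_lock, flags);

	/* v10-style: spin entirely outside wait_lock; acquisition is the
	 * trylock-like 1 -> 0 cmpxchg, and only when spinning gives up does
	 * the task fall back to queueing on wait_list and scheduling. */
	for (;;) {
		struct thread_info *owner = ACCESS_ONCE(lock->owner);

		if (owner && !mutex_spin_on_owner(lock, owner))
			break;		/* owner blocked: go queue and sleep */
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return 0;	/* acquired without touching wait_lock */
		cpu_relax();
	}
	/* ... slowpath: take wait_lock, queue on wait_list, __schedule() ... */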
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Tue, 13 Jan 2009, Ingo Molnar wrote:
> >
> > And v8 is rock solid in all my testing - and i'll give v10 a similar
> > workout as well.
>
> The differences between v8 and v10 are very fundamental, since v8 does
> the spinning inside the spinlock'ed loop (the spinning itself is not
> inside the spinlock, but all the "real action" is). So v8 not showing
> problems doesn't really say much about v10 - totally different
> algorithms that share only some of the support code.
>
> So even if many lines look the same, those code-lines aren't the really
> interesting ones. The only really interesting one is really the
> atomic_cmpxchg (outside spinlock) vs atomic_xchg (inside spinlock), and
> those are almost diametrically opposite.

yeah. What i thought they would be useful for are testing and experiments
like this:

  " what if you switch the spinning to more fair by typing this in your
    current tree:

       git revert c10b491
  "

... but that's a pretty narrow purpose.

> > Would you prefer a single commit or is this kind of delta development
> > history useful, with all the variants (at least the later, more
> > promising ones) included?
>
> I'm not sure it makes sense to show the history here, especially as
> there really were two different approaches, and while they share many
> issues, they sure aren't equivalent nor are we really talking about any
> evolution of the patch except in the sense of one being the kick-starter
> for the alternative approach.
>
> What _can_ make sense is to commit some of the infrastructure helper
> code separately, ie the lock ownership and preemption changes, since
> those really are independent of the spinning code, and at least the
> preemption thing is interesting and relevant even without it.

ok, we'll improve the splitup.

	Ingo
* Peter Zijlstra <peterz@infradead.org> wrote:> On Tue, 2009-01-13 at 08:49 -0800, Linus Torvalds wrote: > > > > So do a v10, and ask people to test.below is the v8 -> v10 delta patch - for all who''d like to review the changes. Ingo ------------> From d154179e2d4d4667bcbf22920eeab563bc042e6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra <peterz@infradead.org> Date: Tue, 13 Jan 2009 18:21:54 +0100 Subject: [PATCH] mutex: implement adaptive spinning, v10 Change mutex contention behaviour such that it will sometimes busy wait on acquisition - moving its behaviour closer to that of spinlocks. This concept got ported to mainline from the -rt tree, where it was originally implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins. Testing with Ingo''s test-mutex application (http://lkml.org/lkml/2006/1/8/50) gave a 304% boost for VFS scalability on my testbox: # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 298932 # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 98287 The key criteria for the busy wait is that the lock owner has to be running on a (different) cpu. The idea is that as long as the owner is running, there is a fair chance it''ll release the lock soon, and thus we''ll be better off spinning instead of blocking/scheduling. Since regular mutexes (as opposed to rtmutexes) do not atomically track the owner, we add the owner in a non-atomic fashion and deal with the races in the slowpath. Furthermore, to ease the testing of the performance impact of this new code, there is means to disable this behaviour runtime (without having to reboot the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y), by issuing the following command: # echo NO_OWNER_SPIN > /debug/sched_features This command re-enables spinning again (this is also the default): # echo OWNER_SPIN > /debug/sched_features Changes since -v9: - cmpxchg acquire in the spin loop Changes since -v8: - dropped the fairness Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/linux/mutex.h | 5 -- include/linux/sched.h | 3 +- kernel/mutex.c | 168 +++++++++++++++++++++---------------------------- kernel/sched.c | 21 ++----- 4 files changed, 79 insertions(+), 118 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index b374c64..3069ec7 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -52,7 +52,6 @@ struct mutex { struct list_head wait_list; #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; - int waiters; #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; @@ -70,10 +69,6 @@ struct mutex { struct mutex_waiter { struct list_head list; struct task_struct *task; - struct mutex *lock; -#ifdef CONFIG_SMP - int spin; -#endif #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index dae1a3c..c34b137 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -330,8 +330,7 @@ extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void __schedule(void); asmlinkage void schedule(void); -extern int mutex_spin_on_owner(struct mutex_waiter *waiter, - struct thread_info *owner); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex.c b/kernel/mutex.c index 332bb2e..0d5336d 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -96,10 
+96,8 @@ void inline __sched mutex_lock(struct mutex *lock) * The locking fastpath is the 1->0 transition from * ''unlocked'' into ''locked'' state. */ - preempt_disable(); __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); mutex_set_owner(lock); - preempt_enable(); } EXPORT_SYMBOL(mutex_lock); @@ -124,7 +122,6 @@ void __sched mutex_unlock(struct mutex *lock) * The unlocking fastpath is the 0->1 transition from ''locked'' * into ''unlocked'' state: */ - preempt_disable(); #ifndef CONFIG_DEBUG_MUTEXES /* * When debugging is enabled we must not clear the owner before time, @@ -134,97 +131,93 @@ void __sched mutex_unlock(struct mutex *lock) mutex_clear_owner(lock); #endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); - preempt_enable(); } EXPORT_SYMBOL(mutex_unlock); -#ifdef CONFIG_SMP -static void mutex_spin_or_schedule(struct mutex_waiter *waiter, - unsigned long *flags) +/* + * Lock a mutex (possibly interruptible), slowpath: + */ +static inline int __sched +__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, + unsigned long ip) { - struct mutex *lock = waiter->lock; + struct task_struct *task = current; + struct mutex_waiter waiter; + unsigned long flags; - waiter->spin = !lock->waiters; - if (!waiter->spin) - goto sleep; + preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn''t track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can''t do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + */ - spin_unlock_mutex(&lock->wait_lock, *flags); - while (waiter->spin && !lock->waiters) { + for (;;) { struct thread_info *owner; - /* Busy wait on the owner, if any. */ - owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(waiter, owner)) + /* + * If there are pending waiters, join them. + */ + if (!list_empty(&lock->wait_list)) break; - cpu_relax(); - } - spin_lock_mutex(&lock->wait_lock, *flags); - - if (!waiter->spin) { - __set_task_state(waiter->task, TASK_RUNNING); - return; - } - waiter->spin = 0; -sleep: - /* - * Stop all other spinners. - */ - lock->waiters++; - spin_unlock_mutex(&lock->wait_lock, *flags); - - __schedule(); - - spin_lock_mutex(&lock->wait_lock, *flags); - lock->waiters--; -} + /* + * If there''s an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; -static void mutex_wake_up(struct mutex_waiter *waiter) -{ - if (waiter->spin) - waiter->spin = 0; - else - wake_up_process(waiter->task); -} -#else -static void mutex_spin_or_schedule(struct mutex_waiter *waiter, - unsigned long *flags) -{ - struct mutex *lock = waiter->lock; + /* + * When there''s no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we''re an RT task that will live-lock because we won''t let + * the owner complete. 
+ */ + if (!owner && (need_resched() || rt_task(task))) + break; - spin_unlock_mutex(&lock->wait_lock, *flags); - __schedule(); - spin_lock_mutex(&lock->wait_lock, *flags); -} + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + preempt_enable(); + return 0; + } -static void mutex_wake_up(struct mutex_waiter *waiter) -{ - wake_up_process(waiter->task); -} + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don''t need + * memory barriers as we''ll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } #endif -/* - * Lock a mutex (possibly interruptible), slowpath: - */ -static inline int __sched -__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - unsigned long ip) -{ - struct task_struct *task = current; - struct mutex_waiter waiter; - unsigned long flags; - spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - waiter.lock = lock; if (atomic_xchg(&lock->count, -1) == 1) goto done; @@ -255,18 +248,21 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return -EINTR; } __set_task_state(task, state); /* didnt get the lock, go to sleep: */ - mutex_spin_or_schedule(&waiter, &flags); + spin_unlock_mutex(&lock->wait_lock, flags); + __schedule(); + spin_lock_mutex(&lock->wait_lock, flags); } done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! 
*/ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, current_thread_info()); mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ @@ -276,6 +272,7 @@ done: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return 0; } @@ -285,9 +282,7 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - preempt_disable(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); - preempt_enable(); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -295,28 +290,17 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested); int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { - int ret; - might_sleep(); - preempt_disable(); - ret = __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); - preempt_enable(); - - return ret; + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { - int ret; - might_sleep(); - preempt_disable(); - ret = __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); - preempt_enable(); - - return ret; + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -351,7 +335,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) debug_mutex_wake_waiter(lock, waiter); - mutex_wake_up(waiter); + wake_up_process(waiter->task); } spin_unlock_mutex(&lock->wait_lock, flags); @@ -393,12 +377,10 @@ int __sched mutex_lock_interruptible(struct mutex *lock) int ret; might_sleep(); - preempt_disable(); ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); if (!ret) mutex_set_owner(lock); - preempt_enable(); return ret; } @@ -410,12 +392,10 @@ int __sched mutex_lock_killable(struct mutex *lock) int ret; might_sleep(); - preempt_disable(); ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); if (!ret) mutex_set_owner(lock); - preempt_enable(); return ret; } @@ -491,11 +471,9 @@ int __sched mutex_trylock(struct mutex *lock) { int ret; - preempt_disable(); ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); if (ret) mutex_set_owner(lock); - preempt_enable(); return ret; } diff --git a/kernel/sched.c b/kernel/sched.c index 9dd69c9..8f097fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4611,17 +4611,14 @@ EXPORT_SYMBOL(schedule); * Look out! "owner" is an entirely speculative pointer * access and not reliable. */ -int mutex_spin_on_owner(struct mutex_waiter *waiter, struct thread_info *owner) +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) { - struct mutex *lock = waiter->lock; unsigned int cpu; struct rq *rq; - int ret = 1; if (!sched_feat(OWNER_SPIN)) return 0; - preempt_disable(); #ifdef CONFIG_DEBUG_PAGEALLOC /* * Need to access the cpu field knowing that @@ -4650,7 +4647,7 @@ int mutex_spin_on_owner(struct mutex_waiter *waiter, struct thread_info *owner) rq = cpu_rq(cpu); - while (waiter->spin && !lock->waiters) { + for (;;) { /* * Owner changed, break to re-assess state. */ @@ -4660,21 +4657,13 @@ int mutex_spin_on_owner(struct mutex_waiter *waiter, struct thread_info *owner) /* * Is that owner really running on that cpu? 
*/ - if (task_thread_info(rq->curr) != owner) { - ret = 0; - break; - } - - if (need_resched()) { - ret = 0; - break; - } + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; cpu_relax(); } out: - preempt_enable_no_resched(); - return ret; + return 1; } #endif -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Linus Torvalds
2009-Jan-13 18:40 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
On Tue, 13 Jan 2009, Ingo Molnar wrote:
>
> below is the v8 -> v10 delta patch - for all who'd like to review the
> changes.

Hmm. This does seem to indicate that v10 lost many of the preempt changes.
Probably because Peter went back to v7 to create it.

I also wonder about the timing numbers - the numbers were probably done
with some random version and then not re-done, so the commit message is
not necessarily "sane" in that respect - the timings quoted don't
necessarily match the code committed.

		Linus
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

>
> On Tue, 13 Jan 2009, Ingo Molnar wrote:
> >
> > below is the v8 -> v10 delta patch - for all who'd like to review the
> > changes.
>
> Hmm. This does seem to indicate that v10 lost many of the preempt
> changes. Probably because Peter went back to v7 to create it.
>
> I also wonder about the timing numbers - the numbers were probably done
> with some random version and then not re-done, so the commit message is
> not necessarily "sane" in that respect - the timings quoted don't
> necessarily match the code committed.

yeah - i didnt take care with v10's commit log because it will be redone
with a structural splitup anyway, instead of the current time based
history.

	Ingo
On Tue, 2009-01-13 at 18:21 +0100, Peter Zijlstra wrote:
> On Tue, 2009-01-13 at 08:49 -0800, Linus Torvalds wrote:
> >
> > So do a v10, and ask people to test.
>
> ---
> Subject: mutex: implement adaptive spinning
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Mon Jan 12 14:01:47 CET 2009
>
> Change mutex contention behaviour such that it will sometimes busy wait on
> acquisition - moving its behaviour closer to that of spinlocks.

I've spent a bunch of time on this one, and noticed earlier today that I
still had bits of CONFIG_FTRACE compiling. I wasn't actually tracing
anything, but it seems to have had a big performance hit.

The bad news is the simple spin got much much faster, dbench 50 coming in
at 1282MB/s instead of 580MB/s. (other benchmarks give similar results)

v10 is better than not spinning, but it's in the 5-10% range. So, I've been
trying to find ways to close the gap, just to understand exactly where it
is different.

If I take out:

	/*
	 * If there are pending waiters, join them.
	 */
	if (!list_empty(&lock->wait_list))
		break;

v10 pops dbench 50 up to 1800MB/s. The other tests soundly beat my spinning
and aren't less fair. But clearly this isn't a good solution.

I tried a few variations, like only checking the wait list once before
looping, which helps some. Are there other suggestions on better tuning
options?

(I retested v7 and see similar results)

-chris
Dmitry Adamushko
2009-Jan-14 11:18 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
2009/1/14 Chris Mason <chris.mason@oracle.com>:
> On Tue, 2009-01-13 at 18:21 +0100, Peter Zijlstra wrote:
>> On Tue, 2009-01-13 at 08:49 -0800, Linus Torvalds wrote:
>> >
>> > So do a v10, and ask people to test.
>>
>> ---
>> Subject: mutex: implement adaptive spinning
>> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
>> Date: Mon Jan 12 14:01:47 CET 2009
>>
>> Change mutex contention behaviour such that it will sometimes busy wait on
>> acquisition - moving its behaviour closer to that of spinlocks.
>>
>
> I've spent a bunch of time on this one, and noticed earlier today that I
> still had bits of CONFIG_FTRACE compiling. I wasn't actually tracing
> anything, but it seems to have had a big performance hit.
>
> The bad news is the simple spin got much much faster, dbench 50 coming
> in at 1282MB/s instead of 580MB/s. (other benchmarks give similar
> results)
>
> v10 is better than not spinning, but it's in the 5-10% range. So, I've
> been trying to find ways to close the gap, just to understand exactly
> where it is different.
>
> If I take out:
>	/*
>	 * If there are pending waiters, join them.
>	 */
>	if (!list_empty(&lock->wait_list))
>		break;
>
> v10 pops dbench 50 up to 1800MB/s. The other tests soundly beat my
> spinning and aren't less fair. But clearly this isn't a good solution.
>
> I tried a few variations, like only checking the wait list once before
> looping, which helps some. Are there other suggestions on better tuning
> options?

(some thoughts/speculations)

Perhaps for highly-contended mutexes the spinning implementation may
quickly degrade [*] to the non-spinning one (i.e. the current sleep-wait
mutex) and then just stay in this state until a moment of time when there
are no waiters [**] -- i.e. list_empty(&lock->wait_list) == 1 and waiters
can start spinning again.

what may trigger [*]:

(1) obviously, an owner scheduling out.

Even if it happens rarely (otherwise, it's not a target scenario for our
optimization), due to the [**] it may take quite some time until waiters
are able to spin again.

let's say, waiters (almost) never block (and possibly, such cases would be
better off just using a spinlock after some refactoring, if possible)

(2) need_resched() is triggered for one of the waiters.

(3) !owner && rt_task(p)

quite unlikely, but possible (there are 2 race windows).

Of course, the question is whether it really takes a noticeable amount of
time to get out of the [**] state. I'd imagine it can be a case for
highly-contended locks.

If this is the case indeed, then which of 1, 2, 3 gets triggered the most?

Have you tried removing need_resched() checks? So we kind of emulate real
spinlocks here.

> -chris

--
Best regards,
Dmitry Adamushko
* Chris Mason <chris.mason@oracle.com> wrote:

> v10 is better than not spinning, but it's in the 5-10% range. So, I've
> been trying to find ways to close the gap, just to understand exactly
> where it is different.
>
> If I take out:
>	/*
>	 * If there are pending waiters, join them.
>	 */
>	if (!list_empty(&lock->wait_list))
>		break;
>
> v10 pops dbench 50 up to 1800MB/s. The other tests soundly beat my
> spinning and aren't less fair. But clearly this isn't a good solution.

i think since we already decided that it's ok to be somewhat unfair (_all_
batching constructs introduce unfairness, so the question is never 'should
we?' but 'by how much?'), we should just take this out and enjoy the
speed ...

	Ingo
Linus Torvalds
2009-Jan-14 15:43 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
On Wed, 14 Jan 2009, Ingo Molnar wrote:
>
> * Chris Mason <chris.mason@oracle.com> wrote:
>
> > v10 is better than not spinning, but it's in the 5-10% range. So, I've
> > been trying to find ways to close the gap, just to understand exactly
> > where it is different.
> >
> > If I take out:
> >	/*
> >	 * If there are pending waiters, join them.
> >	 */
> >	if (!list_empty(&lock->wait_list))
> >		break;
> >
> > v10 pops dbench 50 up to 1800MB/s. The other tests soundly beat my
> > spinning and aren't less fair. But clearly this isn't a good solution.
>
> i think since we already decided that it's ok to be somewhat unfair (_all_
> batching constructs introduce unfairness, so the question is never 'should
> we?' but 'by how much?'), we should just take this out and enjoy the speed

Yeah, let's just do it. It's going to be unfair, but let's see if the
unfairness is going to actually be noticeable in real life. I suspect it
isn't, and if it is, we can look at it again and see if there are other
approaches.

If it makes _that_ much of a difference on dbench, it's worth being unfair
even if it's just for some dubious benchmark.

		Linus
On Wed, 2009-01-14 at 07:43 -0800, Linus Torvalds wrote:
>
> On Wed, 14 Jan 2009, Ingo Molnar wrote:
> >
> > * Chris Mason <chris.mason@oracle.com> wrote:
> >
> > > v10 is better than not spinning, but it's in the 5-10% range. So, I've
> > > been trying to find ways to close the gap, just to understand exactly
> > > where it is different.
> > >
> > > If I take out:
> > >	/*
> > >	 * If there are pending waiters, join them.
> > >	 */
> > >	if (!list_empty(&lock->wait_list))
> > >		break;
> > >
> > > v10 pops dbench 50 up to 1800MB/s. The other tests soundly beat my
> > > spinning and aren't less fair. But clearly this isn't a good solution.
> >
> > i think since we already decided that it's ok to be somewhat unfair (_all_
> > batching constructs introduce unfairness, so the question is never 'should
> > we?' but 'by how much?'), we should just take this out and enjoy the speed

Ok, numbers first, incremental below:

* dbench 50 (higher is better):

  spin        1282MB/s
  v10          548MB/s
  v10 no wait 1868MB/s

* 4k creates (numbers in files/second, higher is better):

  spin        avg 200.60 median 193.20 std 19.71 high 305.93 low 186.82
  v10         avg 180.94 median 175.28 std 13.91 high 229.31 low 168.73
  v10 no wait avg 232.18 median 222.38 std 22.91 high 314.66 low 209.12

* File stats (numbers in seconds, lower is better):

  spin        2.27s
  v10         5.1s
  v10 no wait 1.6s

This patch brings v10 up to v10 no wait. The changes are smaller than they
look, I just moved the need_resched checks in __mutex_lock_common after
the cmpxchg.

-chris

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 0d5336d..c2d47b7 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -171,12 +171,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		struct thread_info *owner;
 
 		/*
-		 * If there are pending waiters, join them.
-		 */
-		if (!list_empty(&lock->wait_list))
-			break;
-
-		/*
 		 * If there's an owner, wait for it to either
 		 * release the lock or go to sleep.
 		 */
@@ -184,6 +178,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		if (owner && !mutex_spin_on_owner(lock, owner))
 			break;
 
+		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+			lock_acquired(&lock->dep_map, ip);
+			mutex_set_owner(lock);
+			preempt_enable();
+			return 0;
+		}
+
 		/*
 		 * When there's no owner, we might have preempted between the
 		 * owner acquiring the lock and setting the owner field. If
@@ -192,14 +193,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 */
 		if (!owner && (need_resched() || rt_task(task)))
 			break;
-
-		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
-			lock_acquired(&lock->dep_map, ip);
-			mutex_set_owner(lock);
-			preempt_enable();
-			return 0;
-		}
-
 		/*
 		 * The cpu_relax() call is a compiler barrier which forces
 		 * everything in this loop to be re-loaded. We don't need
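Applied on top of v10, the spin loop ends up ordered roughly like this.
This is reconstructed from the v10 posting plus the incremental above
(the surrounding variables such as task and ip come from
__mutex_lock_common), so treat it as illustrative rather than an
authoritative version:

	for (;;) {
		struct thread_info *owner;

		/*
		 * If there's an owner, wait for it to either
		 * release the lock or go to sleep.
		 */
		owner = ACCESS_ONCE(lock->owner);
		if (owner && !mutex_spin_on_owner(lock, owner))
			break;

		/* Try the acquisition before deciding whether to stop spinning. */
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
			lock_acquired(&lock->dep_map, ip);
			mutex_set_owner(lock);
			preempt_enable();
			return 0;
		}

		/*
		 * When there's no owner, we might have preempted between the
		 * owner acquiring the lock and setting the owner field. If
		 * we're an RT task that will live-lock because we won't let
		 * the owner complete.
		 */
		if (!owner && (need_resched() || rt_task(task)))
			break;

		cpu_relax();
	}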
Peter Zijlstra
2009-Jan-14 16:46 UTC
Re: [PATCH -v8][RFC] mutex: implement adaptive spinning
On Mon, 2009-01-12 at 19:32 +0200, Avi Kivity wrote:
> Peter Zijlstra wrote:
> > Spinlocks can use 'pure' MCS locks.
> >
>
> How about this, then. In mutex_lock(), keep wait_lock locked and only
> release it when scheduling out. Waiter spinning naturally follows. If
> spinlocks are cache friendly (are they today?)

(no they're not, Nick's ticket locks still spin on a shared cacheline IIRC
-- the MCS locks mentioned could fix this)

> we inherit that. If
> there is no contention on the mutex, then we don't need to reacquire the
> wait_lock on mutex_unlock() (not that the atomic op is that expensive
> these days).

That might actually work, although we'd have to move the
__mutex_slowpath_needs_to_unlock() branch outside wait_lock otherwise
we'll deadlock :-)

It might be worth trying this if we get serious fairness issues with the
current construct.
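For background on the MCS reference above: an MCS queue lock has each
waiter spin on a flag in its own queue node rather than on the shared lock
word, so the unlock handoff only touches one remote cacheline. A minimal
userspace sketch using C11 atomics follows; it is purely illustrative and
not the kernel's code:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct mcs_node {
		_Atomic(struct mcs_node *) next;
		atomic_bool locked;
	};

	struct mcs_lock {
		_Atomic(struct mcs_node *) tail;
	};

	static void mcs_acquire(struct mcs_lock *lock, struct mcs_node *self)
	{
		struct mcs_node *prev;

		atomic_store_explicit(&self->next, NULL, memory_order_relaxed);
		atomic_store_explicit(&self->locked, true, memory_order_relaxed);

		/* Swap ourselves in as the new tail of the waiter queue. */
		prev = atomic_exchange_explicit(&lock->tail, self, memory_order_acq_rel);
		if (!prev)
			return;		/* queue was empty: lock acquired */

		/* Link behind the previous waiter and spin on our *own* flag. */
		atomic_store_explicit(&prev->next, self, memory_order_release);
		while (atomic_load_explicit(&self->locked, memory_order_acquire))
			;		/* cpu_relax() would go here */
	}

	static void mcs_release(struct mcs_lock *lock, struct mcs_node *self)
	{
		struct mcs_node *next = atomic_load_explicit(&self->next, memory_order_acquire);

		if (!next) {
			struct mcs_node *expected = self;

			/* No visible successor: try to swing the tail back to empty. */
			if (atomic_compare_exchange_strong_explicit(&lock->tail, &expected,
					NULL, memory_order_acq_rel, memory_order_acquire))
				return;

			/* A new waiter is enqueueing; wait for it to link itself in. */
			while (!(next = atomic_load_explicit(&self->next, memory_order_acquire)))
				;
		}

		/* Hand off by clearing the successor's private flag. */
		atomic_store_explicit(&next->locked, false, memory_order_release);
	}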
On Wed, 2009-01-14 at 12:18 +0100, Dmitry Adamushko wrote:
> 2009/1/14 Chris Mason <chris.mason@oracle.com>:
> > On Tue, 2009-01-13 at 18:21 +0100, Peter Zijlstra wrote:
> >> On Tue, 2009-01-13 at 08:49 -0800, Linus Torvalds wrote:
> >> >
> >> > So do a v10, and ask people to test.
> >>
> >> ---
> >> Subject: mutex: implement adaptive spinning
> >> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> >> Date: Mon Jan 12 14:01:47 CET 2009
> >>
> >> Change mutex contention behaviour such that it will sometimes busy wait on
> >> acquisition - moving its behaviour closer to that of spinlocks.
> >>
> >
> > I've spent a bunch of time on this one, and noticed earlier today that I
> > still had bits of CONFIG_FTRACE compiling. I wasn't actually tracing
> > anything, but it seems to have had a big performance hit.
> >
> > The bad news is the simple spin got much much faster, dbench 50 coming
> > in at 1282MB/s instead of 580MB/s. (other benchmarks give similar
> > results)
> >
> > v10 is better than not spinning, but it's in the 5-10% range. So, I've
> > been trying to find ways to close the gap, just to understand exactly
> > where it is different.
> >
> > If I take out:
> >	/*
> >	 * If there are pending waiters, join them.
> >	 */
> >	if (!list_empty(&lock->wait_list))
> >		break;
> >
> > v10 pops dbench 50 up to 1800MB/s. The other tests soundly beat my
> > spinning and aren't less fair. But clearly this isn't a good solution.
> >
> > I tried a few variations, like only checking the wait list once before
> > looping, which helps some. Are there other suggestions on better tuning
> > options?
>
> (some thoughts/speculations)
>
> Perhaps for highly-contended mutexes the spinning implementation may
> quickly degrade [*] to the non-spinning one (i.e. the current
> sleep-wait mutex) and then just stay in this state until a moment of
> time when there are no waiters [**] -- i.e.
> list_empty(&lock->wait_list) == 1 and waiters can start spinning
> again.

It is actually ok if the highly contended mutexes don't degrade as long as
they are highly contended and the holder isn't likely to schedule.

> what may trigger [*]:
>
> (1) obviously, an owner scheduling out.
>
> Even if it happens rarely (otherwise, it's not a target scenario for
> our optimization), due to the [**] it may take quite some time until
> waiters are able to spin again.
>
> let's say, waiters (almost) never block (and possibly, such cases
> would be better off just using a spinlock after some refactoring, if
> possible)
>
> (2) need_resched() is triggered for one of the waiters.
>
> (3) !owner && rt_task(p)
>
> quite unlikely, but possible (there are 2 race windows).
>
> Of course, the question is whether it really takes a noticeable amount
> of time to get out of the [**] state.
> I'd imagine it can be a case for highly-contended locks.
>
> If this is the case indeed, then which of 1, 2, 3 gets triggered the most?

Sorry, I don't have stats on that.

> Have you tried removing need_resched() checks? So we kind of emulate
> real spinlocks here.

Unfortunately, the need_resched() checks deal with a few of the ugly
corners. They are more important without the waiter list check.

Basically, if we spun without the need_resched() checks, the process that
wants to unlock might not be able to schedule back in.

-chris
Full series, including changelogs available at: http://programming.kicks-ass.net/kernel-patches/mutex-adaptive-spin/ and should shortly appear in a git tree near Ingo :-) mutex: small cleanup mutex: preemption fixes mutex: implement adaptive spinning mutex: set owner in mutex_lock() only mutex: adaptive spin for debug too mutex: adaptive spin performance tweaks --- include/linux/mutex.h | 5 +- include/linux/sched.h | 2 kernel/mutex-debug.c | 9 --- kernel/mutex-debug.h | 18 ++++--- kernel/mutex.c | 118 ++++++++++++++++++++++++++++++++++++++++-------- kernel/mutex.h | 22 ++++++++ kernel/sched.c | 71 +++++++++++++++++++++++++++- kernel/sched_features.h | 1 8 files changed, 204 insertions(+), 42 deletions(-) Index: linux-2.6/kernel/mutex.c ==================================================================--- linux-2.6.orig/kernel/mutex.c +++ linux-2.6/kernel/mutex.c @@ -10,6 +10,11 @@ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and * David Howells for suggestions and improvements. * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * * Also see Documentation/mutex-design.txt. */ #include <linux/mutex.h> @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const c atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); debug_mutex_init(lock, name, key); } @@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mu * ''unlocked'' into ''locked'' state. */ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); } EXPORT_SYMBOL(mutex_lock); @@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex * * The unlocking fastpath is the 0->1 transition from ''locked'' * into ''unlocked'' state: */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); } @@ -129,21 +144,71 @@ __mutex_lock_common(struct mutex *lock, { struct task_struct *task = current; struct mutex_waiter waiter; - unsigned int old_val; unsigned long flags; + preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); +#ifdef CONFIG_SMP + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn''t track the owner atomically in the lock field, we need to + * track it non-atomically. + */ + + for (;;) { + struct thread_info *owner; + + /* + * If there''s an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + preempt_enable(); + return 0; + } + + /* + * When there''s no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. 
If + * we''re an RT task that will live-lock because we won''t let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don''t need + * memory barriers as we''ll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } +#endif spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) goto done; lock_contended(&lock->dep_map, ip); @@ -158,8 +223,7 @@ __mutex_lock_common(struct mutex *lock, * that when we release the lock, we properly wake up the * other waiters: */ - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) break; /* @@ -173,21 +237,21 @@ __mutex_lock_common(struct mutex *lock, spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return -EINTR; } __set_task_state(task, state); /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); + __schedule(); spin_lock_mutex(&lock->wait_lock, flags); } done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, current_thread_info()); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -196,6 +260,7 @@ done: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return 0; } @@ -222,7 +287,8 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -260,8 +326,6 @@ __mutex_unlock_common_slowpath(atomic_t wake_up_process(waiter->task); } - debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock, flags); } @@ -298,18 +362,30 @@ __mutex_lock_interruptible_slowpath(atom */ int __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_killable); @@ -351,10 +427,9 @@ static inline int __mutex_trylock_slowpa spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); - if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); + if (likely(prev == 1)) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) 
atomic_set(&lock->count, 0); @@ -380,8 +455,13 @@ static inline int __mutex_trylock_slowpa */ int __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); + int ret; + + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_trylock); Index: linux-2.6/include/linux/sched.h ==================================================================--- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -328,7 +328,9 @@ extern signed long schedule_timeout(sign extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; Index: linux-2.6/kernel/sched.c ==================================================================--- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -4535,15 +4535,13 @@ pick_next_task(struct rq *rq, struct tas /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4600,13 +4598,80 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? 
+ */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption Index: linux-2.6/include/linux/mutex.h ==================================================================--- linux-2.6.orig/include/linux/mutex.h +++ linux-2.6/include/linux/mutex.h @@ -50,8 +50,10 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; +#endif +#ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; #endif @@ -68,7 +70,6 @@ struct mutex_waiter { struct list_head list; struct task_struct *task; #ifdef CONFIG_DEBUG_MUTEXES - struct mutex *lock; void *magic; #endif }; Index: linux-2.6/kernel/mutex-debug.c ==================================================================--- linux-2.6.orig/kernel/mutex-debug.c +++ linux-2.6/kernel/mutex-debug.c @@ -26,11 +26,6 @@ /* * Must be called with lock->wait_lock held. */ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; - waiter->lock = lock; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lo DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + mutex_clear_owner(lock); } void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->owner = NULL; lock->magic = lock; } Index: linux-2.6/kernel/mutex-debug.h ==================================================================--- linux-2.6.orig/kernel/mutex-debug.h +++ linux-2.6/kernel/mutex-debug.h @@ -13,14 +13,6 @@ /* * This must be called with lock->wait_lock held. 
*/ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mu extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ Index: linux-2.6/kernel/mutex.h ==================================================================--- linux-2.6.orig/kernel/mutex.h +++ linux-2.6/kernel/mutex.h @@ -16,8 +16,26 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner) do { } while (0) -#define debug_mutex_clear_owner(lock) do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) Index: linux-2.6/kernel/sched_features.h ==================================================================--- linux-2.6.orig/kernel/sched_features.h +++ linux-2.6/kernel/sched_features.h @@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 14, 2009 at 05:46:39PM +0100, Peter Zijlstra wrote:
> On Mon, 2009-01-12 at 19:32 +0200, Avi Kivity wrote:
> > Peter Zijlstra wrote:
> > > Spinlocks can use 'pure' MCS locks.
> >
> > How about this, then. In mutex_lock(), keep wait_lock locked and only
> > release it when scheduling out. Waiter spinning naturally follows. If
> > spinlocks are cache friendly (are they today?)
>
> (no they're not, Nick's ticket locks still spin on a shared cacheline
> IIRC -- the MCS locks mentioned could fix this)

It reminds me. I wrote a basic variation of MCS spinlocks a while back, and
converted the dcache lock to use it, which showed large dbench improvements
on a big machine (of course for different reasons than the dbench
improvements in this thread).

http://lkml.org/lkml/2008/8/28/24

Each "lock" object is sane in size because a given set of spin-local queues
may only be used once per lock stack. But any spinlocks within a mutex
acquisition will always be at the bottom of such a stack anyway, by
definition.

If you can use any code or concept for your code, that would be great.

> > we inherit that. If
> > there is no contention on the mutex, then we don't need to reacquire the
> > wait_lock on mutex_unlock() (not that the atomic op is that expensive
> > these days).
>
> That might actually work, although we'd have to move the
> __mutex_slowpath_needs_to_unlock() branch outside wait_lock otherwise
> we'll deadlock :-)
>
> It might be worth trying this if we get serious fairness issues with the
> current construct.
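For readers who have not seen them, the MCS locks referred to above work
roughly as follows. This is a minimal sketch of the classic algorithm with
made-up names and with memory barriers omitted for brevity; it is not the
patch from the link above:

	/*
	 * Each waiter spins on its *own* queue node, so the cache line
	 * ping-pong of a shared lock word is avoided.
	 */
	struct mcs_node {
		struct mcs_node	*next;
		int		locked;
	};

	struct mcs_lock {
		struct mcs_node	*tail;
	};

	static void mcs_lock(struct mcs_lock *lock, struct mcs_node *node)
	{
		struct mcs_node *prev;

		node->next = NULL;
		node->locked = 0;

		prev = xchg(&lock->tail, node);		/* join the queue atomically */
		if (!prev)
			return;				/* queue was empty: lock acquired */

		prev->next = node;
		while (!ACCESS_ONCE(node->locked))	/* spin on our own node only */
			cpu_relax();
	}

	static void mcs_unlock(struct mcs_lock *lock, struct mcs_node *node)
	{
		struct mcs_node *next = ACCESS_ONCE(node->next);

		if (!next) {
			/* No visible successor: try to reset the tail to empty. */
			if (cmpxchg(&lock->tail, node, NULL) == node)
				return;
			/* A waiter raced in; wait for it to link itself. */
			while (!(next = ACCESS_ONCE(node->next)))
				cpu_relax();
		}
		next->locked = 1;			/* hand the lock to the successor */
	}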
* Linus Torvalds <torvalds@linux-foundation.org> wrote:> > > If I take out: > > > /* > > > * If there are pending waiters, join them. > > > */ > > > if (!list_empty(&lock->wait_list)) > > > break; > > > > > > > > > v10 pops dbench 50 up to 1800MB/s. The other tests soundly beat my > > > spinning and aren''t less fair. But clearly this isn''t a good solution. > > > > i think since we already decided that it''s ok to be somewhat unfair (_all_ > > batching constructs introduce unfairness, so the question is never ''should > > we?'' but ''by how much?''), we should just take this out and enjoy the speed > > Yeah, let''s just do it. It''s going to be unfair, but let''s see if the > unfairness is going to actually be noticeable in real life. I suspect it > isn''t, and if it is, we can look at it again and see if there are other > approaches. > > If it makes _that_ much of a difference on dbench, it''s worth being > unfair even if it''s just for some dubious benchmark.Below is the final v10->v11 delta, for review. Beyond the tweak from Chris, there''s small cleanups and the debug simplification from Johannes Weiner. We think this is the final version and are doing final tests now. Ingo diff --git a/kernel/mutex.c b/kernel/mutex.c index 0d5336d..703dec2 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -148,7 +148,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire(&lock->dep_map, subclass, 0, ip); -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) +#ifdef CONFIG_SMP /* * Optimistic spinning. * @@ -162,21 +162,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * Since this needs the lock owner, and this mutex implementation * doesn''t track the owner atomically in the lock field, we need to * track it non-atomically. - * - * We can''t do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. */ for (;;) { struct thread_info *owner; /* - * If there are pending waiters, join them. - */ - if (!list_empty(&lock->wait_list)) - break; - - /* * If there''s an owner, wait for it to either * release the lock or go to sleep. */ @@ -184,6 +175,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (owner && !mutex_spin_on_owner(lock, owner)) break; + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + preempt_enable(); + return 0; + } + /* * When there''s no owner, we might have preempted between the * owner acquiring the lock and setting the owner field. If @@ -193,13 +190,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!owner && (need_resched() || rt_task(task))) break; - if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { - lock_acquired(&lock->dep_map, ip); - mutex_set_owner(lock); - preempt_enable(); - return 0; - } - /* * The cpu_relax() call is a compiler barrier which forces * everything in this loop to be re-loaded. We don''t need @@ -209,7 +199,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, cpu_relax(); } #endif - spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); @@ -263,7 +252,6 @@ done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! 
*/ mutex_remove_waiter(lock, &waiter, current_thread_info()); - mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -439,10 +427,8 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); - if (likely(prev == 1)) { - mutex_set_owner(lock); + if (likely(prev == 1)) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list)))
Nick Piggin
2009-Jan-14 17:18 UTC
Re: [PATCH -v11][RFC] mutex: implement adaptive spinning
On Wed, Jan 14, 2009 at 06:00:36PM +0100, Peter Zijlstra wrote:
> Full series, including changelogs available at:
>
>   http://programming.kicks-ass.net/kernel-patches/mutex-adaptive-spin/
>
> and should shortly appear in a git tree near Ingo :-)

Linus is going to take a wholesale conversion of mutexes to adaptive
mutexes? He's gone soft. I put on my asbestos underwear for no reason,
then.

> @@ -173,21 +237,21 @@ __mutex_lock_common(struct mutex *lock,
>  		spin_unlock_mutex(&lock->wait_lock, flags);
>
>  		debug_mutex_free_waiter(&waiter);
> +		preempt_enable();
>  		return -EINTR;
>  	}
>  	__set_task_state(task, state);
>
>  	/* didnt get the lock, go to sleep: */
>  	spin_unlock_mutex(&lock->wait_lock, flags);
> -	schedule();
> +	__schedule();

Why does this need to do a preempt-disabled schedule? After we schedule
away, the next task can do arbitrary things or reschedule itself, so if
we have not anticipated such a condition here, then I can't see what
__schedule protects. At least a comment is in order?

Pity to add the call overhead to schedule just for this case.

BTW. __schedule shouldn't need to be asmlinkage?

>  	spin_lock_mutex(&lock->wait_lock, flags);
>  }
Peter Zijlstra
2009-Jan-14 17:22 UTC
Re: [PATCH -v11][RFC] mutex: implement adaptive spinning
On Wed, 2009-01-14 at 18:18 +0100, Nick Piggin wrote:
> > @@ -173,21 +237,21 @@ __mutex_lock_common(struct mutex *lock,
> >  		spin_unlock_mutex(&lock->wait_lock, flags);
> >
> >  		debug_mutex_free_waiter(&waiter);
> > +		preempt_enable();
> >  		return -EINTR;
> >  	}
> >  	__set_task_state(task, state);
> >
> >  	/* didnt get the lock, go to sleep: */
> >  	spin_unlock_mutex(&lock->wait_lock, flags);
> > -	schedule();
> > +	__schedule();
>
> Why does this need to do a preempt-disabled schedule? After we schedule
> away, the next task can do arbitrary things or reschedule itself, so if
> we have not anticipated such a condition here, then I can't see what
> __schedule protects. At least a comment is in order?

From:
http://programming.kicks-ass.net/kernel-patches/mutex-adaptive-spin/mutex-preempt.patch

Subject: mutex: preemption fixes
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed Jan 14 15:36:26 CET 2009

The problem is that dropping the spinlock right before schedule is a
voluntary preemption point and can cause a schedule, right after which
we schedule again.

Fix this inefficiency by keeping preemption disabled until we schedule;
do this by explicitly disabling preemption and providing a schedule()
variant that assumes preemption is already disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

> Pity to add the call overhead to schedule just for this case.

Good point, seeing any way around that?

> BTW. __schedule shouldn't need to be asmlinkage?

TBH I've no clue, probably not, Ingo?
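Schematically, the inefficiency the changelog describes and its fix look
like this; shape only, not the actual hunks from the patch:

	/*
	 * Before: dropping wait_lock re-enables preemption, so we can be
	 * preempted right here (one pass through the scheduler) and then
	 * immediately call schedule() again ourselves -- two scheduler
	 * entries just to go to sleep once.
	 */
	spin_unlock_mutex(&lock->wait_lock, flags);	/* preemption point */
	schedule();

	/*
	 * After: the slow path runs with preemption disabled, and a
	 * schedule() variant that assumes preemption is already off is
	 * called, so the only scheduler entry is the one we ask for.
	 */
	spin_unlock_mutex(&lock->wait_lock, flags);	/* no preemption point */
	__schedule();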
Nick Piggin wrote:
>> (no they're not, Nick's ticket locks still spin on a shared cacheline
>> IIRC -- the MCS locks mentioned could fix this)
>
> It reminds me. I wrote a basic variation of MCS spinlocks a while back, and
> converted the dcache lock to use it, which showed large dbench improvements
> on a big machine (of course for different reasons than the dbench
> improvements in this thread).
>
> http://lkml.org/lkml/2008/8/28/24
>
> Each "lock" object is sane in size because a given set of spin-local queues
> may only be used once per lock stack. But any spinlocks within a mutex
> acquisition will always be at the bottom of such a stack anyway, by
> definition.
>
> If you can use any code or concept for your code, that would be great.

Does it make sense to replace 'nest' with a per-cpu counter that's
incremented on each lock? I guess you'd have to search for the value of nest
on unlock, but it would be a very short search (typically length 1, 2 if
lock sorting is used to avoid deadlocks).

I think you'd need to make the lock store the actual node pointer, not the
cpu number, since the values of nest would be different on each cpu.

That would allow you to replace spinlocks with mcs_locks wholesale.

-- 
error compiling committee.c: too many arguments to function
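A rough sketch of that suggestion, with names and layout made up here rather
than taken from Nick's patch: each CPU keeps a small array of MCS nodes
indexed by a per-cpu nesting counter, and the lock stores the node pointer
so the unlocker can hand off directly to its successor:

	/*
	 * Illustrative only.  Assumes preemption is disabled across the
	 * whole lock/unlock window (as spin_lock() itself guarantees) and
	 * strictly nested (LIFO) lock/unlock order; Avi notes a short
	 * search would be needed if unlock order can differ.
	 */
	#define MCS_MAX_NEST	4	/* lock ordering keeps spinlock nesting shallow */

	struct mcs_percpu {
		struct mcs_node	node[MCS_MAX_NEST];
		int		nest;	/* incremented on lock, decremented on unlock */
	};

	static DEFINE_PER_CPU(struct mcs_percpu, mcs_nodes);

	static inline struct mcs_node *mcs_node_get(void)
	{
		struct mcs_percpu *p = &__get_cpu_var(mcs_nodes);

		return &p->node[p->nest++];
	}

	static inline void mcs_node_put(void)
	{
		__get_cpu_var(mcs_nodes).nest--;
	}

Storing the node pointer in the lock (rather than a cpu number plus nesting
level) then avoids having to reconstruct which per-cpu slot a given waiter
used, which is the point Avi raises above.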
Dmitry Adamushko
2009-Jan-14 17:32 UTC
Re: [PATCH -v9][RFC] mutex: implement adaptive spinning
2009/1/14 Chris Mason <chris.mason@oracle.com>:> On Wed, 2009-01-14 at 12:18 +0100, Dmitry Adamushko wrote: >> 2009/1/14 Chris Mason <chris.mason@oracle.com>: >> > On Tue, 2009-01-13 at 18:21 +0100, Peter Zijlstra wrote: >> >> On Tue, 2009-01-13 at 08:49 -0800, Linus Torvalds wrote: >> >> > >> >> > So do a v10, and ask people to test. >> >> >> >> --- >> >> Subject: mutex: implement adaptive spinning >> >> From: Peter Zijlstra <a.p.zijlstra@chello.nl> >> >> Date: Mon Jan 12 14:01:47 CET 2009 >> >> >> >> Change mutex contention behaviour such that it will sometimes busy wait on >> >> acquisition - moving its behaviour closer to that of spinlocks. >> >> >> > >> > I''ve spent a bunch of time on this one, and noticed earlier today that I >> > still had bits of CONFIG_FTRACE compiling. I wasn''t actually tracing >> > anything, but it seems to have had a big performance hit. >> > >> > The bad news is the simple spin got much much faster, dbench 50 coming >> > in at 1282MB/s instead of 580MB/s. (other benchmarks give similar >> > results) >> > >> > v10 is better that not spinning, but its in the 5-10% range. So, I''ve >> > been trying to find ways to close the gap, just to understand exactly >> > where it is different. >> > >> > If I take out: >> > /* >> > * If there are pending waiters, join them. >> > */ >> > if (!list_empty(&lock->wait_list)) >> > break; >> > >> > >> > v10 pops dbench 50 up to 1800MB/s. The other tests soundly beat my >> > spinning and aren''t less fair. But clearly this isn''t a good solution. >> > >> > I tried a few variations, like only checking the wait list once before >> > looping, which helps some. Are there other suggestions on better tuning >> > options? >> >> (some thoughts/speculations) >> >> Perhaps for highly-contanded mutexes the spinning implementation may >> quickly degrade [*] to the non-spinning one (i.e. the current >> sleep-wait mutex) and then just stay in this state until a moment of >> time when there are no waiters [**] -- i.e. >> list_empty(&lock->wait_list) == 1 and waiters can start spinning >> again. > > It is actually ok if the highly contention mutexes don''t degrade as long > as they are highly contended and the holder isn''t likely to schedule.Yes, my point was that they likely do fall back (degrade) to the wait-on-the-list (non-spinning) behavior with dbench, and that''s why the performance numbers are similar (5-10% IIRC) to those of the ''normal'' mutex. And the thing is that for the highly contention mutexes, it''s not easy to switch back to the (fast) spinning-wait -- just because most of the time there is someone waiting for the lock (and if this ''someone'' is not a spinner, none of the new mutex_lock() requests can do busy-waiting/spinning). But whatever, without the list_empty() check it''s not relevant any more.>> >> what may trigger [*]: >> >> (1) obviously, an owner scheduling out. >> >> Even if it happens rarely (otherwise, it''s not a target scenario for >> our optimization), due to the [**] it may take quite some time until >> waiters are able to spin again. >> >> let''s say, waiters (almost) never block (and possibly, such cases >> would be better off just using a spinlock after some refactoring, if >> possible) >> >> (2) need_resched() is triggered for one of the waiters. >> >> (3) !owner && rt_task(p) >> >> quite unlikely, but possible (there are 2 race windows). >> >> Of course, the question is whether it really takes a noticeable amount >> of time to get out of the [**] state. 
>> I''d imagine it can be a case for highly-contended locks. >> >> If this is the case indeed, then which of 1,2,3 gets triggered the most. > > Sorry, I don''t have stats on that. > >> >> Have you tried removing need_resched() checks? So we kind of emulate >> real spinlocks here. > > Unfortunately, the need_resched() checks deal with a few of the ugly > corners. They are more important without the waiter list check. > Basically if we spun without the need_resched() checks, the process who > wants to unlock might not be able to schedule back in.Yeah, for the "owner == NULL" case. If the owner was preempted in the fast path right after taking the lock and before calling mutex_set_owner(). btw., I wonder... would an additional preempt_disable/enable() in the fast path harm that much? We could avoid the preemption scenario above.> > -chris >-- Best regards, Dmitry Adamushko -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
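The window being discussed, and the shape of Dmitry's suggestion, can be
illustrated with a hypothetical helper; this is not part of the posted
series, just a sketch of making the uncontended acquire and the owner
assignment atomic with respect to preemption, so a spinner can never observe
a held lock with owner == NULL merely because the holder was preempted at
the wrong moment:

	/*
	 * Hypothetical sketch of Dmitry's suggestion.  preempt_disable()
	 * only spans a cmpxchg and an assignment, so nothing here sleeps.
	 */
	static inline int mutex_fastpath_trylock_nonpreempt(struct mutex *lock)
	{
		int taken = 0;

		preempt_disable();
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
			/* owner becomes visible before we can be preempted */
			mutex_set_owner(lock);
			taken = 1;
		}
		preempt_enable();

		return taken;	/* 0: fall back to the normal slow path */
	}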
* Peter Zijlstra <peterz@infradead.org> wrote:> Full series, including changelogs available at: > > http://programming.kicks-ass.net/kernel-patches/mutex-adaptive-spin/ > > and should shortly appear in a git tree near Ingo :-)Linus, Please pull the adaptive-mutexes-for-linus git tree from: git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git adaptive-mutexes-for-linus We dropped two fresh patches from v11 for the time being: the two debug patches, they had test failures [they triggered mutex debugging false positives]. So this tree is v10 (which got a lot of testing already) plus Chris''s performance patch. It passes all x86 runtime tests here. The cross build matrix looks good too: testing 10 architectures. [ syncing linus ... ] testing alpha: -git: pass ( 21), -tip: pass ( 21) testing arm: -git: pass ( 5), -tip: pass ( 5) testing blackfin: -git: pass ( 20), -tip: pass ( 20) testing cris: -git: pass ( 32), -tip: pass ( 32) testing ia64: -git: pass ( 153), -tip: pass ( 153) testing m32r: -git: pass ( 21), -tip: pass ( 21) testing m68k: -git: pass ( 34), -tip: pass ( 34) testing mips: -git: pass ( 4), -tip: pass ( 4) testing powerpc: -git: pass ( 11), -tip: pass ( 11) testing sparc: -git: pass ( 0), -tip: pass ( 0) Passes all and no new warnings. So in theory this is good enough as a pre-rc2 pull too, should you feel tempted ;-) Thanks, Ingo ------------------> Chris Mason (1): mutex: adaptive spinnning, performance tweaks Peter Zijlstra (3): mutex: small cleanup mutex: preemption fixes mutex: implement adaptive spinning include/linux/mutex.h | 5 +- include/linux/sched.h | 2 + kernel/mutex-debug.c | 9 +--- kernel/mutex-debug.h | 18 ++++--- kernel/mutex.c | 121 ++++++++++++++++++++++++++++++++++++++++------- kernel/mutex.h | 22 ++++++++- kernel/sched.c | 71 ++++++++++++++++++++++++++- kernel/sched_features.h | 1 + 8 files changed, 209 insertions(+), 40 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 7a0e5c4..3069ec7 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -50,8 +50,10 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct thread_info *owner; +#endif +#ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; #endif @@ -68,7 +70,6 @@ struct mutex_waiter { struct list_head list; struct task_struct *task; #ifdef CONFIG_DEBUG_MUTEXES - struct mutex *lock; void *magic; #endif }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b8..c34b137 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -328,7 +328,9 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void __schedule(void); asmlinkage void schedule(void); +extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 1d94160..50d022e 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -26,11 +26,6 @@ /* * Must be called with lock->wait_lock held. 
*/ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); @@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; - waiter->lock = lock; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, @@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + mutex_clear_owner(lock); } void debug_mutex_init(struct mutex *lock, const char *name, @@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name, debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->owner = NULL; lock->magic = lock; } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index babfbdf..6b2d735 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -13,14 +13,6 @@ /* * This must be called with lock->wait_lock held. */ -extern void -debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); - -static inline void debug_mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, @@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 4f45d4b..5d79781 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -10,6 +10,11 @@ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and * David Howells for suggestions and improvements. * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * * Also see Documentation/mutex-design.txt. */ #include <linux/mutex.h> @@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); debug_mutex_init(lock, name, key); } @@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock) * ''unlocked'' into ''locked'' state. 
*/ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); } EXPORT_SYMBOL(mutex_lock); @@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock) * The unlocking fastpath is the 0->1 transition from ''locked'' * into ''unlocked'' state: */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); } @@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, { struct task_struct *task = current; struct mutex_waiter waiter; - unsigned int old_val; unsigned long flags; + preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn''t track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can''t do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + */ + + for (;;) { + struct thread_info *owner; + + /* + * If there''s an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + preempt_enable(); + return 0; + } + + /* + * When there''s no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we''re an RT task that will live-lock because we won''t let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don''t need + * memory barriers as we''ll eventually observe the right + * values at the cost of a few extra spins. 
+ */ + cpu_relax(); + } +#endif spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) goto done; lock_contended(&lock->dep_map, ip); @@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * that when we release the lock, we properly wake up the * other waiters: */ - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) + if (atomic_xchg(&lock->count, -1) == 1) break; /* @@ -173,21 +241,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return -EINTR; } __set_task_state(task, state); /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); + __schedule(); spin_lock_mutex(&lock->wait_lock, flags); } done: lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_set_owner(lock); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -196,6 +265,7 @@ done: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + preempt_enable(); return 0; } @@ -222,7 +292,8 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -260,8 +331,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) wake_up_process(waiter->task); } - debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock, flags); } @@ -298,18 +367,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count); */ int __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_lock_killable); @@ -352,9 +433,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); + mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); } + /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -380,8 +462,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); + int ret; + + ret = 
__mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + + return ret; } EXPORT_SYMBOL(mutex_trylock); diff --git a/kernel/mutex.h b/kernel/mutex.h index a075daf..67578ca 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -16,8 +16,26 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define debug_mutex_set_owner(lock, new_owner) do { } while (0) -#define debug_mutex_clear_owner(lock) do { } while (0) +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current_thread_info(); +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13..589e730 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4538,15 +4538,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4603,13 +4601,80 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? 
+ */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption diff --git a/kernel/sched_features.h b/kernel/sched_features.h index da5d93b..07bc02e 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(OWNER_SPIN, 1) -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2009-01-14 at 19:33 +0100, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
>
> > Full series, including changelogs available at:
> >
> >   http://programming.kicks-ass.net/kernel-patches/mutex-adaptive-spin/
> >
> > and should shortly appear in a git tree near Ingo :-)
>
> Linus,
>
> Please pull the adaptive-mutexes-for-linus git tree from:
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git adaptive-mutexes-for-linus

I was going to put this into the btrfs tree, but since you have a branch
just for adaptive mutexes, is it easier to put it there?

From: Chris Mason <chris.mason@oracle.com>

Btrfs: stop spinning on mutex_trylock and let the adaptive code spin for us

Mutexes now spin internally and the btrfs spin is no longer required for
performance.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae77..40ba8e8 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -37,16 +37,6 @@
 
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
-	int i;
-
-	if (mutex_trylock(&eb->mutex))
-		return 0;
-	for (i = 0; i < 512; i++) {
-		cpu_relax();
-		if (mutex_trylock(&eb->mutex))
-			return 0;
-	}
-	cpu_relax();
 	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
 	return 0;
 }
* Ingo Molnar <mingo@elte.hu> wrote:

> Linus,
>
> Please pull the adaptive-mutexes-for-linus git tree from:
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git adaptive-mutexes-for-linus
>
> We dropped two fresh patches from v11 for the time being: the two debug
> patches, they had test failures [they triggered mutex debugging false
> positives].
>
> So this tree is v10 (which got a lot of testing already) plus Chris's
> performance patch. It passes all x86 runtime tests here.

Latest performance figures, on a 2-socket 16-way Nehalem test-system,
running the code above, measured via "test-mutex V 128 10" VFS creat+unlink
scalability test on tmpfs and ext3:

                          no-spin     spin

  [tmpfs]  avg ops/sec:    291038   392865   (+34.9%)
  [ext3]   avg ops/sec:    283291   435674   (+53.7%)

Those numbers impress the heck out of me; rarely do we see this kind of
speedup these days, for any established functionality.

We still have the /sys/debug/sched_features tunable under
CONFIG_SCHED_DEBUG=y, so should this cause any performance regressions
somewhere, it can be pinned down and blamed back on this change easily,
without bisection and without rebooting the box.

	Ingo
On Wed, 14 Jan 2009 19:33:19 +0100 Ingo Molnar <mingo@elte.hu> wrote:

> Please pull the adaptive-mutexes-for-linus git tree

<fear>

- It seems a major shortcoming that the feature is disabled if
  CONFIG_DEBUG_MUTEXES=y. It means that lots of people won't test it.

- When people hit performance/latency oddities, it would be nice if
  they had a /proc knob with which they could disable this feature at
  runtime.

  This would also be useful for comparative performance testing.
* Andrew Morton <akpm@linux-foundation.org> wrote:

> On Wed, 14 Jan 2009 19:33:19 +0100 Ingo Molnar <mingo@elte.hu> wrote:
>
> > Please pull the adaptive-mutexes-for-linus git tree
>
> <fear>
>
> - It seems a major shortcoming that the feature is disabled if
>   CONFIG_DEBUG_MUTEXES=y. It means that lots of people won't test it.
>
> - When people hit performance/latency oddities, it would be nice if
>   they had a /proc knob with which they could disable this feature at
>   runtime.
>
>   This would also be useful for comparative performance testing.

Yeah. From my other mail:

> > We still have the /sys/debug/sched_features tunable under
> > CONFIG_SCHED_DEBUG=y, so should this cause any performance regressions
> > somewhere, it can be pinned down and blamed back on this change
> > easily, without bisection and without rebooting the box.

This kind of easy knob was included early on - this is how all those spin
versus no-spin numbers were done.

	Ingo
On Wed, 2009-01-14 at 10:53 -0800, Andrew Morton wrote:
> On Wed, 14 Jan 2009 19:33:19 +0100 Ingo Molnar <mingo@elte.hu> wrote:
>
> > Please pull the adaptive-mutexes-for-linus git tree
>
> <fear>
>
> - It seems a major shortcoming that the feature is disabled if
>   CONFIG_DEBUG_MUTEXES=y. It means that lots of people won't test it.

Yes, that's a bit unfortunate; a simple patch to enable it is below.

I must admit I'm a bit stumped on why that debug check triggers, I couldn't
reproduce it today, but Ingo ran into it quite quickly :/

Index: linux-2.6/kernel/mutex-debug.c
===================================================================
--- linux-2.6.orig/kernel/mutex-debug.c
+++ linux-2.6/kernel/mutex-debug.c
@@ -74,7 +74,7 @@ void debug_mutex_unlock(struct mutex *lo
 		return;
 
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+	/* DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); */
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
 	mutex_clear_owner(lock);
 }
Index: linux-2.6/kernel/mutex.c
===================================================================
--- linux-2.6.orig/kernel/mutex.c
+++ linux-2.6/kernel/mutex.c
@@ -148,7 +148,7 @@ __mutex_lock_common(struct mutex *lock,
 
 	preempt_disable();
 	mutex_acquire(&lock->dep_map, subclass, 0, ip);
-#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
+#ifdef CONFIG_SMP
 	/*
 	 * Optimistic spinning.
 	 *
@@ -162,9 +162,6 @@ __mutex_lock_common(struct mutex *lock,
 	 * Since this needs the lock owner, and this mutex implementation
 	 * doesn't track the owner atomically in the lock field, we need to
 	 * track it non-atomically.
-	 *
-	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
-	 * to serialize everything.
 	 */
 
 	for (;;) {

> - When people hit performance/latency oddities, it would be nice if
>   they had a /proc knob with which they could disable this feature at
>   runtime.
>
>   This would also be useful for comparative performance testing.

CONFIG_SCHED_DEBUG=y

echo NO_MUTEX_SPIN > /debug/sched_features
* Ingo Molnar <mingo@elte.hu> wrote:

> Latest performance figures, on a 2-socket 16-way Nehalem test-system,
> running the code above, measured via "test-mutex V 128 10" VFS
> creat+unlink scalability test on tmpfs and ext3:
>
>                           no-spin     spin
>
>   [tmpfs]  avg ops/sec:    291038   392865   (+34.9%)
>   [ext3]   avg ops/sec:    283291   435674   (+53.7%)

Btw., for historic kicks i just went back to v2.6.15-2019-gf17578d - the
last pre-mutexes semaphore based kernel, using the same .config. I tracked
down two bugs in it to make it boot on a Nehalem, so we can now compare the
above numbers against historic semaphore performance:

                    [v2.6.14]    [v2.6.29]

                    Semaphores | Mutexes
                               |
                               | no-spin    spin
  ----------------------------------------------------------
  [tmpfs] ops/sec:     50713   |  291038   392865   (+34.9%)
  [ext3]  ops/sec:     45214   |  283291   435674   (+53.7%)

A 10x macro-performance improvement on ext3, compared to 2.6.14 :-)

While lots of other details got changed meanwhile, i'm sure most of the
performance win on this particular VFS workload comes from mutexes.

So i think the long mutex migration pain was definitely worth it.

	Ingo
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Wed, 2009-01-14 at 10:53 -0800, Andrew Morton wrote:
> > On Wed, 14 Jan 2009 19:33:19 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> >
> > > Please pull the adaptive-mutexes-for-linus git tree
> >
> > <fear>
> >
> > - It seems a major shortcoming that the feature is disabled if
> >   CONFIG_DEBUG_MUTEXES=y. It means that lots of people won't test it.
>
> Yes, that's a bit unfortunate; a simple patch to enable it is below.
>
> I must admit I'm a bit stumped on why that debug check triggers, I
> couldn't reproduce it today, but Ingo ran into it quite quickly :/

Yes, the debug patch caused this false positive warning on one of my
test-systems:

Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 255762
------------[ cut here ]------------
WARNING: at kernel/mutex-debug.c:77 debug_mutex_unlock+0x94/0xde()
Hardware name:
Modules linked in:
Pid: 0, comm: swapper Not tainted 2.6.29-rc1-tip-00983-ge1af3bd-dirty #1
Call Trace:
 [<ffffffff8024f2f7>] warn_slowpath+0xd8/0xf7
 [<ffffffff8024f5ec>] ? wake_up_klogd+0x9/0x2f
 [<ffffffff80270fd1>] ? graph_lock+0x27/0x66
 [<ffffffff80275329>] ? validate_chain+0xd4d/0xd9f
 [<ffffffff802714c4>] ? save_trace+0x3f/0xb2
 [<ffffffff80275b4f>] ? __lock_acquire+0x7d4/0x832
 [<ffffffff80275c5f>] ? lock_acquire+0xb2/0xc2
 [<ffffffff8025091c>] ? cpu_maps_update_begin+0x17/0x19
 [<ffffffff80271d21>] ? trace_hardirqs_off+0xd/0xf
 [<ffffffff80270a8e>] debug_mutex_unlock+0x94/0xde
 [<ffffffff80906d71>] __mutex_unlock_slowpath+0xdd/0x152
 [<ffffffff80906df4>] mutex_unlock+0xe/0x10
 [<ffffffff80250955>] cpu_maps_update_done+0x15/0x17
 [<ffffffff808ce8b5>] register_cpu_notifier+0x2c/0x32
 [<ffffffff80d7683e>] page_alloc_init+0x10/0x12
 [<ffffffff80d5ac45>] start_kernel+0x1ba/0x422
 [<ffffffff80d5a140>] ? early_idt_handler+0x0/0x73
 [<ffffffff80d5a2c3>] x86_64_start_reservations+0xae/0xb2
 [<ffffffff80d5a421>] x86_64_start_kernel+0x137/0x146
---[ end trace a7919e7f17c0a725 ]---
Kernel command line: root=/dev/sda1 earlyprintk=serial,ttyS0,115200 console=ttyS0,115200 console=tty 5 profile=0 debug initcall_debug apic=debug apic=verbose ignore_loglevel sysrq_always_enabled pci=nomsi
kernel profiling enabled (shift: 0)
debug: sysrq always enabled.
Initializing CPU#0

So we left that change out from this pull request. It's not a big deal i
think - mutex debugging always had a different fast-path from no-debug
mutexes anyway (or a lack of fast-path to begin with). So the performance
characteristics were always subtly different. Now they might be more
different - but we'll fix that too.

	Ingo
On Wed, 14 Jan 2009 20:00:08 +0100 Ingo Molnar <mingo@elte.hu> wrote:

> * Andrew Morton <akpm@linux-foundation.org> wrote:
>
> > On Wed, 14 Jan 2009 19:33:19 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> >
> > > Please pull the adaptive-mutexes-for-linus git tree
> >
> > <fear>
> >
> > - It seems a major shortcoming that the feature is disabled if
> >   CONFIG_DEBUG_MUTEXES=y. It means that lots of people won't test it.

^^^?

> > - When people hit performance/latency oddities, it would be nice if
> >   they had a /proc knob with which they could disable this feature at
> >   runtime.
> >
> >   This would also be useful for comparative performance testing.
>
> Yeah. From my other mail:
>
> > > We still have the /sys/debug/sched_features tunable under
> > > CONFIG_SCHED_DEBUG=y, so should this cause any performance regressions
> > > somewhere, it can be pinned down and blamed back on this change
> > > easily, without bisection and without rebooting the box.
>
> This kind of easy knob was included early on - this is how all those spin
> versus no-spin numbers were done.

Do people enable CONFIG_SCHED_DEBUG?

CONFIG_DEBUG_MUTEXES=n, CONFIG_SCHED_DEBUG=y is getting to be a pretty
small subset?
On Wed, 2009-01-14 at 11:36 -0800, Andrew Morton wrote:

> Do people enable CONFIG_SCHED_DEBUG?

Well, I have it always enabled, but I've honestly no idea if that makes
me weird.

> CONFIG_DEBUG_MUTEXES=n, CONFIG_SCHED_DEBUG=y is getting to be a pretty
> small subset?

Could be, do you fancy me doing a sysctl? Shouldn't be hard.
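If it came to that, a minimal version could look something like the sketch
below; the variable, table entries and file name are hypothetical, not part
of the posted series, which only has the sched_features bit. The spin loop
would then simply test the flag before entering the optimistic path:

	/*
	 * Hypothetical /proc/sys/kernel/mutex_adaptive_spin knob -- sketch
	 * only, does not exist in the series.
	 */
	#include <linux/sysctl.h>

	int sysctl_mutex_adaptive_spin __read_mostly = 1;

	static struct ctl_table mutex_spin_table[] = {
		{
			.ctl_name	= CTL_UNNUMBERED,
			.procname	= "mutex_adaptive_spin",
			.data		= &sysctl_mutex_adaptive_spin,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec,
		},
		{ }
	};

	static struct ctl_table mutex_spin_root[] = {
		{
			.ctl_name	= CTL_KERN,
			.procname	= "kernel",
			.mode		= 0555,
			.child		= mutex_spin_table,
		},
		{ }
	};

	/* somewhere in init code: register_sysctl_table(mutex_spin_root); */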
* Andrew Morton <akpm@linux-foundation.org> wrote:

> On Wed, 14 Jan 2009 20:00:08 +0100 Ingo Molnar <mingo@elte.hu> wrote:
>
> > This kind of easy knob was included early on - this is how all those spin
> > versus no-spin numbers were done.
>
> Do people enable CONFIG_SCHED_DEBUG?

If they suspect performance problems and want to analyze them?

Note that CONFIG_SCHED_DEBUG=y is also the default.

> CONFIG_DEBUG_MUTEXES=n, CONFIG_SCHED_DEBUG=y is getting to be a pretty
> small subset?

Those two are the default config settings actually, so i'd expect it to be
the most commonly occurring combination.

	Ingo
On Wed, 14 Jan 2009 20:50:50 +0100 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Wed, 2009-01-14 at 11:36 -0800, Andrew Morton wrote:
>
> > Do people enable CONFIG_SCHED_DEBUG?
>
> Well, I have it always enabled, but I've honestly no idea if that makes
> me weird.

It'd be weird if you're not weird.

> > CONFIG_DEBUG_MUTEXES=n, CONFIG_SCHED_DEBUG=y is getting to be a pretty
> > small subset?
>
> Could be, do you fancy me doing a sysctl? shouldn't be hard.

The /sys/debug knob depends upon CONFIG_SCHED_DEBUG, I assume?

umm, yes, I do think that it would be prudent to make this control
unconditionally available.  Experience tells us that any regressions
which this change causes could take a long time to turn up - sometimes
years.  By which time the people who are reporting the regressions are
running packaged kernels, and if that packaged kernel didn't enable
CONFIG_SCHED_DEBUG, we're a bit screwed.

Did we end up deciding to remove the CONFIG_DEBUG_MUTEXES=n dependency?
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Wed, 2009-01-14 at 11:36 -0800, Andrew Morton wrote:
>
> > Do people enable CONFIG_SCHED_DEBUG?
>
> Well, I have it always enabled, but I've honestly no idea if that makes
> me weird.
>
> > CONFIG_DEBUG_MUTEXES=n, CONFIG_SCHED_DEBUG=y is getting to be a pretty
> > small subset?
>
> Could be, do you fancy me doing a sysctl? shouldn't be hard.

i dunno, why another fancy sysctl for something that fits quite nicely
into the existing sched_features scheme that we've been using for such
purposes for the past 3-4 kernel releases?

we always provided various toggles for new scheduler features via
/sys/debug/sched_features, so that people can do performance regression
testing, and it works quite well.

	Ingo
On Wed, 14 Jan 2009 21:14:35 +0100 Ingo Molnar <mingo@elte.hu> wrote:

>
> * Andrew Morton <akpm@linux-foundation.org> wrote:
>
> > On Wed, 14 Jan 2009 20:00:08 +0100
> > Ingo Molnar <mingo@elte.hu> wrote:
> >
> > >
> > > * Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > > On Wed, 14 Jan 2009 19:33:19 +0100 Ingo Molnar <mingo@elte.hu> wrote:
> > > >
> > > > > Please pull the adaptive-mutexes-for-linus git tree
> > > >
> > > > <fear>
> > > >
> > > > - It seems a major shortcoming that the feature is disabled if
> > > >   CONFIG_DEBUG_MUTEXES=y.  It means that lots of people won't test it.
> > >
> > > ^^^?
> > >
> > > > - When people hit performance/latency oddities, it would be nice if
> > > >   they had a /proc knob with which they could disable this feature at
> > > >   runtime.
> > > >
> > > >   This would also be useful for comparative performance testing.
> > >
> > > Yeah. From my other mail:
> > >
> > > > > We still have the /sys/debug/sched_features tunable under
> > > > > CONFIG_SCHED_DEBUG=y, so should this cause any performance regressions
> > > > > somewhere, it can be pinned down and blamed back on this change
> > > > > easily, without bisection and without rebooting the box.
> > >
> > > This kind of easy knob was included early on - this is how all those spin
> > > versus no-spin numbers were done.
> >
> > Do people enable CONFIG_SCHED_DEBUG?
>
> If they suspect performance problems and want to analyze them?

The vast majority of users do not and usually cannot compile their own
kernels.

> Note that CONFIG_SCHED_DEBUG=y is also the default.

akpm:/usr/src/25> echo $ARCH
x86_64
akpm:/usr/src/25> make defconfig
*** Default configuration is based on 'x86_64_defconfig'
#
# configuration written to .config
#
akpm:/usr/src/25> grep SCHED_DEBUG .config
# CONFIG_SCHED_DEBUG is not set

> > CONFIG_DEBUG_MUTEXES=n, CONFIG_SCHED_DEBUG=y is getting to be a pretty
> > small subset?
>
> Those two are the default config settings actually, so i'd expect it to be
> the most commonly occurring combination.

akpm:/usr/src/25> grep DEBUG_MUTEXES .config
# CONFIG_DEBUG_MUTEXES is not set
On Wed, 14 Jan 2009 21:27:36 +0100 Ingo Molnar <mingo@elte.hu> wrote:

>
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > On Wed, 2009-01-14 at 11:36 -0800, Andrew Morton wrote:
> >
> > > Do people enable CONFIG_SCHED_DEBUG?
> >
> > Well, I have it always enabled, but I've honestly no idea if that makes
> > me weird.
> >
> > > CONFIG_DEBUG_MUTEXES=n, CONFIG_SCHED_DEBUG=y is getting to be a pretty
> > > small subset?
> >
> > Could be, do you fancy me doing a sysctl? shouldn't be hard.
>
> i dunno, why another fancy sysctl for something that fits quite nicely
> into the existing sched_features scheme that we've been using for such
> purposes for the past 3-4 kernel releases?
>
> we always provided various toggles for new scheduler features via
> /sys/debug/sched_features, so that people can do performance regression
> testing, and it works quite well.

If we know that this control will be reliably available in packaged
kernels then fine.  But how do we arrange for that?
* Andrew Morton <akpm@linux-foundation.org> wrote:

> > > Do people enable CONFIG_SCHED_DEBUG?
> >
> > If they suspect performance problems and want to analyze them?
>
> The vast majority of users do not and usually cannot compile their own
> kernels.

... which they derive from distro kernels or some old .config they always
used, via 'make oldconfig'. You are arguing against well-established facts
here.

If you dont believe my word for it, here's an analysis of all kernel
configs posted to lkml in the past 8 months:

  $ grep ^CONFIG_SCHED_DEBUG linux-kernel | wc -l
  424

  $ grep 'CONFIG_SCHED_DEBUG is not' linux-kernel | wc -l
  109

i.e. CONFIG_SCHED_DEBUG=y is set in 80% of the configs. A large majority
of testers has it enabled and /sys/debug/sched_features was always a good
mechanism that we used for runtime toggles.

> > Note that CONFIG_SCHED_DEBUG=y is also the default.
>
> akpm:/usr/src/25> echo $ARCH
> x86_64
> akpm:/usr/src/25> make defconfig
> *** Default configuration is based on 'x86_64_defconfig'

x86 defconfig is used too, but it's a pretty rare usage.

Under default i mean the customary meaning of default config: it's the
default if you come via 'make oldconfig' or if you derive your config from
a distro config:

 | config SCHED_DEBUG
 |	bool "Collect scheduler debugging info"
 |	depends on DEBUG_KERNEL && PROC_FS
 |	default y

	Ingo
On Wed, 14 Jan 2009 21:51:22 +0100 Ingo Molnar <mingo@elte.hu> wrote:

>
> * Andrew Morton <akpm@linux-foundation.org> wrote:
>
> > > > Do people enable CONFIG_SCHED_DEBUG?
> > >
> > > If they suspect performance problems and want to analyze them?
> >
> > The vast majority of users do not and usually cannot compile their own
> > kernels.
>
> ... which they derive from distro kernels or some old .config they always
> used, via 'make oldconfig'. You are arguing against well-established facts
> here.
>
> If you dont believe my word for it, here's an analysis of all kernel
> configs posted to lkml in the past 8 months:
>
>   $ grep ^CONFIG_SCHED_DEBUG linux-kernel | wc -l
>   424
>
>   $ grep 'CONFIG_SCHED_DEBUG is not' linux-kernel | wc -l
>   109
>
> i.e. CONFIG_SCHED_DEBUG=y is set in 80% of the configs. A large majority
> of testers has it enabled and /sys/debug/sched_features was always a good
> mechanism that we used for runtime toggles.

You just disproved your own case :(

> > > Note that CONFIG_SCHED_DEBUG=y is also the default.
> >
> > akpm:/usr/src/25> echo $ARCH
> > x86_64
> > akpm:/usr/src/25> make defconfig
> > *** Default configuration is based on 'x86_64_defconfig'
>
> x86 defconfig is used too, but it's a pretty rare usage.
>
> Under default i mean the customary meaning of default config: it's the
> default if you come via 'make oldconfig' or if you derive your config from
> a distro config:
>
>  | config SCHED_DEBUG
>  |	bool "Collect scheduler debugging info"
>  |	depends on DEBUG_KERNEL && PROC_FS
>  |	default y

This simply isn't reliable.
* Andrew Morton <akpm@linux-foundation.org> wrote:

> On Wed, 14 Jan 2009 21:51:22 +0100
> Ingo Molnar <mingo@elte.hu> wrote:
>
> >
> > * Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > > > > Do people enable CONFIG_SCHED_DEBUG?
> > > >
> > > > If they suspect performance problems and want to analyze them?
> > >
> > > The vast majority of users do not and usually cannot compile their own
> > > kernels.
> >
> > ... which they derive from distro kernels or some old .config they always
> > used, via 'make oldconfig'. You are arguing against well-established facts
> > here.
> >
> > If you dont believe my word for it, here's an analysis of all kernel
> > configs posted to lkml in the past 8 months:
> >
> >   $ grep ^CONFIG_SCHED_DEBUG linux-kernel | wc -l
> >   424
> >
> >   $ grep 'CONFIG_SCHED_DEBUG is not' linux-kernel | wc -l
> >   109
> >
> > i.e. CONFIG_SCHED_DEBUG=y is set in 80% of the configs. A large majority
> > of testers has it enabled and /sys/debug/sched_features was always a good
> > mechanism that we used for runtime toggles.
>
> You just disproved your own case :(

how so? 80% is not enough? I also checked Fedora and it has SCHED_DEBUG=y
in its kernel rpms.

note that there's also a performance issue here: we generally _dont want_
a debug sysctl overhead in the mutex code or in any fastpath for that
matter. So making it depend on SCHED_DEBUG is useful.

sched_feat() features get optimized out at build time when SCHED_DEBUG is
disabled. So it gives us the best of two worlds: the utility of sysctls in
the SCHED_DEBUG=y, and they get compiled out in the !SCHED_DEBUG case.

	Ingo
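[ For readers who have not seen the mechanism Ingo refers to: the sketch
  below shows, in simplified form, how a sched_feat() test can collapse to
  a compile-time constant when CONFIG_SCHED_DEBUG is off. It follows the
  general shape of kernel/sched.c and kernel/sched_features.h of that era,
  but it is an illustration, not the exact kernel source. ]

/*
 * kernel/sched_features.h contains lines like:
 *	SCHED_FEAT(OWNER_SPIN, 1)
 */

/* 1) turn the feature list into enum values __SCHED_FEAT_OWNER_SPIN, ... */
#define SCHED_FEAT(name, enabled)	__SCHED_FEAT_##name,
enum {
#include "sched_features.h"
	__SCHED_FEAT_NR,
};
#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
/* 2a) debug kernels: runtime bitmask, writable via /sys/debug/sched_features */
extern unsigned int sysctl_sched_features;
#else
/*
 * 2b) !SCHED_DEBUG kernels: the bitmask is a compile-time constant built
 * from the defaults, so "if (sched_feat(OWNER_SPIN))" folds away and adds
 * no runtime overhead to the fastpath.
 */
#define SCHED_FEAT(name, enabled)	((enabled) << __SCHED_FEAT_##name) |
static const unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;
#undef SCHED_FEAT
#endif

#define sched_feat(x)	(sysctl_sched_features & (1U << __SCHED_FEAT_##x))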
On Wed, 14 Jan 2009 22:14:58 +0100 Ingo Molnar <mingo@elte.hu> wrote:

>
> * Andrew Morton <akpm@linux-foundation.org> wrote:
>
> > On Wed, 14 Jan 2009 21:51:22 +0100
> > Ingo Molnar <mingo@elte.hu> wrote:
> >
> > >
> > > * Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > > > > Do people enable CONFIG_SCHED_DEBUG?
> > > > >
> > > > > If they suspect performance problems and want to analyze them?
> > > >
> > > > The vast majority of users do not and usually cannot compile their own
> > > > kernels.
> > >
> > > ... which they derive from distro kernels or some old .config they always
> > > used, via 'make oldconfig'. You are arguing against well-established facts
> > > here.
> > >
> > > If you dont believe my word for it, here's an analysis of all kernel
> > > configs posted to lkml in the past 8 months:
> > >
> > >   $ grep ^CONFIG_SCHED_DEBUG linux-kernel | wc -l
> > >   424
> > >
> > >   $ grep 'CONFIG_SCHED_DEBUG is not' linux-kernel | wc -l
> > >   109
> > >
> > > i.e. CONFIG_SCHED_DEBUG=y is set in 80% of the configs. A large majority
> > > of testers has it enabled and /sys/debug/sched_features was always a good
> > > mechanism that we used for runtime toggles.
> >
> > You just disproved your own case :(
>
> how so? 80% is not enough?

No.  It really depends on what distros do.

> I also checked Fedora and it has SCHED_DEBUG=y
> in its kernel rpms.

If all distros set SCHED_DEBUG=y then fine.  But if they do this then we
should do this at the kernel.org level, and make it a hard-to-turn-off
thing via CONFIG_EMBEDDED=y.

> note that there's also a performance issue here: we generally _dont want_
> a debug sysctl overhead in the mutex code or in any fastpath for that
> matter. So making it depend on SCHED_DEBUG is useful.
>
> sched_feat() features get optimized out at build time when SCHED_DEBUG is
> disabled. So it gives us the best of two worlds: the utility of sysctls in
> the SCHED_DEBUG=y, and they get compiled out in the !SCHED_DEBUG case.

I'm not detecting here a sufficient appreciation of the number of
sched-related regressions we've seen in recent years, nor of the
difficulty encountered in diagnosing and fixing them.  Let alone the
difficulty getting those fixes propagated out a *long* time after the
regression was added.

You're taking a whizzy new feature which drastically changes a critical
core kernel feature and jamming it into mainline with a vestigial
amount of testing coverage without giving sufficient care and thought
to the practical lessons which we have learned from doing this in the
past.

This is a highly risky change.  It's not that the probability of
failure is high - the problem is that the *cost* of the improbable
failure is high.  We should seek to minimize that cost.
* Ingo Molnar <mingo@elte.hu> wrote:

> > You just disproved your own case :(
>
> how so? 80% is not enough? I also checked Fedora and it has
> SCHED_DEBUG=y in its kernel rpms.

Ubuntu has CONFIG_SCHED_DEBUG=y as well in their kernels.

> note that there's also a performance issue here: we generally _dont
> want_ a debug sysctl overhead in the mutex code or in any fastpath for
> that matter. So making it depend on SCHED_DEBUG is useful.
>
> sched_feat() features get optimized out at build time when SCHED_DEBUG
> is disabled. So it gives us the best of two worlds: the utility of
> sysctls in the SCHED_DEBUG=y, and they get compiled out in the
> !SCHED_DEBUG case.

There's a third issue as well: the toggle _is_ intentionally debug-only,
while sysctls are non-debug and we _really_ dont want feature asymmetry
like that. It will just splinter the application space: if there's
significant performance variance then apps will just go the path of least
resistance: instead of debugging the performance problems properly, the
first group of applications will be tuned for the sysctl_mutex_spin=0
case, the second group of applications will be tuned for the
sysctl_mutex_spin=1 case. And if an enterprise distro decided to flip the
default around we'd have a real tuning mess.

/sys/debug/sched_features strikes the right kind of balance IMO: it's not
always available and it's explicitly in debugfs with no stable ABI so apps
cannot standardize on being able to tweak it, but it's convenient enough
in practice for developers to depend on it, performance analysis is easy,
etc., etc.

	Ingo
On Wed, Jan 14, 2009 at 22:41, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Ingo Molnar <mingo@elte.hu> wrote:
>
>> > You just disproved your own case :(
>>
>> how so? 80% is not enough? I also checked Fedora and it has
>> SCHED_DEBUG=y in its kernel rpms.
>
> Ubuntu has CONFIG_SCHED_DEBUG=y as well in their kernels.

$ cat /etc/SuSE-release
openSUSE 11.1 (x86_64)
VERSION = 11.1

$ uname -a
Linux nga 2.6.27.7-9-default #1 SMP 2008-12-04 18:10:04 +0100 x86_64
x86_64 x86_64 GNU/Linux

$ zgrep SCHED_DEBUG /proc/config.gz
CONFIG_SCHED_DEBUG=y

$ zgrep DEBUG_MUTEX /proc/config.gz
# CONFIG_DEBUG_MUTEXES is not set

Kay
* Kay Sievers <kay.sievers@vrfy.org> wrote:

> On Wed, Jan 14, 2009 at 22:41, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > * Ingo Molnar <mingo@elte.hu> wrote:
> >
> >> > You just disproved your own case :(
> >>
> >> how so? 80% is not enough? I also checked Fedora and it has
> >> SCHED_DEBUG=y in its kernel rpms.
> >
> > Ubuntu has CONFIG_SCHED_DEBUG=y as well in their kernels.
>
> $ cat /etc/SuSE-release
> openSUSE 11.1 (x86_64)
> VERSION = 11.1
>
> $ uname -a
> Linux nga 2.6.27.7-9-default #1 SMP 2008-12-04 18:10:04 +0100 x86_64
> x86_64 x86_64 GNU/Linux
>
> $ zgrep SCHED_DEBUG /proc/config.gz
> CONFIG_SCHED_DEBUG=y
>
> $ zgrep DEBUG_MUTEX /proc/config.gz
> # CONFIG_DEBUG_MUTEXES is not set

Fedora has mutex debugging disabled too:

  # CONFIG_DEBUG_MUTEXES is not set

So the 3 main Linux distros, generating ~95% of the kerneloops.org
feedback traffic, all have SCHED_DEBUG=y, and at least two have
!DEBUG_MUTEXES. (possibly Ubuntu has it disabled too)

	Ingo
* Andrew Morton <akpm@linux-foundation.org> wrote:

> > I also checked Fedora and it has SCHED_DEBUG=y
> > in its kernel rpms.
>
> If all distros set SCHED_DEBUG=y then fine.

95% of the distros and a significant majority of the lkml traffic.

And no, we generally dont provide knobs for essential performance features
of core Linux kernel primitives - so the existence of OWNER_SPIN in
/sys/debug/sched_features is an exception already.

We dont have any knob to switch ticket spinlocks to old-style spinlocks.

We dont have any knob to switch the page allocator from LIFO to FIFO.

We dont have any knob to turn off the coalescing of vmas in the MM.

We dont have any knob to turn the mmap_sem from an rwsem to a mutex to a
spinlock.

Why? Because such design and implementation details are what make Linux
Linux, and we stand by those decisions for better or worse. And we do try
to eliminate as many 'worse' situations as possible, but we dont provide
knobs galore. We offer flexibility in our willingness to fix any genuine
performance issues in our source code.

The thing is that apps tend to gravitate towards solutions with the least
short-term cost. If a super important enterprise app can solve their
performance problem by either redesigning their broken code, or by turning
off a feature we have in the kernel in their install scripts (which we
made so easy to tune via a stable sysctl), guess which variant they will
choose? Even if they hurt all other apps in the process.

> > note that there's also a performance issue here: we generally _dont
> > want_ a debug sysctl overhead in the mutex code or in any fastpath for
> > that matter. So making it depend on SCHED_DEBUG is useful.
> >
> > sched_feat() features get optimized out at build time when SCHED_DEBUG
> > is disabled. So it gives us the best of two worlds: the utility of
> > sysctls in the SCHED_DEBUG=y, and they get compiled out in the
> > !SCHED_DEBUG case.
>
> I'm not detecting here a sufficient appreciation of the number of
> sched-related regressions we've seen in recent years, nor of the
> difficulty encountered in diagnosing and fixing them.  Let alone the
> difficulty getting those fixes propagated out a *long* time after the
> regression was added.

The bugzilla you just dug out in another thread does not seem to apply, so
i'm not sure what you are referring to.

Regarding historic tendencies, we have numbers like:

                    [v2.6.14]    [v2.6.29]

                   Semaphores  |  Mutexes
    ----------------------------------------------
                               |  no-spin     spin
                               |
    [tmpfs] ops/sec:   50713   |   291038   392865   (+34.9%)
    [ext3]  ops/sec:   45214   |   283291   435674   (+53.7%)

A 10x performance improvement on ext3, compared to 2.6.14. I'm sure there
will be other numbers that go down - but the thing is, we've _never_ been
good at finding the worst-possible workload cases during development.

> You're taking a whizzy new feature which drastically changes a critical
> core kernel feature and jamming it into mainline with a vestigial amount
> of testing coverage without giving sufficient care and thought to the
> practical lessons which we have learned from doing this in the past.

If you look at the whole existence of /sys/debug/sched_features you'll see
how careful we've been about performance regressions. We made it a
sched_feat() exactly because if a change goes wrong and becomes a step
backwards then it's a oneliner to turn it default-off.
We made use of that facility in the past and we have a number of debug
knobs there right now:

 # cat /debug/sched_features
 NEW_FAIR_SLEEPERS NORMALIZED_SLEEPER WAKEUP_PREEMPT START_DEBIT
 AFFINE_WAKEUPS CACHE_HOT_BUDDY SYNC_WAKEUPS NO_HRTICK NO_DOUBLE_TICK
 ASYM_GRAN LB_BIAS LB_WAKEUP_UPDATE ASYM_EFF_LOAD NO_WAKEUP_OVERLAP
 LAST_BUDDY OWNER_SPIN

All of those ~16 scheduler knobs were done out of caution, to make sure
that if we change some scheduling aspect there's a convenient way to debug
performance or interactivity regressions, without forcing people into
bisection and/or reboots, etc.

> This is a highly risky change.  It's not that the probability of failure
> is high - the problem is that the *cost* of the improbable failure is
> high.  We should seek to minimize that cost.

It never mattered much to the efficiency of finding performance
regressions whether a feature sat tight for 4 kernel releases in -mm or
went upstream in a week.

It _does_ matter to stability - but not significantly to performance. What
matters most to getting performance right is testing exposure and
variance, not length of the integration period.

Easy revertability helps too - and that is a given here - it's literally a
oneliner to disable it. See that oneliner below.

	Ingo

Index: linux/kernel/sched_features.h
===================================================================
--- linux.orig/kernel/sched_features.h
+++ linux/kernel/sched_features.h
@@ -13,4 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 SCHED_FEAT(LAST_BUDDY, 1)
-SCHED_FEAT(OWNER_SPIN, 1)
+SCHED_FEAT(OWNER_SPIN, 0)
Nick Piggin
2009-Jan-15 00:46 UTC
Re: [PATCH -v11][RFC] mutex: implement adaptive spinning
On Wed, Jan 14, 2009 at 06:22:36PM +0100, Peter Zijlstra wrote:

> On Wed, 2009-01-14 at 18:18 +0100, Nick Piggin wrote:
>
> > > @@ -173,21 +237,21 @@ __mutex_lock_common(struct mutex *lock,
> > >  			spin_unlock_mutex(&lock->wait_lock, flags);
> > >
> > >  			debug_mutex_free_waiter(&waiter);
> > > +			preempt_enable();
> > >  			return -EINTR;
> > >  		}
> > >  		__set_task_state(task, state);
> > >
> > >  		/* didnt get the lock, go to sleep: */
> > >  		spin_unlock_mutex(&lock->wait_lock, flags);
> > > -		schedule();
> > > +		__schedule();
> >
> > Why does this need to do a preempt-disabled schedule? After we schedule
> > away, the next task can do arbitrary things or reschedule itself, so if
> > we have not anticipated such a condition here, then I can't see what
> > __schedule protects. At least a comment is in order?
>
> From:
> http://programming.kicks-ass.net/kernel-patches/mutex-adaptive-spin/mutex-preempt.patch
>
> Subject: mutex: preemption fixes
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Wed Jan 14 15:36:26 CET 2009
>
> The problem is that dropping the spinlock right before schedule is a voluntary
> preemption point and can cause a schedule, right after which we schedule again.
>
> Fix this inefficiency by keeping preemption disabled until we schedule, do this
> by explicitly disabling preemption and providing a schedule() variant that
> assumes preemption is already disabled.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>
> > Pity to add the call overhead to schedule just for this case.
>
> Good point, seeing any way around that?

Hmm, well this is rather a slow path, I would say. I'd prefer not to
modify schedule in this way (if we just get scheduled back on after being
switched away, the subsequent call to schedule is going to be cache hot
and not do too much work).

preempt_enable_no_resched() maybe, if you really care, would close up the
window even more. But is it really worthwhile? We'd want to see numbers
(when in doubt, keep it simpler).

> > BTW. __schedule shouldn't need to be asmlinkage?
>
> TBH I've no clue, probably not, Ingo?
On Wed, Jan 14, 2009 at 07:23:12PM +0200, Avi Kivity wrote:
> Nick Piggin wrote:
> >> (no they're not, Nick's ticket locks still spin on a shared cacheline
> >> IIRC -- the MCS locks mentioned could fix this)
> >
> > It reminds me. I wrote a basic variation of MCS spinlocks a while back. And
> > converted dcache lock to use it, which showed large dbench improvements on
> > a big machine (of course for different reasons than the dbench improvements
> > in this thread).
> >
> > http://lkml.org/lkml/2008/8/28/24
> >
> > Each "lock" object is sane in size because a given set of spin-local
> > queues may only be used once per lock stack. But any spinlocks within a
> > mutex acquisition will always be at the bottom of such a stack anyway,
> > by definition.
> >
> > If you can use any code or concept for your code, that would be great.
>
> Does it make sense to replace 'nest' with a per-cpu counter that's
> incremented on each lock?  I guess you'd have to search for the value of
> nest on unlock, but it would be a very short search (typically length 1, 2
> if lock sorting is used to avoid deadlocks).
>
> I think you'd need to make the lock store the actual node pointer, not
> the cpu number, since the values of nest would be different on each cpu.
>
> That would allow you to replace spinlocks with mcs_locks wholesale.

nest could get quite large (and basically is unbounded), though. I think
there would definitely be variations (NMCS locks is interesting). But OTOH
I think that for _most_ spinlocks, optimising for the spinning case is
wrong.

For some really tricky global spinlocks, yes I think MCS is a good idea.
But moreso as a stopgap until hopefully more scalable algorithms are
written.
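[ For context on the MCS locks being discussed: the sketch below is the
  textbook MCS queue lock, not Nick's actual patch. Each waiter spins on a
  node it owns, so the only shared-cacheline traffic is the handoff
  itself; memory barriers are omitted for brevity. ]

struct mcs_node {
	struct mcs_node *next;
	int locked;
};

/* *lock points to the tail of the waiter queue, or NULL when free */
static void mcs_lock(struct mcs_node **lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	node->next = NULL;
	node->locked = 0;

	prev = xchg(lock, node);	/* enqueue ourselves at the tail */
	if (!prev)
		return;			/* lock was free, we own it now */

	prev->next = node;
	while (!node->locked)		/* spin on our *own* cacheline */
		cpu_relax();
}

static void mcs_unlock(struct mcs_node **lock, struct mcs_node *node)
{
	if (!node->next) {
		/* nobody visibly queued behind us: try to release outright */
		if (cmpxchg(lock, node, NULL) == node)
			return;
		/* a waiter is mid-enqueue, wait for it to link itself in */
		while (!node->next)
			cpu_relax();
	}
	node->next->locked = 1;		/* hand the lock to the next waiter */
}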
On Wed, Jan 14, 2009 at 01:35:29PM -0800, Andrew Morton wrote:

> You're taking a whizzy new feature which drastically changes a critical
> core kernel feature and jamming it into mainline with a vestigial
> amount of testing coverage without giving sufficient care and thought
> to the practical lessons which we have learned from doing this in the
> past.
>
> This is a highly risky change.  It's not that the probability of
> failure is high - the problem is that the *cost* of the improbable
> failure is high.  We should seek to minimize that cost.

There is very little downside to waiting for at least the next release
cycle. What's the case for making an exception and merging it right now?

It actually still seems to be generating a lot of changes and discussion
right up until yesterday...
Peter Zijlstra
2009-Jan-15 07:44 UTC
Re: [PATCH -v11][RFC] mutex: implement adaptive spinning
On Thu, 2009-01-15 at 01:46 +0100, Nick Piggin wrote:

> Hmm, well this is rather a slow path, I would say. I'd prefer not to
> modify schedule in this way (if we just get scheduled back on after
> being switched away, the subsequent call to schedule is going to be
> cache hot and not do too much work).
>
> preempt_enable_no_resched() maybe, if you really care, would close up the
> window even more. But is it really worthwhile? We'd want to see numbers
> (when in doubt, keep it simpler).

I initially did the preempt_enable_no_resched() thing and that showed
some improvement for PREEMPT=y kernels (lost the numbers though).

When I redid all the patches I tried closing that last hole by doing
that __schedule() thing, never realizing that schedule() would then get
extra overhead,.. d'0h.

I agree that that isn't worth it. I shall revert to
preempt_enable_no_resched() and try to get some new numbers.
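[ For illustration, this is roughly what the sleep loop in
  __mutex_lock_common() looks like with that change - a sketch based on
  the patches discussed in this thread, not necessarily the exact code
  that was eventually merged. The whole slowpath runs with preemption
  disabled; re-enabling it without a preemption point right before
  schedule() avoids scheduling twice in a row. ]

	for (;;) {
		/* ... try to acquire the lock, bail out on signals ... */

		__set_task_state(task, state);

		/* didn't get the lock, go to sleep: */
		spin_unlock_mutex(&lock->wait_lock, flags);
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
		spin_lock_mutex(&lock->wait_lock, flags);
	}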
Nick Piggin
2009-Jan-15 07:52 UTC
Re: [PATCH -v11][RFC] mutex: implement adaptive spinning
On Thu, Jan 15, 2009 at 08:44:03AM +0100, Peter Zijlstra wrote:
> On Thu, 2009-01-15 at 01:46 +0100, Nick Piggin wrote:
>
> > Hmm, well this is rather a slow path, I would say. I'd prefer not to
> > modify schedule in this way (if we just get scheduled back on after
> > being switched away, the subsequent call to schedule is going to be
> > cache hot and not do too much work).
> >
> > preempt_enable_no_resched() maybe, if you really care, would close up the
> > window even more. But is it really worthwhile? We'd want to see numbers
> > (when in doubt, keep it simpler).
>
> I initially did the preempt_enable_no_resched() thing and that showed
> some improvement for PREEMPT=y kernels (lost the numbers though).
>
> When I redid all the patches I tried closing that last hole by doing
> that __schedule() thing, never realizing that schedule() would then get
> extra overhead,.. d'0h.
>
> I agree that that isn't worth it. I shall revert to
> preempt_enable_no_resched() and try to get some new numbers.

Thanks!
mutex_lock() sets the lock owner, no need to set it upfront in
__mutex_lock_common().

Inside __mutex_lock_common() we can cope with the case where the
successful acquirer got preempted by us before setting the owner
field: there is an explicit check in the spinning code and the
sleeping part doesn't rely on it.

The debug code does owner checks only on unlock where the field is
guaranteed to be set.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/mutex.c |    2 --
 1 file changed, 2 deletions(-)

Just a small observation.  Peter said it wouldn't matter much as the
write is to a hot cache line.  But otoh, why keep it if it's not
necessary. :)

--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -195,7 +195,6 @@ __mutex_lock_common(struct mutex *lock,
 
 		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
 			lock_acquired(&lock->dep_map, ip);
-			mutex_set_owner(lock);
 			preempt_enable();
 			return 0;
 		}
@@ -263,7 +262,6 @@ done:
 	lock_acquired(&lock->dep_map, ip);
 	/* got the lock - rejoice! */
 	mutex_remove_waiter(lock, &waiter, current_thread_info());
-	mutex_set_owner(lock);
 
 	/* set it to 0 if there are no waiters left: */
 	if (likely(list_empty(&lock->wait_list)))
Johannes Weiner
2009-Jan-15 08:56 UTC
Re: [PATCH] mutex: set owner only once on acquisition
On Thu, Jan 15, 2009 at 09:41:01AM +0100, Johannes Weiner wrote:
> mutex_lock() sets the lock owner, no need to set it upfront in
> __mutex_lock_common().
>
> Inside __mutex_lock_common() we can cope with the case where the
> successful acquirer got preempted by us before setting the owner
> field: there is an explicit check in the spinning code and the
> sleeping part doesn't rely on it.
>
> The debug code does owner checks only on unlock where the field is
> guaranteed to be set.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  kernel/mutex.c |    2 --
>  1 file changed, 2 deletions(-)
>
> Just a small observation.  Peter said it wouldn't matter much as the
> write is to a hot cache line.  But otoh, why keep it if it's not
> necessary. :)

Damn, I'm really async here, sorry Peter.  Just noticed you already
picked it up.

	Hannes
* Chris Mason <chris.mason@oracle.com> wrote:

> On Wed, 2009-01-14 at 19:33 +0100, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > Full series, including changelogs available at:
> > >
> > >   http://programming.kicks-ass.net/kernel-patches/mutex-adaptive-spin/
> > >
> > > and should shortly appear in a git tree near Ingo :-)
> >
> > Linus,
> >
> > Please pull the adaptive-mutexes-for-linus git tree from:
> >
> >   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git adaptive-mutexes-for-linus
>
> I was going to put this into the btrfs tree, but since you have a branch
> just for adaptive mutexes, is it easier to put there?
>
> From: Chris Mason <chris.mason@oracle.com>
>
> Btrfs: stop spinning on mutex_trylock and let the adaptive code spin for us
>
> Mutexes now spin internally and the btrfs spin is no longer required for
> performance.
>
> Signed-off-by: Chris Mason <chris.mason@oracle.com>

applied it to tip/core/locking, thanks Chris!

	Ingo
> > >> > You just disproved your own case :(
> > >>
> > >> how so? 80% is not enough? I also checked Fedora and it has
> > >> SCHED_DEBUG=y in its kernel rpms.
> > >
> > > Ubuntu has CONFIG_SCHED_DEBUG=y as well in their kernels.

Debian too:

mauer:~/bin# grep CONFIG_SCHED_DEBUG /boot/config-2.6.2*
/boot/config-2.6.24-1-amd64:# CONFIG_SCHED_DEBUG is not set
/boot/config-2.6.25-2-amd64:CONFIG_SCHED_DEBUG=y
/boot/config-2.6.26-1-amd64:CONFIG_SCHED_DEBUG=y

> > $ zgrep DEBUG_MUTEX /proc/config.gz
> > # CONFIG_DEBUG_MUTEXES is not set

mauer:~/bin# grep DEBUG_MUTEX /boot/config-2.6.2*
/boot/config-2.6.22-3-amd64:# CONFIG_DEBUG_MUTEXES is not set
/boot/config-2.6.24-1-amd64:# CONFIG_DEBUG_MUTEXES is not set
/boot/config-2.6.25-2-amd64:# CONFIG_DEBUG_MUTEXES is not set
/boot/config-2.6.26-1-amd64:# CONFIG_DEBUG_MUTEXES is not set


Folkert van Heusden
On Thu, 15 Jan 2009 9:34:08 am Ingo Molnar wrote:

> So the 3 main Linux distros, generating ~95% of the kerneloops.org
> feedback traffic, all have SCHED_DEBUG=y, and at least two have
> !DEBUG_MUTEXES. (possibly Ubuntu has it disabled too)

That looks correct for Ubuntu:

chris@mythtv:~$ grep CONFIG_DEBUG_MUTEXES /boot/config-2.6.2*
/boot/config-2.6.24-21-generic:# CONFIG_DEBUG_MUTEXES is not set
/boot/config-2.6.27-7-generic:# CONFIG_DEBUG_MUTEXES is not set
/boot/config-2.6.27-9-generic:# CONFIG_DEBUG_MUTEXES is not set

-- 
 Chris Samuel  :  http://www.csamuel.org/  :  Melbourne, VIC
On Wed, Jan 14, 2009 at 08:28:11PM +0100, Ingo Molnar wrote:
>                     [v2.6.14]    [v2.6.29]
>
>                    Semaphores  |  Mutexes
>     ----------------------------------------------
>                                |  no-spin     spin
>                                |
>     [tmpfs] ops/sec:   50713   |   291038   392865   (+34.9%)
>     [ext3]  ops/sec:   45214   |   283291   435674   (+53.7%)
>
> A 10x macro-performance improvement on ext3, compared to 2.6.14 :-)
>
> While lots of other details got changed meanwhile, i'm sure most of the
> performance win on this particular VFS workload comes from mutexes.

I asked a couple of our benchmarking teams to try -v9.  Neither the OLTP
benchmark, nor the kernel-perf test suite found any significant
performance change.  I suspect mutex contention isn't a significant
problem for most workloads.

Has anyone found a non-synthetic benchmark where this makes a
significant difference?  Aside from btrfs, I mean.

-- 
Matthew Wilcox			Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."
On Thu, 15 Jan 2009, Matthew Wilcox wrote:
>
> I asked a couple of our benchmarking teams to try -v9.  Neither the OLTP
> benchmark, nor the kernel-perf test suite found any significant
> performance change.  I suspect mutex contention isn't a significant
> problem for most workloads.

We've been very good about spreading out mutexes. The normal kernel really
only does it for write accesses to the same directory, and readdir().
Almost everything else is totally in the noise.

I think Ingo's benchmark essentially tests exactly that "write to same
directory" case, and little else. Unless:

> Has anyone found a non-synthetic benchmark where this makes a
> significant difference?  Aside from btrfs, I mean.

Yea, if you have some particular filesystem (or other subsystem) that uses
a global mutex, you'll obviously see way more contention. Btrfs may not be
_unique_ in this regard, but it's definitely doing something that isn't
good.

Btw, it's doing something that ext3 also used to do iirc, until we fixed
it to use spinlocks instead (the block group lock in particular).

Yeah - just double-checked. Commit c12b9866ea52 in the historical Linux
archive, from 2003. Which made block allocation protected by a per-group
spinlock, rather than lock_super().

		Linus
* Matthew Wilcox <matthew@wil.cx> wrote:

> On Wed, Jan 14, 2009 at 08:28:11PM +0100, Ingo Molnar wrote:
> >                     [v2.6.14]    [v2.6.29]
> >
> >                    Semaphores  |  Mutexes
> >     ----------------------------------------------
> >                                |  no-spin     spin
> >                                |
> >     [tmpfs] ops/sec:   50713   |   291038   392865   (+34.9%)
> >     [ext3]  ops/sec:   45214   |   283291   435674   (+53.7%)
> >
> > A 10x macro-performance improvement on ext3, compared to 2.6.14 :-)
> >
> > While lots of other details got changed meanwhile, i'm sure most of
> > the performance win on this particular VFS workload comes from
> > mutexes.
>
> I asked a couple of our benchmarking teams to try -v9.  Neither the OLTP
> benchmark, nor the kernel-perf test suite found any significant
> performance change.  I suspect mutex contention isn't a significant
> problem for most workloads.

basically only VFS is mutex-bound really, and few of the 'benchmarks' tend
to be VFS intense. Maybe things like mail-server benchmarks would do that.

Also, -v9 is like two days old code ;-) Old and crufty. The real
performance uptick was not even in -v10 but in -v11 (the one we submitted
in this thread).

	Ingo
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> > Has anyone found a non-synthetic benchmark where this makes a
> > significant difference?  Aside from btrfs, I mean.
>
> Yea, if you have some particular filesystem (or other subsystem) that
> uses a global mutex, you'll obviously see way more contention. Btrfs may
> not be _unique_ in this regard, but it's definitely doing something that
> isn't good.
>
> Btw, it's doing something that ext3 also used to do iirc, until we fixed
> it to use spinlocks instead (the block group lock in particular).
>
> Yeah - just double-checked. Commit c12b9866ea52 in the historical Linux
> archive, from 2003. Which made block allocation protected by a per-group
> spinlock, rather than lock_super().

btw., i think spin-mutexes have a design advantage here: in a lot of code
areas it's quite difficult to use spinlocks - cannot allocate memory,
cannot call any code that can sporadically block (but does not _normally_
block), etc.

With mutexes those atomicity constraints go away - and the performance
profile should now be quite close to that of spinlocks as well.

	Ingo
On Thu, 15 Jan 2009, Ingo Molnar wrote:
>
> btw., i think spin-mutexes have a design advantage here: in a lot of code
> areas it's quite difficult to use spinlocks - cannot allocate memory,
> cannot call any code that can sporadically block (but does not _normally_
> block), etc.
>
> With mutexes those atomicity constraints go away - and the performance
> profile should now be quite close to that of spinlocks as well.

Umm. Except if you wrote the code nicely and used spinlocks, you wouldn't
hold the lock over all those unnecessary and complex operations.

IOW, if you do pre-allocation instead of holding a lock over the
allocation, you win. So yes, spin-mutexes makes it easier to write the
code, but it also makes it easier to just plain be lazy.

		Linus
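[ A minimal sketch of the pre-allocation pattern Linus describes, with
  hypothetical structure and field names rather than code from any real
  subsystem: the allocation happens while no lock is held, so the
  critical section never sleeps and a plain spinlock is enough. ]

	/* may sleep; no lock held yet */
	struct item *new = kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	spin_lock(&cache->lock);
	if (!cache->slot) {
		cache->slot = new;	/* install our pre-allocated object */
		new = NULL;
	}
	spin_unlock(&cache->lock);

	kfree(new);	/* lost the race or slot was full; kfree(NULL) is a no-op */
	return 0;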
On Thu, 2009-01-15 at 10:16 -0800, Linus Torvalds wrote:
>
> On Thu, 15 Jan 2009, Ingo Molnar wrote:
> >
> > btw., i think spin-mutexes have a design advantage here: in a lot of code
> > areas it's quite difficult to use spinlocks - cannot allocate memory,
> > cannot call any code that can sporadically block (but does not _normally_
> > block), etc.
> >
> > With mutexes those atomicity constraints go away - and the performance
> > profile should now be quite close to that of spinlocks as well.
>
> Umm. Except if you wrote the code nicely and used spinlocks, you wouldn't
> hold the lock over all those unnecessary and complex operations.

While this is true, there are examples of places we should expect
speedups for this today.

Concurrent file creation/deletion in a single dir will often find things
hot in cache and not have to block anywhere (mail spools).

Concurrent O_DIRECT aio writes to the same file, where i_mutex is
dropped early on.

pipes should see a huge improvement.

I'll kick off some runs of my three benchmarks on ext3 for comparison.
If there are things less synthetic people would like to see, please let
me know.

-chris
On Thu, 15 Jan 2009, Chris Mason wrote:

> On Thu, 2009-01-15 at 10:16 -0800, Linus Torvalds wrote:
> >
> > Umm. Except if you wrote the code nicely and used spinlocks, you wouldn't
> > hold the lock over all those unnecessary and complex operations.
>
> While this is true, there are examples of places we should expect
> speedups for this today.

Sure. There are cases where we do have to use sleeping things, because the
code is generic and really can't control what lower levels do, and those
lower levels have to be able to sleep.

So:

> Concurrent file creation/deletion in a single dir will often find things
> hot in cache and not have to block anywhere (mail spools).

The inode->i_mutex thing really does need to use a mutex, and spinning
will help. Of course, it should only help when you really have lots of
concurrent create/delete/readdir in the same directory, and that hopefully
is a very rare load in real life, but hey, it's a valid one.

> Concurrent O_DIRECT aio writes to the same file, where i_mutex is
> dropped early on.

Won't the actual IO costs generally dominate in everything but trivial
benchmarks?

> pipes should see a huge improvement.

Hmm. Pipes may be interesting, but on the other hand, the cases that would
see huge improvements would tend to be the cases where the biggest
performance gain is from running both sides on the same CPU.

The only case where a pipe gets really contended is when both producer and
consumer basically do nothing with the data, so the biggest cost is the
copy in kernel space (read: pure benchmarking, no real load), and then you
often get better performance by scheduling on a single CPU due to cache
effects and no lock bouncing.

		Linus
On Thu, 2009-01-15 at 12:13 -0800, Linus Torvalds wrote:
>
> On Thu, 15 Jan 2009, Chris Mason wrote:
>
> > On Thu, 2009-01-15 at 10:16 -0800, Linus Torvalds wrote:
> > >
> > > Umm. Except if you wrote the code nicely and used spinlocks, you wouldn't
> > > hold the lock over all those unnecessary and complex operations.
> >
> > While this is true, there are examples of places we should expect
> > speedups for this today.
>
> Sure. There are cases where we do have to use sleeping things, because the
> code is generic and really can't control what lower levels do, and those
> lower levels have to be able to sleep.
>
> So:
>
> > Concurrent file creation/deletion in a single dir will often find things
> > hot in cache and not have to block anywhere (mail spools).
>
> The inode->i_mutex thing really does need to use a mutex, and spinning
> will help. Of course, it should only help when you really have lots of
> concurrent create/delete/readdir in the same directory, and that hopefully
> is a very rare load in real life, but hey, it's a valid one.

Mail server deletes is the common one I can think of.  Mail server creates
end up bound by fsync in my testing here, so it doesn't quite show up
unless your IO subsystem is much faster than your kernel.

> > Concurrent O_DIRECT aio writes to the same file, where i_mutex is
> > dropped early on.
>
> Won't the actual IO costs generally dominate in everything but trivial
> benchmarks?

Benchmarks in the past on large IO rigs have shown it.  It isn't something
you're going to see on a regular sized server, but for AIO + O_DIRECT, the
pieces that hold i_mutex can be seen in real life.

[ re: pipes, ok I don't know of realistic pipe benchmarks but I'll run
  them if people can suggest one ]

I ran a mailserver delivery simulation (fs_mark), and then my standard
dbench/4k creates/4k stats on ext3 and ext4.

ext3 had very similar scores with spinning on and off.  ext4 scored the
same on mailserver delivery (fsync bound apparently).  For the others:

dbench no spin:  1970 MB/s
dbench spin:     3036 MB/s

(4k creates in files/sec, higher is better)

4k creates no spin:  avg 418.91 median 346.12 std 306.37 high 2025.80 low 338.11
4k creates spin:     avg 406.09 median 344.27 std 179.95 high 1249.78 low 336.10

Somehow the spinning mutex made ext4 more fair.  The 4k create test had 50
procs doing file creates in parallel, but each proc was operating in its
own directory.  So, they shouldn't actually be competing for any of the
VFS mutexes.  The stat run was the same patched/unpatched, probably for
the same reason.

None of this is very earth shattering, apparently my canned benchmarks for
btrfs pain points don't apply that well to ext3/4.

-chris
* Chris Mason <chris.mason@oracle.com> wrote:

> [ re: pipes, ok I don't know of realistic pipe benchmarks but I'll run
>   them if people can suggest one ]

Threaded servers making heavy use of sys_splice() ought to hit the pipe
mutex all the time.

	Ingo
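[ A rough userspace sketch of the scenario Ingo means, with illustrative
  names and no error handling: one thread splices data from a socket into
  a pipe while another splices it out to a file. In kernels of this era
  both directions of splice() on a pipe serialize on that pipe's mutex, so
  a threaded server doing this constantly exercises exactly that lock. ]

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

#define CHUNK (64 * 1024)

/* producer thread: socket -> pipe (takes the pipe's mutex on every call) */
static void fill_pipe(int sock_fd, int pipe_wr)
{
	ssize_t n;

	do {
		n = splice(sock_fd, NULL, pipe_wr, NULL, CHUNK,
			   SPLICE_F_MOVE | SPLICE_F_MORE);
	} while (n > 0);
}

/* consumer thread: pipe -> file (contends on the same pipe mutex) */
static void drain_pipe(int pipe_rd, int file_fd)
{
	ssize_t n;

	do {
		n = splice(pipe_rd, NULL, file_fd, NULL, CHUNK,
			   SPLICE_F_MOVE);
	} while (n > 0);
}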
On Thu, Jan 15, 2009 at 10:16:53AM -0800, Linus Torvalds wrote:
>
> On Thu, 15 Jan 2009, Ingo Molnar wrote:
> >
> > btw., i think spin-mutexes have a design advantage here: in a lot of code
> > areas it's quite difficult to use spinlocks - cannot allocate memory,
> > cannot call any code that can sporadically block (but does not _normally_
> > block), etc.
> >
> > With mutexes those atomicity constraints go away - and the performance
> > profile should now be quite close to that of spinlocks as well.
>
> Umm. Except if you wrote the code nicely and used spinlocks, you wouldn't
> hold the lock over all those unnecessary and complex operations.
>
> IOW, if you do pre-allocation instead of holding a lock over the
> allocation, you win. So yes, spin-mutexes makes it easier to write the
> code, but it also makes it easier to just plain be lazy.

In infrequently invoked code such as some error handling, lazy/simple
can be a big win.

							Thanx, Paul
On Thu, 15 Jan 2009, Paul E. McKenney wrote:

> On Thu, Jan 15, 2009 at 10:16:53AM -0800, Linus Torvalds wrote:
> >
> > IOW, if you do pre-allocation instead of holding a lock over the
> > allocation, you win. So yes, spin-mutexes makes it easier to write the
> > code, but it also makes it easier to just plain be lazy.
>
> In infrequently invoked code such as some error handling, lazy/simple
> can be a big win.

Sure. I don't disagree at all. On such code we don't even care about
locking. If it _really_ is fundamentally very rarely invoked.

But if we're talking things like core filesystem locks, it's _really_
irritating when one of those (supposedly rare) allocation delays or the
need to do IO then blocks all those (supposedly common) nice cached cases.

So I don't dispute at all that "mutex with spinning" performs better than
a mutex, but I _do_ claim that it has some potentially huge downsides
compared to a real spinlock. It may perform as well as a spinlock in the
nice common case, but then when you hit the non-common case you see the
difference between well-written code and badly written code.

And sadly, while allocations _usually_ are nice and immediate, and while
our caches _usually_ mean that we don't need to do IO, bad behavior when
we do need to do IO is what really kills interactive feel. Suddenly
everything else is hurting too, because they wanted that lock - even if
they didn't need to do IO or allocate anything.

		Linus
On Thu, Jan 15, 2009 at 05:01:32PM -0800, Linus Torvalds wrote:
> On Thu, 15 Jan 2009, Paul E. McKenney wrote:
> > On Thu, Jan 15, 2009 at 10:16:53AM -0800, Linus Torvalds wrote:
> > >
> > > IOW, if you do pre-allocation instead of holding a lock over the
> > > allocation, you win. So yes, spin-mutexes makes it easier to write the
> > > code, but it also makes it easier to just plain be lazy.
> >
> > In infrequently invoked code such as some error handling, lazy/simple
> > can be a big win.
>
> Sure. I don't disagree at all. On such code we don't even care about
> locking. If it _really_ is fundamentally very rarely invoked.
>
> But if we're talking things like core filesystem locks, it's _really_
> irritating when one of those (supposedly rare) allocation delays or the
> need to do IO then blocks all those (supposedly common) nice cached cases.

Certainly if there was one big mutex covering all the operations, it would
indeed be bad.  On the other hand, if the filesystem/cache was partitioned
(perhaps hashed) so that there was a large number of such locks, then it
should be OK.

Yes, I am making the perhaps naive assumption that hot spots such as the
root inode would be in the cache.  And that they would rarely collide with
allocation or I/O, which might also be naive.  But on this point I must
defer to the filesystem folks.

> So I don't dispute at all that "mutex with spinning" performs better than
> a mutex, but I _do_ claim that it has some potentially huge downsides
> compared to a real spinlock. It may perform as well as a spinlock in the
> nice common case, but then when you hit the non-common case you see the
> difference between well-written code and badly written code.
>
> And sadly, while allocations _usually_ are nice and immediate, and while
> our caches _usually_ mean that we don't need to do IO, bad behavior when
> we do need to do IO is what really kills interactive feel. Suddenly
> everything else is hurting too, because they wanted that lock - even if
> they didn't need to do IO or allocate anything.

I certainly agree that there are jobs that a spin-mutex is ill-suited for.

							Thanx, Paul
On Thu, Jan 15, 2009 at 10:16:53AM -0800, Linus Torvalds wrote:
>
> On Thu, 15 Jan 2009, Ingo Molnar wrote:
> >
> > btw., i think spin-mutexes have a design advantage here: in a lot of code
> > areas it's quite difficult to use spinlocks - cannot allocate memory,
> > cannot call any code that can sporadically block (but does not _normally_
> > block), etc.
> >
> > With mutexes those atomicity constraints go away - and the performance
> > profile should now be quite close to that of spinlocks as well.
>
> Umm. Except if you wrote the code nicely and used spinlocks, you wouldn't
> hold the lock over all those unnecessary and complex operations.
>
> IOW, if you do pre-allocation instead of holding a lock over the
> allocation, you win. So yes, spin-mutexes makes it easier to write the
> code, but it also makes it easier to just plain be lazy.

Yeah, I agree often it is harder to get the locking right but you end up
with a better result. With mutexes, on the off chance you do have to block
while holding the lock, performance and latency of other threads will
tank.
> I'll kick off some runs of my three benchmarks on ext3 for comparison.
> If there are things less synthetic people would like to see, please let
> me know.

What about a web-server test? Number of hits per second it can do?


Folkert van Heusden
> > I'll kick off some runs of my three benchmarks on ext3 for comparison.
> > If there are things less synthetic people would like to see, please let
> > me know.
>
> What about a web-server test? Number of hits per second it can do?

Quick hack:

http://vanheusden.com/tortureweb/tortureweb-0.1.tgz

To test multiple requesters, start this program multiple times in
parallel. There are probably better testers, but for a quick test it
might be sufficient.


Folkert van Heusden
> So I don't dispute at all that "mutex with spinning" performs better than
> a mutex, but I _do_ claim that it has some potentially huge downsides
> compared to a real spinlock. It may perform as well as a spinlock in the
> nice common case, but then when you hit the non-common case you see the
> difference between well-written code and badly written code.

Make it mount-point dependent. Then your mail-spool can use the spinlock
version and e.g. the /usr filesystem uses regular mutexes. Might be tricky
to implement I guess.


Folkert van Heusden
Chris Mason wrote:
> On Thu, 2009-01-15 at 10:16 -0800, Linus Torvalds wrote:
>> On Thu, 15 Jan 2009, Ingo Molnar wrote:
>>> btw., i think spin-mutexes have a design advantage here: in a lot of code
>>> areas it's quite difficult to use spinlocks - cannot allocate memory,
>>> cannot call any code that can sporadically block (but does not _normally_
>>> block), etc.
>>>
>>> With mutexes those atomicity constraints go away - and the performance
>>> profile should now be quite close to that of spinlocks as well.
>> Umm. Except if you wrote the code nicely and used spinlocks, you wouldn't
>> hold the lock over all those unnecessary and complex operations.
>>
> While this is true, there are examples of places we should expect
> speedups for this today.
>
> Concurrent file creation/deletion in a single dir will often find things
> hot in cache and not have to block anywhere (mail spools).
>
And although not as common, NNTP servers using file per article storage.

> Concurrent O_DIRECT aio writes to the same file, where i_mutex is
> dropped early on.
>
> pipes should see a huge improvement.
>
I'd like to see that. Didn't realize how slow pipes really are.

> I'll kick off some runs of my three benchmarks on ext3 for comparison.
> If there are things less synthetic people would like to see, please let
> me know.
>
> -chris

-- 
Bill Davidsen <davidsen@tmr.com>
  "We have more to fear from the bungling of the incompetent than from
the machinations of the wicked."  - from Slashdot