Wei, Gang
2010-Apr-20 05:39 UTC
[Xen-devel] [PATCH] CPUIDLE: shorten hpet spin_lock holding time
CPUIDLE: shorten hpet spin_lock holding time
Try to reduce spin_lock overhead for deep C state entry/exit. This will benefit
systems with a lot of cpus which need the hpet broadcast to wakeup from deep C
state.
Signed-off-by: Wei Gang <gang.wei@intel.com>
diff -r 7ee8bb40200a xen/arch/x86/hpet.c
--- a/xen/arch/x86/hpet.c Thu Apr 15 19:11:16 2010 +0100
+++ b/xen/arch/x86/hpet.c Fri Apr 16 15:05:28 2010 +0800
@@ -186,6 +186,9 @@ static void handle_hpet_broadcast(struct
again:
ch->next_event = STIME_MAX;
+
+ spin_unlock_irq(&ch->lock);
+
next_event = STIME_MAX;
mask = (cpumask_t)CPU_MASK_NONE;
now = NOW();
@@ -204,10 +207,14 @@ again:
if ( next_event != STIME_MAX )
{
- if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
+ spin_lock_irq(&ch->lock);
+
+ if ( next_event < ch->next_event &&
+ reprogram_hpet_evt_channel(ch, next_event, now, 0) )
goto again;
- }
- spin_unlock_irq(&ch->lock);
+
+ spin_unlock_irq(&ch->lock);
+ }
}
static void hpet_interrupt_handler(int irq, void *data,
@@ -656,10 +663,15 @@ void hpet_broadcast_enter(void)
BUG_ON( !ch );
ASSERT(!local_irq_is_enabled());
- spin_lock(&ch->lock);
if ( hpet_attach_channel )
+ {
+ spin_lock(&ch->lock);
+
hpet_attach_channel(cpu, ch);
+
+ spin_unlock(&ch->lock);
+ }
/* Cancel any outstanding LAPIC timer event and disable interrupts. */
reprogram_timer(0);
@@ -667,6 +679,8 @@ void hpet_broadcast_enter(void)
cpu_set(cpu, ch->cpumask);
+ spin_lock(&ch->lock);
+
/* reprogram if current cpu expire time is nearer */
if ( this_cpu(timer_deadline_end) < ch->next_event )
reprogram_hpet_evt_channel(ch, this_cpu(timer_deadline_end), NOW(), 1);
@@ -683,8 +697,6 @@ void hpet_broadcast_exit(void)
return;
BUG_ON( !ch );
-
- spin_lock_irq(&ch->lock);
if ( cpu_test_and_clear(cpu, ch->cpumask) )
{
@@ -693,14 +705,22 @@ void hpet_broadcast_exit(void)
if ( !reprogram_timer(this_cpu(timer_deadline_start)) )
raise_softirq(TIMER_SOFTIRQ);
+ spin_lock_irq(&ch->lock);
+
if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
+
+ spin_unlock_irq(&ch->lock);
}
if ( hpet_detach_channel )
+ {
+ spin_lock_irq(&ch->lock);
+
hpet_detach_channel(cpu);
- spin_unlock_irq(&ch->lock);
+ spin_unlock_irq(&ch->lock);
+ }
}
int hpet_broadcast_is_available(void)
Keir Fraser
2010-Apr-20 12:49 UTC
[Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
Is this a measurable win? The newer locking looks like it could be dodgy on
32-bit Xen: the 64-bit reads of timer_deadline_{start,end} will be non-atomic
and unsynchronised so you can read garbage. Even on 64-bit Xen you can read
stale values. I'll be surprised if you got a performance win from chopping up
critical regions in individual functions like that anyway.
-- Keir
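The tearing concern is easiest to see on a 32-bit build, where an aligned
64-bit load is performed as two 32-bit loads. The following is a minimal,
self-contained user-space sketch of the effect (not Xen code; the variable
merely stands in for timer_deadline_end); built with gcc -m32 -O2 -pthread it
should typically print a mixed value fairly quickly, while a 64-bit build
will not.

/* Standalone sketch of a torn 64-bit read on a 32-bit build; not Xen code.
 * The writer alternates between two 64-bit patterns whose halves differ;
 * a torn read shows up as a value neither store ever produced. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static volatile uint64_t deadline;        /* stands in for timer_deadline_end */
static volatile int stop;

static void *writer(void *arg)
{
    while ( !stop )
    {
        deadline = 0x00000000ffffffffULL; /* two 32-bit stores with -m32 */
        deadline = 0xffffffff00000000ULL;
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, writer, NULL);

    for ( long i = 0; i < 100000000L; i++ )
    {
        uint64_t v = deadline;            /* two 32-bit loads with -m32 */
        if ( v != 0x00000000ffffffffULL && v != 0xffffffff00000000ULL )
        {
            printf("torn read: %#llx\n", (unsigned long long)v);
            break;
        }
    }
    stop = 1;
    pthread_join(t, NULL);
    return 0;
}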
On 20/04/2010 06:39, "Wei, Gang" <gang.wei@intel.com> wrote:
> CPUIDLE: shorten hpet spin_lock holding time
>
> Try to reduce spin_lock overhead for deep C state entry/exit. This will
> benefit systems with a lot of cpus which need the hpet broadcast to wakeup
> from deep C state.
>
> Signed-off-by: Wei Gang <gang.wei@intel.com>
>
> diff -r 7ee8bb40200a xen/arch/x86/hpet.c
> --- a/xen/arch/x86/hpet.c Thu Apr 15 19:11:16 2010 +0100
> +++ b/xen/arch/x86/hpet.c Fri Apr 16 15:05:28 2010 +0800
> @@ -186,6 +186,9 @@ static void handle_hpet_broadcast(struct
>
> again:
> ch->next_event = STIME_MAX;
> +
> + spin_unlock_irq(&ch->lock);
> +
> next_event = STIME_MAX;
> mask = (cpumask_t)CPU_MASK_NONE;
> now = NOW();
> @@ -204,10 +207,14 @@ again:
>
> if ( next_event != STIME_MAX )
> {
> - if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
> + spin_lock_irq(&ch->lock);
> +
> + if ( next_event < ch->next_event &&
> + reprogram_hpet_evt_channel(ch, next_event, now, 0) )
> goto again;
> - }
> - spin_unlock_irq(&ch->lock);
> +
> + spin_unlock_irq(&ch->lock);
> + }
> }
>
> static void hpet_interrupt_handler(int irq, void *data,
> @@ -656,10 +663,15 @@ void hpet_broadcast_enter(void)
> BUG_ON( !ch );
>
> ASSERT(!local_irq_is_enabled());
> - spin_lock(&ch->lock);
>
> if ( hpet_attach_channel )
> + {
> + spin_lock(&ch->lock);
> +
> hpet_attach_channel(cpu, ch);
> +
> + spin_unlock(&ch->lock);
> + }
>
> /* Cancel any outstanding LAPIC timer event and disable interrupts. */
> reprogram_timer(0);
> @@ -667,6 +679,8 @@ void hpet_broadcast_enter(void)
>
> cpu_set(cpu, ch->cpumask);
>
> + spin_lock(&ch->lock);
> +
> /* reprogram if current cpu expire time is nearer */
> if ( this_cpu(timer_deadline_end) < ch->next_event )
>             reprogram_hpet_evt_channel(ch, this_cpu(timer_deadline_end), NOW(), 1);
> @@ -683,8 +697,6 @@ void hpet_broadcast_exit(void)
> return;
>
> BUG_ON( !ch );
> -
> - spin_lock_irq(&ch->lock);
>
> if ( cpu_test_and_clear(cpu, ch->cpumask) )
> {
> @@ -693,14 +705,22 @@ void hpet_broadcast_exit(void)
> if ( !reprogram_timer(this_cpu(timer_deadline_start)) )
> raise_softirq(TIMER_SOFTIRQ);
>
> + spin_lock_irq(&ch->lock);
> +
>         if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
> reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
> +
> + spin_unlock_irq(&ch->lock);
> }
>
> if ( hpet_detach_channel )
> + {
> + spin_lock_irq(&ch->lock);
> +
> hpet_detach_channel(cpu);
>
> - spin_unlock_irq(&ch->lock);
> + spin_unlock_irq(&ch->lock);
> + }
> }
>
> int hpet_broadcast_is_available(void)
Wei, Gang
2010-Apr-20 14:04 UTC
[Xen-devel] RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On Tuesday, 2010-4-20 8:49 PM, Keir Fraser wrote:
> Is this a measurable win? The newer locking looks like it could be
> dodgy on 32-bit Xen: the 64-bit reads of timer_deadline_{start,end}
> will be non-atomic and unsynchronised so you can read garbage. Even
> on 64-bit Xen you can read stale values. I'll be surprised if you got
> a performance win from chopping up critical regions in individual
> functions like that anyway.

First of all, this is a measurable power win for the cpu-overcommitment idle
case (vcpu:pcpu > 2:1, pcpu >= 32, guests are non-tickless kernels).

As to the non-atomic access to timer_deadline_{start,end}, it was already
there before this patch; it is not protected by the hpet lock. Shall we add
an rw_lock for each timer_deadline_{start,end}? That can be done separately.

Jimmy
Keir Fraser
2010-Apr-20 14:21 UTC
[Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 20/04/2010 15:04, "Wei, Gang" <gang.wei@intel.com> wrote:
> On Tuesday, 2010-4-20 8:49 PM, Keir Fraser wrote:
>> Is this a measurable win? The newer locking looks like it could be
>> dodgy on 32-bit Xen: the 64-bit reads of timer_deadline_{start,end}
>> will be non-atomic and unsynchronised so you can read garbage. Even
>> on 64-bit Xen you can read stale values. I'll be surprised if you
>> got a performance win from chopping up critical regions in
>> individual functions like that anyway.
>
> First of all, this is a measurable power win for the cpu-overcommitment
> idle case (vcpu:pcpu > 2:1, pcpu >= 32, guests are non-tickless kernels).

So lots of short sleep periods, and possibly only a very few HPET channels to
share? How prevalent is the always-running APIC timer now, and is it going to
be supported in future processors?

> As to the non-atomic access to timer_deadline_{start,end}, it was already
> there before this patch; it is not protected by the hpet lock. Shall we
> add an rw_lock for each timer_deadline_{start,end}? That can be done
> separately.

The bug wasn't there previously, since the fields will not be read unless the
cpu is in ch->cpumask, which was protected by ch->lock. That was sufficient
since a CPU would not modify timer_deadline_{start,end} between
hpet_broadcast_enter and hpet_broadcast_exit. After your patch,
handle_hpet_broadcast is no longer fully synchronised against those functions.

-- Keir
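To make the invariant described above concrete, here is a minimal user-space
model of the original protocol (a pthread mutex stands in for ch->lock and a
plain bitmask for ch->cpumask; the names are illustrative, not the actual Xen
API): deadlines are only read under the lock and only for CPUs whose bit is
set, and a CPU only republishes its deadline while its bit is clear.

/* Simplified model of the pre-patch locking protocol; not the Xen source. */
#include <pthread.h>
#include <stdint.h>

#define NCPUS 8

static pthread_mutex_t ch_lock = PTHREAD_MUTEX_INITIALIZER; /* ch->lock    */
static uint32_t        ch_cpumask;                          /* ch->cpumask */
static uint64_t        deadline[NCPUS];     /* per_cpu(timer_deadline_end) */

/* Broadcast side: a deadline is only read while holding the lock and only
 * for CPUs still present in the mask. */
static uint64_t scan_next_event(void)
{
    uint64_t next = UINT64_MAX;

    pthread_mutex_lock(&ch_lock);
    for ( int cpu = 0; cpu < NCPUS; cpu++ )
        if ( (ch_cpumask & (1u << cpu)) && deadline[cpu] < next )
            next = deadline[cpu];
    pthread_mutex_unlock(&ch_lock);
    return next;
}

/* Idle entry: publish the deadline while still invisible to the scanner,
 * then join the mask under the lock. */
static void broadcast_enter(int cpu, uint64_t dl)
{
    deadline[cpu] = dl;                /* safe: our bit is not yet set */
    pthread_mutex_lock(&ch_lock);
    ch_cpumask |= 1u << cpu;
    pthread_mutex_unlock(&ch_lock);
}

/* Idle exit: leave the mask under the lock; only afterwards may this CPU
 * modify its deadline again. */
static void broadcast_exit(int cpu)
{
    pthread_mutex_lock(&ch_lock);
    ch_cpumask &= ~(1u << cpu);
    pthread_mutex_unlock(&ch_lock);
}

The resent patch later in the thread keeps this rule in handle_hpet_broadcast
by re-checking ch->cpumask under ch->lock before reading a CPU's deadline.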
Wei, Gang
2010-Apr-20 15:20 UTC
[Xen-devel] RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On Tuesday, 2010-4-20 10:22 PM, Keir Fraser wrote:
> On 20/04/2010 15:04, "Wei, Gang" <gang.wei@intel.com> wrote:
>
>> On Tuesday, 2010-4-20 8:49 PM, Keir Fraser wrote:
>>> Is this a measurable win? The newer locking looks like it could be
>>> dodgy on 32-bit Xen: the 64-bit reads of timer_deadline_{start,end}
>>> will be non-atomic and unsynchronised so you can read garbage. Even
>>> on 64-bit Xen you can read stale values. I'll be surprised if you
>>> got a performance win from chopping up critical regions in
>>> individual functions like that anyway.
>>
>> First of all, this is a measurable power win for the cpu-overcommitment
>> idle case (vcpu:pcpu > 2:1, pcpu >= 32, guests are non-tickless
>> kernels).
>
> So lots of short sleep periods, and possibly only a very few HPET
> channels to share?

Yes.

> How prevalent is the always-running APIC timer now, and is it going to
> be supported in future processors?

The always-running APIC timer only started with Westmere, and I personally
expect it to be supported in future processors. There are still a lot of
existing systems that need the hpet broadcast wakeup.

>> As to the non-atomic access to timer_deadline_{start,end}, it was
>> already there before this patch; it is not protected by the hpet lock.
>> Shall we add an rw_lock for each timer_deadline_{start,end}? That can
>> be done separately.
>
> The bug wasn't there previously, since the fields will not be read
> unless the cpu is in ch->cpumask, which was protected by ch->lock.
> That was sufficient since a CPU would not modify
> timer_deadline_{start,end} between hpet_broadcast_enter and
> hpet_broadcast_exit. After your patch, handle_hpet_broadcast is no
> longer fully synchronised against those functions.

Ok, you are right. So we may just need to make sure the cpu is in
ch->cpumask while reading timer_deadline_{start,end}. I can make some
changes to my patch.

Jimmy
Wei, Gang
2010-Apr-20 16:05 UTC
[Xen-devel] RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
Resend.
CPUIDLE: shorten hpet spin_lock holding time
Try to reduce spin_lock overhead for deep C state entry/exit. This will benefit
systems with a lot of cpus which need the hpet broadcast to wakeup from deep C
state.
Signed-off-by: Wei Gang <gang.wei@intel.com>
diff -r dbf0fd95180f xen/arch/x86/hpet.c
--- a/xen/arch/x86/hpet.c Tue Apr 20 14:32:53 2010 +0100
+++ b/xen/arch/x86/hpet.c Tue Apr 20 23:48:19 2010 +0800
@@ -186,6 +186,9 @@ static void handle_hpet_broadcast(struct
again:
ch->next_event = STIME_MAX;
+
+ spin_unlock_irq(&ch->lock);
+
next_event = STIME_MAX;
mask = (cpumask_t)CPU_MASK_NONE;
now = NOW();
@@ -193,10 +196,17 @@ again:
/* find all expired events */
for_each_cpu_mask(cpu, ch->cpumask)
{
- if ( per_cpu(timer_deadline_start, cpu) <= now )
- cpu_set(cpu, mask);
- else if ( per_cpu(timer_deadline_end, cpu) < next_event )
- next_event = per_cpu(timer_deadline_end, cpu);
+ spin_lock_irq(&ch->lock);
+
+ if ( cpumask_test_cpu(cpu, ch->cpumask) )
+ {
+ if ( per_cpu(timer_deadline_start, cpu) <= now )
+ cpu_set(cpu, mask);
+ else if ( per_cpu(timer_deadline_end, cpu) < next_event )
+ next_event = per_cpu(timer_deadline_end, cpu);
+ }
+
+ spin_unlock_irq(&ch->lock);
}
/* wakeup the cpus which have an expired event. */
@@ -204,10 +214,14 @@ again:
if ( next_event != STIME_MAX )
{
- if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
+ spin_lock_irq(&ch->lock);
+
+ if ( next_event < ch->next_event &&
+ reprogram_hpet_evt_channel(ch, next_event, now, 0) )
goto again;
- }
- spin_unlock_irq(&ch->lock);
+
+ spin_unlock_irq(&ch->lock);
+ }
}
static void hpet_interrupt_handler(int irq, void *data,
@@ -656,17 +670,23 @@ void hpet_broadcast_enter(void)
BUG_ON( !ch );
ASSERT(!local_irq_is_enabled());
- spin_lock(&ch->lock);
if ( hpet_attach_channel )
+ {
+ spin_lock(&ch->lock);
+
hpet_attach_channel(cpu, ch);
+
+ spin_unlock(&ch->lock);
+ }
/* Cancel any outstanding LAPIC timer event and disable interrupts. */
reprogram_timer(0);
disable_APIC_timer();
+ spin_lock(&ch->lock);
+
cpu_set(cpu, ch->cpumask);
-
/* reprogram if current cpu expire time is nearer */
if ( this_cpu(timer_deadline_end) < ch->next_event )
reprogram_hpet_evt_channel(ch, this_cpu(timer_deadline_end), NOW(), 1);
@@ -684,23 +704,28 @@ void hpet_broadcast_exit(void)
BUG_ON( !ch );
+ /* Reprogram the deadline; trigger timer work now if it has passed. */
+ enable_APIC_timer();
+ if ( !reprogram_timer(this_cpu(timer_deadline_start)) )
+ raise_softirq(TIMER_SOFTIRQ);
+
spin_lock_irq(&ch->lock);
- if ( cpu_test_and_clear(cpu, ch->cpumask) )
- {
- /* Reprogram the deadline; trigger timer work now if it has passed. */
- enable_APIC_timer();
- if ( !reprogram_timer(this_cpu(timer_deadline_start)) )
- raise_softirq(TIMER_SOFTIRQ);
-
-         if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
- reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
- }
+ cpu_clear(cpu, ch->cpumask);
+ if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
+ reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
+
+ spin_unlock_irq(&ch->lock);
+
if ( hpet_detach_channel )
+ {
+ spin_lock_irq(&ch->lock);
+
hpet_detach_channel(cpu);
- spin_unlock_irq(&ch->lock);
+ spin_unlock_irq(&ch->lock);
+ }
}
int hpet_broadcast_is_available(void)
Keir Fraser
2010-Apr-21 08:09 UTC
[Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 20/04/2010 17:05, "Wei, Gang" <gang.wei@intel.com> wrote:
> Resend.
>
> CPUIDLE: shorten hpet spin_lock holding time
>
> Try to reduce spin_lock overhead for deep C state entry/exit. This will
> benefit systems with a lot of cpus which need the hpet broadcast to wakeup
> from deep C state.
>
> Signed-off-by: Wei Gang <gang.wei@intel.com>

It fixes the unsafe accesses to timer_deadline_{start,end}, but I still think
this optimisation is misguided and also unsafe. There is nothing to stop new
CPUs being added to ch->cpumask after you start scanning ch->cpumask. For
example, some new CPU may have a timer_deadline_end greater than
ch->next_event, so it does not reprogram the HPET. But handle_hpet_broadcast
is already mid-scan and misses this new CPU, so it does not reprogram the HPET
either. Hence no timer fires for the new CPU and it misses its deadline.

Really I think a better approach than something like this patch would be to
better advertise the timer_slop=xxx Xen boot parameter for power-saving
scenarios. I wonder what your numbers look like if you re-run your benchmark
with (say) timer_slop=10000000 (i.e., 10ms slop) on the Xen command line?

-- Keir
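For reference, timer_slop is given in nanoseconds and is passed on Xen's own
command line in the boot loader entry; with GRUB legacy that would look
roughly like the following (entry title and paths are illustrative):

title Xen (10ms timer slop)
    root (hd0,0)
    kernel /boot/xen.gz timer_slop=10000000
    module /boot/vmlinuz-2.6.18-xen ro root=/dev/sda1
    module /boot/initrd-2.6.18-xen.img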
Wei, Gang
2010-Apr-21 09:06 UTC
[Xen-devel] RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On Wednesday, 2010-4-21 4:10 PM, Keir Fraser wrote:
> It fixes the unsafe accesses to timer_deadline_{start,end}, but I still
> think this optimisation is misguided and also unsafe. There is nothing to
> stop new CPUs being added to ch->cpumask after you start scanning
> ch->cpumask. For example, some new CPU may have a timer_deadline_end
> greater than ch->next_event, so it does not reprogram the HPET. But
> handle_hpet_broadcast is already mid-scan and misses this new CPU, so it
> does not reprogram the HPET either. Hence no timer fires for the new CPU
> and it misses its deadline.

This will not happen. ch->next_event has already been set to STIME_MAX before
the scan of ch->cpumask starts, so the new CPU with the smallest
timer_deadline_end will reprogram the HPET successfully.

> Really I think a better approach than something like this patch would be
> to better advertise the timer_slop=xxx Xen boot parameter for power-saving
> scenarios. I wonder what your numbers look like if you re-run your
> benchmark with (say) timer_slop=10000000 (i.e., 10ms slop) on the Xen
> command line?

I think that is another story. Enlarging timer_slop is one way to align and
reduce break events; it does save power, but possibly at the cost of larger
latency. What I am trying to address here is how to reduce spin_lock overhead
in the idle entry/exit path. The spin_lock overhead, along with other
overheads, caused >25% cpu utilization on a system with 32 pcpus/64 vcpus
while all guests were idle.

Jimmy
Keir Fraser
2010-Apr-21 09:25 UTC
[Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 21/04/2010 10:06, "Wei, Gang" <gang.wei@intel.com> wrote:
>> It fixes the unsafe accesses to timer_deadline_{start,end}, but I still
>> think this optimisation is misguided and also unsafe. There is nothing to
>> stop new CPUs being added to ch->cpumask after you start scanning
>> ch->cpumask. For example, some new CPU may have a timer_deadline_end
>> greater than ch->next_event, so it does not reprogram the HPET. But
>> handle_hpet_broadcast is already mid-scan and misses this new CPU, so it
>> does not reprogram the HPET either. Hence no timer fires for the new CPU
>> and it misses its deadline.
>
> This will not happen. ch->next_event has already been set to STIME_MAX
> before the scan of ch->cpumask starts, so the new CPU with the smallest
> timer_deadline_end will reprogram the HPET successfully.

Okay, then CPU A executes hpet_broadcast_enter() and programs the HPET channel
for its timeout X. Meanwhile the concurrently executing handle_hpet_broadcast
misses CPU A but finds some other CPU B with timeout Y much later than X, and
erroneously programs the HPET channel with Y, causing CPU A to miss its
deadline by an arbitrary amount.

I dare say I can carry on finding races. :-)

> I think that is another story. Enlarging timer_slop is one way to align
> and reduce break events; it does save power, but possibly at the cost of
> larger latency. What I am trying to address here is how to reduce
> spin_lock overhead in the idle entry/exit path. The spin_lock overhead,
> along with other overheads, caused >25% cpu utilization on a system with
> 32 pcpus/64 vcpus while all guests were idle.

So far it's looked to me like a correctness/performance tradeoff. :-D

-- Keir
Wei, Gang
2010-Apr-21 09:36 UTC
[Xen-devel] RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On Wednesday, 2010-4-21 5:26 PM, Keir Fraser wrote:
> On 21/04/2010 10:06, "Wei, Gang" <gang.wei@intel.com> wrote:
>
>>> It fixes the unsafe accesses to timer_deadline_{start,end}, but I still
>>> think this optimisation is misguided and also unsafe. There is nothing
>>> to stop new CPUs being added to ch->cpumask after you start scanning
>>> ch->cpumask. For example, some new CPU may have a timer_deadline_end
>>> greater than ch->next_event, so it does not reprogram the HPET. But
>>> handle_hpet_broadcast is already mid-scan and misses this new CPU, so
>>> it does not reprogram the HPET either. Hence no timer fires for the new
>>> CPU and it misses its deadline.
>>
>> This will not happen. ch->next_event has already been set to STIME_MAX
>> before the scan of ch->cpumask starts, so the new CPU with the smallest
>> timer_deadline_end will reprogram the HPET successfully.
>
> Okay, then CPU A executes hpet_broadcast_enter() and programs the HPET
> channel for its timeout X. Meanwhile the concurrently executing
> handle_hpet_broadcast misses CPU A but finds some other CPU B with
> timeout Y much later than X, and erroneously programs the HPET channel
> with Y, causing CPU A to miss its deadline by an arbitrary amount.

That is also not possible. handle_hpet_broadcast reprograms the HPET only
while next_event < ch->next_event. Here next_event is Y, and ch->next_event
is X. :-)

> I dare say I can carry on finding races. :-)

Please go on... The two race conditions you mentioned were already considered
before I resent the patch. :-D
Keir Fraser
2010-Apr-21 09:52 UTC
[Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 21/04/2010 10:36, "Wei, Gang" <gang.wei@intel.com> wrote:
>> Okay, then CPU A executes hpet_broadcast_enter() and programs the HPET
>> channel for its timeout X. Meanwhile the concurrently executing
>> handle_hpet_broadcast misses CPU A but finds some other CPU B with
>> timeout Y much later than X, and erroneously programs the HPET channel
>> with Y, causing CPU A to miss its deadline by an arbitrary amount.
>
> That is also not possible. handle_hpet_broadcast reprograms the HPET only
> while next_event < ch->next_event. Here next_event is Y, and
> ch->next_event is X. :-)

Ah yes, I spotted that just after I replied! Okay, I'm almost convinced now,
but...

>> I dare say I can carry on finding races. :-)
>
> Please go on... The two race conditions you mentioned were already
> considered before I resent the patch. :-D

Okay, one concern I still have is over possible races around
cpuidle_wakeup_mwait(). It makes use of a cpumask, cpuidle_mwait_flags,
avoiding an IPI to cpus in the mask. However, there is nothing to stop the CPU
having cleared itself from that cpumask before cpuidle does the write to
softirq_pending. In that case, even assuming the CPU is now non-idle and so
the wakeup is spurious, a subsequent attempt to raise_softirq(TIMER_SOFTIRQ)
will incorrectly not IPI because the flag is already set in softirq_pending.

This race would be benign under the current locking strategy (without your
patch) because the CPU that has left mwait_idle_with_hints() cannot get
further than hpet_broadcast_exit(), as it will spin on ch->lock, and there is
a guaranteed check of softirq_pending at some point after that. However, your
patch removes that synchronisation because ch->lock is not held across
cpuidle_wakeup_mwait(). What do you think?

-- Keir
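For readers who have not looked at the cpuidle side: the idle CPU advertises
itself in cpuidle_mwait_flags and MWAITs with its softirq_pending word as the
MONITORed address, so a waker can simply store to that word instead of sending
an IPI. Below is a rough user-space model of the protocol as described in this
thread, with MWAIT simulated by a spin on the monitored word (illustrative
names and simplifications, not the actual Xen cpu_idle code).

/* User-space model of the mwait-based wakeup being discussed; MONITOR/MWAIT
 * is simulated by spinning on the monitored word. Not the Xen source. */
#include <stdatomic.h>

#define NCPUS          8
#define TIMER_SOFTIRQ  0

static atomic_uint cpuidle_mwait_flags;        /* CPUs sleeping in "mwait" */
static atomic_uint softirq_pending[NCPUS];     /* the MONITORed words      */

/* Idle side: advertise ourselves, then "mwait" on our own pending word. */
static void mwait_idle_model(unsigned int cpu)
{
    atomic_fetch_or(&cpuidle_mwait_flags, 1u << cpu);

    while ( atomic_load(&softirq_pending[cpu]) == 0 )
        ;                                      /* stand-in for MONITOR+MWAIT */

    atomic_fetch_and(&cpuidle_mwait_flags, ~(1u << cpu));
}

/* Wake side (the cpuidle_wakeup_mwait() role): for each CPU currently
 * advertised, set TIMER_SOFTIRQ directly in its pending word -- the store is
 * what wakes it -- and return the CPUs the caller must still IPI. The race
 * described above sits between the load of cpuidle_mwait_flags and the store:
 * the CPU may already have cleared itself, in which case its pending bit gets
 * set without any IPI ever being sent. */
static unsigned int wakeup_mwait_model(unsigned int want_to_wake)
{
    unsigned int sleeping = atomic_load(&cpuidle_mwait_flags) & want_to_wake;

    for ( unsigned int cpu = 0; cpu < NCPUS; cpu++ )
        if ( sleeping & (1u << cpu) )
            atomic_fetch_or(&softirq_pending[cpu], 1u << TIMER_SOFTIRQ);

    return want_to_wake & ~sleeping;           /* still need a real IPI */
}

Per Keir's next message, the real function additionally re-reads
cpuidle_mwait_flags at the end to decide which CPUs to drop from the caller's
mask, which is where the second race discussed below comes from.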
Keir Fraser
2010-Apr-21 10:03 UTC
Re: [Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 21/04/2010 10:52, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
> Okay, one concern I still have is over possible races around
> cpuidle_wakeup_mwait(). It makes use of a cpumask, cpuidle_mwait_flags,
> avoiding an IPI to cpus in the mask. However, there is nothing to stop the
> CPU having cleared itself from that cpumask before cpuidle does the write
> to softirq_pending. In that case, even assuming the CPU is now non-idle
> and so the wakeup is spurious, a subsequent attempt to
> raise_softirq(TIMER_SOFTIRQ) will incorrectly not IPI because the flag is
> already set in softirq_pending.

Oh, another one, which can **even occur without your patch**: CPU A adds
itself to cpuidle_mwait_flags while cpuidle_wakeup_mwait() is running. That
function doesn't see CPU A in its first read of the cpumask, so it does not
set TIMER_SOFTIRQ for CPU A. But it then *does* see CPU A in its final read of
the cpumask, and hence clears A from the caller's mask. Hence the caller does
not pass CPU A to cpumask_raise_softirq(). Hence CPU A is erroneously not
woken.

Isn't the synchronisation around those mwait/monitor functions just inherently
broken, even without your patch, and your patch just makes it worse? :-)

-- Keir
Wei, Gang
2010-Apr-22 03:59 UTC
[Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On Wednesday, 2010-4-21 6:03 PM, Keir Fraser wrote:
> On 21/04/2010 10:52, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>
>> Okay, one concern I still have is over possible races around
>> cpuidle_wakeup_mwait(). It makes use of a cpumask, cpuidle_mwait_flags,
>> avoiding an IPI to cpus in the mask. However, there is nothing to stop
>> the CPU having cleared itself from that cpumask before cpuidle does the
>> write to softirq_pending. In that case, even assuming the CPU is now
>> non-idle and so the wakeup is spurious, a subsequent attempt to
>> raise_softirq(TIMER_SOFTIRQ) will incorrectly not IPI because the flag
>> is already set in softirq_pending.

If a CPU cleared itself from cpuidle_mwait_flags, then this CPU didn't need an
IPI to be woken, and one useless write to softirq_pending doesn't have any
side effect. So this case should be acceptable.

> Oh, another one, which can **even occur without your patch**: CPU A adds
> itself to cpuidle_mwait_flags while cpuidle_wakeup_mwait() is running.
> That function doesn't see CPU A in its first read of the cpumask, so it
> does not set TIMER_SOFTIRQ for CPU A. But it then *does* see CPU A in its
> final read of the cpumask, and hence clears A from the caller's mask.
> Hence the caller does not pass CPU A to cpumask_raise_softirq(). Hence
> CPU A is erroneously not woken.

Nice shot. This issue can be resolved by keeping and using a snapshot of
cpuidle_mwait_flags in cpuidle_wakeup_mwait.

> Isn't the synchronisation around those mwait/monitor functions just
> inherently broken, even without your patch, and your patch just makes it
> worse? :-)

How about now, after fixing the second concern you mentioned?

Jimmy
Keir Fraser
2010-Apr-22 07:22 UTC
[Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 22/04/2010 04:59, "Wei, Gang" <gang.wei@intel.com> wrote:
>>> Okay, one concern I still have is over possible races around
>>> cpuidle_wakeup_mwait(). It makes use of a cpumask, cpuidle_mwait_flags,
>>> avoiding an IPI to cpus in the mask. However, there is nothing to stop
>>> the CPU having cleared itself from that cpumask before cpuidle does the
>>> write to softirq_pending. In that case, even assuming the CPU is now
>>> non-idle and so the wakeup is spurious, a subsequent attempt to
>>> raise_softirq(TIMER_SOFTIRQ) will incorrectly not IPI because the flag
>>> is already set in softirq_pending.
>
> If a CPU cleared itself from cpuidle_mwait_flags, then this CPU didn't need
> an IPI to be woken, and one useless write to softirq_pending doesn't have
> any side effect. So this case should be acceptable.

That's not totally convincing. The write to softirq_pending has one extra side
effect: it is possible that the next time TIMER_SOFTIRQ really needs to be
raised on that CPU, it will not receive notification via IPI, because the flag
is already set in its softirq_pending mask.

Hm, let me see if I can come up with a patch for this and post it for you.

-- Keir
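The side effect follows from the usual raise-softirq pattern of only notifying
on the 0-to-1 transition of the pending bit. A minimal, runnable illustration
of that pattern follows (illustrative names, not the exact Xen implementation).

/* Illustrative sketch: the waking IPI is only sent on the 0->1 transition of
 * the softirq pending bit, so a bit left set by an earlier direct write (the
 * mwait wakeup path) silently suppresses the next IPI. Not the Xen source. */
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS         8
#define TIMER_SOFTIRQ 0

static atomic_uint softirq_pending[NCPUS];

static void send_ipi(unsigned int cpu)          /* stand-in for the real IPI */
{
    printf("IPI -> cpu%u\n", cpu);
}

static void raise_softirq_on(unsigned int nr, unsigned int cpu)
{
    /* Set the bit; only notify if it was not already pending. */
    unsigned int old = atomic_fetch_or(&softirq_pending[cpu], 1u << nr);
    if ( !(old & (1u << nr)) )
        send_ipi(cpu);
}

int main(void)
{
    /* The mwait wakeup path sets the bit directly, without an IPI... */
    atomic_fetch_or(&softirq_pending[1], 1u << TIMER_SOFTIRQ);

    /* ...so this later, genuine wakeup prints nothing: no IPI is sent. */
    raise_softirq_on(TIMER_SOFTIRQ, 1);
    return 0;
}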
Keir Fraser
2010-Apr-22 08:19 UTC
Re: [Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 22/04/2010 08:22, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>> If a CPU cleared itself from cpuidle_mwait_flags, then this CPU didn't
>> need an IPI to be woken, and one useless write to softirq_pending doesn't
>> have any side effect. So this case should be acceptable.
>
> That's not totally convincing. The write to softirq_pending has one extra
> side effect: it is possible that the next time TIMER_SOFTIRQ really needs
> to be raised on that CPU, it will not receive notification via IPI,
> because the flag is already set in its softirq_pending mask.
>
> Hm, let me see if I can come up with a patch for this and post it for you.

How about the attached patch? It MWAITs on a completely new flag field,
avoiding the IPI-avoidance semantics of softirq_pending. It also folds in your
patch. It also does wakeup-waiting checks on timer_deadline_start, that being
the field that initiates wakeup via the MONITORed memory region.

-- Keir
Keir Fraser
2010-Apr-22 08:21 UTC
[Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 20/04/2010 17:05, "Wei, Gang" <gang.wei@intel.com> wrote:
> Resend.
>
> CPUIDLE: shorten hpet spin_lock holding time
>
> Try to reduce spin_lock overhead for deep C state entry/exit. This will
> benefit systems with a lot of cpus which need the hpet broadcast to wakeup
> from deep C state.
>
> Signed-off-by: Wei Gang <gang.wei@intel.com>

I was going to ask: do you still get the decent power-saving win from this new
version of the patch? You potentially acquire/release the lock a lot more in
this version, since you do so inside the loop over the cpumask in
handle_hpet_broadcast().

-- Keir
Keir Fraser
2010-Apr-22 08:23 UTC
Re: [Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On 22/04/2010 09:19, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>> Hm, let me see if I can come up with a patch for this and post it for
>> you.
>
> How about the attached patch? It MWAITs on a completely new flag field,
> avoiding the IPI-avoidance semantics of softirq_pending. It also folds in
> your patch. It also does wakeup-waiting checks on timer_deadline_start,
> that being the field that initiates wakeup via the MONITORed memory
> region.

...If you do think it looks okay, could you also test it out on relevant
hardware? :-)

Thanks,
Keir
Wei, Gang
2010-Apr-29 11:08 UTC
RE: [Xen-devel] Re: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On Thursday, 2010-4-22 4:23 PM, Keir Fraser wrote:
> On 22/04/2010 09:19, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>> How about the attached patch? It MWAITs on a completely new flag field,
>> avoiding the IPI-avoidance semantics of softirq_pending. It also folds in
>> your patch. It also does wakeup-waiting checks on timer_deadline_start,
>> that being the field that initiates wakeup via the MONITORed memory
>> region.
>
> ...If you do think it looks okay, could you also test it out on relevant
> hardware? :-)

I made some modifications to this patch -- moved the per_cpu mwait_wakeup flag
into irq_cpustat_t to make it __cacheline_aligned, and added a check for the
timer_deadline_start==0 case (meaning no timer in the queue; it took me quite
a lot of time to find out this is necessary). Tested ok.

Jimmy
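The point of folding the flag into irq_cpustat_t is that the MONITORed word
then sits in a per-CPU, cacheline-aligned structure, so no two CPUs ever
monitor the same cache line and unrelated stores cannot cause spurious wakeups
or false sharing. A standalone illustration of the layout idea follows
(hypothetical struct, not the actual Xen definition).

/* Standalone illustration of giving each CPU's monitored flag its own cache
 * line; the real irq_cpustat_t/__cacheline_aligned definitions differ. */
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

#define CACHELINE 64
#define NCPUS     8

typedef struct {
    alignas(CACHELINE) uint32_t softirq_pending;
    uint32_t mwait_wakeup;        /* hypothetical flag, same line as the rest */
    char pad[CACHELINE - 2 * sizeof(uint32_t)];  /* fill out the cache line */
} cpu_stat_t;

static cpu_stat_t per_cpu_stat[NCPUS];

int main(void)
{
    /* Each CPU's stats start on a distinct cache line, so MONITORing
     * &per_cpu_stat[cpu].mwait_wakeup never shares a line with another CPU. */
    printf("sizeof = %zu, alignof = %zu\n",
           sizeof(cpu_stat_t), alignof(cpu_stat_t));
    printf("cpu0 at %p, cpu1 at %p\n",
           (void *)&per_cpu_stat[0], (void *)&per_cpu_stat[1]);
    return 0;
}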
Wei, Gang
2010-Apr-29 11:14 UTC
[Xen-devel] RE: [PATCH] CPUIDLE: shorten hpet spin_lock holding time
On Thursday, 2010-4-22 4:21 PM, Keir Fraser wrote:> On 20/04/2010 17:05, "Wei, Gang" <gang.wei@intel.com> wrote: > >> Resend. >> >> CPUIDLE: shorten hpet spin_lock holding time >> >> Try to reduce spin_lock overhead for deep C state entry/exit. This >> will benefit systems with a lot of cpus which need the hpet >> broadcast to wakeup from deep C state. >> >> Signed-off-by: Wei Gang <gang.wei@intel.com> > > I was going to ask: do you still get the decent power-saving win from > this new version of the patch? You acquire/release the lock a lot more > potentially in this version, since you do so inside the loop over > cpumask in handle_hpet_broadcast().Yes, I tested it and could still get the decent power-saving win from it. And it should not have racing issue after the mwait-wakeup re-implementation patch. Jimmy _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel