This is the patch for the deferrable timer. As Xen introduces C-state support, it becomes important to optimize C-state residency. The key point of the optimization is reducing the number of breaking events. Since timer interrupts are the major source of breaking events, this patch is the first step towards reducing timer interrupts. The basic idea of this patch is that certain ac timers do not need to stick to one exact firing point; instead, they are fine with a firing period, e.g. the period [a, b]. With the firing period introduced, it is possible to group multiple ac timers whose firing periods have a non-empty intersection. For example, suppose ac timers x and y have firing periods [a1, b1] and [a2, b2], and [a1,b1]^[a2,b2] = [a3,b3] (where ^ stands for intersection). In this case, Xen can group ac timers x and y and fire them at any time in [a3,b3], which in turn reduces the number of timer interrupts. This type of ac timer is called a deferrable timer. This patch adds a new ac timer API, set_timer_deferrable, for the deferrable timer. 
Signed-off-by: Yu Ke <ke.yu@intel.com> Wei Gang <gang.wei@intel.com> diff -r 19970181d6a4 xen/arch/x86/hpet.c --- a/xen/arch/x86/hpet.c Tue Jul 01 14:50:35 2008 +0100 +++ b/xen/arch/x86/hpet.c Wed Jul 09 14:47:13 2008 +0800 @@ -14,8 +14,6 @@ #include <asm/div64.h> #include <asm/hpet.h> -#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1)) - #define MAX_DELTA_NS MILLISECS(10*1000) #define MIN_DELTA_NS MICROSECS(20) diff -r 19970181d6a4 xen/arch/x86/time.c --- a/xen/arch/x86/time.c Tue Jul 01 14:50:35 2008 +0100 +++ b/xen/arch/x86/time.c Thu Jul 10 09:21:47 2008 +0800 @@ -478,7 +478,9 @@ static void plt_overflow(void *unused) plt_stamp = count; spin_unlock(&platform_timer_lock); - set_timer(&plt_overflow_timer, NOW() + plt_overflow_period); + set_timer_deferrable( &plt_overflow_timer, + NOW() + plt_overflow_period - plt_overflow_period/8, + NOW() + plt_overflow_period ); } static s_time_t __read_platform_stime(u64 platform_time) diff -r 19970181d6a4 xen/common/timer.c --- a/xen/common/timer.c Tue Jul 01 14:50:35 2008 +0100 +++ b/xen/common/timer.c Thu Jul 10 10:28:33 2008 +0800 @@ -27,10 +27,16 @@ */ #define TIMER_SLOP (50*1000) /* ns */ +#define MAX_READY_TIMERS 16 + struct timers { spinlock_t lock; struct timer **heap; struct timer *running; + /* ready timers for next fire */ + struct timer *ready[MAX_READY_TIMERS]; + int ready_nr; + s_time_t ready_expires; } __cacheline_aligned; static DEFINE_PER_CPU(struct timers, timers); @@ -113,6 +119,29 @@ static int remove_entry(struct timer **h return (pos == 1); } +/* Delete @t from @ready queue, return TRUE if find timer in queue*/ +static int remove_ready_entry(struct timer *t) +{ + int i, j, rc = 0; + struct timers *ts; + + ts = &per_cpu(timers, t->cpu); + + for ( i = 0; i < ts->ready_nr; i++ ) + { + if ( ts->ready[i] == t ) + { + t->heap_offset = 0; + ts->ready_nr--; + for ( j = i; j < ts->ready_nr; j++ ) + ts->ready[j] = ts->ready[j+1]; + rc = 1; + break; + } + } + + return rc; +} /* Add new entry @t to @heap. 
Return TRUE if new top of heap. */ static int add_entry(struct timer ***pheap, struct timer *t) @@ -158,7 +187,10 @@ static inline void __stop_timer(struct t static inline void __stop_timer(struct timer *timer) { int cpu = timer->cpu; - if ( remove_entry(per_cpu(timers, cpu).heap, timer) ) + + if ( remove_ready_entry(timer) ) + cpu_raise_softirq(cpu, TIMER_SOFTIRQ); + else if ( remove_entry(per_cpu(timers, cpu).heap, timer) ) cpu_raise_softirq(cpu, TIMER_SOFTIRQ); } @@ -191,8 +223,8 @@ static inline void timer_unlock(struct t #define timer_unlock_irqrestore(t, flags) \ do { timer_unlock(t); local_irq_restore(flags); } while ( 0 ) - -void set_timer(struct timer *timer, s_time_t expires) +/* Set timer that can expire in period [expires_start, expires_end] */ +void set_timer_deferrable(struct timer *timer, s_time_t expires_start, s_time_t expires_end) { unsigned long flags; @@ -201,7 +233,8 @@ void set_timer(struct timer *timer, s_ti if ( active_timer(timer) ) __stop_timer(timer); - timer->expires = expires; + timer->expires = expires_start; + timer->expires_deferred = expires_end; if ( likely(!timer->killed) ) __add_timer(timer); @@ -209,6 +242,10 @@ void set_timer(struct timer *timer, s_ti timer_unlock_irqrestore(timer, flags); } +void set_timer(struct timer *timer, s_time_t expires) +{ + set_timer_deferrable(timer, expires, expires); +} void stop_timer(struct timer *timer) { @@ -295,6 +332,7 @@ static void timer_softirq_action(void) s_time_t now; void (*fn)(void *); void *data; + int i; ts = &this_cpu(timers); @@ -304,6 +342,23 @@ static void timer_softirq_action(void) heap = ts->heap; now = NOW(); + if ( ts->ready_expires < now + TIMER_SLOP ) + { + for ( i = 0; i < ts->ready_nr; i++ ) + { + fn = ts->ready[i]->function; + data = ts->ready[i]->data; + + ts->running = ts->ready[i]; + ts->ready[i]->heap_offset = 0; + + spin_unlock_irq(&ts->lock); + (*fn)(data); + spin_lock_irq(&ts->lock); + } + ts->ready_nr = 0; + } + while ( (GET_HEAP_SIZE(heap) != 0) && ((t = 
heap[1])->expires < (now + TIMER_SLOP)) ) { @@ -322,9 +377,34 @@ static void timer_softirq_action(void) heap = ts->heap; } + /* queue ready timers for next fire */ + if ( ts->ready_nr == 0 ) + { + s_time_t start, end; + + start = 0; + end = STIME_MAX; + + while ( (GET_HEAP_SIZE(heap) != 0) && + ((t = heap[1])->expires <= end) && + (ts->ready_nr < MAX_READY_TIMERS) ) + { + remove_entry(heap, t); + + start = t->expires; + if ( end > t->expires_deferred ) + end = t->expires_deferred; + + ts->ready[ts->ready_nr++] = t; + t->heap_offset = 1; /* mark it still active */ + } + + ts->ready_expires = (start + end) / 2; + } + ts->running = NULL; - this_cpu(timer_deadline) = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0; + this_cpu(timer_deadline) = ts->ready_nr ? ts->ready_expires : 0; } while ( !reprogram_timer(this_cpu(timer_deadline)) ); diff -r 19970181d6a4 xen/include/xen/time.h --- a/xen/include/xen/time.h Tue Jul 01 14:50:35 2008 +0100 +++ b/xen/include/xen/time.h Wed Jul 09 14:49:15 2008 +0800 @@ -52,6 +52,7 @@ struct tm gmtime(unsigned long t); #define SECONDS(_s) ((s_time_t)((_s) * 1000000000ULL)) #define MILLISECS(_ms) ((s_time_t)((_ms) * 1000000ULL)) #define MICROSECS(_us) ((s_time_t)((_us) * 1000ULL)) +#define STIME_MAX ((s_time_t)((uint64_t)~0ULL>>1)) extern void update_vcpu_system_time(struct vcpu *v); extern void update_domain_wallclock_time(struct domain *d); diff -r 19970181d6a4 xen/include/xen/timer.h --- a/xen/include/xen/timer.h Tue Jul 01 14:50:35 2008 +0100 +++ b/xen/include/xen/timer.h Tue Jul 08 16:44:35 2008 +0800 @@ -15,6 +15,7 @@ struct timer { struct timer { /* System time expiry value (nanoseconds since boot). */ s_time_t expires; + s_time_t expires_deferred; /* CPU on which this timer will be installed and executed. */ unsigned int cpu; /* On expiry, ''(*function)(data)'' will be executed in softirq context. */ @@ -63,6 +64,9 @@ static inline void init_timer( * been initialised by init_timer() (so that callback details are known). 
*/ extern void set_timer(struct timer *timer, s_time_t expires); + +/* Set timer that can expire at any time in the period [expires, offset]; note that both arguments are absolute times (window start and window end), not a centre and a delta */ +extern void set_timer_deferrable(struct timer *timer, s_time_t expires, s_time_t offset); /* * Deactivate a timer This function has no effect if the timer is not currently _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 17/7/08 13:54, "Yu, Ke" <ke.yu@intel.com> wrote: > This patch adds new ac timer API set_timer_deferrable for the deferrable > timer. Whether this is worthwhile depends on what likely users there are. Platform-timer overflow is not very compelling since if you're using anything other than PIT (which ought to be likely on a modern system supporting deep sleep) the overflow period should be multiple seconds. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser wrote: > On 17/7/08 13:54, "Yu, Ke" <ke.yu@intel.com> wrote: > >> This patch adds new ac timer API set_timer_deferrable for the >> deferrable timer. > > Whether this is worthwhile depends on what likely users there are. > platform-timer overflow is not very compelling since if you're using > anything other than PIT (which ought to be likely on a modern system > supporting deep sleep) the overflow period should be multiple seconds. > > -- Keir True. Another user is the Px state sampling timer, which is 20ms. Other potential timers (e.g. sched timer, hvm pt timers) are also under evaluation; the principle is to make sure there is no performance degradation. Best Regards Ke _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yu, Ke wrote:>Keir Fraser wrote: > > >>On 17/7/08 13:54, "Yu, Ke" <ke.yu@intel.com> wrote: >> >> >> >>>This patch adds new ac timer API set_timer_deferrable for the >>>deferrable timer. >>> >>> >>Whether this is worthwhile depends on what likely users there are. >>platform-timer overflow is not very compelling since if you''re using >>anything other than PIT (which ought to be likely on a modern system >>supporting deep sleep) the overflow period should be multiple seconds. >> >> -- Keir >> >> > >True. Another user is the Px state sampling timer, which is 20ms. Other >potential timers (e.g. sched timer, hvm pt timers) are also under >evaluation, the principle is to make sure there is no performance >downgrade. > >Ke, One would think that hpet or vpt support for the guest-handles-missed-ticks policy would be a good application for a deferrable timer. If a deferrable timer were used, then the comparator (cmp) would have to be warped to a non-integer multiple of the period. This is because Linux reads the comparator register to estimate the delay since the interrupt was posted. I don''t think warping like this will be a problem. At some point, I can test this. I think we could use the deferrable timer for the guest-does-not-handle-missed-ticks policy as well. Any investigation that you want to do in the platform timer area would be fine. Or I can do it, but that will probably be after I do the vpt.c/hpet.c integration work. thanks, Dave>Best Regards >Ke > >_______________________________________________ >Xen-devel mailing list >Xen-devel@lists.xensource.com >http://lists.xensource.com/xen-devel > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Dave, Glad to see there is an application for the deferrable timer. Please go ahead with that, and I will keep you updated if there are any findings on my side. BTW, could you please elaborate more on the "guest-handles-missed-tick" case? Since there is no need to inject missed ticks into the guest, which timer would be used as the deferrable timer? Best Regards Ke Dave Winchell wrote: > Ke, > > One would think that hpet or vpt support for the > guest-handles-missed-ticks policy would be a good application for a > deferrable timer. > If a deferrable timer were used, then the comparator (cmp) would have > to > be warped to a non-integer multiple of the period. This is because > Linux reads the comparator register to estimate the delay since the > interrupt > was posted. > I don't think warping like this will be a problem. At some point, I > can test this. > > I think we could use the deferrable timer for the > guest-does-not-handle-missed-ticks > policy as well. > > Any investigation that you want to do in the platform timer area would > be fine. > Or I can do it, but that will probably be after I do the vpt.c/hpet.c > integration > work. > > thanks, > Dave > >> Best Regards >> Ke >> >> _______________________________________________ >> Xen-devel mailing list >> Xen-devel@lists.xensource.com >> http://lists.xensource.com/xen-devel _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yu, Ke wrote: > Dave, > > Glad to see there is deferrable timer application. Please go ahead > with that. And I will keep you updated if there is finding in my side. > > BTW, Could you please elaborate more on the > "guest-handles-missed-tick" case? Since there is no need to inject > missed tick to guest, which timer would be used as deferrable timer? Oh, I get your point now; please ignore my previous question. You actually mean that, since the guest can handle missed ticks correctly, it is acceptable for the hpet/vpt timer to be deferred, so the hpet/vpt timer itself can be a deferrable timer. The same holds for the "guest-does-not-handle-missed-ticks" case, since Xen can handle that by injecting the missed ticks accordingly. If my understanding is correct, I would say your point is a really good one; I expect this will reduce the timer count considerably, especially when there are multiple HVMs. Best Regards Ke > > Best Regards > Ke > > Dave Winchell wrote: >> Ke, >> >> One would think that hpet or vpt support for the >> guest-handles-missed-ticks policy would be a good application for a >> deferrable timer. If a deferrable timer were used, then the >> comparator (cmp) would have to be warped to a non-integer multiple >> of the period. This is because Linux reads the comparator register >> to estimate the delay since the interrupt was posted. >> I don't think warping like this will be a problem. At some point, I >> can test this. >> >> I think we could use the deferrable timer for the >> guest-does-not-handle-missed-ticks >> policy as well. >> >> Any investigation that you want to do in the platform timer area >> would be fine. Or I can do it, but that will probably be after I do >> the vpt.c/hpet.c integration work. 
>> >> thanks, >> Dave >> >>> Best Regards >>> Ke >>> >>> _______________________________________________ >>> Xen-devel mailing list >>> Xen-devel@lists.xensource.com >>> http://lists.xensource.com/xen-devel > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ke, Yu, Ke wrote: >Dave, > >Glad to see there is deferrable timer application. Please go ahead with >that. And I will keep you updated if there is finding in my side. > ok. > > >BTW, Could you please elaborate more on the "guest-handles-missed-tick" >case? Since there is no need to inject missed tick to guest, which timer >would be used as deferrable timer? > > Hpet.c uses set_timer for hpet comparator/timer 0. When that timer expires, a clock interrupt may be injected to the guest. This timer is normally set to expire at the next period boundary. We could, instead, have it expire over a range of, say, several periods. Vpt.c works in a similar fashion for its periodic timer. Other clocksources, e.g. pit, rtc, are layered on vpt.c with interface create_periodic_timer. I can imagine an option passed to create_periodic_timer signifying that a deferrable timer may be used. Ideally, the deferrable timer would have an option where a set of allowable timeout values, rather than a range, could be provided. If it had this option, we could keep the timeouts on the integer*period time line. Otherwise I need to warp the comparator as discussed below. I anticipate that there may be some problems with warping. I realize that specifying a range gives you more options for combining timeouts. I don't mind trying to solve the warping problem. One further option would be a deferrable timer with a range followed by a non-deferrable timer to get back on the integer*period timeline for interrupt delivery. thanks, Dave >Best Regards >Ke > >Dave Winchell wrote: > > >>Ke, >> >>One would think that hpet or vpt support for the >>guest-handles-missed-ticks policy would be a good application for a >>deferrable timer. >>If a deferrable timer were used, then the comparator (cmp) would have >>to >>be warped to a non-integer multiple of the period. This is because >>Linux reads the comparator register to estimate the delay since the >>interrupt >>was posted. 
>>I don''t think warping like this will be a problem. At some point, I >>can test this. >> >>I think we could use the deferrable timer for the >>guest-does-not-handle-missed-ticks >>policy as well. >> >>Any investigation that you want to do in the platform timer area would >>be fine. >>Or I can do it, but that will probably be after I do the vpt.c/hpet.c >>integration >>work. >> >>thanks, >>Dave >> >> >> >>>Best Regards >>>Ke >>> >>>_______________________________________________ >>>Xen-devel mailing list >>>Xen-devel@lists.xensource.com >>>http://lists.xensource.com/xen-devel >>> >>> > > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yu, Ke wrote:>Yu, Ke wrote: > > >>Dave, >> >>Glad to see there is deferrable timer application. Please go ahead >>with that. And I will keep you updated if there is finding in my side. >> >>BTW, Could you please elaborate more on the >>"guest-handles-missed-tick" case? Since there is no need to inject >>missed tick to guest, which timer would be used as deferrable timer? >> >> > >Oh, I catch your points now, please ignore my previous question. You >actually means that: since guest can handle the missed tick correcty, it >is acceptable that the hpet/vpt timer is defered, so the hpet/vpt timer >itself can be deferrable timer. >Yes.> so is the >"guest-does-not-handle-missed-ticks" case, since xen can handle that by >inject missed tick respectively. > >For the guest-does-not-handle-missed-ticks case we inject the correct number of interrupts, i.e. N*period, N an integer, but we can delay a bit before doing so. So I think we can use deferrable timers for both policies.>If my understanding is correct, I would say your point is truly good, I >expect this will reduce the timer count much especially when there is >multiple HVMs. > >Best Regards >Ke > > > >>Best Regards >>Ke >> >>Dave Winchell wrote: >> >> >>>Ke, >>> >>>One would think that hpet or vpt support for the >>>guest-handles-missed-ticks policy would be a good application for a >>>deferrable timer. If a deferrable timer were used, then the >>>comparator (cmp) would have to be warped to a non-integer multiple >>>of the period. This is because Linux reads the comparator register >>>to estimate the delay since the interrupt was posted. >>>I don''t think warping like this will be a problem. At some point, I >>>can test this. >>> >>>I think we could use the deferrable timer for the >>>guest-does-not-handle-missed-ticks >>>policy as well. >>> >>>Any investigation that you want to do in the platform timer area >>>would be fine. Or I can do it, but that will probably be after I do >>>the vpt.c/hpet.c integration work. 
>>> >>>thanks, >>>Dave >>> >>> >>> >>>>Best Regards >>>>Ke >>>> >>>>_______________________________________________ >>>>Xen-devel mailing list >>>>Xen-devel@lists.xensource.com >>>>http://lists.xensource.com/xen-devel >>>> >>>> >>_______________________________________________ >>Xen-devel mailing list >>Xen-devel@lists.xensource.com >>http://lists.xensource.com/xen-devel >> >> > > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
>From: Dave Winchell [mailto:dwinchell@virtualiron.com] >Sent: 2008年7月18日 23:29 > >Vpt.c works in a similar fashion for its periodic timer. Other >clocksources, >e.g. pit, rtc, are layered on vpt.c with interface >create_periodic_timer. >I can imagine an option passed to create_periodic_timer signifying that >a deferrable timer may be used. Agree. > >Ideally, the deferrable timer would have an option where a set >of allowable >timeout values, rather than a range, could be provided. If it had this >option, we could keep >the timeouts on the integer*period time line. Otherwise I need to warp >the comparator >as discussed below. I anticipate that there may be some problems with >warping. Not sure why this option is required. In any case, you just keep cmp updated by integer*period which is just enough in timer fn and has nothing to do with whether timer is deferred. Timers are always deferred before and after, with only difference on the extent, especially when you consider the point when guest gets chance to act on it. > >I realize that specifying a range gives you more options for combining >timeouts. >I don't mind trying to solve the warping problem. Yes, the purpose of the deferral is to reduce timer ticks, or else it really makes no sense. > >One further option would be a deferrable timer with a range >followed by >a non-deferrable >timer to get back on the integer*period timeline for interrupt >delivery. What's the purpose then? The current algorithm to find whether one timer should be deferred is to find whether other timers may expire in its tolerated future, regardless of whether next timer is deferrable or not. Why do you care whether next one is non-deferrable? Thanks, Kevin _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Kevin, Let me just try to use the deferrable timers as they are specified today. If I have trouble doing so, then we can discuss at that point. thanks, Dave Tian, Kevin wrote:>>From: Dave Winchell [mailto:dwinchell@virtualiron.com] >>Sent: 2008年7月18日 23:29 >> >>Vpt.c works in a similar fashion for its periodic timer. Other >>clocksources, >>e.g. pit, rtc, are layred on vpt.c with interface >>create_periodic_timer. >>I can imagine an option passed to create_periodic_timer signifying that >>a deferrable timer may be used. >> >> > >Agree. > > > >>Ideally, the deferrable timer would have an option where a set >>of allowable >>timeout values, rather than a range, could be provided. If it had this >>option, we could keep >>the timeouts on the integer*period time line. Otherwise I need to warp >>the comparator >>as discussed below. I anticipate that there may be some problems with >>warping. >> >> > >Not sure why this option is required. In any case, you just keep cmp >updated by integer*period which is just enough in timer fn and has >nothing to do with whether timer is deferred. Timers are always deferred >before and after, with only difference on the extent, especially when >you consider the point when guest gets chance to act on it. > >> > >>I realize that specifying a range gives you more options for combining >>timeouts. >>I don''t mind trying to solve the warping problem. >> >> > >Yes, the purpose of the deferral is to reduce timer ticks, or else it >really make no sense. > > > >>One further option would be a deferrable timer with a range >>fallowed by >>a non-deferrable >>timer to get back on the integer*period timeline for interrupt >>delivery. >> >> > >What''s the purpose then? current algorithm to find whether one >timer should be deferred is to find whether other timers may >expire in its tolerated future, regardless of whether next timer >is deferrable or not. Why do you care whether next one is >non-deferrable? 
> >Thanks, >Kevin > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel