Laszlo Ersek
2011-Oct-18 20:42 UTC
[Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
... because the "clock_event_device framework" already accounts for idle time through the "event_handler" function pointer in xen_timer_interrupt(). The patch is intended as the completion of [1]. It should fix the double idle times seen in PV guests'' /proc/stat [2]. It should be orthogonal to stolen time accounting (the removed code seems to be isolated). The approach may be completely misguided. [1] https://lkml.org/lkml/2011/10/6/10 [2] http://lists.xensource.com/archives/html/xen-devel/2010-08/msg01068.html Signed-off-by: Laszlo Ersek <lersek@redhat.com> --- arch/x86/xen/time.c | 17 ++--------------- 1 files changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 163b467..377f6ae 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -36,9 +36,8 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); /* snapshots of runstate info */ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); -/* unused ns of stolen and blocked time */ +/* unused ns of stolen time */ static DEFINE_PER_CPU(u64, xen_residual_stolen); -static DEFINE_PER_CPU(u64, xen_residual_blocked); /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) @@ -115,7 +114,7 @@ static void do_stolen_accounting(void) { struct vcpu_runstate_info state; struct vcpu_runstate_info *snap; - s64 blocked, runnable, offline, stolen; + s64 runnable, offline, stolen; cputime_t ticks; get_runstate_snapshot(&state); @@ -125,7 +124,6 @@ static void do_stolen_accounting(void) snap = &__get_cpu_var(xen_runstate_snapshot); /* work out how much time the VCPU has not been runn*ing* */ - blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; @@ -141,17 +139,6 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __this_cpu_write(xen_residual_stolen, stolen); account_steal_ticks(ticks); - - /* Add the appropriate number of ticks of blocked time, - including any left-overs from last time. */ - blocked += __this_cpu_read(xen_residual_blocked); - - if (blocked < 0) - blocked = 0; - - ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); - __this_cpu_write(xen_residual_blocked, blocked); - account_idle_ticks(ticks); } /* Get the TSC speed from Xen */ -- 1.7.4.4 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2011-Oct-19 07:51 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
>>> On 18.10.11 at 22:42, Laszlo Ersek <lersek@redhat.com> wrote: > ... because the "clock_event_device framework" already accounts for idle > time through the "event_handler" function pointer in > xen_timer_interrupt().As event_handler is being checked to be non-zero, shouldn''t the code you remove simply become conditional (upon event_handler being zero)? Jan> The patch is intended as the completion of [1]. It should fix the double > idle times seen in PV guests'' /proc/stat [2]. It should be orthogonal to > stolen time accounting (the removed code seems to be isolated). > > The approach may be completely misguided. > > [1] https://lkml.org/lkml/2011/10/6/10 > [2] http://lists.xensource.com/archives/html/xen-devel/2010-08/msg01068.html > > Signed-off-by: Laszlo Ersek <lersek@redhat.com> > --- > arch/x86/xen/time.c | 17 ++--------------- > 1 files changed, 2 insertions(+), 15 deletions(-) > > diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c > index 163b467..377f6ae 100644 > --- a/arch/x86/xen/time.c > +++ b/arch/x86/xen/time.c > @@ -36,9 +36,8 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, > xen_runstate); > /* snapshots of runstate info */ > static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); > > -/* unused ns of stolen and blocked time */ > +/* unused ns of stolen time */ > static DEFINE_PER_CPU(u64, xen_residual_stolen); > -static DEFINE_PER_CPU(u64, xen_residual_blocked); > > /* return an consistent snapshot of 64-bit time/counter value */ > static u64 get64(const u64 *p) > @@ -115,7 +114,7 @@ static void do_stolen_accounting(void) > { > struct vcpu_runstate_info state; > struct vcpu_runstate_info *snap; > - s64 blocked, runnable, offline, stolen; > + s64 runnable, offline, stolen; > cputime_t ticks; > > get_runstate_snapshot(&state); > @@ -125,7 +124,6 @@ static void do_stolen_accounting(void) > snap = &__get_cpu_var(xen_runstate_snapshot); > > /* work out how much time the VCPU has not been runn*ing* */ > - blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; > runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; > offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; > > @@ -141,17 +139,6 @@ static void do_stolen_accounting(void) > ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); > __this_cpu_write(xen_residual_stolen, stolen); > account_steal_ticks(ticks); > - > - /* Add the appropriate number of ticks of blocked time, > - including any left-overs from last time. */ > - blocked += __this_cpu_read(xen_residual_blocked); > - > - if (blocked < 0) > - blocked = 0; > - > - ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); > - __this_cpu_write(xen_residual_blocked, blocked); > - account_idle_ticks(ticks); > } > > /* Get the TSC speed from Xen */_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Laszlo Ersek
2011-Oct-19 14:54 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
On 10/19/11 09:51, Jan Beulich wrote:>>>> On 18.10.11 at 22:42, Laszlo Ersek<lersek@redhat.com> wrote: >> ... because the "clock_event_device framework" already accounts for idle >> time through the "event_handler" function pointer in >> xen_timer_interrupt(). > > As event_handler is being checked to be non-zero, shouldn''t the > code you remove simply become conditional (upon event_handler > being zero)?I think that wouldn''t be hard to implement, but I''m afraid the paragraph you quoted from my proposed commit message could be wrong -- perhaps it''s not the event_handler callback that cranks the idle time counter. Please see https://bugzilla.redhat.com/show_bug.cgi?id=624756#c26 In short, (a) idle time is increased in cpu_idle(), which seems to be running as a standalone kernel thread; (b) the event_handler I found invoked from xen_timer_interrupt() is hrtimer_interrupt(); (c) I couldn''t figure out if cpu_idle() keeps waking up "on its own", or if it needs periodic kicks from hrtimer_interrupt() (executed by some other thread). Thank you Laszlo _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Laszlo Ersek
2011-Oct-20 14:35 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
On 10/19/11 09:51, Jan Beulich wrote:>>>> On 18.10.11 at 22:42, Laszlo Ersek<lersek@redhat.com> wrote: >> ... because the "clock_event_device framework" already accounts for idle >> time through the "event_handler" function pointer in >> xen_timer_interrupt(). > > As event_handler is being checked to be non-zero, shouldn''t the > code you remove simply become conditional (upon event_handler > being zero)?After experimenting further and reading more code, I think that the event_handler == NULL case is spurious and impossible in the long run. (I tested faking it periodically and the VM stops progressing during boot, after udev is started). Furthermore / independently, these are the callers of account_idle_ticks() -- a complete tree: account_idle_ticks() [kernel/sched.c] <- tick_nohz_restart_sched_tick() [kernel/time/tick-sched.c] <- cpu_idle() [various arches] <- do_stolen_accounting() [arch/x86/xen/time.c] <- xen_timer_interrupt() <- consider_steal_time() [arch/ia64/xen/time.c] <- xen_do_steal_accounting() = xen_time_ops.do_steal_accounting (The ia64/xen time code seems to be modeled after the x86/xen code.) Jeremy, could you please educate me why the original version of do_stolen_accounting() (commit f91a8b44) had added the account_steal_time(idle_task(smp_processor_id()), ticks); part? I think it was wrong from the start. In linux-2.6.18-xen, cpu_idle() [arch/x86_64/kernel/process-xen.c] doesn''t seem to bump the idle time counter. So the interrupt handler routine timer_interrupt() [arch/i386/kernel/time-xen.c] has to, after doing the stolen accounting. I suspect this logic was transferred to the pvops kernel superfluously, where cpu_idle() was already handling the idle time accounting. I believe my claim is consistent with the fact that only the xen-specific timer interrupt handlers care directly about idle time in the pvops kernel. I''m convinced the patch is correct, and only the commit message might need a small fix (mentioning cpu_idle()). If the above reasoning is insufficient, whom should I mail directly to confirm/refute/complete it? I tried mingo and tglx in private, but got no answer yet. If you can accept the reasoning, I''ll resend the patch with an updated commit message. Thank you very much, Laszlo _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Laszlo Ersek
2011-Oct-20 15:02 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
On 10/20/11 16:35, Laszlo Ersek wrote:> I''m convinced the patch is correct, and only the commit message might > need a small fix (mentioning cpu_idle()).I forgot to say that I also added counters to xen_timer_interrupt(), account_idle_ticks() (called from cpu_idle()), and the idle time branch of account_process_tick(). (The last one is reached from xen_timer_interrupt() via event_handler == &tick_nohz_handler, after highres=off was passed). When the VM was left alone, they were increasing in strict lock-step. account_idle_time() <- account_idle_ticks() <- tick_nohz_restart_sched_tick() <- cpu_idle() <- account_process_tick() <- update_process_times() <- tick_nohz_handler() [highres=off] <- xen_timer_interrupt() <- (tick_periodic()) <- (tick_sched_timer()) The timer interrupt appears to kick cpu_idle(), and the latter accounts for the time spent idly. Laszlo _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2011-Oct-26 20:52 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
On Thu, Oct 20, 2011 at 05:02:41PM +0200, Laszlo Ersek wrote:> On 10/20/11 16:35, Laszlo Ersek wrote: > > >I''m convinced the patch is correct, and only the commit message might > >need a small fix (mentioning cpu_idle()).Hey Laszlo and Zhenzhong, Rest assured - I haven''t forgotten about the two time patches.. little busy with some of the Fedore Core 16 kernel bugs.> > I forgot to say that I also added counters to xen_timer_interrupt(), > account_idle_ticks() (called from cpu_idle()), and the idle time > branch of account_process_tick(). (The last one is reached from > xen_timer_interrupt() via event_handler == &tick_nohz_handler, after > highres=off was passed). When the VM was left alone, they were > increasing in strict lock-step. > > account_idle_time() > > <- account_idle_ticks() > <- tick_nohz_restart_sched_tick() > <- cpu_idle() > > <- account_process_tick() > <- update_process_times() > <- tick_nohz_handler() [highres=off] > <- xen_timer_interrupt() > > <- (tick_periodic()) > <- (tick_sched_timer()) > > The timer interrupt appears to kick cpu_idle(), and the latter > accounts for the time spent idly. > > Laszlo > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2011-Nov-09 13:35 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
>>> On 18.10.11 at 22:42, Laszlo Ersek <lersek@redhat.com> wrote: > ... because the "clock_event_device framework" already accounts for idle > time through the "event_handler" function pointer in > xen_timer_interrupt(). > > The patch is intended as the completion of [1]. It should fix the double > idle times seen in PV guests'' /proc/stat [2]. It should be orthogonal to > stolen time accounting (the removed code seems to be isolated).After some more looking around I still think it''s incorrect, albeit for a different reason: What tick_nohz_restart_sched_tick() accounts as idle time is *all* time that passed while in cpu_idle(). What gets accounted in do_stolen_accounting() (without your patch) is different: - time the vCPU was in RUNSTATE_blocked gets accounted as idle - time the vCPU was in RUNSTATE_runnable and RUNSTATE_offline gets accounted as stolen. That is, on an overcommitted system (and without your patch) I would expect you to not see the (full) double idle increment for a not fully idle and not fully loaded vCPU. Now we can of course say that for most purposes it''s fine to ignore the difference between idle and steal time, but there must be a reason these have been getting accounted separately in Linux without regard to Xen. So I really think that what needs to be suppressed to address this is tick_nohz_restart_sched_tick()''s call to account_idle_ticks(). But simply forcing CONFIG_VIRT_CPU_ACCOUNTING is not an immediate option (not even when considering a Xen-only kernel), as that has further implications. Instead I wonder whether under Xen we should e.g. update per_cpu(tick_cpu_sched, cpu).idle_jiffies prior to calling tick_nohz_restart_sched_tick() (possibly implicitly by doing the update in do_stolen_accounting(), though that wouldn''t work when the wakeup is due to an interrupt other than the timer one). The alternative of accounting the negative of the steal time as idle time in do_stolen_accounting() doesn''t look correct either, since not all stolen time was necessarily accounted as idle (some would have got - also incorrectly - accounted as process or system time). So my conclusion is that only the full implementation of what CONFIG_VIRT_CPU_ACCOUNTING expects would actually get things sorted out properly (but that would imply Xen *and* native are willing to pay the performance price, or the enabling of this can be made dynamic so that at least native could remain unaffected). Or the negative stolen time should be accounted to the current process, the system, or as idle, depending on the context which do_stolen_accounting() runs in (just like timer_interrupt() did in the XenoLinux kernels for positive accounting). Jan> The approach may be completely misguided. > > [1] https://lkml.org/lkml/2011/10/6/10 > [2] http://lists.xensource.com/archives/html/xen-devel/2010-08/msg01068.html > > Signed-off-by: Laszlo Ersek <lersek@redhat.com> > --- > arch/x86/xen/time.c | 17 ++--------------- > 1 files changed, 2 insertions(+), 15 deletions(-) > > diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c > index 163b467..377f6ae 100644 > --- a/arch/x86/xen/time.c > +++ b/arch/x86/xen/time.c > @@ -36,9 +36,8 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, > xen_runstate); > /* snapshots of runstate info */ > static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); > > -/* unused ns of stolen and blocked time */ > +/* unused ns of stolen time */ > static DEFINE_PER_CPU(u64, xen_residual_stolen); > -static DEFINE_PER_CPU(u64, xen_residual_blocked); > > /* return an consistent snapshot of 64-bit time/counter value */ > static u64 get64(const u64 *p) > @@ -115,7 +114,7 @@ static void do_stolen_accounting(void) > { > struct vcpu_runstate_info state; > struct vcpu_runstate_info *snap; > - s64 blocked, runnable, offline, stolen; > + s64 runnable, offline, stolen; > cputime_t ticks; > > get_runstate_snapshot(&state); > @@ -125,7 +124,6 @@ static void do_stolen_accounting(void) > snap = &__get_cpu_var(xen_runstate_snapshot); > > /* work out how much time the VCPU has not been runn*ing* */ > - blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; > runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; > offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; > > @@ -141,17 +139,6 @@ static void do_stolen_accounting(void) > ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); > __this_cpu_write(xen_residual_stolen, stolen); > account_steal_ticks(ticks); > - > - /* Add the appropriate number of ticks of blocked time, > - including any left-overs from last time. */ > - blocked += __this_cpu_read(xen_residual_blocked); > - > - if (blocked < 0) > - blocked = 0; > - > - ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); > - __this_cpu_write(xen_residual_blocked, blocked); > - account_idle_ticks(ticks); > } > > /* Get the TSC speed from Xen */_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Laszlo Ersek
2011-Nov-09 17:47 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
On 11/09/11 14:35, Jan Beulich wrote:>>>> On 18.10.11 at 22:42, Laszlo Ersek<lersek@redhat.com> wrote: >> ... because the "clock_event_device framework" already accounts for idle >> time through the "event_handler" function pointer in >> xen_timer_interrupt(). >> >> The patch is intended as the completion of [1]. It should fix the double >> idle times seen in PV guests'' /proc/stat [2]. It should be orthogonal to >> stolen time accounting (the removed code seems to be isolated). > > After some more looking around I still think it''s incorrect, albeit for > a different reason: What tick_nohz_restart_sched_tick() accounts > as idle time is *all* time that passed while in cpu_idle(). What gets > accounted in do_stolen_accounting() (without your patch) is > different: > - time the vCPU was in RUNSTATE_blocked gets accounted as idle > - time the vCPU was in RUNSTATE_runnable and RUNSTATE_offline > gets accounted as stolen. > > That is, on an overcommitted system (and without your patch) I > would expect you to not see the (full) double idle increment for a not > fully idle and not fully loaded vCPU.I tried to verify this with an experiment. Please examine if the experiment is bogus or not. On a four-PCPU host (hyperthreading off, RHEL-5.7+ hypervisor & dom0) I started three virtual machines: VM1: four VCPUs, four processes running a busy loop each, independently. VM2: ditto VM3: single VCPU running the attached program (which otherwise puts 1/2 load on a single CPU, virtual or physical.) OS is RHEL-6.1. In VM3, I also ran this script: $ grep cpu0 /proc/stat; sleep 20; grep cpu0 /proc/stat cpu0 10421 0 510 119943 608 0 1 122 0 cpu0 11420 0 510 121942 608 0 1 126 0 The difference in the fourth numerical column is still 1999, even though only 10 seconds of those 20 were spent idly. Does the experiment miss the point (or do I), or does this disprove the idea? (Interestingly, according to virt-manager, the load distribution between the VMs looked like: VM1: 7/16 = 43.75% VM2: 7/16 = 43.75% VM3: 2/16 = 1/8 = 12.50% as if VM3''s load had been first extracted and the rest split between VM1 and VM2. When I stop VM1 and VM2, VM3 stays at 12.5%. Under the above load, I would have expected: VM1: 8/17 ~= 47.06% VM2: 8/17 ~= 47.06% VM3: 1/17 ~= 5.88% ie. "eight and half" VCPUs sharing the host evenly. Could this have any relevance?) Thank you Laszlo _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2011-Nov-10 08:32 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
>>> On 09.11.11 at 18:47, Laszlo Ersek <lersek@redhat.com> wrote: > On 11/09/11 14:35, Jan Beulich wrote: >> That is, on an overcommitted system (and without your patch) I >> would expect you to not see the (full) double idle increment for a not >> fully idle and not fully loaded vCPU. > > I tried to verify this with an experiment. Please examine if the > experiment is bogus or not. > > On a four-PCPU host (hyperthreading off, RHEL-5.7+ hypervisor & dom0) I > started three virtual machines: > > VM1: four VCPUs, four processes running a busy loop each, independently. > VM2: ditto > VM3: single VCPU running the attached program (which otherwise puts 1/2 > load on a single CPU, virtual or physical.) OS is RHEL-6.1. > > In VM3, I also ran this script: > > $ grep cpu0 /proc/stat; sleep 20; grep cpu0 /proc/stat > cpu0 10421 0 510 119943 608 0 1 122 0 > cpu0 11420 0 510 121942 608 0 1 126 0 > > The difference in the fourth numerical column is still 1999, even though > only 10 seconds of those 20 were spent idly. > > Does the experiment miss the point (or do I), or does this disprove the > idea?For one, my expectation may be wrong (while I think the consideration of the accounting still being wrong even with the patch is correct). Second, the amount of stolen time (presumably the second to last column; not sure what kernel version RHEL-6.1 uses, so I can''t immediately verify) being just 4 is certainly too small to be relevant (meaning that VM3''s only vCPU got scheduled almost instantly in too many cases, which I think is the intended behavior of the credit scheduler in a contrived environment like this). To get the amount of stolen time up, I think one would have to penalize VM3 (so that it doesn''t benefit from not having been scheduled for 100ms each time). I don''t, however, know how to achieve that in practice. One question certainly possible to answer is whether, with your patch and across different (over-)load scenarios, process, system, idle and steal times add up to wall time, which I don''t think would be the case. Which isn''t to say that they do without your patch, just that it addresses only part of a wider issue. Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2011-Nov-10 18:05 UTC
Re: [Xen-devel] [PATCH] remove blocked time accounting from xen "clockchip"
On 11/09/2011 05:35 AM, Jan Beulich wrote:>>>> On 18.10.11 at 22:42, Laszlo Ersek <lersek@redhat.com> wrote: >> ... because the "clock_event_device framework" already accounts for idle >> time through the "event_handler" function pointer in >> xen_timer_interrupt(). >> >> The patch is intended as the completion of [1]. It should fix the double >> idle times seen in PV guests'' /proc/stat [2]. It should be orthogonal to >> stolen time accounting (the removed code seems to be isolated). > After some more looking around I still think it''s incorrect, albeit for > a different reason: What tick_nohz_restart_sched_tick() accounts > as idle time is *all* time that passed while in cpu_idle(). What gets > accounted in do_stolen_accounting() (without your patch) is > different: > - time the vCPU was in RUNSTATE_blocked gets accounted as idle > - time the vCPU was in RUNSTATE_runnable and RUNSTATE_offline > gets accounted as stolen. > > That is, on an overcommitted system (and without your patch) I > would expect you to not see the (full) double idle increment for a not > fully idle and not fully loaded vCPU. > > Now we can of course say that for most purposes it''s fine to > ignore the difference between idle and steal time, but there must > be a reason these have been getting accounted separately in Linux > without regard to Xen. > > So I really think that what needs to be suppressed to address this > is tick_nohz_restart_sched_tick()''s call to account_idle_ticks(). But > simply forcing CONFIG_VIRT_CPU_ACCOUNTING is not an immediate > option (not even when considering a Xen-only kernel), as that has > further implications. Instead I wonder whether under Xen we should > e.g. update per_cpu(tick_cpu_sched, cpu).idle_jiffies prior to calling > tick_nohz_restart_sched_tick() (possibly implicitly by doing the > update in do_stolen_accounting(), though that wouldn''t work when > the wakeup is due to an interrupt other than the timer one). > > The alternative of accounting the negative of the steal time as > idle time in do_stolen_accounting() doesn''t look correct either, > since not all stolen time was necessarily accounted as idle (some > would have got - also incorrectly - accounted as process or system > time). > > So my conclusion is that only the full implementation of what > CONFIG_VIRT_CPU_ACCOUNTING expects would actually get > things sorted out properly (but that would imply Xen *and* native > are willing to pay the performance price, or the enabling of this > can be made dynamic so that at least native could remain > unaffected). > > Or the negative stolen time should be accounted to the current > process, the system, or as idle, depending on the context which > do_stolen_accounting() runs in (just like timer_interrupt() did in > the XenoLinux kernels for positive accounting).I was always suspicious of the timer-interrupt-based stolen time accounting code. It''s really a hold-over from when ticks were the main timekeeping mechanism, but with highres timers its massive overkill and probably a source of performance degradation. Overall, the stolen time accounting isn''t really very important. The kernel doesn''t use it at all internally - it doesn''t affect scheduling decisions, for example. It''s only exported in /proc/stat, and some tools like top will display it if its non-zero. It could be useful for diagnosing performance problems on a heavily loaded host system, but other tools like "xentop" would give you as much information. So really, all this code is nice to have, but I''m not sure its worth a lot of time to fix, especially if it makes idle accounting hard (which *is* important). However, that said, PeterZ recently added some code to the scheduler so that time "stolen" by interrupt handling isn''t accounted against the task running at the time, and the design is specifically intended to also be useful for virtualization stolen time as well. There are some KVM patches floating around to implement this, and we should look at doing a Xen implementation. That would be much more practically useful, since (I think) it allows the scheduler to be aware of stolen time and not penalize tasks for time they never got to use. Or at the very least it could give you per-task stolen stats (maybe?) which would be useful. J _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jan Beulich
2011-Dec-21 08:32 UTC
Re: [PATCH] remove blocked time accounting from xen "clockchip"
>>> On 18.10.11 at 22:42, Laszlo Ersek <lersek@redhat.com> wrote: > ... because the "clock_event_device framework" already accounts for idle > time through the "event_handler" function pointer in > xen_timer_interrupt(). > > The patch is intended as the completion of [1]. It should fix the double > idle times seen in PV guests'' /proc/stat [2]. It should be orthogonal to > stolen time accounting (the removed code seems to be isolated).Finally found time to play with this a little myself. As expected, the change brings a significant improvement, but isn''t sufficient to be on par with the forward port kernels not using the clockevent infrastructure (we switched to this only in 2.6.34 and above) when it comes to correct steal time accounting. Specifically, without further adjustments, on a 4:3 over-committed system doing a kernel build, I''m seeing an over-accounting of approximately 4%. I was able to reduce this to slightly above 1% with below (experimental and not 32-bit compatible) change: --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3864,6 +3864,29 @@ return ns; } +#ifndef CONFIG_XEN +#define _cputime_to_cputime64 cputime_to_cputime64 +#else +#define NS_PER_TICK (1000000000 / HZ) +static DEFINE_PER_CPU(u64, stolen_snapshot); +static DEFINE_PER_CPU(unsigned int, stolen_residual); + +static cputime64_t _cputime_to_cputime64(cputime_t t) +{ + //todo not entirely suitable for 32-bit + u64 s = this_cpu_read(runstate.time[RUNSTATE_runnable]); + unsigned long adj = div_u64_rem(s - this_cpu_read(stolen_snapshot) + + this_cpu_read(stolen_residual), + NS_PER_TICK, + &__get_cpu_var(stolen_residual)); + + this_cpu_write(stolen_snapshot, s); + t = cputime_sub(t, jiffies_to_cputime(adj)); + + return cputime_le(t, cputime_max) ? cputime_to_cputime64(t) : 0; +} +#endif + /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -3882,7 +3905,7 @@ account_group_user_time(p, cputime); /* Add user time to cpustat. */ - tmp = cputime_to_cputime64(cputime); + tmp = _cputime_to_cputime64(cputime); if (TASK_NICE(p) > 0) cpustat->nice = cputime64_add(cpustat->nice, tmp); else @@ -3934,7 +3957,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, cputime_t cputime_scaled, cputime64_t *target_cputime64) { - cputime64_t tmp = cputime_to_cputime64(cputime); + cputime64_t tmp = _cputime_to_cputime64(cputime); /* Add system time to process. */ p->stime = cputime_add(p->stime, cputime); @@ -3996,7 +4019,7 @@ void account_idle_time(cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); + cputime64_t cputime64 = _cputime_to_cputime64(cputime); struct rq *rq = this_rq(); if (atomic_read(&rq->nr_iowait) > 0) @@ -4033,7 +4056,7 @@ struct rq *rq) { cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + cputime64_t tmp = _cputime_to_cputime64(cputime_one_jiffy); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; if (irqtime_account_hi_update()) { I currently have no idea what the remain 1% could be attributed to, so thoughts from others would certainly be welcome. Jan
Laszlo Ersek
2011-Dec-21 13:53 UTC
Re: [PATCH] remove blocked time accounting from xen "clockchip"
On 12/21/11 09:32, Jan Beulich wrote:> Specifically, without further adjustments, on a 4:3 over-committed > system doing a kernel build, I''m seeing an over-accounting of > approximately 4%. I was able to reduce this to slightly above 1% > with below (experimental and not 32-bit compatible) change: > > --- a/kernel/sched.c > +++ b/kernel/sched.c > @@ -3864,6 +3864,29 @@ > return ns; > } > > +#ifndef CONFIG_XEN > +#define _cputime_to_cputime64 cputime_to_cputime64 > +#else > +#define NS_PER_TICK (1000000000 / HZ) > +static DEFINE_PER_CPU(u64, stolen_snapshot); > +static DEFINE_PER_CPU(unsigned int, stolen_residual); > + > +static cputime64_t _cputime_to_cputime64(cputime_t t) > +{ > + //todo not entirely suitable for 32-bit > + u64 s = this_cpu_read(runstate.time[RUNSTATE_runnable]); > + unsigned long adj = div_u64_rem(s - this_cpu_read(stolen_snapshot) > + + this_cpu_read(stolen_residual), > + NS_PER_TICK, > + &__get_cpu_var(stolen_residual)); > + > + this_cpu_write(stolen_snapshot, s); > + t = cputime_sub(t, jiffies_to_cputime(adj)); > + > + return cputime_le(t, cputime_max) ? cputime_to_cputime64(t) : 0; > +} > +#endif > +Why is this not entirely suitable for 32-bit? div_u64_rem() indeed returns an u64, but for the truncation to do actual damage, the retval (which is the number of ticks that was stolen from the VCPU) would have to be greater than about 4*10^9, which seems improbable with 1 msec long ticks. Also cputime_sub() only depends, in order to leave any underflow detectable, on adj <= cputime_max (cca. 2*10^9). I''m likely looking in the wrong place though.> /* > * Account user cpu time to a process. > * @p: the process that the cpu time gets accounted to > @@ -3882,7 +3905,7 @@ > account_group_user_time(p, cputime); > > /* Add user time to cpustat. */ > - tmp = cputime_to_cputime64(cputime); > + tmp = _cputime_to_cputime64(cputime); > if (TASK_NICE(p)> 0) > cpustat->nice = cputime64_add(cpustat->nice, tmp); > else > @@ -3934,7 +3957,7 @@ > void __account_system_time(struct task_struct *p, cputime_t cputime, > cputime_t cputime_scaled, cputime64_t *target_cputime64) > { > - cputime64_t tmp = cputime_to_cputime64(cputime); > + cputime64_t tmp = _cputime_to_cputime64(cputime); > > /* Add system time to process. */ > p->stime = cputime_add(p->stime, cputime); > @@ -3996,7 +4019,7 @@ > void account_idle_time(cputime_t cputime) > { > struct cpu_usage_stat *cpustat =&kstat_this_cpu.cpustat; > - cputime64_t cputime64 = cputime_to_cputime64(cputime); > + cputime64_t cputime64 = _cputime_to_cputime64(cputime); > struct rq *rq = this_rq(); > > if (atomic_read(&rq->nr_iowait)> 0) > @@ -4033,7 +4056,7 @@ > struct rq *rq) > { > cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); > - cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); > + cputime64_t tmp = _cputime_to_cputime64(cputime_one_jiffy); > struct cpu_usage_stat *cpustat =&kstat_this_cpu.cpustat; > > if (irqtime_account_hi_update()) { > > I currently have no idea what the remain 1% could be attributed to, > so thoughts from others would certainly be welcome.What about irqtime_account_process_tick() calling account_idle_time() with "cputime_one_jiffy" -- it could more frequently trigger the underflow in _cputime_to_cputime64(). In that case we might want to decrease idle time (ie. account "negative" cputime against idle), but can''t. As always, apologies if what I wrote is painfully wrong. Laszlo
Jan Beulich
2011-Dec-21 14:58 UTC
Re: [PATCH] remove blocked time accounting from xen "clockchip"
>>> On 21.12.11 at 14:53, Laszlo Ersek <lersek@redhat.com> wrote: > On 12/21/11 09:32, Jan Beulich wrote: > >> Specifically, without further adjustments, on a 4:3 over-committed >> system doing a kernel build, I''m seeing an over-accounting of >> approximately 4%. I was able to reduce this to slightly above 1% >> with below (experimental and not 32-bit compatible) change: >> >> --- a/kernel/sched.c >> +++ b/kernel/sched.c >> @@ -3864,6 +3864,29 @@ >> return ns; >> } >> >> +#ifndef CONFIG_XEN >> +#define _cputime_to_cputime64 cputime_to_cputime64 >> +#else >> +#define NS_PER_TICK (1000000000 / HZ) >> +static DEFINE_PER_CPU(u64, stolen_snapshot); >> +static DEFINE_PER_CPU(unsigned int, stolen_residual); >> + >> +static cputime64_t _cputime_to_cputime64(cputime_t t) >> +{ >> + //todo not entirely suitable for 32-bit >> + u64 s = this_cpu_read(runstate.time[RUNSTATE_runnable]); >> + unsigned long adj = div_u64_rem(s - this_cpu_read(stolen_snapshot) >> + + this_cpu_read(stolen_residual), >> + NS_PER_TICK, >> + &__get_cpu_var(stolen_residual)); >> + >> + this_cpu_write(stolen_snapshot, s); >> + t = cputime_sub(t, jiffies_to_cputime(adj)); >> + >> + return cputime_le(t, cputime_max) ? cputime_to_cputime64(t) : 0; >> +} >> +#endif >> + > > Why is this not entirely suitable for 32-bit? div_u64_rem() indeed > returns an u64, but for the truncation to do actual damage, the retval > (which is the number of ticks that was stolen from the VCPU) would have > to be greater than about 4*10^9, which seems improbable with 1 msec long > ticks. Also cputime_sub() only depends, in order to leave any underflow > detectable, on adj <= cputime_max (cca. 2*10^9). I''m likely looking in > the wrong place though.No, this is not about overflows. The very first this_cpu_read() is what is problematic on 64-bit - it reads from space updated by the hypervisor, and hence reading in two halves may get interrupted by an update. The solution is to use cmpxchg8b here, but I only just now put that part together.>> /* >> * Account user cpu time to a process. >> * @p: the process that the cpu time gets accounted to >> @@ -3882,7 +3905,7 @@ >> account_group_user_time(p, cputime); >> >> /* Add user time to cpustat. */ >> - tmp = cputime_to_cputime64(cputime); >> + tmp = _cputime_to_cputime64(cputime); >> if (TASK_NICE(p)> 0) >> cpustat->nice = cputime64_add(cpustat->nice, tmp); >> else >> @@ -3934,7 +3957,7 @@ >> void __account_system_time(struct task_struct *p, cputime_t cputime, >> cputime_t cputime_scaled, cputime64_t *target_cputime64) >> { >> - cputime64_t tmp = cputime_to_cputime64(cputime); >> + cputime64_t tmp = _cputime_to_cputime64(cputime); >> >> /* Add system time to process. */ >> p->stime = cputime_add(p->stime, cputime); >> @@ -3996,7 +4019,7 @@ >> void account_idle_time(cputime_t cputime) >> { >> struct cpu_usage_stat *cpustat =&kstat_this_cpu.cpustat; >> - cputime64_t cputime64 = cputime_to_cputime64(cputime); >> + cputime64_t cputime64 = _cputime_to_cputime64(cputime); >> struct rq *rq = this_rq(); >> >> if (atomic_read(&rq->nr_iowait)> 0) >> @@ -4033,7 +4056,7 @@ >> struct rq *rq) >> { >> cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); >> - cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); >> + cputime64_t tmp = _cputime_to_cputime64(cputime_one_jiffy); >> struct cpu_usage_stat *cpustat =&kstat_this_cpu.cpustat; >> >> if (irqtime_account_hi_update()) { >> >> I currently have no idea what the remain 1% could be attributed to, >> so thoughts from others would certainly be welcome. > > What about irqtime_account_process_tick() calling account_idle_time() > with "cputime_one_jiffy" -- it could more frequently trigger the > underflow in _cputime_to_cputime64(). In that case we might want to > decrease idle time (ie. account "negative" cputime against idle), but can''t.I don''t think the underflow can actually happen, I just wanted to be safe by checking for it. If the calculation underflowed, it would mean that more time was stolen than was spent in the current state (e.g. idle) prior to the adjustment, which ought to be impossible. Jan
Konrad Rzeszutek Wilk
2012-Jan-19 19:42 UTC
Re: [PATCH] remove blocked time accounting from xen "clockchip"
On Thu, Nov 10, 2011 at 10:05:14AM -0800, Jeremy Fitzhardinge wrote:> On 11/09/2011 05:35 AM, Jan Beulich wrote: > >>>> On 18.10.11 at 22:42, Laszlo Ersek <lersek@redhat.com> wrote: > >> ... because the "clock_event_device framework" already accounts for idle > >> time through the "event_handler" function pointer in > >> xen_timer_interrupt(). > >> > >> The patch is intended as the completion of [1]. It should fix the double > >> idle times seen in PV guests'' /proc/stat [2]. It should be orthogonal to > >> stolen time accounting (the removed code seems to be isolated). > > After some more looking around I still think it''s incorrect, albeit for > > a different reason: What tick_nohz_restart_sched_tick() accounts > > as idle time is *all* time that passed while in cpu_idle(). What gets > > accounted in do_stolen_accounting() (without your patch) is > > different: > > - time the vCPU was in RUNSTATE_blocked gets accounted as idle > > - time the vCPU was in RUNSTATE_runnable and RUNSTATE_offline > > gets accounted as stolen. > > > > That is, on an overcommitted system (and without your patch) I > > would expect you to not see the (full) double idle increment for a not > > fully idle and not fully loaded vCPU. > > > > Now we can of course say that for most purposes it''s fine to > > ignore the difference between idle and steal time, but there must > > be a reason these have been getting accounted separately in Linux > > without regard to Xen. > > > > So I really think that what needs to be suppressed to address this > > is tick_nohz_restart_sched_tick()''s call to account_idle_ticks(). But > > simply forcing CONFIG_VIRT_CPU_ACCOUNTING is not an immediate > > option (not even when considering a Xen-only kernel), as that has > > further implications. Instead I wonder whether under Xen we should > > e.g. update per_cpu(tick_cpu_sched, cpu).idle_jiffies prior to calling > > tick_nohz_restart_sched_tick() (possibly implicitly by doing the > > update in do_stolen_accounting(), though that wouldn''t work when > > the wakeup is due to an interrupt other than the timer one). > > > > The alternative of accounting the negative of the steal time as > > idle time in do_stolen_accounting() doesn''t look correct either, > > since not all stolen time was necessarily accounted as idle (some > > would have got - also incorrectly - accounted as process or system > > time). > > > > So my conclusion is that only the full implementation of what > > CONFIG_VIRT_CPU_ACCOUNTING expects would actually get > > things sorted out properly (but that would imply Xen *and* native > > are willing to pay the performance price, or the enabling of this > > can be made dynamic so that at least native could remain > > unaffected). > > > > Or the negative stolen time should be accounted to the current > > process, the system, or as idle, depending on the context which > > do_stolen_accounting() runs in (just like timer_interrupt() did in > > the XenoLinux kernels for positive accounting). > > I was always suspicious of the timer-interrupt-based stolen time > accounting code. It''s really a hold-over from when ticks were the main > timekeeping mechanism, but with highres timers its massive overkill and > probably a source of performance degradation. > > Overall, the stolen time accounting isn''t really very important. The > kernel doesn''t use it at all internally - it doesn''t affect scheduling > decisions, for example. It''s only exported in /proc/stat, and some > tools like top will display it if its non-zero. It could be useful for > diagnosing performance problems on a heavily loaded host system, but > other tools like "xentop" would give you as much information. > > So really, all this code is nice to have, but I''m not sure its worth a > lot of time to fix, especially if it makes idle accounting hard (which > *is* important). > > However, that said, PeterZ recently added some code to the scheduler so > that time "stolen" by interrupt handling isn''t accounted against the > task running at the time, and the design is specifically intended to > also be useful for virtualization stolen time as well. There are some > KVM patches floating around to implement this, and we should look atI finally got some time to look at them and I think they are these ones: git log --oneline e03b644fe68b1c6401465b02724d261538dba10f..3c404b578fab699c4708279938078d9404b255a4 3c404b5 KVM guest: Add a pv_ops stub for steal time c9aaa89 KVM: Steal time implementation 9ddabbe KVM: KVM Steal time guest/host interface 4b6b35f KVM: Add constant to represent KVM MSRs enabled bit in guest/host interface What is interesting is that they end up inserting a bunch of: + if (steal_account_process_tick()) + return; + in irqtime_account_process_tick and in account_process_tick. The steal_account_process_tick just makes a pv_time_ops.steal_clock based on the asm goto of paravirt_steal_enabled key. I think if we were to persue this we could rip out in the ''do_stolen_accounting'' the call to ''accont_idle_tick'' and ''account_idle_time'' completly. In essence making that function behave as a pv_time_ops.steal_clock implementation. Also that would mean we could remove in the ''xen_timer_interrupt'' function the call to ''do_stolen_accounting'' I think this would mean we would account _only_ for the RUNSTATE_runnable and RUNSTATE_offline as Laszlo''s earlier patch suggested? And the RUNSTATE_blocked would be figured out by the existing mechanism in the sched.c code?> doing a Xen implementation. That would be much more practically useful, > since (I think) it allows the scheduler to be aware of stolen time and > not penalize tasks for time they never got to use. Or at the very least > it could give you per-task stolen stats (maybe?) which would be useful. > > J > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel
Jan Beulich
2012-Jan-20 09:57 UTC
Re: [PATCH] remove blocked time accounting from xen "clockchip"
>>> On 19.01.12 at 20:42, Konrad Rzeszutek Wilk <konrad@darnok.org> wrote: > I finally got some time to look at them and I think they are these ones: > > git log --oneline > e03b644fe68b1c6401465b02724d261538dba10f..3c404b578fab699c4708279938078d9404b > 255a4 > 3c404b5 KVM guest: Add a pv_ops stub for steal time > c9aaa89 KVM: Steal time implementation > 9ddabbe KVM: KVM Steal time guest/host interface > 4b6b35f KVM: Add constant to represent KVM MSRs enabled bit in guest/host > interface > > What is interesting is that they end up inserting a bunch of: > > > + if (steal_account_process_tick()) > + return; > + > > in irqtime_account_process_tick and in account_process_tick.And this (particularly the "return" part of it) is what I have a hard time to understand: How can it be correct to not do any of the other accounting? After all, the function calls only account_steal_time(), but its certainly going to be common that part of the time was stolen, and part was spent executing. Further, it''s being called only from the process tick accounting functions, but clearly part of idle or interrupt time can also be stolen. Jan
Konrad Rzeszutek Wilk
2012-Jan-20 16:00 UTC
Re: [PATCH] remove blocked time accounting from xen "clockchip"
On Fri, Jan 20, 2012 at 09:57:04AM +0000, Jan Beulich wrote:> >>> On 19.01.12 at 20:42, Konrad Rzeszutek Wilk <konrad@darnok.org> wrote: > > I finally got some time to look at them and I think they are these ones: > > > > git log --oneline > > e03b644fe68b1c6401465b02724d261538dba10f..3c404b578fab699c4708279938078d9404b > > 255a4 > > 3c404b5 KVM guest: Add a pv_ops stub for steal time > > c9aaa89 KVM: Steal time implementation > > 9ddabbe KVM: KVM Steal time guest/host interface > > 4b6b35f KVM: Add constant to represent KVM MSRs enabled bit in guest/host > > interface > > > > What is interesting is that they end up inserting a bunch of: > > > > > > + if (steal_account_process_tick()) > > + return; > > + > > > > in irqtime_account_process_tick and in account_process_tick. > > And this (particularly the "return" part of it) is what I have a hard > time to understand: How can it be correct to not do any of the > other accounting? After all, the function calls only > account_steal_time(), but its certainly going to be common that > part of the time was stolen, and part was spent executing. > > Further, it''s being called only from the process tick accountingAlso from ''irqtime_account_idle_ticks'' which is called from account_idle_ticks (if tsc is part of the picture) which is called from tick_nohz_idle_exit. So at the end of the idle loop the idle time is accounted for.> functions, but clearly part of idle or interrupt time can also be > stolen.It looks as if the other interrupt times: so the CPUTIME_SOFTIRQ and CPUTIME_IRQ are completly skipped - but only if there is a "steal time". The ''steal time'' from the KVM is based on the host scheduler notion of ''run_delay''. I think the ''run_delay'' is based purely on block I/O delay or swap I/O delay. So if the host is not running in any of those issues, then the ''steal_account_process_tick'' won''t have any values. And the ''if (..) return;'' wont be taken and it will continue to attribute the other ''time'' slots with appropiate values. If we have CPU intensive guests that are overcommitted, the guest /proc/schedstats won''t show the delay between the host putting it on a CPU as as ''steal'' time but rather as ''idle'' time - I think? That seems odd. I am probably misreading how ''run_delay'' gets computed.
Konrad Rzeszutek Wilk
2012-Jan-23 22:07 UTC
Re: [PATCH] remove blocked time accounting from xen "clockchip"
On Fri, Jan 20, 2012 at 09:57:04AM +0000, Jan Beulich wrote:> >>> On 19.01.12 at 20:42, Konrad Rzeszutek Wilk <konrad@darnok.org> wrote: > > I finally got some time to look at them and I think they are these ones: > > > > git log --oneline > > e03b644fe68b1c6401465b02724d261538dba10f..3c404b578fab699c4708279938078d9404b > > 255a4 > > 3c404b5 KVM guest: Add a pv_ops stub for steal time > > c9aaa89 KVM: Steal time implementation > > 9ddabbe KVM: KVM Steal time guest/host interface > > 4b6b35f KVM: Add constant to represent KVM MSRs enabled bit in guest/host > > interface > > > > What is interesting is that they end up inserting a bunch of: > > > > > > + if (steal_account_process_tick()) > > + return; > > + > > > > in irqtime_account_process_tick and in account_process_tick. > > And this (particularly the "return" part of it) is what I have a hard > time to understand: How can it be correct to not do any of the > other accounting? After all, the function calls only > account_steal_time(), but its certainly going to be common that > part of the time was stolen, and part was spent executing. > > Further, it''s being called only from the process tick accounting > functions, but clearly part of idle or interrupt time can also be > stolen.I took a stab at trying to implement this using this API and basing it on top Laszlo patch.. But I''ve doubts about it too and I haven''t run any benchmarks past booting a guest. commit b1acd2adad821fd87da6941c38f0dbaddd37dc6b Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Date: Mon Jan 23 14:21:20 2012 -0500 xen/time: Use the pv_ops steal_time. In the past we were using do_stolen_account() to update the stolen time and this was done when by calling account_steal_ticks(). The do_stolen_account() was called from xen_timer_interrupt(). The xen_timer_interrupt() would be called periodically (or frequently) depending on what the timer decided. We would piggy-back on the timer IRQ to update the steal time and idle time (fixed by ''remove blocked time accounting from xen "clockchip"'' patch). Instead of piggy-backing on the timer interrupt lets do it via the provided API pv-ops time calls - the steal_clock. Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 5c4a7f8..59cdd1b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -71,14 +71,14 @@ static u64 get64(const u64 *p) /* * Runstate accounting */ -static void get_runstate_snapshot(struct vcpu_runstate_info *res) +static void get_runstate_snapshot(struct vcpu_runstate_info *res, int cpu) { u64 state_time; struct vcpu_runstate_info *state; BUG_ON(preemptible()); - state = &__get_cpu_var(xen_runstate); + state = &per_cpu(xen_runstate, cpu); /* * The runstate info is always updated by the hypervisor on @@ -110,18 +110,18 @@ void xen_setup_runstate_info(int cpu) BUG(); } -static void do_stolen_accounting(void) +static u64 xen_steal_clock(int cpu) { struct vcpu_runstate_info state; struct vcpu_runstate_info *snap; s64 runnable, offline, stolen; - cputime_t ticks; + u64 ticks; - get_runstate_snapshot(&state); + get_runstate_snapshot(&state, cpu); WARN_ON(state.state != RUNSTATE_running); - snap = &__get_cpu_var(xen_runstate_snapshot); + snap = &per_cpu(xen_runstate_snapshot, cpu); /* work out how much time the VCPU has not been runn*ing* */ runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; @@ -138,7 +138,8 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __this_cpu_write(xen_residual_stolen, stolen); - account_steal_ticks(ticks); + + return ticks; } /* Get the TSC speed from Xen */ @@ -377,8 +378,6 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) ret = IRQ_HANDLED; } - do_stolen_accounting(); - return ret; } @@ -439,6 +438,7 @@ void xen_timer_resume(void) static const struct pv_time_ops xen_time_ops __initconst = { .sched_clock = xen_clocksource_read, + .steal_clock = xen_steal_clock, }; static void __init xen_time_init(void) @@ -464,6 +464,9 @@ static void __init xen_time_init(void) xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); + + jump_label_inc(¶virt_steal_enabled); + jump_label_inc(¶virt_steal_rq_enabled); } void __init xen_init_time_ops(void)> > Jan