Here are some patches to help xen run on a system where the CPUs are not synchronized so that the tsc counters drift. Without this patch I get lots of ''time went backwards'' messages in the linux logs. I still get an occasional message, but not the number I did previously. It has been tested on an x86 box with 4 hyper threaded cpus (8 logical), but has not been tested with x86-64 or any other processor. The patches are against today''s xeno-unstable.bk bits. I am still trying to understand why I get any ''time went backwards'' messages, but thought I would post this now to get feedback as it is. --- xeno-unstable.bk/xen/include/public/xen.h.orig 2005-05-23 17:24:21.000000000 -0700 +++ xeno-unstable.bk/xen/include/public/xen.h 2005-05-23 16:37:22.000000000 -0700 @@ -338,6 +338,21 @@ typedef struct } PACKED vcpu_info_t; /* 8 + arch */ /* + * Xen/kernel shared data + * per cpu timing information. + */ +typedef struct time_info_st +{ + u32 time_version1; + u32 time_version2; + tsc_timestamp_t tsc_timestamp; /* TSC at last update */ + u64 system_time; /* time, in nanoseconds, since boot */ + u64 cpu_freq; /* CPU frequency (Hz) */ + u32 wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ + u32 wc_usec; /* Usecs 00:00:00 UTC, Jan 1, 1970. */ +} PACKED time_info_t; + +/* * Xen/kernel shared data -- pointer provided in start_info. * NB. We expect that this struct is smaller than a page. */ @@ -412,6 +427,7 @@ typedef struct shared_info_st arch_shared_info_t arch; + time_info_t vcpu_time[MAX_VIRT_CPUS]; } PACKED shared_info_t; /* --- xeno-unstable.bk/xen/common/schedule.c.orig 2005-05-23 17:25:39.000000000 -0700 +++ xeno-unstable.bk/xen/common/schedule.c 2005-05-23 16:42:24.000000000 -0700 @@ -57,6 +57,7 @@ string_param("sched", opt_sched); /* Various timer handlers. 
*/ static void s_timer_fn(unsigned long unused); static void t_timer_fn(unsigned long unused); +static void tsc_timer_fn(unsigned long unused); static void dom_timer_fn(unsigned long data); /* This is global for now so that private implementations can reach it */ @@ -80,6 +81,7 @@ static struct scheduler ops; /* Per-CPU periodic timer sends an event to the currently-executing domain. */ static struct ac_timer t_timer[NR_CPUS]; +static struct ac_timer tsc_timer[NR_CPUS]; void free_domain_struct(struct domain *d) { @@ -514,6 +516,7 @@ int idle_cpu(int cpu) * Timers: the scheduler utilises a number of timers * - s_timer: per CPU timer for preemption and scheduling decisions * - t_timer: per CPU periodic timer to send timer interrupt to current dom + * - tsc_timer: per CPU periodic timer to update time bases * - dom_timer: per domain timer to specifiy timeout values ****************************************************************************/ @@ -548,6 +551,18 @@ static void t_timer_fn(unsigned long unu add_ac_timer(&t_timer[cpu]); } +/* Periodic tick timer: update time bases for per-cpu timing. 
*/ +static void tsc_timer_fn(unsigned long unused) +{ + unsigned int cpu = current->processor; + + extern void percpu_ticks(void); + percpu_ticks(); + + tsc_timer[cpu].expires = NOW() + MILLISECS(500); + add_ac_timer(&tsc_timer[cpu]); +} + /* Domain timer function, sends a virtual timer interrupt to domain */ static void dom_timer_fn(unsigned long data) { @@ -578,6 +593,11 @@ void __init scheduler_init(void) t_timer[i].cpu = i; t_timer[i].data = 3; t_timer[i].function = &t_timer_fn; + + init_ac_timer(&tsc_timer[i]); + tsc_timer[i].cpu = i; + tsc_timer[i].data = 4; + tsc_timer[i].function = &tsc_timer_fn; } schedule_data[0].idle = &idle0_exec_domain; @@ -609,6 +629,9 @@ void schedulers_start(void) t_timer_fn(0); smp_call_function((void *)t_timer_fn, NULL, 1, 1); + + tsc_timer_fn(0); + smp_call_function((void *)tsc_timer_fn, NULL, 1, 1); } --- xeno-unstable.bk/xen/arch/x86/smpboot.c.orig 2005-05-23 15:33:50.000000000 -0700 +++ xeno-unstable.bk/xen/arch/x86/smpboot.c 2005-05-23 16:41:56.000000000 -0700 @@ -400,6 +400,7 @@ void __init start_secondary(void) extern void percpu_traps_init(void); extern void cpu_init(void); + extern void setup_percpu_time(void); set_current(idle_task[cpu]); set_processor_id(cpu); @@ -421,6 +422,8 @@ void __init start_secondary(void) construct_percpu_idt(cpu); #endif + setup_percpu_time(); + local_flush_tlb(); startup_cpu_idle_loop(); --- xeno-unstable.bk/xen/arch/x86/time.c.orig 2005-05-23 17:25:12.000000000 -0700 +++ xeno-unstable.bk/xen/arch/x86/time.c 2005-05-23 16:42:35.000000000 -0700 @@ -50,6 +50,29 @@ static u64 full_tsc_irq; static s_time_t stime_irq; /* System time at last ''time update'' */ static unsigned long wc_sec, wc_usec; /* UTC time at last ''time update''. 
*/ static rwlock_t time_lock = RW_LOCK_UNLOCKED; +static time_info_t percpu_time_info[NR_CPUS]; + +void percpu_ticks(void) +{ + int cpu = smp_processor_id(); + time_info_t *t = &percpu_time_info[cpu]; + u64 tsc, delta; + u64 quarter = t->cpu_freq >> 2; + + rdtscll(tsc); + delta = tsc - t->tsc_timestamp; + while (delta >= quarter) { + t->wc_usec += 1000000UL / 4; + t->system_time += 1000000000ULL / 4; + t->tsc_timestamp += quarter; + delta -= quarter; + } + + while (t->wc_usec >= 1000000UL) { + t->wc_sec += 1; + t->wc_usec -= 1000000UL; + } +} void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) { @@ -278,20 +301,29 @@ static inline void __update_dom_time(str { struct domain *d = ed->domain; shared_info_t *si = d->shared_info; + time_info_t *dom = &si->vcpu_time[ed->processor]; + time_info_t *xen = &percpu_time_info[smp_processor_id()]; spin_lock(&d->time_lock); si->time_version1++; + dom->time_version1++; wmb(); si->cpu_freq = cpu_freq; + dom->cpu_freq = xen->cpu_freq; si->tsc_timestamp = full_tsc_irq; + dom->tsc_timestamp = xen->tsc_timestamp; si->system_time = stime_irq; + dom->system_time = xen->system_time; si->wc_sec = wc_sec; + dom->wc_sec = xen->wc_sec; si->wc_usec = wc_usec; + dom->wc_usec = xen->wc_usec; wmb(); si->time_version2++; + dom->time_version2++; spin_unlock(&d->time_lock); } @@ -299,8 +331,11 @@ static inline void __update_dom_time(str void update_dom_time(struct exec_domain *ed) { unsigned long flags; + int cpu = smp_processor_id(); - if ( ed->domain->shared_info->tsc_timestamp != full_tsc_irq ) + if ( ed->domain->shared_info->tsc_timestamp != full_tsc_irq + || ed->domain->shared_info->vcpu_time[ed->processor].tsc_timestamp != + percpu_time_info[cpu].tsc_timestamp) { read_lock_irqsave(&time_lock, flags); __update_dom_time(ed); @@ -313,6 +348,7 @@ void do_settime(unsigned long secs, unsi { s64 delta; long _usecs = (long)usecs; + int i; write_lock_irq(&time_lock); @@ -327,6 +363,10 @@ void do_settime(unsigned long secs, unsi 
wc_sec = secs; wc_usec = _usecs; + for (i=0; i<NR_CPUS; i++) { + percpu_time_info[i].wc_sec = wc_sec; + percpu_time_info[i].wc_usec = wc_usec; + } /* Others will pick up the change at the next tick. */ __update_dom_time(current); @@ -336,16 +376,39 @@ void do_settime(unsigned long secs, unsi } +spinlock_t tsc_lock = SPIN_LOCK_UNLOCKED; + +/* + * Time setup for this processor. + */ +void __init setup_percpu_time(void) +{ + unsigned long flags; + unsigned long ticks_per_frac; + int cpu = smp_processor_id(); + + /* only have 1 cpu calibrate at a time */ + spin_lock_irqsave(&tsc_lock, flags); + ticks_per_frac = calibrate_tsc(); + spin_unlock_irqrestore(&tsc_lock, flags); + + if (!ticks_per_frac) + panic("Error calibrating TSC\n"); + percpu_time_info[cpu].cpu_freq = (u64)ticks_per_frac * (u64)CALIBRATE_FRAC; + rdtscll(percpu_time_info[cpu].tsc_timestamp); + percpu_time_info[cpu].system_time = stime_irq; +} + /* Late init function (after all CPUs are booted). */ int __init init_xen_time() { u64 scale; unsigned int cpu_ghz; + int i; cpu_ghz = (unsigned int)(cpu_freq / 1000000000ULL); for ( rdtsc_bitshift = 0; cpu_ghz != 0; rdtsc_bitshift++, cpu_ghz >>= 1 ) continue; - scale = 1000000000LL << (32 + rdtsc_bitshift); scale /= cpu_freq; st_scale_f = scale & 0xffffffff; @@ -358,6 +421,12 @@ int __init init_xen_time() /* Wallclock time starts as the initial RTC time. */ wc_sec = get_cmos_time(); + for (i=0; i<NR_CPUS; i++) { + percpu_time_info[i].wc_sec = wc_sec; + percpu_time_info[i].wc_usec = 0; + percpu_time_info[i].system_time = stime_irq; + percpu_time_info[i].cpu_freq = cpu_freq; // default speed + } printk("Time init:\n"); printk(".... 
cpu_freq: %08X:%08X\n", (u32)(cpu_freq>>32),(u32)cpu_freq); --- xeno-unstable.bk/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/time.c.orig 2005-05-23 17:28:47.000000000 -0700 +++ xeno-unstable.bk/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/time.c 2005-05-23 17:06:18.000000000 -0700 @@ -105,9 +105,13 @@ struct timer_opts *cur_timer = &timer_ts /* These are peridically updated in shared_info, and then copied here. */ u32 shadow_tsc_stamp; +DEFINE_PER_CPU(u32, shadow_tsc_stamp); u64 shadow_system_time; +DEFINE_PER_CPU(u64, shadow_system_time); static u32 shadow_time_version; +DEFINE_PER_CPU(u32, shadow_time_version); static struct timeval shadow_tv; +static DEFINE_PER_CPU(struct timeval, shadow_tv); /* * We use this to ensure that gettimeofday() is monotonically increasing. We @@ -171,23 +175,29 @@ __setup("independent_wallclock", __indep static void __get_time_values_from_xen(void) { shared_info_t *s = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); do { shadow_time_version = s->time_version2; + per_cpu(shadow_time_version, cpu) = s->vcpu_time[cpu].time_version2; rmb(); shadow_tv.tv_sec = s->wc_sec; shadow_tv.tv_usec = s->wc_usec; shadow_tsc_stamp = (u32)s->tsc_timestamp; shadow_system_time = s->system_time; + per_cpu(shadow_tv.tv_sec, cpu) = s->vcpu_time[cpu].wc_sec; + per_cpu(shadow_tv.tv_usec, cpu) = s->vcpu_time[cpu].wc_usec; + per_cpu(shadow_tsc_stamp, cpu) = (u32)s->vcpu_time[cpu].tsc_timestamp; + per_cpu(shadow_system_time, cpu) = s->vcpu_time[cpu].system_time; rmb(); } - while (shadow_time_version != s->time_version1); + while (shadow_time_version != s->time_version1 || per_cpu(shadow_time_version, cpu) != s->vcpu_time[cpu].time_version1); cur_timer->mark_offset(); } #define TIME_VALUES_UP_TO_DATE \ - ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); }) + ({ rmb(); ((per_cpu(shadow_time_version, cpu) == HYPERVISOR_shared_info->vcpu_time[cpu].time_version2) && (shadow_time_version == HYPERVISOR_shared_info->time_version2)); }) 
/* * This version of gettimeofday has microsecond resolution @@ -200,6 +210,7 @@ void do_gettimeofday(struct timeval *tv) unsigned long max_ntp_tick; unsigned long flags; s64 nsec; + int cpu = smp_processor_id(); do { unsigned long lost; @@ -227,7 +238,7 @@ void do_gettimeofday(struct timeval *tv) sec = xtime.tv_sec; usec += (xtime.tv_nsec / NSEC_PER_USEC); - nsec = shadow_system_time - processed_system_time; + nsec = per_cpu(shadow_system_time, cpu) - per_cpu(processed_system_time, cpu); __normalize_time(&sec, &nsec); usec += (long)nsec / NSEC_PER_USEC; @@ -273,6 +284,7 @@ int do_settimeofday(struct timespec *tv) long wtm_nsec; s64 nsec; struct timespec xentime; + int cpu = smp_processor_id(); if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -306,7 +318,7 @@ int do_settimeofday(struct timespec *tv) */ nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - nsec -= (shadow_system_time - processed_system_time); + nsec -= (per_cpu(shadow_system_time, cpu) - per_cpu(processed_system_time, cpu)); __normalize_time(&sec, &nsec); wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); @@ -392,27 +404,25 @@ static inline void do_timer_interrupt(in struct pt_regs *regs) { time_t wtm_sec, sec; - s64 delta, delta_cpu, nsec; + s64 delta_cpu, nsec; long sec_diff, wtm_nsec; int cpu = smp_processor_id(); do { __get_time_values_from_xen(); - delta = delta_cpu = (s64)shadow_system_time + - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC); - delta -= processed_system_time; - delta_cpu -= per_cpu(processed_system_time, cpu); + delta_cpu = (s64)per_cpu(shadow_system_time, cpu) + + ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC) + - per_cpu(processed_system_time, cpu); } while (!TIME_VALUES_UP_TO_DATE); - if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) { + if (unlikely(delta_cpu < 0)) { printk("Timer ISR/%d: Time went backwards: " - "delta=%lld cpu_delta=%lld shadow=%lld " - "off=%lld processed=%lld cpu_processed=%lld\n", - cpu, delta, delta_cpu, 
shadow_system_time, + "cpu_delta=%lld cpu_shadow=%lld " + "off=%lld cpu_processed=%lld\n", + cpu, delta_cpu, per_cpu(shadow_system_time, cpu), ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC), - processed_system_time, per_cpu(processed_system_time, cpu)); for (cpu = 0; cpu < num_online_cpus(); cpu++) printk(" %d: %lld\n", cpu, @@ -420,19 +430,15 @@ static inline void do_timer_interrupt(in return; } - /* System-wide jiffy work. */ - while (delta >= NS_PER_TICK) { - delta -= NS_PER_TICK; - processed_system_time += NS_PER_TICK; - do_timer(regs); - } - /* Local CPU jiffy work. */ while (delta_cpu >= NS_PER_TICK) { delta_cpu -= NS_PER_TICK; per_cpu(processed_system_time, cpu) += NS_PER_TICK; update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING, regs); + /* System-wide jiffy work. */ + if (cpu == 0) + do_timer(regs); } if (cpu != 0) @@ -447,19 +453,19 @@ static inline void do_timer_interrupt(in ((time_status & STA_UNSYNC) != 0) && (xtime.tv_sec > (last_update_from_xen + 60))) { /* Adjust shadow for jiffies that haven''t updated xtime yet. */ - shadow_tv.tv_usec -= + per_cpu(shadow_tv.tv_usec, cpu) -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); - HANDLE_USEC_UNDERFLOW(shadow_tv); + HANDLE_USEC_UNDERFLOW(per_cpu(shadow_tv, cpu)); /* * Reset our running time counts if they are invalidated by * a warp backwards of more than 500ms. */ - sec_diff = xtime.tv_sec - shadow_tv.tv_sec; + sec_diff = xtime.tv_sec - per_cpu(shadow_tv.tv_sec, cpu); if (unlikely(abs(sec_diff) > 1) || unlikely(((sec_diff * USEC_PER_SEC) + (xtime.tv_nsec / NSEC_PER_USEC) - - shadow_tv.tv_usec) > 500000)) { + per_cpu(shadow_tv.tv_usec, cpu)) > 500000)) { #ifdef CONFIG_XEN_PRIVILEGED_GUEST last_rtc_update = last_update_to_xen = 0; #endif @@ -467,8 +473,8 @@ static inline void do_timer_interrupt(in } /* Update our unsynchronised xtime appropriately. 
*/ - sec = shadow_tv.tv_sec; - nsec = shadow_tv.tv_usec * NSEC_PER_USEC; + sec = per_cpu(shadow_tv.tv_sec, cpu); + nsec = per_cpu(shadow_tv.tv_usec, cpu) * NSEC_PER_USEC; __normalize_time(&sec, &nsec); wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); @@ -498,7 +504,7 @@ static inline void do_timer_interrupt(in op.cmd = DOM0_SETTIME; op.u.settime.secs = tv.tv_sec; op.u.settime.usecs = tv.tv_usec; - op.u.settime.system_time = shadow_system_time; + op.u.settime.system_time = per_cpu(shadow_system_time, cpu); HYPERVISOR_dom0_op(&op); last_update_to_xen = xtime.tv_sec; @@ -670,7 +676,7 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); processed_system_time = shadow_system_time; - per_cpu(processed_system_time, 0) = processed_system_time; + per_cpu(processed_system_time, 0) = per_cpu(shadow_system_time, 0); if (timer_tsc_init.init(NULL) != 0) BUG(); @@ -759,7 +765,7 @@ void time_resume(void) /* Reset our own concept of passage of system time. */ processed_system_time = shadow_system_time; - per_cpu(processed_system_time, 0) = processed_system_time; + per_cpu(processed_system_time, 0) = per_cpu(shadow_system_time, 0); /* Accept a warp in UTC (wall-clock) time. 
*/ last_seen_tv.tv_sec = 0; @@ -776,7 +782,7 @@ void local_setup_timer(void) do { seq = read_seqbegin(&xtime_lock); - per_cpu(processed_system_time, cpu) = shadow_system_time; + per_cpu(processed_system_time, cpu) = per_cpu(shadow_system_time, cpu); } while (read_seqretry(&xtime_lock, seq)); per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER); --- xeno-unstable.bk/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c.orig 2005-05-23 17:29:10.000000000 -0700 +++ xeno-unstable.bk/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c 2005-05-23 17:04:11.000000000 -0700 @@ -10,6 +10,7 @@ #include <linux/cpufreq.h> #include <linux/string.h> #include <linux/jiffies.h> +#include <linux/percpu.h> #include <asm/timer.h> #include <asm/io.h> @@ -35,8 +36,8 @@ extern spinlock_t i8253_lock; static int use_tsc; -static unsigned long long monotonic_base; -static u32 monotonic_offset; +static DEFINE_PER_CPU(unsigned long long, monotonic_base); +static DEFINE_PER_CPU(u32, monotonic_offset); static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; /* convert from cycles(64bits) => nanoseconds (64bits) @@ -74,8 +75,8 @@ static inline unsigned long long cycles_ */ static unsigned long fast_gettimeoffset_quotient; -extern u32 shadow_tsc_stamp; -extern u64 shadow_system_time; +extern DEFINE_PER_CPU(u32, shadow_tsc_stamp); +extern DEFINE_PER_CPU(u64, shadow_system_time); static unsigned long get_offset_tsc(void) { @@ -86,7 +87,7 @@ static unsigned long get_offset_tsc(void rdtsc(eax,edx); /* .. 
relative to previous jiffy (32 bits is enough) */ - eax -= shadow_tsc_stamp; + eax -= per_cpu(shadow_tsc_stamp, smp_processor_id()); /* * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient @@ -110,12 +111,13 @@ static unsigned long long monotonic_cloc { unsigned long long last_offset, this_offset, base; unsigned seq; + int cpu = smp_processor_id(); /* atomically read monotonic base & last_offset */ do { seq = read_seqbegin(&monotonic_lock); - last_offset = monotonic_offset; - base = monotonic_base; + last_offset = per_cpu(monotonic_offset, cpu); + base = per_cpu(monotonic_base, cpu); } while (read_seqretry(&monotonic_lock, seq)); /* Read the Time Stamp Counter */ @@ -152,11 +154,12 @@ unsigned long long sched_clock(void) static void mark_offset_tsc(void) { + int cpu = smp_processor_id(); /* update the monotonic base value */ write_seqlock(&monotonic_lock); - monotonic_base = shadow_system_time; - monotonic_offset = shadow_tsc_stamp; + per_cpu(monotonic_base, cpu) = per_cpu(shadow_system_time, cpu); + per_cpu(monotonic_offset, cpu) = per_cpu(shadow_tsc_stamp, cpu); write_sequnlock(&monotonic_lock); } -- Don Fry brazilnut@us.ibm.com _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
> Here are some patches to help xen run on a system where the > CPUs are not synchronized so that the tsc counters drift. > Without this patch I get lots of ''time went backwards'' > messages in the linux logs. I still get an occasional > message, but not the number I did previously.Don, This is looking good. To help other people review the patch, it might be a good idea to post some of the design discussion we had off list as I think the approach will be new to most people. (Perhaps put some of the text in a comment in the hypervisor interface). As regards the time going backwards messages, if you''re seeing small -ve deltas, I''m not surprised -- you need to round to some precision as we won''t be nanosecond accurate. Experience suggests we''ll be good for a few 10''s of ns with any kind of decent crystal. We could round to e.g. 512ns or 1024ns to make sure. Best, Ian _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Tue, May 24, 2005 at 02:20:36AM +0100, Ian Pratt wrote:> > Don, > > This is looking good. To help other people review the patch, it might be > a good idea to post some of the design discussion we had off list as I > think the approach will be new to most people. (Perhaps put some of the > text in a comment in the hypervisor interface). > > As regards the time going backwards messages, if you''re seeing small -ve > deltas, I''m not surprised -- you need to round to some precision as we > won''t be nanosecond accurate. Experience suggests we''ll be good for a > few 10''s of ns with any kind of decent crystal. We could round to e.g. > 512ns or 1024ns to make sure. > > Best, > Ian >I am including the email that we exchanged off-list. I started to edit it, but decided that something I thought wasn''t important, others would find vital, so I include all the email. The time going backwards was only occasionally, and it was a BIG jump backwards. I tracked it down yesterday to a problem with doing 32-bit arithmetic in Linux on the tsc values. For some reason, every 5-20 minutes xen seems to pause for about 5 seconds. This causes the tsc to wrap if only 32-bits are used, and the ''time went backwards'' message is printed. I changed to use 64-bit tsc deltas and have been running since yesterday afternoon without any ''time went backwards'' messages. I want to do some more cleanup (remove my debugging code) and will post all my changes to the list this afternoon. ----- Forwarded by Don Fry/Beaverton/IBM on 05/26/2005 09:29 AM ----- Bruce Jones/Beaverton/IBM wrote on 04/21/2005 09:07:26 AM:> John, can you provide some additional technical guidance here? > > Ian, Keir: John is the implementor of our Linux changes for Summit > and understands these issues better than anyone. > > I''ve added Don to the cc: list but he''s on vacation this week and > not reading email. 
> > -- brucej > > Ian Pratt <Ian.Pratt@cl.cam.ac.uk> wrote on 04/20/2005 05:42:47 PM: > > > > "Ian Pratt" <m+Ian.Pratt@cl.cam.ac.uk> wrote on 04/20/2005 04:47:44 PM: > > > > Please could Don write a paragraph explaining why cyclone timer support > > > > is useful. Do summit systems have different frequency CPUs in the same > > > > box? > > > Bruce writes: > > > I can write that paragraph myself. IBM''s high end xSeries systems are > > > NUMA systems, each node is a separate machine with it''s own front side > > > bus, I/O buses, etc... The chipset provides a cache-coherent interconnect > > > to allow them to be cabled together into one big system. > > > > OK, so even the FSB clocks come from different crystals. > > Yes, and the hardware intentionally skews their frequencies, for reasons > only the chipset designers understand. :) > > > > We had a boatload of problems with Linux when we first shipped it, with > > > time moving around forward and backward for applications. The processors > > > in the various nodes run at different frequencies and the on-processor > > > timers do not run in sync. We needed to modify Linux to use a system-wide > > > timer. Our chipset (code-named Cyclone) provides one, for newer systems > > > Intel has defined the HPET that we can use. We need to make similar > > > changes to Xen. > > > > This needs some agreement on the design. > > > > My gut feeling is that it should still be possible for guests to use > > the TSC to calculate the time offset relative to the published > > Xen system time record (which is updated every couple of > > seconds). The TSC calibration should be good enough to mean that > > the relative drift over the period between records is tiny (and > > errors can''t accumulate beyond the period). > > My gut feeling is that your gut feeling is wrong. We can''t ever > use the TSC on these systems - even a tiny amount of relative drift > causes problems. > > But I''m no expert. John, this is your cue. Please join in. 
> > > The ''TSC when time record created'' and ''TSC frequency'' will have > > to be per VCPU and updated to reflect the real CPU that the VCPU > > is running on. > > As long as these are virtual and not read using the readTSC instruction, > we may be OK. > > > > > Ian > > > > > >----- Forwarded by Don Fry/Beaverton/IBM on 05/26/2005 09:29 AM ----- "Ian Pratt" <m+Ian.Pratt@cl.cam.ac.uk> wrote on 04/21/2005 09:24:54 AM:> > Yes, and the hardware intentionally skews their frequencies, > > for reasons only the chipset designers understand. :) > > It''s to be sneaky as regards FCC EMC emissions regulations. > > Some systems even modulate the PCI bus frequency. > > > > My gut feeling is that it should still be possible for > > guests to use > > > the TSC to calculate the time offset relative to the published Xen > > > system time record (which is updated every couple of > > seconds). The TSC > > > calibration should be good enough to mean that the relative > > drift over > > > the period between records is tiny (and errors can''t > > accumulate beyond > > > the period). > > > > My gut feeling is that your gut feeling is wrong. We can''t > > ever use the TSC on these systems - even a tiny amount of > > relative drift causes problems. > > It depends on the crystal stability, the accuracy with which the > calibration is done, and the frequency of publishing new absoloute time > records. > > The latter can be made quite frequent if need be. > > I''d much prefer avoiding having to expose linux to the HPET/cyclone by > hiding it Xen, and having the guest use TSC extrapolation from the the > time record published by Xen. > We''d just need to update the current interface to have per-CPU records > (and TSC frequency calibration). > > > But I''m no expert. John, this is your cue. Please join in. > > > > > The ''TSC when time record created'' and ''TSC frequency'' will > > have to be > > > per VCPU and updated to reflect the real CPU that the VCPU > > is running > > > on. 
> > > > As long as these are virtual and not read using the readTSC > > instruction, we may be OK. > > Using readTSC should be fine, since we''re only using it to extrapolate > from the last Xen supplied time record, and we''ve calibrated the > frequency of the particular CPU we''re running on. We only have to worry > about rapid clock drift due to sudden temperature changes etc, but even > then we can just update the calibration frequency periodically. Using > this approach we get to keep gettimeofday very fast, and avoid > complicating the hypervisor API -- it''s exactly what we need for > migrating a domain between physical servers with different frequency > CPUs. > > Ian >----- Forwarded by Don Fry/Beaverton/IBM on 05/26/2005 09:29 AM ----- "Ian Pratt" <m+Ian.Pratt@cl.cam.ac.uk> wrote on 04/21/2005 01:12:51 PM:> > First, forgive my lack of knowledge about Xen. Since I don''t > > know the details of what you''re proposing, let me make a > > straw-man and let you correct my assumptions. > > > > Lets say you''re proposing that time be calculated with the > > following formula: > > > > timefoday = xen_time_base + rdtsc() - xen_last_tsc[CPUNUM] > > > > Given a guest domain with two cpus, the issue is managing > > xen_last_tsc[] and xen_time_base. For the equation to work, > > xen_last_tsc[0] must hold the TSC value from CPU0 at exactly > > the time stored in xen_time_base. Additionally the same is > > true with xen_las_tsc[1]. > > I''m proposing: > > timeofday = round_to_precision( last_xen_time_base[VCPU] + > ( rdtsc() - last_xen_tsc[VCPU] ) * xen_tsc_calibrate[VCPU] > ) > > We update last_xen_time_base and last_xen_tsc on each CPU every second > or so. > xen_tsc_calibrate is calculated for each CPU at start of day. For > completeness, we could recalculate the calibration every 30s or so to > cope with crystal temperature drift if we wanted ultimate precision. 
> > > The difficult question is how do you ensure that the two > > values in xen_last_tsc[] are linked with the time in > > xen_time_base? This requires reading the TSC on two cpus at > > the exact same time. Additionally, this sync point must > > happen frequently enough so that the continuing drift between > > cpus isn''t noticed. > > Nope, we would set the time_base on each CPU independently, but relative > to the same timer. > This could be the cyclone, HPET, or even the PIT if its possible to read > the same PIT from any node (though I''m guessing you probably have a PIT > per node and can''t read the remote one). > > > Then you''ll have to weigh that solution against just using an > > alternate global timesource like HPET/Cyclone. > > I''d prefer to avoid this as it would mean that there''d be a different > hypervisor API for guests on cylcone/hpet systems vs. normal synchronous > CPU systems. > Using the TSC will probably give a lower cost gettimeofday, we can also > trap it and emulate if we want to lie to guests about the progress of > time. > > Best, > Ian > > > > > >----- Forwarded by Don Fry/Beaverton/IBM on 05/26/2005 09:29 AM ----- John Stultz/Beaverton/IBM wrote on 04/21/2005 01:49:54 PM:> I''m just resending this with proper addresses as something got futzed upin the CC list on that last mail.> > "Ian Pratt" <m+Ian.Pratt@cl.cam.ac.uk> wrote on 04/21/2005 01:12:51 PM: > > > > First, forgive my lack of knowledge about Xen. Since I don''t > > > know the details of what you''re proposing, let me make a > > > straw-man and let you correct my assumptions. > > > > > > Lets say you''re proposing that time be calculated with the > > > following formula: > > > > > > timefoday = xen_time_base + rdtsc() - xen_last_tsc[CPUNUM] > > > > > > Given a guest domain with two cpus, the issue is managing > > > xen_last_tsc[] and xen_time_base. 
For the equation to work, > > > xen_last_tsc[0] must hold the TSC value from CPU0 at exactly > > > the time stored in xen_time_base. Additionally the same is > > > true with xen_las_tsc[1]. > > > > I''m proposing: > > > > timeofday = round_to_precision( last_xen_time_base[VCPU] + > > ( rdtsc() - last_xen_tsc[VCPU] ) * xen_tsc_calibrate[VCPU] > > ) > > > > We update last_xen_time_base and last_xen_tsc on each CPU every second > > or so. > > Or possibly more frequently, as on a 4Ghz cpu the 32bit TSC will wrapeach second. Alternatively you could use the full 64bits.> > > xen_tsc_calibrate is calculated for each CPU at start of day. For > > completeness, we could recalculate the calibration every 30s or so to > > cope with crystal temperature drift if we wanted ultimate precision. > > > > > The difficult question is how do you ensure that the two > > > values in xen_last_tsc[] are linked with the time in > > > xen_time_base? This requires reading the TSC on two cpus at > > > the exact same time. Additionally, this sync point must > > > happen frequently enough so that the continuing drift between > > > cpus isn''t noticed. > > > > Nope, we would set the time_base on each CPU independently, butrelative> > to the same timer.> Hmmm. That sounds like it could work. Just be sure that preempt won''tbite you in the timeofday calculation. The bit about still using the cyclone/HPET to sync the different xen_time_base[] values is the real key.> > > This could be the cyclone, HPET, or even the PIT if its possible to read > > the same PIT from any node (though I''m guessing you probably have a PIT > > per node and can''t read the remote one).> The ioport space is unified by the BIOS so there is one global PIT sharedby all nodes. 
Although as you''ll need a continuous timesource that doesn''t loop between xen_time_base updates, the PIT would not work.> > thanks > -john----- Forwarded by Don Fry/Beaverton/IBM on 05/26/2005 09:29 AM ----- "Ian Pratt" <m+Ian.Pratt@cl.cam.ac.uk> wrote on 04/28/2005 07:08:05 PM:> > > First I apologize for not being involved in this email > > exchange last week. > > I am also just learning about Xen so my questions may be > > obvious to others. > > > > What is the last_xen_time_base referred to in Ian''s email? Is > > this the stime_irq or wc_sec,wc_usec or something else? > > I was referring to the wc_ wall clock and system time values. > We''ll need to make these per VPU, or perhaps slightly more cleanly, > store an offset in ns for each VCPU. > > > When would the last_xen_tsc[VCPU] values be captured by Xen? > > Currently, the tsc for cpu 0 is obtained during > > timer_interrupt as full_tsc_irq. > > These just need to be captured periodically on each real CPU -- every > couple of seconds would do it, though more frequently woulnd''t hurt. > > > When updating the domain''s shared_info structure mapping the > > physical CPU to the domain''s view of the CPU would need to be > > done. For example if domain2 was running on CPU3 and CPU2 and > > the domain''s view was cpu0 and cpu1, the saved tsc value for > > CPU3 would be copied to last_xen_tsc[0] and CPU2 to > > last_xen_tsc[1] before sending the interrupt to the domain. > > Yep, this shouldn''t be hard -- there''s already some code to spot when > they need to be updated. > > > From the last algorithm from Ian, I don''t see anything that > > refers to the Cyclone/HPET value directly. Is that because > > Xen is the only thing that reads the Cyclone/HPET counter and > > the domain just uses the TSC? > > Yep, we don''t want to expose the cyclone/hpet to guests. There''s no > need, and it would have implications for migrating VMs between different > systems. 
> > Strictly speaking, Xen wouldn't even need support for the hpet/cyclone > as it could just use the shared PIT, though I have no objection to > adding such support. > > Are you happy with this design? It's a little more work, but I believe > better in the long run. We need to get the hypervisor interface change > incorporated ASAP. > > Cheers, > Ian ----- Forwarded by Don Fry/Beaverton/IBM on 05/26/2005 09:29 AM ----- "Ian Pratt" <m+Ian.Pratt@cl.cam.ac.uk> wrote on 04/30/2005 12:04:57 AM: > > It sounds like the per-cpu changes should be sufficient. > > > > Having a time base and ns deltas for each CPU sounds > > interesting, but wouldn't you have to do a subtraction to > > generate the delta in Xen, and then add it back in, in the > > domain? Just saving the per-cpu value would save the extra > > add and subtract. > > Sure, but the add/subtract won't cost much, and it saves some space in > the shared info page, which might be an issue if we have a lot of VCPUs. > > Not a big deal either way. > > > The bottom line is that it can all be done with the TSC, > > without needing to use the Cyclone or HPET hardware, which > > isn't available on all systems like the TSC. > > Great, we're in agreement. I think the first stage is just to do the per > [V]CPU calibration and time vals. Could you work something up? > > Thanks, > Ian -- Don Fry brazilnut@us.ibm.com _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 26 May 2005, at 18:34, Don Fry wrote: > The time going backwards was only occasionally, and it was a BIG jump > backwards. I tracked it down yesterday to a problem with doing 32-bit > arithmetic in Linux on the tsc values. For some reason, every 5-20 > minutes xen seems to pause for about 5 seconds. This causes the tsc to > wrap if only 32-bits are used, and the 'time went backwards' message is > printed. I changed to use 64-bit tsc deltas and have been running > since > yesterday afternoon without any 'time went backwards' messages. I want > to do some more cleanup (remove my debugging code) and will post all my > changes to the list this afternoon. It would be good to know where those 5 seconds go. Doing 64-bit arithmetic on x86 will be slower and possibly fudge over the underlying problem. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Don Fry wrote: > The time going backwards was only occasionally, and it was a BIG jump > backwards. I tracked it down yesterday to a problem with doing 32-bit > arithmetic in Linux on the tsc values. For some reason, every 5-20 > minutes xen seems to pause for about 5 seconds. This causes the tsc to > wrap if only 32-bits are used, and the 'time went backwards' message is What happens when there is activity on the system? Do you still see the pause? 5 seconds is an inordinately long time, and will impact networking if it happens during normal workloads too. thanks, Nivedita _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser wrote: > On 26 May 2005, at 18:34, Don Fry wrote: >> The time going backwards was only occasionally, and it was a BIG jump >> backwards. I tracked it down yesterday to a problem with doing 32-bit >> arithmetic in Linux on the tsc values. For some reason, every 5-20 >> minutes xen seems to pause for about 5 seconds. >> <snip> Have you noticed this directly in any doms, or just in xen? > It would be good to know where those 5 seconds go. See: UFOs, abductions, etc... ;) -- Andrew Thompson http://aktzero.com/ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
I am adding some debugging to figure this out. Right now, I have nothing running on the system except dom0. Xwindows is not running. The system has 8 cpus, (4 HT) and I see this on dom0 when it is compiled with and without SMP support. The only thing running on dom0 is tail -f /var/log/messages and that is very quiet. For example, between 11:34:01 and 11:54:01 I had a 5 second pause occur 3 times. Between 11:59:00 and 12:14:01 it occurred 4 times. Between May 25 16:53:31 and now it has happened about 180 times. On Thu, May 26, 2005 at 11:03:52AM -0700, Nivedita Singhvi wrote: > Don Fry wrote: > > >The time going backwards was only occasionally, and it was a BIG jump > >backwards. I tracked it down yesterday to a problem with doing 32-bit > >arithmetic in Linux on the tsc values. For some reason, every 5-20 > >minutes xen seems to pause for about 5 seconds. This causes the tsc to > >wrap if only 32-bits are used, and the 'time went backwards' message is > > What happens when there is activity on the system? Do you > still see the pause? 5 seconds is an inordinately long time, > and will impact networking if it happens during normal > workloads too.. > > thanks, > Nivedita -- Don Fry brazilnut@us.ibm.com _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
These patches implement per-cpu relative time. They have been tested on 32-bit x86 platforms but not even compiled on x86_64 or ia64. They solve the ''time went backwards'' problems on systems with unsynchronized cpus. Using last week''s xeno-unstable I had no problems with these patches. Using today''s bits, the system will sometimes panic when booting dom0 between ''PCI: IRQ init'' and ''Grant table initialized''. I would like to get wider testing of these changes. Signed-off-by: Don Fry <brazilnut@us.ibm.com> --- xeno-unstable.bk/xen/include/public/xen.h.orig 2005-06-07 09:50:08.000000000 -0700 +++ xeno-unstable.bk/xen/include/public/xen.h 2005-06-07 09:50:08.000000000 -0700 @@ -329,6 +329,21 @@ typedef struct } PACKED vcpu_info_t; /* 8 + arch */ /* + * Xen/kernel shared data + * per cpu timing information. + */ +typedef struct time_info_st +{ + u32 time_version1; + u32 time_version2; + tsc_timestamp_t tsc_timestamp; /* TSC at last update */ + u64 system_time; /* time, in nanoseconds, since boot */ + u64 cpu_freq; /* CPU frequency (Hz) */ + u32 wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ + u32 wc_usec; /* Usecs 00:00:00 UTC, Jan 1, 1970. */ +} PACKED time_info_t; + +/* * Xen/kernel shared data -- pointer provided in start_info. * NB. We expect that this struct is smaller than a page. */ @@ -403,6 +418,7 @@ typedef struct shared_info_st arch_shared_info_t arch; + time_info_t vcpu_time[MAX_VIRT_CPUS]; } PACKED shared_info_t; /* --- xeno-unstable.bk/xen/common/schedule.c.orig 2005-06-07 10:40:50.000000000 -0700 +++ xeno-unstable.bk/xen/common/schedule.c 2005-06-07 11:23:57.000000000 -0700 @@ -53,6 +53,7 @@ string_param("sched", opt_sched); /* Various timer handlers. 
*/ static void s_timer_fn(void *unused); static void t_timer_fn(void *unused); +static void tsc_timer_fn(void *unused); static void dom_timer_fn(void *data); /* This is global for now so that private implementations can reach it */ @@ -76,6 +77,7 @@ static struct scheduler ops; /* Per-CPU periodic timer sends an event to the currently-executing domain. */ static struct ac_timer t_timer[NR_CPUS]; +static struct ac_timer tsc_timer[NR_CPUS]; void free_domain_struct(struct domain *d) { @@ -487,6 +489,7 @@ int idle_cpu(int cpu) * Timers: the scheduler utilises a number of timers * - s_timer: per CPU timer for preemption and scheduling decisions * - t_timer: per CPU periodic timer to send timer interrupt to current dom + * - tsc_timer: per CPU periodic timer to update time bases * - dom_timer: per domain timer to specifiy timeout values ****************************************************************************/ @@ -516,6 +519,17 @@ static void t_timer_fn(void *unused) set_ac_timer(&t_timer[cpu], NOW() + MILLISECS(10)); } +/* Periodic tick timer: update time bases for per-cpu timing. 
*/ +static void tsc_timer_fn(void *unused) +{ + unsigned int cpu = current->processor; + + extern void percpu_ticks(void); + percpu_ticks(); + + set_ac_timer(&tsc_timer[cpu], NOW() + MILLISECS(250)); +} + /* Domain timer function, sends a virtual timer interrupt to domain */ static void dom_timer_fn(void *data) { @@ -537,6 +551,7 @@ void __init scheduler_init(void) spin_lock_init(&schedule_data[i].schedule_lock); init_ac_timer(&schedule_data[i].s_timer, s_timer_fn, NULL, i); init_ac_timer(&t_timer[i], t_timer_fn, NULL, i); + init_ac_timer(&tsc_timer[i], tsc_timer_fn, NULL, i); } schedule_data[0].curr = idle_task[0]; @@ -566,6 +581,9 @@ void schedulers_start(void) { t_timer_fn(0); smp_call_function((void *)t_timer_fn, NULL, 1, 1); + + tsc_timer_fn(0); + smp_call_function((void *)tsc_timer_fn, NULL, 1, 1); } void dump_runq(unsigned char key) --- xeno-unstable.bk/xen/arch/x86/smpboot.c.orig 2005-06-07 10:41:14.000000000 -0700 +++ xeno-unstable.bk/xen/arch/x86/smpboot.c 2005-06-07 11:23:51.000000000 -0700 @@ -431,6 +431,7 @@ void __init start_secondary(void *unused extern void percpu_traps_init(void); extern void cpu_init(void); + extern void setup_percpu_time(void); set_current(idle_task[cpu]); set_processor_id(cpu); @@ -452,6 +453,7 @@ void __init start_secondary(void *unused setup_secondary_APIC_clock(); enable_APIC_timer(); + setup_percpu_time(); /* * low-memory mappings have been cleared, flush them from --- xeno-unstable.bk/xen/arch/x86/time.c.orig 2005-06-07 10:40:57.000000000 -0700 +++ xeno-unstable.bk/xen/arch/x86/time.c 2005-06-07 11:40:22.000000000 -0700 @@ -49,6 +49,29 @@ static u64 full_tsc_irq; static s_time_t stime_irq; /* System time at last ''time update'' */ static unsigned long wc_sec, wc_usec; /* UTC time at last ''time update''. 
*/ static rwlock_t time_lock = RW_LOCK_UNLOCKED; +static time_info_t percpu_time_info[NR_CPUS]; + +void percpu_ticks(void) +{ + int cpu = smp_processor_id(); + time_info_t *t = &percpu_time_info[cpu]; + u64 tsc, delta; + u64 quarter = t->cpu_freq >> 2; + + rdtscll(tsc); + delta = tsc - t->tsc_timestamp; + while (delta >= quarter) { + t->wc_usec += 1000000UL / 4; + t->system_time += 1000000000ULL / 4; + t->tsc_timestamp += quarter; + delta -= quarter; + } + + while (t->wc_usec > 1000000UL) { + t->wc_sec += 1; + t->wc_usec -= 10000000UL; + } +} void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) { @@ -277,20 +300,29 @@ static inline void __update_dom_time(str { struct domain *d = v->domain; shared_info_t *si = d->shared_info; + time_info_t *dom = &si->vcpu_time[v->processor]; + time_info_t *xen = &percpu_time_info[smp_processor_id()]; spin_lock(&d->time_lock); si->time_version1++; + dom->time_version1++; wmb(); si->cpu_freq = cpu_freq; + dom->cpu_freq = xen->cpu_freq; si->tsc_timestamp = full_tsc_irq; + dom->tsc_timestamp = xen->tsc_timestamp; si->system_time = stime_irq; + dom->system_time = xen->system_time; si->wc_sec = wc_sec; + dom->wc_sec = xen->wc_sec; si->wc_usec = wc_usec; + dom->wc_usec = xen->wc_usec; wmb(); si->time_version2++; + dom->time_version2++; spin_unlock(&d->time_lock); } @@ -298,8 +330,11 @@ static inline void __update_dom_time(str void update_dom_time(struct vcpu *v) { unsigned long flags; + int cpu = smp_processor_id(); - if ( v->domain->shared_info->tsc_timestamp != full_tsc_irq ) + if ( v->domain->shared_info->tsc_timestamp != full_tsc_irq + || v->domain->shared_info->vcpu_time[v->processor].tsc_timestamp !+ percpu_time_info[cpu].tsc_timestamp) { read_lock_irqsave(&time_lock, flags); __update_dom_time(v); @@ -312,6 +347,7 @@ void do_settime(unsigned long secs, unsi { s64 delta; long _usecs = (long)usecs; + int i; write_lock_irq(&time_lock); @@ -326,6 +362,10 @@ void do_settime(unsigned long secs, unsi wc_sec = secs; 
wc_usec = _usecs; + for (i=0; i<NR_CPUS; i++) { + percpu_time_info[i].wc_sec = wc_sec; + percpu_time_info[i].wc_usec = wc_usec; + } /* Others will pick up the change at the next tick. */ __update_dom_time(current); @@ -335,16 +375,39 @@ void do_settime(unsigned long secs, unsi } +spinlock_t tsc_lock = SPIN_LOCK_UNLOCKED; + +/* + * Time setup for this processor. + */ +void __init setup_percpu_time(void) +{ + unsigned long flags; + unsigned long ticks_per_frac; + int cpu = smp_processor_id(); + + /* only have 1 cpu calibrate at a time */ + spin_lock_irqsave(&tsc_lock, flags); + ticks_per_frac = calibrate_tsc(); + spin_unlock_irqrestore(&tsc_lock, flags); + + if (!ticks_per_frac) + panic("Error calibrating TSC\n"); + percpu_time_info[cpu].cpu_freq = (u64)ticks_per_frac * (u64)CALIBRATE_FRAC; + rdtscll(percpu_time_info[cpu].tsc_timestamp); + percpu_time_info[cpu].system_time = stime_irq; +} + /* Late init function (after all CPUs are booted). */ int __init init_xen_time() { u64 scale; unsigned int cpu_ghz; + int i; cpu_ghz = (unsigned int)(cpu_freq / 1000000000ULL); for ( rdtsc_bitshift = 0; cpu_ghz != 0; rdtsc_bitshift++, cpu_ghz >>= 1 ) continue; - scale = 1000000000LL << (32 + rdtsc_bitshift); scale /= cpu_freq; st_scale_f = scale & 0xffffffff; @@ -357,6 +420,12 @@ int __init init_xen_time() /* Wallclock time starts as the initial RTC time. */ wc_sec = get_cmos_time(); + for (i=0; i<NR_CPUS; i++) { + percpu_time_info[i].wc_sec = wc_sec; + percpu_time_info[i].wc_usec = 0; + percpu_time_info[i].system_time = stime_irq; + percpu_time_info[i].cpu_freq = cpu_freq; // default speed + } printk("Time init:\n"); printk(".... 
cpu_freq: %08X:%08X\n", (u32)(cpu_freq>>32),(u32)cpu_freq); --- xeno-unstable.bk/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/time.c.orig 2005-06-07 10:21:21.000000000 -0700 +++ xeno-unstable.bk/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/time.c 2005-06-07 13:06:01.000000000 -0700 @@ -105,9 +105,13 @@ struct timer_opts *cur_timer = &timer_ts /* These are peridically updated in shared_info, and then copied here. */ u32 shadow_tsc_stamp; +DEFINE_PER_CPU(u64, shadow_tsc_stamp); u64 shadow_system_time; +DEFINE_PER_CPU(u64, shadow_system_time); static u32 shadow_time_version; +DEFINE_PER_CPU(u32, shadow_time_version); static struct timeval shadow_tv; +static DEFINE_PER_CPU(struct timeval, shadow_tv); /* * We use this to ensure that gettimeofday() is monotonically increasing. We @@ -171,23 +175,29 @@ __setup("independent_wallclock", __indep static void __get_time_values_from_xen(void) { shared_info_t *s = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); do { shadow_time_version = s->time_version2; + per_cpu(shadow_time_version, cpu) = s->vcpu_time[cpu].time_version2; rmb(); shadow_tv.tv_sec = s->wc_sec; shadow_tv.tv_usec = s->wc_usec; shadow_tsc_stamp = (u32)s->tsc_timestamp; shadow_system_time = s->system_time; + per_cpu(shadow_tv.tv_sec, cpu) = s->vcpu_time[cpu].wc_sec; + per_cpu(shadow_tv.tv_usec, cpu) = s->vcpu_time[cpu].wc_usec; + per_cpu(shadow_tsc_stamp, cpu) = s->vcpu_time[cpu].tsc_timestamp; + per_cpu(shadow_system_time, cpu) = s->vcpu_time[cpu].system_time; rmb(); } - while (shadow_time_version != s->time_version1); + while (shadow_time_version != s->time_version1 || per_cpu(shadow_time_version, cpu) != s->vcpu_time[cpu].time_version1); cur_timer->mark_offset(); } #define TIME_VALUES_UP_TO_DATE \ - ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); }) + ({ rmb(); ((per_cpu(shadow_time_version, cpu) == HYPERVISOR_shared_info->vcpu_time[cpu].time_version2) && (shadow_time_version == HYPERVISOR_shared_info->time_version2)); }) /* * 
This version of gettimeofday has microsecond resolution @@ -200,6 +210,7 @@ void do_gettimeofday(struct timeval *tv) unsigned long max_ntp_tick; unsigned long flags; s64 nsec; + int cpu = smp_processor_id(); do { unsigned long lost; @@ -227,7 +238,7 @@ void do_gettimeofday(struct timeval *tv) sec = xtime.tv_sec; usec += (xtime.tv_nsec / NSEC_PER_USEC); - nsec = shadow_system_time - processed_system_time; + nsec = per_cpu(shadow_system_time, cpu) - per_cpu(processed_system_time, cpu); __normalize_time(&sec, &nsec); usec += (long)nsec / NSEC_PER_USEC; @@ -273,6 +284,7 @@ int do_settimeofday(struct timespec *tv) long wtm_nsec; s64 nsec; struct timespec xentime; + int cpu = smp_processor_id(); if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -306,7 +318,7 @@ int do_settimeofday(struct timespec *tv) */ nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - nsec -= (shadow_system_time - processed_system_time); + nsec -= (per_cpu(shadow_system_time, cpu) - per_cpu(processed_system_time, cpu)); __normalize_time(&sec, &nsec); wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); @@ -384,6 +396,7 @@ unsigned long profile_pc(struct pt_regs EXPORT_SYMBOL(profile_pc); #endif +extern unsigned long long get_full_tsc_offset(void); /* * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick @@ -392,27 +405,25 @@ static inline void do_timer_interrupt(in struct pt_regs *regs) { time_t wtm_sec, sec; - s64 delta, delta_cpu, nsec; + s64 delta_cpu, nsec; long sec_diff, wtm_nsec; int cpu = smp_processor_id(); do { __get_time_values_from_xen(); - delta = delta_cpu = (s64)shadow_system_time + - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC); - delta -= processed_system_time; - delta_cpu -= per_cpu(processed_system_time, cpu); + delta_cpu = (s64)per_cpu(shadow_system_time, cpu) + + ((s64)get_full_tsc_offset()) + - per_cpu(processed_system_time, cpu); } while (!TIME_VALUES_UP_TO_DATE); - if (unlikely(delta < 0) 
|| unlikely(delta_cpu < 0)) { + if (unlikely(delta_cpu < 0)) { printk("Timer ISR/%d: Time went backwards: " - "delta=%lld cpu_delta=%lld shadow=%lld " - "off=%lld processed=%lld cpu_processed=%lld\n", - cpu, delta, delta_cpu, shadow_system_time, - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC), - processed_system_time, + "cpu_delta=%lld cpu_shadow=%lld " + "off=%lld cpu_processed=%lld\n", + cpu, delta_cpu, per_cpu(shadow_system_time, cpu), + (s64)get_full_tsc_offset(), per_cpu(processed_system_time, cpu)); for (cpu = 0; cpu < num_online_cpus(); cpu++) printk(" %d: %lld\n", cpu, @@ -420,19 +431,15 @@ static inline void do_timer_interrupt(in return; } - /* System-wide jiffy work. */ - while (delta >= NS_PER_TICK) { - delta -= NS_PER_TICK; - processed_system_time += NS_PER_TICK; - do_timer(regs); - } - /* Local CPU jiffy work. */ while (delta_cpu >= NS_PER_TICK) { delta_cpu -= NS_PER_TICK; per_cpu(processed_system_time, cpu) += NS_PER_TICK; update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING, regs); + /* System-wide jiffy work. */ + if (cpu == 0) + do_timer(regs); } if (cpu != 0) @@ -447,19 +454,19 @@ static inline void do_timer_interrupt(in ((time_status & STA_UNSYNC) != 0) && (xtime.tv_sec > (last_update_from_xen + 60))) { /* Adjust shadow for jiffies that haven''t updated xtime yet. */ - shadow_tv.tv_usec -= + per_cpu(shadow_tv.tv_usec, cpu) -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); - HANDLE_USEC_UNDERFLOW(shadow_tv); + HANDLE_USEC_UNDERFLOW(per_cpu(shadow_tv, cpu)); /* * Reset our running time counts if they are invalidated by * a warp backwards of more than 500ms. 
*/ - sec_diff = xtime.tv_sec - shadow_tv.tv_sec; + sec_diff = xtime.tv_sec - per_cpu(shadow_tv.tv_sec, cpu); if (unlikely(abs(sec_diff) > 1) || unlikely(((sec_diff * USEC_PER_SEC) + (xtime.tv_nsec / NSEC_PER_USEC) - - shadow_tv.tv_usec) > 500000)) { + per_cpu(shadow_tv.tv_usec, cpu)) > 500000)) { #ifdef CONFIG_XEN_PRIVILEGED_GUEST last_rtc_update = last_update_to_xen = 0; #endif @@ -467,8 +474,8 @@ static inline void do_timer_interrupt(in } /* Update our unsynchronised xtime appropriately. */ - sec = shadow_tv.tv_sec; - nsec = shadow_tv.tv_usec * NSEC_PER_USEC; + sec = per_cpu(shadow_tv.tv_sec, cpu); + nsec = per_cpu(shadow_tv.tv_usec, cpu) * NSEC_PER_USEC; __normalize_time(&sec, &nsec); wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); @@ -498,7 +505,7 @@ static inline void do_timer_interrupt(in op.cmd = DOM0_SETTIME; op.u.settime.secs = tv.tv_sec; op.u.settime.usecs = tv.tv_usec; - op.u.settime.system_time = shadow_system_time; + op.u.settime.system_time = per_cpu(shadow_system_time, cpu); HYPERVISOR_dom0_op(&op); last_update_to_xen = xtime.tv_sec; @@ -670,7 +677,7 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); processed_system_time = shadow_system_time; - per_cpu(processed_system_time, 0) = processed_system_time; + per_cpu(processed_system_time, 0) = per_cpu(shadow_system_time, 0); if (timer_tsc_init.init(NULL) != 0) BUG(); @@ -753,7 +760,7 @@ void time_resume(void) /* Reset our own concept of passage of system time. */ processed_system_time = shadow_system_time; - per_cpu(processed_system_time, 0) = processed_system_time; + per_cpu(processed_system_time, 0) = per_cpu(shadow_system_time, 0); /* Accept a warp in UTC (wall-clock) time. 
*/ last_seen_tv.tv_sec = 0; @@ -770,7 +777,7 @@ void local_setup_timer(void) do { seq = read_seqbegin(&xtime_lock); - per_cpu(processed_system_time, cpu) = shadow_system_time; + per_cpu(processed_system_time, cpu) = per_cpu(shadow_system_time, cpu); } while (read_seqretry(&xtime_lock, seq)); per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER); --- xeno-unstable.bk/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c.orig 2005-06-07 10:21:29.000000000 -0700 +++ xeno-unstable.bk/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c 2005-06-07 10:21:29.000000000 -0700 @@ -10,6 +10,7 @@ #include <linux/cpufreq.h> #include <linux/string.h> #include <linux/jiffies.h> +#include <linux/percpu.h> #include <asm/timer.h> #include <asm/io.h> @@ -35,8 +36,8 @@ extern spinlock_t i8253_lock; static int use_tsc; -static unsigned long long monotonic_base; -static u32 monotonic_offset; +static DEFINE_PER_CPU(unsigned long long, monotonic_base); +static DEFINE_PER_CPU(u32, monotonic_offset); static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; /* convert from cycles(64bits) => nanoseconds (64bits) @@ -74,8 +75,20 @@ static inline unsigned long long cycles_ */ static unsigned long fast_gettimeoffset_quotient; -extern u32 shadow_tsc_stamp; -extern u64 shadow_system_time; +extern DEFINE_PER_CPU(u64, shadow_tsc_stamp); +extern DEFINE_PER_CPU(u64, shadow_system_time); + +unsigned long long get_full_tsc_offset(void) +{ + unsigned long long tsc; + + /* Read the Time Stamp Counter */ + rdtscll(tsc); + + tsc -= per_cpu(shadow_tsc_stamp, smp_processor_id()); + + return cycles_2_ns(tsc); +} static unsigned long get_offset_tsc(void) { @@ -86,7 +99,7 @@ static unsigned long get_offset_tsc(void rdtsc(eax,edx); /* .. 
relative to previous jiffy (32 bits is enough) */ - eax -= shadow_tsc_stamp; + eax -= per_cpu(shadow_tsc_stamp, smp_processor_id()); /* * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient @@ -110,12 +123,13 @@ static unsigned long long monotonic_cloc { unsigned long long last_offset, this_offset, base; unsigned seq; + int cpu = smp_processor_id(); /* atomically read monotonic base & last_offset */ do { seq = read_seqbegin(&monotonic_lock); - last_offset = monotonic_offset; - base = monotonic_base; + last_offset = per_cpu(monotonic_offset, cpu); + base = per_cpu(monotonic_base, cpu); } while (read_seqretry(&monotonic_lock, seq)); /* Read the Time Stamp Counter */ @@ -152,11 +166,12 @@ unsigned long long sched_clock(void) static void mark_offset_tsc(void) { + int cpu = smp_processor_id(); /* update the monotonic base value */ write_seqlock(&monotonic_lock); - monotonic_base = shadow_system_time; - monotonic_offset = shadow_tsc_stamp; + per_cpu(monotonic_base, cpu) = per_cpu(shadow_system_time, cpu); + per_cpu(monotonic_offset, cpu) = per_cpu(shadow_tsc_stamp, cpu); write_sequnlock(&monotonic_lock); } -- Don Fry brazilnut@us.ibm.com _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel