From: David Mosberger

Basically, what the patch does is provide two hooks such that platforms (and
subplatforms) can provide time interpolation in a way that guarantees that two
causally related gettimeofday() calls will never see time going backwards
(unless there is a settimeofday() call, of course).

There is some evidence that the current scheme does work: we use it on ia64
for cycle-counter-based interpolation, and the SGI folks use it with a
chipset-based high-performance counter.  It seems like enough platforms do
this sort of thing to justify _some_ support in the core, especially because
it's rather tricky to guarantee that time never goes backwards (short of a
settimeofday(), of course).

This patch is based on something Jes Sorensen wrote for the SGI Itanium 2
platform (which has a chipset-internal high-res clock).  I adapted it so it
can be used for cycle-counter interpolation as well.  The net effect is that
"last_time_offset" can be removed completely from the kernel.

The basic idea behind the patch is simple: every time xtime is advanced by N
nanoseconds, the update hook (time_interpolator_update(N) in this patch) is
called.  Every time the time gets set (i.e., a discontinuity is OK), the
reset hook (time_interpolator_reset()) is called.  A small, illustrative
usage sketch follows after the patch.

DESC
make timer interpolation patch compile
EDESC

"arguments provided to macro `nop'"


 include/linux/timex.h |  102 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/time.c         |    8 +--
 kernel/timer.c        |   94 ++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 192 insertions(+), 12 deletions(-)

diff -puN include/linux/timex.h~time-interpolation-infrastructure include/linux/timex.h
--- 25/include/linux/timex.h~time-interpolation-infrastructure	2003-05-28 18:29:30.000000000 -0700
+++ 25-akpm/include/linux/timex.h	2003-05-28 18:29:30.000000000 -0700
@@ -51,6 +51,9 @@
 #ifndef _LINUX_TIMEX_H
 #define _LINUX_TIMEX_H
 
+#include
+#include
+
 #include
 
 /*
@@ -310,6 +313,105 @@ extern long pps_calcnt;	/* calibration
 extern long pps_errcnt;	/* calibration errors */
 extern long pps_stbcnt;	/* stability limit exceeded */
 
+#ifdef CONFIG_TIME_INTERPOLATION
+
+struct time_interpolator {
+	/* cache-hot stuff first: */
+	unsigned long (*get_offset) (void);
+	void (*update) (long);
+	void (*reset) (void);
+
+	/* cache-cold stuff follows here: */
+	struct time_interpolator *next;
+	unsigned long frequency;	/* frequency in counts/second */
+	long drift;			/* drift in parts-per-million (or -1) */
+};
+
+extern volatile unsigned long last_nsec_offset;
+#ifndef __HAVE_ARCH_CMPXCHG
+extern spinlock_t last_nsec_offset_lock;
+#endif
+extern struct time_interpolator *time_interpolator;
+
+extern void register_time_interpolator(struct time_interpolator *);
+extern void unregister_time_interpolator(struct time_interpolator *);
+
+/* Called with xtime WRITE-lock acquired. */
+static inline void
+time_interpolator_update(long delta_nsec)
+{
+	struct time_interpolator *ti = time_interpolator;
+
+	if (last_nsec_offset > 0) {
+#ifdef __HAVE_ARCH_CMPXCHG
+		unsigned long new, old;
+
+		do {
+			old = last_nsec_offset;
+			if (old > delta_nsec)
+				new = old - delta_nsec;
+			else
+				new = 0;
+		} while (cmpxchg(&last_nsec_offset, old, new) != old);
+#else
+		/*
+		 * This really hurts, because it serializes gettimeofday(), but without an
+		 * atomic single-word compare-and-exchange, there isn't all that much else
+		 * we can do.
+		 */
+		spin_lock(&last_nsec_offset_lock);
+		{
+			last_nsec_offset -= min(last_nsec_offset, delta_nsec);
+		}
+		spin_unlock(&last_nsec_offset_lock);
+#endif
+	}
+
+	if (ti)
+		(*ti->update)(delta_nsec);
+}
+
+/* Called with xtime WRITE-lock acquired. */
+static inline void
+time_interpolator_reset(void)
+{
+	struct time_interpolator *ti = time_interpolator;
+
+	last_nsec_offset = 0;
+	if (ti)
+		(*ti->reset)();
+}
+
+/* Called with xtime READ-lock acquired. */
+static inline unsigned long
+time_interpolator_get_offset(void)
+{
+	struct time_interpolator *ti = time_interpolator;
+	if (ti)
+		return (*ti->get_offset)();
+	return last_nsec_offset;
+}
+
+#else /* !CONFIG_TIME_INTERPOLATION */
+
+static inline void
+time_interpolator_update(long delta_nsec)
+{
+}
+
+static inline void
+time_interpolator_reset(void)
+{
+}
+
+static inline unsigned long
+time_interpolator_get_offset(void)
+{
+	return 0;
+}
+
+#endif /* !CONFIG_TIME_INTERPOLATION */
+
 #endif /* KERNEL */
 
 #endif /* LINUX_TIMEX_H */
diff -puN kernel/time.c~time-interpolation-infrastructure kernel/time.c
--- 25/kernel/time.c~time-interpolation-infrastructure	2003-05-28 18:29:30.000000000 -0700
+++ 25-akpm/kernel/time.c	2003-05-28 18:29:30.000000000 -0700
@@ -35,8 +35,6 @@
  */
 struct timezone sys_tz;
 
-extern unsigned long last_time_offset;
-
 #if !defined(__alpha__) && !defined(__ia64__)
 
 /*
@@ -77,9 +75,10 @@ asmlinkage long sys_stime(int * tptr)
 	if (get_user(value, tptr))
 		return -EFAULT;
 	write_seqlock_irq(&xtime_lock);
+
+	time_interpolator_reset();
 	xtime.tv_sec = value;
 	xtime.tv_nsec = 0;
-	last_time_offset = 0;
 	time_adjust = 0;	/* stop active adjtime() */
 	time_status |= STA_UNSYNC;
 	time_maxerror = NTP_PHASE_LIMIT;
@@ -125,7 +124,7 @@ inline static void warp_clock(void)
 {
 	write_seqlock_irq(&xtime_lock);
 	xtime.tv_sec += sys_tz.tz_minuteswest * 60;
-	last_time_offset = 0;
+	time_interpolator_update(sys_tz.tz_minuteswest * 60 * NSEC_PER_SEC);
 	write_sequnlock_irq(&xtime_lock);
 }
 
@@ -381,7 +380,6 @@ leave:	if ((time_status & (STA_UNSYNC|ST
 	txc->calcnt	   = pps_calcnt;
 	txc->errcnt	   = pps_errcnt;
 	txc->stbcnt	   = pps_stbcnt;
-	last_time_offset = 0;
 	write_sequnlock_irq(&xtime_lock);
 	do_gettimeofday(&txc->time);
 	return(result);
diff -puN kernel/timer.c~time-interpolation-infrastructure kernel/timer.c
--- 25/kernel/timer.c~time-interpolation-infrastructure	2003-05-28 18:29:30.000000000 -0700
+++ 25-akpm/kernel/timer.c	2003-05-28 18:29:30.000000000 -0700
@@ -517,6 +517,7 @@ static void second_overflow(void)
 		if (xtime.tv_sec % 86400 == 0) {
 			xtime.tv_sec--;
 			wall_to_monotonic.tv_sec++;
+			time_interpolator_update(-NSEC_PER_SEC);
 			time_state = TIME_OOP;
 			clock_was_set();
 			printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
@@ -527,6 +528,7 @@ static void second_overflow(void)
 		if ((xtime.tv_sec + 1) % 86400 == 0) {
 			xtime.tv_sec++;
 			wall_to_monotonic.tv_sec--;
+			time_interpolator_update(NSEC_PER_SEC);
 			time_state = TIME_WAIT;
 			clock_was_set();
 			printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
@@ -605,7 +607,7 @@ static void second_overflow(void)
 /* in the NTP reference this is called "hardclock()" */
 static void update_wall_time_one_tick(void)
 {
-	long time_adjust_step;
+	long time_adjust_step, delta_nsec;
 
 	if ( (time_adjust_step = time_adjust) != 0 ) {
 	    /* We are doing an adjtime thing.
@@ -621,11 +623,11 @@ static void update_wall_time_one_tick(vo
 		time_adjust_step = tickadj;
 	    else if (time_adjust < -tickadj)
 		time_adjust_step = -tickadj;
-	
+
 	    /* Reduce by this step the amount of time left	*/
 	    time_adjust -= time_adjust_step;
 	}
-	xtime.tv_nsec += tick_nsec + time_adjust_step * 1000;
+	delta_nsec = tick_nsec + time_adjust_step * 1000;
 	/*
 	 * Advance the phase, once it gets to one microsecond, then
 	 * advance the tick more.
@@ -634,13 +636,15 @@ static void update_wall_time_one_tick(vo
 	if (time_phase <= -FINEUSEC) {
 		long ltemp = -time_phase >> (SHIFT_SCALE - 10);
 		time_phase += ltemp << (SHIFT_SCALE - 10);
-		xtime.tv_nsec -= ltemp;
+		delta_nsec -= ltemp;
 	}
 	else if (time_phase >= FINEUSEC) {
 		long ltemp = time_phase >> (SHIFT_SCALE - 10);
 		time_phase -= ltemp << (SHIFT_SCALE - 10);
-		xtime.tv_nsec += ltemp;
+		delta_nsec += ltemp;
 	}
+	xtime.tv_nsec += delta_nsec;
+	time_interpolator_update(delta_nsec);
 }
 
 /*
@@ -660,6 +664,7 @@ static void update_wall_time(unsigned lo
 	if (xtime.tv_nsec >= 1000000000) {
 	    xtime.tv_nsec -= 1000000000;
 	    xtime.tv_sec++;
+	    time_interpolator_update(NSEC_PER_SEC);
 	    second_overflow();
 	}
 }
@@ -777,7 +782,6 @@ unsigned long wall_jiffies = INITIAL_JIF
 #ifndef ARCH_HAVE_XTIME_LOCK
 seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
 #endif
-unsigned long last_time_offset;
 
 /*
  * This function runs timers and the timer-tq in bottom half context.
@@ -811,7 +815,6 @@ static inline void update_times(void)
 		wall_jiffies += ticks;
 		update_wall_time(ticks);
 	}
-	last_time_offset = 0;
 	calc_load(ticks);
 }
 
@@ -1221,3 +1224,80 @@ void __init init_timers(void)
 	register_cpu_notifier(&timers_nb);
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
 }
+
+#ifdef CONFIG_TIME_INTERPOLATION
+
+volatile unsigned long last_nsec_offset;
+
+struct time_interpolator *time_interpolator;
+
+#ifndef __HAVE_ARCH_CMPXCHG
+spinlock_t last_nsec_offset_lock = SPIN_LOCK_UNLOCKED;
+#endif
+
+static struct {
+	spinlock_t lock;			/* lock protecting list */
+	struct time_interpolator *list;		/* list of registered interpolators */
+} ti_global = {
+	.lock = SPIN_LOCK_UNLOCKED
+};
+
+static inline int
+is_better_time_interpolator(struct time_interpolator *new)
+{
+	if (!time_interpolator)
+		return 1;
+	return new->frequency > 2*time_interpolator->frequency
+		|| (unsigned long) new->drift < (unsigned long) time_interpolator->drift;
+}
+
+void
+register_time_interpolator(struct time_interpolator *ti)
+{
+	spin_lock(&ti_global.lock);
+	{
+		write_seqlock_irq(&xtime_lock);
+		{
+			if (is_better_time_interpolator(ti))
+				time_interpolator = ti;
+		}
+		write_sequnlock_irq(&xtime_lock);
+
+		ti->next = ti_global.list;
+		ti_global.list = ti;
+	}
+	spin_unlock(&ti_global.lock);
+}
+
+void
+unregister_time_interpolator(struct time_interpolator *ti)
+{
+	struct time_interpolator *curr, **prev;
+
+	spin_lock(&ti_global.lock);
+	{
+		prev = &ti_global.list;
+		for (curr = *prev; curr; curr = curr->next) {
+			if (curr == ti) {
+				*prev = curr->next;
+				break;
+			}
+			prev = &curr->next;
+		}
+		write_seqlock_irq(&xtime_lock);
+		{
+			if (ti == time_interpolator) {
+				/* we lost the best time-interpolator: */
+				time_interpolator = NULL;
+				/* find the next-best interpolator */
+				for (curr = ti_global.list; curr; curr = curr->next)
+					if (is_better_time_interpolator(curr))
+						time_interpolator = curr;
+			}
+		}
+		write_sequnlock_irq(&xtime_lock);
+	}
+	spin_unlock(&ti_global.lock);
+}
+
+#endif /* CONFIG_TIME_INTERPOLATION */
_
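
For illustration only (not part of the patch): a minimal sketch of how a
platform might hook a free-running counter into this infrastructure.  The
get_platform_cycles() helper, the PLATFORM_CYCLES_PER_SEC constant and the
my_* names are made up for the example; the arithmetic assumes a 64-bit
"unsigned long" and ignores counter wrap, rounding and calibration.

/*
 * Hypothetical platform code, showing the registration API only.
 */
#include <linux/init.h>
#include <linux/time.h>
#include <linux/timex.h>

#define PLATFORM_CYCLES_PER_SEC	400000000UL	/* assumed 400 MHz counter */

extern unsigned long get_platform_cycles(void);	/* hypothetical counter read */

static unsigned long last_count;	/* counter value corresponding to current xtime */

/* Nanoseconds elapsed since xtime was last advanced (xtime READ-lock held). */
static unsigned long
my_get_offset (void)
{
	unsigned long delta = get_platform_cycles() - last_count;

	return delta * NSEC_PER_SEC / PLATFORM_CYCLES_PER_SEC;
}

/* xtime advanced by delta_nsec: move the interpolation base along with it. */
static void
my_update (long delta_nsec)
{
	last_count += delta_nsec * (long) PLATFORM_CYCLES_PER_SEC / NSEC_PER_SEC;
}

/* The time was set (discontinuity is OK): restart interpolation from "now". */
static void
my_reset (void)
{
	last_count = get_platform_cycles();
}

static struct time_interpolator my_interpolator = {
	.get_offset	= my_get_offset,
	.update		= my_update,
	.reset		= my_reset,
	.frequency	= PLATFORM_CYCLES_PER_SEC,
	.drift		= -1,			/* drift unknown */
};

/* Called from the platform's time_init() path. */
static int __init
my_time_interpolator_init (void)
{
	my_reset();
	register_time_interpolator(&my_interpolator);
	return 0;
}

Because register_time_interpolator() only makes an interpolator current when
is_better_time_interpolator() says it beats the existing one (higher
frequency or lower drift), a platform can register unconditionally and let
the core pick the best available clock.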