From: john stultz This patch, written with the advice of Joel Becker, addresses a problem with the hangcheck-timer. The basic problem is that the hangcheck-timer code (required for Oracle) needs an accurate hard clock which can be used to detect OS stalls (due to udelay() or PCI bus hangs) that would cause system time to skew (it's sort of a sanity check that ensures the system's notion of time is accurate). However, currently they are using get_cycles() to fetch the CPU's TSC register, thus this does not work on systems w/o a synced TSC. As suggested by Andi Kleen (see thread here: http://www.uwsg.iu.edu/hypermail/linux/kernel/0302.0/1234.html ) I've worked with Joel and others to implement the monotonic_clock() interface. Some of the major considerations made when writing this patch were o Needs to be able to return accurate time in the absence of multiple timer interrupts o Needs to be abstracted out from the hardware o Avoids impacting gettimeofday() performance This interface returns an unsigned long long representing the number of nanoseconds that have passed since time_init(). 
25-akpm/arch/i386/kernel/time.c | 11 ++++ 25-akpm/arch/i386/kernel/timers/timer_cyclone.c | 55 ++++++++++++++++++-- 25-akpm/arch/i386/kernel/timers/timer_none.c | 6 ++ 25-akpm/arch/i386/kernel/timers/timer_pit.c | 6 ++ 25-akpm/arch/i386/kernel/timers/timer_tsc.c | 63 +++++++++++++++++++++++- 25-akpm/drivers/char/hangcheck-timer.c | 12 ++-- 25-akpm/include/asm-i386/timer.h | 1 7 files changed, 141 insertions(+), 13 deletions(-) diff -puN arch/i386/kernel/time.c~monotonic-clock-hangcheck arch/i386/kernel/time.c --- 25/arch/i386/kernel/time.c~monotonic-clock-hangcheck Mon Mar 31 16:03:53 2003 +++ 25-akpm/arch/i386/kernel/time.c Mon Mar 31 16:03:53 2003 @@ -138,6 +138,17 @@ void do_settimeofday(struct timeval *tv) clock_was_set(); } +/* monotonic_clock(): returns # of nanoseconds passed since time_init() + * Note: This function is required to return accurate + * time even in the absence of multiple timer ticks. + */ +unsigned long long monotonic_clock(void) +{ + return timer->monotonic_clock(); +} +EXPORT_SYMBOL(monotonic_clock); + + /* * In order to set the CMOS clock precisely, set_rtc_mmss has to be * called 500 ms after the second nowtime has started, because when diff -puN arch/i386/kernel/timers/timer_cyclone.c~monotonic-clock-hangcheck arch/i386/kernel/timers/timer_cyclone.c --- 25/arch/i386/kernel/timers/timer_cyclone.c~monotonic-clock-hangcheck Mon Mar 31 16:03:53 2003 +++ 25-akpm/arch/i386/kernel/timers/timer_cyclone.c Mon Mar 31 16:03:54 2003 @@ -28,27 +28,46 @@ static int delay_at_last_interrupt; #define CYCLONE_MPMC_OFFSET 0x51D0 #define CYCLONE_MPCS_OFFSET 0x51A8 #define CYCLONE_TIMER_FREQ 100000000 - +#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ int use_cyclone = 0; static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ -static u32 last_cyclone_timer; +static u32 last_cyclone_low; +static u32 last_cyclone_high; +static unsigned long long monotonic_base; +static rwlock_t monotonic_lock = RW_LOCK_UNLOCKED; + +/* helper macro to 
atomically read both cyclone counter registers */ +#define read_cyclone_counter(low,high) \ + do{ \ + high = cyclone_timer[1]; low = cyclone_timer[0]; \ + } while (high != cyclone_timer[1]); + static void mark_offset_cyclone(void) { int count; + unsigned long long this_offset, last_offset; + + write_lock(&monotonic_lock); + last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; + spin_lock(&i8253_lock); - /* quickly read the cyclone timer */ - if(cyclone_timer) - last_cyclone_timer = cyclone_timer[0]; + read_cyclone_counter(last_cyclone_low,last_cyclone_high); - /* calculate delay_at_last_interrupt */ + /* read values for delay_at_last_interrupt */ outb_p(0x00, 0x43); /* latch the count ASAP */ count = inb_p(0x40); /* read the latched count */ count |= inb(0x40) << 8; spin_unlock(&i8253_lock); + /* update the monotonic base value */ + this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; + monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; + write_unlock(&monotonic_lock); + + /* calculate delay_at_last_interrupt */ count = ((LATCH-1) - count) * TICK_SIZE; delay_at_last_interrupt = (count + LATCH/2) / LATCH; } @@ -64,7 +83,7 @@ static unsigned long get_offset_cyclone( offset = cyclone_timer[0]; /* .. relative to previous jiffy */ - offset = offset - last_cyclone_timer; + offset = offset - last_cyclone_low; /* convert cyclone ticks to microseconds */ /* XXX slow, can we speed this up? 
*/ @@ -74,6 +93,27 @@ static unsigned long get_offset_cyclone( return delay_at_last_interrupt + offset; } +static unsigned long long monotonic_clock_cyclone(void) +{ + u32 now_low, now_high; + unsigned long long last_offset, this_offset, base; + unsigned long long ret; + + /* atomically read monotonic base & last_offset */ + read_lock_irq(&monotonic_lock); + last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; + base = monotonic_base; + read_unlock_irq(&monotonic_lock); + + /* Read the cyclone counter */ + read_cyclone_counter(now_low,now_high); + this_offset = ((unsigned long long)now_high<<32)|now_low; + + /* convert to nanoseconds */ + ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); + return ret * (1000000000 / CYCLONE_TIMER_FREQ); +} + static int __init init_cyclone(char* override) { u32* reg; @@ -194,5 +234,6 @@ struct timer_opts timer_cyclone = { .init = init_cyclone, .mark_offset = mark_offset_cyclone, .get_offset = get_offset_cyclone, + .monotonic_clock = monotonic_clock_cyclone, .delay = delay_cyclone, }; diff -puN arch/i386/kernel/timers/timer_none.c~monotonic-clock-hangcheck arch/i386/kernel/timers/timer_none.c --- 25/arch/i386/kernel/timers/timer_none.c~monotonic-clock-hangcheck Mon Mar 31 16:03:53 2003 +++ 25-akpm/arch/i386/kernel/timers/timer_none.c Mon Mar 31 16:03:54 2003 @@ -16,6 +16,11 @@ static unsigned long get_offset_none(voi return 0; } +static unsigned long long monotonic_clock_none(void) +{ + return 0; +} + static void delay_none(unsigned long loops) { int d0; @@ -34,5 +39,6 @@ struct timer_opts timer_none = { .init = init_none, .mark_offset = mark_offset_none, .get_offset = get_offset_none, + .monotonic_clock = monotonic_clock_none, .delay = delay_none, }; diff -puN arch/i386/kernel/timers/timer_pit.c~monotonic-clock-hangcheck arch/i386/kernel/timers/timer_pit.c --- 25/arch/i386/kernel/timers/timer_pit.c~monotonic-clock-hangcheck Mon Mar 31 16:03:53 2003 +++ 25-akpm/arch/i386/kernel/timers/timer_pit.c 
Mon Mar 31 16:03:54 2003 @@ -31,6 +31,11 @@ static void mark_offset_pit(void) /* nothing needed */ } +static unsigned long long monotonic_clock_pit(void) +{ + return 0; +} + static void delay_pit(unsigned long loops) { int d0; @@ -145,5 +150,6 @@ struct timer_opts timer_pit = { .init = init_pit, .mark_offset = mark_offset_pit, .get_offset = get_offset_pit, + .monotonic_clock = monotonic_clock_pit, .delay = delay_pit, }; diff -puN arch/i386/kernel/timers/timer_tsc.c~monotonic-clock-hangcheck arch/i386/kernel/timers/timer_tsc.c --- 25/arch/i386/kernel/timers/timer_tsc.c~monotonic-clock-hangcheck Mon Mar 31 16:03:53 2003 +++ 25-akpm/arch/i386/kernel/timers/timer_tsc.c Mon Mar 31 16:03:54 2003 @@ -24,6 +24,38 @@ static int use_tsc; static int delay_at_last_interrupt; static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ +static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ +static unsigned long long monotonic_base; +static rwlock_t monotonic_lock = RW_LOCK_UNLOCKED; + +/* convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_mhz * 10^6)) + * ns = cycles * (10^3 / cpu_mhz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^3 * SC / cpu_mhz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static unsigned long cyc2ns_scale; +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + /* Cached *multiplier* to convert TSC counts to microseconds. 
* (see the equation below). @@ -61,11 +93,32 @@ static unsigned long get_offset_tsc(void return delay_at_last_interrupt + edx; } +static unsigned long long monotonic_clock_tsc(void) +{ + unsigned long long last_offset, this_offset, base; + + /* atomically read monotonic base & last_offset */ + read_lock_irq(&monotonic_lock); + last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + base = monotonic_base; + read_unlock_irq(&monotonic_lock); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return base + cycles_2_ns(this_offset - last_offset); +} + static void mark_offset_tsc(void) { int count; int countmp; static int count1=0, count2=LATCH; + unsigned long long this_offset, last_offset; + + write_lock(&monotonic_lock); + last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; /* * It is important that these two operations happen almost at * the same time. We do the RDTSC stuff first, since it's @@ -80,7 +133,7 @@ static void mark_offset_tsc(void) /* read Pentium cycle counter */ - rdtscl(last_tsc_low); + rdtsc(last_tsc_low, last_tsc_high); spin_lock(&i8253_lock); outb_p(0x00, 0x43); /* latch the count ASAP */ @@ -103,6 +156,12 @@ static void mark_offset_tsc(void) } } + /* update the monotonic base value */ + this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + monotonic_base += cycles_2_ns(this_offset - last_offset); + write_unlock(&monotonic_lock); + + /* calculate delay_at_last_interrupt */ count = ((LATCH-1) - count) * TICK_SIZE; delay_at_last_interrupt = (count + LATCH/2) / LATCH; } @@ -301,6 +360,7 @@ static int __init init_tsc(char* overrid "0" (eax), "1" (edx)); printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); } + set_cyc2ns_scale(cpu_khz/1000); return 0; } } @@ -334,5 +394,6 @@ struct timer_opts timer_tsc = { .init = init_tsc, .mark_offset = mark_offset_tsc, .get_offset = get_offset_tsc, + .monotonic_clock = monotonic_clock_tsc, .delay = 
delay_tsc, }; diff -puN drivers/char/hangcheck-timer.c~monotonic-clock-hangcheck drivers/char/hangcheck-timer.c --- 25/drivers/char/hangcheck-timer.c~monotonic-clock-hangcheck Mon Mar 31 16:03:53 2003 +++ 25-akpm/drivers/char/hangcheck-timer.c Mon Mar 31 16:03:54 2003 @@ -78,11 +78,13 @@ static void hangcheck_fire(unsigned long static struct timer_list hangcheck_ticktock = TIMER_INITIALIZER(hangcheck_fire, 0, 0); +extern unsigned long long monotonic_clock(void); + static void hangcheck_fire(unsigned long data) { unsigned long long cur_tsc, tsc_diff; - cur_tsc = get_cycles(); + cur_tsc = monotonic_clock(); if (cur_tsc > hangcheck_tsc) tsc_diff = cur_tsc - hangcheck_tsc; @@ -98,7 +100,7 @@ static void hangcheck_fire(unsigned long } } mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); - hangcheck_tsc = get_cycles(); + hangcheck_tsc = monotonic_clock(); } @@ -108,10 +110,10 @@ static int __init hangcheck_init(void) VERSION_STR, hangcheck_tick, hangcheck_margin); hangcheck_tsc_margin = hangcheck_margin + hangcheck_tick; - hangcheck_tsc_margin *= HZ; - hangcheck_tsc_margin *= current_cpu_data.loops_per_jiffy; + hangcheck_tsc_margin *= 1000000000; + - hangcheck_tsc = get_cycles(); + hangcheck_tsc = monotonic_clock(); mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); return 0; diff -puN include/asm-i386/timer.h~monotonic-clock-hangcheck include/asm-i386/timer.h --- 25/include/asm-i386/timer.h~monotonic-clock-hangcheck Mon Mar 31 16:03:53 2003 +++ 25-akpm/include/asm-i386/timer.h Mon Mar 31 16:03:54 2003 @@ -14,6 +14,7 @@ struct timer_opts{ int (*init)(char *override); void (*mark_offset)(void); unsigned long (*get_offset)(void); + unsigned long long (*monotonic_clock)(void); void (*delay)(unsigned long); }; _