patches/0000775000077200007720000000000010646635217011533 5ustar mingomingopatches/i386-hpet-check-if-the-counter-works.patch0000664000077200007720000000270710646635210021247 0ustar mingomingoFrom: Thomas Gleixner Some systems have a HPET which is not incrementing, which leads to a complete hang. Detect it during HPET setup. Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: john stultz Cc: Signed-off-by: Andrew Morton --- arch/i386/kernel/hpet.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -226,7 +226,8 @@ int __init hpet_enable(void) { unsigned long id; uint64_t hpet_freq; - u64 tmp; + u64 tmp, start, now; + cycle_t t1; if (!is_hpet_capable()) return 0; @@ -273,6 +274,27 @@ int __init hpet_enable(void) /* Start the counter */ hpet_start_counter(); + /* Verify whether hpet counter works */ + t1 = read_hpet(); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + if (t1 == read_hpet()) { + printk(KERN_WARNING + "HPET counter not counting. 
HPET disabled\n"); + goto out_nohpet; + } + /* Initialize and register HPET clocksource * * hpet period is in femto seconds per cycle patches/spinlock-init-cleanup.patch0000664000077200007720000000117010646635211016755 0ustar mingomingo--- drivers/pci/pcie/aer/aerdrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/drivers/pci/pcie/aer/aerdrv.c =================================================================== --- linux-rt.q.orig/drivers/pci/pcie/aer/aerdrv.c +++ linux-rt.q/drivers/pci/pcie/aer/aerdrv.c @@ -157,7 +157,7 @@ static struct aer_rpc* aer_alloc_rpc(str * Initialize Root lock access, e_lock, to Root Error Status Reg, * Root Error ID Reg, and Root error producer/consumer index. */ - rpc->e_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&rpc->e_lock); rpc->rpd = dev; INIT_WORK(&rpc->dpc_handler, aer_isr); patches/latency-tracer-one-off-fix.patch0000664000077200007720000000173610646635212017606 0ustar mingomingoFix a simple issue in latency_tracer.c Fix a simple issue in latency_tracer.c Signed-off-by: Jan Altenberg --- kernel/latency_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -1005,7 +1005,7 @@ static int min_idx(struct block_idx *bid idx = bidx->idx[cpu]; if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE)) continue; - if (idx >= MAX_TRACE*NR_CPUS) { + if (idx > MAX_TRACE*NR_CPUS) { printk("huh: idx (%d) > %ld*%d!\n", idx, MAX_TRACE, NR_CPUS); WARN_ON(1); @@ -1152,7 +1152,7 @@ static void update_out_trace(void) *out_entry = *entry; out_entry++; sum++; - if (sum >= MAX_TRACE*NR_CPUS) { + if (sum > MAX_TRACE*NR_CPUS) { printk("huh: sum (%d) > %ld*%d!\n", sum, MAX_TRACE, NR_CPUS); WARN_ON(1); 
patches/i386-move-pit-function-declarations-and-constants-to-correct-header-file.patch0000664000077200007720000000551010646635210030262 0ustar mingomingoFrom: Thomas Gleixner setup_pit_timer is declared in asm-i386/timer.h. Move it to the pit header file, so it can be used by x86_64 as well. Move also the PIT constants. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/i386/kernel/i8253.c | 2 -- arch/i386/kernel/vmiclock.c | 1 + include/asm-i386/i8253.h | 7 +++++++ include/asm-i386/mach-default/io_ports.h | 5 ----- include/asm-i386/timer.h | 1 - 5 files changed, 8 insertions(+), 8 deletions(-) Index: linux-rt.q/arch/i386/kernel/i8253.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/i8253.c +++ linux-rt.q/arch/i386/kernel/i8253.c @@ -14,8 +14,6 @@ #include #include -#include "io_ports.h" - DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); Index: linux-rt.q/arch/i386/kernel/vmiclock.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/vmiclock.c +++ linux-rt.q/arch/i386/kernel/vmiclock.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "io_ports.h" Index: linux-rt.q/include/asm-i386/i8253.h =================================================================== --- linux-rt.q.orig/include/asm-i386/i8253.h +++ linux-rt.q/include/asm-i386/i8253.h @@ -3,6 +3,11 @@ #include +/* i8253A PIT registers */ +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 +#define PIT_CH2 0x42 + extern spinlock_t i8253_lock; extern struct clock_event_device *global_clock_event; @@ -18,4 +23,6 @@ static inline void pit_interrupt_hook(vo global_clock_event->event_handler(global_clock_event); } +extern void setup_pit_timer(void); + #endif /* __ASM_I8253_H__ */ Index: linux-rt.q/include/asm-i386/mach-default/io_ports.h =================================================================== --- 
linux-rt.q.orig/include/asm-i386/mach-default/io_ports.h +++ linux-rt.q/include/asm-i386/mach-default/io_ports.h @@ -7,11 +7,6 @@ #ifndef _MACH_IO_PORTS_H #define _MACH_IO_PORTS_H -/* i8253A PIT registers */ -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 -#define PIT_CH2 0x42 - /* i8259A PIC registers */ #define PIC_MASTER_CMD 0x20 #define PIC_MASTER_IMR 0x21 Index: linux-rt.q/include/asm-i386/timer.h =================================================================== --- linux-rt.q.orig/include/asm-i386/timer.h +++ linux-rt.q/include/asm-i386/timer.h @@ -5,7 +5,6 @@ #define TICK_SIZE (tick_nsec / 1000) -void setup_pit_timer(void); unsigned long long native_sched_clock(void); unsigned long native_calculate_cpu_khz(void); patches/print-might-sleep-hack.patch0000664000077200007720000000500410646635217017027 0ustar mingomingoTemporary HACK!!!! PREEMPT_RT suffers from the on going problem of running printk in atomic operations. It is very advantageous to do so but with PREEMPT_RT making spin_locks sleep, it can also be devastating. This patch does not solve the problem of printk sleeping in an atomic operation. This patch just makes printk not report that it is. Of course if printk does report that it's sleeping in an atomic operation, then that printing of the report will also print a report, and you go into recursive hell. We need to really sit down and solve the real issue here. --- include/linux/sched.h | 13 +++++++++++++ kernel/printk.c | 4 ++++ kernel/rtmutex.c | 4 +++- 3 files changed, 20 insertions(+), 1 deletion(-) Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -1341,8 +1341,21 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; #endif +#ifdef CONFIG_PREEMPT_RT + /* + * Temporary hack, until we find a solution to + * handle printk in atomic operations. 
+ */ + int in_printk; +#endif }; +#ifdef CONFIG_PREEMPT_RT +# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0) +#else +# define set_printk_might_sleep(x) do { } while(0) +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH Index: linux-rt.q/kernel/printk.c =================================================================== --- linux-rt.q.orig/kernel/printk.c +++ linux-rt.q/kernel/printk.c @@ -339,10 +339,14 @@ static void __call_console_drivers(unsig int trace_save = trace_enabled; trace_enabled = 0; + set_printk_might_sleep(1); con->write(con, &LOG_BUF(start), end - start); + set_printk_might_sleep(0); trace_enabled = trace_save; #else + set_printk_might_sleep(1); con->write(con, &LOG_BUF(start), end - start); + set_printk_might_sleep(0); #endif } } Index: linux-rt.q/kernel/rtmutex.c =================================================================== --- linux-rt.q.orig/kernel/rtmutex.c +++ linux-rt.q/kernel/rtmutex.c @@ -631,7 +631,9 @@ static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, void fastcall (*slowfn)(struct rt_mutex *lock)) { - might_sleep(); + /* Temporary HACK! 
*/ + if (!current->in_printk) + might_sleep(); if (likely(rt_mutex_cmpxchg(lock, NULL, current))) rt_mutex_deadlock_account_lock(lock, current); patches/preempt-realtime-netconsole.patch0000664000077200007720000000136710646635215020204 0ustar mingomingo--- drivers/net/netconsole.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) Index: linux-rt.q/drivers/net/netconsole.c =================================================================== --- linux-rt.q.orig/drivers/net/netconsole.c +++ linux-rt.q/drivers/net/netconsole.c @@ -68,21 +68,16 @@ static int configured = 0; static void write_msg(struct console *con, const char *msg, unsigned int len) { int frag, left; - unsigned long flags; if (!np.dev) return; - local_irq_save(flags); - - for(left = len; left; ) { + for (left = len; left; ) { frag = min(left, MAX_PRINT_CHUNK); netpoll_send_udp(&np, msg, frag); msg += frag; left -= frag; } - - local_irq_restore(flags); } static struct console netconsole = { patches/x86_64-convert-to-clockevents.patch0000664000077200007720000003071610646635211020127 0ustar mingomingoSubject: x86_64: convert to clock events Finally switch to the clockevents code. Share code with i386 for hpet and PIT. 
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/Kconfig | 10 +++ arch/x86_64/kernel/Makefile | 4 + arch/x86_64/kernel/apic.c | 90 +++++++++++++++++++---------------- arch/x86_64/kernel/i8259.c | 46 ------------------ arch/x86_64/kernel/smpboot.c | 4 - arch/x86_64/kernel/time.c | 109 +++++-------------------------------------- include/asm-x86_64/hpet.h | 16 ------ 7 files changed, 76 insertions(+), 203 deletions(-) Index: linux-rt.q/arch/x86_64/Kconfig =================================================================== --- linux-rt.q.orig/arch/x86_64/Kconfig +++ linux-rt.q/arch/x86_64/Kconfig @@ -28,7 +28,15 @@ config GENERIC_TIME bool default y -config GENERIC_CLOCKEVENTS_MIGR +config GENERIC_CLOCKEVENTS + bool + default y + +config GENERIC_CLOCKEVENTS_BROADCAST + bool + default y + +config NONIRQ_WAKEUP bool default y Index: linux-rt.q/arch/x86_64/kernel/Makefile =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/Makefile +++ linux-rt.q/arch/x86_64/kernel/Makefile @@ -9,7 +9,7 @@ obj-y := process.o signal.o entry.o trap x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \ - perfctr-watchdog.o + perfctr-watchdog.o i8253.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o @@ -48,6 +48,8 @@ obj-y += pcspeaker.o CFLAGS_vsyscall.o := $(PROFILING) -g0 +i8253-y += ../../i386/kernel/i8253.o +hpet-y += ../../i386/kernel/hpet.o therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o bootflag-y += ../../i386/kernel/bootflag.o legacy_serial-y += ../../i386/kernel/legacy_serial.o Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -858,25 +858,12 @@ static void 
__setup_APIC_LVTT(unsigned i static void setup_APIC_timer(void) { - unsigned long flags; - int irqen; + struct clock_event_device *levt = &__get_cpu_var(lapic_events); - local_irq_save(flags); + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); - irqen = ! cpu_isset(smp_processor_id(), - timer_interrupt_broadcast_ipi_mask); - __setup_APIC_LVTT(calibration_result, 0, irqen); - /* Turn off PIT interrupt if we use APIC timer as main timer. - Only works with the PM timer right now - TBD fix it for HPET too. */ - if ((pmtmr_ioport != 0) && - smp_processor_id() == boot_cpu_id && - apic_runs_main_timer == 1 && - !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { - stop_timer_interrupt(); - apic_runs_main_timer++; - } - local_irq_restore(flags); + clockevents_register_device(levt); } /* @@ -951,18 +938,34 @@ static void __init calibrate_APIC_clock( void __init setup_boot_APIC_clock (void) { + /* + * The local apic timer can be disabled via the kernel commandline. + * Register the lapic timer as a dummy clock event source on SMP + * systems, so the broadcast mechanism is used. On UP systems simply + * ignore it. + */ if (disable_apic_timer) { printk(KERN_INFO "Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); return; } printk(KERN_INFO "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; - calibrate_APIC_clock(); + /* - * Now set up the timer for real. + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. 
*/ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); + setup_APIC_timer(); } @@ -1074,22 +1077,34 @@ void setup_APIC_extended_lvt(unsigned ch void smp_local_timer_interrupt(void) { - profile_tick(CPU_PROFILING); -#ifdef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif - if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) - main_timer_handler(); + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + /* - * We take the 'long' return path, and there every subsystem - * grabs the appropriate locks (kernel lock/ irq lock). + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. * - * We might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + /* + * the NMI deadlock-detector uses this. */ + add_pda(apic_timer_irqs, 1); + + evt->event_handler(evt); } /* @@ -1105,11 +1120,6 @@ void smp_apic_timer_interrupt(struct pt_ struct pt_regs *old_regs = set_irq_regs(regs); /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - /* * NOTE! 
We'd better ACK the irq immediately, * because timer handling can be slow. */ @@ -1292,7 +1302,7 @@ static __init int setup_noapictimer(char static __init int setup_apicmaintimer(char *str) { apic_runs_main_timer = 1; - nohpet = 1; + return 1; } __setup("apicmaintimer", setup_apicmaintimer); @@ -1308,7 +1318,7 @@ static __init int setup_apicpmtimer(char { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return setup_apicmaintimer(NULL); + return 0; } __setup("apicpmtimer", setup_apicpmtimer); Index: linux-rt.q/arch/x86_64/kernel/i8259.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/i8259.c +++ linux-rt.q/arch/x86_64/kernel/i8259.c @@ -461,46 +461,6 @@ void invalidate_interrupt7(void); void thermal_interrupt(void); void threshold_interrupt(void); -static void setup_timer_hardware(void) -{ - outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , 0x40); /* LSB */ - udelay(10); - outb(LATCH >> 8 , 0x40); /* MSB */ -} - -static int timer_resume(struct sys_device *dev) -{ - setup_timer_hardware(); - return 0; -} - -void i8254_timer_resume(void) -{ - setup_timer_hardware(); -} - -static struct sysdev_class timer_sysclass = { - set_kset_name("timer_pit"), - .resume = timer_resume, -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int __init init_timer_sysfs(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(init_timer_sysfs); - void __init init_IRQ(void) { int i; @@ -550,12 +510,6 @@ void __init init_IRQ(void) set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_timer_hardware(); - if (!acpi_ioapic) setup_irq(2, &irq2); } Index: linux-rt.q/arch/x86_64/kernel/smpboot.c 
=================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/smpboot.c +++ linux-rt.q/arch/x86_64/kernel/smpboot.c @@ -223,8 +223,6 @@ void __cpuinit smp_callin(void) local_irq_disable(); Dprintk("Stack at about %p\n",&cpuid); - disable_APIC_timer(); - /* * Save our processor parameters */ @@ -348,8 +346,6 @@ void __cpuinit start_secondary(void) enable_8259A_irq(0); } - enable_APIC_timer(); - /* * The sibling maps must be set before turing the online map on for * this cpu Index: linux-rt.q/arch/x86_64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/time.c +++ linux-rt.q/arch/x86_64/kernel/time.c @@ -28,6 +28,8 @@ #include #include #include +#include + #ifdef CONFIG_ACPI #include /* for PM timer frequency */ #include @@ -45,12 +47,8 @@ #include #include -static char *timename = NULL; - DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -193,6 +191,13 @@ static irqreturn_t timer_interrupt(int i return IRQ_HANDLED; } +static irqreturn_t timer_event_interrupt(int irq, void *dev_id) +{ + global_clock_event->event_handler(global_clock_event); + + return IRQ_HANDLED; +} + unsigned long read_persistent_clock(void) { unsigned int year, mon, day, hour, min, sec; @@ -290,63 +295,19 @@ static unsigned int __init tsc_calibrate return pmc_now * tsc_khz / (tsc_now - tsc_start); } -static void __pit_init(int val, u8 mode) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(mode, PIT_MODE); - outb_p(val & 0xff, PIT_CH0); /* LSB */ - outb_p(val >> 8, PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} - -void __init pit_init(void) -{ - __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ -} - -void pit_stop_interrupt(void) -{ - __pit_init(0, 0x30); /* mode 0 */ -} - -void 
stop_timer_interrupt(void) -{ - char *name; - if (hpet_address) { - name = "HPET"; - hpet_timer_stop_set_go(0); - } else { - name = "PIT"; - pit_stop_interrupt(); - } - printk(KERN_INFO "timer: %s interrupt stopped.\n", name); -} - static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL, + .handler = timer_event_interrupt, + .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, .mask = CPU_MASK_NONE, .name = "timer" }; void __init time_init(void) { - if (nohpet) - hpet_address = 0; - - if (hpet_arch_init()) - hpet_address = 0; + if (!hpet_enable()) + setup_pit_timer(); - if (hpet_use_timer) { - /* set tick_nsec to use the proper rate for HPET */ - tick_nsec = TICK_NSEC_HPET; - timename = "HPET"; - } else { - pit_init(); - timename = "PIT"; - } + setup_irq(0, &irq0); tsc_calibrate(); @@ -368,46 +329,4 @@ void __init time_init(void) printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); init_tsc_clocksource(); - - setup_irq(0, &irq0); -} - -/* - * sysfs support for the timer. 
- */ - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; } - -static int timer_resume(struct sys_device *dev) -{ - if (hpet_address) - hpet_reenable(); - else - i8254_timer_resume(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - -/* XXX this sysfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); Index: linux-rt.q/include/asm-x86_64/hpet.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/hpet.h +++ linux-rt.q/include/asm-x86_64/hpet.h @@ -1,18 +1,2 @@ -#ifndef _ASM_X8664_HPET_H -#define _ASM_X8664_HPET_H 1 #include - -#define HPET_TICK_RATE (HZ * 100000UL) - -extern int hpet_rtc_timer_init(void); -extern int hpet_arch_init(void); -extern int hpet_timer_stop_set_go(unsigned long tick); -extern int hpet_reenable(void); -extern unsigned int hpet_calibrate_tsc(void); - -extern int hpet_use_timer; -extern unsigned long hpet_period; -extern unsigned long hpet_tick; - -#endif patches/ppc-gtod-notrace-fix.patch0000664000077200007720000000076710646635213016514 0ustar mingomingo--- arch/powerpc/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -898,7 +898,7 @@ void div128_by_32(u64 dividend_high, u64 #include -static cycle_t timebase_read(void) +static cycle_t notrace timebase_read(void) { return (cycle_t)get_tb(); } 
patches/x86_64-share-hpet-h.patch0000664000077200007720000001742310646635210015775 0ustar mingomingoSubject: x86_64: share hpet.h with i386 hpet.h in asm-i386 and asm-x86_64 contain tons of duplicated stuff. Consolidate into one shared header file. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- include/asm-i386/hpet.h | 124 +++++++++++++++++----------------------------- include/asm-x86_64/hpet.h | 61 ---------------------- 2 files changed, 48 insertions(+), 137 deletions(-) Index: linux-rt.q/include/asm-i386/hpet.h =================================================================== --- linux-rt.q.orig/include/asm-i386/hpet.h +++ linux-rt.q/include/asm-i386/hpet.h @@ -4,112 +4,82 @@ #ifdef CONFIG_HPET_TIMER -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - /* * Documentation on HPET can be found at: * http://www.intel.com/ial/home/sp/pcmmspec.htm * ftp://download.intel.com/ial/home/sp/mmts098.pdf */ -#define HPET_MMAP_SIZE 1024 +#define HPET_MMAP_SIZE 1024 -#define HPET_ID 0x000 -#define HPET_PERIOD 0x004 -#define HPET_CFG 0x010 -#define HPET_STATUS 0x020 -#define HPET_COUNTER 0x0f0 -#define HPET_T0_CFG 0x100 -#define HPET_T0_CMP 0x108 -#define HPET_T0_ROUTE 0x110 -#define HPET_T1_CFG 0x120 -#define HPET_T1_CMP 0x128 -#define HPET_T1_ROUTE 0x130 -#define HPET_T2_CFG 0x140 -#define HPET_T2_CMP 0x148 -#define HPET_T2_ROUTE 0x150 - -#define HPET_ID_LEGSUP 0x00008000 -#define HPET_ID_NUMBER 0x00001f00 -#define HPET_ID_REV 0x000000ff +#define HPET_ID 0x000 +#define HPET_PERIOD 0x004 +#define HPET_CFG 0x010 +#define HPET_STATUS 0x020 +#define HPET_COUNTER 0x0f0 +#define HPET_T0_CFG 0x100 +#define HPET_T0_CMP 0x108 +#define HPET_T0_ROUTE 0x110 +#define HPET_T1_CFG 0x120 +#define HPET_T1_CMP 0x128 +#define HPET_T1_ROUTE 0x130 +#define HPET_T2_CFG 0x140 +#define 
HPET_T2_CMP 0x148 +#define HPET_T2_ROUTE 0x150 + +#define HPET_ID_REV 0x000000ff +#define HPET_ID_NUMBER 0x00001f00 +#define HPET_ID_64BIT 0x00002000 +#define HPET_ID_LEGSUP 0x00008000 +#define HPET_ID_VENDOR 0xffff0000 #define HPET_ID_NUMBER_SHIFT 8 +#define HPET_ID_VENDOR_SHIFT 16 -#define HPET_CFG_ENABLE 0x001 -#define HPET_CFG_LEGACY 0x002 +#define HPET_ID_VENDOR_8086 0x8086 + +#define HPET_CFG_ENABLE 0x001 +#define HPET_CFG_LEGACY 0x002 #define HPET_LEGACY_8254 2 #define HPET_LEGACY_RTC 8 -#define HPET_TN_ENABLE 0x004 -#define HPET_TN_PERIODIC 0x008 -#define HPET_TN_PERIODIC_CAP 0x010 -#define HPET_TN_SETVAL 0x040 -#define HPET_TN_32BIT 0x100 - -/* Use our own asm for 64 bit multiply/divide */ -#define ASM_MUL64_REG(eax_out,edx_out,reg_in,eax_in) \ - __asm__ __volatile__("mull %2" \ - :"=a" (eax_out), "=d" (edx_out) \ - :"r" (reg_in), "0" (eax_in)) - -#define ASM_DIV64_REG(eax_out,edx_out,reg_in,eax_in,edx_in) \ - __asm__ __volatile__("divl %2" \ - :"=a" (eax_out), "=d" (edx_out) \ - :"r" (reg_in), "0" (eax_in), "1" (edx_in)) +#define HPET_TN_LEVEL 0x0002 +#define HPET_TN_ENABLE 0x0004 +#define HPET_TN_PERIODIC 0x0008 +#define HPET_TN_PERIODIC_CAP 0x0010 +#define HPET_TN_64BIT_CAP 0x0020 +#define HPET_TN_SETVAL 0x0040 +#define HPET_TN_32BIT 0x0100 +#define HPET_TN_ROUTE 0x3e00 +#define HPET_TN_FSB 0x4000 +#define HPET_TN_FSB_CAP 0x8000 +#define HPET_TN_ROUTE_SHIFT 9 -#define KERNEL_TICK_USEC (1000000UL/HZ) /* tick value in microsec */ /* Max HPET Period is 10^8 femto sec as in HPET spec */ -#define HPET_MAX_PERIOD (100000000UL) +#define HPET_MAX_PERIOD 100000000UL /* * Min HPET period is 10^5 femto sec just for safety. If it is less than this, * then 32 bit HPET counter wrapsaround in less than 0.5 sec. 
*/ -#define HPET_MIN_PERIOD (100000UL) -#define HPET_TICK_RATE (HZ * 100000UL) +#define HPET_MIN_PERIOD 100000UL -extern unsigned long hpet_address; /* hpet memory map physical address */ +/* hpet memory map physical address */ +extern unsigned long hpet_address; extern int is_hpet_enabled(void); - -#ifdef CONFIG_X86_64 -extern unsigned long hpet_tick; /* hpet clks count per tick */ -extern int hpet_use_timer; -extern int hpet_rtc_timer_init(void); extern int hpet_enable(void); -extern int is_hpet_capable(void); -extern int hpet_readl(unsigned long a); -#else -extern int hpet_enable(void); -#endif #ifdef CONFIG_HPET_EMULATE_RTC + +#include + extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); extern int hpet_set_rtc_irq_bit(unsigned long bit_mask); -extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec); +extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, + unsigned char sec); extern int hpet_set_periodic_freq(unsigned long freq); extern int hpet_rtc_dropped_irq(void); extern int hpet_rtc_timer_init(void); extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id); + #endif /* CONFIG_HPET_EMULATE_RTC */ #else Index: linux-rt.q/include/asm-x86_64/hpet.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/hpet.h +++ linux-rt.q/include/asm-x86_64/hpet.h @@ -1,59 +1,10 @@ #ifndef _ASM_X8664_HPET_H #define _ASM_X8664_HPET_H 1 -/* - * Documentation on HPET can be found at: - * http://www.intel.com/ial/home/sp/pcmmspec.htm - * ftp://download.intel.com/ial/home/sp/mmts098.pdf - */ - -#define HPET_MMAP_SIZE 1024 - -#define HPET_ID 0x000 -#define HPET_PERIOD 0x004 -#define HPET_CFG 0x010 -#define HPET_STATUS 0x020 -#define HPET_COUNTER 0x0f0 -#define HPET_Tn_OFFSET 0x20 -#define HPET_Tn_CFG(n) (0x100 + (n) * HPET_Tn_OFFSET) -#define HPET_Tn_ROUTE(n) (0x104 + (n) * HPET_Tn_OFFSET) -#define HPET_Tn_CMP(n) (0x108 + (n) * HPET_Tn_OFFSET) -#define HPET_T0_CFG 
HPET_Tn_CFG(0) -#define HPET_T0_CMP HPET_Tn_CMP(0) -#define HPET_T1_CFG HPET_Tn_CFG(1) -#define HPET_T1_CMP HPET_Tn_CMP(1) - -#define HPET_ID_VENDOR 0xffff0000 -#define HPET_ID_LEGSUP 0x00008000 -#define HPET_ID_64BIT 0x00002000 -#define HPET_ID_NUMBER 0x00001f00 -#define HPET_ID_REV 0x000000ff -#define HPET_ID_NUMBER_SHIFT 8 - -#define HPET_ID_VENDOR_SHIFT 16 -#define HPET_ID_VENDOR_8086 0x8086 - -#define HPET_CFG_ENABLE 0x001 -#define HPET_CFG_LEGACY 0x002 -#define HPET_LEGACY_8254 2 -#define HPET_LEGACY_RTC 8 - -#define HPET_TN_LEVEL 0x0002 -#define HPET_TN_ENABLE 0x0004 -#define HPET_TN_PERIODIC 0x0008 -#define HPET_TN_PERIODIC_CAP 0x0010 -#define HPET_TN_64BIT_CAP 0x0020 -#define HPET_TN_SETVAL 0x0040 -#define HPET_TN_32BIT 0x0100 -#define HPET_TN_ROUTE 0x3e00 -#define HPET_TN_FSB 0x4000 -#define HPET_TN_FSB_CAP 0x8000 - -#define HPET_TN_ROUTE_SHIFT 9 +#include #define HPET_TICK_RATE (HZ * 100000UL) -extern int is_hpet_enabled(void); extern int hpet_rtc_timer_init(void); extern int hpet_arch_init(void); extern int hpet_timer_stop_set_go(unsigned long tick); @@ -61,17 +12,7 @@ extern int hpet_reenable(void); extern unsigned int hpet_calibrate_tsc(void); extern int hpet_use_timer; -extern unsigned long hpet_address; extern unsigned long hpet_period; extern unsigned long hpet_tick; -#ifdef CONFIG_HPET_EMULATE_RTC -extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); -extern int hpet_set_rtc_irq_bit(unsigned long bit_mask); -extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec); -extern int hpet_set_periodic_freq(unsigned long freq); -extern int hpet_rtc_dropped_irq(void); -extern int hpet_rtc_timer_init(void); -#endif /* CONFIG_HPET_EMULATE_RTC */ - #endif patches/i386-nmi-watchdog-show-regs.patch0000664000077200007720000000077110646635216017544 0ustar mingomingo--- arch/i386/kernel/nmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/nmi.c 
=================================================================== --- linux-rt.q.orig/arch/i386/kernel/nmi.c +++ linux-rt.q/arch/i386/kernel/nmi.c @@ -390,7 +390,7 @@ notrace __kprobes int nmi_watchdog_tick( spin_lock(&lock); printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); + show_regs(regs); spin_unlock(&lock); cpu_clear(cpu, backtrace_mask); } patches/hpet-force-enable-on-ich34.patch0000664000077200007720000000556010646635211017356 0ustar mingomingoFrom us15@os.inf.tu-dresden.de Wed Jun 6 14:34:18 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.1 required=5.0 tests=AWL,MAILTO_TO_SPAM_ADDR autolearn=no version=3.1.7-deb Received: from os.inf.tu-dresden.de (os.inf.tu-dresden.de [141.76.48.99]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (No client certificate requested) by mail.tglx.de (Postfix) with ESMTP id CB67965C065 for ; Wed, 6 Jun 2007 14:34:18 +0200 (CEST) Received: from nova.inf.tu-dresden.de ([141.76.48.73] helo=laptop.hypervisor.org) by os.inf.tu-dresden.de with esmtpsa (TLSv1:AES256-SHA:256) (Exim 4.67) id 1HvuiQ-0000WF-8q; Wed, 06 Jun 2007 14:34:18 +0200 Date: Wed, 6 Jun 2007 14:34:14 +0200 From: "Udo A. Steinberg" To: Thomas Gleixner , Venkatesh Pallipadi Subject: [PATCH]: Enable HPET on ICH3 and ICH4 Message-ID: <20070606143414.6003edd0@laptop.hypervisor.org> X-Mailer: X-Mailer 5.0 Gold Mime-Version: 1.0 Content-Type: multipart/signed; boundary=Sig_TyoZ8hpf907DzN6.B9sCrGr; protocol="application/pgp-signature"; micalg=PGP-SHA1 X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ ICH3 and ICH4 have undocumented HPET capabilities. This patch enables HPET for platforms based around these ICHs. Tested on various ICH3 and ICH4 platforms. Because HPET is not officially documented for ICH3/4 and may not have been validated by chipset folks, we're on thin ice here. 
I'd recommend testing this patch in -hrt or -mm for a while and wait for success/failure reports before feeding it upstream. Signed-off-by: Udo A. Steinberg --- arch/i386/kernel/quirks.c | 8 ++++++++ 1 file changed, 8 insertions(+) Index: linux-rt.q/arch/i386/kernel/quirks.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/quirks.c +++ linux-rt.q/arch/i386/kernel/quirks.c @@ -232,6 +232,14 @@ static void old_ich_force_enable_hpet(st printk(KERN_DEBUG "Failed to force enable HPET\n"); } +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, + old_ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, old_ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, patches/i386-prepare-sharing-pit-code.patch0000664000077200007720000000331510646635211020030 0ustar mingomingoSubject: i386: prepare sharing the PIT code PIT clock events work already and the PIT handling is the same for i386 and x86_64. x86_64 does not support PIT as a clock source, so disable the PIT clocksource for x86_64. 
Prepare i8253.h to be shared with x86_64 Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/i386/kernel/i8253.c | 3 +++ include/asm-i386/i8253.h | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/i386/kernel/i8253.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/i8253.c +++ linux-rt.q/arch/i386/kernel/i8253.c @@ -119,6 +119,7 @@ void __init setup_pit_timer(void) global_clock_event = &pit_clockevent; } +#ifndef CONFIG_X86_64 /* * Since the PIT overflows every tick, its not very useful * to just read by itself. So use jiffies to emulate a free @@ -203,3 +204,5 @@ static int __init init_pit_clocksource(v return clocksource_register(&clocksource_pit); } arch_initcall(init_pit_clocksource); + +#endif Index: linux-rt.q/include/asm-i386/i8253.h =================================================================== --- linux-rt.q.orig/include/asm-i386/i8253.h +++ linux-rt.q/include/asm-i386/i8253.h @@ -1,8 +1,6 @@ #ifndef __ASM_I8253_H__ #define __ASM_I8253_H__ -#include - /* i8253A PIT registers */ #define PIT_MODE 0x43 #define PIT_CH0 0x40 @@ -10,8 +8,12 @@ extern spinlock_t i8253_lock; +#ifdef CONFIG_GENERIC_CLOCKEVENTS + extern struct clock_event_device *global_clock_event; extern void setup_pit_timer(void); +#endif + #endif /* __ASM_I8253_H__ */ patches/netfilter-more-debugging.patch0000664000077200007720000000162310646635212017436 0ustar mingomingo doing netfilter changes and turning on netfilter debug means we've got to interpret netfilter warning messages a bit more.
--- include/net/netfilter/nf_conntrack.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) Index: linux-rt.q/include/net/netfilter/nf_conntrack.h =================================================================== --- linux-rt.q.orig/include/net/netfilter/nf_conntrack.h +++ linux-rt.q/include/net/netfilter/nf_conntrack.h @@ -63,11 +63,14 @@ union nf_conntrack_help { #ifdef CONFIG_NETFILTER_DEBUG #define NF_CT_ASSERT(x) \ do { \ - if (!(x)) \ + if (!(x)) { \ /* Wooah! I'm tripping my conntrack in a frenzy of \ netplay... */ \ printk("NF_CT_ASSERT: %s:%i(%s)\n", \ __FILE__, __LINE__, __FUNCTION__); \ + if (printk_ratelimit()) \ + WARN_ON(1); \ + } \ } while(0) #else #define NF_CT_ASSERT(x) patches/rcu-preempt-fix-nmi-watchdog.patch0000664000077200007720000000264510646635213020165 0ustar mingomingoSubject: change die_chain from atomic to raw notifiers From: Ingo Molnar atomic notifier chains are using rcu_read_lock()/unlock(), but those are not NMI-safe in -rt - so switch these chains to raw notifiers. 
Signed-off-by: Ingo Molnar --- kernel/die_notifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) Index: linux-rt.q/kernel/die_notifier.c =================================================================== --- linux-rt.q.orig/kernel/die_notifier.c +++ linux-rt.q/kernel/die_notifier.c @@ -5,7 +5,7 @@ #include -static ATOMIC_NOTIFIER_HEAD(die_chain); +static RAW_NOTIFIER_HEAD(die_chain); int notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) @@ -19,19 +19,19 @@ int notify_die(enum die_val val, const c }; - return atomic_notifier_call_chain(&die_chain, val, &args); + return raw_notifier_call_chain(&die_chain, val, &args); } int register_die_notifier(struct notifier_block *nb) { vmalloc_sync_all(); - return atomic_notifier_chain_register(&die_chain, nb); + return raw_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); int unregister_die_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&die_chain, nb); + return raw_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier); patches/bh-state-lock.patch0000664000077200007720000000576210646635214015220 0ustar mingomingo I was compiling a kernel in a shell that I set to a priority of 20, and it locked up on the bit_spin_lock crap of jbd. This patch adds another spinlock to the buffer head and uses that instead of the bit_spins. 
From: Steven Rostedt Signed-off-by: Ingo Molnar -- fs/buffer.c | 3 ++- include/linux/buffer_head.h | 1 + include/linux/jbd.h | 12 ++++++------ 3 files changed, 9 insertions(+), 7 deletions(-) Index: linux-rt.q/fs/buffer.c =================================================================== --- linux-rt.q.orig/fs/buffer.c +++ linux-rt.q/fs/buffer.c @@ -40,7 +40,6 @@ #include #include #include -#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -2897,6 +2896,7 @@ struct buffer_head *alloc_buffer_head(gf if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); spin_lock_init(&ret->b_uptodate_lock); + spin_lock_init(&ret->b_state_lock); get_cpu_var(bh_accounting).nr++; recalc_bh_state(); put_cpu_var(bh_accounting); @@ -2909,6 +2909,7 @@ void free_buffer_head(struct buffer_head { BUG_ON(!list_empty(&bh->b_assoc_buffers)); BUG_ON(spin_is_locked(&bh->b_uptodate_lock)); + BUG_ON(spin_is_locked(&bh->b_state_lock)); kmem_cache_free(bh_cachep, bh); get_cpu_var(bh_accounting).nr--; recalc_bh_state(); Index: linux-rt.q/include/linux/buffer_head.h =================================================================== --- linux-rt.q.orig/include/linux/buffer_head.h +++ linux-rt.q/include/linux/buffer_head.h @@ -70,6 +70,7 @@ struct buffer_head { associated with */ atomic_t b_count; /* users using this buffer_head */ spinlock_t b_uptodate_lock; + spinlock_t b_state_lock; }; /* Index: linux-rt.q/include/linux/jbd.h =================================================================== --- linux-rt.q.orig/include/linux/jbd.h +++ linux-rt.q/include/linux/jbd.h @@ -331,32 +331,32 @@ static inline struct journal_head *bh2jh static inline void jbd_lock_bh_state(struct buffer_head *bh) { - bit_spin_lock(BH_State, &bh->b_state); + spin_lock(&bh->b_state_lock); } static inline int jbd_trylock_bh_state(struct buffer_head *bh) { - return bit_spin_trylock(BH_State, &bh->b_state); + return spin_trylock(&bh->b_state_lock); } static inline int jbd_is_locked_bh_state(struct 
buffer_head *bh) { - return bit_spin_is_locked(BH_State, &bh->b_state); + return spin_is_locked(&bh->b_state_lock); } static inline void jbd_unlock_bh_state(struct buffer_head *bh) { - bit_spin_unlock(BH_State, &bh->b_state); + spin_unlock(&bh->b_state_lock); } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { - bit_spin_lock(BH_JournalHead, &bh->b_state); + spin_lock_irq(&bh->b_uptodate_lock); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { - bit_spin_unlock(BH_JournalHead, &bh->b_state); + spin_unlock_irq(&bh->b_uptodate_lock); } struct jbd_revoke_table_s; patches/version.patch0000664000077200007720000000107710646635217014246 0ustar mingomingoSubject: add -rt extra-version From: Ingo Molnar add -rt extra-version. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/Makefile =================================================================== --- linux-rt.q.orig/Makefile +++ linux-rt.q/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 22 -EXTRAVERSION = .1-cfs-v19 +EXTRAVERSION = .1-rt4 NAME = Holy Dancing Manatees, Batman! 
# *DOCUMENTATION* patches/preempt-realtime-mips.patch0000664000077200007720000013101010646635214016767 0ustar mingomingo arch/mips/Kconfig | 13 +- arch/mips/kernel/asm-offsets.c | 2 arch/mips/kernel/entry.S | 22 ++-- arch/mips/kernel/i8259.c | 2 arch/mips/kernel/module.c | 2 arch/mips/kernel/process.c | 8 - arch/mips/kernel/scall32-o32.S | 2 arch/mips/kernel/scall64-64.S | 2 arch/mips/kernel/scall64-n32.S | 2 arch/mips/kernel/scall64-o32.S | 2 arch/mips/kernel/semaphore.c | 22 ++-- arch/mips/kernel/signal.c | 4 arch/mips/kernel/signal32.c | 4 arch/mips/kernel/smp.c | 27 ++++ arch/mips/kernel/time.c | 208 ++++++++++++++++++++++++++++++++++++-- arch/mips/kernel/traps.c | 2 arch/mips/mm/init.c | 2 arch/mips/sibyte/cfe/smp.c | 4 arch/mips/sibyte/sb1250/irq.c | 10 + arch/mips/sibyte/sb1250/smp.c | 2 arch/mips/sibyte/swarm/setup.c | 6 + include/asm-mips/asmmacro.h | 8 - include/asm-mips/atomic.h | 1 include/asm-mips/bitops.h | 5 include/asm-mips/hw_irq.h | 1 include/asm-mips/i8259.h | 2 include/asm-mips/io.h | 1 include/asm-mips/linkage.h | 5 include/asm-mips/m48t35.h | 2 include/asm-mips/mipsregs.h | 4 include/asm-mips/rwsem.h | 176 ++++++++++++++++++++++++++++++++ include/asm-mips/semaphore.h | 33 +++--- include/asm-mips/spinlock.h | 18 +-- include/asm-mips/spinlock_types.h | 4 include/asm-mips/thread_info.h | 2 include/asm-mips/time.h | 2 include/asm-mips/timeofday.h | 5 include/asm-mips/uaccess.h | 12 -- 38 files changed, 536 insertions(+), 93 deletions(-) Index: linux-rt.q/arch/mips/Kconfig =================================================================== --- linux-rt.q.orig/arch/mips/Kconfig +++ linux-rt.q/arch/mips/Kconfig @@ -674,18 +674,16 @@ source "arch/mips/philips/pnx8550/common endmenu + config RWSEM_GENERIC_SPINLOCK bool - depends on !PREEMPT_RT default y config RWSEM_XCHGADD_ALGORITHM bool - depends on !PREEMPT_RT config ASM_SEMAPHORES bool -# depends on !PREEMPT_RT default y config ARCH_HAS_ILOG2_U32 @@ -1786,6 +1784,15 @@ config SECCOMP If unsure, say 
Y. Only embedded should say N here. +config GENERIC_TIME + bool + default y + +source "kernel/time/Kconfig" + +config CPU_SPEED + int "CPU speed used for clocksource/clockevent calculations" + default 600 endmenu config LOCKDEP_SUPPORT Index: linux-rt.q/arch/mips/kernel/asm-offsets.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/asm-offsets.c +++ linux-rt.q/arch/mips/kernel/asm-offsets.c @@ -10,9 +10,11 @@ */ #include #include +#include #include #include #include +#include #include #include Index: linux-rt.q/arch/mips/kernel/entry.S =================================================================== --- linux-rt.q.orig/arch/mips/kernel/entry.S +++ linux-rt.q/arch/mips/kernel/entry.S @@ -30,7 +30,7 @@ .align 5 #ifndef CONFIG_PREEMPT FEXPORT(ret_from_exception) - local_irq_disable # preempt stop + raw_local_irq_disable # preempt stop b __ret_from_irq #endif FEXPORT(ret_from_irq) @@ -41,7 +41,7 @@ FEXPORT(__ret_from_irq) beqz t0, resume_kernel resume_userspace: - local_irq_disable # make sure we dont miss an + raw_local_irq_disable # make sure we dont miss an # interrupt setting need_resched # between sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -51,7 +51,9 @@ resume_userspace: #ifdef CONFIG_PREEMPT resume_kernel: - local_irq_disable + raw_local_irq_disable + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -61,7 +63,9 @@ need_resched: LONG_L t0, PT_STATUS(sp) # Interrupts off? 
andi t0, 1 beqz t0, restore_all + raw_local_irq_disable jal preempt_schedule_irq + sw zero, TI_PRE_COUNT($28) b need_resched #endif @@ -69,7 +73,7 @@ FEXPORT(ret_from_fork) jal schedule_tail # a0 = struct task_struct *prev FEXPORT(syscall_exit) - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -140,19 +144,21 @@ FEXPORT(restore_partial) # restore part .set at work_pending: - andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS + # a2 is preloaded with TI_FLAGS + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beqz t0, work_notifysig work_resched: + raw_local_irq_enable t0 jal schedule - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) andi t0, a2, _TIF_WORK_MASK # is there any work to be done # other than syscall tracing? beqz t0, restore_all - andi t0, a2, _TIF_NEED_RESCHED + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bnez t0, work_resched work_notifysig: # deal with pending signals and @@ -168,7 +174,7 @@ syscall_exit_work: li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? - local_irq_enable # could let do_syscall_trace() + raw_local_irq_enable # could let do_syscall_trace() # call schedule() instead move a0, sp li a1, 1 Index: linux-rt.q/arch/mips/kernel/i8259.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/i8259.c +++ linux-rt.q/arch/mips/kernel/i8259.c @@ -29,9 +29,9 @@ */ static int i8259A_auto_eoi = -1; -DEFINE_SPINLOCK(i8259A_lock); /* some platforms call this... 
*/ void mask_and_ack_8259A(unsigned int); +DEFINE_RAW_SPINLOCK(i8259A_lock); static struct irq_chip i8259A_chip = { .name = "XT-PIC", Index: linux-rt.q/arch/mips/kernel/module.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/module.c +++ linux-rt.q/arch/mips/kernel/module.c @@ -40,7 +40,7 @@ struct mips_hi16 { static struct mips_hi16 *mips_hi16_list; static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { Index: linux-rt.q/arch/mips/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/process.c +++ linux-rt.q/arch/mips/kernel/process.c @@ -50,7 +50,7 @@ ATTRIB_NORET void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { #ifdef CONFIG_SMTC_IDLE_HOOK_DEBUG extern void smtc_idle_loop_hook(void); @@ -59,9 +59,11 @@ ATTRIB_NORET void cpu_idle(void) if (cpu_wait) (*cpu_wait)(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux-rt.q/arch/mips/kernel/scall32-o32.S =================================================================== --- linux-rt.q.orig/arch/mips/kernel/scall32-o32.S +++ linux-rt.q/arch/mips/kernel/scall32-o32.S @@ -73,7 +73,7 @@ stack_done: 1: sw v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return lw a2, TI_FLAGS($28) # current->work Index: linux-rt.q/arch/mips/kernel/scall64-64.S =================================================================== --- linux-rt.q.orig/arch/mips/kernel/scall64-64.S +++ linux-rt.q/arch/mips/kernel/scall64-64.S @@ -72,7 +72,7 @@ 
NESTED(handle_sys64, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result n64_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux-rt.q/arch/mips/kernel/scall64-n32.S =================================================================== --- linux-rt.q.orig/arch/mips/kernel/scall64-n32.S +++ linux-rt.q/arch/mips/kernel/scall64-n32.S @@ -69,7 +69,7 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd v0, PT_R0(sp) # set flag for syscall restarting 1: sd v0, PT_R2(sp) # result - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux-rt.q/arch/mips/kernel/scall64-o32.S =================================================================== --- linux-rt.q.orig/arch/mips/kernel/scall64-o32.S +++ linux-rt.q/arch/mips/kernel/scall64-o32.S @@ -98,7 +98,7 @@ NESTED(handle_sys, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make need_resched and + raw_local_irq_disable # make need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) Index: linux-rt.q/arch/mips/kernel/semaphore.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/semaphore.c +++ linux-rt.q/arch/mips/kernel/semaphore.c @@ -36,7 +36,7 @@ * sem->count and sem->waking atomic. Scalability isn't an issue because * this lock is used on UP only so it's just an empty variable. 
*/ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -67,7 +67,7 @@ static inline int __sem_update_count(str : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); @@ -80,7 +80,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -94,7 +94,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -104,7 +104,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -133,9 +133,9 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -165,4 +165,10 @@ int __sched __down_interruptible(struct return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-rt.q/arch/mips/kernel/signal.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/signal.c +++ linux-rt.q/arch/mips/kernel/signal.c @@ -629,6 +629,10 @@ static void do_signal(struct pt_regs *re siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. 
Just return without doing anything Index: linux-rt.q/arch/mips/kernel/signal32.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/signal32.c +++ linux-rt.q/arch/mips/kernel/signal32.c @@ -656,6 +656,10 @@ static int setup_rt_frame_32(struct k_si if (err) goto give_sigsegv; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * Arguments to signal handler: * Index: linux-rt.q/arch/mips/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/smp.c +++ linux-rt.q/arch/mips/kernel/smp.c @@ -88,7 +88,22 @@ asmlinkage __cpuinit void start_secondar cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. + */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -275,6 +290,8 @@ int setup_profiling_timer(unsigned int m return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -332,6 +349,7 @@ static inline void smp_on_each_tlb(void void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_on_other_tlbs(flush_tlb_mm_ipi, (void *)mm); @@ -341,6 +359,7 @@ void flush_tlb_mm(struct mm_struct *mm) if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -364,6 +383,8 @@ void flush_tlb_range(struct vm_area_stru struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) 
!= 1) || (current->mm != mm)) { struct flush_tlb_data fd; @@ -377,6 +398,7 @@ void flush_tlb_range(struct vm_area_stru if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -407,6 +429,8 @@ static void flush_tlb_page_ipi(void *inf void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd; @@ -419,6 +443,7 @@ void flush_tlb_page(struct vm_area_struc if (smp_processor_id() != i) cpu_context(i, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } Index: linux-rt.q/arch/mips/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/time.c +++ linux-rt.q/arch/mips/kernel/time.c @@ -10,6 +10,11 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. + * + * This implementation of High Res Timers uses two timers. One is the system + * timer. The second is used for the high res timers. The high res timers + * require the CPU to have count/compare registers. The mips_set_next_event() + * function schedules the next high res timer interrupt. 
*/ #include #include @@ -23,6 +28,7 @@ #include #include #include +#include #include #include @@ -47,7 +53,27 @@ /* * forward reference */ -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); + +/* any missed timer interrupts */ +int missed_timer_count; + +#ifdef CONFIG_HIGH_RES_TIMERS +static void mips_set_next_event(unsigned long evt); +static void mips_set_mode(int mode, void *priv); + +static struct clock_event lapic_clockevent = { + .name = "mips clockevent interface", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_HAS_IRQHANDLER +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_next_event = mips_set_next_event, +}; +#endif /* * By default we provide the null RTC ops @@ -56,6 +82,129 @@ static unsigned long null_rtc_get_time(v { return mktime(2000, 1, 1, 0, 0, 0); } +#ifdef CONFIG_SMP +/* + * We have to synchronize the master CPU with all the slave CPUs + */ +static atomic_t cpus_started; +static atomic_t cpus_ready; +static atomic_t cpus_count; +/* + * Master processor inits + */ +static void sync_cpus_init(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + atomic_set(&cpus_ready, v); + mb(); +} + +/* + * Called by the master processor + */ +static void sync_cpus_master(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + /* Wait here till all other CPUs are now ready */ + while (atomic_read(&cpus_count) != (num_online_cpus() -1) ) + mb(); + atomic_set(&cpus_ready, v); + mb(); +} +/* + * Called by the slave processors + */ +static void sync_cpus_slave(int v) +{ + /* Check if the master has been through this */ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); + while (atomic_read(&cpus_ready) != v) + mb(); +} +/* + * Called by the slave CPUs when done syncing the count register + * with the master processor + */ +static void sync_cpus_slave_exit(int v) +{ + while (atomic_read(&cpus_started) != v) + 
mb(); + atomic_inc(&cpus_count); + mb(); +} + +#define LOOPS 100 +static u32 c0_count[NR_CPUS]; /* Count register per CPU */ +static u32 c[NR_CPUS][LOOPS + 1]; /* Count register per CPU per loop for syncing */ + +/* + * Slave processors execute this via IPI + */ +static void sync_c0_count_slave(void *info) +{ + int cpus = 1, loop, prev_count = 0, cpu = smp_processor_id(); + unsigned long flags; + u32 diff_count; /* CPU count registers are 32-bit */ + local_irq_save(flags); + + for(loop = 0; loop <= LOOPS; loop++) { + /* Sync with the Master processor */ + sync_cpus_slave(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + sync_cpus_slave(cpus++); + diff_count = c0_count[0] - c0_count[cpu]; + diff_count += prev_count; + diff_count += read_c0_count(); + write_c0_count(diff_count); + prev_count = (prev_count >> 1) + + ((int)(c0_count[0] - c0_count[cpu]) >> 1); + } + + /* Slave processor is done syncing count register with Master */ + sync_cpus_slave_exit(cpus++); + printk("SMP: Slave processor %d done syncing count \n", cpu); + local_irq_restore(flags); +} + +/* + * Master kicks off the syncing process + */ +void sync_c0_count_master(void) +{ + int cpus = 0, loop, cpu = smp_processor_id(); + unsigned long flags; + + printk("SMP: Starting to sync the c0 count register ... \n"); + sync_cpus_init(cpus++); + + /* Kick off the slave processors to also start the syncing process */ + smp_call_function(sync_c0_count_slave, NULL, 0, 0); + local_irq_save(flags); + + for (loop = 0; loop <= LOOPS; loop++) { + /* Wait for all the CPUs here */ + sync_cpus_master(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + /* Do syncing once more */ + sync_cpus_master(cpus++); + } + sync_cpus_master(cpus++); + local_irq_restore(flags); + + printk("SMP: Syncing process completed accross CPUs ... 
\n"); +} +#endif /* CONFIG_SMP */ static int null_rtc_set_time(unsigned long sec) { @@ -66,19 +215,30 @@ unsigned long (*rtc_mips_get_time)(void) int (*rtc_mips_set_time)(unsigned long) = null_rtc_set_time; int (*rtc_mips_set_mmss)(unsigned long); - /* how many counter cycles in a jiffy */ static unsigned long cycles_per_jiffy __read_mostly; +static unsigned long hrt_cycles_per_jiffy __read_mostly; + + /* expirelo is the count value for next CPU timer interrupt */ static unsigned int expirelo; - /* * Null timer ack for systems not needing one (e.g. i8254). */ static void null_timer_ack(void) { /* nothing */ } +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * Set the next event + */ +static void mips_set_next_event(unsigned long evt) +{ + write_c0_compare(read_c0_count() + evt); +} +#endif + /* * Null high precision timer functions for systems lacking one. */ @@ -95,13 +255,13 @@ static void c0_timer_ack(void) unsigned int count; /* Ack this timer interrupt and set the next one. */ - expirelo += cycles_per_jiffy; + expirelo += hrt_cycles_per_jiffy; write_c0_compare(expirelo); - /* Check to see if we have missed any timer interrupts. */ - while (((count = read_c0_count()) - expirelo) < 0x7fffffff) { - /* missed_timer_count++; */ - expirelo = count + cycles_per_jiffy; + count = read_c0_count(); + if ((count - expirelo) < 0x7fffffff) { + /* missed_timer_count++; */ + expirelo = count + hrt_cycles_per_jiffy; write_c0_compare(expirelo); } } @@ -160,7 +320,7 @@ irqreturn_t timer_interrupt(int irq, voi /* * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. rtc_mips_set_time() has to be + * CMOS clock accordingly every ~11 minutes. rtc_set_time() has to be * called as close as possible to 500 ms before the new second starts. 
*/ if (ntp_synced() && @@ -228,6 +388,15 @@ static inline int handle_perf_irq (int r !r2; } +#ifdef CONFIG_HIGH_RES_TIMERS +void event_timer_handler(struct pt_regs *regs) +{ + c0_timer_ack(); + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs,NULL); +} +#endif + asmlinkage void ll_timer_interrupt(int irq) { int r2 = cpu_has_mips_r2; @@ -235,6 +404,16 @@ asmlinkage void ll_timer_interrupt(int i irq_enter(); kstat_this_cpu.irqs[irq]++; + +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Run the event handler + */ + if (!r2 || (read_c0_cause() & (1 << 26))) + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs,NULL); +#endif + if (handle_perf_irq(r2)) goto out; @@ -267,7 +446,7 @@ asmlinkage void ll_local_timer_interrupt * b) (optional) calibrate and set the mips_hpt_frequency * (only needed if you intended to use cpu counter as timer interrupt * source) - * 2) setup xtime based on rtc_mips_get_time(). + * 2) setup xtime based on rtc_get_time(). * 3) calculate a couple of cached variables for later usage * 4) plat_timer_setup() - * a) (optional) over-write any choices made above by time_init(). @@ -358,6 +537,9 @@ static void __init init_mips_clocksource void __init time_init(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + u64 temp; +#endif if (board_time_init) board_time_init(); @@ -401,6 +583,12 @@ void __init time_init(void) if (!mips_hpt_frequency) mips_hpt_frequency = calibrate_hpt(); +#ifdef CONFIG_HIGH_RES_TIMERS + hrt_cycles_per_jiffy = ( (CONFIG_CPU_SPEED * 1000000) + HZ / 2) / HZ; +#else + hrt_cycles_per_jiffy = cycles_per_jiffy; +#endif + /* Report the high precision timer rate for a reference. 
*/ printk("Using %u.%03u MHz high precision timer.\n", ((mips_hpt_frequency + 500) / 1000) / 1000, Index: linux-rt.q/arch/mips/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/traps.c +++ linux-rt.q/arch/mips/kernel/traps.c @@ -309,7 +309,7 @@ void show_registers(struct pt_regs *regs printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); NORET_TYPE void ATTRIB_NORET die(const char * str, struct pt_regs * regs) { Index: linux-rt.q/arch/mips/mm/init.c =================================================================== --- linux-rt.q.orig/arch/mips/mm/init.c +++ linux-rt.q/arch/mips/mm/init.c @@ -59,7 +59,7 @@ #endif /* CONFIG_MIPS_MT_SMTC */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * We have up to 8 empty zeroed pages so we can map one of the right colour Index: linux-rt.q/arch/mips/sibyte/cfe/smp.c =================================================================== --- linux-rt.q.orig/arch/mips/sibyte/cfe/smp.c +++ linux-rt.q/arch/mips/sibyte/cfe/smp.c @@ -107,4 +107,8 @@ void prom_smp_finish(void) */ void prom_cpus_done(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + extern void sync_c0_count_master(void); + sync_c0_count_master(); +#endif } Index: linux-rt.q/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux-rt.q.orig/arch/mips/sibyte/sb1250/irq.c +++ linux-rt.q/arch/mips/sibyte/sb1250/irq.c @@ -81,7 +81,7 @@ static struct irq_chip sb1250_irq_type = /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -352,6 +352,10 @@ void __init arch_init_irq(void) #ifdef CONFIG_KGDB imask |= STATUSF_IP6; #endif + +#ifdef CONFIG_HIGH_RES_TIMERS + imask |= STATUSF_IP7; +#endif /* Enable necessary IPs, disable the 
rest */ change_c0_status(ST0_IM, imask); @@ -429,6 +433,10 @@ asmlinkage void plat_irq_dispatch(void) else #endif +#ifdef CONFIG_HIGH_RES_TIMERS + if (pending & CAUSEF_IP7) + event_timer_handler(regs); +#endif if (pending & CAUSEF_IP4) sb1250_timer_interrupt(); Index: linux-rt.q/arch/mips/sibyte/sb1250/smp.c =================================================================== --- linux-rt.q.orig/arch/mips/sibyte/sb1250/smp.c +++ linux-rt.q/arch/mips/sibyte/sb1250/smp.c @@ -59,7 +59,7 @@ void sb1250_smp_finish(void) { extern void sb1250_time_init(void); sb1250_time_init(); - local_irq_enable(); + raw_local_irq_enable(); } /* Index: linux-rt.q/arch/mips/sibyte/swarm/setup.c =================================================================== --- linux-rt.q.orig/arch/mips/sibyte/swarm/setup.c +++ linux-rt.q/arch/mips/sibyte/swarm/setup.c @@ -131,6 +131,12 @@ void __init plat_mem_setup(void) rtc_mips_set_time = m41t81_set_time; } +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * set the mips_hpt_frequency here + */ + mips_hpt_frequency = CONFIG_CPU_SPEED * 1000000; +#endif printk("This kernel optimized for " #ifdef CONFIG_SIMULATION "simulation" Index: linux-rt.q/include/asm-mips/asmmacro.h =================================================================== --- linux-rt.q.orig/include/asm-mips/asmmacro.h +++ linux-rt.q/include/asm-mips/asmmacro.h @@ -21,7 +21,7 @@ #endif #ifdef CONFIG_MIPS_MT_SMTC - .macro local_irq_enable reg=t0 + .macro raw_local_irq_enable reg=t0 mfc0 \reg, CP0_TCSTATUS ori \reg, \reg, TCSTATUS_IXMT xori \reg, \reg, TCSTATUS_IXMT @@ -29,21 +29,21 @@ _ehb .endm - .macro local_irq_disable reg=t0 + .macro raw_local_irq_disable reg=t0 mfc0 \reg, CP0_TCSTATUS ori \reg, \reg, TCSTATUS_IXMT mtc0 \reg, CP0_TCSTATUS _ehb .endm #else - .macro local_irq_enable reg=t0 + .macro raw_local_irq_enable reg=t0 mfc0 \reg, CP0_STATUS ori \reg, \reg, 1 mtc0 \reg, CP0_STATUS irq_enable_hazard .endm - .macro local_irq_disable reg=t0 + .macro raw_local_irq_disable reg=t0 mfc0 \reg, 
CP0_STATUS ori \reg, \reg, 1 xori \reg, \reg, 1 Index: linux-rt.q/include/asm-mips/atomic.h =================================================================== --- linux-rt.q.orig/include/asm-mips/atomic.h +++ linux-rt.q/include/asm-mips/atomic.h @@ -573,7 +573,6 @@ static __inline__ long atomic64_add_retu raw_local_irq_restore(flags); } #endif -#endif smp_mb(); Index: linux-rt.q/include/asm-mips/bitops.h =================================================================== --- linux-rt.q.orig/include/asm-mips/bitops.h +++ linux-rt.q/include/asm-mips/bitops.h @@ -500,9 +500,6 @@ static inline unsigned long __ffs(unsign } /* - * fls - find last bit set. - * @word: The word to search - * * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ @@ -520,6 +517,8 @@ static inline int fls64(__u64 word) return 64 - word; } +#define __bi_local_irq_save(x) raw_local_irq_save(x) +#define __bi_local_irq_restore(x) raw_local_irq_restore(x) #else #include #endif Index: linux-rt.q/include/asm-mips/hw_irq.h =================================================================== --- linux-rt.q.orig/include/asm-mips/hw_irq.h +++ linux-rt.q/include/asm-mips/hw_irq.h @@ -10,6 +10,7 @@ #include #include +#include extern void disable_8259A_irq(unsigned int irq); extern void enable_8259A_irq(unsigned int irq); Index: linux-rt.q/include/asm-mips/i8259.h =================================================================== --- linux-rt.q.orig/include/asm-mips/i8259.h +++ linux-rt.q/include/asm-mips/i8259.h @@ -35,7 +35,7 @@ #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; extern void init_8259A(int auto_eoi); extern void enable_8259A_irq(unsigned int irq); Index: linux-rt.q/include/asm-mips/io.h =================================================================== --- linux-rt.q.orig/include/asm-mips/io.h +++ linux-rt.q/include/asm-mips/io.h @@ -15,6 +15,7 @@ #include #include #include 
+#include #include #include Index: linux-rt.q/include/asm-mips/linkage.h =================================================================== --- linux-rt.q.orig/include/asm-mips/linkage.h +++ linux-rt.q/include/asm-mips/linkage.h @@ -3,6 +3,11 @@ #ifdef __ASSEMBLY__ #include + +/* FASTCALL stuff */ +#define FASTCALL(x) x +#define fastcall + #endif #endif Index: linux-rt.q/include/asm-mips/m48t35.h =================================================================== --- linux-rt.q.orig/include/asm-mips/m48t35.h +++ linux-rt.q/include/asm-mips/m48t35.h @@ -6,7 +6,7 @@ #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; struct m48t35_rtc { volatile u8 pad[0x7ff8]; /* starts at 0x7ff8 */ Index: linux-rt.q/include/asm-mips/mipsregs.h =================================================================== --- linux-rt.q.orig/include/asm-mips/mipsregs.h +++ linux-rt.q/include/asm-mips/mipsregs.h @@ -705,7 +705,7 @@ do { \ unsigned long long val; \ unsigned long flags; \ \ - local_irq_save(flags); \ + local_irq_save(flags); \ if (sel == 0) \ __asm__ __volatile__( \ ".set\tmips64\n\t" \ @@ -733,7 +733,7 @@ do { \ do { \ unsigned long flags; \ \ - local_irq_save(flags); \ + local_irq_save(flags); \ if (sel == 0) \ __asm__ __volatile__( \ ".set\tmips64\n\t" \ Index: linux-rt.q/include/asm-mips/rwsem.h =================================================================== --- /dev/null +++ linux-rt.q/include/asm-mips/rwsem.h @@ -0,0 +1,176 @@ +/* + * include/asm-mips/rwsem.h: R/W semaphores for MIPS using the stuff + * in lib/rwsem.c. 
Adapted largely from include/asm-ppc/rwsem.h + * by john.cooper@timesys.com + */ + +#ifndef _MIPS_RWSEM_H +#define _MIPS_RWSEM_H + +#ifndef _LINUX_RWSEM_H +#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" +#endif + +#ifdef __KERNEL__ +#include +#include +#include +#include + +/* + * the semaphore definition + */ +struct compat_rw_semaphore { + /* XXX this should be able to be an atomic_t -- paulus */ + signed long count; +#define RWSEM_UNLOCKED_VALUE 0x00000000 +#define RWSEM_ACTIVE_BIAS 0x00000001 +#define RWSEM_ACTIVE_MASK 0x0000ffff +#define RWSEM_WAITING_BIAS (-0x00010000) +#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + raw_spinlock_t wait_lock; + struct list_head wait_list; +#if RWSEM_DEBUG + int debug; +#endif +}; + +/* + * initialisation + */ +#if RWSEM_DEBUG +#define __RWSEM_DEBUG_INIT , 0 +#else +#define __RWSEM_DEBUG_INIT /* */ +#endif + +#define __COMPAT_RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ + LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEBUG_INIT } + +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name) + +extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem); + +static inline void compat_init_rwsem(struct compat_rw_semaphore *sem) +{ + sem->count = RWSEM_UNLOCKED_VALUE; + spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +#if RWSEM_DEBUG + sem->debug = 0; +#endif +} + +/* + * lock for reading + */ +static inline void __down_read(struct compat_rw_semaphore *sem) +{ + if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) + smp_wmb(); + else + 
rwsem_down_read_failed(sem); +} + +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) +{ + int tmp; + + while ((tmp = sem->count) >= 0) { + if (tmp == cmpxchg(&sem->count, tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { + smp_wmb(); + return 1; + } + } + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct compat_rw_semaphore *sem) +{ + int tmp; + + tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, + (atomic_t *)(&sem->count)); + if (tmp == RWSEM_ACTIVE_WRITE_BIAS) + smp_wmb(); + else + rwsem_down_write_failed(sem); +} + +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) +{ + int tmp; + + tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); + smp_wmb(); + return tmp == RWSEM_UNLOCKED_VALUE; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct compat_rw_semaphore *sem) +{ + int tmp; + + smp_wmb(); + tmp = atomic_dec_return((atomic_t *)(&sem->count)); + if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct compat_rw_semaphore *sem) +{ + smp_wmb(); + if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, + (atomic_t *)(&sem->count)) < 0) + rwsem_wake(sem); +} + +/* + * implement atomic add functionality + */ +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) +{ + atomic_add(delta, (atomic_t *)(&sem->count)); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct compat_rw_semaphore *sem) +{ + int tmp; + + smp_wmb(); + tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); + if (tmp < 0) + rwsem_downgrade_wake(sem); +} + +/* + * implement exchange and add functionality + */ +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) +{ + smp_mb(); + return atomic_add_return(delta, (atomic_t *)(&sem->count)); +} + +#endif /* __KERNEL__ */ +#endif /* _MIPS_RWSEM_H */ 
Index: linux-rt.q/include/asm-mips/semaphore.h =================================================================== --- linux-rt.q.orig/include/asm-mips/semaphore.h +++ linux-rt.q/include/asm-mips/semaphore.h @@ -47,39 +47,42 @@ struct compat_semaphore { wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name, 1) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) -#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name, 0) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -static inline void sema_init (struct semaphore *sem, int val) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) +#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 0) + +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern void __up(struct semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern void __compat_up(struct 
compat_semaphore * sem); -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -112,6 +115,8 @@ static inline void compat_up(struct comp __compat_up(sem); } +extern int compat_sem_is_locked(struct compat_semaphore *sem); + #define compat_sema_count(sem) atomic_read(&(sem)->count) #include Index: linux-rt.q/include/asm-mips/spinlock.h =================================================================== --- linux-rt.q.orig/include/asm-mips/spinlock.h +++ linux-rt.q/include/asm-mips/spinlock.h @@ -28,7 +28,7 @@ * We make no fairness assumptions. They have a cost. */ -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { unsigned int tmp; @@ -70,7 +70,7 @@ static inline void __raw_spin_lock(raw_s smp_mb(); } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { smp_mb(); @@ -83,7 +83,7 @@ static inline void __raw_spin_unlock(raw : "memory"); } -static inline unsigned int __raw_spin_trylock(raw_spinlock_t *lock) +static inline unsigned int __raw_spin_trylock(__raw_spinlock_t *lock) { unsigned int temp, res; @@ -144,7 +144,7 @@ static inline unsigned int __raw_spin_tr */ #define __raw_write_can_lock(rw) (!(rw)->lock) -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void __raw_read_lock(__raw_rwlock_t *rw) { unsigned int tmp; @@ -189,7 +189,7 @@ static inline void __raw_read_lock(raw_r /* Note the use of sub, not subu which will make the kernel die with an overflow exception if we ever try to unlock an rwlock that is already unlocked or is being held by a writer. 
*/ -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void __raw_read_unlock(__raw_rwlock_t *rw) { unsigned int tmp; @@ -223,7 +223,7 @@ static inline void __raw_read_unlock(raw } } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void __raw_write_lock(__raw_rwlock_t *rw) { unsigned int tmp; @@ -265,7 +265,7 @@ static inline void __raw_write_lock(raw_ smp_mb(); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void __raw_write_unlock(__raw_rwlock_t *rw) { smp_mb(); @@ -277,7 +277,7 @@ static inline void __raw_write_unlock(ra : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *rw) +static inline int __raw_read_trylock(__raw_rwlock_t *rw) { unsigned int tmp; int ret; @@ -321,7 +321,7 @@ static inline int __raw_read_trylock(raw return ret; } -static inline int __raw_write_trylock(raw_rwlock_t *rw) +static inline int __raw_write_trylock(__raw_rwlock_t *rw) { unsigned int tmp; int ret; Index: linux-rt.q/include/asm-mips/spinlock_types.h =================================================================== --- linux-rt.q.orig/include/asm-mips/spinlock_types.h +++ linux-rt.q/include/asm-mips/spinlock_types.h @@ -7,13 +7,13 @@ typedef struct { volatile unsigned int lock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } typedef struct { volatile unsigned int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { 0 } Index: linux-rt.q/include/asm-mips/thread_info.h =================================================================== --- linux-rt.q.orig/include/asm-mips/thread_info.h +++ linux-rt.q/include/asm-mips/thread_info.h @@ -114,6 +114,7 @@ register struct thread_info *__current_t #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ #define TIF_SECCOMP 5 /* secure computing */ +#define TIF_NEED_RESCHED_DELAYED 6 /* reschedule on return to userspace */ #define TIF_RESTORE_SIGMASK 9 /* 
restore signal mask in do_signal() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -127,6 +128,7 @@ register struct thread_info *__current_t #define _TIF_NEED_RESCHED (1< #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; /* * RTC ops. By default, they point to no-RTC functions. Index: linux-rt.q/include/asm-mips/timeofday.h =================================================================== --- /dev/null +++ linux-rt.q/include/asm-mips/timeofday.h @@ -0,0 +1,5 @@ +#ifndef _ASM_MIPS_TIMEOFDAY_H +#define _ASM_MIPS_TIMEOFDAY_H +#include +#endif + Index: linux-rt.q/include/asm-mips/uaccess.h =================================================================== --- linux-rt.q.orig/include/asm-mips/uaccess.h +++ linux-rt.q/include/asm-mips/uaccess.h @@ -427,7 +427,6 @@ extern size_t __copy_user(void *__to, co const void *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -483,7 +482,6 @@ extern size_t __copy_user_inatomic(void const void *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -562,7 +560,6 @@ extern size_t __copy_user_inatomic(void const void __user *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -593,7 +590,6 @@ extern size_t __copy_user_inatomic(void const void __user *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -611,7 +607,6 @@ extern size_t __copy_user_inatomic(void const void __user *__cu_from; \ long __cu_len; \ \ - might_sleep(); \ __cu_to = (to); \ __cu_from = (from); \ __cu_len = (n); \ @@ -638,7 +633,6 @@ __clear_user(void __user *addr, __kernel { __kernel_size_t res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, $0\n\t" @@ -687,7 +681,6 @@ 
__strncpy_from_user(char *__to, const ch { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, %2\n\t" @@ -724,7 +717,6 @@ strncpy_from_user(char *__to, const char { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, %2\n\t" @@ -743,7 +735,6 @@ static inline long __strlen_user(const c { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" __MODULE_JAL(__strlen_user_nocheck_asm) @@ -773,7 +764,6 @@ static inline long strlen_user(const cha { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" __MODULE_JAL(__strlen_user_asm) @@ -790,7 +780,6 @@ static inline long __strnlen_user(const { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, %2\n\t" @@ -821,7 +810,6 @@ static inline long strnlen_user(const ch { long res; - might_sleep(); __asm__ __volatile__( "move\t$4, %1\n\t" "move\t$5, %2\n\t" patches/neptune-no-at-keyboard.patch0000664000077200007720000000330010646635211017032 0ustar mingomingoneptune needs this to boot ... 
--- drivers/input/keyboard/atkbd.c | 14 ++++++++++++++ drivers/input/mouse/psmouse-base.c | 15 +++++++++++++++ 2 files changed, 29 insertions(+) Index: linux-rt.q/drivers/input/keyboard/atkbd.c =================================================================== --- linux-rt.q.orig/drivers/input/keyboard/atkbd.c +++ linux-rt.q/drivers/input/keyboard/atkbd.c @@ -1396,9 +1396,23 @@ static ssize_t atkbd_show_err_count(stru return sprintf(buf, "%lu\n", atkbd->err_count); } +static int __read_mostly noatkbd; + +static int __init noatkbd_setup(char *str) +{ + noatkbd = 1; + printk(KERN_INFO "debug: not setting up AT keyboard.\n"); + + return 1; +} + +__setup("noatkbd", noatkbd_setup); static int __init atkbd_init(void) { + if (noatkbd) + return 0; + return serio_register_driver(&atkbd_drv); } Index: linux-rt.q/drivers/input/mouse/psmouse-base.c =================================================================== --- linux-rt.q.orig/drivers/input/mouse/psmouse-base.c +++ linux-rt.q/drivers/input/mouse/psmouse-base.c @@ -1565,10 +1565,25 @@ static int psmouse_get_maxproto(char *bu return sprintf(buffer, "%s\n", psmouse_protocol_by_type(type)->name); } +static int __read_mostly nopsmouse; + +static int __init nopsmouse_setup(char *str) +{ + nopsmouse = 1; + printk(KERN_INFO "debug: not setting up psmouse.\n"); + + return 1; +} + +__setup("nopsmouse", nopsmouse_setup); + static int __init psmouse_init(void) { int err; + if (nopsmouse) + return 0; + kpsmoused_wq = create_singlethread_workqueue("kpsmoused"); if (!kpsmoused_wq) { printk(KERN_ERR "psmouse: failed to create kpsmoused workqueue\n"); patches/ioapic-fix-too-fast-clocks.patch0000664000077200007720000000273010646635211017606 0ustar mingomingoFrom: Akira Tsukamoto This one line patch adds upper bound testing inside timer_irq_works() when evaluating whether irq timer works or not on boot up. It fix the machines having problem with clock running too fast. 
What this patch does: if timer interrupts are running too fast through the IO-APIC IRQ, fall back to the i8259A IRQ. I really appreciate the feedback from ATI Xpress 200 chipset users. It should eliminate the need to add no_timer_check to the kernel options. I have an NEC laptop using the ATI Xpress 200 chipset with a Pentium M 1.8GHz, and its clock keeps running ahead when the kernel is compiled with local APIC support. Many machines based on the RS200 chipset seem to have the same problem, including the Acer Ferrari 400X AMD notebook and the Compaq R4000. I would also like comments on the upper bound limit, 16 ticks, which I chose in this patch. My laptop always reports around 20, which is double the normal value. arch/i386/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/io_apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/io_apic.c +++ linux-rt.q/arch/i386/kernel/io_apic.c @@ -1919,7 +1919,7 @@ int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. 
*/ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; patches/preempt-realtime-irqs.patch0000664000077200007720000001117510646635215017007 0ustar mingomingo--- include/linux/irq.h | 10 ++++------ kernel/irq/handle.c | 13 +++++++++++-- kernel/irq/manage.c | 14 ++++++++++++-- kernel/irq/spurious.c | 3 +-- 4 files changed, 28 insertions(+), 12 deletions(-) Index: linux-rt.q/include/linux/irq.h =================================================================== --- linux-rt.q.orig/include/linux/irq.h +++ linux-rt.q/include/linux/irq.h @@ -145,7 +145,6 @@ struct irq_chip { * @irqs_unhandled: stats field for spurious unhandled interrupts * @thread: Thread pointer for threaded preemptible irq handling * @wait_for_handler: Waitqueue to wait for a running preemptible handler - * @cycles: Timestamp for stats and debugging * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@ -167,10 +166,10 @@ struct irq_desc { unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ unsigned int irqs_unhandled; - struct task_struct *thread; - wait_queue_head_t wait_for_handler; - cycles_t timestamp; - spinlock_t lock; + struct task_struct *thread; + wait_queue_head_t wait_for_handler; + cycles_t timestamp; + raw_spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; unsigned int cpu; @@ -396,7 +395,6 @@ extern int set_irq_msi(unsigned int irq, /* Early initialization of irqs */ extern void early_init_hardirqs(void); -extern cycles_t irq_timestamp(unsigned int irq); #if defined(CONFIG_PREEMPT_HARDIRQS) extern void init_hardirqs(void); Index: linux-rt.q/kernel/irq/handle.c =================================================================== --- linux-rt.q.orig/kernel/irq/handle.c +++ linux-rt.q/kernel/irq/handle.c @@ -54,12 +54,13 @@ struct irq_desc irq_desc[NR_IRQS] __cach .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = 
__SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(irq_desc), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif } }; +EXPORT_SYMBOL_GPL(irq_desc); /* * What should we do if we get a hw irq event on an illegal vector? @@ -151,6 +152,7 @@ irqreturn_t handle_IRQ_event(unsigned in ret = action->handler(irq, action->dev_id); if (preempt_count() != preempt_count) { + stop_trace(); print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler); printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); dump_stack(); @@ -225,7 +227,7 @@ int redirect_hardirq(struct irq_desc *de * This is the original x86 implementation which is used for every * interrupt type. */ -fastcall unsigned int __do_IRQ(unsigned int irq) +fastcall notrace unsigned int __do_IRQ(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; @@ -246,6 +248,13 @@ fastcall unsigned int __do_IRQ(unsigned desc->chip->end(irq); return 1; } + /* + * If the task is currently running in user mode, don't + * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not + * configured, this should be optimized out. 
+ */ + if (user_mode(get_irq_regs())) + touch_softlockup_watchdog(); spin_lock(&desc->lock); if (desc->chip->ack) Index: linux-rt.q/kernel/irq/manage.c =================================================================== --- linux-rt.q.orig/kernel/irq/manage.c +++ linux-rt.q/kernel/irq/manage.c @@ -600,6 +600,11 @@ int hardirq_preemption = 1; EXPORT_SYMBOL(hardirq_preemption); +/* + * Real-Time Preemption depends on hardirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + static int __init hardirq_preempt_setup (char *str) { if (!strncmp(str, "off", 3)) @@ -614,6 +619,7 @@ static int __init hardirq_preempt_setup __setup("hardirq-preempt=", hardirq_preempt_setup); +#endif /* * threaded simple handler @@ -773,12 +779,16 @@ static int do_irqd(void * __desc) sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); while (!kthread_should_stop()) { - local_irq_disable(); + local_irq_disable_nort(); set_current_state(TASK_INTERRUPTIBLE); +#ifndef CONFIG_PREEMPT_RT irq_enter(); +#endif do_hardirq(desc); +#ifndef CONFIG_PREEMPT_RT irq_exit(); - local_irq_enable(); +#endif + local_irq_enable_nort(); cond_resched(); #ifdef CONFIG_SMP /* Index: linux-rt.q/kernel/irq/spurious.c =================================================================== --- linux-rt.q.orig/kernel/irq/spurious.c +++ linux-rt.q/kernel/irq/spurious.c @@ -59,9 +59,8 @@ static int misrouted_irq(int irq) } action = action->next; } - local_irq_disable(); /* Now clean up the flags */ - spin_lock(&desc->lock); + spin_lock_irq(&desc->lock); action = desc->action; /* patches/x86_64-tsc-sync-irqflags-fix.patch0000664000077200007720000000140510646635211017637 0ustar mingomingo--- arch/x86_64/kernel/tsc_sync.c | 4 ++++ 1 file changed, 4 insertions(+) Index: linux-rt.q/arch/x86_64/kernel/tsc_sync.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/tsc_sync.c +++ linux-rt.q/arch/x86_64/kernel/tsc_sync.c @@ -97,6 +97,7 @@ static __cpuinit void check_tsc_warp(voi */ void 
__cpuinit check_tsc_sync_source(int cpu) { + unsigned long flags; int cpus = 2; /* @@ -117,8 +118,11 @@ void __cpuinit check_tsc_sync_source(int /* * Wait for the target to arrive: */ + local_save_flags(flags); + local_irq_enable(); while (atomic_read(&start_count) != cpus-1) cpu_relax(); + local_irq_restore(flags); /* * Trigger the target to continue into the measurement too: */ patches/slob-scale-break-out-caches.patch0000664000077200007720000003026010646635211017706 0ustar mingomingo--- mm/slob.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 235 insertions(+), 56 deletions(-) Index: linux-rt.q/mm/slob.c =================================================================== --- linux-rt.q.orig/mm/slob.c +++ linux-rt.q/mm/slob.c @@ -27,6 +27,20 @@ * are allocated by calling __get_free_pages. As SLAB objects know * their size, no separate size bookkeeping is necessary and there is * essentially no allocation space overhead. + * + * Modified by: Steven Rostedt 12/20/05 + * + * Now we take advantage of the kmem_cache usage. I've removed + * the global slobfree, and created one for every cache. + * + * For kmalloc/kfree I've reintroduced the usage of cache_sizes, + * but only for sizes 32 through PAGE_SIZE >> 1 by order of 2. + * + * Having the SLOB alloc per size of the cache should speed things up + * greatly, not only by making the search paths smaller, but also by + * keeping all the caches of similar units. This way the fragmentation + * should not be as big of a problem. 
+ * */ #include @@ -37,6 +51,8 @@ #include #include +#undef DEBUG_CACHE + struct slob_block { int units; struct slob_block *next; @@ -63,21 +79,66 @@ struct slob_rcu { int size; }; -static slob_t arena = { .next = &arena, .units = 1 }; -static slob_t *slobfree = &arena; -static DEFINE_SPINLOCK(slob_lock); +struct kmem_cache { + unsigned int size, align; + const char *name; + slob_t *slobfree; + slob_t arena; + spinlock_t lock; + void (*ctor)(void *, struct kmem_cache *, unsigned long); + void (*dtor)(void *, struct kmem_cache *, unsigned long); + atomic_t items; + unsigned int free; + struct list_head list; +}; + +#define NR_SLOB_CACHES ((PAGE_SHIFT) - 5) /* 32 to PAGE_SIZE-1 by order of 2 */ +#define MAX_SLOB_CACHE_SIZE (PAGE_SIZE >> 1) -static void slob_free(void *b, int size); -static void slob_timer_cbk(void); +static struct kmem_cache *cache_sizes[NR_SLOB_CACHES]; +static struct kmem_cache *bb_cache; +static struct semaphore cache_chain_sem; +static struct list_head cache_chain; -#define __get_slob_block(b) ((unsigned long)(b) & ~(PAGE_SIZE-1)) +#ifdef DEBUG_CACHE +static void test_cache(kmem_cache_t *c) +{ + slob_t *cur = c->slobfree; + unsigned int x = -1 >> 2; + + do { + BUG_ON(!cur->next); + cur = cur->next; + } while (cur != c->slobfree && --x); + BUG_ON(!x); +} +#else +#define test_cache(x) do {} while(0) +#endif +/* + * Here we take advantage of the lru field of the pages that + * map to the pages we use in the SLOB. This is done similar + * to what is done with SLAB. + * + * The lru.next field is used to get the bigblock descriptor + * for large blocks larger than PAGE_SIZE >> 1. + * + * Set and retrieved by set_slob_block and get_slob_block + * respectively. + * + * The lru.prev field is used to find the cache descriptor + * for small blocks smaller than or equal to PAGE_SIZE >> 1. + * + * Set and retrieved by set_slob_ptr and get_slob_ptr + * respectively. + * + * The use of lru.next tells us in kmalloc that the page is large. 
+ */ static inline struct page *get_slob_page(const void *mem) { - void *virt = (void*)__get_slob_block(mem); - - return virt_to_page(virt); + return virt_to_page(mem); } static inline void zero_slob_block(const void *b) @@ -101,16 +162,39 @@ static inline void set_slob_block(const page->lru.next = data; } -static void *slob_alloc(size_t size, gfp_t gfp, int align) +static inline void *get_slob_ptr(const void *b) +{ + struct page *page; + page = get_slob_page(b); + return page->lru.prev; +} + +static inline void set_slob_ptr(const void *b, void *data) +{ + struct page *page; + page = get_slob_page(b); + page->lru.prev = data; +} + +static void slob_free(kmem_cache_t *cachep, void *b, int size); + +static void *slob_alloc(kmem_cache_t *cachep, gfp_t gfp, int align) { + size_t size; slob_t *prev, *cur, *aligned = 0; - int delta = 0, units = SLOB_UNITS(size); + int delta = 0, units; unsigned long flags; - spin_lock_irqsave(&slob_lock, flags); - prev = slobfree; + size = cachep->size; + units = SLOB_UNITS(size); + BUG_ON(!units); + + spin_lock_irqsave(&cachep->lock, flags); + prev = cachep->slobfree; for (cur = prev->next; ; prev = cur, cur = cur->next) { if (align) { + while (align < SLOB_UNIT) + align <<= 1; aligned = (slob_t *)ALIGN((unsigned long)cur, align); delta = aligned - cur; } @@ -133,12 +217,16 @@ static void *slob_alloc(size_t size, gfp cur->units = units; } - slobfree = prev; - spin_unlock_irqrestore(&slob_lock, flags); + cachep->slobfree = prev; + test_cache(cachep); + if (prev < prev->next) + BUG_ON(cur + cur->units > prev->next); + spin_unlock_irqrestore(&cachep->lock, flags); return cur; } - if (cur == slobfree) { - spin_unlock_irqrestore(&slob_lock, flags); + if (cur == cachep->slobfree) { + test_cache(cachep); + spin_unlock_irqrestore(&cachep->lock, flags); if (size == PAGE_SIZE) /* trying to shrink arena? 
*/ return 0; @@ -148,14 +236,15 @@ static void *slob_alloc(size_t size, gfp return 0; zero_slob_block(cur); - slob_free(cur, PAGE_SIZE); - spin_lock_irqsave(&slob_lock, flags); - cur = slobfree; + set_slob_ptr(cur, cachep); + slob_free(cachep, cur, PAGE_SIZE); + spin_lock_irqsave(&cachep->lock, flags); + cur = cachep->slobfree; } } } -static void slob_free(void *block, int size) +static void slob_free(kmem_cache_t *cachep, void *block, int size) { slob_t *cur, *b = (slob_t *)block; unsigned long flags; @@ -167,39 +256,51 @@ static void slob_free(void *block, int s b->units = SLOB_UNITS(size); /* Find reinsertion point */ - spin_lock_irqsave(&slob_lock, flags); - for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) + spin_lock_irqsave(&cachep->lock, flags); + for (cur = cachep->slobfree; !(b > cur && b < cur->next); cur = cur->next) if (cur >= cur->next && (b > cur || b < cur->next)) break; if (b + b->units == cur->next) { b->units += cur->next->units; b->next = cur->next->next; + BUG_ON(cur->next == &cachep->arena); } else b->next = cur->next; if (cur + cur->units == b) { cur->units += b->units; cur->next = b->next; + BUG_ON(b == &cachep->arena); } else cur->next = b; - slobfree = cur; + cachep->slobfree = cur; - spin_unlock_irqrestore(&slob_lock, flags); + test_cache(cachep); + spin_unlock_irqrestore(&cachep->lock, flags); } void *__kmalloc(size_t size, gfp_t gfp) { - slob_t *m; bigblock_t *bb; - if (size < PAGE_SIZE - SLOB_UNIT) { - m = slob_alloc(size + SLOB_UNIT, gfp, 0); - return m ? (void *)(m + 1) : 0; + /* + * If the size is less than PAGE_SIZE >> 1 then + * we use the generic caches. Otherwise, we + * just allocate the necessary pages. 
+ */ + if (size <= MAX_SLOB_CACHE_SIZE) { + int i; + int order; + for (i=0, order=32; i < NR_SLOB_CACHES; i++, order <<= 1) + if (size <= order) + break; + BUG_ON(i == NR_SLOB_CACHES); + return kmem_cache_alloc(cache_sizes[i], gfp); } - bb = slob_alloc(sizeof(bigblock_t), gfp, 0); + bb = slob_alloc(bb_cache, gfp, 0); if (!bb) return 0; @@ -211,7 +312,7 @@ void *__kmalloc(size_t size, gfp_t gfp) return bb->pages; } - slob_free(bb, sizeof(bigblock_t)); + slob_free(bb_cache, bb, sizeof(bigblock_t)); return 0; } EXPORT_SYMBOL(__kmalloc); @@ -251,20 +352,25 @@ EXPORT_SYMBOL(krealloc); void kfree(const void *block) { + kmem_cache_t *c; bigblock_t *bb; if (!block) return; + /* + * look into the page of the allocated block to + * see if this is a big allocation or not. + */ bb = get_slob_block(block); if (bb) { free_pages((unsigned long)block, bb->order); - slob_free(bb, sizeof(bigblock_t)); + slob_free(bb_cache, bb, sizeof(bigblock_t)); return; } - slob_free((slob_t *)block - 1, 0); - return; + c = get_slob_ptr(block); + kmem_cache_free(c, (void *)block); } EXPORT_SYMBOL(kfree); @@ -272,6 +378,7 @@ EXPORT_SYMBOL(kfree); size_t ksize(const void *block) { bigblock_t *bb; + kmem_cache_t *c; if (!block) return 0; @@ -280,14 +387,16 @@ size_t ksize(const void *block) if (bb) return PAGE_SIZE << bb->order; - return ((slob_t *)block - 1)->units * SLOB_UNIT; + c = get_slob_ptr(block); + return c->size; } -struct kmem_cache { - unsigned int size, align; - unsigned long flags; - const char *name; - void (*ctor)(void *, struct kmem_cache *, unsigned long); +static slob_t cache_arena = { .next = &cache_arena, .units = 0 }; +struct kmem_cache cache_cache = { + .name = "cache", + .slobfree = &cache_cache.arena, + .arena = { .next = &cache_cache.arena, .units = 0 }, + .lock = SPIN_LOCK_UNLOCKED }; struct kmem_cache *kmem_cache_create(const char *name, size_t size, @@ -296,8 +405,22 @@ struct kmem_cache *kmem_cache_create(con void (*dtor)(void*, struct kmem_cache *, unsigned long)) { 
struct kmem_cache *c; + void *p; + + c = slob_alloc(&cache_cache, flags, 0); - c = slob_alloc(sizeof(struct kmem_cache), flags, 0); + memset(c, 0, sizeof(*c)); + + c->size = PAGE_SIZE; + c->arena.next = &c->arena; + c->arena.units = 0; + c->slobfree = &c->arena; + atomic_set(&c->items, 0); + spin_lock_init(&c->lock); + + p = slob_alloc(c, 0, PAGE_SIZE-1); + if (p) + free_page((unsigned long)p); if (c) { c->name = name; @@ -315,13 +438,27 @@ struct kmem_cache *kmem_cache_create(con } else if (flags & SLAB_PANIC) panic("Cannot create slab cache %s\n", name); + down(&cache_chain_sem); + list_add_tail(&c->list, &cache_chain); + up(&cache_chain_sem); + return c; } EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *c) { - slob_free(c, sizeof(struct kmem_cache)); + down(&cache_chain_sem); + list_del(&c->list); + up(&cache_chain_sem); + + BUG_ON(atomic_read(&c->items)); + + /* + * WARNING!!! Memory leak! + */ + printk("FIX ME: need to free memory\n"); + slob_free(&cache_cache, c, sizeof(struct kmem_cache)); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -329,11 +466,16 @@ void *kmem_cache_alloc(struct kmem_cache { void *b; - if (c->size < PAGE_SIZE) - b = slob_alloc(c->size, flags, c->align); + atomic_inc(&c->items); + + if (c->size <= MAX_SLOB_CACHE_SIZE) + b = slob_alloc(c, flags, c->align); else b = (void *)__get_free_pages(flags, get_order(c->size)); + if (!b) + return b; + if (c->ctor) c->ctor(b, c, 0); @@ -353,8 +495,10 @@ EXPORT_SYMBOL(kmem_cache_zalloc); static void __kmem_cache_free(void *b, int size) { - if (size < PAGE_SIZE) - slob_free(b, size); + atomic_dec(&c->items); + + if (c->size <= MAX_SLOB_CACHE_SIZE) + slob_free(c, b, c->size); else free_pages((unsigned long)b, get_order(size)); } @@ -393,9 +537,6 @@ const char *kmem_cache_name(struct kmem_ } EXPORT_SYMBOL(kmem_cache_name); -static struct timer_list slob_timer = TIMER_INITIALIZER( - (void (*)(unsigned long))slob_timer_cbk, 0, 0); - int kmem_cache_shrink(struct kmem_cache *d) { return 
0; @@ -407,17 +548,55 @@ int kmem_ptr_validate(struct kmem_cache return 0; } -void __init kmem_cache_init(void) +static char cache_names[NR_SLOB_CACHES][15]; + +void kmem_cache_init(void) +{ + static int done; + void *p; + + if (!done) { + int i; + int size = 32; + done = 1; + + init_MUTEX(&cache_chain_sem); + INIT_LIST_HEAD(&cache_chain); + + cache_cache.size = PAGE_SIZE; + p = slob_alloc(&cache_cache, 0, PAGE_SIZE-1); + if (p) + free_page((unsigned long)p); + cache_cache.size = sizeof(struct kmem_cache); + + bb_cache = kmem_cache_create("bb_cache",sizeof(bigblock_t), 0, + GFP_KERNEL, NULL, NULL); + for (i=0; i < NR_SLOB_CACHES; i++, size <<= 1) + cache_sizes[i] = kmem_cache_create(cache_names[i], size, 0, + GFP_KERNEL, NULL, NULL); + } +} + +static void test_slob(slob_t *s) { - slob_timer_cbk(); + slob_t *p; + long x = 0; + + for (p=s->next; p != s && x < 10000; p = p->next, x++) + printk("."); } -static void slob_timer_cbk(void) +void print_slobs(void) { - void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); + struct list_head *curr; - if (p) - free_page((unsigned long)p); + list_for_each(curr, &cache_chain) { + kmem_cache_t *c = list_entry(curr, struct kmem_cache, list); - mod_timer(&slob_timer, jiffies + HZ); + printk("%s items:%d", + c->name?:"", + atomic_read(&c->items)); + test_slob(&c->arena); + printk("\n"); + } } patches/x86_64-i8259-remove-useless-forward-declaration.patch0000664000077200007720000000136710646635210023171 0ustar mingomingoSubject: x86_64: remove useless forward declaration i8245_timer_resume is forward declared for no reason. 
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/i8259.c | 1 - 1 file changed, 1 deletion(-) Index: linux-rt.q/arch/x86_64/kernel/i8259.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/i8259.c +++ linux-rt.q/arch/x86_64/kernel/i8259.c @@ -460,7 +460,6 @@ void invalidate_interrupt6(void); void invalidate_interrupt7(void); void thermal_interrupt(void); void threshold_interrupt(void); -void i8254_timer_resume(void); static void setup_timer_hardware(void) { patches/ns2cyc-result-fix.patch0000664000077200007720000000532410646635212016054 0ustar mingomingoFrom sshtylyov@ru.mvista.com Wed May 16 18:11:13 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from imap.sh.mvista.com (unknown [63.81.120.155]) by mail.tglx.de (Postfix) with ESMTP id B7F0D65C065 for ; Wed, 16 May 2007 18:11:13 +0200 (CEST) Received: from wasted.dev.rtsoft.ru (unknown [10.150.0.9]) by imap.sh.mvista.com (Postfix) with ESMTP id 11FC13EC9 for ; Wed, 16 May 2007 08:38:17 -0700 (PDT) From: Sergei Shtylyov Organization: MontaVista Software Inc. To: tglx@linutronix.de Subject: [PATCH 2.6.21-rt1] ns2cyc() result fix Date: Wed, 16 May 2007 18:39:50 +0300 User-Agent: KMail/1.5 MIME-Version: 1.0 Content-Disposition: inline Content-Type: text/plain; charset="iso-8859-1" Message-Id: <200705161939.50242.sshtylyov@ru.mvista.com> X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Fix the dubious use of cycles_t where cycle_t was appropriate. 
On the machines with 32-bit cycles_t (like ARM/PPC) it caused these warnings: In file included from arch/powerpc/kernel/time.c:1045: include/linux/clocksource.h: In function `ns2cyc': include/linux/clocksource.h:213: warning: comparison of distinct pointer types lacks a cast include/linux/clocksource.h:213: warning: right shift count >= width of type include/linux/clocksource.h:213: warning: passing argument 1 of `__div64_32' from incompatible pointer type This function and therefore usecs_to_cycles() was unlikely to return a correct result on such machines because of the shift result truncation. Signed-off-by: Sergei Shtylyov --- I'm also uncertain about 'preempt_max_latency' and 'preempt_thresh' variables being declared as 'unsigned long' -- however, looks like those are unlikely to overflow... yet it's unclear why there's casts to 'cycle_t' (which is always 64-bit) when initializing/comparing them... --- include/linux/clocksource.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/include/linux/clocksource.h =================================================================== --- linux-rt.q.orig/include/linux/clocksource.h +++ linux-rt.q/include/linux/clocksource.h @@ -183,9 +183,9 @@ static inline s64 cyc2ns(struct clocksou * @cs: Pointer to clocksource * @nsecs: Nanoseconds */ -static inline cycles_t ns2cyc(struct clocksource *cs, u64 nsecs) +static inline cycle_t ns2cyc(struct clocksource *cs, u64 nsecs) { - cycles_t ret = nsecs << cs->shift; + cycle_t ret = nsecs << cs->shift; do_div(ret, cs->mult + 1); patches/futex-performance-hack.patch0000664000077200007720000000321410646635216017111 0ustar mingomingo--- kernel/futex.c | 6 ++++-- kernel/sysctl.c | 9 +++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) Index: linux-rt.q/kernel/futex.c =================================================================== --- linux-rt.q.orig/kernel/futex.c +++ linux-rt.q/kernel/futex.c @@ -120,12 +120,14 @@ static struct futex_hash_bucket 
futex_qu /* Futex-fs vfsmount entry: */ static struct vfsmount *futex_mnt; +int futex_performance_hack; + /* * Take mm->mmap_sem, when futex is shared */ static inline void futex_lock_mm(struct rw_semaphore *fshared) { - if (fshared) + if (fshared && !futex_performance_hack) down_read(fshared); } @@ -134,7 +136,7 @@ static inline void futex_lock_mm(struct */ static inline void futex_unlock_mm(struct rw_semaphore *fshared) { - if (fshared) + if (fshared && !futex_performance_hack) up_read(fshared); } Index: linux-rt.q/kernel/sysctl.c =================================================================== --- linux-rt.q.orig/kernel/sysctl.c +++ linux-rt.q/kernel/sysctl.c @@ -66,6 +66,7 @@ extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; +extern int futex_performance_hack; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; @@ -294,6 +295,14 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "futex_performance_hack", + .data = &futex_performance_hack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = KERN_PANIC, .procname = "prof_pid", .data = &prof_pid, patches/spinlock-trylock-cleanup-sungem.patch0000664000077200007720000000115210646635211020775 0ustar mingomingo--- drivers/net/sungem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) Index: linux-rt.q/drivers/net/sungem.c =================================================================== --- linux-rt.q.orig/drivers/net/sungem.c +++ linux-rt.q/drivers/net/sungem.c @@ -1034,10 +1034,8 @@ static int gem_start_xmit(struct sk_buff (csum_stuff_off << 21)); } - local_irq_save(flags); - if (!spin_trylock(&gp->tx_lock)) { + if (!spin_trylock_irqsave(&gp->tx_lock, flags)) { /* Tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } /* We raced with gem_do_stop() */ 
patches/preempt-realtime-acpi.patch0000664000077200007720000001277010646635215016747 0ustar mingomingo--- drivers/acpi/ec.c | 12 ++++++++++++ drivers/acpi/hardware/hwregs.c | 16 ++++++++-------- drivers/acpi/processor_idle.c | 2 +- drivers/acpi/utilities/utmutex.c | 2 +- include/acpi/acglobal.h | 7 ++++++- include/acpi/acpiosxf.h | 2 +- 6 files changed, 29 insertions(+), 12 deletions(-) Index: linux-rt.q/drivers/acpi/ec.c =================================================================== --- linux-rt.q.orig/drivers/acpi/ec.c +++ linux-rt.q/drivers/acpi/ec.c @@ -420,7 +420,19 @@ static u32 acpi_ec_gpe_handler(void *dat atomic_inc(&ec->event_count); if (acpi_ec_mode == EC_INTR) { +#if 0 wake_up(&ec->wait); +#else + // hack ... + if (waitqueue_active(&ec->wait)) { + struct task_struct *task; + + task = list_entry(ec->wait.task_list.next, + wait_queue_t, task_list)->private; + if (task) + wake_up_process(task); + } +#endif } value = acpi_ec_read_status(ec); Index: linux-rt.q/drivers/acpi/hardware/hwregs.c =================================================================== --- linux-rt.q.orig/drivers/acpi/hardware/hwregs.c +++ linux-rt.q/drivers/acpi/hardware/hwregs.c @@ -73,7 +73,7 @@ acpi_status acpi_hw_clear_acpi_status(vo ACPI_BITMASK_ALL_FIXED_STATUS, (u16) acpi_gbl_FADT.xpm1a_event_block.address)); - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); status = acpi_hw_register_write(ACPI_MTX_DO_NOT_LOCK, ACPI_REGISTER_PM1_STATUS, @@ -98,7 +98,7 @@ acpi_status acpi_hw_clear_acpi_status(vo status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block); unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } @@ -331,7 +331,7 @@ acpi_status acpi_set_register(u32 regist return_ACPI_STATUS(AE_BAD_PARAMETER); } - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + 
spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* Always do a register read first so we can insert the new bits */ @@ -441,7 +441,7 @@ acpi_status acpi_set_register(u32 regist unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); /* Normalize the value that was read */ @@ -481,7 +481,7 @@ acpi_hw_register_read(u8 use_lock, u32 r ACPI_FUNCTION_TRACE(hw_register_read); if (ACPI_MTX_LOCK == use_lock) { - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); } switch (register_id) { @@ -560,7 +560,7 @@ acpi_hw_register_read(u8 use_lock, u32 r unlock_and_exit: if (ACPI_MTX_LOCK == use_lock) { - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); } if (ACPI_SUCCESS(status)) { @@ -606,7 +606,7 @@ acpi_status acpi_hw_register_write(u8 us ACPI_FUNCTION_TRACE(hw_register_write); if (ACPI_MTX_LOCK == use_lock) { - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); } switch (register_id) { @@ -730,7 +730,7 @@ acpi_status acpi_hw_register_write(u8 us unlock_and_exit: if (ACPI_MTX_LOCK == use_lock) { - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); } return_ACPI_STATUS(status); Index: linux-rt.q/drivers/acpi/processor_idle.c =================================================================== --- linux-rt.q.orig/drivers/acpi/processor_idle.c +++ linux-rt.q/drivers/acpi/processor_idle.c @@ -947,7 +947,7 @@ static int acpi_idle_enter_c2(struct cpu } static int c3_cpu_count; -static DEFINE_SPINLOCK(c3_lock); +static DEFINE_RAW_SPINLOCK(c3_lock); /** * acpi_idle_enter_c3 - enters an ACPI C3 state-type Index: linux-rt.q/drivers/acpi/utilities/utmutex.c =================================================================== --- 
linux-rt.q.orig/drivers/acpi/utilities/utmutex.c +++ linux-rt.q/drivers/acpi/utilities/utmutex.c @@ -116,7 +116,7 @@ void acpi_ut_mutex_terminate(void) /* Delete the spinlocks */ acpi_os_delete_lock(acpi_gbl_gpe_lock); - acpi_os_delete_lock(acpi_gbl_hardware_lock); +// acpi_os_delete_lock(acpi_gbl_hardware_lock); return_VOID; } Index: linux-rt.q/include/acpi/acglobal.h =================================================================== --- linux-rt.q.orig/include/acpi/acglobal.h +++ linux-rt.q/include/acpi/acglobal.h @@ -184,7 +184,12 @@ ACPI_EXTERN acpi_semaphore acpi_gbl_glob * interrupt level */ ACPI_EXTERN spinlock_t _acpi_gbl_gpe_lock; /* For GPE data structs and registers */ -ACPI_EXTERN spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ + +/* + * Need to be raw because it might be used in acpi_processor_idle(): + */ +ACPI_EXTERN raw_spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ + #define acpi_gbl_gpe_lock &_acpi_gbl_gpe_lock #define acpi_gbl_hardware_lock &_acpi_gbl_hardware_lock Index: linux-rt.q/include/acpi/acpiosxf.h =================================================================== --- linux-rt.q.orig/include/acpi/acpiosxf.h +++ linux-rt.q/include/acpi/acpiosxf.h @@ -61,7 +61,7 @@ typedef enum { OSL_EC_BURST_HANDLER } acpi_execute_type; -#define ACPI_NO_UNIT_LIMIT ((u32) -1) +#define ACPI_NO_UNIT_LIMIT (INT_MAX/2) #define ACPI_MUTEX_SEM 1 /* Functions for acpi_os_signal */ patches/panic-dont-stop-box.patch0000664000077200007720000000076610646635216016371 0ustar mingomingo--- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/kernel/panic.c =================================================================== --- linux-rt.q.orig/kernel/panic.c +++ linux-rt.q/kernel/panic.c @@ -96,7 +96,7 @@ NORET_TYPE void panic(const char * fmt, * unfortunately means it may not be hardened to work in a panic * situation. 
*/ - smp_send_stop(); +// smp_send_stop(); #endif atomic_notifier_call_chain(&panic_notifier_list, 0, buf); patches/latency-tracing.patch0000664000077200007720000034604010646635212015642 0ustar mingomingo Makefile | 10 arch/i386/lib/delay.c | 6 arch/x86_64/kernel/tsc.c | 4 drivers/clocksource/acpi_pm.c | 8 fs/proc/proc_misc.c | 17 include/linux/clocksource.h | 23 include/linux/kernel.h | 2 include/linux/latency_hist.h | 32 include/linux/preempt.h | 20 include/linux/sched.h | 109 + init/main.c | 2 kernel/Makefile | 5 kernel/fork.c | 2 kernel/latency_hist.c | 266 ++++ kernel/latency_trace.c | 2744 ++++++++++++++++++++++++++++++++++++++++++ kernel/lockdep.c | 39 kernel/panic.c | 2 kernel/printk.c | 2 kernel/sched.c | 92 - kernel/softlockup.c | 2 kernel/sysctl.c | 128 + kernel/time/timekeeping.c | 27 lib/Kconfig.debug | 186 ++ lib/debug_locks.c | 7 scripts/Makefile | 1 scripts/trace-it.c | 79 + 26 files changed, 3751 insertions(+), 64 deletions(-) Index: linux-rt.q/Makefile =================================================================== --- linux-rt.q.orig/Makefile +++ linux-rt.q/Makefile @@ -490,10 +490,14 @@ endif include $(srctree)/arch/$(ARCH)/Makefile -ifdef CONFIG_FRAME_POINTER -CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) +ifdef CONFIG_MCOUNT +CFLAGS += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) else -CFLAGS += -fomit-frame-pointer + ifdef CONFIG_FRAME_POINTER + CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) + else + CFLAGS += -fomit-frame-pointer + endif endif ifdef CONFIG_DEBUG_INFO Index: linux-rt.q/arch/i386/lib/delay.c =================================================================== --- linux-rt.q.orig/arch/i386/lib/delay.c +++ linux-rt.q/arch/i386/lib/delay.c @@ -23,7 +23,7 @@ #endif /* simple loop based delay: */ -static void delay_loop(unsigned long loops) +static notrace void delay_loop(unsigned long loops) { int d0; @@ -38,7 +38,7 @@ static void 
delay_loop(unsigned long loo } /* TSC based delay: */ -static void delay_tsc(unsigned long loops) +static notrace void delay_tsc(unsigned long loops) { unsigned long bclock, now; @@ -69,7 +69,7 @@ int read_current_timer(unsigned long *ti return -1; } -void __delay(unsigned long loops) +void notrace __delay(unsigned long loops) { delay_fn(loops); } Index: linux-rt.q/arch/x86_64/kernel/tsc.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/tsc.c +++ linux-rt.q/arch/x86_64/kernel/tsc.c @@ -247,13 +247,13 @@ __setup("notsc", notsc_setup); /* clock source code: */ -static cycle_t read_tsc(void) +static notrace cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles_sync(); return ret; } -static cycle_t __vsyscall_fn vread_tsc(void) +static notrace cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)get_cycles_sync(); return ret; Index: linux-rt.q/drivers/clocksource/acpi_pm.c =================================================================== --- linux-rt.q.orig/drivers/clocksource/acpi_pm.c +++ linux-rt.q/drivers/clocksource/acpi_pm.c @@ -30,13 +30,13 @@ */ u32 pmtmr_ioport __read_mostly; -static inline u32 read_pmtmr(void) +static notrace inline u32 read_pmtmr(void) { /* mask the output to 24 bits */ return inl(pmtmr_ioport) & ACPI_PM_MASK; } -u32 acpi_pm_read_verified(void) +u32 notrace acpi_pm_read_verified(void) { u32 v1 = 0, v2 = 0, v3 = 0; @@ -56,12 +56,12 @@ u32 acpi_pm_read_verified(void) return v2; } -static cycle_t acpi_pm_read_slow(void) +static notrace cycle_t acpi_pm_read_slow(void) { return (cycle_t)acpi_pm_read_verified(); } -static cycle_t acpi_pm_read(void) +static notrace cycle_t acpi_pm_read(void) { return (cycle_t)read_pmtmr(); } Index: linux-rt.q/fs/proc/proc_misc.c =================================================================== --- linux-rt.q.orig/fs/proc/proc_misc.c +++ linux-rt.q/fs/proc/proc_misc.c @@ -623,6 +623,20 @@ static int execdomains_read_proc(char *p return 
proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_EVENT_TRACE +extern struct seq_operations latency_trace_op; +static int latency_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &latency_trace_op); +} +static struct file_operations proc_latency_trace_operations = { + .open = latency_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* * writing 'C' to /proc/sysrq-trigger is like sysrq-C @@ -716,6 +730,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_SCHEDSTATS create_seq_entry("schedstat", 0, &proc_schedstat_operations); #endif +#ifdef CONFIG_EVENT_TRACE + create_seq_entry("latency_trace", 0, &proc_latency_trace_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { Index: linux-rt.q/include/linux/clocksource.h =================================================================== --- linux-rt.q.orig/include/linux/clocksource.h +++ linux-rt.q/include/linux/clocksource.h @@ -21,6 +21,9 @@ typedef u64 cycle_t; struct clocksource; +extern unsigned long preempt_max_latency; +extern unsigned long preempt_thresh; + /** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. 
@@ -172,8 +175,20 @@ static inline cycle_t clocksource_read(s */ static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles) { - u64 ret = (u64)cycles; - ret = (ret * cs->mult) >> cs->shift; + return ((u64)cycles * cs->mult) >> cs->shift; +} + +/** + * ns2cyc - converts nanoseconds to clocksource cycles + * @cs: Pointer to clocksource + * @nsecs: Nanoseconds + */ +static inline cycles_t ns2cyc(struct clocksource *cs, u64 nsecs) +{ + cycles_t ret = nsecs << cs->shift; + + do_div(ret, cs->mult + 1); + return ret; } @@ -221,4 +236,8 @@ static inline void update_vsyscall(struc } #endif +extern cycle_t get_monotonic_cycles(void); +extern unsigned long cycles_to_usecs(cycle_t); +extern cycle_t usecs_to_cycles(unsigned long); + #endif /* _LINUX_CLOCKSOURCE_H */ Index: linux-rt.q/include/linux/kernel.h =================================================================== --- linux-rt.q.orig/include/linux/kernel.h +++ linux-rt.q/include/linux/kernel.h @@ -156,6 +156,8 @@ asmlinkage int vprintk(const char *fmt, __attribute__ ((format (printf, 1, 0))); asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))); +extern void early_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); Index: linux-rt.q/include/linux/latency_hist.h =================================================================== --- /dev/null +++ linux-rt.q/include/linux/latency_hist.h @@ -0,0 +1,32 @@ +/* + * kernel/latency_hist.h + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. 
+ * Yi Yang + * + */ +#ifndef _LINUX_LATENCY_HIST_H_ +#define _LINUX_LATENCY_HIST_H_ + +enum { + INTERRUPT_LATENCY = 0, + PREEMPT_LATENCY, + WAKEUP_LATENCY +}; + +#define MAX_ENTRY_NUM 10240 +#define LATENCY_TYPE_NUM 3 + +#ifdef CONFIG_LATENCY_HIST +extern void latency_hist(int latency_type, int cpu, unsigned long latency); +# define latency_hist_flag 1 +#else +# define latency_hist(a,b,c) do { (void)(cpu); } while (0) +# define latency_hist_flag 0 +#endif /* CONFIG_LATENCY_HIST */ + +#endif /* ifndef _LINUX_LATENCY_HIST_H_ */ Index: linux-rt.q/include/linux/preempt.h =================================================================== --- linux-rt.q.orig/include/linux/preempt.h +++ linux-rt.q/include/linux/preempt.h @@ -9,12 +9,26 @@ #include #include -#ifdef CONFIG_DEBUG_PREEMPT - extern void fastcall add_preempt_count(int val); - extern void fastcall sub_preempt_count(int val); +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + extern void notrace add_preempt_count(unsigned int val); + extern void notrace sub_preempt_count(unsigned int val); + extern void notrace mask_preempt_count(unsigned int mask); + extern void notrace unmask_preempt_count(unsigned int mask); #else # define add_preempt_count(val) do { preempt_count() += (val); } while (0) # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define mask_preempt_count(mask) \ + do { preempt_count() |= (mask); } while (0) +# define unmask_preempt_count(mask) \ + do { preempt_count() &= ~(mask); } while (0) +#endif + +#ifdef CONFIG_CRITICAL_TIMING + extern void touch_critical_timing(void); + extern void stop_critical_timing(void); +#else +# define touch_critical_timing() do { } while (0) +# define stop_critical_timing() do { } while (0) #endif #define inc_preempt_count() add_preempt_count(1) Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ 
linux-rt.q/include/linux/sched.h @@ -239,6 +239,7 @@ static inline void show_state(void) } extern void show_regs(struct pt_regs *); +extern void irq_show_regs_callback(int cpu, struct pt_regs *regs); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current @@ -275,6 +276,107 @@ static inline void touch_all_softlockup_ } #endif +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE) + extern void print_traces(struct task_struct *task); +#else +# define print_traces(task) do { } while (0) +#endif + +#ifdef CONFIG_FRAME_POINTER +# ifndef CONFIG_ARM +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# else + extern unsigned long arm_return_addr(int level); +# define CALLER_ADDR0 arm_return_addr(0) +# define CALLER_ADDR1 arm_return_addr(1) +# define CALLER_ADDR2 arm_return_addr(2) +# define CALLER_ADDR3 arm_return_addr(3) +# define CALLER_ADDR4 arm_return_addr(4) +# define CALLER_ADDR5 arm_return_addr(5) +#endif +#else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +#endif + +#ifdef CONFIG_MCOUNT + extern void notrace mcount(void); +#else +# define mcount() do { } while (0) +#endif + +#ifdef CONFIG_EVENT_TRACE + extern int mcount_enabled, trace_enabled, trace_user_triggered, + trace_user_trigger_irq, trace_freerunning, trace_verbose, + trace_print_on_crash, trace_all_cpus, print_functions, + syscall_tracing, stackframe_tracing, trace_use_raw_cycles, + trace_all_runnable; + extern void notrace trace_special(unsigned long v1, 
unsigned long v2, unsigned long v3); + extern void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2); + extern void notrace trace_special_u64(unsigned long long v1, unsigned long v2); + extern void notrace trace_special_sym(void); + extern void stop_trace(void); +# define start_trace() do { trace_enabled = 1; } while (0) + extern void print_last_trace(void); + extern void nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags); + extern long user_trace_start(void); + extern long user_trace_stop(void); + extern void trace_cmdline(void); + extern void init_tracer(void); +#else +# define mcount_enabled 0 +# define trace_enabled 0 +# define syscall_tracing 0 +# define stackframe_tracing 0 +# define trace_user_triggered 0 +# define trace_freerunning 0 +# define trace_all_cpus 0 +# define trace_verbose 0 +# define trace_special(v1,v2,v3) do { } while (0) +# define trace_special_pid(pid,v1,v2) do { } while (0) +# define trace_special_u64(v1,v2) do { } while (0) +# define trace_special_sym() do { } while (0) +# define stop_trace() do { } while (0) +# define start_trace() do { } while (0) +# define print_last_trace() do { } while (0) +# define nmi_trace(eip, parent_eip, flags) do { } while (0) +# define user_trace_start() do { } while (0) +# define user_trace_stop() do { } while (0) +# define trace_cmdline() do { } while (0) +# define init_tracer() do { } while (0) +#endif + +extern int timeofday_API_hacks(void *tv, void *tz); + +#ifdef CONFIG_WAKEUP_TIMING + extern int wakeup_timing; + extern void __trace_start_sched_wakeup(struct task_struct *p); + extern void trace_stop_sched_switched(struct task_struct *p); + extern void trace_change_sched_cpu(struct task_struct *p, int new_cpu); +#else +# define wakeup_timing 0 +# define __trace_start_sched_wakeup(p) do { } while (0) +# define trace_stop_sched_switched(p) do { } while (0) +# define trace_change_sched_cpu(p, cpu) do { } while (0) +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING 
+ extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); +#else +# define time_hardirqs_on(a0, a1) do { } while (0) +# define time_hardirqs_off(a0, a1) do { } while (0) +#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -1082,6 +1184,13 @@ struct task_struct { unsigned int lockdep_recursion; #endif +#define MAX_PREEMPT_TRACE 16 + +#ifdef CONFIG_PREEMPT_TRACE + unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE]; + unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE]; +#endif + /* journalling filesystem info */ void *journal_info; Index: linux-rt.q/init/main.c =================================================================== --- linux-rt.q.orig/init/main.c +++ linux-rt.q/init/main.c @@ -577,6 +577,8 @@ asmlinkage void __init start_kernel(void if (panic_later) panic(panic_later, panic_param); + init_tracer(); + lockdep_info(); /* Index: linux-rt.q/kernel/Makefile =================================================================== --- linux-rt.q.orig/kernel/Makefile +++ linux-rt.q/kernel/Makefile @@ -38,6 +38,11 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_DEBUG_PREEMPT) += latency_trace.o +obj-$(CONFIG_WAKEUP_TIMING) += latency_trace.o +obj-$(CONFIG_EVENT_TRACE) += latency_trace.o +obj-$(CONFIG_CRITICAL_TIMING) += latency_trace.o +obj-$(CONFIG_LATENCY_HIST) += latency_hist.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o Index: linux-rt.q/kernel/fork.c =================================================================== --- linux-rt.q.orig/kernel/fork.c +++ linux-rt.q/kernel/fork.c @@ -992,7 +992,7 @@ static struct task_struct *copy_process( rt_mutex_init_task(p); -#ifdef 
CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif Index: linux-rt.q/kernel/latency_hist.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/latency_hist.c @@ -0,0 +1,266 @@ +/* + * kernel/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + */ +#include +#include +#include +#include +#include +#include + +typedef struct hist_data_struct { + atomic_t hist_mode; /* 0 log, 1 don't log */ + unsigned long min_lat; + unsigned long avg_lat; + unsigned long max_lat; + unsigned long long beyond_hist_bound_samples; + unsigned long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +} hist_data_t; + +static struct proc_dir_entry * latency_hist_root = NULL; +static char * latency_hist_proc_dir_root = "latency_hist"; + +static char * percpu_proc_name = "CPU"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(hist_data_t, interrupt_off_hist); +static char * interrupt_off_hist_proc_dir = "interrupt_off_latency"; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(hist_data_t, preempt_off_hist); +static char * preempt_off_hist_proc_dir = "preempt_off_latency"; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(hist_data_t, wakeup_latency_hist); +static char * wakeup_latency_hist_proc_dir = "wakeup_latency"; +#endif + +static struct proc_dir_entry *entry[LATENCY_TYPE_NUM][NR_CPUS]; + +static inline u64 u64_div(u64 x, u64 y) +{ + do_div(x, y); + return x; +} + +void latency_hist(int latency_type, int cpu, unsigned long latency) +{ + hist_data_t * my_hist; + + if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < 
INTERRUPT_LATENCY) + || (latency_type > WAKEUP_LATENCY) || (latency < 0)) + return; + + switch(latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + my_hist = (hist_data_t *)&per_cpu(interrupt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + my_hist = (hist_data_t *)&per_cpu(preempt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = (hist_data_t *)&per_cpu(wakeup_latency_hist, cpu); + break; +#endif + default: + return; + } + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency >= MAX_ENTRY_NUM) + my_hist->beyond_hist_bound_samples++; + else + my_hist->hist_array[latency]++; + + if (latency < my_hist->min_lat) + my_hist->min_lat = latency; + else if (latency > my_hist->max_lat) + my_hist->max_lat = latency; + + my_hist->total_samples++; + my_hist->accumulate_lat += latency; + my_hist->avg_lat = (unsigned long) u64_div(my_hist->accumulate_lat, + my_hist->total_samples); + return; +} + +static void *l_start(struct seq_file *m, loff_t * pos) +{ + loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + loff_t index = *pos; + hist_data_t *my_hist = (hist_data_t *) m->private; + + if (!index_ptr) + return NULL; + + if (index == 0) { + atomic_dec(&my_hist->hist_mode); + seq_printf(m, "#Minimum latency: %lu microseconds.\n" + "#Average latency: %lu microseconds.\n" + "#Maximum latency: %lu microseconds.\n" + "#Total samples: %llu\n" + "#There are %llu samples greater or equal than %d microseconds\n" + "#usecs\t%16s\n" + , my_hist->min_lat + , my_hist->avg_lat + , my_hist->max_lat + , my_hist->total_samples + , my_hist->beyond_hist_bound_samples + , MAX_ENTRY_NUM, "samples"); + } + if (index >= MAX_ENTRY_NUM) + return NULL; + + *index_ptr = index; + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t * pos) +{ + loff_t *index_ptr = p; + hist_data_t *my_hist = (hist_data_t *) m->private; + + if (++*pos >= 
MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + hist_data_t *my_hist = (hist_data_t *) m->private; + + seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]); + return 0; +} + +static struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *entry_ptr = NULL; + int ret, i, j, break_flags = 0; + struct seq_file *seq; + + entry_ptr = PDE(file->f_dentry->d_inode); + for (i = 0; i < LATENCY_TYPE_NUM; i++) { + for (j = 0; j < NR_CPUS; j++) { + if (entry[i][j] == NULL) + continue; + if (entry_ptr->low_ino == entry[i][j]->low_ino) { + break_flags = 1; + break; + } + } + if (break_flags == 1) + break; + } + ret = seq_open(file, &latency_hist_seq_op); + if (break_flags == 1) { + seq = (struct seq_file *)file->private_data; + seq->private = entry[i][j]->data; + } + return ret; +} + +static struct file_operations latency_hist_seq_fops = { + .open = latency_hist_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int latency_hist_init(void) +{ + struct proc_dir_entry *tmp_parent_proc_dir; + int i = 0, len = 0; + hist_data_t *my_hist; + char procname[64]; + + latency_hist_root = proc_mkdir(latency_hist_proc_dir_root, NULL); + + +#ifdef CONFIG_INTERRUPT_OFF_HIST + tmp_parent_proc_dir = proc_mkdir(interrupt_off_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[INTERRUPT_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[INTERRUPT_LATENCY][i]->data = (void *)&per_cpu(interrupt_off_hist, i); + 
entry[INTERRUPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[INTERRUPT_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + tmp_parent_proc_dir = proc_mkdir(preempt_off_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[PREEMPT_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[PREEMPT_LATENCY][i]->data = (void *)&per_cpu(preempt_off_hist, i); + entry[PREEMPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[PREEMPT_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + tmp_parent_proc_dir = proc_mkdir(wakeup_latency_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[WAKEUP_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[WAKEUP_LATENCY][i]->data = (void *)&per_cpu(wakeup_latency_hist, i); + entry[WAKEUP_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[WAKEUP_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + return 0; + +} + +__initcall(latency_hist_init); + Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/latency_trace.c @@ -0,0 +1,2744 @@ +/* + * kernel/latency_trace.c + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_64 
+# include +#endif +#include + +#ifndef DEFINE_RAW_SPINLOCK +# define DEFINE_RAW_SPINLOCK DEFINE_SPINLOCK +#endif + +#ifndef RAW_SPIN_LOCK_UNLOCKED +# define RAW_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + +int trace_use_raw_cycles = 0; + +#ifdef CONFIG_EVENT_TRACE +/* + * Convert raw cycles to usecs. + * Note: this is not the 'clocksource cycles' value, it's the raw + * cycle counter cycles. We use GTOD to timestamp latency start/end + * points, but the trace entries inbetween are timestamped with + * get_cycles(). + */ +static unsigned long notrace cycles_to_us(cycle_t delta) +{ + if (!trace_use_raw_cycles) + return cycles_to_usecs(delta); +#ifdef CONFIG_X86 + do_div(delta, cpu_khz/1000+1); +#elif defined(CONFIG_PPC) + delta = mulhwu(tb_to_us, delta); +#elif defined(CONFIG_ARM) + delta = mach_cycles_to_usecs(delta); +#else + #error Implement cycles_to_usecs. +#endif + + return (unsigned long) delta; +} +#endif + +static notrace inline cycle_t now(void) +{ + if (trace_use_raw_cycles) + return get_cycles(); + return get_monotonic_cycles(); +} + +#ifndef irqs_off +# define irqs_off() 0 +#endif + +#ifndef DEBUG_WARN_ON +static inline int DEBUG_WARN_ON(int cond) +{ + WARN_ON(cond); + return 0; +} +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# ifdef CONFIG_CRITICAL_PREEMPT_TIMING +# define irqs_off_preempt_count() preempt_count() +# else +# define irqs_off_preempt_count() 0 +# endif +#endif + +#ifdef CONFIG_WAKEUP_TIMING +struct sch_struct { + __raw_spinlock_t trace_lock; + struct task_struct *task; + int cpu; + struct cpu_trace *tr; +} ____cacheline_aligned_in_smp; + +static __cacheline_aligned_in_smp struct sch_struct sch = + { trace_lock: __RAW_SPIN_LOCK_UNLOCKED }; + +int wakeup_timing = 1; +#endif + +/* + * Track maximum latencies and save the trace: + */ + +/* + * trace_stop_sched_switched must not be called with runqueue locks held! 
+ */ +static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex); + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +enum trace_type +{ + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_SPECIAL, + TRACE_SPECIAL_PID, + TRACE_SPECIAL_U64, + TRACE_SPECIAL_SYM, + TRACE_CMDLINE, + TRACE_SYSCALL, + TRACE_SYSRET, + + __TRACE_LAST_TYPE +}; + +enum trace_flag_type +{ + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_NEED_RESCHED_DELAYED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_IRQS_HARD_OFF = 0x20, +}; + +/* + * Maximum preemption latency measured. Initialize to maximum, + * we clear it after bootup. + */ +#ifdef CONFIG_LATENCY_HIST +unsigned long preempt_max_latency = (cycle_t)0UL; +#else +unsigned long preempt_max_latency = (cycle_t)ULONG_MAX; +#endif + +unsigned long preempt_thresh; + +/* + * Should this new latency be reported/recorded? 
+ */ +static int report_latency(cycle_t delta) +{ + if (latency_hist_flag && !trace_user_triggered) + return 1; + + if (preempt_thresh) { + if (delta < preempt_thresh) + return 0; + } else { + if (delta <= preempt_max_latency) + return 0; + } + return 1; +} + +#ifdef CONFIG_EVENT_TRACE + +/* + * Number of per-CPU trace entries: + */ +#define MAX_TRACE (65536UL*16UL) + +#define CMDLINE_BYTES 16 + +/* + * 32 bytes on 32-bit platforms: + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; // assumes PREEMPT_MASK is 8 bits or less + int pid; + cycle_t timestamp; + union { + struct { + unsigned long eip; + unsigned long parent_eip; + } fn; + struct { + unsigned long eip; + unsigned long v1, v2, v3; + } special; + struct { + unsigned char str[CMDLINE_BYTES]; + } cmdline; + struct { + unsigned long nr; // highest bit: compat call + unsigned long p1, p2, p3; + } syscall; + struct { + unsigned long ret; + } sysret; + struct { + unsigned long __pad3[4]; + } pad; + } u; +} __attribute__((packed)); + +#endif + +struct cpu_trace { + atomic_t disabled; + unsigned long trace_idx; + cycle_t preempt_timestamp; + unsigned long critical_start, critical_end; + unsigned long critical_sequence; + atomic_t underrun; + atomic_t overrun; + int early_warning; + int latency_type; + int cpu; + +#ifdef CONFIG_EVENT_TRACE + struct trace_entry *trace; + char comm[CMDLINE_BYTES]; + pid_t pid; + unsigned long uid; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long saved_latency; +#endif +#ifdef CONFIG_DEBUG_STACKOVERFLOW + unsigned long stack_check; +#endif +} ____cacheline_aligned_in_smp; + +static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp = +{ [0 ... 
NR_CPUS-1] = { +#ifdef CONFIG_DEBUG_STACKOVERFLOW + .stack_check = 1 +#endif + } }; + +#ifdef CONFIG_EVENT_TRACE + +int trace_enabled = 0; +int syscall_tracing = 1; +int stackframe_tracing = 0; +int mcount_enabled = 0; +int trace_freerunning = 0; +int trace_print_on_crash = 0; +int trace_verbose = 0; +int trace_all_cpus = 0; +int print_functions = 0; +int trace_all_runnable = 0; + +/* + * user-triggered via gettimeofday(0,1)/gettimeofday(0,0) + */ +int trace_user_triggered = 0; +int trace_user_trigger_irq = -1; + +struct saved_trace_struct { + int cpu; + cycle_t first_timestamp, last_timestamp; + struct cpu_trace traces[NR_CPUS]; +} ____cacheline_aligned_in_smp; + +/* + * The current worst-case trace: + */ +static struct saved_trace_struct max_tr; + +/* + * /proc/latency_trace atomicity: + */ +static DECLARE_MUTEX(out_mutex); + +static struct saved_trace_struct out_tr; + +static void notrace printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + printk("%s+%#lx/%#lx", sym_name, offset, size); + else + printk("<%08lx>", eip); +} + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + +#ifndef STACK_WARN +# define STACK_WARN (THREAD_SIZE/8) +#endif + +#define MIN_STACK_NEEDED (sizeof(struct thread_info) + STACK_WARN) +#define MAX_STACK (THREAD_SIZE - sizeof(struct thread_info)) + +#if (defined(__i386__) || defined(__x86_64__)) && defined(CONFIG_FRAME_POINTER) +# define PRINT_EXACT_STACKFRAME +#endif + +#ifdef PRINT_EXACT_STACKFRAME +static unsigned long *worst_stack_bp; +#endif +static DEFINE_RAW_SPINLOCK(worst_stack_lock); +unsigned long worst_stack_left = THREAD_SIZE; +static unsigned long worst_stack_printed = THREAD_SIZE; +static char worst_stack_comm[TASK_COMM_LEN+1]; +static int worst_stack_pid; +static unsigned long worst_stack_sp; +static char worst_stack[THREAD_SIZE]; + +static notrace void 
fill_worst_stack(unsigned long stack_left) +{ + unsigned long flags; + + /* + * On x64, we must not read the PDA during early bootup: + */ +#ifdef CONFIG_X86_64 + if (system_state == SYSTEM_BOOTING) + return; +#endif + spin_lock_irqsave(&worst_stack_lock, flags); + if (likely(stack_left < worst_stack_left)) { + worst_stack_left = stack_left; + memcpy(worst_stack, current_thread_info(), THREAD_SIZE); + worst_stack_sp = (unsigned long)&stack_left; + memcpy(worst_stack_comm, current->comm, TASK_COMM_LEN); + worst_stack_pid = current->pid; +#ifdef PRINT_EXACT_STACKFRAME +# ifdef __i386__ + asm ("mov %%ebp, %0\n" :"=g"(worst_stack_bp)); +# elif defined(__x86_64__) + asm ("mov %%rbp, %0\n" :"=g"(worst_stack_bp)); +# else +# error Poke the author of above asm code lines ! +# endif +#endif + } + spin_unlock_irqrestore(&worst_stack_lock, flags); +} + +#ifdef PRINT_EXACT_STACKFRAME + +/* + * This takes a BP offset to point the BP back into the saved stack, + * the original stack might be long gone (but the stackframe within + * the saved copy still contains references to it). 
+ */ +#define CONVERT_TO_SAVED_STACK(bp) \ + ((void *)worst_stack + ((unsigned long)bp & (THREAD_SIZE-1))) + +static void show_stackframe(void) +{ + unsigned long addr, frame_size, *bp, *prev_bp, sum = 0; + + bp = CONVERT_TO_SAVED_STACK(worst_stack_bp); + + while (bp[0]) { + addr = bp[1]; + if (!kernel_text_address(addr)) + break; + + prev_bp = bp; + bp = CONVERT_TO_SAVED_STACK((unsigned long *)bp[0]); + + frame_size = (bp - prev_bp) * sizeof(long); + + if (frame_size < THREAD_SIZE) { + printk("{ %4ld} ", frame_size); + sum += frame_size; + } else + printk("{=%4ld} ", sum); + + printk("[<%08lx>] ", addr); + printk_name(addr); + printk("\n"); + } +} + +#else + +static inline int valid_stack_ptr(void *p) +{ + return p > (void *)worst_stack && + p < (void *)worst_stack + THREAD_SIZE - 3; +} + +static void show_stackframe(void) +{ + unsigned long prev_frame, addr; + unsigned long *stack; + + prev_frame = (unsigned long)(worst_stack + + (worst_stack_sp & (THREAD_SIZE-1))); + stack = (unsigned long *)prev_frame; + + while (valid_stack_ptr(stack)) { + addr = *stack++; + if (__kernel_text_address(addr)) { + printk("(%4ld) ", (unsigned long)stack - prev_frame); + printk("[<%08lx>] ", addr); + print_symbol("%s\n", addr); + prev_frame = (unsigned long)stack; + } + if ((char *)stack >= worst_stack + THREAD_SIZE) + break; + } +} + +#endif + +static notrace void __print_worst_stack(void) +{ + unsigned long fill_ratio; + printk("----------------------------->\n"); + printk("| new stack fill maximum: %s/%d, %ld bytes (out of %ld bytes).\n", + worst_stack_comm, worst_stack_pid, + MAX_STACK-worst_stack_left, (long)MAX_STACK); + fill_ratio = (MAX_STACK-worst_stack_left)*100/(long)MAX_STACK; + printk("| Stack fill ratio: %02ld%%", fill_ratio); + if (fill_ratio >= 90) + printk(" - BUG: that's quite high, please report this!\n"); + else + printk(" - that's still OK, no need to report this.\n"); + printk("------------|\n"); + + show_stackframe(); + 
printk("<---------------------------\n\n"); +} + +static notrace void print_worst_stack(void) +{ + unsigned long flags; + + if (irqs_disabled() || preempt_count()) + return; + + spin_lock_irqsave(&worst_stack_lock, flags); + if (worst_stack_printed == worst_stack_left) { + spin_unlock_irqrestore(&worst_stack_lock, flags); + return; + } + worst_stack_printed = worst_stack_left; + spin_unlock_irqrestore(&worst_stack_lock, flags); + + __print_worst_stack(); +} + +static notrace void debug_stackoverflow(struct cpu_trace *tr) +{ + long stack_left; + + if (unlikely(tr->stack_check <= 0)) + return; + atomic_inc(&tr->disabled); + + /* Debugging check for stack overflow: is there less than 1KB free? */ +#ifdef __i386__ + __asm__ __volatile__("and %%esp,%0" : + "=r" (stack_left) : "0" (THREAD_SIZE - 1)); +#elif defined(__x86_64__) + __asm__ __volatile__("and %%rsp,%0" : + "=r" (stack_left) : "0" (THREAD_SIZE - 1)); +#else +# error Poke the author of above asm code lines ! +#endif + if (unlikely(stack_left < MIN_STACK_NEEDED)) { + tr->stack_check = 0; + printk(KERN_ALERT "BUG: stack overflow: only %ld bytes left! 
[%08lx...(%08lx-%08lx)]\n", + stack_left - sizeof(struct thread_info), + (long)&stack_left, + (long)current_thread_info(), + (long)current_thread_info() + THREAD_SIZE); + fill_worst_stack(stack_left); + __print_worst_stack(); + goto out; + } + if (unlikely(stack_left < worst_stack_left)) { + tr->stack_check--; + fill_worst_stack(stack_left); + print_worst_stack(); + tr->stack_check++; + } else + if (worst_stack_printed != worst_stack_left) { + tr->stack_check--; + print_worst_stack(); + tr->stack_check++; + } +out: + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_EARLY_PRINTK +static void notrace early_printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + early_printk("%s <%08lx>", sym_name, eip); + else + early_printk("<%08lx>", eip); +} + +static __raw_spinlock_t early_print_lock = __RAW_SPIN_LOCK_UNLOCKED; + +static void notrace early_print_entry(struct trace_entry *entry) +{ + int hardirq, softirq; + + __raw_spin_lock(&early_print_lock); + early_printk("%-5d ", entry->pid); + + early_printk("%d%c%c", + entry->cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + early_printk("H"); + else { + if (hardirq) + early_printk("h"); + else { + if (softirq) + early_printk("s"); + else + early_printk("."); + } + } + + early_printk(":%d: ", entry->preempt_count); + + if (entry->type == TRACE_FN) { + early_printk_name(entry->u.fn.eip); + early_printk(" <= ("); + early_printk_name(entry->u.fn.parent_eip); + early_printk(")\n"); + } else { + /* special entries: */ + early_printk_name(entry->u.special.eip); + early_printk(": <%08lx> <%08lx> <%08lx>\n", + entry->u.special.v1, + entry->u.special.v2, + entry->u.special.v3); + } + __raw_spin_unlock(&early_print_lock); +} +#else +# define early_print_entry(x) do { } while(0) +#endif + +static void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long idx, idx_next; + cycle_t timestamp; + u32 pc; + +#ifdef CONFIG_DEBUG_PREEMPT +// WARN_ON(!atomic_read(&tr->disabled)); +#endif + if (!tr->critical_start && !trace_user_triggered && !trace_all_cpus && + !trace_print_on_crash && !print_functions) + goto out; + /* + * Allocate the next index. Make sure an NMI (or interrupt) + * has not taken it away. Potentially redo the timestamp as + * well to make sure the trace timestamps are in chronologic + * order. 
+ */ +again: + idx = tr->trace_idx; + idx_next = idx + 1; + timestamp = now(); + + if (unlikely((trace_freerunning || print_functions || atomic_read(&tr->underrun)) && + (idx_next >= MAX_TRACE) && !atomic_read(&tr->overrun))) { + atomic_inc(&tr->underrun); + idx_next = 0; + } + if (unlikely(idx >= MAX_TRACE)) { + atomic_inc(&tr->overrun); + goto out; + } +#ifdef __HAVE_ARCH_CMPXCHG + if (unlikely(cmpxchg(&tr->trace_idx, idx, idx_next) != idx)) { + if (idx_next == 0) + atomic_dec(&tr->underrun); + goto again; + } +#else +# ifdef CONFIG_SMP +# error CMPXCHG missing +# else + /* No worry, we are protected by the atomic_incr(&tr->disabled) + * in __trace further down + */ + tr->trace_idx = idx_next; +# endif +#endif + if (unlikely(idx_next != 0 && atomic_read(&tr->underrun))) + atomic_inc(&tr->underrun); + + pc = preempt_count(); + + if (unlikely(!tr->trace)) + goto out; + entry = tr->trace + idx; + entry->type = type; +#ifdef CONFIG_SMP + entry->cpu = cpu; +#endif + entry->flags = (irqs_off() ? TRACE_FLAG_IRQS_OFF : 0) | + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_HARD_OFF : 0)| + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | + (need_resched_delayed() ? 
TRACE_FLAG_NEED_RESCHED_DELAYED : 0); + entry->preempt_count = pc & 0xff; + entry->pid = current->pid; + entry->timestamp = timestamp; + + switch (type) { + case TRACE_FN: + entry->u.fn.eip = eip; + entry->u.fn.parent_eip = parent_eip; + if (unlikely(print_functions && !in_interrupt())) + early_print_entry(entry); + break; + case TRACE_SPECIAL: + case TRACE_SPECIAL_PID: + case TRACE_SPECIAL_U64: + case TRACE_SPECIAL_SYM: + entry->u.special.eip = eip; + entry->u.special.v1 = v1; + entry->u.special.v2 = v2; + entry->u.special.v3 = v3; + if (unlikely(print_functions && !in_interrupt())) + early_print_entry(entry); + break; + case TRACE_SYSCALL: + entry->u.syscall.nr = eip; + entry->u.syscall.p1 = v1; + entry->u.syscall.p2 = v2; + entry->u.syscall.p3 = v3; + break; + case TRACE_SYSRET: + entry->u.sysret.ret = eip; + break; + case TRACE_CMDLINE: + memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES); + break; + default: + break; + } +out: + ; +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (unlikely(trace_enabled <= 0)) + return; + +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_X86) + debug_stackoverflow(cpu_traces + raw_smp_processor_id()); +#endif + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + /* + * Trace on the CPU where the current highest-prio task + * is waiting to become runnable: + */ +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing && !trace_all_cpus && !trace_print_on_crash && + !print_functions) { + if (!sch.tr || cpu != sch.cpu) + goto out; + tr = sch.tr; + } else + tr = cpu_traces + cpu; +#else + tr = cpu_traces + cpu; +#endif + atomic_inc(&tr->disabled); + if (likely(atomic_read(&tr->disabled) == 1)) { +//#define DEBUG_STACK_POISON +#ifdef DEBUG_STACK_POISON + char stack; + + memset(&stack - 128, 0x34, 128); +#endif + ____trace(cpu, 
type, tr, eip, parent_eip, v1, v2, v3, flags); + } + atomic_dec(&tr->disabled); +#ifdef CONFIG_WAKEUP_TIMING +out: +#endif + raw_local_irq_restore(flags); +} + +/* + * Special, ad-hoc tracepoints: + */ +void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3) +{ + ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3); +} + +EXPORT_SYMBOL(trace_special); + +void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2); +} + +EXPORT_SYMBOL(trace_special_pid); + +void notrace trace_special_u64(unsigned long long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_U64, CALLER_ADDR0, 0, + (unsigned long) (v1 >> 32), (unsigned long) (v1 & 0xFFFFFFFF), + v2); +} + +EXPORT_SYMBOL(trace_special_u64); + +void notrace trace_special_sym(void) +{ +#define STACK_ENTRIES 8 + unsigned long entries[STACK_ENTRIES]; + struct stack_trace trace; + + if (!trace_enabled) + return; + + if (!stackframe_tracing) + return ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, CALLER_ADDR1, 0, 0); + + trace.entries = entries; + trace.skip = 3; + trace.max_entries = STACK_ENTRIES; + trace.nr_entries = 0; + + save_stack_trace(&trace); + /* + * clear out the rest: + */ + while (trace.nr_entries < trace.max_entries) + entries[trace.nr_entries++] = 0; + + ___trace(TRACE_SPECIAL_SYM, entries[0], 0, + entries[1], entries[2], entries[3]); + ___trace(TRACE_SPECIAL_SYM, entries[4], 0, + entries[5], entries[6], entries[7]); +} + +EXPORT_SYMBOL(trace_special_sym); + +/* + * Non-inlined function: + */ +void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ + ___trace(TRACE_FN, eip, parent_eip, 0, 0, 0); +} + +#ifdef CONFIG_MCOUNT + +extern void mcount(void); + +EXPORT_SYMBOL(mcount); + +void notrace __mcount(void) +{ + ___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0); +} + +#endif + +void notrace +sys_call(unsigned long nr, unsigned long p1, unsigned long p2, unsigned long p3) +{ + if (syscall_tracing) 
+ ___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3); +} + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + +void notrace +sys_ia32_call(unsigned long nr, unsigned long p1, unsigned long p2, + unsigned long p3) +{ + if (syscall_tracing) + ___trace(TRACE_SYSCALL, nr | 0x80000000, 0, p1, p2, p3); +} + +#endif + +void notrace sys_ret(unsigned long ret) +{ + if (syscall_tracing) + ___trace(TRACE_SYSRET, ret, 0, 0, 0, 0); +} + +static void notrace print_name(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + /* + * Special trace values: + */ + if (((long)eip < 10000L) && ((long)eip > -10000L)) { + seq_printf(m, "(%5ld)", eip); + return; + } + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_puts(m, sym_name); + else + seq_printf(m, "<%08lx>", eip); +} + +static void notrace print_name_offset(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s+%#lx/%#lx <%08lx>", + sym_name, offset, size, eip); + else + seq_printf(m, "<%08lx>", eip); +} + +static unsigned long out_sequence = -1; + +static int pid_to_cmdline_array[PID_MAX_DEFAULT+1]; + +static void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ + unsigned long flags; + + local_save_flags(flags); + ____trace(cpu, TRACE_CMDLINE, tr, 0, 0, 0, 0, 0, flags); +} + +void notrace trace_cmdline(void) +{ + ___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0); +} + +static void construct_pid_to_cmdline(struct cpu_trace *tr) +{ + unsigned int i, j, entries, pid; + + if (tr->critical_sequence == out_sequence) + return; + out_sequence = tr->critical_sequence; + + memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1)); + + if (!tr->trace) + return; + + entries = min(tr->trace_idx, MAX_TRACE); + 
+ for (i = 0; i < entries; i++) { + struct trace_entry *entry = tr->trace + i; + + if (entry->type != TRACE_CMDLINE) + continue; + pid = entry->pid; + if (pid < PID_MAX_DEFAULT) { + pid_to_cmdline_array[pid] = i; + /* + * Replace space with underline - makes it easier + * to process for tools: + */ + for (j = 0; j < CMDLINE_BYTES; j++) + if (entry->u.cmdline.str[j] == ' ') + entry->u.cmdline.str[j] = '_'; + } + } +} + +char *pid_to_cmdline(unsigned long pid) +{ + struct cpu_trace *tr = out_tr.traces + 0; + char *cmdline = "<...>"; + int idx; + + pid = min(pid, (unsigned long)PID_MAX_DEFAULT); + if (!pid) + return ""; + + if (pid_to_cmdline_array[pid] != -1) { + idx = pid_to_cmdline_array[pid]; + if (tr->trace[idx].type == TRACE_CMDLINE) + cmdline = tr->trace[idx].u.cmdline.str; + } + return cmdline; +} + +static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr, int reorder) +{ + if (!save->trace || !tr->trace) + return; + /* free-running needs reordering */ + if (reorder && atomic_read(&tr->underrun)) { + int i, idx, idx0 = tr->trace_idx; + + for (i = 0; i < MAX_TRACE; i++) { + idx = (idx0 + i) % MAX_TRACE; + save->trace[i] = tr->trace[idx]; + } + save->trace_idx = MAX_TRACE; + } else { + save->trace_idx = tr->trace_idx; + + memcpy(save->trace, tr->trace, + min(save->trace_idx, MAX_TRACE) * + sizeof(struct trace_entry)); + } + save->underrun = tr->underrun; + save->overrun = tr->overrun; +} + + +struct block_idx { + int idx[NR_CPUS]; +}; + +/* + * return the trace entry (position) of the smallest-timestamp + * one (that is still in the valid idx range): + */ +static int min_idx(struct block_idx *bidx) +{ + cycle_t min_stamp = (cycle_t) -1; + struct trace_entry *entry; + int cpu, min_cpu = -1, idx; + + for_each_online_cpu(cpu) { + idx = bidx->idx[cpu]; + if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE)) + continue; + if (idx >= MAX_TRACE*NR_CPUS) { + printk("huh: idx (%d) > %ld*%d!\n", idx, MAX_TRACE, + NR_CPUS); + WARN_ON(1); + break; + } + entry 
= max_tr.traces[cpu].trace + bidx->idx[cpu]; + if (entry->timestamp < min_stamp) { + min_cpu = cpu; + min_stamp = entry->timestamp; + } + } + + return min_cpu; +} + +/* + * This code is called to construct an output trace from + * the maximum trace. Having separate traces serves both + * atomicity (a new max might be saved while we are busy + * accessing /proc/latency_trace) and it is also used to + * delay the (expensive) sorting of the output trace by + * timestamps, in the trace_all_cpus case. + */ +static void update_out_trace(void) +{ + struct trace_entry *out_entry, *entry, *tmp; + cycle_t stamp, first_stamp, last_stamp; + struct block_idx bidx = { { 0, }, }; + struct cpu_trace *tmp_max, *tmp_out; + int cpu, sum, entries, underrun_sum, overrun_sum; + + /* + * For out_tr we only have the first array's trace entries + * allocated - and they are larger on SMP to make room + * for all trace entries from all CPUs. + */ + tmp_out = out_tr.traces + 0; + tmp_max = max_tr.traces + max_tr.cpu; + /* + * Easier to copy this way. Note: the trace buffer is private + * to the output buffer, so preserve it: + */ + copy_trace(tmp_out, tmp_max, 0); + tmp = tmp_out->trace; + *tmp_out = *tmp_max; + tmp_out->trace = tmp; + + out_tr.cpu = max_tr.cpu; + + if (!tmp_out->trace) + return; + + out_entry = tmp_out->trace + 0; + + if (!trace_all_cpus) { + entries = min(tmp_out->trace_idx, MAX_TRACE); + if (!entries) + return; + out_tr.first_timestamp = tmp_out->trace[0].timestamp; + out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp; + return; + } + /* + * Find the range of timestamps that are fully traced in + * all CPU traces. (since CPU traces can cover a variable + * range of time, we have to find the best range.)
+ */ + first_stamp = 0; + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + stamp = tmp_max->trace[0].timestamp; + if (stamp > first_stamp) + first_stamp = stamp; + } + /* + * Save the timestamp range: + */ + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + /* + * No saved trace yet? + */ + if (!entries) { + out_tr.traces[0].trace_idx = 0; + return; + } + + last_stamp = tmp_max->trace[entries-1].timestamp; + + if (last_stamp < first_stamp) { + WARN_ON(1); + + /* + * Inverted range: dump the first two and last two + * timestamps of every online CPU's saved trace: + */ + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + printk("CPU%d: %016Lx (%016Lx) ... #%d (%016Lx) %016Lx\n", + cpu, + tmp_max->trace[0].timestamp, + tmp_max->trace[1].timestamp, + entries, + tmp_max->trace[entries-2].timestamp, + tmp_max->trace[entries-1].timestamp); + } + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + + printk("CPU%d entries: %d\n", max_tr.cpu, entries); + printk("first stamp: %016Lx\n", first_stamp); + printk(" last stamp: %016Lx\n", last_stamp); + } + +#if 0 + printk("first_stamp: %Ld [%016Lx]\n", first_stamp, first_stamp); + printk(" last_stamp: %Ld [%016Lx]\n", last_stamp, last_stamp); + printk(" +1 stamp: %Ld [%016Lx]\n", + tmp_max->trace[entries].timestamp, + tmp_max->trace[entries].timestamp); + printk(" +2 stamp: %Ld [%016Lx]\n", + tmp_max->trace[entries+1].timestamp, + tmp_max->trace[entries+1].timestamp); + printk(" delta: %Ld\n", last_stamp-first_stamp); + printk(" entries: %d\n", entries); +#endif + + out_tr.first_timestamp = first_stamp; + out_tr.last_timestamp = last_stamp; + + /* + * Fetch trace entries one by one, in increasing timestamp + * order.
Start at first_stamp, stop at last_stamp: + */ + sum = 0; + for (;;) { + cpu = min_idx(&bidx); + if (cpu == -1) + break; + entry = max_tr.traces[cpu].trace + bidx.idx[cpu]; + if (entry->timestamp > last_stamp) + break; + + bidx.idx[cpu]++; + if (entry->timestamp < first_stamp) + continue; + *out_entry = *entry; + out_entry++; + sum++; + if (sum >= MAX_TRACE*NR_CPUS) { + printk("huh: sum (%d) > %ld*%d!\n", sum, MAX_TRACE, + NR_CPUS); + WARN_ON(1); + break; + } + } + + sum = 0; + underrun_sum = 0; + overrun_sum = 0; + for_each_online_cpu(cpu) { + sum += max_tr.traces[cpu].trace_idx; + underrun_sum += atomic_read(&max_tr.traces[cpu].underrun); + overrun_sum += atomic_read(&max_tr.traces[cpu].overrun); + } + tmp_out->trace_idx = sum; + atomic_set(&tmp_out->underrun, underrun_sum); + atomic_set(&tmp_out->overrun, overrun_sum); +} + +static void notrace print_help_header(struct seq_file *m) +{ + seq_puts(m, " _------=> CPU# \n"); + seq_puts(m, " / _-----=> irqs-off \n"); + seq_puts(m, " | / _----=> need-resched \n"); + seq_puts(m, " || / _---=> hardirq/softirq \n"); + seq_puts(m, " ||| / _--=> preempt-depth \n"); + seq_puts(m, " |||| / \n"); + seq_puts(m, " ||||| delay \n"); + seq_puts(m, " cmd pid ||||| time | caller \n"); + seq_puts(m, " \\ / ||||| \\ | / \n"); +} + +static void * notrace l_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + unsigned long entries; + struct cpu_trace *tr = out_tr.traces + 0; + + down(&out_mutex); + /* + * if the file is being read newly, update the output trace: + */ + if (!n) { + // TODO: use the sequence counter here to optimize + down(&max_mutex); + update_out_trace(); + up(&max_mutex); +#if 0 + if (!tr->trace_idx) { + up(&out_mutex); + return NULL; + } +#endif + construct_pid_to_cmdline(tr); + } + entries = min(tr->trace_idx, MAX_TRACE); + + if (!n) { + seq_printf(m, "preemption latency trace v1.1.5 on %s\n", + UTS_RELEASE); + seq_puts(m, "--------------------------------------------------------------------\n"); + 
seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d", + cycles_to_usecs(tr->saved_latency), + entries, + (entries + atomic_read(&tr->underrun) + + atomic_read(&tr->overrun)), + out_tr.cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "rt", +#endif + 0, 0, +#ifdef CONFIG_PREEMPT_SOFTIRQS + softirq_preemption +#else + 0 +#endif + , +#ifdef CONFIG_PREEMPT_HARDIRQS + hardirq_preemption +#else + 0 +#endif + ); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n", + tr->comm, tr->pid, tr->uid, tr->nice, + tr->policy, tr->rt_priority); + seq_puts(m, " -----------------\n"); + if (trace_user_triggered) { + seq_puts(m, " => started at: "); + print_name_offset(m, tr->critical_start); + seq_puts(m, "\n => ended at: "); + print_name_offset(m, tr->critical_end); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + if (!trace_verbose) + print_help_header(m); + } + if (n >= entries || !tr->trace) + return NULL; + + return tr->trace + n; +} + +static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned long entries = min(tr->trace_idx, MAX_TRACE); + + WARN_ON(!tr->trace); + + if (++*pos >= entries) { + if (*pos == entries) + seq_puts(m, "\n\nvim:ft=help\n"); + return NULL; + } + return tr->trace + *pos; +} + +static void notrace l_stop(struct seq_file *m, void *p) +{ + up(&out_mutex); +} + +static void print_timestamp(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} + +static void +print_timestamp_short(struct seq_file 
*m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_putc(m, '!'); + else if (rel_usecs > 1) + seq_putc(m, '+'); + else + seq_putc(m, ' '); +} + +static void +print_generic(struct seq_file *m, struct trace_entry *entry) +{ + int hardirq, softirq; + + seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid); + seq_printf(m, "%d", entry->cpu); + seq_printf(m, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + + +static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + if (trace_verbose) { + seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lx] %ld.%03ldms (+%ld.%03ldms): ", + pid_to_cmdline(entry->pid), + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, trace_idx, + entry->timestamp, abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + print_name_offset(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name_offset(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } else { + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + print_name(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name(m, 
entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } + return 0; +} + +static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry, int mode64) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + + if (!mode64) { + /* + * For convenience, print small numbers in decimal: + */ + if (abs((int)entry->u.special.v1) < 10000) + seq_printf(m, " (%5ld ", entry->u.special.v1); + else + seq_printf(m, " (%lx ", entry->u.special.v1); + if (abs((int)entry->u.special.v2) < 10000) + seq_printf(m, "%5ld ", entry->u.special.v2); + else + seq_printf(m, "%lx ", entry->u.special.v2); + if (abs((int)entry->u.special.v3) < 10000) + seq_printf(m, "%5ld)\n", entry->u.special.v3); + else + seq_printf(m, "%lx)\n", entry->u.special.v3); + } else { + seq_printf(m, " (%13Ld %ld)\n", + ((u64)entry->u.special.v1 << 32) + + (u64)entry->u.special.v2, entry->u.special.v3); + } + return 0; +} + +static int notrace +l_show_special_pid(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned int pid; + + pid = entry->u.special.v1; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + seq_printf(m, " <%.8s-%d> (%ld %ld)\n", + pid_to_cmdline(pid), pid, + entry->u.special.v2, 
entry->u.special.v3); + + return 0; +} + +static int notrace +l_show_special_sym(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry, int mode64) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v1); + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v2); + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v3); + seq_puts(m, "()\n"); + + return 0; +} + + +static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + if (!trace_verbose) + return 0; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + seq_printf(m, + "[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n", + entry->u.cmdline.str, + abs_usecs/1000, abs_usecs % 1000, + rel_usecs/1000, rel_usecs % 1000); + + return 0; +} + +extern unsigned long sys_call_table[NR_syscalls]; + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) +extern unsigned long ia32_sys_call_table[], ia32_syscall_end[]; +#define IA32_NR_syscalls (ia32_syscall_end - ia32_sys_call_table) +#endif + +static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned long nr; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - 
entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_puts(m, "> "); + nr = entry->u.syscall.nr; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + /* + * Bit 31 is set by sys_ia32_call() to mark a compat syscall; + * strip it and look the number up in the ia32 table: + */ + if (nr & 0x80000000) { + nr &= ~0x80000000; + if (nr < IA32_NR_syscalls) + print_name(m, ia32_sys_call_table[nr]); + else + seq_printf(m, "<badsys(%lu)>", nr); + } else +#endif + if (nr < NR_syscalls) + print_name(m, sys_call_table[nr]); + else + seq_printf(m, "<badsys(%lu)>", nr); + +#ifdef CONFIG_64BIT + seq_printf(m, " (%016lx %016lx %016lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); +#else + seq_printf(m, " (%08lx %08lx %08lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); +#endif + + return 0; +} + +static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_printf(m, "< (%ld)\n", entry->u.sysret.ret); + + return 0; +} + + +static int notrace l_show(struct seq_file *m, void *p) +{ + struct cpu_trace *tr = out_tr.traces; + struct trace_entry *entry, *entry0, *next_entry; + unsigned long trace_idx; + + cond_resched(); + entry = p; + if (entry->timestamp < out_tr.first_timestamp) + return 0; + if (entry->timestamp > out_tr.last_timestamp) + return 0; + + entry0 = tr->trace; + trace_idx = entry - entry0; + + if (trace_idx + 1 < tr->trace_idx) + next_entry = entry + 1; + else + next_entry = entry; + + if (trace_verbose) + seq_printf(m, "(T%d/#%ld) ", entry->type, trace_idx); + + switch (entry->type) { + case TRACE_FN: + l_show_fn(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL: + l_show_special(m, trace_idx, entry, entry0, next_entry, 0); + break; +
case TRACE_SPECIAL_PID: + l_show_special_pid(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL_U64: + l_show_special(m, trace_idx, entry, entry0, next_entry, 1); + break; + case TRACE_SPECIAL_SYM: + l_show_special_sym(m, trace_idx, entry, entry0, + next_entry, 1); + break; + case TRACE_CMDLINE: + l_show_cmdline(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSCALL: + l_show_syscall(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSRET: + l_show_sysret(m, trace_idx, entry, entry0, next_entry); + break; + default: + seq_printf(m, "unknown trace type %d\n", entry->type); + } + return 0; +} + +struct seq_operations latency_trace_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /proc/latency_trace) + */ +static void update_max_tr(struct cpu_trace *tr) +{ + struct cpu_trace *save; + int cpu, all_cpus = 0; + +#ifdef CONFIG_PREEMPT + WARN_ON(!preempt_count() && !irqs_disabled()); +#endif + + max_tr.cpu = tr->cpu; + save = max_tr.traces + tr->cpu; + + if ((wakeup_timing || trace_user_triggered || trace_print_on_crash || + print_functions) && trace_all_cpus) { + all_cpus = 1; + for_each_online_cpu(cpu) + atomic_inc(&cpu_traces[cpu].disabled); + } + + save->saved_latency = preempt_max_latency; + save->preempt_timestamp = tr->preempt_timestamp; + save->critical_start = tr->critical_start; + save->critical_end = tr->critical_end; + save->critical_sequence = tr->critical_sequence; + + memcpy(save->comm, current->comm, CMDLINE_BYTES); + save->pid = current->pid; + save->uid = current->uid; + save->nice = current->static_prio - 20 - MAX_RT_PRIO; + save->policy = current->policy; + save->rt_priority = current->rt_priority; + + if (all_cpus) { + for_each_online_cpu(cpu) { + copy_trace(max_tr.traces + cpu, cpu_traces + cpu, 1); + 
atomic_dec(&cpu_traces[cpu].disabled); + } + } else + copy_trace(save, tr, 1); +} + +#else /* !EVENT_TRACE */ + +static inline void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3, + unsigned long flags) +{ +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ +} + +static inline void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ +} + +static inline void update_max_tr(struct cpu_trace *tr) +{ +} + +static inline void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ +} + +#endif + +static int setup_preempt_thresh(char *s) +{ + int thresh; + + get_option(&s, &thresh); + if (thresh > 0) { + preempt_thresh = usecs_to_cycles(thresh); + printk("Preemption threshold = %u us\n", thresh); + } + return 1; +} +__setup("preempt_thresh=", setup_preempt_thresh); + +static inline void notrace reset_trace_idx(int cpu, struct cpu_trace *tr) +{ + if (trace_all_cpus) + for_each_online_cpu(cpu) { + tr = cpu_traces + cpu; + tr->trace_idx = 0; + atomic_set(&tr->underrun, 0); + atomic_set(&tr->overrun, 0); + } + else{ + tr->trace_idx = 0; + atomic_set(&tr->underrun, 0); + atomic_set(&tr->overrun, 0); + } +} + +#ifdef CONFIG_CRITICAL_TIMING + +static void notrace +check_critical_timing(int cpu, struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + if (trace_user_triggered) + return; + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = tr->preempt_timestamp; + T1 = get_monotonic_cycles(); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); + /* + * Update the timestamp, because 
the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = get_monotonic_cycles(); + + delta = T2-T0; + + latency = cycles_to_usecs(delta); + latency_hist(tr->latency_type, cpu, latency); + + if (latency_hist_flag) { + if (preempt_max_latency >= delta) + goto out; + } + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + +#ifndef CONFIG_CRITICAL_LATENCY_HIST + if (!preempt_thresh && preempt_max_latency > delta) { + printk("bug: updating %016Lx > %016Lx?\n", + preempt_max_latency, delta); + printk(" [%016Lx %016Lx %016Lx]\n", T0, T1, T2); + } +#endif + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + + tr->critical_end = parent_eip; + + update_max_tr(tr); + +#ifndef CONFIG_CRITICAL_LATENCY_HIST + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, cycles_to_usecs(preempt_thresh), t0); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol("<%s>\n", tr->critical_start); + printk(" => ended at timestamp %lu: ", t1); + print_symbol("<%s>\n", tr->critical_end); + dump_stack(); + t1 = cycles_to_usecs(get_monotonic_cycles()); + printk(" => dump-end timestamp %lu\n\n", t1); +#endif + + max_sequence++; + + up(&max_mutex); + +out: + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = get_monotonic_cycles(); + tr->early_warning = 0; + reset_trace_idx(cpu, tr); + _trace_cmdline(cpu, tr); + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); +} + +void notrace touch_critical_timing(void) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (!tr->critical_start || 
atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + if (preempt_count() > 0 && tr->critical_start) { + atomic_inc(&tr->disabled); + check_critical_timing(cpu, tr, CALLER_ADDR0); + tr->critical_start = CALLER_ADDR0; + tr->critical_sequence = max_sequence; + atomic_dec(&tr->disabled); + } +} +EXPORT_SYMBOL(touch_critical_timing); + +void notrace stop_critical_timing(void) +{ + struct cpu_trace *tr = cpu_traces + raw_smp_processor_id(); + + tr->critical_start = 0; +} +EXPORT_SYMBOL(stop_critical_timing); + +static inline void notrace +__start_critical_timing(unsigned long eip, unsigned long parent_eip, + int latency_type) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + unsigned long flags; + + if (tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = get_monotonic_cycles(); + tr->critical_start = eip; + reset_trace_idx(cpu, tr); + tr->latency_type = latency_type; + _trace_cmdline(cpu, tr); + + local_save_flags(flags); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags); + + atomic_dec(&tr->disabled); +} + +static inline void notrace +__stop_critical_timing(unsigned long eip, unsigned long parent_eip) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + unsigned long flags; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + local_save_flags(flags); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags); + check_critical_timing(cpu, tr, eip); + tr->critical_start = 0; + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + +#ifdef CONFIG_LOCKDEP + +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if 
(!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(a0, a1, INTERRUPT_LATENCY); +} + +#else /* !CONFIG_LOCKDEP */ + +/* + * Dummy: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); +} + +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */, + INTERRUPT_LATENCY); +} + +EXPORT_SYMBOL(trace_hardirqs_off); + +#endif /* !CONFIG_LOCKDEP */ + +#endif /* CONFIG_CRITICAL_IRQSOFF_TIMING */ + +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + +static inline unsigned long get_parent_eip(void) +{ + unsigned long parent_eip = CALLER_ADDR1; + + if (in_lock_functions(parent_eip)) { + parent_eip = CALLER_ADDR2; + if (in_lock_functions(parent_eip)) + parent_eip = CALLER_ADDR3; + } + + return parent_eip; +} + +void notrace add_preempt_count(unsigned int val) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_WARN_ON(((int)preempt_count() < 0))) + return; + /* + * Spinlock count overflowing soon? 
+ */ + if (DEBUG_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10)) + return; +#endif + + preempt_count() += val; +#ifdef CONFIG_PREEMPT_TRACE + if (val <= 10) { + unsigned int idx = preempt_count() & PREEMPT_MASK; + if (idx < MAX_PREEMPT_TRACE) { + current->preempt_trace_eip[idx] = eip; + current->preempt_trace_parent_eip[idx] = parent_eip; + } + } +#endif +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __start_critical_timing(eip, parent_eip, + PREEMPT_LATENCY); + } +#endif + (void)eip, (void)parent_eip; +} +EXPORT_SYMBOL(add_preempt_count); + +void notrace sub_preempt_count(unsigned int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_WARN_ON(unlikely(val > preempt_count()))) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __stop_critical_timing(CALLER_ADDR0, + CALLER_ADDR1); + } +#endif + preempt_count() -= val; +} + +EXPORT_SYMBOL(sub_preempt_count); + +void notrace mask_preempt_count(unsigned int mask) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + + preempt_count() |= mask; + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __start_critical_timing(eip, parent_eip, + PREEMPT_LATENCY); + } +#endif + (void) eip, (void) parent_eip; +} +EXPORT_SYMBOL(mask_preempt_count); + +void notrace unmask_preempt_count(unsigned int mask) +{ +#ifdef 
CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __stop_critical_timing(CALLER_ADDR0, + CALLER_ADDR1); + } +#endif + preempt_count() &= ~mask; +} +EXPORT_SYMBOL(unmask_preempt_count); + + +#endif + +/* + * Wakeup latency timing/tracing. We get upcalls from the scheduler + * when a task is being woken up and we time/trace it until it gets + * to a CPU - or an even-higher-prio task supersedes it. (in that + * case we throw away the currently traced task - we don't try to + * handle nesting, that simplifies things significantly) + */ +#ifdef CONFIG_WAKEUP_TIMING + +static void notrace +check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip, + unsigned long *flags) +{ + int cpu = raw_smp_processor_id(); + unsigned long latency, t0, t1; + cycle_t T0, T1, delta; + + if (trace_user_triggered) + return; + + atomic_inc(&tr->disabled); + if (atomic_read(&tr->disabled) != 1) + goto out; + + T0 = tr->preempt_timestamp; + T1 = get_monotonic_cycles(); + /* + * Any wraparound or time warp and we are out: + */ + if (T0 > T1) + goto out; + delta = T1-T0; + + if (!report_latency(delta)) + goto out; + + ____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, + 0, 0, 0, *flags); + + latency = cycles_to_usecs(delta); + latency_hist(tr->latency_type, cpu, latency); + + if (latency_hist_flag) { + if (preempt_max_latency >= delta) + goto out; + } + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + +#ifndef CONFIG_WAKEUP_LATENCY_HIST + if (!preempt_thresh && preempt_max_latency > delta) { + printk("bug2: updating %016Lx > %016Lx?\n", + preempt_max_latency, delta); + printk(" [%016Lx %016Lx]\n", T0, T1); + } +#endif + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + tr->critical_end = parent_eip; + + update_max_tr(tr); + +
atomic_dec(&tr->disabled); + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(*flags); + +#ifndef CONFIG_WAKEUP_LATENCY_HIST + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us wakeup latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "wakeup.\n", current->comm, current->pid, + raw_smp_processor_id(), latency); +#endif + + max_sequence++; + + up(&max_mutex); + + return; + +out: + atomic_dec(&tr->disabled); + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(*flags); +} + +/* + * Start wakeup latency tracing - called with the runqueue held + * and interrupts disabled: + */ +void __trace_start_sched_wakeup(struct task_struct *p) +{ + struct cpu_trace *tr; + int cpu; + + if (trace_user_triggered || !wakeup_timing) { + trace_special_pid(p->pid, p->prio, -1); + return; + } + + __raw_spin_lock(&sch.trace_lock); + if (sch.task && (sch.task->prio <= p->prio)) + goto out_unlock; + + /* + * New highest-prio task just woke up - start tracing: + */ + sch.task = p; + cpu = task_cpu(p); + sch.cpu = cpu; + /* + * We keep using this CPU's trace buffer even if the task + * gets migrated to another CPU. Tracing only happens on + * the CPU that 'owns' the highest-prio task so it's + * fundamentally single-threaded. 
+ */ + sch.tr = tr = cpu_traces + cpu; + reset_trace_idx(cpu, tr); + +// if (!atomic_read(&tr->disabled)) { + atomic_inc(&tr->disabled); + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = get_monotonic_cycles(); + tr->latency_type = WAKEUP_LATENCY; + tr->critical_start = CALLER_ADDR0; + _trace_cmdline(raw_smp_processor_id(), tr); + atomic_dec(&tr->disabled); +// } + + mcount(); + trace_special_pid(p->pid, p->prio, cpu); + trace_special_sym(); +out_unlock: + __raw_spin_unlock(&sch.trace_lock); +} + +void trace_stop_sched_switched(struct task_struct *p) +{ + struct cpu_trace *tr; + unsigned long flags; + + if (trace_user_triggered || !wakeup_timing) + return; + + local_irq_save(flags); + __raw_spin_lock(&sch.trace_lock); + if (p == sch.task) { + trace_special_pid(p->pid, p->prio, task_cpu(p)); + + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + WARN_ON(!tr); + /* auto-unlocks the spinlock: */ + check_wakeup_timing(tr, CALLER_ADDR0, &flags); + } else { + if (sch.task) + trace_special_pid(sch.task->pid, sch.task->prio, + p->prio); + if (sch.task && (sch.task->prio >= p->prio)) + sch.task = NULL; + __raw_spin_unlock(&sch.trace_lock); + } + local_irq_restore(flags); +} + +void trace_change_sched_cpu(struct task_struct *p, int new_cpu) +{ + unsigned long flags; + + if (!wakeup_timing) + return; + + trace_special_pid(p->pid, task_cpu(p), new_cpu); + trace_special_sym(); + local_irq_save(flags); + __raw_spin_lock(&sch.trace_lock); + if (p == sch.task && task_cpu(p) != new_cpu) { + sch.cpu = new_cpu; + trace_special(task_cpu(p), new_cpu, 0); + } + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(flags); +} + +#endif + +#ifdef CONFIG_EVENT_TRACE + +long user_trace_start(void) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (!trace_user_triggered || trace_print_on_crash || print_functions) + return -EINVAL; + + /* + * If the user has not yet reset the max latency after + * bootup then we assume that this was the intention + * (we 
wont get any tracing done otherwise): + */ + if (preempt_max_latency == (cycle_t)ULONG_MAX) + preempt_max_latency = 0; + + /* + * user_trace_start() might be called from hardirq + * context, if trace_user_triggered_irq is set, so + * be careful about locking: + */ + if (preempt_count() || irqs_disabled()) { + if (down_trylock(&max_mutex)) + return -EAGAIN; + } else + down(&max_mutex); + + local_irq_save(flags); + cpu = smp_processor_id(); + tr = cpu_traces + cpu; + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + __raw_spin_lock(&sch.trace_lock); + sch.task = current; + sch.cpu = cpu; + sch.tr = tr; + __raw_spin_unlock(&sch.trace_lock); + } +#endif + reset_trace_idx(cpu, tr); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = get_monotonic_cycles(); + tr->critical_start = CALLER_ADDR0; + _trace_cmdline(cpu, tr); + mcount(); + + WARN_ON(!irqs_disabled()); + local_irq_restore(flags); + + up(&max_mutex); + + return 0; +} + +EXPORT_SYMBOL_GPL(user_trace_start); + +long user_trace_stop(void) +{ + unsigned long latency = 0, flags; + struct cpu_trace *tr; + cycle_t delta; + + if (!trace_user_triggered || trace_print_on_crash || print_functions) + return -EINVAL; + + local_irq_save(flags); + mcount(); + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + struct task_struct *t; + + __raw_spin_lock(&sch.trace_lock); + t = sch.task; + if (current != t) { + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(flags); + printk("wrong stop: curr: %s/%d[%d] => %p\n", + current->comm, current->pid, + task_thread_info(current)->cpu, t); + if (t) + printk("wrong stop: curr: %s/%d[%d]\n", + t->comm, t->pid, + task_thread_info(t)->cpu); + return -EINVAL; + } + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + __raw_spin_unlock(&sch.trace_lock); + } else +#endif + tr = cpu_traces + smp_processor_id(); + + atomic_inc(&tr->disabled); + if (tr->preempt_timestamp) { + cycle_t T0, T1; + unsigned long long tmp0; + + T0 = tr->preempt_timestamp; + T1 = 
get_monotonic_cycles(); + tmp0 = preempt_max_latency; + if (T1 < T0) + T0 = T1; + delta = T1 - T0; + if (!report_latency(delta)) + goto out; + if (tr->critical_sequence != max_sequence || + down_trylock(&max_mutex)) + goto out; + + WARN_ON(!preempt_thresh && preempt_max_latency > delta); + + preempt_max_latency = delta; + update_max_tr(tr); + + latency = cycles_to_usecs(delta); + + max_sequence++; + up(&max_mutex); +out: + tr->preempt_timestamp = 0; + } + atomic_dec(&tr->disabled); + local_irq_restore(flags); + + if (latency) { + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us user-latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us user-latency.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency); + } + + return 0; +} + +EXPORT_SYMBOL(user_trace_stop); + +static int trace_print_cpu = -1; + +void notrace stop_trace(void) +{ + if (trace_print_on_crash && trace_print_cpu == -1) { + trace_enabled = -1; + trace_print_cpu = raw_smp_processor_id(); + } +} + +EXPORT_SYMBOL(stop_trace); + +static void print_entry(struct trace_entry *entry, struct trace_entry *entry0) +{ + unsigned long abs_usecs; + int hardirq, softirq; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + + printk("%-5d ", entry->pid); + + printk("%d%c%c", + entry->cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + printk("H"); + else { + if (hardirq) + printk("h"); + else { + if (softirq) + printk("s"); + else + printk("."); + } + } + + if (entry->preempt_count) + printk(":%x ", entry->preempt_count); + else + printk(":. 
"); + + printk("%ld.%03ldms: ", abs_usecs/1000, abs_usecs % 1000); + + switch (entry->type) { + case TRACE_FN: + printk_name(entry->u.fn.eip); + printk(" <= ("); + printk_name(entry->u.fn.parent_eip); + printk(")\n"); + break; + case TRACE_SPECIAL: + printk(" special: %lx %lx %lx\n", + entry->u.special.v1, entry->u.special.v2, + entry->u.special.v3); + break; + case TRACE_SPECIAL_U64: + printk(" spec64: %lx%08lx %lx\n", + entry->u.special.v1, entry->u.special.v2, + entry->u.special.v3); + break; + } +} + +/* + * Print the current trace at crash time. + * + * We print it backwards, so that the newest (most interesting) entries + * are printed first. + */ +void print_last_trace(void) +{ + unsigned int idx0, idx, i, cpu; + struct cpu_trace *tr; + struct trace_entry *entry0, *entry; + + preempt_disable(); + cpu = smp_processor_id(); + if (trace_enabled != -1 || trace_print_cpu != cpu || + !trace_print_on_crash) { + if (trace_print_on_crash) + printk("skipping trace printing on CPU#%d != %d\n", + cpu, trace_print_cpu); + preempt_enable(); + return; + } + + trace_print_on_crash = 0; + + tr = cpu_traces + cpu; + if (!tr->trace) + goto out; + + printk("Last %ld trace entries:\n", MAX_TRACE); + idx0 = tr->trace_idx; + printk("curr idx: %d\n", idx0); + if (idx0 >= MAX_TRACE) + idx0 = 0; + idx = idx0; + entry0 = tr->trace + idx0; + + for (i = 0; i < MAX_TRACE; i++) { + if (idx == 0) + idx = MAX_TRACE-1; + else + idx--; + entry = tr->trace + idx; + switch (entry->type) { + case TRACE_FN: + case TRACE_SPECIAL: + case TRACE_SPECIAL_U64: + print_entry(entry, entry0); + break; + } + } + printk("printed %ld entries\n", MAX_TRACE); +out: + preempt_enable(); +} + +#ifdef CONFIG_SMP +/* + * On SMP, try to 'peek' on other CPU's traces and record them + * in this CPU's trace. This way we get a rough idea about what's + * going on there, without the overhead of global tracing. + * + * (no need to make this PER_CPU, we bounce it around anyway.) 
+ */ +unsigned long nmi_eips[NR_CPUS]; +unsigned long nmi_flags[NR_CPUS]; + +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + int cpu, this_cpu = smp_processor_id(); + + __trace(eip, parent_eip); + + nmi_eips[this_cpu] = parent_eip; + nmi_flags[this_cpu] = flags; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu_online(cpu) && cpu != this_cpu) { + __trace(eip, nmi_eips[cpu]); + __trace(eip, nmi_flags[cpu]); + } +} +#else +/* + * On UP, NMI tracing is quite simple: + */ +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + __trace(eip, parent_eip); +} +#endif + +#endif + +#ifdef CONFIG_PREEMPT_TRACE + +static void print_preempt_trace(struct task_struct *task) +{ + unsigned int count = task_thread_info(task)->preempt_count; + unsigned int i, lim = count & PREEMPT_MASK; + if (lim >= MAX_PREEMPT_TRACE) + lim = MAX_PREEMPT_TRACE-1; + printk("---------------------------\n"); + printk("| preempt count: %08x ]\n", count); + printk("| %d-level deep critical section nesting:\n", lim); + printk("----------------------------------------\n"); + for (i = 1; i <= lim; i++) { + printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]); + print_symbol("%s\n", task->preempt_trace_eip[i]); + printk(".....[<%08lx>] .. 
( <= ", + task->preempt_trace_parent_eip[i]); + print_symbol("%s)\n", task->preempt_trace_parent_eip[i]); + } + printk("\n"); +} + +#endif + +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE) +void print_traces(struct task_struct *task) +{ + if (!task) + task = current; + +#ifdef CONFIG_PREEMPT_TRACE + print_preempt_trace(task); +#endif +#ifdef CONFIG_EVENT_TRACE + print_last_trace(); +#endif +} +#endif + +#ifdef CONFIG_EVENT_TRACE +/* + * Allocate all the per-CPU trace buffers and the + * save-maximum/save-output staging buffers: + */ +void __init init_tracer(void) +{ + unsigned long size, total_size = 0; + struct trace_entry *array; + struct cpu_trace *tr; + int cpu; + + printk("num_possible_cpus(): %d\n", num_possible_cpus()); + + size = sizeof(struct trace_entry)*MAX_TRACE; + + for_each_possible_cpu(cpu) { + tr = cpu_traces + cpu; + array = alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "CPU#%d: failed to allocate %ld bytes trace buffer!\n", + cpu, size); + } else { + printk(KERN_INFO + "CPU#%d: allocated %ld bytes trace buffer.\n", + cpu, size); + total_size += size; + } + tr->cpu = cpu; + tr->trace = array; + + array = alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "CPU#%d: failed to allocate %ld bytes max-trace buffer!\n", + cpu, size); + } else { + printk(KERN_INFO + "CPU#%d: allocated %ld bytes max-trace buffer.\n", + cpu, size); + total_size += size; + } + max_tr.traces[cpu].trace = array; + } + + /* + * The output trace buffer is a special one that only has + * trace entries for the first cpu-trace structure: + */ + size = sizeof(struct trace_entry)*MAX_TRACE*num_possible_cpus(); + array = alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "failed to allocate %ld bytes out-trace buffer!\n", + size); + } else { + printk(KERN_INFO "allocated %ld bytes out-trace buffer.\n", + size); + total_size += size; + } + out_tr.traces[0].trace = array; + printk(KERN_INFO + "tracer: a total of %ld bytes allocated.\n", + 
total_size); +} +#endif Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -166,14 +166,14 @@ static struct list_head chainhash_table[ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) -void lockdep_off(void) +void notrace lockdep_off(void) { current->lockdep_recursion++; } EXPORT_SYMBOL(lockdep_off); -void lockdep_on(void) +void notrace lockdep_on(void) { current->lockdep_recursion--; } @@ -706,7 +706,7 @@ find_usage_forwards(struct lock_class *s * Return 1 otherwise and keep unchanged. * Return 0 on error. */ -static noinline int +static noinline notrace int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; @@ -1386,7 +1386,7 @@ cache_hit: * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ -static void check_chain_key(struct task_struct *curr) +static void notrace check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; @@ -1573,8 +1573,8 @@ valid_state(struct task_struct *curr, st /* * Mark a lock with a usage bit, and validate the state transition: */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) +static int notrace mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -1781,6 +1781,7 @@ static int mark_lock(struct task_struct * We must printk outside of the graph_lock: */ if (ret == 2) { + user_trace_stop(); printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); @@ -1794,7 +1795,7 @@ static int mark_lock(struct task_struct /* * Mark all held locks with a usage bit: */ -static int +static int notrace mark_held_locks(struct task_struct *curr, int hardirq) { enum 
lock_usage_bit usage_bit; @@ -1841,7 +1842,7 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void notrace trace_hardirqs_on(void) { struct task_struct *curr = current; unsigned long ip; @@ -1882,6 +1883,9 @@ void trace_hardirqs_on(void) curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + time_hardirqs_on(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); +#endif } EXPORT_SYMBOL(trace_hardirqs_on); @@ -1889,7 +1893,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off(void) { struct task_struct *curr = current; @@ -1907,6 +1911,9 @@ void trace_hardirqs_off(void) curr->hardirq_disable_ip = _RET_IP_; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_off_events); +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + time_hardirqs_off(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); +#endif } else debug_atomic_inc(&redundant_hardirqs_off); } @@ -2404,7 +2411,7 @@ __lock_release(struct lockdep_map *lock, /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static notrace void check_flags(unsigned long flags) { #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) @@ -2436,8 +2443,9 @@ static void check_flags(unsigned long fl * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ -void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) +void notrace +lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip) { unsigned long flags; @@ -2445,9 +2453,9 @@ void lock_acquire(struct lockdep_map *lo return; raw_local_irq_save(flags); + current->lockdep_recursion = 1; check_flags(flags); - 
current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), ip); current->lockdep_recursion = 0; @@ -2456,7 +2464,8 @@ void lock_acquire(struct lockdep_map *lo EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +void notrace +lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; @@ -2464,8 +2473,8 @@ void lock_release(struct lockdep_map *lo return; raw_local_irq_save(flags); - check_flags(flags); current->lockdep_recursion = 1; + check_flags(flags); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); Index: linux-rt.q/kernel/panic.c =================================================================== --- linux-rt.q.orig/kernel/panic.c +++ linux-rt.q/kernel/panic.c @@ -66,6 +66,8 @@ NORET_TYPE void panic(const char * fmt, unsigned long caller = (unsigned long) __builtin_return_address(0); #endif + stop_trace(); + /* * It's possible to come here directly from a panic-assertion and not * have preempt disabled. 
Some functions called from here want Index: linux-rt.q/kernel/printk.c =================================================================== --- linux-rt.q.orig/kernel/printk.c +++ linux-rt.q/kernel/printk.c @@ -324,12 +324,14 @@ static void __call_console_drivers(unsig { struct console *con; + touch_critical_timing(); for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) con->write(con, &LOG_BUF(start), end - start); } + touch_critical_timing(); } static int __read_mostly ignore_loglevel; Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -81,6 +81,10 @@ unsigned long long __attribute__((weak)) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +#define __PRIO(prio) \ + ((prio) <= 99 ? 199 - (prio) : (prio) - 120) + +#define PRIO(p) __PRIO((p)->prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, @@ -547,6 +551,12 @@ void sched_clock_unstable_event(void) #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT +static inline void trace_start_sched_wakeup(struct task_struct *p, struct rq *rq) +{ + if (p != rq->curr) + __trace_start_sched_wakeup(p); +} + /* * resched_task - mark a task 'to be rescheduled now'. 
* @@ -564,6 +574,8 @@ static void resched_task(struct task_str { int cpu; + trace_start_sched_wakeup(p, task_rq(p)); + assert_spin_locked(&task_rq(p)->lock); if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) @@ -595,6 +607,8 @@ static void resched_cpu(int cpu) #else static inline void resched_task(struct task_struct *p) { + trace_start_sched_wakeup(p, task_rq(p)); + assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } @@ -953,6 +967,7 @@ unsigned long weighted_cpuload(const int static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { + trace_change_sched_cpu(p, cpu); task_thread_info(p)->cpu = cpu; set_task_cfs_rq(p); } @@ -1519,14 +1534,19 @@ out: int fastcall wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + mcount(); + return ret; } EXPORT_SYMBOL(wake_up_process); int fastcall wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + int ret = try_to_wake_up(p, state, 0); + + mcount(); + return ret; } /* @@ -1701,6 +1721,7 @@ static inline void finish_task_switch(st prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); + trace_stop_sched_switched(current); if (likely(mm)) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { @@ -1772,10 +1793,15 @@ context_switch(struct rq *rq, struct tas spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif + trace_cmdline(); + /* Here we just switch the register state and the stack. 
*/ switch_to(prev, next, prev); barrier(); + + trace_special_pid(prev->pid, PRIO(prev), PRIO(current)); + /* * this_rq must be evaluated again because prev may have moved * CPUs since it called schedule(), thus the 'rq' on its stack @@ -3216,41 +3242,39 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_EVENT_TRACE) && defined(CONFIG_DEBUG_RT_MUTEXES) -void fastcall add_preempt_count(int val) +static void trace_array(struct prio_array *array) { - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); + int i; + struct task_struct *p; + struct list_head *head, *tmp; + + for (i = 0; i < MAX_RT_PRIO; i++) { + head = array->queue + i; + if (list_empty(head)) { + WARN_ON(test_bit(i, array->bitmap)); + continue; + } + WARN_ON(!test_bit(i, array->bitmap)); + list_for_each(tmp, head) { + p = list_entry(tmp, struct task_struct, run_list); + trace_special_pid(p->pid, p->prio, PRIO(p)); + } + } } -EXPORT_SYMBOL(add_preempt_count); -void fastcall sub_preempt_count(int val) +static inline void trace_all_runnable_tasks(struct rq *rq) { - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? 
- */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; + if (trace_enabled) + trace_array(&rq->active); +} - preempt_count() -= val; +#else + +static inline void trace_all_runnable_tasks(struct rq *rq) +{ } -EXPORT_SYMBOL(sub_preempt_count); #endif @@ -3361,6 +3385,8 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev, now); next = pick_next_task(rq, prev, now); + trace_all_runnable_tasks(rq); + sched_info_switch(prev, next); if (likely(prev != next)) { @@ -3369,8 +3395,10 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ - } else + } else { spin_unlock_irq(&rq->lock); + trace_stop_sched_switched(next); + } if (unlikely(reacquire_kernel_lock(current) < 0)) { cpu = smp_processor_id(); @@ -3824,6 +3852,7 @@ void rt_mutex_setprio(struct task_struct check_preempt_curr(rq, p); } } + task_rq_unlock(rq, &flags); } @@ -6336,6 +6365,7 @@ void __might_sleep(char *file, int line) if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; + stop_trace(); printk(KERN_ERR "BUG: sleeping function called from invalid" " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", Index: linux-rt.q/kernel/softlockup.c =================================================================== --- linux-rt.q.orig/kernel/softlockup.c +++ linux-rt.q/kernel/softlockup.c @@ -100,6 +100,8 @@ void softlockup_tick(void) if (now > (touch_timestamp + 10)) { per_cpu(print_timestamp, this_cpu) = touch_timestamp; + stop_trace(); + spin_lock(&print_lock); printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", this_cpu); Index: linux-rt.q/kernel/sysctl.c =================================================================== --- linux-rt.q.orig/kernel/sysctl.c +++ linux-rt.q/kernel/sysctl.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #include 
#include @@ -291,6 +293,132 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_WAKEUP_TIMING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "wakeup_timing", + .data = &wakeup_timing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_WAKEUP_TIMING) || defined(CONFIG_EVENT_TRACE) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "preempt_max_latency", + .data = &preempt_max_latency, + .maxlen = sizeof(preempt_max_latency), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "preempt_thresh", + .data = &preempt_thresh, + .maxlen = sizeof(preempt_thresh), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_EVENT_TRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_enabled", + .data = &trace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "syscall_tracing", + .data = &syscall_tracing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stackframe_tracing", + .data = &stackframe_tracing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "mcount_enabled", + .data = &mcount_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_user_triggered", + .data = &trace_user_triggered, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_user_trigger_irq", + .data = &trace_user_trigger_irq, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_freerunning", + .data = &trace_freerunning, + 
.maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_print_on_crash", + .data = &trace_print_on_crash, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_verbose", + .data = &trace_verbose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_all_cpus", + .data = &trace_all_cpus, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_use_raw_cycles", + .data = &trace_use_raw_cycles, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_all_runnable", + .data = &trace_all_runnable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", Index: linux-rt.q/kernel/time/timekeeping.c =================================================================== --- linux-rt.q.orig/kernel/time/timekeeping.c +++ linux-rt.q/kernel/time/timekeeping.c @@ -71,6 +71,33 @@ static inline s64 __get_nsec_offset(void return ns_offset; } +cycle_t notrace get_monotonic_cycles(void) +{ + cycle_t cycle_now, cycle_delta; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + return clock->cycle_last + cycle_delta; +} + +unsigned long notrace cycles_to_usecs(cycle_t cycles) +{ + u64 ret = cyc2ns(clock, cycles); + + do_div(ret, 1000); + + return ret; +} + +cycle_t notrace usecs_to_cycles(unsigned long usecs) +{ + return ns2cyc(clock, (u64)usecs * 1000); +} + /** * __get_realtime_clock_ts - Returns the time of day in a timespec * @ts: pointer to the timespec to be set 
Index: linux-rt.q/lib/Kconfig.debug =================================================================== --- linux-rt.q.orig/lib/Kconfig.debug +++ linux-rt.q/lib/Kconfig.debug @@ -308,6 +308,192 @@ config STACKTRACE depends on DEBUG_KERNEL depends on STACKTRACE_SUPPORT +config PREEMPT_TRACE + bool + default y + depends on DEBUG_PREEMPT + +config EVENT_TRACE + bool "Kernel event tracing" + default n + depends on GENERIC_TIME + select FRAME_POINTER + select STACKTRACE + help + This option enables a kernel tracing mechanism that will track + certain kernel events such as system call entry and return, + IRQ entry, context-switching, etc. + + Run the scripts/trace-it utility on a kernel with this option + enabled for sample output. + +config FUNCTION_TRACE + bool "Kernel function call tracing" + default n + depends on !REORDER + select EVENT_TRACE + help + This option enables a kernel tracing mechanism that will track + precise function-call granularity kernel execution. Sample + output: + + pcscd-1772 0D..2 6867us : deactivate_task (-2 1) + pcscd-1772 0D..2 6867us : dequeue_task (deactivate_task) + -0 0D..2 6870us : __switch_to (__schedule) + -0 0D..2 6871us : __schedule (-2 20) + -0 0D..2 6871us : __lock_acquire (lock_acquire) + -0 0D..2 6872us : __spin_unlock_irq (__schedule) + + Run the scripts/trace-it sample utility on a kernel with this + option enabled to capture 1 second worth of events. + + (Note that kernel size and overhead increases noticeably + with this option enabled.) + +config WAKEUP_TIMING + bool "Wakeup latency timing" + depends on GENERIC_TIME + help + This option measures the time spent from a highprio thread being + woken up to it getting scheduled on a CPU, with microsecond + accuracy. 
+ + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + +config LATENCY_TRACE + bool "Latency tracing" + default n + depends on LATENCY_TIMING && !REORDER && GENERIC_TIME + select FRAME_POINTER + select FUNCTION_TRACE + help + When this option is enabled then the last maximum latency timing + event's full trace can be found in /proc/latency_trace, in a + human-readable (or rather as some would say, in a + kernel-developer-readable) form. + + (Note that kernel size and overhead increases noticeably + with this option enabled.) + +config CRITICAL_PREEMPT_TIMING + bool "Non-preemptible critical section latency timing" + default n + depends on PREEMPT + depends on GENERIC_TIME + help + This option measures the time spent in preempt-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config CRITICAL_IRQSOFF_TIMING + bool "Interrupts-off critical section latency timing" + default n + depends on GENERIC_TIME + select TRACE_IRQFLAGS + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) 
+ +config WAKEUP_LATENCY_HIST + bool "wakeup latency histogram" + default n + depends on WAKEUP_TIMING + help + This option logs all the wakeup latency timing to a big histogram + bucket; at the same time, it also stubs out the printk produced by + wakeup latency timing. + + The wakeup latency timing histogram can be viewed via: + + cat /proc/latency_hist/wakeup_latency/CPU* + + (Note: * stands for the CPU ID.) + +config PREEMPT_OFF_HIST + bool "non-preemptible critical section latency histogram" + default n + depends on CRITICAL_PREEMPT_TIMING + help + This option logs all the non-preemptible critical section latency + timing to a big histogram bucket; at the same time, it also + stubs out the printk produced by non-preemptible critical section + latency timing. + + The non-preemptible critical section latency timing histogram can + be viewed via: + + cat /proc/latency_hist/preempt_off_latency/CPU* + + (Note: * stands for the CPU ID.) + +config INTERRUPT_OFF_HIST + bool "interrupts-off critical section latency histogram" + default n + depends on CRITICAL_IRQSOFF_TIMING + help + This option logs all the interrupts-off critical section latency + timing to a big histogram bucket; at the same time, it also + stubs out the printk produced by interrupts-off critical section + latency timing. + + The interrupts-off critical section latency timing histogram can + be viewed via: + + cat /proc/latency_hist/interrupt_off_latency/CPU* + + (Note: * stands for the CPU ID.) 
+ +config CRITICAL_TIMING + bool + default y + depends on CRITICAL_PREEMPT_TIMING || CRITICAL_IRQSOFF_TIMING + +config DEBUG_TRACE_IRQFLAGS + bool + default y + depends on CRITICAL_IRQSOFF_TIMING + +config LATENCY_TIMING + bool + default y + depends on WAKEUP_TIMING || CRITICAL_TIMING + select SYSCTL + +config CRITICAL_LATENCY_HIST + bool + default y + depends on PREEMPT_OFF_HIST || INTERRUPT_OFF_HIST + +config LATENCY_HIST + bool + default y + depends on WAKEUP_LATENCY_HIST || CRITICAL_LATENCY_HIST + +config MCOUNT + bool + depends on FUNCTION_TRACE + default y + config DEBUG_KOBJECT bool "kobject debugging" depends on DEBUG_KERNEL Index: linux-rt.q/lib/debug_locks.c =================================================================== --- linux-rt.q.orig/lib/debug_locks.c +++ linux-rt.q/lib/debug_locks.c @@ -36,7 +36,14 @@ int debug_locks_silent; int debug_locks_off(void) { if (xchg(&debug_locks, 0)) { +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (spin_is_locked(&current->pi_lock)) + spin_unlock(&current->pi_lock); +#endif if (!debug_locks_silent) { + stop_trace(); + user_trace_stop(); + printk("stopped custom tracer.\n"); console_verbose(); return 1; } Index: linux-rt.q/scripts/Makefile =================================================================== --- linux-rt.q.orig/scripts/Makefile +++ linux-rt.q/scripts/Makefile @@ -7,6 +7,7 @@ # conmakehash: Create chartable # conmakehash: Create arrays for initializing the kernel console tables +hostprogs-$(CONFIG_EVENT_TRACE) += trace-it hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash Index: linux-rt.q/scripts/trace-it.c =================================================================== --- /dev/null +++ linux-rt.q/scripts/trace-it.c @@ -0,0 +1,79 @@ + +/* + * Copyright (C) 2005, Ingo Molnar + * + * user-triggered tracing. 
+ * + * The -rt kernel has a built-in kernel tracer, which will trace + * all kernel function calls (and a couple of special events as well), + * by using a build-time gcc feature that instruments all kernel + * functions. + * + * The tracer is highly automated for a number of latency tracing purposes, + * but it can also be switched into 'user-triggered' mode, which is a + * half-automatic tracing mode where userspace apps start and stop the + * tracer. This file shows a simple example of how to turn user-triggered + * tracing on, and how to start/stop tracing. Note that if you do + * multiple start/stop sequences, the kernel will do a maximum search + * over their latencies, and will keep the trace of the largest latency + * in /proc/latency_trace. The maximums are also reported to the kernel + * log. (but can also be read from /proc/sys/kernel/preempt_max_latency) + * + * For the tracer to be activated, turn on CONFIG_EVENT_TRACE + * in the .config, rebuild the kernel and boot into it. The trace will + * get _a lot_ more verbose if you also turn on CONFIG_FUNCTION_TRACE: + * every kernel function call will be put into the trace. 
Note that + * CONFIG_FUNCTION_TRACE has significant runtime overhead, so you don't + * want to use it for performance testing :) + */ + +#include +#include +#include +#include +#include +#include +#include + +int main (int argc, char **argv) +{ + int ret; + + if (getuid() != 0) { + fprintf(stderr, "needs to run as root.\n"); + exit(1); + } + ret = system("cat /proc/sys/kernel/mcount_enabled >/dev/null 2>/dev/null"); + if (ret) { + fprintf(stderr, "CONFIG_LATENCY_TRACING not enabled?\n"); + exit(1); + } + system("echo 1 > /proc/sys/kernel/trace_user_triggered"); + system("[ -e /proc/sys/kernel/wakeup_timing ] && echo 0 > /proc/sys/kernel/wakeup_timing"); + system("echo 1 > /proc/sys/kernel/trace_enabled"); + system("echo 1 > /proc/sys/kernel/mcount_enabled"); + system("echo 0 > /proc/sys/kernel/trace_freerunning"); + system("echo 0 > /proc/sys/kernel/trace_print_on_crash"); + system("echo 0 > /proc/sys/kernel/trace_verbose"); + system("echo 0 > /proc/sys/kernel/preempt_thresh 2>/dev/null"); + system("echo 0 > /proc/sys/kernel/preempt_max_latency 2>/dev/null"); + + // start tracing + if (prctl(0, 1)) { + fprintf(stderr, "trace-it: couldnt start tracing!\n"); + return 1; + } + usleep(10000000); + if (prctl(0, 0)) { + fprintf(stderr, "trace-it: couldnt stop tracing!\n"); + return 1; + } + + system("echo 0 > /proc/sys/kernel/trace_user_triggered"); + system("echo 0 > /proc/sys/kernel/trace_enabled"); + system("cat /proc/latency_trace"); + + return 0; +} + + patches/clockevents-allow-build-without-runtime-use.patch0000664000077200007720000000544610646635210023263 0ustar mingomingoSubject: clockevents: Allow build w/o run-time usage for migration purposes Migration aid to allow preparatory patches which introduce not yet used parts of clock events code. 
Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 8 ++++++-- kernel/time/Kconfig | 5 +++++ kernel/time/Makefile | 2 +- kernel/time/clockevents.c | 3 ++- 4 files changed, 14 insertions(+), 4 deletions(-) Index: linux-rt.q/include/linux/clockchips.h =================================================================== --- linux-rt.q.orig/include/linux/clockchips.h +++ linux-rt.q/include/linux/clockchips.h @@ -8,7 +8,7 @@ #ifndef _LINUX_CLOCKCHIPS_H #define _LINUX_CLOCKCHIPS_H -#ifdef CONFIG_GENERIC_CLOCKEVENTS +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #include #include @@ -127,9 +127,13 @@ extern void clockevents_unregister_notif extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, ktime_t now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS extern void clockevents_notify(unsigned long reason, void *arg); - #else +# define clockevents_notify(reason, arg) do { } while (0) +#endif + +#else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */ #define clockevents_notify(reason, arg) do { } while (0) Index: linux-rt.q/kernel/time/Kconfig =================================================================== --- linux-rt.q.orig/kernel/time/Kconfig +++ linux-rt.q/kernel/time/Kconfig @@ -23,3 +23,8 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. 
+config GENERIC_CLOCKEVENTS_BUILD + bool + default y + depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR + Index: linux-rt.q/kernel/time/Makefile =================================================================== --- linux-rt.q.orig/kernel/time/Makefile +++ linux-rt.q/kernel/time/Makefile @@ -1,6 +1,6 @@ obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o Index: linux-rt.q/kernel/time/clockevents.c =================================================================== --- linux-rt.q.orig/kernel/time/clockevents.c +++ linux-rt.q/kernel/time/clockevents.c @@ -204,6 +204,7 @@ void clockevents_exchange_device(struct local_irq_restore(flags); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events */ @@ -232,4 +233,4 @@ void clockevents_notify(unsigned long re spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); - +#endif patches/lockstat-hooks.patch0000664000077200007720000001456310646635217015532 0ustar mingomingoSubject: lockstat: hook into spinlock_t, rwlock_t, rwsem and mutex Call the new lockstat tracking functions from the various lock primitives. 
Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron --- kernel/mutex.c | 9 +++++++++ kernel/rwsem.c | 8 ++++---- kernel/spinlock.c | 28 ++++++++++++++-------------- 3 files changed, 27 insertions(+), 18 deletions(-) Index: linux-rt.q/kernel/mutex.c =================================================================== --- linux-rt.q.orig/kernel/mutex.c +++ linux-rt.q/kernel/mutex.c @@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + old_val = atomic_xchg(&lock->count, -1); + if (old_val == 1) + goto done; + + lock_contended(&lock->dep_map, _RET_IP_); + for (;;) { /* * Lets try to take the lock again - this is needed even if @@ -174,6 +180,9 @@ __mutex_lock_common(struct mutex *lock, spin_lock_mutex(&lock->wait_lock, flags); } + lock_acquired(&lock->dep_map); +done: + /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); Index: linux-rt.q/kernel/rwsem.c =================================================================== --- linux-rt.q.orig/kernel/rwsem.c +++ linux-rt.q/kernel/rwsem.c @@ -20,7 +20,7 @@ void compat_down_read(struct compat_rw_s might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); - __down_read(sem); + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } EXPORT_SYMBOL(compat_down_read); @@ -47,7 +47,7 @@ void compat_down_write(struct compat_rw_ might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - __down_write(sem); + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } EXPORT_SYMBOL(compat_down_write); @@ -111,7 +111,7 @@ void compat_down_read_nested(struct comp might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - __down_read(sem); + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } EXPORT_SYMBOL(compat_down_read_nested); @@ -130,7 +130,7 @@ void compat_down_write_nested(struct com might_sleep(); 
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - __down_write_nested(sem, subclass); + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } EXPORT_SYMBOL(compat_down_write_nested); Index: linux-rt.q/kernel/spinlock.c =================================================================== --- linux-rt.q.orig/kernel/spinlock.c +++ linux-rt.q/kernel/spinlock.c @@ -123,7 +123,7 @@ void __lockfunc __read_lock(raw_rwlock_t { preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(__read_lock); @@ -140,7 +140,7 @@ unsigned long __lockfunc __spin_lock_irq * that interrupts are not re-enabled during lock-acquire: */ #ifdef CONFIG_LOCKDEP - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); #endif @@ -153,7 +153,7 @@ void __lockfunc __spin_lock_irq(raw_spin local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(__spin_lock_irq); @@ -162,7 +162,7 @@ void __lockfunc __spin_lock_bh(raw_spinl local_bh_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(__spin_lock_bh); @@ -173,7 +173,7 @@ unsigned long __lockfunc __read_lock_irq local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); return flags; } EXPORT_SYMBOL(__read_lock_irqsave); @@ -183,7 +183,7 @@ void __lockfunc __read_lock_irq(raw_rwlo local_irq_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(__read_lock_irq); @@ -192,7 +192,7 @@ 
void __lockfunc __read_lock_bh(raw_rwloc local_bh_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(__read_lock_bh); @@ -203,7 +203,7 @@ unsigned long __lockfunc __write_lock_ir local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); return flags; } EXPORT_SYMBOL(__write_lock_irqsave); @@ -213,7 +213,7 @@ void __lockfunc __write_lock_irq(raw_rwl local_irq_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(__write_lock_irq); @@ -222,7 +222,7 @@ void __lockfunc __write_lock_bh(raw_rwlo local_bh_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(__write_lock_bh); @@ -230,7 +230,7 @@ void __lockfunc __spin_lock(raw_spinlock { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(__spin_lock); @@ -239,7 +239,7 @@ void __lockfunc __write_lock(raw_rwlock_ { preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(__write_lock); @@ -342,7 +342,7 @@ void __lockfunc __spin_lock_nested(raw_s { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(__spin_lock_nested); @@ -360,7 +360,7 @@ __spin_lock_irqsave_nested(raw_spinlock_ * that interrupts are not re-enabled during lock-acquire: */ #ifdef CONFIG_LOCKDEP - _raw_spin_lock(lock); + 
LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); #endif patches/radix-tree-use-indirect-bit.patch0000664000077200007720000002320110646635216017762 0ustar mingomingoFrom: Nick Piggin Subject: [patch 1/4] radix-tree: use indirect bit Rather than sign direct radix-tree pointers with a special bit, sign the indirect one that hangs off the root. This means that, given a lookup_slot operation, the invalid result will be differentiated from the valid (previously, valid results could have the bit either set or clear). This does not affect slot lookups which occur under lock -- they can never return an invalid result. Is needed in future for lockless pagecache. Signed-off-by: Nick Piggin --- include/linux/radix-tree.h | 40 ++++++++++++++------------ lib/radix-tree.c | 69 ++++++++++++++++++++++++++++----------------- 2 files changed, 65 insertions(+), 44 deletions(-) Index: linux-rt.q/include/linux/radix-tree.h =================================================================== --- linux-rt.q.orig/include/linux/radix-tree.h +++ linux-rt.q/include/linux/radix-tree.h @@ -26,28 +26,31 @@ #include /* - * A direct pointer (root->rnode pointing directly to a data item, - * rather than another radix_tree_node) is signalled by the low bit - * set in the root->rnode pointer. - * - * In this case root->height is also NULL, but the direct pointer tests are - * needed for RCU lookups when root->height is unreliable. + * An indirect pointer (root->rnode pointing to a radix_tree_node, rather + * than a data item) is signalled by the low bit set in the root->rnode + * pointer. + * + * In this case root->height is > 0, but the indirect pointer tests are + * needed for RCU lookups (because root->height is unreliable). The only + * time callers need worry about this is when doing a lookup_slot under + * RCU. 
*/ -#define RADIX_TREE_DIRECT_PTR 1 +#define RADIX_TREE_INDIRECT_PTR 1 +#define RADIX_TREE_RETRY ((void *)-1UL) -static inline void *radix_tree_ptr_to_direct(void *ptr) +static inline void *radix_tree_ptr_to_indirect(void *ptr) { - return (void *)((unsigned long)ptr | RADIX_TREE_DIRECT_PTR); + return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); } -static inline void *radix_tree_direct_to_ptr(void *ptr) +static inline void *radix_tree_indirect_to_ptr(void *ptr) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_DIRECT_PTR); + return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } -static inline int radix_tree_is_direct_ptr(void *ptr) +static inline int radix_tree_is_indirect_ptr(void *ptr) { - return (int)((unsigned long)ptr & RADIX_TREE_DIRECT_PTR); + return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); } /*** radix-tree API starts here ***/ @@ -130,7 +133,10 @@ do { \ */ static inline void *radix_tree_deref_slot(void **pslot) { - return radix_tree_direct_to_ptr(*pslot); + void *ret = *pslot; + if (unlikely(radix_tree_is_indirect_ptr(ret))) + ret = RADIX_TREE_RETRY; + return ret; } /** * radix_tree_replace_slot - replace item in a slot @@ -142,10 +148,8 @@ static inline void *radix_tree_deref_slo */ static inline void radix_tree_replace_slot(void **pslot, void *item) { - BUG_ON(radix_tree_is_direct_ptr(item)); - rcu_assign_pointer(*pslot, - (void *)((unsigned long)item | - ((unsigned long)*pslot & RADIX_TREE_DIRECT_PTR))); + BUG_ON(radix_tree_is_indirect_ptr(item)); + rcu_assign_pointer(*pslot, item); } int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); Index: linux-rt.q/lib/radix-tree.c =================================================================== --- linux-rt.q.orig/lib/radix-tree.c +++ linux-rt.q/lib/radix-tree.c @@ -105,7 +105,7 @@ radix_tree_node_alloc(struct radix_tree_ } put_cpu_var(radix_tree_preloads); } - BUG_ON(radix_tree_is_direct_ptr(ret)); + BUG_ON(radix_tree_is_indirect_ptr(ret)); return ret; 
} @@ -243,7 +243,7 @@ static int radix_tree_extend(struct radi return -ENOMEM; /* Increase the height. */ - node->slots[0] = radix_tree_direct_to_ptr(root->rnode); + node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); /* Propagate the aggregated tag info into the new root */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { @@ -254,6 +254,7 @@ static int radix_tree_extend(struct radi newheight = root->height+1; node->height = newheight; node->count = 1; + node = radix_tree_ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); root->height = newheight; } while (height > root->height); @@ -277,7 +278,7 @@ int radix_tree_insert(struct radix_tree_ int offset; int error; - BUG_ON(radix_tree_is_direct_ptr(item)); + BUG_ON(radix_tree_is_indirect_ptr(item)); /* Make sure the tree is high enough. */ if (index > radix_tree_maxindex(root->height)) { @@ -286,7 +287,8 @@ int radix_tree_insert(struct radix_tree_ return error; } - slot = root->rnode; + slot = radix_tree_indirect_to_ptr(root->rnode); + height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; @@ -301,7 +303,8 @@ int radix_tree_insert(struct radix_tree_ rcu_assign_pointer(node->slots[offset], slot); node->count++; } else - rcu_assign_pointer(root->rnode, slot); + rcu_assign_pointer(root->rnode, + radix_tree_ptr_to_indirect(slot)); } /* Go a level down */ @@ -321,7 +324,7 @@ int radix_tree_insert(struct radix_tree_ BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); } else { - rcu_assign_pointer(root->rnode, radix_tree_ptr_to_direct(item)); + rcu_assign_pointer(root->rnode, item); BUG_ON(root_tag_get(root, 0)); BUG_ON(root_tag_get(root, 1)); } @@ -353,11 +356,12 @@ void **radix_tree_lookup_slot(struct rad if (node == NULL) return NULL; - if (radix_tree_is_direct_ptr(node)) { + if (!radix_tree_is_indirect_ptr(node)) { if (index > 0) return NULL; return (void **)&root->rnode; } + node = radix_tree_indirect_to_ptr(node); height = node->height; if (index > 
radix_tree_maxindex(height)) @@ -401,11 +405,12 @@ void *radix_tree_lookup(struct radix_tre if (node == NULL) return NULL; - if (radix_tree_is_direct_ptr(node)) { + if (!radix_tree_is_indirect_ptr(node)) { if (index > 0) return NULL; - return radix_tree_direct_to_ptr(node); + return node; } + node = radix_tree_indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) @@ -450,7 +455,7 @@ void *radix_tree_tag_set(struct radix_tr height = root->height; BUG_ON(index > radix_tree_maxindex(height)); - slot = root->rnode; + slot = radix_tree_indirect_to_ptr(root->rnode); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; while (height > 0) { @@ -500,7 +505,7 @@ void *radix_tree_tag_clear(struct radix_ shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; - slot = root->rnode; + slot = radix_tree_indirect_to_ptr(root->rnode); while (height > 0) { int offset; @@ -565,8 +570,9 @@ int radix_tree_tag_get(struct radix_tree if (node == NULL) return 0; - if (radix_tree_is_direct_ptr(node)) + if (!radix_tree_is_indirect_ptr(node)) return (index == 0); + node = radix_tree_indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) @@ -683,13 +689,13 @@ radix_tree_gang_lookup(struct radix_tree if (!node) return 0; - if (radix_tree_is_direct_ptr(node)) { + if (!radix_tree_is_indirect_ptr(node)) { if (first_index > 0) return 0; - node = radix_tree_direct_to_ptr(node); - results[0] = rcu_dereference(node); + results[0] = node; return 1; } + node = radix_tree_indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -811,13 +817,13 @@ radix_tree_gang_lookup_tag(struct radix_ if (!node) return 0; - if (radix_tree_is_direct_ptr(node)) { + if (!radix_tree_is_indirect_ptr(node)) { if (first_index > 0) return 0; - node = radix_tree_direct_to_ptr(node); - results[0] = rcu_dereference(node); + results[0] = node; return 1; } + node = radix_tree_indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ 
-847,12 +853,22 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag static inline void radix_tree_shrink(struct radix_tree_root *root) { /* try to shrink tree height */ - while (root->height > 0 && - root->rnode->count == 1 && - root->rnode->slots[0]) { + while (root->height > 0) { struct radix_tree_node *to_free = root->rnode; void *newptr; + BUG_ON(!radix_tree_is_indirect_ptr(to_free)); + to_free = radix_tree_indirect_to_ptr(to_free); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, we cannot shrink. + */ + if (to_free->count != 1) + break; + if (!to_free->slots[0]) + break; + /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another. If @@ -861,8 +877,8 @@ static inline void radix_tree_shrink(str * one (root->rnode). */ newptr = to_free->slots[0]; - if (root->height == 1) - newptr = radix_tree_ptr_to_direct(newptr); + if (root->height > 1) + newptr = radix_tree_ptr_to_indirect(newptr); root->rnode = newptr; root->height--; /* must only free zeroed nodes into the slab */ @@ -897,12 +913,12 @@ void *radix_tree_delete(struct radix_tre goto out; slot = root->rnode; - if (height == 0 && root->rnode) { - slot = radix_tree_direct_to_ptr(slot); + if (height == 0 /* XXX: bugfix? 
*/) { root_tag_clear_all(root); root->rnode = NULL; goto out; } + slot = radix_tree_indirect_to_ptr(slot); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; @@ -944,7 +960,8 @@ void *radix_tree_delete(struct radix_tre radix_tree_node_free(to_free); if (pathp->node->count) { - if (pathp->node == root->rnode) + if (pathp->node == + radix_tree_indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } patches/move-native-irq.patch0000664000077200007720000000167510646635211015602 0ustar mingomingo--- kernel/irq/migration.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) Index: linux-rt.q/kernel/irq/migration.c =================================================================== --- linux-rt.q.orig/kernel/irq/migration.c +++ linux-rt.q/kernel/irq/migration.c @@ -61,6 +61,7 @@ void move_masked_irq(int irq) void move_native_irq(int irq) { struct irq_desc *desc = irq_desc + irq; + int mask = 1; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -68,8 +69,17 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + /* + * If the irq is already in progress, it should be masked. + * If we unmask it, we might cause an interrupt storm on RT. 
+ */ + if (unlikely(desc->status & IRQ_INPROGRESS)) + mask = 0; + + if (mask) + desc->chip->mask(irq); move_masked_irq(irq); - desc->chip->unmask(irq); + if (mask) + desc->chip->unmask(irq); } patches/clockevents-fix-typo-in-acpi_pmc.patch0000664000077200007720000000140010646635210021012 0ustar mingomingoFrom: Alessio Igor Bogani Signed-off-by: Alessio Igor Bogani Cc: john stultz Signed-off-by: Andrew Morton --- drivers/clocksource/acpi_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/drivers/clocksource/acpi_pm.c =================================================================== --- linux-rt.q.orig/drivers/clocksource/acpi_pm.c +++ linux-rt.q/drivers/clocksource/acpi_pm.c @@ -71,7 +71,7 @@ static struct clocksource clocksource_ac .rating = 200, .read = acpi_pm_read, .mask = (cycle_t)ACPI_PM_MASK, - .mult = 0, /*to be caluclated*/ + .mult = 0, /*to be calculated*/ .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS, patches/arm-fix-atomic-cmpxchg.patch0000664000077200007720000000116310646635213017015 0ustar mingomingo--- include/asm-arm/atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/include/asm-arm/atomic.h =================================================================== --- linux-rt.q.orig/include/asm-arm/atomic.h +++ linux-rt.q/include/asm-arm/atomic.h @@ -189,10 +189,10 @@ static inline unsigned long __cmpxchg(vo volatile unsigned long *p = ptr; if (size == 4) { - local_irq_save(flags); + raw_local_irq_save(flags); if ((prev = *p) == old) *p = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return(prev); } else return wrong_size_cmpxchg(ptr); patches/cputimer-thread-rt-fix.patch0000664000077200007720000000312310646635214017054 0ustar mingomingo--- kernel/posix-cpu-timers.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) Index: linux-rt.q/kernel/posix-cpu-timers.c =================================================================== --- 
linux-rt.q.orig/kernel/posix-cpu-timers.c +++ linux-rt.q/kernel/posix-cpu-timers.c @@ -1292,18 +1292,6 @@ void __run_posix_cpu_timers(struct task_ LIST_HEAD(firing); struct k_itimer *timer, *next; - -#define UNEXPIRED(clock) \ - (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ - cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - - if (UNEXPIRED(prof) && UNEXPIRED(virt) && - (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) - return; - -#undef UNEXPIRED - /* * Double-check with locks held. */ @@ -1428,6 +1416,19 @@ void run_posix_cpu_timers(struct task_st BUG_ON(!irqs_disabled()); if(!per_cpu(posix_timer_task, cpu)) return; + + +#define UNEXPIRED(clock) \ + (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ + cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) + + if (UNEXPIRED(prof) && UNEXPIRED(virt) && + (tsk->it_sched_expires == 0 || + tsk->sum_exec_runtime < tsk->it_sched_expires)) + return; + +#undef UNEXPIRED + /* get per-cpu references */ tasklist = per_cpu(posix_timer_tasklist, cpu); @@ -1446,7 +1447,7 @@ void run_posix_cpu_timers(struct task_st per_cpu(posix_timer_tasklist, cpu) = tsk; } /* XXX signal the thread somehow */ - wake_up_process(per_cpu(posix_timer_task,cpu)); + wake_up_process(per_cpu(posix_timer_task, cpu)); } patches/preempt-realtime-ppc-more-resched-fixups.patch0000664000077200007720000000536610646635214022506 0ustar mingomingo--- arch/powerpc/kernel/entry_64.S | 16 +++++++++++----- arch/powerpc/kernel/idle.c | 4 ++-- include/asm-powerpc/thread_info.h | 3 ++- 3 files changed, 15 insertions(+), 8 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/entry_64.S +++ linux-rt.q/arch/powerpc/kernel/entry_64.S @@ -449,7 +449,8 @@ _GLOBAL(ret_from_except_lite) #ifdef CONFIG_PREEMPT clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ - li r0,_TIF_NEED_RESCHED /* bits 
to check */ + li r0,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) + /* bits to check */ ld r3,_MSR(r1) ld r4,TI_FLAGS(r9) /* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */ @@ -558,16 +559,21 @@ do_work: cmpdi r0,0 crandc eq,cr1*4+eq,eq bne restore + /* here we are preempting the current task */ 1: - /* preempt_schedule_irq() expects interrupts disabled. */ - bl .preempt_schedule_irq + li r0,1 + stb r0,PACASOFTIRQEN(r13) + stb r0,PACAHARDIRQEN(r13) + ori r10,r10,MSR_EE + mtmsrd r10,1 /* reenable interrupts */ + bl .preempt_schedule mfmsr r10 clrrdi r9,r1,THREAD_SHIFT rldicl r10,r10,48,1 /* disable interrupts again */ rotldi r10,r10,16 mtmsrd r10,1 ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_NEED_RESCHED + andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne 1b b restore @@ -582,7 +588,7 @@ user_work: ori r10,r10,MSR_EE mtmsrd r10,1 - andi. r0,r4,_TIF_NEED_RESCHED + andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq 1f bl .schedule b .ret_from_except_lite Index: linux-rt.q/arch/powerpc/kernel/idle.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/idle.c +++ linux-rt.q/arch/powerpc/kernel/idle.c @@ -61,8 +61,8 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { tick_nohz_stop_sched_tick(); - - while (!need_resched() && !cpu_should_die()) { + while (!need_resched() && !need_resched_delayed() && + !cpu_should_die()) { ppc64_runlatch_off(); if (ppc_md.power_save) { Index: linux-rt.q/include/asm-powerpc/thread_info.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/thread_info.h +++ linux-rt.q/include/asm-powerpc/thread_info.h @@ -146,7 +146,8 @@ static inline struct thread_info *curren #define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP) #define _TIF_USER_WORK_MASK (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ - _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK) + _TIF_NEED_RESCHED | 
_TIF_RESTORE_SIGMASK | \ + _TIF_NEED_RESCHED_DELAYED) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) /* Bits in local_flags */ patches/x86_64-prep-idle-loop-for-dynticks.patch0000664000077200007720000000231110646635211020741 0ustar mingomingoSubject: x86_64: prepare idle loop for dynamic ticks From: Chris Wright Add tick_nohz_{stop,restart}_sched_tick to idle loop in preparation for turning on dynticks. These are just no-ops until NO_HZ is enabled. Signed-off-by: Chris Wright Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/process.c | 4 ++++ 1 file changed, 4 insertions(+) Index: linux-rt.q/arch/x86_64/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/process.c +++ linux-rt.q/arch/x86_64/kernel/process.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -207,6 +208,8 @@ void cpu_idle (void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + tick_nohz_stop_sched_tick(); + rmb(); idle = pm_idle; if (!idle) @@ -227,6 +230,7 @@ void cpu_idle (void) __exit_idle(); } + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); patches/preempt-realtime-net-softirq-fixups.patch0000664000077200007720000000271610646635215021621 0ustar mingomingoSubject: NOHZ: local_softirq_pending with tickless From: Mikulas Patocka quota += dev->weight; else dev->quota = dev->weight; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__netif_rx_schedule); @@ -2060,7 +2060,7 @@ out: softnet_break: __get_cpu_var(netdev_rx_stat).time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } patches/irda-fix.patch0000664000077200007720000000174110646635216014261 0ustar mingomingoThis was found around the 2.6.10 timeframe when testing with the -rt patch and I believe it is still an 
issue. irttp_dup() does a memcpy() of the tsap_cb structure causing the spinlock protecting various fields in the structure to be duped. This works OK in the non-RT case but in the RT case we end up with two mutexes pointing to the same wait_list and leading to an OOPS. Fix is to simply initialize the spinlock after the memcpy(). Signed-off-by: Deepak Saxena --- net/irda/irttp.c | 1 + 1 file changed, 1 insertion(+) Index: linux-rt.q/net/irda/irttp.c =================================================================== --- linux-rt.q.orig/net/irda/irttp.c +++ linux-rt.q/net/irda/irttp.c @@ -1441,6 +1441,7 @@ struct tsap_cb *irttp_dup(struct tsap_cb } /* Dup */ memcpy(new, orig, sizeof(struct tsap_cb)); + spin_lock_init(&new->lock); /* We don't need the old instance any more */ spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags); patches/trace-with-caller-addr.patch0000664000077200007720000000670110646635212016772 0ustar mingomingo--- arch/x86_64/lib/thunk.S | 18 ++++++++++++++++-- kernel/latency_trace.c | 22 ++++++++++++++++++++++ kernel/lockdep.c | 16 ++++++++++++---- 3 files changed, 50 insertions(+), 6 deletions(-) Index: linux-rt.q/arch/x86_64/lib/thunk.S =================================================================== --- linux-rt.q.orig/arch/x86_64/lib/thunk.S +++ linux-rt.q/arch/x86_64/lib/thunk.S @@ -47,8 +47,22 @@ thunk __up_wakeup,__up #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushs 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif /* SAVE_ARGS below is used only for the .cfi directives it contains. 
*/ Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -1986,6 +1986,28 @@ void notrace trace_hardirqs_off(void) EXPORT_SYMBOL(trace_hardirqs_off); +/* used by x86_64 thunk.S */ +void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(caller_addr, 0 /* CALLER_ADDR1 */); +} + +void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(caller_addr, 0 /* CALLER_ADDR1 */, + INTERRUPT_LATENCY); +} + #endif /* !CONFIG_LOCKDEP */ #endif /* CONFIG_CRITICAL_IRQSOFF_TIMING */ Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -1842,7 +1842,7 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void notrace trace_hardirqs_on(void) +void notrace trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; @@ -1884,16 +1884,20 @@ void notrace trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); #ifdef CONFIG_CRITICAL_IRQSOFF_TIMING - time_hardirqs_on(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); + time_hardirqs_on(a0, 0 /* CALLER_ADDR1 */); #endif } +void notrace trace_hardirqs_on(void) { + trace_hardirqs_on_caller(CALLER_ADDR0); +} + EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void notrace trace_hardirqs_off(void) +void notrace trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; @@ -1912,12 +1916,16 @@ void notrace trace_hardirqs_off(void) 
curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_off_events); #ifdef CONFIG_CRITICAL_IRQSOFF_TIMING - time_hardirqs_off(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); + time_hardirqs_off(a0, 0 /* CALLER_ADDR1 */); #endif } else debug_atomic_inc(&redundant_hardirqs_off); } +void notrace trace_hardirqs_off(void) { + trace_hardirqs_off_caller(CALLER_ADDR0); +} + EXPORT_SYMBOL(trace_hardirqs_off); /* patches/serial-slow-machines.patch0000664000077200007720000000316310646635214016602 0ustar mingomingo--- drivers/char/tty_io.c | 4 ++++ drivers/serial/8250.c | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) Index: linux-rt.q/drivers/char/tty_io.c =================================================================== --- linux-rt.q.orig/drivers/char/tty_io.c +++ linux-rt.q/drivers/char/tty_io.c @@ -3624,10 +3624,14 @@ void tty_flip_buffer_push(struct tty_str tty->buf.tail->commit = tty->buf.tail->used; spin_unlock_irqrestore(&tty->buf.lock, flags); +#ifndef CONFIG_PREEMPT_RT if (tty->low_latency) flush_to_ldisc(&tty->buf.work.work); else schedule_delayed_work(&tty->buf.work, 1); +#else + flush_to_ldisc(&tty->buf.work.work); +#endif } EXPORT_SYMBOL(tty_flip_buffer_push); Index: linux-rt.q/drivers/serial/8250.c =================================================================== --- linux-rt.q.orig/drivers/serial/8250.c +++ linux-rt.q/drivers/serial/8250.c @@ -1451,7 +1451,10 @@ static irqreturn_t serial8250_interrupt( { struct irq_info *i = dev_id; struct list_head *l, *end = NULL; - int pass_counter = 0, handled = 0; +#ifndef CONFIG_PREEMPT_RT + int pass_counter = 0; +#endif + int handled = 0; DEBUG_INTR("serial8250_interrupt(%d)...", irq); @@ -1489,12 +1492,18 @@ static irqreturn_t serial8250_interrupt( l = l->next; + /* + * On preempt-rt we can be preempted and run in our + * own thread. + */ +#ifndef CONFIG_PREEMPT_RT if (l == i->head && pass_counter++ > PASS_LIMIT) { /* If we hit this, we're dead. 
*/ printk(KERN_ERR "serial8250: too much work for " "irq%d\n", irq); break; } +#endif } while (l != end); spin_unlock(&i->lock); patches/clockevents-remove-prototypes-of-removed-functions.patch0000664000077200007720000000176210646635210024663 0ustar mingomingoFrom: Thomas Gleixner Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton --- include/linux/clockchips.h | 4 ---- 1 file changed, 4 deletions(-) Index: linux-rt.q/include/linux/clockchips.h =================================================================== --- linux-rt.q.orig/include/linux/clockchips.h +++ linux-rt.q/include/linux/clockchips.h @@ -119,10 +119,6 @@ extern void clockevents_register_device( extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); -extern -struct clock_event_device *clockevents_request_device(unsigned int features, - cpumask_t cpumask); -extern void clockevents_release_device(struct clock_event_device *dev); extern void clockevents_set_mode(struct clock_event_device *dev, enum clock_event_mode mode); extern int clockevents_register_notifier(struct notifier_block *nb); patches/preempt-rt-cs5530-lock-ide-fix.patch0000664000077200007720000000160510646635214020043 0ustar mingomingo drivers/ide/pci/cs5530.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) Index: linux-rt.q/drivers/ide/pci/cs5530.c =================================================================== --- linux-rt.q.orig/drivers/ide/pci/cs5530.c +++ linux-rt.q/drivers/ide/pci/cs5530.c @@ -227,8 +227,8 @@ static unsigned int __devinit init_chips goto out; } - spin_lock_irqsave(&ide_lock, flags); - /* all CPUs (there should only be one CPU with this chipset) */ + /* Local CPU. ide_lock is acquired in do_ide_setup_pci_device. 
*/ + local_irq_save(flags); /* * Enable BusMaster and MemoryWriteAndInvalidate for the cs5530: @@ -280,7 +280,7 @@ static unsigned int __devinit init_chips pci_write_config_byte(master_0, 0x42, 0x00); pci_write_config_byte(master_0, 0x43, 0xc1); - spin_unlock_irqrestore(&ide_lock, flags); + local_irq_restore(flags); out: pci_dev_put(master_0); patches/kmap-atomic-prepare.patch0000664000077200007720000001105010646635216016406 0ustar mingomingo With the separation of pagefault_{disable,enable}() from the preempt_count a previously overlooked dependancy became painfully clear. kmap_atomic() is per cpu and relies not only on disabling the pagefault handler, but really needs preemption disabled too. make this explicit now - so that we can change pagefault_disable(). Signed-off-by: Peter Zijlstra --- arch/i386/mm/highmem.c | 4 +++- arch/mips/mm/highmem.c | 5 ++++- arch/sparc/mm/highmem.c | 4 +++- include/asm-frv/highmem.h | 2 ++ include/asm-ppc/highmem.h | 4 +++- 5 files changed, 15 insertions(+), 4 deletions(-) Index: linux-rt.q/arch/i386/mm/highmem.c =================================================================== --- linux-rt.q.orig/arch/i386/mm/highmem.c +++ linux-rt.q/arch/i386/mm/highmem.c @@ -51,7 +51,7 @@ void *__kmap_atomic_prot(struct page *pa enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); @@ -94,6 +94,7 @@ void __kunmap_atomic(void *kvaddr, enum arch_flush_lazy_mmu_mode(); pagefault_enable(); + preempt_enable(); } /* This is the same as kmap_atomic() but can map memory that doesn't @@ -104,6 +105,7 @@ void *__kmap_atomic_pfn(unsigned long pf enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); Index: linux-rt.q/arch/mips/mm/highmem.c =================================================================== --- 
linux-rt.q.orig/arch/mips/mm/highmem.c +++ linux-rt.q/arch/mips/mm/highmem.c @@ -38,7 +38,7 @@ void *__kmap_atomic(struct page *page, e enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr, enum if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -78,6 +79,7 @@ void __kunmap_atomic(void *kvaddr, enum #endif pagefault_enable(); + preempt_enable(); } /* @@ -89,6 +91,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); Index: linux-rt.q/arch/sparc/mm/highmem.c =================================================================== --- linux-rt.q.orig/arch/sparc/mm/highmem.c +++ linux-rt.q/arch/sparc/mm/highmem.c @@ -34,7 +34,7 @@ void *kmap_atomic(struct page *page, enu unsigned long idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -71,6 +71,7 @@ void kunmap_atomic(void *kvaddr, enum km if (vaddr < FIXADDR_START) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -97,6 +98,7 @@ void kunmap_atomic(void *kvaddr, enum km #endif pagefault_enable(); + preempt_enable(); } /* We may be fed a pagetable here by ptep_to_xxx and others. 
*/ Index: linux-rt.q/include/asm-frv/highmem.h =================================================================== --- linux-rt.q.orig/include/asm-frv/highmem.h +++ linux-rt.q/include/asm-frv/highmem.h @@ -115,6 +115,7 @@ static inline void *kmap_atomic(struct p { unsigned long paddr; + preempt_disable(); pagefault_disable(); paddr = page_to_phys(page); @@ -171,6 +172,7 @@ static inline void kunmap_atomic(void *k BUG(); } pagefault_enable(); + preempt_enable(); } #endif /* !__ASSEMBLY__ */ Index: linux-rt.q/include/asm-ppc/highmem.h =================================================================== --- linux-rt.q.orig/include/asm-ppc/highmem.h +++ linux-rt.q/include/asm-ppc/highmem.h @@ -78,7 +78,7 @@ static inline void *kmap_atomic(struct p unsigned int idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -102,6 +102,7 @@ static inline void kunmap_atomic(void *k if (vaddr < KMAP_FIX_BEGIN) { // FIXME pagefault_enable(); + preempt_enable(); return; } @@ -115,6 +116,7 @@ static inline void kunmap_atomic(void *k flush_tlb_page(NULL, vaddr); #endif pagefault_enable(); + preempt_enable(); } static inline struct page *kmap_atomic_to_page(void *ptr) patches/arm-latency-tracer-support.patch0000664000077200007720000000511710646635212017757 0ustar mingomingoadd latency tracer support for EP93xx boards Add latency tracer support for the EP93xx platform. This is done by: - adding the correct Kconfig options - add (an empty) save_stack_trace implementation. -> Someone needs to implement save_stack_trace for arm :) Maybe we can use the implementation from rmk? 
- implementing mach_read_cycles (read out EP93XX_TIMER4_VALUE_LOW) - implementing mach_cycles_to_usecs (just the same way as for the PXA platform) - implementing mach_usecs_to_cycles (just the same way as for the PXA platform) Signed-off-by: Jan Altenberg --- arch/arm/Kconfig | 4 ++++ arch/arm/lib/Makefile | 1 + arch/arm/lib/stacktrace.c | 7 +++++++ include/asm-arm/arch-ep93xx/timex.h | 6 ++++++ 4 files changed, 18 insertions(+) Index: linux-rt.q/arch/arm/Kconfig =================================================================== --- linux-rt.q.orig/arch/arm/Kconfig +++ linux-rt.q/arch/arm/Kconfig @@ -33,6 +33,10 @@ config GENERIC_CLOCKEVENTS bool default n +config STACKTRACE_SUPPORT + bool + default y + config MMU bool default y Index: linux-rt.q/arch/arm/lib/Makefile =================================================================== --- linux-rt.q.orig/arch/arm/lib/Makefile +++ linux-rt.q/arch/arm/lib/Makefile @@ -41,6 +41,7 @@ lib-$(CONFIG_ARCH_RPC) += ecard.o io-ac lib-$(CONFIG_ARCH_CLPS7500) += io-acorn.o lib-$(CONFIG_ARCH_L7200) += io-acorn.o lib-$(CONFIG_ARCH_SHARK) += io-shark.o +lib-$(CONFIG_STACKTRACE) += stacktrace.o $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S Index: linux-rt.q/arch/arm/lib/stacktrace.c =================================================================== --- /dev/null +++ linux-rt.q/arch/arm/lib/stacktrace.c @@ -0,0 +1,7 @@ +#include +#include + +void save_stack_trace(struct stack_trace *trace) +{ +} + Index: linux-rt.q/include/asm-arm/arch-ep93xx/timex.h =================================================================== --- linux-rt.q.orig/include/asm-arm/arch-ep93xx/timex.h +++ linux-rt.q/include/asm-arm/arch-ep93xx/timex.h @@ -1,5 +1,11 @@ /* * linux/include/asm-arm/arch-ep93xx/timex.h */ +#include +#include #define CLOCK_TICK_RATE 983040 + +#define mach_read_cycles() __raw_readl(EP93XX_TIMER4_VALUE_LOW) +#define mach_cycles_to_usecs(d) (((d) * ((1000000LL << 
32) / CLOCK_TICK_RATE)) >> 32) +#define mach_usecs_to_cycles(d) (((d) * (((long long)CLOCK_TICK_RATE << 32) / 1000000)) >> 32) patches/vortex-fix.patch0000664000077200007720000000512110646635214014663 0ustar mingomingo Argh, cut and paste wasn't enough... Use this patch instead. It needs an irq disable. But, believe it or not, on SMP this is actually better. If the irq is shared (as it is in Mark's case), we don't stop the irq of other devices from being handled on another CPU (unfortunately for Mark, he pinned all interrupts to one CPU). Andrew, should this be changed in mainline too? -- Steve Signed-off-by: Steven Rostedt drivers/net/3c59x.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) Index: linux-rt.q/drivers/net/3c59x.c =================================================================== --- linux-rt.q.orig/drivers/net/3c59x.c +++ linux-rt.q/drivers/net/3c59x.c @@ -792,9 +792,9 @@ static void poll_vortex(struct net_devic { struct vortex_private *vp = netdev_priv(dev); unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); (vp->full_bus_master_rx ? 
boomerang_interrupt:vortex_interrupt)(dev->irq,dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1728,6 +1728,7 @@ vortex_timer(unsigned long data) int next_tick = 60*HZ; int ok = 0; int media_status, old_window; + unsigned long flags; if (vortex_debug > 2) { printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n", @@ -1735,7 +1736,7 @@ vortex_timer(unsigned long data) printk(KERN_DEBUG "dev->watchdog_timeo=%d\n", dev->watchdog_timeo); } - disable_irq_lockdep(dev->irq); + spin_lock_irqsave(&vp->lock, flags); old_window = ioread16(ioaddr + EL3_CMD) >> 13; EL3WINDOW(4); media_status = ioread16(ioaddr + Wn4_Media); @@ -1758,9 +1759,7 @@ vortex_timer(unsigned long data) case XCVR_MII: case XCVR_NWAY: { ok = 1; - spin_lock_bh(&vp->lock); vortex_check_media(dev, 0); - spin_unlock_bh(&vp->lock); } break; default: /* Other media types handled by Tx timeouts. */ @@ -1816,7 +1815,7 @@ leave_media_alone: dev->name, media_tbl[dev->if_port].name); EL3WINDOW(old_window); - enable_irq_lockdep(dev->irq); + spin_unlock_irqrestore(&vp->lock, flags); mod_timer(&vp->timer, RUN_AT(next_tick)); if (vp->deferred) iowrite16(FakeIntr, ioaddr + EL3_CMD); @@ -1849,13 +1848,17 @@ static void vortex_tx_timeout(struct net /* * Block interrupts because vortex_interrupt does a bare spin_lock() */ +#ifndef CONFIG_PREEMPT_RT unsigned long flags; local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev); else vortex_interrupt(dev->irq, dev); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } patches/preempt-realtime-i386.patch0000664000077200007720000007616310646635215016532 0ustar mingomingo--- arch/i386/Kconfig.debug | 2 + arch/i386/kernel/apic.c | 2 - arch/i386/kernel/cpu/mtrr/generic.c | 2 - arch/i386/kernel/entry.S | 14 ++++++---- arch/i386/kernel/head.S | 1 arch/i386/kernel/i8253.c | 2 - arch/i386/kernel/i8259.c | 2 - arch/i386/kernel/io_apic.c | 4 +-- arch/i386/kernel/irq.c | 4 ++- 
arch/i386/kernel/microcode.c | 2 - arch/i386/kernel/nmi.c | 5 +++ arch/i386/kernel/process.c | 18 ++++++++++--- arch/i386/kernel/signal.c | 14 ++++++++++ arch/i386/kernel/smp.c | 23 ++++++++++++----- arch/i386/kernel/time.c | 2 - arch/i386/kernel/traps.c | 29 +++++++++++++++++---- arch/i386/kernel/vm86.c | 1 arch/i386/mm/fault.c | 7 +++-- arch/i386/mm/highmem.c | 37 +++++++++++++++++++++------- arch/i386/mm/pgtable.c | 2 - arch/i386/oprofile/Kconfig | 3 ++ arch/i386/pci/common.c | 2 - arch/i386/pci/direct.c | 29 ++++++++++++++------- arch/i386/pci/pci.h | 2 - include/asm-i386/acpi.h | 4 +-- include/asm-i386/dma.h | 2 - include/asm-i386/highmem.h | 27 ++++++++++++++++++++ include/asm-i386/i8253.h | 2 - include/asm-i386/i8259.h | 2 - include/asm-i386/mach-default/irq_vectors.h | 2 - include/asm-i386/mc146818rtc.h | 2 - include/asm-i386/pgtable.h | 2 - include/asm-i386/tlbflush.h | 26 +++++++++++++++++++ include/asm-i386/xor.h | 21 +++++++++++++-- 34 files changed, 235 insertions(+), 64 deletions(-) Index: linux-rt.q/arch/i386/Kconfig.debug =================================================================== --- linux-rt.q.orig/arch/i386/Kconfig.debug +++ linux-rt.q/arch/i386/Kconfig.debug @@ -49,6 +49,7 @@ config DEBUG_PAGEALLOC config DEBUG_RODATA bool "Write protect kernel read-only data structures" depends on DEBUG_KERNEL + default y help Mark the kernel read-only data as write-protected in the pagetables, in order to catch accidental (and incorrect) writes to such const @@ -59,6 +60,7 @@ config DEBUG_RODATA config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" depends on DEBUG_KERNEL + default y help If you say Y here the kernel will use a 4Kb stacksize for the kernel stack attached to each process/thread. 
This facilitates Index: linux-rt.q/arch/i386/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/apic.c +++ linux-rt.q/arch/i386/kernel/apic.c @@ -579,7 +579,7 @@ static void local_apic_timer_interrupt(v * interrupt as well. Thus we cannot inline the local irq ... ] */ -void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +void fastcall notrace smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); Index: linux-rt.q/arch/i386/kernel/cpu/mtrr/generic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/cpu/mtrr/generic.c +++ linux-rt.q/arch/i386/kernel/cpu/mtrr/generic.c @@ -330,7 +330,7 @@ static unsigned long set_mtrr_state(void static unsigned long cr4 = 0; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they Index: linux-rt.q/arch/i386/kernel/entry.S =================================================================== --- linux-rt.q.orig/arch/i386/kernel/entry.S +++ linux-rt.q/arch/i386/kernel/entry.S @@ -264,14 +264,18 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? 
- jz restore_all + jz restore_nocheck + DISABLE_INTERRUPTS(CLBR_ANY) + call preempt_schedule_irq jmp need_resched END(resume_kernel) @@ -483,11 +487,11 @@ work_pending: testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) + call __schedule + # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? Index: linux-rt.q/arch/i386/kernel/head.S =================================================================== --- linux-rt.q.orig/arch/i386/kernel/head.S +++ linux-rt.q/arch/i386/kernel/head.S @@ -486,6 +486,7 @@ ignore_int: call printk #endif addl $(5*4),%esp + call dump_stack popl %ds popl %es popl %edx Index: linux-rt.q/arch/i386/kernel/i8253.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/i8253.c +++ linux-rt.q/arch/i386/kernel/i8253.c @@ -14,7 +14,7 @@ #include #include -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); /* Index: linux-rt.q/arch/i386/kernel/i8259.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/i8259.c +++ linux-rt.q/arch/i386/kernel/i8259.c @@ -34,7 +34,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { Index: linux-rt.q/arch/i386/kernel/io_apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/io_apic.c +++ linux-rt.q/arch/i386/kernel/io_apic.c @@ -56,8 +56,8 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, 
apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; Index: linux-rt.q/arch/i386/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/irq.c +++ linux-rt.q/arch/i386/kernel/irq.c @@ -79,6 +79,8 @@ fastcall notrace unsigned int do_IRQ(str u32 *isp; #endif + irq_show_regs_callback(smp_processor_id(), regs); + if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __FUNCTION__, irq); @@ -100,7 +102,7 @@ fastcall notrace unsigned int do_IRQ(str __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } Index: linux-rt.q/arch/i386/kernel/microcode.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/microcode.c +++ linux-rt.q/arch/i386/kernel/microcode.c @@ -116,7 +116,7 @@ MODULE_LICENSE("GPL"); #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); Index: linux-rt.q/arch/i386/kernel/nmi.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/nmi.c +++ linux-rt.q/arch/i386/kernel/nmi.c @@ -62,7 +62,12 @@ static int endflag __initdata = 0; */ static __init void nmi_cpu_busy(void *data) { + /* + * avoid a warning, on PREEMPT_RT this wont run in hardirq context: + */ +#ifndef CONFIG_PREEMPT_RT 
local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the Index: linux-rt.q/arch/i386/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/process.c +++ linux-rt.q/arch/i386/kernel/process.c @@ -200,12 +200,14 @@ void cpu_idle(void) __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } + local_irq_disable(); trace_preempt_exit_idle(); tick_nohz_restart_sched_tick(); __preempt_enable_no_resched(); - schedule(); + __schedule(); preempt_disable(); trace_preempt_enter_idle(); + local_irq_enable(); } } @@ -371,15 +373,23 @@ void exit_thread(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + void *io_bitmap_ptr = t->io_bitmap_ptr; + int cpu; + struct tss_struct *tss; - kfree(t->io_bitmap_ptr); + /* + * On PREEMPT_RT we must not call kfree() with + * preemption disabled, so we first zap the pointer: + */ t->io_bitmap_ptr = NULL; + kfree(io_bitmap_ptr); + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; Index: linux-rt.q/arch/i386/kernel/signal.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/signal.c +++ linux-rt.q/arch/i386/kernel/signal.c @@ -533,6 +533,13 @@ handle_signal(unsigned long sig, siginfo } } +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so * that register information in the 
sigcontext is correct. @@ -573,6 +580,13 @@ static void fastcall do_signal(struct pt struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-rt.q/arch/i386/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/smp.c +++ linux-rt.q/arch/i386/kernel/smp.c @@ -246,7 +246,7 @@ void send_IPI_mask_sequence(cpumask_t ma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); /* * We cannot call mmdrop() because we are in interrupt context, @@ -474,10 +474,20 @@ static void native_smp_send_reschedule(i } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -632,13 +642,14 @@ static void native_smp_send_stop(void) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. 
*/ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) Index: linux-rt.q/arch/i386/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/time.c +++ linux-rt.q/arch/i386/kernel/time.c @@ -124,7 +124,7 @@ static int set_rtc_mmss(unsigned long no int timer_ack; -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); Index: linux-rt.q/arch/i386/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/traps.c +++ linux-rt.q/arch/i386/kernel/traps.c @@ -275,6 +275,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -303,8 +309,12 @@ void show_registers(struct pt_regs *regs regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); + + printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x " + " preempt:%08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, + ss, preempt_count()); + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, current_thread_info(), current, task_thread_info(current)); @@ -364,11 +374,11 @@ int is_valid_bugaddr(unsigned long eip) void die(const char * str, struct pt_regs 
* regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -474,6 +484,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { /* * We want error_code and trap_no set for userspace faults and @@ -713,10 +728,11 @@ void __kprobes die_nmi(struct pt_regs *r crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } -static __kprobes void default_do_nmi(struct pt_regs * regs) +static notrace __kprobes void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -754,11 +770,12 @@ static __kprobes void default_do_nmi(str reassert_nmi(); } -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) +fastcall notrace __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); Index: linux-rt.q/arch/i386/kernel/vm86.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/vm86.c +++ linux-rt.q/arch/i386/kernel/vm86.c @@ -137,6 +137,7 @@ struct pt_regs * fastcall save_v86_state local_irq_enable(); if (!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } Index: linux-rt.q/arch/i386/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/i386/mm/fault.c +++ linux-rt.q/arch/i386/mm/fault.c @@ -295,8 +295,8 @@ static inline int vmalloc_fault(unsigned * bit 3 == 1 means use of reserved bit detected * bit 4 == 1 means fault was an instruction fetch */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) +fastcall notrace void __kprobes 
do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -306,6 +306,7 @@ fastcall void __kprobes do_page_fault(st /* get the address */ address = read_cr2(); + trace_special(regs->eip, error_code, address); tsk = current; @@ -489,6 +490,8 @@ bad_area_nosemaphore: if (nr == 6) { stop_trace(); + user_trace_stop(); + zap_rt_locks(); do_invalid_op(regs, 0); return; } Index: linux-rt.q/arch/i386/mm/highmem.c =================================================================== --- linux-rt.q.orig/arch/i386/mm/highmem.c +++ linux-rt.q/arch/i386/mm/highmem.c @@ -18,6 +18,26 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,7 +46,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
*/ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -47,12 +67,12 @@ void *kmap_atomic_prot(struct page *page return (void*) vaddr; } -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -79,7 +99,7 @@ void kunmap_atomic(void *kvaddr, enum km /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -94,7 +114,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -109,6 +129,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux-rt.q/arch/i386/mm/pgtable.c =================================================================== --- linux-rt.q.orig/arch/i386/mm/pgtable.c +++ linux-rt.q/arch/i386/mm/pgtable.c @@ -208,7 +208,7 @@ void pmd_ctor(void *pmd, struct kmem_cac * vmalloc faults work because attached pagetables are never freed. 
* -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) Index: linux-rt.q/arch/i386/oprofile/Kconfig =================================================================== --- linux-rt.q.orig/arch/i386/oprofile/Kconfig +++ linux-rt.q/arch/i386/oprofile/Kconfig @@ -15,3 +15,6 @@ config OPROFILE If unsure, say N. +config PROFILE_NMI + bool + default y Index: linux-rt.q/arch/i386/pci/common.c =================================================================== --- linux-rt.q.orig/arch/i386/pci/common.c +++ linux-rt.q/arch/i386/pci/common.c @@ -52,7 +52,7 @@ int pcibios_scanned; * This interrupt-safe spinlock protects all accesses to PCI * configuration space. */ -DEFINE_SPINLOCK(pci_config_lock); +DEFINE_RAW_SPINLOCK(pci_config_lock); /* * Several buggy motherboards address only 16 devices and mirror Index: linux-rt.q/arch/i386/pci/direct.c =================================================================== --- linux-rt.q.orig/arch/i386/pci/direct.c +++ linux-rt.q/arch/i386/pci/direct.c @@ -220,16 +220,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -239,17 +246,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - 
pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } Index: linux-rt.q/arch/i386/pci/pci.h =================================================================== --- linux-rt.q.orig/arch/i386/pci/pci.h +++ linux-rt.q/arch/i386/pci/pci.h @@ -78,7 +78,7 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; extern int pcibios_scanned; -extern spinlock_t pci_config_lock; +extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); Index: linux-rt.q/include/asm-i386/acpi.h =================================================================== --- linux-rt.q.orig/include/asm-i386/acpi.h +++ linux-rt.q/include/asm-i386/acpi.h @@ -52,8 +52,8 @@ #define ACPI_ASM_MACROS #define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_DISABLE_IRQS() local_irq_disable_nort() +#define ACPI_ENABLE_IRQS() local_irq_enable_nort() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); Index: linux-rt.q/include/asm-i386/dma.h =================================================================== --- linux-rt.q.orig/include/asm-i386/dma.h +++ linux-rt.q/include/asm-i386/dma.h @@ -134,7 +134,7 @@ #define DMA_AUTOINIT 0x10 -extern spinlock_t dma_spin_lock; +extern spinlock_t dma_spin_lock; static __inline__ unsigned long claim_dma_lock(void) { Index: linux-rt.q/include/asm-i386/highmem.h =================================================================== --- linux-rt.q.orig/include/asm-i386/highmem.h +++ linux-rt.q/include/asm-i386/highmem.h @@ -67,6 +67,16 @@ extern void * FASTCALL(kmap_high(struct extern void 
FASTCALL(kunmap_high(struct page *page)); void *kmap(struct page *page); +extern void kunmap_virt(void *ptr); +extern struct page *kmap_to_page(void *ptr); +void kunmap(struct page *page); + +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); +void *__kmap_atomic(struct page *page, enum km_type type); +void __kunmap_atomic(void *kvaddr, enum km_type type); +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type); +struct page *__kmap_atomic_to_page(void *ptr); + void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); @@ -80,6 +90,23 @@ struct page *kmap_atomic_to_page(void *p #define flush_cache_kmaps() do { } while (0) +/* + * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap(): + */ +#ifdef CONFIG_PREEMPT_RT +# define kmap_atomic_prot(page, type, prot) kmap(page) +# define kmap_atomic(page, type) kmap(page) +# define kmap_atomic_pfn(pfn, type) kmap(pfn_to_page(pfn)) +# define kunmap_atomic(kvaddr, type) kunmap_virt(kvaddr) +# define kmap_atomic_to_page(kvaddr) kmap_to_page(kvaddr) +#else +# define kmap_atomic_prot(page, type, prot) __kmap_atomic_prot(page, type, prot) +# define kmap_atomic(page, type) __kmap_atomic(page, type) +# define kmap_atomic_pfn(pfn, type) __kmap_atomic_pfn(pfn, type) +# define kunmap_atomic(kvaddr, type) __kunmap_atomic(kvaddr, type) +# define kmap_atomic_to_page(kvaddr) __kmap_atomic_to_page(kvaddr) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ Index: linux-rt.q/include/asm-i386/i8253.h =================================================================== --- linux-rt.q.orig/include/asm-i386/i8253.h +++ linux-rt.q/include/asm-i386/i8253.h @@ -6,7 +6,7 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 -extern spinlock_t i8253_lock; +extern raw_spinlock_t i8253_lock; extern struct clock_event_device *global_clock_event; Index: linux-rt.q/include/asm-i386/i8259.h 
=================================================================== --- linux-rt.q.orig/include/asm-i386/i8259.h +++ linux-rt.q/include/asm-i386/i8259.h @@ -7,7 +7,7 @@ extern unsigned int cached_irq_mask; #define cached_master_mask (__byte(0, cached_irq_mask)) #define cached_slave_mask (__byte(1, cached_irq_mask)) -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; extern void init_8259A(int auto_eoi); extern void enable_8259A_irq(unsigned int irq); Index: linux-rt.q/include/asm-i386/mach-default/irq_vectors.h =================================================================== --- linux-rt.q.orig/include/asm-i386/mach-default/irq_vectors.h +++ linux-rt.q/include/asm-i386/mach-default/irq_vectors.h @@ -63,7 +63,7 @@ * levels. (0x80 is the syscall vector) */ #define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xef +#define FIRST_SYSTEM_VECTOR 0xee #define TIMER_IRQ 0 Index: linux-rt.q/include/asm-i386/mc146818rtc.h =================================================================== --- linux-rt.q.orig/include/asm-i386/mc146818rtc.h +++ linux-rt.q/include/asm-i386/mc146818rtc.h @@ -69,7 +69,7 @@ static inline unsigned char current_lock lock_cmos(reg) #define lock_cmos_suffix(reg) \ unlock_cmos(); \ - local_irq_restore(cmos_flags); \ + local_irq_restore(cmos_flags); \ } while (0) #else #define lock_cmos_prefix(reg) do {} while (0) Index: linux-rt.q/include/asm-i386/pgtable.h =================================================================== --- linux-rt.q.orig/include/asm-i386/pgtable.h +++ linux-rt.q/include/asm-i386/pgtable.h @@ -36,7 +36,7 @@ struct vm_area_struct; extern unsigned long empty_zero_page[1024]; extern pgd_t swapper_pg_dir[1024]; extern struct kmem_cache *pmd_cache; -extern spinlock_t pgd_lock; +extern raw_spinlock_t pgd_lock; extern struct page *pgd_list; void check_pgt_cache(void); Index: linux-rt.q/include/asm-i386/tlbflush.h =================================================================== --- 
linux-rt.q.orig/include/asm-i386/tlbflush.h +++ linux-rt.q/include/asm-i386/tlbflush.h @@ -4,6 +4,21 @@ #include #include +/* + * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the + * following complex race scenario: + * + * if the current task is lazy-TLB and does a TLB flush and + * gets preempted after the movl %%r3, %0 but before the + * movl %0, %%cr3 then its ->active_mm might change and it will + * install the wrong cr3 when it switches back. This is not a + * problem for the lazy-TLB task itself, but if the next task it + * switches to has an ->mm that is also the lazy-TLB task's + * new ->active_mm, then the scheduler will assume that cr3 is + * the new one, while we overwrote it with the old one. The result + * is the wrong cr3 in the new (non-lazy-TLB) task, which typically + * causes an infinite pagefault upon the next userspace access. + */ #ifdef CONFIG_PARAVIRT #include #else @@ -16,11 +31,13 @@ do { \ unsigned int tmpreg; \ \ + preempt_disable(); \ __asm__ __volatile__( \ "movl %%cr3, %0; \n" \ "movl %0, %%cr3; # flush TLB \n" \ : "=r" (tmpreg) \ :: "memory"); \ + preempt_enable(); \ } while (0) /* @@ -31,6 +48,7 @@ do { \ unsigned int tmpreg, cr4, cr4_orig; \ \ + preempt_disable(); \ __asm__ __volatile__( \ "movl %%cr4, %2; # turn off PGE \n" \ "movl %2, %1; \n" \ @@ -42,6 +60,7 @@ : "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig) \ : "i" (~X86_CR4_PGE) \ : "memory"); \ + preempt_enable(); \ } while (0) #define __native_flush_tlb_single(addr) \ @@ -98,6 +117,13 @@ static inline void flush_tlb_mm(struct mm_struct *mm) { + /* + * This is safe on PREEMPT_RT because if we preempt + * right after the check but before the __flush_tlb(), + * and if ->active_mm changes, then we might miss a + * TLB flush, but that TLB flush happened already when + * ->active_mm was changed: + */ if (mm == current->active_mm) __flush_tlb(); } Index: linux-rt.q/include/asm-i386/xor.h =================================================================== --- 
linux-rt.q.orig/include/asm-i386/xor.h +++ linux-rt.q/include/asm-i386/xor.h @@ -862,7 +862,21 @@ static struct xor_block_template xor_blo #include #undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ +/* + * MMX/SSE ops disable preemption for long periods of time, + * so on PREEMPT_RT use the register-based ops only: + */ +#ifdef CONFIG_PREEMPT_RT +# define XOR_TRY_TEMPLATES \ + do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_8regs_p); \ + xor_speed(&xor_block_32regs); \ + xor_speed(&xor_block_32regs_p); \ + } while (0) +# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST) +#else +# define XOR_TRY_TEMPLATES \ do { \ xor_speed(&xor_block_8regs); \ xor_speed(&xor_block_8regs_p); \ @@ -875,9 +889,10 @@ static struct xor_block_template xor_blo xor_speed(&xor_block_p5_mmx); \ } \ } while (0) - /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ +# define XOR_SELECT_TEMPLATE(FASTEST) \ (cpu_has_xmm ? 
&xor_block_pIII_sse : FASTEST) +#endif + patches/latency-trace-fix.patch0000664000077200007720000000513110646635212016066 0ustar mingomingoFrom linux-rt-users-owner@vger.kernel.org Fri Jul 13 20:13:10 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by mail.tglx.de (Postfix) with ESMTP id 9AD1E65C3E9; Fri, 13 Jul 2007 20:13:10 +0200 (CEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1760492AbXGMSNJ (ORCPT + 1 other); Fri, 13 Jul 2007 14:13:09 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S932549AbXGMSNJ (ORCPT ); Fri, 13 Jul 2007 14:13:09 -0400 Received: from deeprooted.net ([216.254.16.51]:38939 "EHLO paris.hilman.org" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1759850AbXGMSNG (ORCPT ); Fri, 13 Jul 2007 14:13:06 -0400 Received: by paris.hilman.org (Postfix, from userid 1000) id 98015E4C5C2; Fri, 13 Jul 2007 10:52:28 -0700 (PDT) Message-Id: <20070713175228.311226264@mvista.com> References: <20070713175214.336577416@mvista.com> User-Agent: quilt/0.45-1 Date: Fri, 13 Jul 2007 10:52:17 -0700 From: Kevin Hilman To: tglx@linutronix.de, mingo@elte.hu Cc: linux-rt-users@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [PATCH -rt 3/6] Compile fix for PREEMPT_TIMING on and TRACE_IRQFLAGS off Content-Disposition: inline; filename=latency-trace-fix.patch Sender: linux-rt-users-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-rt-users@vger.kernel.org X-Filter-To: .Kernel.rt-users X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Mime-Version: 1.0 Fix compile of latency_trace.c in the case where CRITICAL_PREEMPT_TIMING=y and TRACE_IRQFLAGS=n (because DEBUG_KERNEL is disabled) Signed-off-by: Kevin Hilman --- kernel/latency_trace.c | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -2156,7 +2156,7 @@ void notrace unmask_preempt_count(unsign } EXPORT_SYMBOL(unmask_preempt_count); -#ifdef CONFIG_CRITICAL_PREEMPT_TIMING +#if defined(CONFIG_CRITICAL_PREEMPT_TIMING) && defined(CONFIG_TRACE_IRQFLAGS) /* Some archs do their cpu_idle with preemption on. Don't measure it */ void notrace trace_preempt_enter_idle(void) patches/s_files-pipe-fix.patch0000664000077200007720000000203110646635216015712 0ustar mingomingoSubject: s_files: free_write_pipe() fix From: Ingo Molnar file_kill() has to look at the file's inode (for the barrier logic), hence make sure we free the inode before the file. Signed-off-by: Ingo Molnar --- fs/pipe.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) Index: linux-rt.q/fs/pipe.c =================================================================== --- linux-rt.q.orig/fs/pipe.c +++ linux-rt.q/fs/pipe.c @@ -952,12 +952,17 @@ struct file *create_write_pipe(void) return ERR_PTR(err); } -void free_write_pipe(struct file *f) +void free_write_pipe(struct file *file) { - free_pipe_info(f->f_dentry->d_inode); - dput(f->f_path.dentry); - mntput(f->f_path.mnt); - put_filp(f); + struct dentry *dentry = file->f_path.dentry; + struct vfsmount *mnt = file->f_path.mnt; + + free_pipe_info(file->f_dentry->d_inode); + file->f_path.dentry = NULL; + file->f_path.mnt = NULL; + put_filp(file); + dput(dentry); + mntput(mnt); } struct file *create_read_pipe(struct file *wrf) patches/preempt-realtime-profiling.patch0000664000077200007720000000161710646635215020022 0ustar mingomingo--- kernel/profile.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) Index: linux-rt.q/kernel/profile.c =================================================================== --- linux-rt.q.orig/kernel/profile.c +++ 
linux-rt.q/kernel/profile.c @@ -46,6 +46,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +int prof_pid = -1; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -411,7 +412,8 @@ void __profile_tick(int type, struct pt_ { if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask) && + (prof_pid == -1 || prof_pid == current->pid)) profile_hit(type, (void *)profile_pc(regs)); } patches/kstat-add-rt-stats.patch0000664000077200007720000001165310646635214016212 0ustar mingomingoFrom: tglx Subject: add rt stats to /proc/stat add RT stats to /proc/stat Signed-off-by: Ingo Molnar fs/proc/proc_misc.c | 29 +++++++++++++++++++++-------- include/linux/kernel_stat.h | 2 ++ kernel/sched.c | 6 +++++- 3 files changed, 28 insertions(+), 9 deletions(-) Index: linux-rt.q/fs/proc/proc_misc.c =================================================================== --- linux-rt.q.orig/fs/proc/proc_misc.c +++ linux-rt.q/fs/proc/proc_misc.c @@ -441,10 +441,11 @@ static int show_stat(struct seq_file *p, { int i; unsigned long jif; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t user_rt, user, nice, system_rt, system, idle, + iowait, irq, softirq, steal; u64 sum = 0; - user = nice = system = idle = iowait = + user_rt = user = nice = system_rt = system = idle = iowait = irq = softirq = steal = cputime64_zero; jif = - wall_to_monotonic.tv_sec; if (wall_to_monotonic.tv_nsec) @@ -461,11 +462,16 @@ static int show_stat(struct seq_file *p, irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + user_rt = cputime64_add(user_rt, kstat_cpu(i).cpustat.user_rt); + system_rt = 
cputime64_add(system_rt, kstat_cpu(i).cpustat.system_rt); for (j = 0 ; j < NR_IRQS ; j++) sum += kstat_cpu(i).irqs[j]; } - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", + user = cputime64_add(user_rt, user); + system = cputime64_add(system_rt, system); + + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -473,19 +479,24 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(user_rt), + (unsigned long long)cputime64_to_clock_t(system_rt)); + for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; + user_rt = kstat_cpu(i).cpustat.user_rt; + system_rt = kstat_cpu(i).cpustat.system_rt; + user = cputime64_add(user_rt, kstat_cpu(i).cpustat.user); nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; + system = cputime64_add(system_rt, kstat_cpu(i).cpustat.system); idle = kstat_cpu(i).cpustat.idle; iowait = kstat_cpu(i).cpustat.iowait; irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; - seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -494,7 +505,9 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long 
long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(user_rt), + (unsigned long long)cputime64_to_clock_t(system_rt)); } seq_printf(p, "intr %llu", (unsigned long long)sum); Index: linux-rt.q/include/linux/kernel_stat.h =================================================================== --- linux-rt.q.orig/include/linux/kernel_stat.h +++ linux-rt.q/include/linux/kernel_stat.h @@ -23,6 +23,8 @@ struct cpu_usage_stat { cputime64_t idle; cputime64_t iowait; cputime64_t steal; + cputime64_t user_rt; + cputime64_t system_rt; }; struct kernel_stat { Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -3194,7 +3194,9 @@ void account_user_time(struct task_struc /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (rt_task(p)) + cpustat->user_rt = cputime64_add(cpustat->user_rt, tmp); + else if (TASK_NICE(p) > 0) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); @@ -3221,6 +3223,8 @@ void account_system_time(struct task_str cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count() || (p->flags & PF_SOFTIRQ)) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (rt_task(p)) + cpustat->system_rt = cputime64_add(cpustat->system_rt, tmp); else if (p != rq->idle) cpustat->system = cputime64_add(cpustat->system, tmp); else if (atomic_read(&rq->nr_iowait) > 0) patches/gtod-optimize.patch0000664000077200007720000000111310646635216015342 0ustar mingomingo--- kernel/timer.c | 7 +++++++ 1 file changed, 7 insertions(+) Index: linux-rt.q/kernel/timer.c =================================================================== --- linux-rt.q.orig/kernel/timer.c +++ linux-rt.q/kernel/timer.c @@ -976,6 +976,13 @@ static inline void update_times(void) static unsigned long 
last_tick; unsigned long ticks, flags; + /* + * Dont take the xtime_lock from every CPU in + * every tick - only when needed: + */ + if (jiffies == last_tick) + return; + write_seqlock_irqsave(&xtime_lock, flags); ticks = jiffies - last_tick; if (ticks) { patches/msi-suspend-resume-workaround.patch0000664000077200007720000000155710646635211020514 0ustar mingomingo--- drivers/base/power/resume.c | 1 + drivers/pci/msi.c | 4 ++++ 2 files changed, 5 insertions(+) Index: linux-rt.q/drivers/base/power/resume.c =================================================================== --- linux-rt.q.orig/drivers/base/power/resume.c +++ linux-rt.q/drivers/base/power/resume.c @@ -9,6 +9,7 @@ */ #include +#include #include #include "../base.h" #include "power.h" Index: linux-rt.q/drivers/pci/msi.c =================================================================== --- linux-rt.q.orig/drivers/pci/msi.c +++ linux-rt.q/drivers/pci/msi.c @@ -235,6 +235,10 @@ static void __pci_restore_msi_state(stru return; entry = get_irq_msi(dev->irq); + if (!entry) { + WARN_ON(1); + return; + } pos = entry->msi_attrib.pos; pci_intx(dev, 0); /* disable intx */ patches/fix-acpi-build-weirdness.patch0000664000077200007720000000112310646635211017341 0ustar mingomingo arch/i386/pci/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/i386/pci/Makefile =================================================================== --- linux-rt.q.orig/arch/i386/pci/Makefile +++ linux-rt.q/arch/i386/pci/Makefile @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o patches/redo-regparm-option.patch0000664000077200007720000000620410646635212016443 0ustar mingomingo undo: commit a1a70c25bed75ed36ed48bbe18b9029428d2452d Author: Adrian Bunk Date: 
Thu Dec 7 02:14:12 2006 +0100 [PATCH] i386: always enable regparm needed for latency tracing. --- Documentation/stable_api_nonsense.txt | 3 +++ arch/i386/Kconfig | 7 +++++++ arch/i386/Makefile | 4 +++- include/asm-i386/module.h | 8 +++++++- 4 files changed, 20 insertions(+), 2 deletions(-) Index: linux-rt.q/Documentation/stable_api_nonsense.txt =================================================================== --- linux-rt.q.orig/Documentation/stable_api_nonsense.txt +++ linux-rt.q/Documentation/stable_api_nonsense.txt @@ -62,6 +62,9 @@ consider the following facts about the L - different structures can contain different fields - Some functions may not be implemented at all, (i.e. some locks compile away to nothing for non-SMP builds.) + - Parameter passing of variables from function to function can be + done in different ways (the CONFIG_REGPARM option controls + this.) - Memory within the kernel can be aligned in different ways, depending on the build options. - Linux runs on a wide range of different processor architectures. 
Index: linux-rt.q/arch/i386/Kconfig =================================================================== --- linux-rt.q.orig/arch/i386/Kconfig +++ linux-rt.q/arch/i386/Kconfig @@ -772,6 +772,13 @@ config BOOT_IOREMAP depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) default y +# +# function tracing might turn this off: +# +config REGPARM + bool + default y + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS Index: linux-rt.q/arch/i386/Makefile =================================================================== --- linux-rt.q.orig/arch/i386/Makefile +++ linux-rt.q/arch/i386/Makefile @@ -31,7 +31,7 @@ LDFLAGS_vmlinux := --emit-relocs endif CHECKFLAGS += -D__i386__ -CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return +CFLAGS += -pipe -msoft-float # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) @@ -39,6 +39,8 @@ CFLAGS += $(call cc-option,-mpreferred-s # CPU-specific tuning. Anything which can be shared with UML should go here. 
include $(srctree)/arch/i386/Makefile.cpu +cflags-$(CONFIG_REGPARM) += -mregparm=3 -freg-struct-return + # temporary until string.h is fixed cflags-y += -ffreestanding Index: linux-rt.q/include/asm-i386/module.h =================================================================== --- linux-rt.q.orig/include/asm-i386/module.h +++ linux-rt.q/include/asm-i386/module.h @@ -64,12 +64,18 @@ struct mod_arch_specific #error unknown processor family #endif +#ifdef CONFIG_REGPARM +#define MODULE_REGPARM "REGPARM " +#else +#define MODULE_REGPARM "" +#endif + #ifdef CONFIG_4KSTACKS #define MODULE_STACKSIZE "4KSTACKS " #else #define MODULE_STACKSIZE "" #endif -#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE +#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE #endif /* _ASM_I386_MODULE_H */ patches/x86_64-fix-irq-regs-leftovers.patch0000664000077200007720000000144210646635210020026 0ustar mingomingoSubject: x86_64: fixup pt_regs leftovers The hpet_rtc_interrupt handler still uses pt_regs. Fix it. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/x86_64/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/hpet.c +++ linux-rt.q/arch/x86_64/kernel/hpet.c @@ -439,7 +439,7 @@ int hpet_rtc_dropped_irq(void) return 1; } -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) { struct rtc_time curr_time; unsigned long rtc_int_flag = 0; patches/x86_64-apic-change-setup-calling-convention.patch0000664000077200007720000000317310646635211022572 0ustar mingomingoSubject: x86_64: apic change setup_APIC_timer calling convention setup_APIC_timer takes the file global calibration result as an argument. Remove it.
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -784,7 +784,7 @@ static void __setup_APIC_LVTT(unsigned i apic_write(APIC_TMICT, clocks); } -static void setup_APIC_timer(unsigned int clocks) +static void setup_APIC_timer(void) { unsigned long flags; int irqen; @@ -793,7 +793,7 @@ static void setup_APIC_timer(unsigned in irqen = ! cpu_isset(smp_processor_id(), timer_interrupt_broadcast_ipi_mask); - __setup_APIC_LVTT(clocks, 0, irqen); + __setup_APIC_LVTT(calibration_result, 0, irqen); /* Turn off PIT interrupt if we use APIC timer as main timer. Only works with the PM timer right now TBD fix it for HPET too. */ @@ -880,7 +880,7 @@ void __init setup_boot_APIC_clock (void) /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -888,7 +888,7 @@ void __init setup_boot_APIC_clock (void) void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? 
--RR */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } patches/rt-mutex-trylock-export.patch0000664000077200007720000000727210646635214017352 0ustar mingomingoFrom linux-kernel-owner@vger.kernel.org Wed May 23 01:44:17 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=none autolearn=unavailable version=3.1.7-deb Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by mail.tglx.de (Postfix) with ESMTP id 32C4A65C3E9 for ; Wed, 23 May 2007 01:44:17 +0200 (CEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1759353AbXEVXoG (ORCPT ); Tue, 22 May 2007 19:44:06 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1757791AbXEVXn4 (ORCPT ); Tue, 22 May 2007 19:43:56 -0400 Received: from rwcrmhc11.comcast.net ([204.127.192.81]:35206 "EHLO rwcrmhc11.comcast.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1757669AbXEVXn4 (ORCPT ); Tue, 22 May 2007 19:43:56 -0400 Received: from sx.thebigcorporation.com ([69.181.45.228]) by comcast.net (rwcrmhc11) with ESMTP id <20070522233624m1100rg2vge>; Tue, 22 May 2007 23:36:29 +0000 Received: from sx.thebigcorporation.com (localhost.localdomain [127.0.0.1]) by sx.thebigcorporation.com (8.13.8/8.13.8) with ESMTP id l4MNaKHv029409; Tue, 22 May 2007 16:36:20 -0700 Received: (from sven@localhost) by sx.thebigcorporation.com (8.13.8/8.13.8/Submit) id l4MNaJIn029408; Tue, 22 May 2007 16:36:19 -0700 X-Authentication-Warning: sx.thebigcorporation.com: sven set sender to sven@thebigcorporation.com using -f Subject: [PATCH] 2.6.21-rt6 From: Sven-Thorsten Dietrich To: LKML Cc: Ingo Molnar In-Reply-To: <1179874795.25500.40.camel@sx.thebigcorporation.com> References: <1179874795.25500.40.camel@sx.thebigcorporation.com> Content-Type: text/plain Organization: The Big Corporation Date: Tue, 22 May 2007 16:36:19 -0700 Message-Id: 
<1179876979.25500.54.camel@sx.thebigcorporation.com> Mime-Version: 1.0 X-Mailer: Evolution 2.8.3 (2.8.3-2.fc6) Sender: linux-kernel-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org X-Filter-To: .Kernel.LKML X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit On Tue, 2007-05-22 at 15:59 -0700, Sven-Thorsten Dietrich wrote: > Add > header and export for rt_write_trylock_irqsave. Disregard the last patch, flags parameter was missing in the header. --- include/linux/spinlock.h | 2 ++ kernel/rt.c | 1 + 2 files changed, 3 insertions(+) Index: linux-rt.q/include/linux/spinlock.h =================================================================== --- linux-rt.q.orig/include/linux/spinlock.h +++ linux-rt.q/include/linux/spinlock.h @@ -294,6 +294,8 @@ do { \ extern void __lockfunc rt_write_lock(rwlock_t *rwlock); extern void __lockfunc rt_read_lock(rwlock_t *rwlock); extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, + unsigned long *flags); extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); Index: linux-rt.q/kernel/rt.c =================================================================== --- linux-rt.q.orig/kernel/rt.c +++ linux-rt.q/kernel/rt.c @@ -172,6 +172,7 @@ int __lockfunc rt_write_trylock_irqsave( *flags = 0; return rt_write_trylock(rwlock); } +EXPORT_SYMBOL(rt_write_trylock_irqsave); int __lockfunc rt_read_trylock(rwlock_t *rwlock) { patches/tasklet-fix-preemption-race.patch0000664000077200007720000000776310646635214020111 0ustar mingomingoFrom johnstul@us.ibm.com Wed Jun 6 04:17:34 2007 Return-Path: Received: from e3.ny.us.ibm.com (e3.ny.us.ibm.com [32.97.182.143]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (No client certificate requested) by mail.tglx.de (Postfix) with ESMTP id 
1CCC065C065 for ; Wed, 6 Jun 2007 04:17:34 +0200 (CEST) Received: from d01relay04.pok.ibm.com (d01relay04.pok.ibm.com [9.56.227.236]) by e3.ny.us.ibm.com (8.13.8/8.13.8) with ESMTP id l561EvIT011411 for ; Tue, 5 Jun 2007 21:14:57 -0400 Received: from d01av04.pok.ibm.com (d01av04.pok.ibm.com [9.56.224.64]) by d01relay04.pok.ibm.com (8.13.8/8.13.8/NCO v8.3) with ESMTP id l562HUG6545736 for ; Tue, 5 Jun 2007 22:17:30 -0400 Received: from d01av04.pok.ibm.com (loopback [127.0.0.1]) by d01av04.pok.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id l562HUu0027167 for ; Tue, 5 Jun 2007 22:17:30 -0400 Received: from [9.47.21.16] (cog.beaverton.ibm.com [9.47.21.16]) by d01av04.pok.ibm.com (8.12.11.20060308/8.12.11) with ESMTP id l562HTkh027139; Tue, 5 Jun 2007 22:17:29 -0400 Subject: [PATCH -rt] Fix TASKLET_STATE_SCHED WARN_ON() From: john stultz To: Ingo Molnar Cc: Thomas Gleixner , Steven Rostedt , "Paul E. McKenney" , lkml Content-Type: text/plain Date: Tue, 05 Jun 2007 19:17:23 -0700 Message-Id: <1181096244.6018.20.camel@localhost> Mime-Version: 1.0 X-Mailer: Evolution 2.10.1 X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Hey Ingo, So we've been seeing the following trace fairly frequently on our SMP boxes when running kernbench: BUG: at kernel/softirq.c:639 __tasklet_action() Call Trace: [] dump_trace+0xaa/0x32a [] show_trace+0x41/0x5c [] dump_stack+0x15/0x17 [] __tasklet_action+0xdf/0x12e [] tasklet_action+0x27/0x29 [] ksoftirqd+0x16c/0x271 [] kthread+0xf5/0x128 [] child_rip+0xa/0x12 Paul also pointed this out awhile back: http://lkml.org/lkml/2007/2/25/1 Anyway, I think I finally found the issue. 
It's a bit hard to explain, but the idea is while __tasklet_action is running the tasklet function on CPU1, if a call to tasklet_schedule() on CPU2 is made, and if right after we mark the TASKLET_STATE_SCHED bit we are preempted, __tasklet_action on CPU1 might be able to re-run the function, clear the bit and unlock the tasklet before CPU2 enters __tasklet_common_schedule. Once __tasklet_common_schedule locks the tasklet, we will add the tasklet to the list with the TASKLET_STATE_SCHED *unset*. I've verified this race occurs w/ a WARN_ON in __tasklet_common_schedule(). This fix avoids this race by making sure *after* we've locked the tasklet that the STATE_SCHED bit is set before adding it to the list. Does it look ok to you? thanks -john Signed-off-by: John Stultz --- kernel/softirq.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -459,10 +459,17 @@ static void inline __tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) { if (tasklet_trylock(t)) { - WARN_ON(t->next != NULL); - t->next = head->list; - head->list = t; - raise_softirq_irqoff(nr); + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the takslet + * is locked before adding it to the list. 
+ */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + WARN_ON(t->next != NULL); + t->next = head->list; + head->list = t; + raise_softirq_irqoff(nr); + } tasklet_unlock(t); } } patches/x86_64-use-generic-cmos-update.patch0000664000077200007720000000531110646635210020126 0ustar mingomingoSubject: x86_64: use generic cmos update Use the generic cmos update function in kernel/time/ntp.c Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar Cc: john stultz --- arch/x86_64/Kconfig | 4 ++++ arch/x86_64/kernel/time.c | 25 +++++++++---------------- 2 files changed, 13 insertions(+), 16 deletions(-) Index: linux-rt.q/arch/x86_64/Kconfig =================================================================== --- linux-rt.q.orig/arch/x86_64/Kconfig +++ linux-rt.q/arch/x86_64/Kconfig @@ -32,6 +32,10 @@ config GENERIC_TIME_VSYSCALL bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config ZONE_DMA32 bool default y Index: linux-rt.q/arch/x86_64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/time.c +++ linux-rt.q/arch/x86_64/kernel/time.c @@ -81,8 +81,9 @@ EXPORT_SYMBOL(profile_pc); * sheet for details. 
*/ -static void set_rtc_mmss(unsigned long nowtime) +static int set_rtc_mmss(unsigned long nowtime) { + int retval = 0; int real_seconds, real_minutes, cmos_minutes; unsigned char control, freq_select; @@ -122,6 +123,7 @@ static void set_rtc_mmss(unsigned long n if (abs(real_minutes - cmos_minutes) >= 30) { printk(KERN_WARNING "time.c: can't update CMOS clock " "from %d to %d\n", cmos_minutes, real_minutes); + retval = -1; } else { BIN_TO_BCD(real_seconds); BIN_TO_BCD(real_minutes); @@ -141,12 +143,17 @@ static void set_rtc_mmss(unsigned long n CMOS_WRITE(freq_select, RTC_FREQ_SELECT); spin_unlock(&rtc_lock); + + return retval; } +int update_persistent_clock(struct timespec now) +{ + return set_rtc_mmss(now.tv_sec); +} void main_timer_handler(void) { - static unsigned long rtc_update = 0; /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -174,20 +181,6 @@ void main_timer_handler(void) if (!using_apic_timer) smp_local_timer_interrupt(); -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. 
- */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - write_sequnlock(&xtime_lock); } patches/dynticks-rcu-rt-fixlet.patch0000664000077200007720000000140110646635213017077 0ustar mingomingo--- kernel/rcupreempt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) Index: linux-rt.q/kernel/rcupreempt.c =================================================================== --- linux-rt.q.orig/kernel/rcupreempt.c +++ linux-rt.q/kernel/rcupreempt.c @@ -338,6 +338,17 @@ void __synchronize_sched(void) sched_setaffinity(0, oldmask); } +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu(int cpu) +{ + return !!rcu_data.waitlist || rcu_pending(cpu); +} + int rcu_pending(int cpu) { return (rcu_data.donelist != NULL || patches/latency-tracing-x86_64.patch0000664000077200007720000001716510646635212016601 0ustar mingomingo arch/x86_64/ia32/ia32entry.S | 11 ++++++++- arch/x86_64/kernel/entry.S | 45 +++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/head64.c | 3 +- arch/x86_64/kernel/irq.c | 6 +++++ arch/x86_64/kernel/setup64.c | 4 +-- arch/x86_64/kernel/smpboot.c | 2 - arch/x86_64/kernel/traps.c | 1 arch/x86_64/kernel/vsyscall.c | 2 - include/asm-x86_64/calling.h | 50 ++++++++++++++++++++++++++++++++++++++++++ include/asm-x86_64/unistd.h | 2 + 10 files changed, 120 insertions(+), 6 deletions(-) Index: linux-rt.q/arch/x86_64/ia32/ia32entry.S =================================================================== --- linux-rt.q.orig/arch/x86_64/ia32/ia32entry.S +++ linux-rt.q/arch/x86_64/ia32/ia32entry.S @@ -120,7 +120,9 @@ sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call 
*ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -229,7 +231,9 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -323,8 +327,10 @@ ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -394,7 +400,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -719,4 +725,7 @@ ia32_sys_call_table: .quad compat_sys_signalfd .quad compat_sys_timerfd .quad sys_eventfd +#ifdef CONFIG_EVENT_TRACE + .globl ia32_syscall_end +#endif ia32_syscall_end: Index: linux-rt.q/arch/x86_64/kernel/entry.S =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/entry.S +++ linux-rt.q/arch/x86_64/kernel/entry.S @@ -53,6 +53,47 @@ .code64 +#ifdef CONFIG_EVENT_TRACE + +ENTRY(mcount) + cmpl $0, mcount_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r11 + push %r10 + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + + pop %rbp +out: + ret + +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -234,7 +275,9 @@ ENTRY(system_call) cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -316,7 +359,9 @@ tracesys: cmova 
%rcx,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ Index: linux-rt.q/arch/x86_64/kernel/head64.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/head64.c +++ linux-rt.q/arch/x86_64/kernel/head64.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -58,7 +59,7 @@ static void __init copy_bootdata(char *r memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } -void __init x86_64_start_kernel(char * real_mode_data) +void __init notrace x86_64_start_kernel(char * real_mode_data) { int i; Index: linux-rt.q/arch/x86_64/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/irq.c +++ linux-rt.q/arch/x86_64/kernel/irq.c @@ -117,6 +117,12 @@ asmlinkage unsigned int do_IRQ(struct pt irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; +#ifdef CONFIG_EVENT_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->rip, irq, 0); + #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif Index: linux-rt.q/arch/x86_64/kernel/setup64.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/setup64.c +++ linux-rt.q/arch/x86_64/kernel/setup64.c @@ -114,7 +114,7 @@ void __init setup_per_cpu_areas(void) } } -void pda_init(int cpu) +void notrace pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); @@ -191,7 +191,7 @@ unsigned long kernel_eflags; * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. 
*/ -void __cpuinit cpu_init (void) +void __cpuinit notrace cpu_init (void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); Index: linux-rt.q/arch/x86_64/kernel/smpboot.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/smpboot.c +++ linux-rt.q/arch/x86_64/kernel/smpboot.c @@ -316,7 +316,7 @@ static inline void set_cpu_sibling_map(i /* * Setup code on secondary processor (after comming out of the trampoline) */ -void __cpuinit start_secondary(void) +void __cpuinit notrace start_secondary(void) { /* * Dont put anything before smp_callin(), SMP Index: linux-rt.q/arch/x86_64/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/traps.c +++ linux-rt.q/arch/x86_64/kernel/traps.c @@ -346,6 +346,7 @@ show_trace(struct task_struct *tsk, stru printk("\nCall Trace:\n"); dump_trace(tsk, regs, stack, &print_trace_ops, NULL); printk("\n"); + print_traces(tsk); } static void Index: linux-rt.q/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/vsyscall.c +++ linux-rt.q/arch/x86_64/kernel/vsyscall.c @@ -43,7 +43,7 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","rcx","memory" #define __pa_vsymbol(x) \ ({unsigned long v; \ Index: linux-rt.q/include/asm-x86_64/calling.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/calling.h +++ linux-rt.q/include/asm-x86_64/calling.h @@ -160,3 +160,53 @@ .macro icebp .byte 0xf1 .endm + +/* + * latency-tracing helpers: + */ + + .macro TRACE_SYS_CALL + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_call + 
+ RESTORE_ARGS +#endif + .endm + + + .macro TRACE_SYS_IA32_CALL + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_ia32_call + + RESTORE_ARGS +#endif + .endm + + .macro TRACE_SYS_RET + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rax, %rdi + + call sys_ret + + RESTORE_ARGS +#endif + .endm Index: linux-rt.q/include/asm-x86_64/unistd.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/unistd.h +++ linux-rt.q/include/asm-x86_64/unistd.h @@ -11,6 +11,8 @@ * Note: holes are not allowed. */ +#define NR_syscalls (__NR_syscall_max+1) + /* at least 8 syscall per cacheline */ #define __NR_read 0 __SYSCALL(__NR_read, sys_read) patches/preempt-realtime-input.patch0000664000077200007720000000245610646635215017172 0ustar mingomingo--- drivers/input/gameport/gameport.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) Index: linux-rt.q/drivers/input/gameport/gameport.c =================================================================== --- linux-rt.q.orig/drivers/input/gameport/gameport.c +++ linux-rt.q/drivers/input/gameport/gameport.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* HZ */ #include #include @@ -102,12 +103,12 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -126,11 +127,11 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } 
patches/preempt-irqs-i386-ioapic-mask-quirk.patch0000664000077200007720000001356110646635213021221 0ustar mingomingoFrom mschmidt@redhat.com Thu Jun 21 13:32:02 2007 Return-Path: Received: from mx1.redhat.com (mx1.redhat.com [66.187.233.31]) by mail.tglx.de (Postfix) with ESMTP id CA11565C065 for ; Thu, 21 Jun 2007 13:32:02 +0200 (CEST) Received: from int-mx1.corp.redhat.com (int-mx1.corp.redhat.com [172.16.52.254]) by mx1.redhat.com (8.13.1/8.13.1) with ESMTP id l5LBVoq3016914; Thu, 21 Jun 2007 07:31:50 -0400 Received: from pobox.stuttgart.redhat.com (pobox.stuttgart.redhat.com [172.16.2.10]) by int-mx1.corp.redhat.com (8.13.1/8.13.1) with ESMTP id l5LBVmp0010104; Thu, 21 Jun 2007 07:31:49 -0400 Received: from [10.34.32.84] (brian.englab.brq.redhat.com [10.34.32.84]) by pobox.stuttgart.redhat.com (8.12.11.20060308/8.12.11) with ESMTP id l5LBVl5k000423; Thu, 21 Jun 2007 13:31:47 +0200 Message-ID: <467A61A3.7060804@redhat.com> Date: Thu, 21 Jun 2007 13:31:47 +0200 From: Michal Schmidt User-Agent: Thunderbird 1.5.0.12 (X11/20070529) MIME-Version: 1.0 To: Steven Rostedt CC: Ingo Molnar , Thomas Gleixner , linux-rt-users@vger.kernel.org, linux-kernel@vger.kernel.org Subject: Re: [PATCH -rt] irq nobody cared workaround for i386 References: <4676CF81.2000205@redhat.com> <4677D7AF.7040700@redhat.com> <467932B4.6030800@redhat.com> <467936FE.8050704@redhat.com> In-Reply-To: <467936FE.8050704@redhat.com> X-Enigmail-Version: 0.94.2.0 Content-Type: text/plain; charset=ISO-8859-1 X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Steven Rostedt wrote: > Michal Schmidt wrote: > >> I came to the conclusion that the IO-APICs which need the fix for the >> nobody cared bug don't have the issue ack_ioapic_quirk_irq is designed >> to work-around. It should be safe simply to use the normal >> ack_ioapic_irq as the .eoi method in pcix_ioapic_chip. >> So this is the port of Steven's fix for the nobody cared bug to i386. 
It >> works fine on IBM LS21 I have access to. >> >> > You want to make that "apic > 0". Note the spacing. If it breaks > 80 characters, then simply put it to a new line. > > [...] > ACK > > -- Steve > OK, I fixed the spacing in both occurences. Signed-off-by: Michal Schmidt --- arch/i386/kernel/io_apic.c | 66 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 9 deletions(-) Index: linux-rt.q/arch/i386/kernel/io_apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/io_apic.c +++ linux-rt.q/arch/i386/kernel/io_apic.c @@ -261,6 +261,18 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } +/* trigger = 0 (edge mode) */ +static void __pcix_mask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, 0x00008000); +} + +/* mask = 0, trigger = 1 (level mode) */ +static void __pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); +} + static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -279,6 +291,24 @@ static void unmask_IO_APIC_irq (unsigned spin_unlock_irqrestore(&ioapic_lock, flags); } +static void pcix_mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -1254,22 +1284,27 @@ static int assign_irq_vector(int irq) return vector; } + static struct irq_chip ioapic_chip; +static struct irq_chip pcix_ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -static void ioapic_register_intr(int irq, int vector, 
unsigned long trigger) +static void ioapic_register_intr(int irq, int vector, unsigned long trigger, + int pcix) { + struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip; + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - else { - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); - } + set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, + pcix ? "pcix-fasteoi" : "fasteoi"); + else + set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, + pcix ? "pcix-edge" : "edge"); + set_intr_gate(vector, interrupt[irq]); } @@ -1333,7 +1368,8 @@ static void __init setup_IO_APIC_irqs(vo if (IO_APIC_IRQ(irq)) { vector = assign_irq_vector(irq); entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); + ioapic_register_intr(irq, vector, IOAPIC_AUTO, + apic > 0); if (!apic && (irq < 16)) disable_8259A_irq(irq); @@ -2025,6 +2061,18 @@ static struct irq_chip ioapic_chip __rea .retrigger = ioapic_retrigger_irq, }; +static struct irq_chip pcix_ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = pcix_mask_IO_APIC_irq, + .unmask = pcix_unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_irq, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; static inline void init_IO_APIC_traps(void) { @@ -2825,7 +2873,7 @@ int io_apic_set_pci_routing (int ioapic, mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, edge_level, active_high_low); - ioapic_register_intr(irq, entry.vector, edge_level); + ioapic_register_intr(irq, entry.vector, edge_level, ioapic > 0); if (!ioapic && (irq < 16)) disable_8259A_irq(irq); patches/new-softirq-code-fixlets.patch0000664000077200007720000000471210646635216017421 0ustar mingomingo--- kernel/softirq.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 
deletions(-) Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -102,6 +102,7 @@ static void wakeup_softirqd(int softirq) if (unlikely(!tsk)) return; +#if 1 #if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS) /* * Optimization: if we are in a hardirq thread context, and @@ -116,6 +117,7 @@ static void wakeup_softirqd(int softirq) (tsk->normal_prio == current->normal_prio)) return; #endif +#endif /* * Wake up the softirq task: */ @@ -771,19 +773,18 @@ static int ksoftirqd(void * __data) struct softirq_action *h; int cpu = data->cpu; - current->flags |= PF_NOFREEZE | PF_SOFTIRQ; - #ifdef CONFIG_PREEMPT_SOFTIRQS init_waitqueue_head(&data->wait); #endif sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending() & softirq_mask) { + if (!(local_softirq_pending() & softirq_mask)) { sleep_more: __preempt_enable_no_resched(); schedule(); @@ -792,7 +793,11 @@ sleep_more: __set_current_state(TASK_RUNNING); - while (local_softirq_pending() & soft_irqmask) { +#ifdef CONFIG_PREEMPT_SOFTIRQS + data->running = 1; +#endif + + while (local_softirq_pending() & softirq_mask) { /* Preempt disable stops cpu going offline. 
If already offline, we'll be on wrong CPU: don't process */ @@ -930,6 +935,9 @@ static int __cpuinit cpu_callback(struct for (i = 0; i < MAX_SOFTIRQ; i++) { per_cpu(ksoftirqd, hotcpu)[i].nr = i; per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + } + for (i = 0; i < MAX_SOFTIRQ; i++) { p = kthread_create(ksoftirqd, &per_cpu(ksoftirqd, hotcpu)[i], "softirq-%s/%d", softirq_names[i], @@ -952,12 +960,14 @@ static int __cpuinit cpu_callback(struct #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: +#if 0 for (i = 0; i < MAX_SOFTIRQ; i++) { if (!per_cpu(ksoftirqd, hotcpu)[i].tsk) continue; kthread_bind(per_cpu(ksoftirqd, hotcpu)[i].tsk, any_online_cpu(cpu_online_map)); } +#endif case CPU_DEAD: case CPU_DEAD_FROZEN: for (i = 0; i < MAX_SOFTIRQ; i++) { patches/x86_64-preparatory-apic-set-lvtt.patch0000664000077200007720000000452710646635211020554 0ustar mingomingoSubject: x86_64: prepare apic code for clock events Change __setup_APIC_LVTT so it takes the arguments which are necessary for the later clock events switch. 
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -760,14 +760,14 @@ void __init init_apic_mappings(void) #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { unsigned int lvtt_value, tmp_value; - int cpu = smp_processor_id(); - - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!irqen) lvtt_value |= APIC_LVT_MASKED; apic_write(APIC_LVTT, lvtt_value); @@ -780,12 +780,14 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write(APIC_TMICT, clocks/APIC_DIVISOR); } static void setup_APIC_timer(unsigned int clocks) { unsigned long flags; + int irqen; local_irq_save(flags); @@ -808,7 +810,10 @@ static void setup_APIC_timer(unsigned in c2 |= inb_p(0x40) << 8; } while (c2 - c1 < 300); } - __setup_APIC_LVTT(clocks); + + irqen = ! cpu_isset(smp_processor_id(), + timer_interrupt_broadcast_ipi_mask); + __setup_APIC_LVTT(clocks, 0, irqen); /* Turn off PIT interrupt if we use APIC timer as main timer. Only works with the PM timer right now TBD fix it for HPET too. */ @@ -846,8 +851,10 @@ static int __init calibrate_APIC_clock(v * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. + * + * No interrupt enable ! 
*/ - __setup_APIC_LVTT(4000000000); + __setup_APIC_LVTT(4000000000, 0, 0); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER patches/ppc-remove-broken-vsyscall.patch0000664000077200007720000006007710646635213017753 0ustar mingomingoFrom sshtylyov@ru.mvista.com Wed May 16 20:55:24 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from imap.sh.mvista.com (unknown [63.81.120.155]) by mail.tglx.de (Postfix) with ESMTP id A9FD665C065 for ; Wed, 16 May 2007 20:55:24 +0200 (CEST) Received: from wasted.dev.rtsoft.ru (unknown [10.150.0.9]) by imap.sh.mvista.com (Postfix) with ESMTP id A97873EC9; Wed, 16 May 2007 11:55:18 -0700 (PDT) From: Sergei Shtylyov (by way of Sergei Shtylyov ) Organization: MontaVista Software Inc. Subject: [PATCH 2.6.21-rt2] PowerPC: remove broken vsyscall code Date: Wed, 16 May 2007 21:56:51 +0300 User-Agent: KMail/1.5 MIME-Version: 1.0 Content-Disposition: inline To: tglx@linutronix.de, mingo@elte.hu Cc: linux-kernel@vger.kernel.org, johnstul@us.ibm.com Content-Type: text/plain; charset="iso-8859-1" Message-Id: <200705162256.51722.sshtylyov@ru.mvista.com> X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Remove PowerPC vsyscalls that were broken by the generic TOD patch. Signed-off-by: Sergei Shtylyov --- Since there's still no working PowerPC TOD vsyscalls fix, and they continue to be broken in the RT patch, I've respun this patch again... 
arch/powerpc/kernel/asm-offsets.c | 15 - arch/powerpc/kernel/smp.c | 2 arch/powerpc/kernel/vdso32/Makefile | 2 arch/powerpc/kernel/vdso32/datapage.S | 18 - arch/powerpc/kernel/vdso32/gettimeofday.S | 324 ------------------------------ arch/powerpc/kernel/vdso32/vdso32.lds.S | 4 arch/powerpc/kernel/vdso64/Makefile | 2 arch/powerpc/kernel/vdso64/datapage.S | 18 - arch/powerpc/kernel/vdso64/gettimeofday.S | 255 ----------------------- arch/powerpc/kernel/vdso64/vdso64.lds.S | 4 include/asm-powerpc/time.h | 20 - include/asm-powerpc/vdso_datapage.h | 14 - 12 files changed, 2 insertions(+), 676 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/asm-offsets.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/asm-offsets.c +++ linux-rt.q/arch/powerpc/kernel/asm-offsets.c @@ -273,16 +273,7 @@ int main(void) #endif /* ! CONFIG_PPC64 */ /* datapage offsets for use by vdso */ - DEFINE(CFG_TB_ORIG_STAMP, offsetof(struct vdso_data, tb_orig_stamp)); - DEFINE(CFG_TB_TICKS_PER_SEC, offsetof(struct vdso_data, tb_ticks_per_sec)); - DEFINE(CFG_TB_TO_XS, offsetof(struct vdso_data, tb_to_xs)); - DEFINE(CFG_STAMP_XSEC, offsetof(struct vdso_data, stamp_xsec)); - DEFINE(CFG_TB_UPDATE_COUNT, offsetof(struct vdso_data, tb_update_count)); - DEFINE(CFG_TZ_MINUTEWEST, offsetof(struct vdso_data, tz_minuteswest)); - DEFINE(CFG_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime)); DEFINE(CFG_SYSCALL_MAP32, offsetof(struct vdso_data, syscall_map_32)); - DEFINE(WTOM_CLOCK_SEC, offsetof(struct vdso_data, wtom_clock_sec)); - DEFINE(WTOM_CLOCK_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); #ifdef CONFIG_PPC64 DEFINE(CFG_SYSCALL_MAP64, offsetof(struct vdso_data, syscall_map_64)); DEFINE(TVAL64_TV_SEC, offsetof(struct timeval, tv_sec)); @@ -303,12 +294,6 @@ int main(void) DEFINE(TZONE_TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); DEFINE(TZONE_TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); - /* Other bits used by the vdso */ - 
DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); - DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); - DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); - DEFINE(CLOCK_REALTIME_RES, TICK_NSEC); - #ifdef CONFIG_BUG DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); #endif Index: linux-rt.q/arch/powerpc/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/smp.c +++ linux-rt.q/arch/powerpc/kernel/smp.c @@ -331,8 +331,6 @@ void smp_call_function_interrupt(void) } } -extern struct gettimeofday_struct do_gtod; - struct thread_info *current_set[NR_CPUS]; DECLARE_PER_CPU(unsigned int, pvr); Index: linux-rt.q/arch/powerpc/kernel/vdso32/Makefile =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/vdso32/Makefile +++ linux-rt.q/arch/powerpc/kernel/vdso32/Makefile @@ -1,7 +1,7 @@ # List of files in the vdso, has to be asm only for now -obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o +obj-vdso32 = sigtramp.o datapage.o cacheflush.o note.o # Build rules Index: linux-rt.q/arch/powerpc/kernel/vdso32/datapage.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/vdso32/datapage.S +++ linux-rt.q/arch/powerpc/kernel/vdso32/datapage.S @@ -65,21 +65,3 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_ma blr .cfi_endproc V_FUNCTION_END(__kernel_get_syscall_map) - -/* - * void unsigned long long __kernel_get_tbfreq(void); - * - * returns the timebase frequency in HZ - */ -V_FUNCTION_BEGIN(__kernel_get_tbfreq) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - bl __get_datapage@local - lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) - lwz r3,CFG_TB_TICKS_PER_SEC(r3) - mtlr r12 - crclr cr0*4+so - blr - .cfi_endproc -V_FUNCTION_END(__kernel_get_tbfreq) Index: linux-rt.q/arch/powerpc/kernel/vdso32/gettimeofday.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/vdso32/gettimeofday.S 
+++ /dev/null @@ -1,324 +0,0 @@ -/* - * Userland implementation of gettimeofday() for 32 bits processes in a - * ppc64 kernel for use in the vDSO - * - * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org, - * IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include - - .text -/* - * Exact prototype of gettimeofday - * - * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); - * - */ -V_FUNCTION_BEGIN(__kernel_gettimeofday) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - - mr r10,r3 /* r10 saves tv */ - mr r11,r4 /* r11 saves tz */ - bl __get_datapage@local /* get data page */ - mr r9, r3 /* datapage ptr in r9 */ - cmplwi r10,0 /* check if tv is NULL */ - beq 3f - bl __do_get_xsec@local /* get xsec from tb & kernel */ - bne- 2f /* out of line -> do syscall */ - - /* seconds are xsec >> 20 */ - rlwinm r5,r4,12,20,31 - rlwimi r5,r3,12,0,19 - stw r5,TVAL32_TV_SEC(r10) - - /* get remaining xsec and convert to usec. 
we scale - * up remaining xsec by 12 bits and get the top 32 bits - * of the multiplication - */ - rlwinm r5,r4,12,0,19 - lis r6,1000000@h - ori r6,r6,1000000@l - mulhwu r5,r5,r6 - stw r5,TVAL32_TV_USEC(r10) - -3: cmplwi r11,0 /* check if tz is NULL */ - beq 1f - lwz r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */ - lwz r5,CFG_TZ_DSTTIME(r9) - stw r4,TZONE_TZ_MINWEST(r11) - stw r5,TZONE_TZ_DSTTIME(r11) - -1: mtlr r12 - crclr cr0*4+so - li r3,0 - blr - -2: - mtlr r12 - mr r3,r10 - mr r4,r11 - li r0,__NR_gettimeofday - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_gettimeofday) - -/* - * Exact prototype of clock_gettime() - * - * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); - * - */ -V_FUNCTION_BEGIN(__kernel_clock_gettime) - .cfi_startproc - /* Check for supported clock IDs */ - cmpli cr0,r3,CLOCK_REALTIME - cmpli cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f - - mflr r12 /* r12 saves lr */ - .cfi_register lr,r12 - mr r10,r3 /* r10 saves id */ - mr r11,r4 /* r11 saves tp */ - bl __get_datapage@local /* get data page */ - mr r9,r3 /* datapage ptr in r9 */ - beq cr1,50f /* if monotonic -> jump there */ - - /* - * CLOCK_REALTIME - */ - - bl __do_get_xsec@local /* get xsec from tb & kernel */ - bne- 98f /* out of line -> do syscall */ - - /* seconds are xsec >> 20 */ - rlwinm r5,r4,12,20,31 - rlwimi r5,r3,12,0,19 - stw r5,TSPC32_TV_SEC(r11) - - /* get remaining xsec and convert to nsec. 
we scale - * up remaining xsec by 12 bits and get the top 32 bits - * of the multiplication, then we multiply by 1000 - */ - rlwinm r5,r4,12,0,19 - lis r6,1000000@h - ori r6,r6,1000000@l - mulhwu r5,r5,r6 - mulli r5,r5,1000 - stw r5,TSPC32_TV_NSEC(r11) - mtlr r12 - crclr cr0*4+so - li r3,0 - blr - - /* - * CLOCK_MONOTONIC - */ - -50: bl __do_get_xsec@local /* get xsec from tb & kernel */ - bne- 98f /* out of line -> do syscall */ - - /* seconds are xsec >> 20 */ - rlwinm r6,r4,12,20,31 - rlwimi r6,r3,12,0,19 - - /* get remaining xsec and convert to nsec. we scale - * up remaining xsec by 12 bits and get the top 32 bits - * of the multiplication, then we multiply by 1000 - */ - rlwinm r7,r4,12,0,19 - lis r5,1000000@h - ori r5,r5,1000000@l - mulhwu r7,r7,r5 - mulli r7,r7,1000 - - /* now we must fixup using wall to monotonic. We need to snapshot - * that value and do the counter trick again. Fortunately, we still - * have the counter value in r8 that was returned by __do_get_xsec. - * At this point, r6,r7 contain our sec/nsec values, r3,r4 and r5 - * can be used - */ - - lwz r3,WTOM_CLOCK_SEC(r9) - lwz r4,WTOM_CLOCK_NSEC(r9) - - /* We now have our result in r3,r4. We create a fake dependency - * on that result and re-check the counter - */ - or r5,r4,r3 - xor r0,r5,r5 - add r9,r9,r0 -#ifdef CONFIG_PPC64 - lwz r0,(CFG_TB_UPDATE_COUNT+4)(r9) -#else - lwz r0,(CFG_TB_UPDATE_COUNT)(r9) -#endif - cmpl cr0,r8,r0 /* check if updated */ - bne- 50b - - /* Calculate and store result. Note that this mimmics the C code, - * which may cause funny results if nsec goes negative... is that - * possible at all ? 
- */ - add r3,r3,r6 - add r4,r4,r7 - lis r5,NSEC_PER_SEC@h - ori r5,r5,NSEC_PER_SEC@l - cmpl cr0,r4,r5 - cmpli cr1,r4,0 - blt 1f - subf r4,r5,r4 - addi r3,r3,1 -1: bge cr1,1f - addi r3,r3,-1 - add r4,r4,r5 -1: stw r3,TSPC32_TV_SEC(r11) - stw r4,TSPC32_TV_NSEC(r11) - - mtlr r12 - crclr cr0*4+so - li r3,0 - blr - - /* - * syscall fallback - */ -98: - mtlr r12 - mr r3,r10 - mr r4,r11 -99: - li r0,__NR_clock_gettime - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_clock_gettime) - - -/* - * Exact prototype of clock_getres() - * - * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); - * - */ -V_FUNCTION_BEGIN(__kernel_clock_getres) - .cfi_startproc - /* Check for supported clock IDs */ - cmpwi cr0,r3,CLOCK_REALTIME - cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f - - li r3,0 - cmpli cr0,r4,0 - crclr cr0*4+so - beqlr - lis r5,CLOCK_REALTIME_RES@h - ori r5,r5,CLOCK_REALTIME_RES@l - stw r3,TSPC32_TV_SEC(r4) - stw r5,TSPC32_TV_NSEC(r4) - blr - - /* - * syscall fallback - */ -99: - li r0,__NR_clock_getres - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_clock_getres) - - -/* - * This is the core of gettimeofday() & friends, it returns the xsec - * value in r3 & r4 and expects the datapage ptr (non clobbered) - * in r9. clobbers r0,r4,r5,r6,r7,r8. - * When returning, r8 contains the counter value that can be reused - * by the monotonic clock implementation - */ -__do_get_xsec: - .cfi_startproc - /* Check for update count & load values. We use the low - * order 32 bits of the update count - */ -#ifdef CONFIG_PPC64 -1: lwz r8,(CFG_TB_UPDATE_COUNT+4)(r9) -#else -1: lwz r8,(CFG_TB_UPDATE_COUNT)(r9) -#endif - andi. r0,r8,1 /* pending update ? 
loop */ - bne- 1b - xor r0,r8,r8 /* create dependency */ - add r9,r9,r0 - - /* Load orig stamp (offset to TB) */ - lwz r5,CFG_TB_ORIG_STAMP(r9) - lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) - - /* Get a stable TB value */ -2: mftbu r3 - mftbl r4 - mftbu r0 - cmpl cr0,r3,r0 - bne- 2b - - /* Substract tb orig stamp. If the high part is non-zero, we jump to - * the slow path which call the syscall. - * If it's ok, then we have our 32 bits tb_ticks value in r7 - */ - subfc r7,r6,r4 - subfe. r0,r5,r3 - bne- 3f - - /* Load scale factor & do multiplication */ - lwz r5,CFG_TB_TO_XS(r9) /* load values */ - lwz r6,(CFG_TB_TO_XS+4)(r9) - mulhwu r4,r7,r5 - mulhwu r6,r7,r6 - mullw r0,r7,r5 - addc r6,r6,r0 - - /* At this point, we have the scaled xsec value in r4 + XER:CA - * we load & add the stamp since epoch - */ - lwz r5,CFG_STAMP_XSEC(r9) - lwz r6,(CFG_STAMP_XSEC+4)(r9) - adde r4,r4,r6 - addze r3,r5 - - /* We now have our result in r3,r4. We create a fake dependency - * on that result and re-check the counter - */ - or r6,r4,r3 - xor r0,r6,r6 - add r9,r9,r0 -#ifdef CONFIG_PPC64 - lwz r0,(CFG_TB_UPDATE_COUNT+4)(r9) -#else - lwz r0,(CFG_TB_UPDATE_COUNT)(r9) -#endif - cmpl cr0,r8,r0 /* check if updated */ - bne- 1b - - /* Warning ! The caller expects CR:EQ to be set to indicate a - * successful calculation (so it won't fallback to the syscall - * method). We have overriden that CR bit in the counter check, - * but fortunately, the loop exit condition _is_ CR:EQ set, so - * we can exit safely here. If you change this code, be careful - * of that side effect. 
- */ -3: blr - .cfi_endproc Index: linux-rt.q/arch/powerpc/kernel/vdso32/vdso32.lds.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ linux-rt.q/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -117,10 +117,6 @@ VERSION global: __kernel_datapage_offset; /* Has to be there for the kernel to find */ __kernel_get_syscall_map; - __kernel_gettimeofday; - __kernel_clock_gettime; - __kernel_clock_getres; - __kernel_get_tbfreq; __kernel_sync_dicache; __kernel_sync_dicache_p5; __kernel_sigtramp32; Index: linux-rt.q/arch/powerpc/kernel/vdso64/Makefile =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/vdso64/Makefile +++ linux-rt.q/arch/powerpc/kernel/vdso64/Makefile @@ -1,6 +1,6 @@ # List of files in the vdso, has to be asm only for now -obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o +obj-vdso64 = sigtramp.o datapage.o cacheflush.o note.o # Build rules Index: linux-rt.q/arch/powerpc/kernel/vdso64/datapage.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/vdso64/datapage.S +++ linux-rt.q/arch/powerpc/kernel/vdso64/datapage.S @@ -65,21 +65,3 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_ma blr .cfi_endproc V_FUNCTION_END(__kernel_get_syscall_map) - - -/* - * void unsigned long __kernel_get_tbfreq(void); - * - * returns the timebase frequency in HZ - */ -V_FUNCTION_BEGIN(__kernel_get_tbfreq) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - bl V_LOCAL_FUNC(__get_datapage) - ld r3,CFG_TB_TICKS_PER_SEC(r3) - mtlr r12 - crclr cr0*4+so - blr - .cfi_endproc -V_FUNCTION_END(__kernel_get_tbfreq) Index: linux-rt.q/arch/powerpc/kernel/vdso64/gettimeofday.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/vdso64/gettimeofday.S +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Userland implementation of gettimeofday() for 64 
bits processes in a - * ppc64 kernel for use in the vDSO - * - * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), - * IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include - - .text -/* - * Exact prototype of gettimeofday - * - * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); - * - */ -V_FUNCTION_BEGIN(__kernel_gettimeofday) - .cfi_startproc - mflr r12 - .cfi_register lr,r12 - - mr r11,r3 /* r11 holds tv */ - mr r10,r4 /* r10 holds tz */ - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ - cmpldi r11,0 /* check if tv is NULL */ - beq 2f - bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ - ori r7,r7,16960 - rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ - rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ - std r5,TVAL64_TV_SEC(r11) /* store sec in tv */ - subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / - * XSEC_PER_SEC - */ - rldicl r0,r0,44,20 - std r0,TVAL64_TV_USEC(r11) /* store usec in tv */ -2: cmpldi r10,0 /* check if tz is NULL */ - beq 1f - lwz r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */ - lwz r5,CFG_TZ_DSTTIME(r3) - stw r4,TZONE_TZ_MINWEST(r10) - stw r5,TZONE_TZ_DSTTIME(r10) -1: mtlr r12 - crclr cr0*4+so - li r3,0 /* always success */ - blr - .cfi_endproc -V_FUNCTION_END(__kernel_gettimeofday) - - -/* - * Exact prototype of clock_gettime() - * - * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); - * - */ -V_FUNCTION_BEGIN(__kernel_clock_gettime) - .cfi_startproc - /* Check for supported clock IDs */ - cmpwi cr0,r3,CLOCK_REALTIME - cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f - - mflr r12 
/* r12 saves lr */ - .cfi_register lr,r12 - mr r10,r3 /* r10 saves id */ - mr r11,r4 /* r11 saves tp */ - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ - beq cr1,50f /* if monotonic -> jump there */ - - /* - * CLOCK_REALTIME - */ - - bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - - lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ - ori r7,r7,16960 - rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ - rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ - std r5,TSPC64_TV_SEC(r11) /* store sec in tv */ - subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / - * XSEC_PER_SEC - */ - rldicl r0,r0,44,20 - mulli r0,r0,1000 /* nsec = usec * 1000 */ - std r0,TSPC64_TV_NSEC(r11) /* store nsec in tp */ - - mtlr r12 - crclr cr0*4+so - li r3,0 - blr - - /* - * CLOCK_MONOTONIC - */ - -50: bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - - lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ - ori r7,r7,16960 - rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ - rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ - subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / - * XSEC_PER_SEC - */ - rldicl r6,r0,44,20 - mulli r6,r6,1000 /* nsec = usec * 1000 */ - - /* now we must fixup using wall to monotonic. We need to snapshot - * that value and do the counter trick again. Fortunately, we still - * have the counter value in r8 that was returned by __do_get_xsec. - * At this point, r5,r6 contain our sec/nsec values. - * can be used - */ - - lwa r4,WTOM_CLOCK_SEC(r3) - lwa r7,WTOM_CLOCK_NSEC(r3) - - /* We now have our result in r4,r7. We create a fake dependency - * on that result and re-check the counter - */ - or r9,r4,r7 - xor r0,r9,r9 - add r3,r3,r0 - ld r0,CFG_TB_UPDATE_COUNT(r3) - cmpld cr0,r0,r8 /* check if updated */ - bne- 50b - - /* Calculate and store result. Note that this mimmics the C code, - * which may cause funny results if nsec goes negative... 
is that - * possible at all ? - */ - add r4,r4,r5 - add r7,r7,r6 - lis r9,NSEC_PER_SEC@h - ori r9,r9,NSEC_PER_SEC@l - cmpl cr0,r7,r9 - cmpli cr1,r7,0 - blt 1f - subf r7,r9,r7 - addi r4,r4,1 -1: bge cr1,1f - addi r4,r4,-1 - add r7,r7,r9 -1: std r4,TSPC64_TV_SEC(r11) - std r7,TSPC64_TV_NSEC(r11) - - mtlr r12 - crclr cr0*4+so - li r3,0 - blr - - /* - * syscall fallback - */ -98: - mtlr r12 - mr r3,r10 - mr r4,r11 -99: - li r0,__NR_clock_gettime - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_clock_gettime) - - -/* - * Exact prototype of clock_getres() - * - * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); - * - */ -V_FUNCTION_BEGIN(__kernel_clock_getres) - .cfi_startproc - /* Check for supported clock IDs */ - cmpwi cr0,r3,CLOCK_REALTIME - cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f - - li r3,0 - cmpli cr0,r4,0 - crclr cr0*4+so - beqlr - lis r5,CLOCK_REALTIME_RES@h - ori r5,r5,CLOCK_REALTIME_RES@l - std r3,TSPC64_TV_SEC(r4) - std r5,TSPC64_TV_NSEC(r4) - blr - - /* - * syscall fallback - */ -99: - li r0,__NR_clock_getres - sc - blr - .cfi_endproc -V_FUNCTION_END(__kernel_clock_getres) - - -/* - * This is the core of gettimeofday(), it returns the xsec - * value in r4 and expects the datapage ptr (non clobbered) - * in r3. clobbers r0,r4,r5,r6,r7,r8 - * When returning, r8 contains the counter value that can be reused - */ -V_FUNCTION_BEGIN(__do_get_xsec) - .cfi_startproc - /* check for update count & load values */ -1: ld r8,CFG_TB_UPDATE_COUNT(r3) - andi. r0,r8,1 /* pending update ? loop */ - bne- 1b - xor r0,r8,r8 /* create dependency */ - add r3,r3,r0 - - /* Get TB & offset it. We use the MFTB macro which will generate - * workaround code for Cell. 
- */ - MFTB(r7) - ld r9,CFG_TB_ORIG_STAMP(r3) - subf r7,r9,r7 - - /* Scale result */ - ld r5,CFG_TB_TO_XS(r3) - mulhdu r7,r7,r5 - - /* Add stamp since epoch */ - ld r6,CFG_STAMP_XSEC(r3) - add r4,r6,r7 - - xor r0,r4,r4 - add r3,r3,r0 - ld r0,CFG_TB_UPDATE_COUNT(r3) - cmpld cr0,r0,r8 /* check if updated */ - bne- 1b - blr - .cfi_endproc -V_FUNCTION_END(__do_get_xsec) Index: linux-rt.q/arch/powerpc/kernel/vdso64/vdso64.lds.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/vdso64/vdso64.lds.S +++ linux-rt.q/arch/powerpc/kernel/vdso64/vdso64.lds.S @@ -115,10 +115,6 @@ VERSION global: __kernel_datapage_offset; /* Has to be there for the kernel to find */ __kernel_get_syscall_map; - __kernel_gettimeofday; - __kernel_clock_gettime; - __kernel_clock_getres; - __kernel_get_tbfreq; __kernel_sync_dicache; __kernel_sync_dicache_p5; __kernel_sigtramp_rt64; Index: linux-rt.q/include/asm-powerpc/time.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/time.h +++ linux-rt.q/include/asm-powerpc/time.h @@ -47,26 +47,6 @@ extern unsigned long ppc_proc_freq; extern unsigned long ppc_tb_freq; #define DEFAULT_TB_FREQ 125000000UL -/* - * By putting all of this stuff into a single struct we - * reduce the number of cache lines touched by do_gettimeofday. - * Both by collecting all of the data in one cache line and - * by touching only one TOC entry on ppc64. 
- */ -struct gettimeofday_vars { - u64 tb_to_xs; - u64 stamp_xsec; - u64 tb_orig_stamp; -}; - -struct gettimeofday_struct { - unsigned long tb_ticks_per_sec; - struct gettimeofday_vars vars[2]; - struct gettimeofday_vars * volatile varp; - unsigned var_idx; - unsigned tb_to_us; -}; - struct div_result { u64 result_high; u64 result_low; Index: linux-rt.q/include/asm-powerpc/vdso_datapage.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/vdso_datapage.h +++ linux-rt.q/include/asm-powerpc/vdso_datapage.h @@ -74,11 +74,6 @@ struct vdso_data { __u32 icache_size; /* L1 i-cache size 0x68 */ __u32 icache_line_size; /* L1 i-cache line size 0x6C */ - /* those additional ones don't have to be located anywhere - * special as they were not part of the original systemcfg - */ - __s32 wtom_clock_sec; /* Wall to monotonic clock */ - __s32 wtom_clock_nsec; __u32 syscall_map_64[SYSCALL_MAP_SIZE]; /* map of syscalls */ __u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */ }; @@ -89,15 +84,6 @@ struct vdso_data { * And here is the simpler 32 bits version */ struct vdso_data { - __u64 tb_orig_stamp; /* Timebase at boot 0x30 */ - __u64 tb_ticks_per_sec; /* Timebase tics / sec 0x38 */ - __u64 tb_to_xs; /* Inverse of TB to 2^20 0x40 */ - __u64 stamp_xsec; /* 0x48 */ - __u32 tb_update_count; /* Timebase atomicity ctr 0x50 */ - __u32 tz_minuteswest; /* Minutes west of Greenwich 0x58 */ - __u32 tz_dsttime; /* Type of dst correction 0x5C */ - __s32 wtom_clock_sec; /* Wall to monotonic clock */ - __s32 wtom_clock_nsec; __u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */ }; patches/lockstat-output.patch0000664000077200007720000002753710646635217015754 0ustar mingomingoSubject: lockstat: human readability tweaks Present all this fancy new lock statistics information: *warning, _wide_ output ahead* (output edited for purpose of brevity) # cat /proc/lock_stat lock_stat version 0.1 
----------------------------------------------------------------------------------------------------------------------------------------------------------------- class name contentions waittime-min waittime-max waittime-total acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------- &inode->i_mutex: 14458 6.57 398832.75 2469412.23 6768876 0.34 11398383.65 339410830.89 --------------- &inode->i_mutex 4486 [] pipe_wait+0x86/0x8d &inode->i_mutex 0 [] pipe_write_fasync+0x29/0x5d &inode->i_mutex 0 [] pipe_read+0x74/0x3a5 &inode->i_mutex 0 [] do_lookup+0x81/0x1ae ................................................................................................................................................................. &inode->i_data.tree_lock-W: 491 0.27 62.47 493.89 2477833 0.39 468.89 1146584.25 &inode->i_data.tree_lock-R: 65 0.44 4.27 48.78 26288792 0.36 184.62 10197458.24 -------------------------- &inode->i_data.tree_lock 46 [] __do_page_cache_readahead+0x69/0x24f &inode->i_data.tree_lock 31 [] add_to_page_cache+0x31/0xba &inode->i_data.tree_lock 0 [] __do_page_cache_readahead+0xc2/0x24f &inode->i_data.tree_lock 0 [] find_get_page+0x1a/0x58 ................................................................................................................................................................. proc_inum_idr.lock: 0 0.00 0.00 0.00 36 0.00 65.60 148.26 proc_subdir_lock: 0 0.00 0.00 0.00 3049859 0.00 106.81 1563212.42 shrinker_rwsem-W: 0 0.00 0.00 0.00 5 0.00 1.73 3.68 shrinker_rwsem-R: 0 0.00 0.00 0.00 633 2.57 246.57 10909.76 'contentions' and 'acquisitions' are the number of such events measured (since the last reset). The waittime- and holdtime- (min, max, total) numbers are presented in microseconds. 
If there are any contention points, the lock class is presented in the block format (as i_mutex and tree_lock above), otherwise a single line of output is presented. The output is sorted on absolute number of contentions (read + write), this should get the worst offenders presented first, so that: # grep : /proc/lock_stat | head will quickly show who's bad. The stats can be reset using: # echo 0 > /proc/lock_stat Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron --- kernel/lockdep.c | 44 ++++++++ kernel/lockdep_proc.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 310 insertions(+) Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -168,6 +168,50 @@ static void lock_time_inc(struct lock_ti lt->nr++; } +static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) +{ + dst->min += src->min; + dst->max += src->max; + dst->total += src->total; + dst->nr += src->nr; +} + +struct lock_class_stats lock_stats(struct lock_class *class) +{ + struct lock_class_stats stats; + int cpu, i; + + memset(&stats, 0, sizeof(struct lock_class_stats)); + for_each_possible_cpu(cpu) { + struct lock_class_stats *pcs = + &per_cpu(lock_stats, cpu)[class - lock_classes]; + + for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) + stats.contention_point[i] += pcs->contention_point[i]; + + lock_time_add(&pcs->read_waittime, &stats.read_waittime); + lock_time_add(&pcs->write_waittime, &stats.write_waittime); + + lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + } + + return stats; +} + +void clear_lock_stats(struct lock_class *class) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct lock_class_stats *cpu_stats = + &per_cpu(lock_stats, cpu)[class - lock_classes]; + + memset(cpu_stats, 0, sizeof(struct lock_class_stats)); + } + 
memset(class->contention_point, 0, sizeof(class->contention_point)); +} + static struct lock_class_stats *get_lock_stats(struct lock_class *class) { return &get_cpu_var(lock_stats)[class - lock_classes]; Index: linux-rt.q/kernel/lockdep_proc.c =================================================================== --- linux-rt.q.orig/kernel/lockdep_proc.c +++ linux-rt.q/kernel/lockdep_proc.c @@ -15,6 +15,10 @@ #include #include #include +#include +#include +#include +#include #include "lockdep_internals.h" @@ -342,6 +346,262 @@ static const struct file_operations proc .release = seq_release, }; +#ifdef CONFIG_LOCK_STAT + +struct lock_stat_data { + struct lock_class *class; + struct lock_class_stats stats; +}; + +struct lock_stat_seq { + struct lock_stat_data *iter; + struct lock_stat_data *iter_end; + struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; +}; + +/* + * sort on absolute number of contentions + */ +int lock_stat_cmp(const void *l, const void *r) +{ + const struct lock_stat_data *dl = l, *dr = r; + unsigned long nl, nr; + + nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; + nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; + + return nr - nl; +} + +static void seq_line(struct seq_file *m, char c, int offset, int length) +{ + int i; + + for (i = 0; i < offset; i++) + seq_puts(m, " "); + for (i = 0; i < length; i++) + seq_printf(m, "%c", c); + seq_puts(m, "\n"); +} + +static void snprint_time(char *buf, size_t bufsiz, s64 nr) +{ + unsigned long rem; + + rem = do_div(nr, 1000); /* XXX: do_div_signed */ + snprintf(buf, bufsiz, "%lld.%02d", nr, ((int)rem+5)/10); +} + +static void seq_time(struct seq_file *m, s64 time) +{ + char num[15]; + + snprint_time(num, sizeof(num), time); + seq_printf(m, " %14s", num); +} + +static void seq_lock_time(struct seq_file *m, struct lock_time *lt) +{ + seq_printf(m, "%14lu", lt->nr); + seq_time(m, lt->min); + seq_time(m, lt->max); + seq_time(m, lt->total); +} + +static void seq_stats(struct seq_file *m, 
struct lock_stat_data *data) +{ + char name[39]; + struct lock_class *class; + struct lock_class_stats *stats; + int i, namelen; + + class = data->class; + stats = &data->stats; + + snprintf(name, 38, "%s", class->name); + namelen = strlen(name); + + if (stats->write_holdtime.nr) { + if (stats->read_holdtime.nr) + seq_printf(m, "%38s-W:", name); + else + seq_printf(m, "%40s:", name); + + seq_lock_time(m, &stats->write_waittime); + seq_puts(m, " "); + seq_lock_time(m, &stats->write_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_holdtime.nr) { + seq_printf(m, "%38s-R:", name); + seq_lock_time(m, &stats->read_waittime); + seq_puts(m, " "); + seq_lock_time(m, &stats->read_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_waittime.nr + stats->write_waittime.nr == 0) + return; + + if (stats->read_holdtime.nr) + namelen += 2; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + char sym[KSYM_SYMBOL_LEN]; + char ip[32]; + + if (class->contention_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + sprint_symbol(sym, class->contention_point[i]); + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contention_point[i]); + seq_printf(m, "%40s %14lu %29s %s\n", name, + stats->contention_point[i], + ip, sym); + } + if (i) { + seq_puts(m, "\n"); + seq_line(m, '.', 0, 40 + 1 + 8 * (14 + 1)); + seq_puts(m, "\n"); + } +} + +static void seq_header(struct seq_file *m) +{ + seq_printf(m, "lock_stat version 0.1\n"); + seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s\n", + "class name", + "contentions", + "waittime-min", + "waittime-max", + "waittime-total", + "acquisitions", + "holdtime-min", + "holdtime-max", + "holdtime-total"); + seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + seq_printf(m, "\n"); +} + +static void *ls_start(struct seq_file *m, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + + if (data->iter == data->stats) + seq_header(m); + + return 
data->iter; +} + +static void *ls_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + + (*pos)++; + + data->iter = v; + data->iter++; + if (data->iter == data->iter_end) + data->iter = NULL; + + return data->iter; +} + +static void ls_stop(struct seq_file *m, void *v) +{ +} + +static int ls_show(struct seq_file *m, void *v) +{ + struct lock_stat_seq *data = m->private; + + seq_stats(m, data->iter); + return 0; +} + +static struct seq_operations lockstat_ops = { + .start = ls_start, + .next = ls_next, + .stop = ls_stop, + .show = ls_show, +}; + +static int lock_stat_open(struct inode *inode, struct file *file) +{ + int res; + struct lock_class *class; + struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); + + if (!data) + return -ENOMEM; + + res = seq_open(file, &lockstat_ops); + if (!res) { + struct lock_stat_data *iter = data->stats; + struct seq_file *m = file->private_data; + + data->iter = iter; + list_for_each_entry(class, &all_lock_classes, lock_entry) { + iter->class = class; + iter->stats = lock_stats(class); + iter++; + } + data->iter_end = iter; + + sort(data->stats, data->iter_end - data->iter, + sizeof(struct lock_stat_data), + lock_stat_cmp, NULL); + + m->private = data; + } else + vfree(data); + + return res; +} + +ssize_t lock_stat_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct lock_class *class; + char c; + + if (count) { + if (get_user(c, buf)) + return -EFAULT; + + if (c != '0') + return count; + + list_for_each_entry(class, &all_lock_classes, lock_entry) + clear_lock_stats(class); + } + return count; +} + +static int lock_stat_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + vfree(seq->private); + seq->private = NULL; + return seq_release(inode, file); +} + +static const struct file_operations proc_lock_stat_operations = { + .open = lock_stat_open, + .write = lock_stat_write, + .read = seq_read, 
+ .llseek = seq_lseek, + .release = lock_stat_release, +}; +#endif /* CONFIG_LOCK_STAT */ + static int __init lockdep_proc_init(void) { struct proc_dir_entry *entry; @@ -354,6 +614,12 @@ static int __init lockdep_proc_init(void if (entry) entry->proc_fops = &proc_lockdep_stats_operations; +#ifdef CONFIG_LOCK_STAT + entry = create_proc_entry("lock_stat", S_IRUSR, NULL); + if (entry) + entry->proc_fops = &proc_lock_stat_operations; +#endif + return 0; } patches/lockdep-more-entries.patch0000664000077200007720000000132310646635213016577 0ustar mingomingo--- kernel/lockdep_internals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/kernel/lockdep_internals.h =================================================================== --- linux-rt.q.orig/kernel/lockdep_internals.h +++ linux-rt.q/kernel/lockdep_internals.h @@ -15,12 +15,12 @@ * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_KEYS_BITS 11 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) /* patches/x86_64-apic-shuffle-calibration-around.patch0000664000077200007720000000415510646635211021630 0ustar mingomingoSubject: x86_64: Move apic calibration code around Let the calibration code fill in calibration_result directly and move the variable on top of the file. Fixup a printk w/o log level while at it. 
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -56,6 +56,8 @@ static struct resource lapic_resource = .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; +static unsigned int calibration_result; + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -822,7 +824,7 @@ static void setup_APIC_timer(unsigned in #define TICK_COUNT 100000000 -static int __init calibrate_APIC_clock(void) +static void __init calibrate_APIC_clock(void) { unsigned apic, apic_start; unsigned long tsc, tsc_start; @@ -856,17 +858,14 @@ static int __init calibrate_APIC_clock(v result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start); } - printk("result %d\n", result); - + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", result / 1000 / 1000, result / 1000 % 1000); - return result * APIC_DIVISOR / HZ; + calibration_result = result * APIC_DIVISOR / HZ; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock (void) { if (disable_apic_timer) { @@ -879,7 +878,7 @@ void __init setup_boot_APIC_clock (void) local_irq_disable(); - calibration_result = calibrate_APIC_clock(); + calibrate_APIC_clock(); /* * Now set up the timer for real. */ @@ -986,8 +985,6 @@ void setup_APIC_extended_lvt(unsigned ch apic_write(reg, v); } -#undef APIC_DIVISOR - /* * Local timer interrupt handler. It does both profiling and * process statistics/rescheduling. 
patches/reset-latency-histogram.patch0000664000077200007720000000554410646635212017331 0ustar mingomingoSubject: Latency tracer: Reset histogram when preempt_max_latency was reset From: Carsten Emde When the histogram mode is active, it is not possible to reset the histogram for a second one. Reset it, when preempt_max_latency was reset. Signed-off-by: Carsten Emde --- include/linux/latency_hist.h | 1 + kernel/latency_hist.c | 28 ++++++++++++++++++++++++++++ kernel/latency_trace.c | 15 +++++++++++++++ 3 files changed, 44 insertions(+) Index: linux-rt.q/include/linux/latency_hist.h =================================================================== --- linux-rt.q.orig/include/linux/latency_hist.h +++ linux-rt.q/include/linux/latency_hist.h @@ -23,6 +23,7 @@ enum { #ifdef CONFIG_LATENCY_HIST extern void latency_hist(int latency_type, int cpu, unsigned long latency); +extern void latency_hist_reset(void); # define latency_hist_flag 1 #else # define latency_hist(a,b,c) do { (void)(cpu); } while (0) Index: linux-rt.q/kernel/latency_hist.c =================================================================== --- linux-rt.q.orig/kernel/latency_hist.c +++ linux-rt.q/kernel/latency_hist.c @@ -264,3 +264,31 @@ static __init int latency_hist_init(void __initcall(latency_hist_init); + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static void hist_reset(hist_data_t *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->beyond_hist_bound_samples = 0UL; + hist->min_lat = 0xFFFFFFFFUL; + hist->max_lat = 0UL; + hist->total_samples = 0UL; + hist->accumulate_lat = 0UL; + hist->avg_lat = 0UL; + + atomic_inc(&hist->hist_mode); +} + +void latency_hist_reset(void) +{ + int cpu; + hist_data_t *hist; + + for_each_online_cpu(cpu) { + hist = &per_cpu(wakeup_latency_hist, cpu); + hist_reset(hist); + } +} +#endif Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- 
linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -2182,6 +2182,8 @@ void notrace trace_preempt_exit_idle(voi */ #ifdef CONFIG_WAKEUP_TIMING +unsigned long last_preempt_max_latency; + static void notrace check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip, unsigned long *flags) @@ -2209,6 +2211,19 @@ check_wakeup_timing(struct cpu_trace *tr if (!report_latency(delta)) goto out; +#ifdef CONFIG_WAKEUP_LATENCY_HIST + /* + * Was preempt_max_latency reset? + * If so, we reinitialize the latency histograms to keep them in sync. + * + * FIXME: Remove the poll and write our own procfs handler, so + * we can trigger on the write to preempt_max_latency + */ + if (last_preempt_max_latency > 0 && preempt_max_latency == 0) + latency_hist_reset(); + last_preempt_max_latency = preempt_max_latency; +#endif + ____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, *flags); patches/gcc-warnings-shut-up.patch0000664000077200007720000001353510646635212016543 0ustar mingomingo arch/i386/kernel/efi.c | 2 +- fs/block_dev.c | 2 +- fs/isofs/namei.c | 2 +- fs/jffs2/erase.c | 2 +- fs/nfsd/nfsctl.c | 2 +- ipc/msg.c | 2 +- ipc/sem.c | 2 +- kernel/audit.c | 2 +- kernel/auditfilter.c | 2 +- net/core/flow.c | 2 +- net/sunrpc/svc.c | 2 +- sound/core/control_compat.c | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) Index: linux-rt.q/arch/i386/kernel/efi.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/efi.c +++ linux-rt.q/arch/i386/kernel/efi.c @@ -278,7 +278,7 @@ void efi_memmap_walk(efi_freemem_callbac struct range { unsigned long start; unsigned long end; - } prev, curr; + } prev = { } /* shut up gcc */ , curr = { } /* shut up gcc */ ; efi_memory_desc_t *md; unsigned long start, end; void *p; Index: linux-rt.q/fs/block_dev.c =================================================================== --- linux-rt.q.orig/fs/block_dev.c +++ linux-rt.q/fs/block_dev.c @@ 
-949,7 +949,7 @@ static int bd_claim_by_kobject(struct bl struct kobject *kobj) { int res; - struct bd_holder *bo, *found; + struct bd_holder *bo, *found = NULL /* shut up GCC */; if (!kobj) return -EINVAL; Index: linux-rt.q/fs/isofs/namei.c =================================================================== --- linux-rt.q.orig/fs/isofs/namei.c +++ linux-rt.q/fs/isofs/namei.c @@ -158,7 +158,7 @@ isofs_find_entry(struct inode *dir, stru struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { int found; - unsigned long block, offset; + unsigned long block = 0, offset = 0 /* avoid stupid gcc warning */; struct inode *inode; struct page *page; Index: linux-rt.q/fs/jffs2/erase.c =================================================================== --- linux-rt.q.orig/fs/jffs2/erase.c +++ linux-rt.q/fs/jffs2/erase.c @@ -362,7 +362,7 @@ static void jffs2_mark_erased_block(stru { size_t retlen; int ret; - uint32_t bad_offset; + uint32_t bad_offset = 0 /* shut up gcc */; switch (jffs2_block_check_erase(c, jeb, &bad_offset)) { case -EAGAIN: goto refile; Index: linux-rt.q/fs/nfsd/nfsctl.c =================================================================== --- linux-rt.q.orig/fs/nfsd/nfsctl.c +++ linux-rt.q/fs/nfsd/nfsctl.c @@ -299,7 +299,7 @@ static ssize_t write_filehandle(struct f * qword quoting is used, so filehandle will be \x.... 
*/ char *dname, *path; - int maxsize; + int maxsize = 0; char *mesg = buf; int len; struct auth_domain *dom; Index: linux-rt.q/ipc/msg.c =================================================================== --- linux-rt.q.orig/ipc/msg.c +++ linux-rt.q/ipc/msg.c @@ -387,7 +387,7 @@ copy_msqid_from_user(struct msq_setbuf * asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) { struct kern_ipc_perm *ipcp; - struct msq_setbuf setbuf; + struct msq_setbuf setbuf = { /* shut up gcc warning */ }; struct msg_queue *msq; int err, version; struct ipc_namespace *ns; Index: linux-rt.q/ipc/sem.c =================================================================== --- linux-rt.q.orig/ipc/sem.c +++ linux-rt.q/ipc/sem.c @@ -858,7 +858,7 @@ static int semctl_down(struct ipc_namesp { struct sem_array *sma; int err; - struct sem_setbuf setbuf; + struct sem_setbuf setbuf = { /* shut up gcc warning */ }; struct kern_ipc_perm *ipcp; if(cmd == IPC_SET) { Index: linux-rt.q/kernel/audit.c =================================================================== --- linux-rt.q.orig/kernel/audit.c +++ linux-rt.q/kernel/audit.c @@ -969,7 +969,7 @@ struct audit_buffer *audit_log_start(str { struct audit_buffer *ab = NULL; struct timespec t; - unsigned int serial; + unsigned int serial = 0 /* shut up gcc */; int reserve; unsigned long timeout_start = jiffies; Index: linux-rt.q/kernel/auditfilter.c =================================================================== --- linux-rt.q.orig/kernel/auditfilter.c +++ linux-rt.q/kernel/auditfilter.c @@ -1210,7 +1210,7 @@ static inline int audit_add_rule(struct struct audit_entry *e; struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch = entry->rule.watch; - struct nameidata *ndp, *ndw; + struct nameidata *ndp = NULL, *ndw = NULL /* shut up gcc */; int h, err, putnd_needed = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; Index: linux-rt.q/net/core/flow.c 
=================================================================== --- linux-rt.q.orig/net/core/flow.c +++ linux-rt.q/net/core/flow.c @@ -172,7 +172,7 @@ static int flow_key_compare(struct flowi void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { - struct flow_cache_entry *fle, **head; + struct flow_cache_entry *fle, **head = NULL /* shut up GCC */; unsigned int hash; int cpu; Index: linux-rt.q/net/sunrpc/svc.c =================================================================== --- linux-rt.q.orig/net/sunrpc/svc.c +++ linux-rt.q/net/sunrpc/svc.c @@ -547,7 +547,7 @@ __svc_create_thread(svc_thread_fn func, struct svc_rqst *rqstp; int error = -ENOMEM; int have_oldmask = 0; - cpumask_t oldmask; + cpumask_t oldmask = CPU_MASK_NONE /* shut up GCC */; rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); if (!rqstp) Index: linux-rt.q/sound/core/control_compat.c =================================================================== --- linux-rt.q.orig/sound/core/control_compat.c +++ linux-rt.q/sound/core/control_compat.c @@ -219,7 +219,7 @@ static int copy_ctl_value_from_user(stru struct snd_ctl_elem_value32 __user *data32, int *typep, int *countp) { - int i, type, count, size; + int i, type, count = 0 /* shut up gcc warning */, size; unsigned int indirect; if (copy_from_user(&data->id, &data32->id, sizeof(data->id))) patches/latency-tracing-remove-trace-array.patch0000664000077200007720000000251710646635212021343 0ustar mingomingo--- kernel/sched.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -3242,42 +3242,6 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_EVENT_TRACE) && defined(CONFIG_DEBUG_RT_MUTEXES) - -static void trace_array(struct prio_array *array) -{ - int i; - struct task_struct *p; - struct list_head *head, *tmp; - - for (i = 0; 
i < MAX_RT_PRIO; i++) { - head = array->queue + i; - if (list_empty(head)) { - WARN_ON(test_bit(i, array->bitmap)); - continue; - } - WARN_ON(!test_bit(i, array->bitmap)); - list_for_each(tmp, head) { - p = list_entry(tmp, struct task_struct, run_list); - trace_special_pid(p->pid, p->prio, PRIO(p)); - } - } -} - -static inline void trace_all_runnable_tasks(struct rq *rq) -{ - if (trace_enabled) - trace_array(&rq->active); -} - -#else - -static inline void trace_all_runnable_tasks(struct rq *rq) -{ -} - -#endif - /* * Print scheduling while atomic bug: */ @@ -3385,8 +3349,6 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev, now); next = pick_next_task(rq, prev, now); - trace_all_runnable_tasks(rq); - sched_info_switch(prev, next); if (likely(prev != next)) { patches/x86_64-apic-calibration-remove-divisor.patch0000664000077200007720000000327510646635211021662 0ustar mingomingoSubject: x86_64: Remove APIC_DIVISOR APIC_DIVISOR is rather useless. It makes the calibration result more accurate in the first place, but we discard this later when we write the value to the APIC timer by dividing the calibration value by APIC_DIVISOR. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -760,8 +760,6 @@ void __init init_apic_mappings(void) * P5 APIC double write bug. 
*/ -#define APIC_DIVISOR 16 - static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { unsigned int lvtt_value, tmp_value; @@ -783,7 +781,7 @@ static void __setup_APIC_LVTT(unsigned i | APIC_TDR_DIV_16); if (!oneshot) - apic_write(APIC_TMICT, clocks/APIC_DIVISOR); + apic_write(APIC_TMICT, clocks); } static void setup_APIC_timer(unsigned int clocks) @@ -836,7 +834,7 @@ static void __init calibrate_APIC_clock( * * No interrupt enable ! */ - __setup_APIC_LVTT(4000000000, 0, 0); + __setup_APIC_LVTT(250000000, 0, 0); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER @@ -863,7 +861,7 @@ static void __init calibrate_APIC_clock( printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", result / 1000 / 1000, result / 1000 % 1000); - calibration_result = result * APIC_DIVISOR / HZ; + calibration_result = result / HZ; } void __init setup_boot_APIC_clock (void) patches/rt-mutex-compat-semaphores.patch0000664000077200007720000003320210646635214017763 0ustar mingomingo drivers/acpi/osl.c | 12 ++++++------ drivers/media/dvb/dvb-core/dvb_frontend.c | 2 +- drivers/media/dvb/dvb-core/dvb_frontend.h | 2 +- drivers/net/3c527.c | 2 +- drivers/net/hamradio/6pack.c | 2 +- drivers/net/hamradio/mkiss.c | 2 +- drivers/net/plip.c | 5 ++++- drivers/net/ppp_async.c | 2 +- drivers/net/ppp_synctty.c | 2 +- drivers/pci/hotplug/cpci_hotplug_core.c | 4 ++-- drivers/pci/hotplug/cpqphp_ctrl.c | 4 ++-- drivers/pci/hotplug/ibmphp_hpc.c | 2 +- drivers/scsi/aacraid/aacraid.h | 4 ++-- drivers/scsi/qla2xxx/qla_def.h | 2 +- drivers/usb/storage/usb.h | 2 +- fs/jffs2/jffs2_fs_i.h | 2 +- fs/xfs/linux-2.6/sema.h | 9 +++++++-- fs/xfs/linux-2.6/xfs_buf.h | 4 ++-- include/linux/parport.h | 2 +- 19 files changed, 37 insertions(+), 29 deletions(-) Index: linux-rt.q/drivers/acpi/osl.c =================================================================== --- linux-rt.q.orig/drivers/acpi/osl.c +++ linux-rt.q/drivers/acpi/osl.c @@ -745,13 +745,13 @@ void acpi_os_delete_lock(acpi_spinlock h 
acpi_status acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { - struct semaphore *sem = NULL; + struct compat_semaphore *sem = NULL; - sem = acpi_os_allocate(sizeof(struct semaphore)); + sem = acpi_os_allocate(sizeof(struct compat_semaphore)); if (!sem) return AE_NO_MEMORY; - memset(sem, 0, sizeof(struct semaphore)); + memset(sem, 0, sizeof(struct compat_semaphore)); sema_init(sem, initial_units); @@ -774,7 +774,7 @@ EXPORT_SYMBOL(acpi_os_create_semaphore); acpi_status acpi_os_delete_semaphore(acpi_handle handle) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem) @@ -802,7 +802,7 @@ EXPORT_SYMBOL(acpi_os_delete_semaphore); acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; int ret = 0; @@ -889,7 +889,7 @@ EXPORT_SYMBOL(acpi_os_wait_semaphore); */ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem || (units < 1)) Index: linux-rt.q/drivers/media/dvb/dvb-core/dvb_frontend.c =================================================================== --- linux-rt.q.orig/drivers/media/dvb/dvb-core/dvb_frontend.c +++ linux-rt.q/drivers/media/dvb/dvb-core/dvb_frontend.c @@ -98,7 +98,7 @@ struct dvb_frontend_private { struct dvb_device *dvbdev; struct dvb_frontend_parameters parameters; struct dvb_fe_events events; - struct semaphore sem; + struct compat_semaphore sem; struct list_head list_head; wait_queue_head_t wait_queue; struct task_struct *thread; Index: linux-rt.q/drivers/media/dvb/dvb-core/dvb_frontend.h =================================================================== --- linux-rt.q.orig/drivers/media/dvb/dvb-core/dvb_frontend.h +++ 
linux-rt.q/drivers/media/dvb/dvb-core/dvb_frontend.h @@ -142,7 +142,7 @@ struct dvb_fe_events { int eventr; int overflow; wait_queue_head_t wait_queue; - struct semaphore sem; + struct compat_semaphore sem; }; struct dvb_frontend { Index: linux-rt.q/drivers/net/3c527.c =================================================================== --- linux-rt.q.orig/drivers/net/3c527.c +++ linux-rt.q/drivers/net/3c527.c @@ -182,7 +182,7 @@ struct mc32_local u16 rx_ring_tail; /* index to rx de-queue end */ - struct semaphore cmd_mutex; /* Serialises issuing of execute commands */ + struct compat_semaphore cmd_mutex; /* Serialises issuing of execute commands */ struct completion execution_cmd; /* Card has completed an execute command */ struct completion xceiver_cmd; /* Card has completed a tx or rx command */ }; Index: linux-rt.q/drivers/net/hamradio/6pack.c =================================================================== --- linux-rt.q.orig/drivers/net/hamradio/6pack.c +++ linux-rt.q/drivers/net/hamradio/6pack.c @@ -123,7 +123,7 @@ struct sixpack { struct timer_list tx_t; struct timer_list resync_t; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; spinlock_t lock; }; Index: linux-rt.q/drivers/net/hamradio/mkiss.c =================================================================== --- linux-rt.q.orig/drivers/net/hamradio/mkiss.c +++ linux-rt.q/drivers/net/hamradio/mkiss.c @@ -84,7 +84,7 @@ struct mkiss { #define CRC_MODE_SMACK_TEST 4 atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; }; /*---------------------------------------------------------------------------*/ Index: linux-rt.q/drivers/net/plip.c =================================================================== --- linux-rt.q.orig/drivers/net/plip.c +++ linux-rt.q/drivers/net/plip.c @@ -228,7 +228,10 @@ struct net_local { struct hh_cache *hh); spinlock_t lock; atomic_t kill_timer; - struct semaphore killed_timer_sem; + /* + * PREEMPT_RT: this isnt a 
mutex, it should be struct completion. + */ + struct compat_semaphore killed_timer_sem; }; static inline void enable_parport_interrupts (struct net_device *dev) Index: linux-rt.q/drivers/net/ppp_async.c =================================================================== --- linux-rt.q.orig/drivers/net/ppp_async.c +++ linux-rt.q/drivers/net/ppp_async.c @@ -67,7 +67,7 @@ struct asyncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; Index: linux-rt.q/drivers/net/ppp_synctty.c =================================================================== --- linux-rt.q.orig/drivers/net/ppp_synctty.c +++ linux-rt.q/drivers/net/ppp_synctty.c @@ -70,7 +70,7 @@ struct syncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ }; Index: linux-rt.q/drivers/pci/hotplug/cpci_hotplug_core.c =================================================================== --- linux-rt.q.orig/drivers/pci/hotplug/cpci_hotplug_core.c +++ linux-rt.q/drivers/pci/hotplug/cpci_hotplug_core.c @@ -59,8 +59,8 @@ static int slots; static atomic_t extracting; int cpci_debug; static struct cpci_hp_controller *controller; -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ static int thread_finished = 1; static int enable_slot(struct hotplug_slot *slot); Index: linux-rt.q/drivers/pci/hotplug/cpqphp_ctrl.c =================================================================== --- 
linux-rt.q.orig/drivers/pci/hotplug/cpqphp_ctrl.c +++ linux-rt.q/drivers/pci/hotplug/cpqphp_ctrl.c @@ -45,8 +45,8 @@ static int configure_new_function(struct u8 behind_bridge, struct resource_lists *resources); static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ Index: linux-rt.q/drivers/pci/hotplug/ibmphp_hpc.c =================================================================== --- linux-rt.q.orig/drivers/pci/hotplug/ibmphp_hpc.c +++ linux-rt.q/drivers/pci/hotplug/ibmphp_hpc.c @@ -106,7 +106,7 @@ static int tid_poll; static struct mutex sem_hpcaccess; // lock access to HPC static struct semaphore semOperations; // lock all operations and // access to data structures -static struct semaphore sem_exit; // make sure polling thread goes away +static struct compat_semaphore sem_exit; // make sure polling thread goes away //---------------------------------------------------------------------------- // local function prototypes //---------------------------------------------------------------------------- Index: linux-rt.q/drivers/scsi/aacraid/aacraid.h =================================================================== --- linux-rt.q.orig/drivers/scsi/aacraid/aacraid.h +++ linux-rt.q/drivers/scsi/aacraid/aacraid.h @@ -715,7 +715,7 @@ struct aac_fib_context { u32 unique; // unique value representing this context ulong jiffies; // used for cleanup - dmb changed to ulong struct list_head next; // used to link context's into a linked list - struct semaphore wait_sem; // this is used 
to wait for the next fib to arrive. + struct compat_semaphore wait_sem; // this is used to wait for the next fib to arrive. int wait; // Set to true when thread is in WaitForSingleObject unsigned long count; // total number of FIBs on FibList struct list_head fib_list; // this holds fibs and their attachd hw_fibs @@ -785,7 +785,7 @@ struct fib { * This is the event the sendfib routine will wait on if the * caller did not pass one and this is synch io. */ - struct semaphore event_wait; + struct compat_semaphore event_wait; spinlock_t event_lock; u32 done; /* gets set to 1 when fib is complete */ Index: linux-rt.q/drivers/scsi/qla2xxx/qla_def.h =================================================================== --- linux-rt.q.orig/drivers/scsi/qla2xxx/qla_def.h +++ linux-rt.q/drivers/scsi/qla2xxx/qla_def.h @@ -2344,7 +2344,7 @@ typedef struct scsi_qla_host { #define MBX_UPDATE_FLASH_ACTIVE 3 struct semaphore mbx_cmd_sem; /* Serialialize mbx access */ - struct semaphore mbx_intr_sem; /* Used for completion notification */ + struct compat_semaphore mbx_intr_sem; /* Used for completion notification */ uint32_t mbx_flags; #define MBX_IN_PROGRESS BIT_0 Index: linux-rt.q/drivers/usb/storage/usb.h =================================================================== --- linux-rt.q.orig/drivers/usb/storage/usb.h +++ linux-rt.q/drivers/usb/storage/usb.h @@ -146,7 +146,7 @@ struct us_data { dma_addr_t iobuf_dma; /* mutual exclusion and synchronization structures */ - struct semaphore sema; /* to sleep thread on */ + struct compat_semaphore sema; /* to sleep thread on */ struct completion notify; /* thread begin/end */ wait_queue_head_t delay_wait; /* wait during scan, reset */ Index: linux-rt.q/fs/jffs2/jffs2_fs_i.h =================================================================== --- linux-rt.q.orig/fs/jffs2/jffs2_fs_i.h +++ linux-rt.q/fs/jffs2/jffs2_fs_i.h @@ -24,7 +24,7 @@ struct jffs2_inode_info { before letting GC proceed. 
Or we'd have to put ugliness into the GC code so it didn't attempt to obtain the i_mutex for the inode(s) which are already locked */ - struct semaphore sem; + struct compat_semaphore sem; /* The highest (datanode) version number used for this ino */ uint32_t highest_version; Index: linux-rt.q/fs/xfs/linux-2.6/sema.h =================================================================== --- linux-rt.q.orig/fs/xfs/linux-2.6/sema.h +++ linux-rt.q/fs/xfs/linux-2.6/sema.h @@ -27,7 +27,7 @@ * sema_t structure just maps to struct semaphore in Linux kernel. */ -typedef struct semaphore sema_t; +typedef struct compat_semaphore sema_t; #define initnsema(sp, val, name) sema_init(sp, val) #define psema(sp, b) down(sp) @@ -36,7 +36,12 @@ typedef struct semaphore sema_t; static inline int issemalocked(sema_t *sp) { - return down_trylock(sp) || (up(sp), 0); + int rv; + + if ((rv = down_trylock(sp))) + return (rv); + up(sp); + return (0); } /* Index: linux-rt.q/fs/xfs/linux-2.6/xfs_buf.h =================================================================== --- linux-rt.q.orig/fs/xfs/linux-2.6/xfs_buf.h +++ linux-rt.q/fs/xfs/linux-2.6/xfs_buf.h @@ -118,7 +118,7 @@ typedef int (*xfs_buf_bdstrat_t)(struct #define XB_PAGES 2 typedef struct xfs_buf { - struct semaphore b_sema; /* semaphore for lockables */ + struct compat_semaphore b_sema; /* semaphore for lockables */ unsigned long b_queuetime; /* time buffer was queued */ atomic_t b_pin_count; /* pin count */ wait_queue_head_t b_waiters; /* unpin waiters */ @@ -138,7 +138,7 @@ typedef struct xfs_buf { xfs_buf_iodone_t b_iodone; /* I/O completion function */ xfs_buf_relse_t b_relse; /* releasing function */ xfs_buf_bdstrat_t b_strat; /* pre-write function */ - struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ + struct compat_semaphore b_iodonesema; /* Semaphore for I/O waiters */ void *b_fspriv; void *b_fspriv2; void *b_fspriv3; Index: linux-rt.q/include/linux/parport.h 
=================================================================== --- linux-rt.q.orig/include/linux/parport.h +++ linux-rt.q/include/linux/parport.h @@ -265,7 +265,7 @@ enum ieee1284_phase { struct ieee1284_info { int mode; volatile enum ieee1284_phase phase; - struct semaphore irq; + struct compat_semaphore irq; }; /* A parallel port */ patches/fix-migrating-softirq.patch0000664000077200007720000001114710646635216017011 0ustar mingomingoFrom rostedt@goodmis.org Wed Jun 13 14:47:26 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from ms-smtp-02.nyroc.rr.com (ms-smtp-02.nyroc.rr.com [24.24.2.56]) by mail.tglx.de (Postfix) with ESMTP id AB7B665C3D9 for ; Wed, 13 Jun 2007 14:47:26 +0200 (CEST) Received: from [192.168.23.10] (cpe-24-94-51-176.stny.res.rr.com [24.94.51.176]) by ms-smtp-02.nyroc.rr.com (8.13.6/8.13.6) with ESMTP id l5DClGVg022890; Wed, 13 Jun 2007 08:47:17 -0400 (EDT) Subject: [PATCH RT] fix migrating softirq [cause of network hang] From: Steven Rostedt To: Ingo Molnar Cc: LKML , RT , Thomas Gleixner , john stultz Content-Type: text/plain Date: Wed, 13 Jun 2007 08:47:16 -0400 Message-Id: <1181738836.10408.54.camel@localhost.localdomain> Mime-Version: 1.0 X-Mailer: Evolution 2.6.3 X-Virus-Scanned: Symantec AntiVirus Scan Engine X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Softirqs are bound to a single CPU. That is to say, that once a softirq function starts to run, it will stay on the CPU that it is running on while it's running. In RT, softirqs are threads, and we have a softirq thread per cpu. Each softirq thread is bound to a single CPU that it represents. 
In order to speed things up and lower context switches in RT, if a softirq thread is of the same priority as an interrupt thread, then when the interrupt thread is about to exit, it tests to see if any softirq threads need to be run on that cpu. Instead of running the softirq thread, it simply performs the functions for the softirq within the interrupt thread. The problem is, nothing prevents the interrupt thread from migrating. So while the interrupt thread is running the softirq function, it may migrate to another CPU in the middle of that function. This means that any CPU data that the softirq is touching can be corrupted. I was experiencing a network hang that sometimes would come back, and sometimes not. Using my logdev debugger, I started to debug this problem. I came across this at the moment of the hang: [ 389.131279] cpu:0 (IRQ-11:427) tcp_rcv_established:4056 rcv_nxt=-1665585797 [ 389.131615] cpu:1 192.168.23.72:22 <== 192.168.23.60:41352 ack:2629381499 seq:1773074099 (----A-) len:0 win:790 end_seq:1773074099 [ 389.131626] cpu:1 (IRQ-11:427) ip_finish_output2:187 dst->hh=ffff81003b213080 [ 389.131635] cpu:1 (IRQ-11:427) ip_finish_output2:189 hh_output=ffffffff80429009 Here we see IRQ-11 in the process of finishing up the softirq-net-tx function. In the middle of it, we receive a packet, and that must have pushed the interrupt thread over to CPU 1, and it finished up the softirq there. This patch temporarily binds the hardirq thread on the CPU that it runs the softirqs on. With this patch I have not seen my network hang. I ran it over night, doing compiles and such, and it seems fine. I would be able to cause the hang with various loads within a minute, now I can't cause it after several minutes. I'm assuming that this fix may fix other bugs too. 
Signed-off-by: Steven Rostedt --- kernel/irq/manage.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) Index: linux-rt.q/kernel/irq/manage.c =================================================================== --- linux-rt.q.orig/kernel/irq/manage.c +++ linux-rt.q/kernel/irq/manage.c @@ -763,7 +763,15 @@ static int do_irqd(void * __desc) struct irq_desc *desc = __desc; #ifdef CONFIG_SMP - set_cpus_allowed(current, desc->affinity); + cpumask_t cpus_allowed, mask; + + cpus_allowed = desc->affinity; + /* + * Restrict it to one cpu so we avoid being migrated inside of + * do_softirq_from_hardirq() + */ + mask = cpumask_of_cpu(first_cpu(desc->affinity)); + set_cpus_allowed(current, mask); #endif current->flags |= PF_NOFREEZE | PF_HARDIRQ; @@ -787,8 +795,16 @@ static int do_irqd(void * __desc) /* * Did IRQ affinities change? */ - if (!cpus_equal(current->cpus_allowed, desc->affinity)) - set_cpus_allowed(current, desc->affinity); + if (!cpus_equal(cpus_allowed, desc->affinity)) { + cpus_allowed = desc->affinity; + /* + * Restrict it to one cpu so we avoid being + * migrated inside of + * do_softirq_from_hardirq() + */ + mask = cpumask_of_cpu(first_cpu(desc->affinity)); + set_cpus_allowed(current, mask); + } #endif schedule(); } patches/new-softirq-code.patch0000664000077200007720000002300710646635216015743 0ustar mingomingoSubject: [patch] softirq preemption: optimization From: Ingo Molnar optimize softirq preemption by allowing a hardirq context to pick up softirq processing. 
Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 1 kernel/irq/manage.c | 19 +---- kernel/softirq.c | 150 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 127 insertions(+), 43 deletions(-) Index: linux-rt.q/include/linux/interrupt.h =================================================================== --- linux-rt.q.orig/include/linux/interrupt.h +++ linux-rt.q/include/linux/interrupt.h @@ -290,6 +290,7 @@ struct softirq_action asmlinkage void do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); extern void softirq_init(void); +extern void do_softirq_from_hardirq(void); #ifdef CONFIG_PREEMPT_HARDIRQS # define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr) Index: linux-rt.q/kernel/irq/manage.c =================================================================== --- linux-rt.q.orig/kernel/irq/manage.c +++ linux-rt.q/kernel/irq/manage.c @@ -694,7 +694,6 @@ static void thread_edge_irq(irq_desc_t * desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); - cond_resched_hardirq_context(); spin_lock_irq(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); @@ -723,7 +722,6 @@ static void thread_do_irq(irq_desc_t *de desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); - cond_resched_hardirq_context(); spin_lock_irq(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); @@ -759,8 +757,6 @@ static void do_hardirq(struct irq_desc * wake_up(&desc->wait_for_handler); } -extern asmlinkage void __do_softirq(void); - static int do_irqd(void * __desc) { struct sched_param param = { 0, }; @@ -780,16 +776,13 @@ static int do_irqd(void * __desc) while (!kthread_should_stop()) { local_irq_disable_nort(); - set_current_state(TASK_INTERRUPTIBLE); -#ifndef CONFIG_PREEMPT_RT - irq_enter(); -#endif - do_hardirq(desc); -#ifndef CONFIG_PREEMPT_RT - irq_exit(); -#endif + do { + 
set_current_state(TASK_INTERRUPTIBLE); + do_hardirq(desc); + do_softirq_from_hardirq(); + } while (current->state == TASK_RUNNING); + local_irq_enable_nort(); - cond_resched(); #ifdef CONFIG_SMP /* * Did IRQ affinities change? Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -100,8 +100,26 @@ static void wakeup_softirqd(int softirq) /* Interrupts are disabled: no need to stop preemption */ struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; - if (tsk && tsk->state != TASK_RUNNING) - wake_up_process(tsk); + if (unlikely(!tsk)) + return; +#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS) + /* + * Optimization: if we are in a hardirq thread context, and + * if the priority of the softirq thread is the same as the + * priority of the hardirq thread, then 'merge' softirq + * processing into the hardirq context. (it will later on + * execute softirqs via do_softirq_from_hardirq()). + * So here we can skip the wakeup and can rely on the hardirq + * context processing it later on. + */ + if ((current->flags & PF_HARDIRQ) && !hardirq_count() && + (tsk->normal_prio == current->normal_prio)) + return; +#endif + /* + * Wake up the softirq task: + */ + wake_up_process(tsk); } /* @@ -250,50 +268,100 @@ EXPORT_SYMBOL(local_bh_enable_ip); * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. 
*/ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_RESTART 20 + +static DEFINE_PER_CPU(u32, softirq_running); -asmlinkage void ___do_softirq(void) +static void ___do_softirq(const int same_prio_only) { + int max_restart = MAX_SOFTIRQ_RESTART, max_loops = MAX_SOFTIRQ_RESTART; + __u32 pending, available_mask, same_prio_skipped; struct softirq_action *h; - __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; - int cpu; + struct task_struct *tsk; + int cpu, softirq; pending = local_softirq_pending(); account_system_vtime(current); cpu = smp_processor_id(); restart: + available_mask = -1; + softirq = 0; + same_prio_skipped = 0; /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - h = softirq_vec; do { + u32 softirq_mask = 1 << softirq; + if (pending & 1) { - { - u32 preempt_count = preempt_count(); - h->action(h); - if (preempt_count != preempt_count()) { - print_symbol("BUG: softirq exited %s with wrong preemption count!\n", (unsigned long) h->action); - printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); - preempt_count() = preempt_count; + u32 preempt_count = preempt_count(); + +#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS) + /* + * If executed by a same-prio hardirq thread + * then skip pending softirqs that belong + * to softirq threads with different priority: + */ + if (same_prio_only) { + tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; + if (tsk && tsk->normal_prio != + current->normal_prio) { + same_prio_skipped |= softirq_mask; + available_mask &= ~softirq_mask; + goto next; } } +#endif + /* + * Is this softirq already being processed? 
+ */ + if (per_cpu(softirq_running, cpu) & softirq_mask) { + available_mask &= ~softirq_mask; + goto next; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + local_irq_enable(); + + h->action(h); + if (preempt_count != preempt_count()) { + print_symbol("BUG: softirq exited %s with wrong preemption count!\n", (unsigned long) h->action); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; + } rcu_bh_qsctr_inc(cpu); cond_resched_softirq_context(); + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; } +next: h++; + softirq++; pending >>= 1; } while (pending); - local_irq_disable(); - + or_softirq_pending(same_prio_skipped); pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; + if (pending & available_mask) { + if (--max_restart) + goto restart; + /* + * With softirq threading there's no reason not to + * finish the workload we have: + */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + if (--max_loops) { + if (printk_ratelimit()) + printk("INFO: softirq overload: %08x\n", pending); + max_restart = MAX_SOFTIRQ_RESTART; + goto restart; + } + if (printk_ratelimit()) + printk("BUG: softirq loop! 
%08x\n", pending); +#endif + } if (pending) trigger_softirqs(); @@ -321,7 +389,7 @@ asmlinkage void __do_softirq(void) p_flags = current->flags & PF_HARDIRQ; current->flags &= ~PF_HARDIRQ; - ___do_softirq(); + ___do_softirq(0); trace_softirq_exit(); @@ -345,20 +413,29 @@ void do_softirq_from_hardirq(void) if (!local_softirq_pending()) return; /* - * 'immediate' softirq execution: + * 'immediate' softirq execution, from hardirq context: */ + local_irq_disable(); __local_bh_disable((unsigned long)__builtin_return_address(0)); +#ifndef CONFIG_PREEMPT_SOFTIRQS + trace_softirq_enter(); +#endif p_flags = current->flags & PF_HARDIRQ; current->flags &= ~PF_HARDIRQ; + current->flags |= PF_SOFTIRQ; - ___do_softirq(); + ___do_softirq(1); +#ifndef CONFIG_PREEMPT_SOFTIRQS trace_softirq_exit(); - +#endif account_system_vtime(current); - _local_bh_enable(); current->flags |= p_flags; + current->flags &= ~PF_SOFTIRQ; + + _local_bh_enable(); + local_irq_enable(); } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -690,8 +767,9 @@ static int ksoftirqd(void * __data) { struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 }; struct softirqdata *data = __data; - u32 mask = (1 << data->nr); + u32 softirq_mask = (1 << data->nr); struct softirq_action *h; + int cpu = data->cpu; current->flags |= PF_NOFREEZE | PF_SOFTIRQ; @@ -705,7 +783,8 @@ static int ksoftirqd(void * __data) while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending() & mask) { + if (!local_softirq_pending() & softirq_mask) { +sleep_more: __preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -713,16 +792,26 @@ static int ksoftirqd(void * __data) __set_current_state(TASK_RUNNING); - while (local_softirq_pending() & mask) { + while (local_softirq_pending() & softirq_mask) { /* Preempt disable stops cpu going offline.
If already offline, we'll be on wrong CPU: don't process */ - if (cpu_is_offline(data->cpu)) + if (cpu_is_offline(cpu)) goto wait_to_die; local_irq_disable(); + /* + * Is the softirq already being executed by + * a hardirq context? + */ + if (per_cpu(softirq_running, cpu) & softirq_mask) { + local_irq_enable(); + set_current_state(TASK_INTERRUPTIBLE); + goto sleep_more; + } + per_cpu(softirq_running, cpu) |= softirq_mask; __preempt_enable_no_resched(); - set_softirq_pending(local_softirq_pending() & ~mask); + set_softirq_pending(local_softirq_pending() & ~softirq_mask); local_bh_disable(); local_irq_enable(); @@ -732,6 +821,7 @@ static int ksoftirqd(void * __data) rcu_bh_qsctr_inc(data->cpu); local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; _local_bh_enable(); local_irq_enable(); patches/preempt-realtime-powerpc-a7.patch0000664000077200007720000001023710646635214020012 0ustar mingomingo To fix the following compile error by changing local_irq_restore() to raw_local_irq_restore(). 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - include/asm-powerpc/hw_irq.h In file included from include/asm/system.h:9, from include/linux/list.h:9, from include/linux/signal.h:8, from arch/powerpc/kernel/asm-offsets.c:16: include/asm/hw_irq.h: In function 'local_get_flags': include/asm/hw_irq.h:23: error: expected expression before '<<' token include/asm/hw_irq.h:24: error: expected expression before '<<' token include/asm/hw_irq.h:25: error: expected expression before ':' token include/asm/hw_irq.h:25: error: expected statement before ')' token - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Signed-off-by: Tsutomu Owa -- owa --- arch/powerpc/kernel/head_64.S | 2 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/ppc_ksyms.c | 2 +- include/asm-powerpc/hw_irq.h | 18 ++++++++---------- 4 files changed, 11 insertions(+), 13 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/head_64.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/head_64.S +++ linux-rt.q/arch/powerpc/kernel/head_64.S @@ -1391,7 +1391,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISER * handles any interrupts pending at this point. */ ld r3,SOFTE(r1) - bl .local_irq_restore + bl .raw_local_irq_restore b 11f /* Here we have a page fault that hash_page can't handle. 
*/ Index: linux-rt.q/arch/powerpc/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/irq.c +++ linux-rt.q/arch/powerpc/kernel/irq.c @@ -112,7 +112,7 @@ static inline void set_soft_enabled(unsi : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -void notrace local_irq_restore(unsigned long en) +void notrace raw_local_irq_restore(unsigned long en) { /* * get_paca()->soft_enabled = en; Index: linux-rt.q/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/ppc_ksyms.c +++ linux-rt.q/arch/powerpc/kernel/ppc_ksyms.c @@ -49,7 +49,7 @@ #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(local_irq_restore); +EXPORT_SYMBOL(raw_local_irq_restore); #endif #ifdef CONFIG_PPC32 Index: linux-rt.q/include/asm-powerpc/hw_irq.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/hw_irq.h +++ linux-rt.q/include/asm-powerpc/hw_irq.h @@ -16,18 +16,18 @@ extern void timer_interrupt(struct pt_re #ifdef CONFIG_PPC64 #include -static inline unsigned long local_get_flags(void) +static inline unsigned long raw_local_get_flags(void) { unsigned long flags; -<<<<<<< delete extern unsigned long local_get_flags(void); -<<<<<<< delete extern unsigned long local_irq_disable(void); + __asm__ __volatile__("lbz %0,%1(13)" + : "=r" (flags) : "i" (offsetof(struct paca_struct, soft_enabled))); return flags; } -static inline unsigned long local_irq_disable(void) +static inline unsigned long raw_local_irq_disable(void) { unsigned long flags, zero; @@ -53,8 +53,8 @@ extern void raw_local_irq_restore(unsign #define raw_irqs_disabled_flags(flags) ((flags) == 0) -#define __hard_irq_enable() __mtmsrd(mfmsr() | MSR_EE, 1) -#define __hard_irq_disable() __mtmsrd(mfmsr() & ~MSR_EE, 1) +#define __hard_irq_enable() __mtmsrd(mfmsr() | MSR_EE, 1) +#define __hard_irq_disable() __mtmsrd(mfmsr() & 
~MSR_EE, 1) #define hard_irq_disable() \ do { \ @@ -63,17 +63,15 @@ extern void raw_local_irq_restore(unsign get_paca()->hard_enabled = 0; \ } while(0) -#else +#else /* CONFIG_PPC64 */ #if defined(CONFIG_BOOKE) #define SET_MSR_EE(x) mtmsr(x) #define raw_local_irq_restore(flags) __asm__ __volatile__("wrtee %0" : : "r" (flags) : "memory") -<<<<<<< delete #define local_irq_restore(flags) do { \ -#define raw_local_irq_restore(flags) do { \ #else #define SET_MSR_EE(x) mtmsr(x) #define raw_local_irq_restore(flags) mtmsr(flags) -#endif +#endif /* CONFIG_BOOKE */ static inline void raw_local_irq_disable(void) { patches/timer-freq-tweaks.patch0000664000077200007720000000744310646635216016132 0ustar mingomingo--- kernel/rcutorture.c | 2 +- mm/slab.c | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) Index: linux-rt.q/kernel/rcutorture.c =================================================================== --- linux-rt.q.orig/kernel/rcutorture.c +++ linux-rt.q/kernel/rcutorture.c @@ -602,7 +602,7 @@ rcu_torture_reader(void *arg) if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); + schedule_timeout_interruptible(round_jiffies_relative(HZ)); continue; } if (p->rtort_mbtest == 0) Index: linux-rt.q/mm/slab.c =================================================================== --- linux-rt.q.orig/mm/slab.c +++ linux-rt.q/mm/slab.c @@ -1048,7 +1048,7 @@ static int transfer_objects(struct array #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3, this_cpu) do { } while (0) +#define reap_alien(cachep, l3, this_cpu) 0 static inline struct array_cache **alloc_alien_cache(int node, int limit) { @@ -1146,7 +1146,7 @@ static void __drain_alien_cache(struct k /* * Called from cache_reap() to regularly drain alien caches round robin. 
*/ -static void +static int reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu) { int node = per_cpu(reap_node, *this_cpu); @@ -1157,8 +1157,10 @@ reap_alien(struct kmem_cache *cachep, st if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { __drain_alien_cache(cachep, ac, node, this_cpu); spin_unlock_irq(&ac->lock); + return 1; } } + return 0; } static void drain_alien_cache(struct kmem_cache *cachep, @@ -2488,7 +2490,7 @@ static void check_spinlock_acquired_node #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node); @@ -4175,14 +4177,15 @@ static int enable_cpucache(struct kmem_c * Drain an array if it contains any elements taking the l3 lock only if * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. 
+ * returns non-zero if some work is done */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node) { int tofree, this_cpu; if (!ac || !ac->avail) - return; + return 0; if (ac->touched && !force) { ac->touched = 0; } else { @@ -4198,6 +4201,7 @@ void drain_array(struct kmem_cache *cach } slab_spin_unlock_irq(&l3->list_lock, this_cpu); } + return 1; } /** @@ -4235,10 +4239,10 @@ static void cache_reap(struct work_struc */ l3 = searchp->nodelists[node]; - reap_alien(searchp, l3, &this_cpu); + work_done += reap_alien(searchp, l3, &this_cpu); - drain_array(searchp, l3, cpu_cache_get(searchp, this_cpu), - 0, node); + work_done += drain_array(searchp, l3, + cpu_cache_get(searchp, this_cpu), 0, node); /* * These are racy checks but it does not matter @@ -4249,7 +4253,7 @@ static void cache_reap(struct work_struc l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - drain_array(searchp, l3, l3->shared, 0, node); + work_done += drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) l3->free_touched = 0; @@ -4268,9 +4272,9 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, + round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC)); } - #ifdef CONFIG_PROC_FS static void print_slabinfo_header(struct seq_file *m) patches/ep93xx-clockevents-fix.patch0000664000077200007720000000253310646635211017013 0ustar mingomingoSubject: timer patch for ep93xx From: Manfred Gruber hi ! this patch is necessary to get latencies < 1ms for ep93xx armv4t with 2.6.21.5-rt18. 
Signed-off-by: Manfred Gruber --- arch/arm/mach-ep93xx/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/arm/mach-ep93xx/core.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-ep93xx/core.c +++ linux-rt.q/arch/arm/mach-ep93xx/core.c @@ -98,9 +98,9 @@ static struct clock_event_device clockev static int ep93xx_timer_interrupt(int irq, void *dev_id) { - __raw_writel(EP93XX_TC_CLEAR, EP93XX_TIMER1_CLEAR); + __raw_writel(EP93XX_TC_CLEAR, EP93XX_TIMER1_CLEAR); - clockevent_ep93xx.event_handler(&clockevent_ep93xx); + clockevent_ep93xx.event_handler(&clockevent_ep93xx); return IRQ_HANDLED; } @@ -108,7 +108,15 @@ static int ep93xx_timer_interrupt(int ir static int ep93xx_set_next_event(unsigned long evt, struct clock_event_device *unused) { + u32 tmode = __raw_readl(EP93XX_TIMER1_CONTROL); + + /* stop timer */ + __raw_writel(tmode & ~EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL); + /* program timer */ __raw_writel(evt, EP93XX_TIMER1_LOAD); + /* start timer */ + __raw_writel(tmode | EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL); + return 0; } patches/hrtimer-trace.patch0000664000077200007720000000444210646635213015322 0ustar mingomingo include/linux/hrtimer.h | 6 ++++++ kernel/hrtimer.c | 5 +++++ kernel/time/clockevents.c | 4 ++++ 3 files changed, 15 insertions(+) Index: linux-rt.q/include/linux/hrtimer.h =================================================================== --- linux-rt.q.orig/include/linux/hrtimer.h +++ linux-rt.q/include/linux/hrtimer.h @@ -253,6 +253,12 @@ static inline ktime_t hrtimer_cb_get_tim extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); +# if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR) +# define hrtimer_trace(a,b) trace_special_u64((a).tv64,b) +# else +# define hrtimer_trace(a,b) trace_special((a).tv.sec,(a).tv.nsec,b) +# endif + /* Exported timer functions: */ /* Initialize timers: */ Index: linux-rt.q/kernel/hrtimer.c 
=================================================================== --- linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -710,6 +710,8 @@ static void enqueue_hrtimer(struct hrtim struct hrtimer *entry; int leftmost = 1; + hrtimer_trace(timer->expires, (unsigned long) timer); + /* * Find the right place in the rbtree: */ @@ -1043,6 +1045,7 @@ void hrtimer_interrupt(struct clock_even retry: now = ktime_get(); + hrtimer_trace(now, 0); expires_next.tv64 = KTIME_MAX; @@ -1071,6 +1074,8 @@ void hrtimer_interrupt(struct clock_even break; } + hrtimer_trace(timer->expires, (unsigned long) timer); + /* Move softirq callbacks to the pending list */ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { __remove_hrtimer(timer, base, Index: linux-rt.q/kernel/time/clockevents.c =================================================================== --- linux-rt.q.orig/kernel/time/clockevents.c +++ linux-rt.q/kernel/time/clockevents.c @@ -12,12 +12,14 @@ */ #include +#include #include #include #include #include #include #include +#include /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); @@ -80,6 +82,8 @@ int clockevents_program_event(struct clo delta = ktime_to_ns(ktime_sub(expires, now)); + hrtimer_trace(expires, (unsigned long) delta); + if (delta <= 0) return -ETIME; patches/rt-mutex-arm.patch0000664000077200007720000002576610646635214015133 0ustar mingomingo--- arch/arm/kernel/entry-armv.S | 4 +- arch/arm/kernel/entry-common.S | 10 +++--- arch/arm/kernel/process.c | 10 ++++-- arch/arm/kernel/semaphore.c | 31 +++++++++++++++----- include/asm-arm/semaphore.h | 61 ++++++++++++++++++++++++++++------------- include/asm-arm/thread_info.h | 2 + 6 files changed, 80 insertions(+), 38 deletions(-) Index: linux-rt.q/arch/arm/kernel/entry-armv.S =================================================================== --- linux-rt.q.orig/arch/arm/kernel/entry-armv.S +++ linux-rt.q/arch/arm/kernel/entry-armv.S @@ -204,7 +204,7 @@ __irq_svc: irq_handler #ifdef 
CONFIG_PREEMPT ldr r0, [tsk, #TI_FLAGS] @ get flags - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED blne svc_preempt preempt_return: ldr r0, [tsk, #TI_PREEMPT] @ read preempt value @@ -235,7 +235,7 @@ svc_preempt: str r7, [tsk, #TI_PREEMPT] @ expects preempt_count == 0 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED beq preempt_return @ go again b 1b #endif Index: linux-rt.q/arch/arm/kernel/entry-common.S =================================================================== --- linux-rt.q.orig/arch/arm/kernel/entry-common.S +++ linux-rt.q/arch/arm/kernel/entry-common.S @@ -46,7 +46,7 @@ ret_fast_syscall: fast_work_pending: str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: - tst r1, #_TIF_NEED_RESCHED + tst r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED bne work_resched tst r1, #_TIF_NOTIFY_RESUME | _TIF_SIGPENDING beq no_work_pending @@ -56,7 +56,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. 
*/ @@ -396,6 +397,7 @@ ENTRY(sys_oabi_call_table) #include "calls.S" #undef ABI #undef OBSOLETE +#endif #ifdef CONFIG_FRAME_POINTER @@ -445,7 +447,7 @@ mcount: ldr ip, =mcount_enabled @ leave early, if disabled ldr ip, [ip] cmp ip, #0 - moveq pc,lr + moveq pc, lr mov ip, sp stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame @@ -504,5 +506,3 @@ arm_return_addr: #endif -#endif - Index: linux-rt.q/arch/arm/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/process.c +++ linux-rt.q/arch/arm/kernel/process.c @@ -130,7 +130,7 @@ static void default_idle(void) cpu_relax(); else { local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { timer_dyn_reprogram(); arch_idle(); } @@ -162,13 +162,15 @@ void cpu_idle(void) idle = default_idle; leds_event(led_idle_start); tick_nohz_stop_sched_tick(); - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) idle(); leds_event(led_idle_end); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux-rt.q/arch/arm/kernel/semaphore.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/semaphore.c +++ linux-rt.q/arch/arm/kernel/semaphore.c @@ -49,14 +49,16 @@ * we cannot lose wakeup events. 
*/ -void __up(struct semaphore *sem) +fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } +EXPORT_SYMBOL(__compat_up); + static DEFINE_SPINLOCK(semaphore_lock); -void __sched __down(struct semaphore * sem) +fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +EXPORT_SYMBOL(__compat_down); + +fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -140,6 +144,8 @@ int __sched __down_interruptible(struct return retval; } +EXPORT_SYMBOL(__compat_down_interruptible); + /* * Trylock failed - make sure we correct for * having decremented the count. @@ -148,7 +154,7 @@ int __sched __down_interruptible(struct * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) +fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se return 1; } +EXPORT_SYMBOL(__compat_down_trylock); + +fastcall int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); + /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. 
These routines @@ -185,7 +200,7 @@ asm(" .section .sched.text,\"ax\",%progb __down_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down \n\ + bl __compat_down \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ .align 5 \n\ @@ -193,7 +208,7 @@ __down_failed: \n\ __down_interruptible_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_interruptible \n\ + bl __compat_down_interruptible \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -202,7 +217,7 @@ __down_interruptible_failed: \n\ __down_trylock_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_trylock \n\ + bl __compat_down_trylock \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -211,7 +226,7 @@ __down_trylock_failed: \n\ __up_wakeup: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __up \n\ + bl __compat_up \n\ ldmfd sp!, {r0 - r4, pc} \n\ "); Index: linux-rt.q/include/asm-arm/semaphore.h =================================================================== --- linux-rt.q.orig/include/asm-arm/semaphore.h +++ linux-rt.q/include/asm-arm/semaphore.h @@ -5,46 +5,66 @@ #define __ASM_ARM_SEMAPHORE_H #include + +#ifdef CONFIG_PREEMPT_RT +# include +#endif + #include #include #include +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define semaphore compat_semaphore +#endif + #include #include -struct semaphore { +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INIT(name, cnt) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, cnt) \ { \ .count = ATOMIC_INIT(cnt), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INIT(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) + +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) 
__DECLARE_SEMAPHORE_GENERIC(name,1) -#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,0) -static inline void sema_init(struct semaphore *sem, int val) +static inline void compat_sema_init(struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); sem->sleepers = 0; init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX(struct semaphore *sem) +static inline void compat_init_MUTEX(struct compat_semaphore *sem) +{ + compat_sema_init(sem, 1); +} + +static inline void compat_init_MUTEX_LOCKED(struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 0); } -static inline void init_MUTEX_LOCKED(struct semaphore *sem) +static inline int compat_sema_count(struct compat_semaphore *sem) { - sema_init(sem, 0); + return atomic_read(&sem->count); } /* @@ -55,16 +75,18 @@ asmlinkage int __down_interruptible_fai asmlinkage int __down_trylock_failed(void); asmlinkage void __up_wakeup(void); -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern int __down_trylock(struct semaphore * sem); -extern void __up(struct semaphore * sem); +extern void __compat_up(struct compat_semaphore *sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern int __compat_down_trylock(struct compat_semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); + +extern int compat_sem_is_locked(struct compat_semaphore *sem); /* * This is ugly, but we want the default case to fall through. * "__down" is the actual routine that waits... 
*/ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); __down_op(sem, __down_failed); @@ -74,13 +96,13 @@ static inline void down(struct semaphore * This is ugly, but we want the default case to fall through. * "__down_interruptible" is the actual routine that waits... */ -static inline int down_interruptible (struct semaphore * sem) +static inline int compat_down_interruptible (struct compat_semaphore * sem) { might_sleep(); return __down_op_ret(sem, __down_interruptible_failed); } -static inline int down_trylock(struct semaphore *sem) +static inline int compat_down_trylock(struct compat_semaphore *sem) { return __down_op_ret(sem, __down_trylock_failed); } @@ -91,9 +113,10 @@ static inline int down_trylock(struct se * The default case (no contention) will result in NO * jumps for both down() and up(). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __up_op(sem, __up_wakeup); } +#include #endif Index: linux-rt.q/include/asm-arm/thread_info.h =================================================================== --- linux-rt.q.orig/include/asm-arm/thread_info.h +++ linux-rt.q/include/asm-arm/thread_info.h @@ -144,6 +144,7 @@ extern void iwmmxt_task_switch(struct th #define TIF_NOTIFY_RESUME 0 #define TIF_SIGPENDING 1 #define TIF_NEED_RESCHED 2 +#define TIF_NEED_RESCHED_DELAYED 3 #define TIF_SYSCALL_TRACE 8 #define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 @@ -153,6 +154,7 @@ extern void iwmmxt_task_switch(struct th #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_DELAYED (1< #include +#include + static DEFINE_RAW_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, touch_timestamp); @@ -103,9 +105,13 @@ void softlockup_tick(void) stop_trace(); spin_lock(&print_lock); - printk(KERN_ERR "BUG: 
soft lockup detected on CPU#%d!\n", - this_cpu); - dump_stack(); + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d! [%s:%d]\n", + this_cpu, current->comm, current->pid); + if (get_irq_regs()) + show_regs(get_irq_regs()); + else + dump_stack(); +// nmi_show_all_regs(); spin_unlock(&print_lock); } } patches/preempt-realtime-sound.patch0000664000077200007720000000107310646635215017155 0ustar mingomingo--- sound/core/pcm_lib.c | 1 + 1 file changed, 1 insertion(+) Index: linux-rt.q/sound/core/pcm_lib.c =================================================================== --- linux-rt.q.orig/sound/core/pcm_lib.c +++ linux-rt.q/sound/core/pcm_lib.c @@ -130,6 +130,7 @@ static void xrun(struct snd_pcm_substrea snd_pcm_stop(substream, SNDRV_PCM_STATE_XRUN); #ifdef CONFIG_SND_PCM_XRUN_DEBUG if (substream->pstr->xrun_debug) { + user_trace_stop(); snd_printd(KERN_DEBUG "XRUN: pcmC%dD%d%c\n", substream->pcm->card->number, substream->pcm->device, patches/lockstat-core.patch0000664000077200007720000003326610646635217015340 0ustar mingomingoSubject: lockstat: core infrastructure Introduce the core lock statistics code. Lock statistics provides lock wait-time and hold-time (as well as the count of corresponding contention and acquisitions events). Also, the first few call-sites that encounter contention are tracked. Lock wait-time is the time spent waiting on the lock. This provides insight into the locking scheme, that is, a heavily contended lock is indicative of a too coarse locking scheme. Lock hold-time is the duration the lock was held, this provides a reference for the wait-time numbers, so they can be put into perspective. 1) lock 2) ... do stuff .. unlock 3) The time between 1 and 2 is the wait-time. The time between 2 and 3 is the hold-time. The lockdep held-lock tracking code is reused, because it already collects locks into meaningful groups (classes), and because it is an existing infrastructure for lock instrumentation. 
Currently lockdep tracks lock acquisition with two hooks: lock() lock_acquire() _lock() ... code protected by lock ... unlock() lock_release() _unlock() We need to extend this with two more hooks, in order to measure contention. lock_contended() - used to measure contention events lock_acquired() - completion of the contention These are then placed the following way: lock() lock_acquire() if (!_try_lock()) lock_contended() _lock() lock_acquired() ... do locked stuff ... unlock() lock_release() _unlock() (Note: the try_lock() 'trick' is used to avoid instrumenting all platform dependent lock primitive implementations.) It is also possible to toggle the two lockdep features at runtime using: /proc/sys/kernel/prove_locking /proc/sys/kernel/lock_stat (esp. turning off the O(n^2) prove_locking functionality can help) Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron --- include/linux/lockdep.h | 52 +++++++++ kernel/lockdep.c | 252 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/sysctl.c | 22 ++++ lib/Kconfig.debug | 11 ++ 4 files changed, 336 insertions(+), 1 deletion(-) Index: linux-rt.q/include/linux/lockdep.h =================================================================== --- linux-rt.q.orig/include/linux/lockdep.h +++ linux-rt.q/include/linux/lockdep.h @@ -114,8 +114,32 @@ struct lock_class { const char *name; int name_version; + +#ifdef CONFIG_LOCK_STAT + unsigned long contention_point[4]; +#endif +}; + +#ifdef CONFIG_LOCK_STAT +struct lock_time { + s64 min; + s64 max; + s64 total; + unsigned long nr; }; +struct lock_class_stats { + unsigned long contention_point[4]; + struct lock_time read_waittime; + struct lock_time write_waittime; + struct lock_time read_holdtime; + struct lock_time write_holdtime; +}; + +struct lock_class_stats lock_stats(struct lock_class *class); +void clear_lock_stats(struct lock_class *class); +#endif + /* * Map the lock object (the lock instance) to the lock-class object.
* This is embedded into specific lock instances: @@ -165,6 +189,10 @@ struct held_lock { unsigned long acquire_ip; struct lockdep_map *instance; +#ifdef CONFIG_LOCK_STAT + u64 waittime_stamp; + u64 holdtime_stamp; +#endif /* * The lock-stack is unified in that the lock chains of interrupt * contexts nest ontop of process context chains, but we 'separate' @@ -285,6 +313,30 @@ struct lock_class_key { }; #endif /* !LOCKDEP */ +#ifdef CONFIG_LOCK_STAT + +extern void lock_contended(struct lockdep_map *lock, unsigned long ip); +extern void lock_acquired(struct lockdep_map *lock); + +#define LOCK_CONTENDED(_lock, try, lock) \ +do { \ + if (!try(_lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + lock(_lock); \ + lock_acquired(&(_lock)->dep_map); \ + } \ +} while (0) + +#else /* CONFIG_LOCK_STAT */ + +#define lock_contended(lock, ip) do { } while (0) +#define lock_acquired(lock) do { } while (0) + +#define LOCK_CONTENDED(_lock, try, lock) \ + lock(_lock) + +#endif /* CONFIG_LOCK_STAT */ + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) extern void early_init_irq_lock_class(void); #else Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -42,6 +42,20 @@ #include "lockdep_internals.h" +#ifdef CONFIG_PROVE_LOCKING +int prove_locking = 1; +module_param(prove_locking, int, 0644); +#else +#define prove_locking 0 +#endif + +#ifdef CONFIG_LOCK_STAT +int lock_stat = 1; +module_param(lock_stat, int, 0644); +#else +#define lock_stat 0 +#endif + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. 
@@ -123,6 +137,70 @@ static struct lock_list *alloc_list_entr unsigned long nr_lock_classes; static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +#ifdef CONFIG_LOCK_STAT +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); + +static int lock_contention_point(struct lock_class *class, unsigned long ip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + if (class->contention_point[i] == 0) { + class->contention_point[i] = ip; + break; + } + if (class->contention_point[i] == ip) + break; + } + + return i; +} + +static void lock_time_inc(struct lock_time *lt, s64 time) +{ + if (time > lt->max) + lt->max = time; + + if (time < lt->min || !lt->min) + lt->min = time; + + lt->total += time; + lt->nr++; +} + +static struct lock_class_stats *get_lock_stats(struct lock_class *class) +{ + return &get_cpu_var(lock_stats)[class - lock_classes]; +} + +static void put_lock_stats(struct lock_class_stats *stats) +{ + put_cpu_var(lock_stats); +} + +static void lock_release_holdtime(struct held_lock *hlock) +{ + struct lock_class_stats *stats; + s64 holdtime; + + if (!lock_stat) + return; + + holdtime = sched_clock() - hlock->holdtime_stamp; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_holdtime, holdtime); + else + lock_time_inc(&stats->write_holdtime, holdtime); + put_lock_stats(stats); +} +#else +static inline void lock_release_holdtime(struct held_lock *hlock) +{ +} +#endif + /* * We keep a global list of all lock classes. The list only grows, * never shrinks. 
The list is only accessed with the lockdep @@ -2050,6 +2128,9 @@ static int __lock_acquire(struct lockdep int chain_head = 0; u64 chain_key; + if (!prove_locking) + check = 1; + if (unlikely(!debug_locks)) return 0; @@ -2100,6 +2181,10 @@ static int __lock_acquire(struct lockdep hlock->read = read; hlock->check = check; hlock->hardirqs_off = hardirqs_off; +#ifdef CONFIG_LOCK_STAT + hlock->waittime_stamp = 0; + hlock->holdtime_stamp = sched_clock(); +#endif if (check != 2) goto out_calc_hash; @@ -2147,10 +2232,11 @@ static int __lock_acquire(struct lockdep } } #endif +out_calc_hash: /* mark it as used: */ if (!mark_lock(curr, hlock, LOCK_USED)) return 0; -out_calc_hash: + /* * Calculate the chain hash: it's the combined has of all the * lock keys along the dependency chain. We save the hash value @@ -2208,6 +2294,7 @@ out_calc_hash: * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ +#ifdef CONFIG_PROVE_LOCKING if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { /* * Check whether last held lock: @@ -2244,6 +2331,7 @@ out_calc_hash: /* after lookup_chain_cache(): */ if (unlikely(!debug_locks)) return 0; +#endif curr->lockdep_depth++; check_chain_key(curr); @@ -2394,6 +2482,8 @@ lock_release_non_nested(struct task_stru return print_unlock_inbalance_bug(curr, lock, ip); found_it: + lock_release_holdtime(hlock); + /* * We have the right lock to unlock, 'hlock' points to it. 
* Now we remove it from the stack, and add back the other @@ -2446,6 +2536,8 @@ static int lock_release_nested(struct ta curr->curr_chain_key = hlock->prev_chain_key; + lock_release_holdtime(hlock); + #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; hlock->class = NULL; @@ -2541,6 +2633,9 @@ lock_release(struct lockdep_map *lock, i { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2560,6 +2655,9 @@ lock_set_subclass(struct lockdep_map *lo { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2574,6 +2672,158 @@ lock_set_subclass(struct lockdep_map *lo EXPORT_SYMBOL_GPL(lock_set_subclass); +#ifdef CONFIG_LOCK_STAT +static int +print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ BUG: bad contention detected! 
]\n"); + printk( "---------------------------------\n"); + printk("%s/%d is trying to contend lock (", + curr->comm, curr->pid); + print_lockdep_cache(lock); + printk(") at:\n"); + print_ip_sym(ip); + printk("but there are no locks held!\n"); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static void +__lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + int i, point; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, ip); + return; + +found_it: + hlock->waittime_stamp = sched_clock(); + + point = lock_contention_point(hlock->class, ip); + + stats = get_lock_stats(hlock->class); + if (point < ARRAY_SIZE(stats->contention_point)) + stats->contention_point[point]++; + put_lock_stats(stats); +} + +static void +__lock_acquired(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + u64 now; + s64 waittime; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, _RET_IP_); + return; + +found_it:
+ if (!hlock->waittime_stamp) + return; + + now = sched_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, waittime); + put_lock_stats(stats); +} + +void lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_contended(lock, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_contended); + +void lock_acquired(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_acquired(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_acquired); +#endif + /* * Used by the testsuite, sanitize the validator state * after a simulated failure: Index: linux-rt.q/kernel/sysctl.c =================================================================== --- linux-rt.q.orig/kernel/sysctl.c +++ linux-rt.q/kernel/sysctl.c @@ -163,6 +163,8 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +extern int prove_locking; +extern int lock_stat; /* The default sysctl tables: */ @@ -205,6 +207,26 @@ static ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, +#ifdef CONFIG_PROVE_LOCKING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = 
sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; Index: linux-rt.q/lib/Kconfig.debug =================================================================== --- linux-rt.q.orig/lib/Kconfig.debug +++ linux-rt.q/lib/Kconfig.debug @@ -272,6 +272,17 @@ config LOCKDEP select KALLSYMS select KALLSYMS_ALL +config LOCK_STAT + bool "Lock usage statistics" + depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT + select LOCKDEP + select DEBUG_SPINLOCK + select DEBUG_MUTEXES + select DEBUG_LOCK_ALLOC + default n + help + This feature enables tracking lock contention points + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP patches/arm-cmpxchg.patch0000664000077200007720000000240610646635213014760 0ustar mingomingo include/asm-arm/atomic.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) Index: linux-rt.q/include/asm-arm/atomic.h =================================================================== --- linux-rt.q.orig/include/asm-arm/atomic.h +++ linux-rt.q/include/asm-arm/atomic.h @@ -173,6 +173,41 @@ static inline void atomic_clear_mask(uns raw_local_irq_restore(flags); } +#ifndef CONFIG_SMP +/* + * Atomic compare and exchange.
+ */ +#define __HAVE_ARCH_CMPXCHG 1 + +extern unsigned long wrong_size_cmpxchg(volatile void *ptr); + +static inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long flags, prev; + volatile unsigned long *p = ptr; + + if (size == 4) { + local_irq_save(flags); + if ((prev = *p) == old) + *p = new; + local_irq_restore(flags); + return(prev); + } else + return wrong_size_cmpxchg(ptr); +} + +#define cmpxchg(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ +}) + +#endif + #endif /* __LINUX_ARM_ARCH__ */ #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) patches/highres-improve-debug-output-fix.patch0000664000077200007720000000146510646635210021071 0ustar mingomingoFrom: Andrew Morton Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton --- kernel/time/tick-oneshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/kernel/time/tick-oneshot.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-oneshot.c +++ linux-rt.q/kernel/time/tick-oneshot.c @@ -82,7 +82,7 @@ int tick_switch_to_oneshot(void (*handle } else { if (!tick_device_is_functional(dev)) printk(" %s is not functional.\n", dev->name); - else if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + else printk(" %s does not support one-shot mode.\n", dev->name); } patches/ich-force-hpet-restructure-hpet-generic-clock-code.patch0000664000077200007720000001450010646635211024301 0ustar mingomingoFrom: Venki Pallipadi Restructure and rename legacy replacement mode HPET timer support. Just the code structural changes and should be zero functionality change. 
Signed-off-by: Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andi Kleen Cc: john stultz Cc: Greg KH Signed-off-by: Andrew Morton --- arch/i386/kernel/hpet.c | 148 ++++++++++++++++++++++++++---------------------- 1 file changed, 83 insertions(+), 65 deletions(-) Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -149,9 +149,9 @@ static void hpet_reserve_platform_timers */ static unsigned long hpet_period; -static void hpet_set_mode(enum clock_event_mode mode, +static void hpet_legacy_set_mode(enum clock_event_mode mode, struct clock_event_device *evt); -static int hpet_next_event(unsigned long delta, +static int hpet_legacy_next_event(unsigned long delta, struct clock_event_device *evt); /* @@ -160,8 +160,8 @@ static int hpet_next_event(unsigned long static struct clock_event_device hpet_clockevent = { .name = "hpet", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = hpet_set_mode, - .set_next_event = hpet_next_event, + .set_mode = hpet_legacy_set_mode, + .set_next_event = hpet_legacy_next_event, .shift = 32, .irq = 0, }; @@ -178,7 +178,7 @@ static void hpet_start_counter(void) hpet_writel(cfg, HPET_CFG); } -static void hpet_enable_int(void) +static void hpet_enable_legacy_int(void) { unsigned long cfg = hpet_readl(HPET_CFG); @@ -187,7 +187,39 @@ static void hpet_enable_int(void) hpet_legacy_int_enabled = 1; } -static void hpet_set_mode(enum clock_event_mode mode, +static void hpet_legacy_clockevent_register(void) +{ + uint64_t hpet_freq; + + /* Start HPET legacy interrupts */ + hpet_enable_legacy_int(); + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. 
+ */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, 32); + /* Calculate the min / max delta */ + hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &hpet_clockevent); + hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, + &hpet_clockevent); + + /* + * Start hpet with the boot cpu mask and make it + * global after the IO_APIC has been initialized. + */ + hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + clockevents_register_device(&hpet_clockevent); + global_clock_event = &hpet_clockevent; + printk(KERN_DEBUG "hpet clockevent registered\n"); +} + +static void hpet_legacy_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) { unsigned long cfg, cmp, now; @@ -228,12 +260,12 @@ static void hpet_set_mode(enum clock_eve break; case CLOCK_EVT_MODE_RESUME: - hpet_enable_int(); + hpet_enable_legacy_int(); break; } } -static int hpet_next_event(unsigned long delta, +static int hpet_legacy_next_event(unsigned long delta, struct clock_event_device *evt) { unsigned long cnt; @@ -273,58 +305,11 @@ static struct clocksource clocksource_hp #endif }; -/* - * Try to setup the HPET timer - */ -int __init hpet_enable(void) +static int hpet_clocksource_register(void) { - unsigned long id; - uint64_t hpet_freq; u64 tmp, start, now; cycle_t t1; - if (!is_hpet_capable()) - return 0; - - hpet_set_mapping(); - - /* - * Read the period and check for a sane value: - */ - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) - goto out_nohpet; - - /* - * The period is a femto seconds value. We need to calculate the - * scaled math multiplication factor for nanosecond to hpet tick - * conversion. 
- */ - hpet_freq = 1000000000000000ULL; - do_div(hpet_freq, hpet_period); - hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, - NSEC_PER_SEC, 32); - /* Calculate the min / max delta */ - hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &hpet_clockevent); - hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, - &hpet_clockevent); - - /* - * Read the HPET ID register to retrieve the IRQ routing - * information and the number of channels - */ - id = hpet_readl(HPET_ID); - -#ifdef CONFIG_HPET_EMULATE_RTC - /* - * The legacy routing mode needs at least two channels, tick timer - * and the rtc emulation channel. - */ - if (!(id & HPET_ID_NUMBER)) - goto out_nohpet; -#endif - /* Start the counter */ hpet_start_counter(); @@ -346,7 +331,7 @@ int __init hpet_enable(void) if (t1 == read_hpet()) { printk(KERN_WARNING "HPET counter not counting. HPET disabled\n"); - goto out_nohpet; + return -ENODEV; } /* Initialize and register HPET clocksource @@ -367,15 +352,48 @@ int __init hpet_enable(void) clocksource_register(&clocksource_hpet); + return 0; +} + +/* + * Try to setup the HPET timer + */ +int __init hpet_enable(void) +{ + unsigned long id; + + if (!is_hpet_capable()) + return 0; + + hpet_set_mapping(); + + /* + * Read the period and check for a sane value: + */ + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) + goto out_nohpet; + + /* + * Read the HPET ID register to retrieve the IRQ routing + * information and the number of channels + */ + id = hpet_readl(HPET_ID); + +#ifdef CONFIG_HPET_EMULATE_RTC + /* + * The legacy routing mode needs at least two channels, tick timer + * and the rtc emulation channel. + */ + if (!(id & HPET_ID_NUMBER)) + goto out_nohpet; +#endif + + if (hpet_clocksource_register()) + goto out_nohpet; + if (id & HPET_ID_LEGSUP) { - hpet_enable_int(); - /* - * Start hpet with the boot cpu mask and make it - * global after the IO_APIC has been initialized. 
- */ - hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); - clockevents_register_device(&hpet_clockevent); - global_clock_event = &hpet_clockevent; + hpet_legacy_clockevent_register(); return 1; } return 0; patches/apic-dumpstack.patch0000664000077200007720000000070710646635212015460 0ustar mingomingo arch/i386/kernel/apic.c | 1 + 1 file changed, 1 insertion(+) Index: linux-rt.q/arch/i386/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/apic.c +++ linux-rt.q/arch/i386/kernel/apic.c @@ -1309,6 +1309,7 @@ void smp_error_interrupt(struct pt_regs */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } patches/preempt-realtime-drivers-pci-hotplug.patch0000664000077200007720000000255310646635215021740 0ustar mingomingo--- drivers/pci/hotplug/cpci_hotplug_core.c | 8 ++++---- drivers/pci/hotplug/cpqphp_ctrl.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) Index: linux-rt.q/drivers/pci/hotplug/cpci_hotplug_core.c =================================================================== --- linux-rt.q.orig/drivers/pci/hotplug/cpci_hotplug_core.c +++ linux-rt.q/drivers/pci/hotplug/cpci_hotplug_core.c @@ -521,9 +521,9 @@ event_thread(void *data) { int rc; - lock_kernel(); +// lock_kernel(); daemonize("cpci_hp_eventd"); - unlock_kernel(); +// unlock_kernel(); dbg("%s - event thread started", __FUNCTION__); while (1) { @@ -562,9 +562,9 @@ poll_thread(void *data) { int rc; - lock_kernel(); +// lock_kernel(); daemonize("cpci_hp_polld"); - unlock_kernel(); +// unlock_kernel(); while (1) { if (thread_finished || signal_pending(current)) Index: linux-rt.q/drivers/pci/hotplug/cpqphp_ctrl.c =================================================================== --- linux-rt.q.orig/drivers/pci/hotplug/cpqphp_ctrl.c +++ linux-rt.q/drivers/pci/hotplug/cpqphp_ctrl.c @@ -1746,10 +1746,10 @@ static void pushbutton_helper_thread(uns static int 
event_thread(void* data) { struct controller *ctrl; - lock_kernel(); +// lock_kernel(); daemonize("phpd_event"); - unlock_kernel(); +// unlock_kernel(); while (1) { dbg("!!!!event_thread sleeping\n"); patches/module-pde-race-fixes.patch0000664000077200007720000001325210646635216016635 0ustar mingomingo-------------------------------------------------------- pde = create_proc_entry() if (!pde) return -ENOMEM; pde->write_proc = ...; open write copy_from_user pde = create_proc_entry(); if (!pde) { remove_proc_entry(); return -ENOMEM; /* module unloaded */ } *boom* -------------------------------------------------------- pde = create_proc_entry(); if (pde) { /* which dereferences ->data */ pde->write_proc = ...; open write pde->data = ...; } -------------------------------------------------------- The following plan is going to be executed (as per Al Viro's explanations): PDE gets an atomic counter counting reads and writes in progress via ->read_proc, ->write_proc, ->get_info. Generic proc code will bump PDE's counter before calling into a module-specific method and decrement it after it returns. remove_proc_entry() will wait until all readers and writers are done. To do this reliably it will set ->proc_fops to NULL and generic proc code won't call into the module if it sees NULL ->proc_fops. This patch implements the part above. So far, no changes in module code are required. To proceed, let's look into ->proc_fops values during and after PDE creation: pde = create_proc_entry(); if (pde) pde->proc_fops = ...; proc_create() create empty PDE; (->proc_fops is NULL) proc_register() glues PDE to /proc (->proc_fops is NULL) proc_register() sets ->proc_fops to default (->proc_fops valid) [ module sets ->proc_fops to its own (->proc_fops valid) ] Observation: ->proc_fops is not NULL, when create_proc_entry() exits.
The next set of races comes into play: pde = create_proc_entry(); if (pde) { pde->read_proc = ...; pde->data = ...; } Almost all ->read_proc, ->write_proc callbacks assume that ->data is valid when they're called. They cast ->data and dereference it. To fix this we need a way to indicate that a PDE is not readable and writeable. ->proc_fops fits nicely, because modules setting only ->proc_fops don't need changes at all. create_proc_entry() will exit with NULL ->proc_fops and helpers will sometimes be needed to set it. 1. Module sets ->proc_fops only no changes 2. Module sets ->data and ->proc_fops use a helper to set ->data before ->proc_fops and a barrier. 3. Module uses only create_proc_read_entry() changes only in create_proc_read_entry(); 4. Module uses only create_proc_info_entry() changes only in create_proc_info_entry(); 5. Module sets combination of ->data, ->read_proc, ->write_proc use helper which will set fields, barrier and sets default ->proc_fops. the best name I've come up with so far is void set_proc_entry_data_rw(struct proc_dir_entry *, void *, read_proc_t, write_proc_t); Helper(s) will be introduced, then create_proc_entry() will start exiting with NULL ->proc_fops. After that, use of the helper(s) will become mandatory. Grepping for offenders will be easy (read_proc, ... are good names). ->data is a bad name; however, after the helpers are plugged in we can rename ->data to ->pde_data and it'll become a good name. Sorry for the somewhat chaotic explanations; please comment on the patch and RFC.
Signed-off-by: Alexey Dobriyan --- fs/proc/generic.c | 32 ++++++++++++++++++++++++++++---- include/linux/proc_fs.h | 2 ++ 2 files changed, 30 insertions(+), 4 deletions(-) Index: linux-rt.q/fs/proc/generic.c =================================================================== --- linux-rt.q.orig/fs/proc/generic.c +++ linux-rt.q/fs/proc/generic.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -76,6 +77,12 @@ proc_file_read(struct file *file, char _ if (!(page = (char*) __get_free_page(GFP_KERNEL))) return -ENOMEM; + if (!dp->proc_fops) + goto out_free; + atomic_inc(&dp->pde_users); + if (!dp->proc_fops) + goto out_dec; + while ((nbytes > 0) && !eof) { count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); @@ -195,6 +202,9 @@ proc_file_read(struct file *file, char _ buf += n; retval += n; } +out_dec: + atomic_dec(&dp->pde_users); +out_free: free_page((unsigned long) page); return retval; } @@ -205,14 +215,20 @@ proc_file_write(struct file *file, const { struct inode *inode = file->f_path.dentry->d_inode; struct proc_dir_entry * dp; + ssize_t rv; dp = PDE(inode); - if (!dp->write_proc) + if (!dp->write_proc || !dp->proc_fops) return -EIO; - /* FIXME: does this routine need ppos? probably... */ - return dp->write_proc(file, buffer, count, dp->data); + rv = -EIO; + atomic_inc(&dp->pde_users); + if (dp->proc_fops) + /* FIXME: does this routine need ppos? probably... 
*/ + rv = dp->write_proc(file, buffer, count, dp->data); + atomic_dec(&dp->pde_users); + return rv; } @@ -723,14 +739,22 @@ void remove_proc_entry(const char *name, if (!parent && xlate_proc_name(name, &parent, &fn) != 0) goto out; len = strlen(fn); - filevec_add_drain_all(); +again: spin_lock(&proc_subdir_lock); for (p = &parent->subdir; *p; p=&(*p)->next ) { if (!proc_match(len, fn, *p)) continue; de = *p; + + de->proc_fops = NULL; + if (atomic_read(&de->pde_users) > 0) { + spin_unlock(&proc_subdir_lock); + msleep(1); + goto again; + } + *p = de->next; de->next = NULL; if (S_ISDIR(de->mode)) Index: linux-rt.q/include/linux/proc_fs.h =================================================================== --- linux-rt.q.orig/include/linux/proc_fs.h +++ linux-rt.q/include/linux/proc_fs.h @@ -66,6 +66,8 @@ struct proc_dir_entry { atomic_t count; /* use count */ int deleted; /* delete flag */ void *set; + atomic_t pde_users; /* number of readers + number of writers via + * ->read_proc, ->write_proc, ->get_info */ }; struct kcore_list { patches/ich-force-hpet-make-generic-time-capable-of-switching-broadcast-timer.patch0000664000077200007720000000473210646635211027674 0ustar mingomingoFrom: Venki Pallipadi Auto-detect the presence of HPET on ICH5 or newer platforms and enable HPET for broadcast timer. This gives a bigger upperlimit for tickless time tick and improves the power consumption in comparison to PIT as broadcast timer. This patch: Change the broadcast timer, if a timer with higher rating becomes available. 
Signed-off-by: Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andi Kleen Cc: john stultz Cc: Greg KH Signed-off-by: Andrew Morton --- kernel/time/tick-broadcast.c | 13 ++++++------- kernel/time/tick-common.c | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) Index: linux-rt.q/kernel/time/tick-broadcast.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-broadcast.c +++ linux-rt.q/kernel/time/tick-broadcast.c @@ -64,8 +64,9 @@ static void tick_broadcast_start_periodi */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if (tick_broadcast_device.evtdev || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if ((tick_broadcast_device.evtdev && + tick_broadcast_device.evtdev->rating >= dev->rating) || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; clockevents_exchange_device(NULL, dev); @@ -519,11 +520,9 @@ static void tick_broadcast_clear_oneshot */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; - } + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; } /* Index: linux-rt.q/kernel/time/tick-common.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-common.c +++ linux-rt.q/kernel/time/tick-common.c @@ -200,7 +200,7 @@ static int tick_check_new_device(struct cpu = smp_processor_id(); if (!cpu_isset(cpu, newdev->cpumask)) - goto out; + goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; @@ -265,7 +265,7 @@ out_bc: */ if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; -out: + spin_unlock_irqrestore(&tick_device_lock, flags); return ret; 
patches/use-write_trylock_irqsave-in-ptrace_attach.patch0000664000077200007720000000245510646635211023205 0ustar mingomingo[PREEMPT_RT]: Use write_trylock_irqsave in ptrace_attach This patch makes ptrace_attach() use the new API write_trylock_irqsave(). With this, the code in ptrace_attach() will be same for mainline and -rt. On -rt, write_trylock_irqsave() doesn't disable irqs and hence the problem is avoided. Signed-off-by: Sripathi Kodi =================================================================== --- kernel/ptrace.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) Index: linux-rt.q/kernel/ptrace.c =================================================================== --- linux-rt.q.orig/kernel/ptrace.c +++ linux-rt.q/kernel/ptrace.c @@ -161,6 +161,7 @@ int ptrace_may_attach(struct task_struct int ptrace_attach(struct task_struct *task) { int retval; + unsigned long flags; audit_ptrace(task); @@ -181,9 +182,7 @@ repeat: * cpu's that may have task_lock). */ task_lock(task); - local_irq_disable(); - if (!write_trylock(&tasklist_lock)) { - local_irq_enable(); + if (!write_trylock_irqsave(&tasklist_lock, flags)) { task_unlock(task); do { cpu_relax(); @@ -211,7 +210,7 @@ repeat: force_sig_specific(SIGSTOP, task); bad: - write_unlock_irq(&tasklist_lock); + write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); out: return retval; patches/preempt-irqs-mips.patch0000664000077200007720000000204310646635213016145 0ustar mingomingo--- arch/mips/kernel/time.c | 2 +- arch/mips/sibyte/sb1250/irq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/mips/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/time.c +++ linux-rt.q/arch/mips/kernel/time.c @@ -281,7 +281,7 @@ unsigned int mips_hpt_frequency; static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_PERCPU, + .flags = IRQF_DISABLED | IRQF_PERCPU | 
IRQF_NODELAY, .name = "timer", }; Index: linux-rt.q/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux-rt.q.orig/arch/mips/sibyte/sb1250/irq.c +++ linux-rt.q/arch/mips/sibyte/sb1250/irq.c @@ -242,7 +242,7 @@ static irqreturn_t sb1250_dummy_handler static struct irqaction sb1250_dummy_action = { .handler = sb1250_dummy_handler, - .flags = 0, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "sb1250-private", .next = NULL, patches/hrtimer-no-printk.patch0000664000077200007720000000204010646635216016140 0ustar mingomingo--- kernel/hrtimer.c | 3 +-- kernel/time/timekeeping.c | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) Index: linux-rt.q/kernel/hrtimer.c =================================================================== --- linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -589,8 +589,7 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_INFO "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); + return 1; } Index: linux-rt.q/kernel/time/timekeeping.c =================================================================== --- linux-rt.q.orig/kernel/time/timekeeping.c +++ linux-rt.q/kernel/time/timekeeping.c @@ -254,8 +254,10 @@ static void change_clocksource(void) tick_clock_notify(); +#ifndef CONFIG_PREEMPT_RT printk(KERN_INFO "Time: %s clocksource has been installed.\n", clock->name); +#endif } #else static inline void change_clocksource(void) { } patches/idle-stop-critical-timing.patch0000664000077200007720000000164710646635212017534 0ustar mingomingo--- drivers/acpi/processor_idle.c | 8 ++++++++ 1 file changed, 8 insertions(+) Index: linux-rt.q/drivers/acpi/processor_idle.c =================================================================== --- linux-rt.q.orig/drivers/acpi/processor_idle.c +++ linux-rt.q/drivers/acpi/processor_idle.c @@ -836,6 +836,12 
@@ static inline void acpi_idle_update_bm_r */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { + /* + * We have irqs disabled here, so stop latency tracing + * at this point and restart it after we return: + */ + stop_critical_timing(); + if (cx->space_id == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -848,6 +854,8 @@ static inline void acpi_idle_do_entry(st gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + + touch_critical_timing(); } /** patches/mm-fix-latency.patch0000664000077200007720000001160310646635211015401 0ustar mingomingoOn Tue, 27 Dec 2005, Lee Revell wrote: > I am seeing excessive latencies (2ms+) in free_pgtables, called from > exit_mmap with 2.6.15-rc5. This is a significant regression from 2.6.14 > where the maximum observed latency was less than 1ms with some tuning. > Here is the trace. > > The problem is that we now do a lot more work in free_pgtables under the > mm->page_table_lock spinlock so preemption can be delayed for a long > time. Here is the change responsible: Yes, I'm to blame for that - sorry. It didn't occur to me that I was moving any signficant amount of work (on mms with many vmas) into the section with preemption disabled. Actually, the mm->page_table_lock is _not_ held there any more; but preemption is still disabled while using the per-cpu mmu_gathers. I wish you'd found it at -rc1 time. It's not something that can be properly corrected in a hurry. The proper fix is to rework the tlb_gather_mmu stuff, so it can be done without preemption disabled. It's already a serious limitation in unmap_vmas, with CONFIG_PREEMPT's ZAP_BLOCK_SIZE spoiling throughput with far too many TLB flushes. On my list to work on; but the TLB always needs great care, and this goes down into architectural divergences, with truncation of a mapped file adding further awkward constraints. 
I imagine 2.6.16-rc1 is only a couple of weeks away, so it's unlikely to be fixed in 2.6.16 either. > What was the purpose of this change? To narrow the scope of the page_table_lock, so that it could be taken later and released earlier, for slightly better preemptibility, and to allow more scalable locking by splitting it up per page-table page. And a step towards the mmu_gather rework I refer to above, to recover from 2.6.11's unmap_vmas slowdown. Here's an untested patch which should mostly correct your latency problem with 2.6.15-rc. But it's certainly not the right solution, and it's probably both too ugly and too late for 2.6.15. If you really want Linus to put it in, please test it out, especially on ia64, and try to persuade him. Personally I'd prefer to wait for the right solution: but I don't have your low-latency needs, and I'm certainly guilty of a regression here. 2.6.15-rc1 moved the unlinking of a vma from its prio_tree and anon_vma into free_pgtables: so the vma is hidden from rmap and vmtruncate before freeing its page tables, allowing safe descent without page table lock. But free_pgtables is still called with preemption disabled, and Lee Revell has now detected high latency there. The right fix will be to rework the mmu_gathering, not to need preemption disabled; but for now an ugly CONFIG_PREEMPT block in free_pgtables, to make an initial unlinking pass with preemption enabled - made uglier by CONFIG_IA64 definitions (only ia64 actually uses the start and end given to tlb_finish_mmu, and our floor and ceiling don't quite work for those). These CONFIG choices being to minimize the additional TLB flushing. 
Signed-off-by: Hugh Dickins mm/memory.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) Index: linux-rt.q/mm/memory.c =================================================================== --- linux-rt.q.orig/mm/memory.c +++ linux-rt.q/mm/memory.c @@ -266,18 +266,48 @@ void free_pgd_range(struct mmu_gather ** flush_tlb_pgtables((*tlb)->mm, start, end); } +#ifdef CONFIG_IA64 +#define tlb_start_addr(tlb) (tlb)->start_addr +#define tlb_end_addr(tlb) (tlb)->end_addr +#else +#define tlb_start_addr(tlb) 0UL /* only ia64 really uses it */ +#define tlb_end_addr(tlb) 0UL /* only ia64 really uses it */ +#endif + void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { +#ifdef CONFIG_PREEMPT + struct vm_area_struct *unlink = vma; + int fullmm = (*tlb)->fullmm; + + if (!vma) /* Sometimes when exiting after an oops */ + return; + if (vma->vm_next) + tlb_finish_mmu(*tlb, tlb_start_addr(*tlb), tlb_end_addr(*tlb)); + /* + * Hide vma from rmap and vmtruncate before freeeing pgtables, + * with preemption enabled, except when unmapping just one area. + */ + while (unlink) { + anon_vma_unlink(unlink); + unlink_file_vma(unlink); + unlink = unlink->vm_next; + } + if (vma->vm_next) + *tlb = tlb_gather_mmu(vma->vm_mm, fullmm); +#endif while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; +#ifndef CONFIG_PREEMPT /* * Hide vma from rmap and vmtruncate before freeing pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); +#endif if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, @@ -290,8 +320,10 @@ void free_pgtables(struct mmu_gather **t && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; +#ifndef CONFIG_PREEMPT anon_vma_unlink(vma); unlink_file_vma(vma); +#endif } free_pgd_range(tlb, addr, vma->vm_end, floor, next? 
next->vm_start: ceiling); patches/preempt-irqs-ppc-fix-more-fasteoi.patch0000664000077200007720000000675010646635214021145 0ustar mingomingoFrom sshtylyov@ru.mvista.com Thu May 17 15:18:39 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from imap.sh.mvista.com (unknown [63.81.120.155]) by mail.tglx.de (Postfix) with ESMTP id BFD3A65C065 for ; Thu, 17 May 2007 15:18:39 +0200 (CEST) Received: from wasted.dev.rtsoft.ru (unknown [10.150.0.9]) by imap.sh.mvista.com (Postfix) with ESMTP id 8E3CB3EC9; Thu, 17 May 2007 06:18:35 -0700 (PDT) From: Sergei Shtylyov Organization: MontaVista Software Inc. To: mingo@elte.hu, tglx@linutronix.de Subject: [PATCH 2.6.21-rt2] PowerPC: revert fix for threaded fasteoi IRQ handlers Date: Thu, 17 May 2007 17:20:08 +0400 User-Agent: KMail/1.5 Cc: linux-kernel@vger.kernel.org, linuxppc-dev@ozlabs.org, dwalker@mvista.com References: <200611192243.34850.sshtylyov@ru.mvista.com> In-Reply-To: <200611192243.34850.sshtylyov@ru.mvista.com> MIME-Version: 1.0 Content-Disposition: inline Message-Id: <200705171719.34968.sshtylyov@ru.mvista.com> Content-Type: text/plain; charset="us-ascii" X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Revert the change to the "fasteoi" type chips as after handle_fasteoi_irq() had been fixed, they've become meaningless (and even dangerous -- as was the case with Celleb that has been fixed earlier)... Signed-off-by: Sergei Shtylyov --- The patch in question wasn't even initially accepted but then was erroneously restored along with the TOD patch. I've asked to revert it but to no avail, so here's the formal patch to revert it at last... 
arch/powerpc/platforms/iseries/irq.c | 1 - arch/powerpc/platforms/pseries/xics.c | 2 -- arch/powerpc/sysdev/mpic.c | 1 - 3 files changed, 4 deletions(-) Index: linux-rt.q/arch/powerpc/platforms/iseries/irq.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/iseries/irq.c +++ linux-rt.q/arch/powerpc/platforms/iseries/irq.c @@ -278,7 +278,6 @@ static struct irq_chip iseries_pic = { .shutdown = iseries_shutdown_IRQ, .unmask = iseries_enable_IRQ, .mask = iseries_disable_IRQ, - .ack = iseries_end_IRQ, .eoi = iseries_end_IRQ }; Index: linux-rt.q/arch/powerpc/platforms/pseries/xics.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/pseries/xics.c +++ linux-rt.q/arch/powerpc/platforms/pseries/xics.c @@ -456,7 +456,6 @@ static struct irq_chip xics_pic_direct = .startup = xics_startup, .mask = xics_mask_irq, .unmask = xics_unmask_irq, - .ack = xics_eoi_direct, .eoi = xics_eoi_direct, .set_affinity = xics_set_affinity }; @@ -467,7 +466,6 @@ static struct irq_chip xics_pic_lpar = { .startup = xics_startup, .mask = xics_mask_irq, .unmask = xics_unmask_irq, - .ack = xics_eoi_lpar, .eoi = xics_eoi_lpar, .set_affinity = xics_set_affinity }; Index: linux-rt.q/arch/powerpc/sysdev/mpic.c =================================================================== --- linux-rt.q.orig/arch/powerpc/sysdev/mpic.c +++ linux-rt.q/arch/powerpc/sysdev/mpic.c @@ -835,7 +835,6 @@ int mpic_set_irq_type(unsigned int virq, static struct irq_chip mpic_irq_chip = { .mask = mpic_mask_irq, .unmask = mpic_unmask_irq, - .ack = mpic_end_irq, .eoi = mpic_end_irq, .set_type = mpic_set_irq_type, }; patches/preempt-realtime-console.patch0000664000077200007720000000344610646635215017475 0ustar mingomingo--- drivers/video/console/fbcon.c | 5 +++-- include/linux/console.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) Index: linux-rt.q/drivers/video/console/fbcon.c 
=================================================================== --- linux-rt.q.orig/drivers/video/console/fbcon.c +++ linux-rt.q/drivers/video/console/fbcon.c @@ -1287,7 +1287,6 @@ static void fbcon_clear(struct vc_data * { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; u_int y_break; @@ -1316,10 +1315,11 @@ static void fbcon_putcs(struct vc_data * struct display *p = &fb_display[vc->vc_num]; struct fbcon_ops *ops = info->fbcon_par; - if (!fbcon_is_inactive(vc, info)) + if (!fbcon_is_inactive(vc, info)) { ops->putcs(vc, info, s, count, real_y(p, ypos), xpos, get_color(vc, info, scr_readw(s), 1), get_color(vc, info, scr_readw(s), 0)); + } } static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos) @@ -3173,6 +3173,7 @@ static const struct consw fb_con = { .con_screen_pos = fbcon_screen_pos, .con_getxy = fbcon_getxy, .con_resize = fbcon_resize, + .con_preemptible = 1, }; static struct notifier_block fbcon_event_notifier = { Index: linux-rt.q/include/linux/console.h =================================================================== --- linux-rt.q.orig/include/linux/console.h +++ linux-rt.q/include/linux/console.h @@ -55,6 +55,7 @@ struct consw { void (*con_invert_region)(struct vc_data *, u16 *, int); u16 *(*con_screen_pos)(struct vc_data *, int); unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *); + int con_preemptible; // can it reschedule from within printk? 
}; extern const struct consw *conswitchp; patches/kprobes-preempt-fix.patch0000664000077200007720000000261210646635211016452 0ustar mingomingo arch/i386/kernel/kprobes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) Index: linux-rt.q/arch/i386/kernel/kprobes.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/kprobes.c +++ linux-rt.q/arch/i386/kernel/kprobes.c @@ -329,7 +329,7 @@ ss_probe: /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return 1; } #endif @@ -338,7 +338,7 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -569,7 +569,7 @@ static int __kprobes post_kprobe_handler } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, eflags @@ -603,7 +603,7 @@ static int __kprobes kprobe_fault_handle restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -737,7 +737,7 @@ int __kprobes longjmp_break_handler(stru *regs = kcb->jprobe_saved_regs; memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; patches/rt-slab-new.patch0000664000077200007720000011273110646635214014711 0ustar mingomingo new slab port. 
Signed-off-by: Ingo Molnar --- mm/slab.c | 496 +++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 319 insertions(+), 177 deletions(-) Index: linux-rt.q/mm/slab.c =================================================================== --- linux-rt.q.orig/mm/slab.c +++ linux-rt.q/mm/slab.c @@ -116,6 +116,63 @@ #include /* + * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking + * mechanism. + * + * On PREEMPT_RT, we use per-CPU locks for this. That's why the + * calling convention is changed slightly: a new 'flags' argument + * is passed to 'irq disable/enable' - the PREEMPT_RT code stores + * the CPU number of the lock there. + */ +#ifndef CONFIG_PREEMPT_RT +# define slab_irq_disable(cpu) \ + do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_enable(cpu) local_irq_enable() +# define slab_irq_save(flags, cpu) \ + do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_restore(flags, cpu) local_irq_restore(flags) +/* + * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT, + * which has no per-CPU locking effect since we are holding the cache + * lock in that case already. + * + * (On PREEMPT_RT, these are NOPs, but we have to drop/get the irq locks.) 
+ */ +# define slab_irq_disable_nort() local_irq_disable() +# define slab_irq_enable_nort() local_irq_enable() +# define slab_irq_disable_rt(flags) do { (void)(flags); } while (0) +# define slab_irq_enable_rt(flags) do { (void)(flags); } while (0) +# define slab_spin_lock_irq(lock, cpu) \ + do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irq(lock, cpu) \ + spin_unlock_irq(lock) +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); } while (0) +#else +DEFINE_PER_CPU_LOCKED(int, slab_irq_locks) = { 0, }; +# define slab_irq_disable(cpu) (void)get_cpu_var_locked(slab_irq_locks, &(cpu)) +# define slab_irq_enable(cpu) put_cpu_var_locked(slab_irq_locks, cpu) +# define slab_irq_save(flags, cpu) \ + do { slab_irq_disable(cpu); (void) (flags); } while (0) +# define slab_irq_restore(flags, cpu) \ + do { slab_irq_enable(cpu); (void) (flags); } while (0) +# define slab_irq_disable_rt(cpu) slab_irq_disable(cpu) +# define slab_irq_enable_rt(cpu) slab_irq_enable(cpu) +# define slab_irq_disable_nort() do { } while (0) +# define slab_irq_enable_nort() do { } while (0) +# define slab_spin_lock_irq(lock, cpu) \ + do { slab_irq_disable(cpu); spin_lock(lock); } while (0) +# define slab_spin_unlock_irq(lock, cpu) \ + do { spin_unlock(lock); slab_irq_enable(cpu); } while (0) +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0) +#endif + +/* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). 
* @@ -314,7 +371,7 @@ struct kmem_list3 __initdata initkmem_li static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, int *this_cpu); static int enable_cpucache(struct kmem_cache *cachep); static void cache_reap(struct work_struct *unused); @@ -758,9 +815,10 @@ int slab_is_available(void) static DEFINE_PER_CPU(struct delayed_work, reap_work); -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) +static inline struct array_cache * +cpu_cache_get(struct kmem_cache *cachep, int this_cpu) { - return cachep->array[smp_processor_id()]; + return cachep->array[this_cpu]; } static inline struct kmem_cache *__find_general_cachep(size_t size, @@ -990,7 +1048,7 @@ static int transfer_objects(struct array #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) +#define reap_alien(cachep, l3, this_cpu) do { } while (0) static inline struct array_cache **alloc_alien_cache(int node, int limit) { @@ -1001,7 +1059,8 @@ static inline void free_alien_cache(stru { } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { return 0; } @@ -1013,14 +1072,15 @@ static inline void *alternate_node_alloc } static inline void *____cache_alloc_node(struct kmem_cache *cachep, - gfp_t flags, int nodeid) + gfp_t flags, int nodeid, int *this_cpu) { return NULL; } #else /* CONFIG_NUMA */ -static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, int *this_cpu); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct array_cache **alloc_alien_cache(int node, int limit) @@ -1062,7 +1122,8 @@ static void free_alien_cache(struct arra } static void 
__drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + int *this_cpu) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -1076,7 +1137,7 @@ static void __drain_alien_cache(struct k if (rl3->shared) transfer_objects(rl3->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, this_cpu); ac->avail = 0; spin_unlock(&rl3->list_lock); } @@ -1085,15 +1146,16 @@ static void __drain_alien_cache(struct k /* * Called from cache_reap() to regularly drain alien caches round robin. */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static void +reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu) { - int node = __get_cpu_var(reap_node); + int node = per_cpu(reap_node, *this_cpu); if (l3->alien) { struct array_cache *ac = l3->alien[node]; if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); + __drain_alien_cache(cachep, ac, node, this_cpu); spin_unlock_irq(&ac->lock); } } @@ -1102,21 +1164,22 @@ static void reap_alien(struct kmem_cache static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { - int i = 0; + int i = 0, this_cpu; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { ac = alien[i]; if (ac) { - spin_lock_irqsave(&ac->lock, flags); - __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + slab_spin_lock_irqsave(&ac->lock, flags, this_cpu); + __drain_alien_cache(cachep, ac, i, &this_cpu); + slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu); } } } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; @@ -1140,17 +1203,18 @@ static inline int cache_free_alien(struc spin_lock(&alien->lock); 
if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, alien, nodeid, this_cpu); } alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { spin_lock(&(cachep->nodelists[nodeid])->list_lock); - free_block(cachep, &objp, 1, nodeid); + free_block(cachep, &objp, 1, nodeid, this_cpu); spin_unlock(&(cachep->nodelists[nodeid])->list_lock); } return 1; } + #endif static int __cpuinit cpuup_callback(struct notifier_block *nfb, @@ -1161,6 +1225,7 @@ static int __cpuinit cpuup_callback(stru struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); int memsize = sizeof(struct kmem_list3); + int this_cpu; switch (action) { case CPU_LOCK_ACQUIRE: @@ -1197,11 +1262,11 @@ static int __cpuinit cpuup_callback(stru cachep->nodelists[node] = l3; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); + slab_spin_lock_irq(&cachep->nodelists[node]->list_lock, this_cpu); cachep->nodelists[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + slab_spin_unlock_irq(&cachep->nodelists[node]->list_lock, this_cpu); } /* @@ -1233,7 +1298,7 @@ static int __cpuinit cpuup_callback(stru l3 = cachep->nodelists[node]; BUG_ON(!l3); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (!l3->shared) { /* * We are serialised from CPU_DEAD or @@ -1248,7 +1313,7 @@ static int __cpuinit cpuup_callback(stru alien = NULL; } #endif - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); free_alien_cache(alien); } @@ -1292,6 +1357,7 @@ static int __cpuinit cpuup_callback(stru struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; + int this_cpu; cpumask_t mask; mask = node_to_cpumask(node); @@ -1303,29 +1369,31 @@ static int __cpuinit cpuup_callback(stru if (!l3) goto free_array_cache; - 
spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, + &this_cpu); if (!cpus_empty(mask)) { - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, + this_cpu); goto free_array_cache; } shared = l3->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &this_cpu); l3->shared = NULL; } alien = l3->alien; l3->alien = NULL; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); if (alien) { @@ -1367,11 +1435,13 @@ static void init_list(struct kmem_cache int nodeid) { struct kmem_list3 *ptr; + int this_cpu; ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); BUG_ON(!ptr); - local_irq_disable(); + WARN_ON(spin_is_locked(&list->list_lock)); + slab_irq_disable(this_cpu); memcpy(ptr, list, sizeof(struct kmem_list3)); /* * Do not assume that spinlocks can be initialized via memcpy: @@ -1380,7 +1450,7 @@ static void init_list(struct kmem_cache MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->nodelists[nodeid] = ptr; - local_irq_enable(); + slab_irq_enable(this_cpu); } /* @@ -1524,36 +1594,34 @@ void __init kmem_cache_init(void) /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; + int this_cpu; ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), - sizeof(struct arraycache_init)); + slab_irq_disable(this_cpu); + BUG_ON(cpu_cache_get(&cache_cache, this_cpu) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(&cache_cache, this_cpu), + sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - - 
cache_cache.array[smp_processor_id()] = ptr; - local_irq_enable(); + cache_cache.array[this_cpu] = ptr; + slab_irq_enable(this_cpu); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) - != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), - sizeof(struct arraycache_init)); + slab_irq_disable(this_cpu); + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu) + != &initarray_generic.cache); + memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu), + sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - - malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; - local_irq_enable(); + malloc_sizes[INDEX_AC].cs_cachep->array[this_cpu] = ptr; + slab_irq_enable(this_cpu); } /* 5) Replace the bootstrap kmem_list3's */ { @@ -1704,7 +1772,7 @@ static void store_stackinfo(struct kmem_ *addr++ = 0x12345678; *addr++ = caller; - *addr++ = smp_processor_id(); + *addr++ = raw_smp_processor_id(); size -= 3 * sizeof(unsigned long); { unsigned long *sptr = &caller; @@ -1859,7 +1927,11 @@ static void check_poison_obj(struct kmem } #endif +static void +__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu); + #if DEBUG + /** * slab_destroy_objs - destroy a slab and its objects * @cachep: cache pointer being destroyed @@ -1868,7 +1940,8 @@ static void check_poison_obj(struct kmem * Call the registered destructor for each object in a slab that is being * destroyed. */ -static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) +static void +slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { @@ -1911,7 +1984,8 @@ static void slab_destroy_objs(struct kme * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. 
*/ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void +slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1925,8 +1999,12 @@ static void slab_destroy(struct kmem_cac call_rcu(&slab_rcu->head, kmem_rcu_free); } else { kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + if (OFF_SLAB(cachep)) { + if (this_cpu) + __cache_free(cachep->slabp_cache, slabp, this_cpu); + else + kmem_cache_free(cachep->slabp_cache, slabp); + } } } @@ -2039,6 +2117,8 @@ static size_t calculate_slab_order(struc static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) { + int this_cpu; + if (g_cpucache_up == FULL) return enable_cpucache(cachep); @@ -2082,10 +2162,12 @@ static int __init_refok setup_cpu_cache( jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; + this_cpu = raw_smp_processor_id(); + + cpu_cache_get(cachep, this_cpu)->avail = 0; + cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep, this_cpu)->batchcount = 1; + cpu_cache_get(cachep, this_cpu)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; return 0; @@ -2376,19 +2458,19 @@ EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) { +/* + * On PREEMPT_RT we use locks to protect the per-CPU lists, + * and keep interrupts enabled. 
+ */ +#ifndef CONFIG_PREEMPT_RT BUG_ON(!irqs_disabled()); +#endif } static void check_irq_on(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(irqs_disabled()); -} - -static void check_spinlock_acquired(struct kmem_cache *cachep) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); #endif } @@ -2403,7 +2485,6 @@ static void check_spinlock_acquired_node #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) -#define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif @@ -2411,26 +2492,60 @@ static void drain_array(struct kmem_cach struct array_cache *ac, int force, int node); -static void do_drain(void *arg) +static void __do_drain(void *arg, int this_cpu) { struct kmem_cache *cachep = arg; + int node = cpu_to_node(this_cpu); struct array_cache *ac; - int node = numa_node_id(); check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, this_cpu); spin_lock(&cachep->nodelists[node]->list_lock); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &this_cpu); spin_unlock(&cachep->nodelists[node]->list_lock); ac->avail = 0; } +#ifdef CONFIG_PREEMPT_RT +static void do_drain(void *arg, int this_cpu) +{ + __do_drain(arg, this_cpu); +} +#else +static void do_drain(void *arg) +{ + __do_drain(arg, smp_processor_id()); +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* + * execute func() for all CPUs. On PREEMPT_RT we dont actually have + * to run on the remote CPUs - we only have to take their CPU-locks. + * (This is a rare operation, so cacheline bouncing is not an issue.) 
+ */ +static void +slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) +{ + unsigned int i; + + check_irq_on(); + for_each_online_cpu(i) { + spin_lock(&__get_cpu_lock(slab_irq_locks, i)); + func(arg, i); + spin_unlock(&__get_cpu_lock(slab_irq_locks, i)); + } +} +#else +# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1, 1) +#endif + static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1, 1); + slab_on_each_cpu(do_drain, cachep); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2455,16 +2570,16 @@ static int drain_freelist(struct kmem_ca struct kmem_list3 *l3, int tofree) { struct list_head *p; - int nr_freed; + int nr_freed, this_cpu; struct slab *slabp; nr_freed = 0; while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); p = l3->slabs_free.prev; if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); goto out; } @@ -2473,13 +2588,9 @@ static int drain_freelist(struct kmem_ca BUG_ON(slabp->inuse); #endif list_del(&slabp->list); - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. - */ l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, slabp, &this_cpu); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); nr_freed++; } out: @@ -2731,8 +2842,8 @@ static void slab_map_pages(struct kmem_c * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
*/ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, + void *objp, int *this_cpu) { struct slab *slabp; size_t offset; @@ -2761,7 +2872,8 @@ static int cache_grow(struct kmem_cache offset *= cachep->colour_off; if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_nort(); + slab_irq_enable_rt(*this_cpu); /* * The test for missing atomic flag is performed here, rather than @@ -2791,8 +2903,10 @@ static int cache_grow(struct kmem_cache cache_init_objs(cachep, slabp); + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(); + check_irq_off(); spin_lock(&l3->list_lock); @@ -2805,8 +2919,9 @@ static int cache_grow(struct kmem_cache opps1: kmem_freepages(cachep, objp); failed: + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(); return 0; } @@ -2926,7 +3041,8 @@ bad: #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +static void * +cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { int batchcount; struct kmem_list3 *l3; @@ -2936,7 +3052,7 @@ static void *cache_alloc_refill(struct k node = numa_node_id(); check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2947,7 +3063,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[node]; + l3 = cachep->nodelists[cpu_to_node(*this_cpu)]; BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); @@ -2970,7 +3086,7 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); - check_spinlock_acquired(cachep); + check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu)); /* * The slab was either on partial or free list so @@ 
-2984,8 +3100,9 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - node); + ac->entry[ac->avail++] = + slab_get_obj(cachep, slabp, + cpu_to_node(*this_cpu)); } check_slabp(cachep, slabp); @@ -3004,10 +3121,10 @@ alloc_done: if (unlikely(!ac->avail)) { int x; - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu); /* cache_grow can reenable interrupts, then ac could change. */ - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (!x && ac->avail == 0) /* no objects in sight? abort */ return NULL; @@ -3159,21 +3276,22 @@ static inline int should_failslab(struct #endif /* CONFIG_FAILSLAB */ -static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static inline void * +____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { void *objp; struct array_cache *ac; check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; objp = ac->entry[--ac->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = cache_alloc_refill(cachep, flags, this_cpu); } return objp; } @@ -3187,7 +3305,7 @@ static inline void *____cache_alloc(stru */ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) { - int nid_alloc, nid_here; + int nid_alloc, nid_here, this_cpu = raw_smp_processor_id(); if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; @@ -3197,7 +3315,7 @@ static void *alternate_node_alloc(struct else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); if (nid_alloc != nid_here) - return ____cache_alloc_node(cachep, flags, nid_alloc); + return ____cache_alloc_node(cachep, flags, nid_alloc, &this_cpu); return NULL; } @@ -3209,7 +3327,7 @@ static void *alternate_node_alloc(struct * allocator to do 
its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. */ -static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { struct zonelist *zonelist; gfp_t local_flags; @@ -3235,8 +3353,10 @@ retry: if (cpuset_zone_allowed_hardwall(*z, flags) && cache->nodelists[nid] && cache->nodelists[nid]->free_objects) - obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid, + this_cpu); } if (!obj) { @@ -3247,19 +3367,24 @@ retry: * set and go into memory reserves if necessary. */ if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_nort(); + slab_irq_enable_rt(*this_cpu); + kmem_flagcheck(cache, flags); obj = kmem_getpages(cache, flags, -1); + + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(); + if (obj) { /* * Insert into the appropriate per node queues */ nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + if (cache_grow(cache, flags, nid, obj, this_cpu)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + flags | GFP_THISNODE, nid, this_cpu); if (!obj) /* * Another processor may allocate the @@ -3280,7 +3405,7 @@ retry: * A interface to enable slab creation on nodeid */ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, int *this_cpu) { struct list_head *entry; struct slab *slabp; @@ -3328,11 +3453,11 @@ retry: must_grow: spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu); if (x) goto retry; - return fallback_alloc(cachep, flags); + return fallback_alloc(cachep, flags, this_cpu); done: return obj; @@ -3354,46 +3479,48 @@ static __always_inline void * __cache_alloc_node(struct 
kmem_cache *cachep, gfp_t flags, int nodeid, void *caller) { - unsigned long save_flags; + unsigned long irqflags; + int this_cpu; void *ptr; if (should_failslab(cachep, flags)) return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + + slab_irq_save(irqflags, this_cpu); if (unlikely(nodeid == -1)) - nodeid = numa_node_id(); + nodeid = cpu_to_node(this_cpu); if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); + ptr = fallback_alloc(cachep, flags, &this_cpu); goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == cpu_to_node(this_cpu)) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); + ptr = ____cache_alloc(cachep, flags, &this_cpu); if (ptr) goto out; } /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); + ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu); out: - local_irq_restore(save_flags); + slab_irq_restore(irqflags, this_cpu); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); return ptr; } static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { void *objp; @@ -3402,24 +3529,24 @@ __do_cache_alloc(struct kmem_cache *cach if (objp) goto out; } - objp = ____cache_alloc(cache, flags); + objp = ____cache_alloc(cache, flags, this_cpu); /* * We may just have run out of memory on the local node. 
* ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); - + if (!objp) + objp = ____cache_alloc_node(cache, flags, + cpu_to_node(*this_cpu), this_cpu); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { - return ____cache_alloc(cachep, flags); + return ____cache_alloc(cachep, flags, this_cpu); } #endif /* CONFIG_NUMA */ @@ -3428,15 +3555,16 @@ static __always_inline void * __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; + int this_cpu; void *objp; if (should_failslab(cachep, flags)) return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); + slab_irq_save(save_flags, this_cpu); + objp = __do_cache_alloc(cachep, flags, &this_cpu); + slab_irq_restore(save_flags, this_cpu); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); @@ -3447,7 +3575,7 @@ __cache_alloc(struct kmem_cache *cachep, * Caller needs to acquire correct kmem_list's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) + int node, int *this_cpu) { int i; struct kmem_list3 *l3; @@ -3476,7 +3604,7 @@ static void free_block(struct kmem_cache * a different cache, refer to comments before * alloc_slabmgmt. 
*/ - slab_destroy(cachep, slabp); + slab_destroy(cachep, slabp, this_cpu); } else { list_add(&slabp->list, &l3->slabs_free); } @@ -3490,11 +3618,12 @@ static void free_block(struct kmem_cache } } -static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +static void +cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = cpu_to_node(*this_cpu); batchcount = ac->batchcount; #if DEBUG @@ -3516,7 +3645,7 @@ static void cache_flusharray(struct kmem } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, this_cpu); free_done: #if STATS { @@ -3545,14 +3674,15 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. */ -static inline void __cache_free(struct kmem_cache *cachep, void *objp) +static void +__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu) { - struct array_cache *ac = cpu_cache_get(cachep); + struct array_cache *ac = cpu_cache_get(cachep, *this_cpu); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp)) + if (cache_free_alien(cachep, objp, this_cpu)) return; if (likely(ac->avail < ac->limit)) { @@ -3561,7 +3691,7 @@ static inline void __cache_free(struct k return; } else { STATS_INC_FREEMISS(cachep); - cache_flusharray(cachep, ac); + cache_flusharray(cachep, ac, this_cpu); ac->entry[ac->avail++] = objp; } } @@ -3782,13 +3912,14 @@ EXPORT_SYMBOL(krealloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + int this_cpu; BUG_ON(virt_to_cache(objp) != cachep); - local_irq_save(flags); + slab_irq_save(flags, this_cpu); debug_check_no_locks_freed(objp, obj_size(cachep)); - __cache_free(cachep, objp); - local_irq_restore(flags); + __cache_free(cachep, objp, 
&this_cpu); + slab_irq_restore(flags, this_cpu); } EXPORT_SYMBOL(kmem_cache_free); @@ -3805,15 +3936,16 @@ void kfree(const void *objp) { struct kmem_cache *c; unsigned long flags; + int this_cpu; if (unlikely(!objp)) return; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp); - local_irq_restore(flags); + __cache_free(c, (void *)objp, &this_cpu); + slab_irq_restore(flags, this_cpu); } EXPORT_SYMBOL(kfree); @@ -3834,7 +3966,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); */ static int alloc_kmemlist(struct kmem_cache *cachep) { - int node; + int node, this_cpu; struct kmem_list3 *l3; struct array_cache *new_shared; struct array_cache **new_alien = NULL; @@ -3862,11 +3994,11 @@ static int alloc_kmemlist(struct kmem_ca if (l3) { struct array_cache *shared = l3->shared; - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &this_cpu); l3->shared = new_shared; if (!l3->alien) { @@ -3875,7 +4007,7 @@ static int alloc_kmemlist(struct kmem_ca } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); free_alien_cache(new_alien); continue; @@ -3922,42 +4054,50 @@ struct ccupdate_struct { struct array_cache *new[NR_CPUS]; }; -static void do_ccupdate_local(void *info) +static void __do_ccupdate_local(void *info, int this_cpu) { struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); - old = cpu_cache_get(new->cachep); + old = cpu_cache_get(new->cachep, this_cpu); + + new->cachep->array[this_cpu] = new->new[this_cpu]; + new->new[this_cpu] = old; +} - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; +#ifdef CONFIG_PREEMPT_RT +static 
void do_ccupdate_local(void *arg, int this_cpu) +{ + __do_ccupdate_local(arg, this_cpu); } +#else +static void do_ccupdate_local(void *arg) +{ + __do_ccupdate_local(arg, smp_processor_id()); +} +#endif /* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared) { - struct ccupdate_struct *new; - int i; - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return -ENOMEM; + struct ccupdate_struct new; + int i, this_cpu; + memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_node(i), limit, + new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); - if (!new->new[i]) { + if (!new.new[i]) { for (i--; i >= 0; i--) - kfree(new->new[i]); - kfree(new); + kfree(new.new[i]); return -ENOMEM; } } - new->cachep = cachep; + new.cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); + slab_on_each_cpu(do_ccupdate_local, (void *)&new); check_irq_on(); cachep->batchcount = batchcount; @@ -3965,15 +4105,15 @@ static int do_tune_cpucache(struct kmem_ cachep->shared = shared; for_each_online_cpu(i) { - struct array_cache *ccold = new->new[i]; + struct array_cache *ccold = new.new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i), &this_cpu); + slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu); kfree(ccold); } - kfree(new); + return alloc_kmemlist(cachep); } @@ -4037,26 +4177,26 @@ static int enable_cpucache(struct kmem_c * if drain_array() is used on the shared array. 
*/ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, - struct array_cache *ac, int force, int node) + struct array_cache *ac, int force, int node) { - int tofree; + int tofree, this_cpu; if (!ac || !ac->avail) return; if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &this_cpu); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } } @@ -4074,11 +4214,12 @@ void drain_array(struct kmem_cache *cach */ static void cache_reap(struct work_struct *w) { + int this_cpu = raw_smp_processor_id(), node = cpu_to_node(this_cpu); struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); struct delayed_work *work = container_of(w, struct delayed_work, work); + int work_done = 0; if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. 
*/ @@ -4094,9 +4235,10 @@ static void cache_reap(struct work_struc */ l3 = searchp->nodelists[node]; - reap_alien(searchp, l3); + reap_alien(searchp, l3, &this_cpu); - drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + drain_array(searchp, l3, cpu_cache_get(searchp, this_cpu), + 0, node); /* * These are racy checks but it does not matter @@ -4194,7 +4336,7 @@ static int s_show(struct seq_file *m, vo unsigned long num_slabs, free_objects = 0, shared_avail = 0; const char *name; char *error = NULL; - int node; + int this_cpu, node; struct kmem_list3 *l3; active_objs = 0; @@ -4205,7 +4347,7 @@ static int s_show(struct seq_file *m, vo continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) @@ -4230,7 +4372,7 @@ static int s_show(struct seq_file *m, vo if (l3->shared) shared_avail += l3->shared->avail; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4435,7 +4577,7 @@ static int leaks_show(struct seq_file *m struct kmem_list3 *l3; const char *name; unsigned long *n = m->private; - int node; + int node, this_cpu; int i; if (!(cachep->flags & SLAB_STORE_USER)) @@ -4453,13 +4595,13 @@ static int leaks_show(struct seq_file *m continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } name = cachep->name; if (n[0] == n[1]) { patches/dont-unmask-io_apic.patch0000664000077200007720000000133610646635211016412 0ustar mingomingo--- arch/x86_64/kernel/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Index: 
linux-rt.q/arch/x86_64/kernel/io_apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/io_apic.c +++ linux-rt.q/arch/x86_64/kernel/io_apic.c @@ -1405,7 +1405,8 @@ static void ack_apic_level(unsigned int irq_complete_move(irq); #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING) && + !(irq_desc[irq].status & IRQ_INPROGRESS)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } patches/hpet-force-enable-on-vt8235-37-chipsets.patch0000664000077200007720000001055410646635211021465 0ustar mingomingoFrom us15@os.inf.tu-dresden.de Tue Jun 12 14:31:48 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.2 required=5.0 tests=AWL,MAILTO_TO_SPAM_ADDR autolearn=no version=3.1.7-deb Received: from os.inf.tu-dresden.de (os.inf.tu-dresden.de [141.76.48.99]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (No client certificate requested) by mail.tglx.de (Postfix) with ESMTP id 13C2565C292 for ; Tue, 12 Jun 2007 14:31:48 +0200 (CEST) Received: from nova.inf.tu-dresden.de ([141.76.48.73] helo=laptop.hypervisor.org) by os.inf.tu-dresden.de with esmtpsa (TLSv1:AES256-SHA:256) (Exim 4.67) id 1Hy5XI-0008Nr-CO for tglx@linutronix.de; Tue, 12 Jun 2007 14:31:48 +0200 Date: Tue, 12 Jun 2007 14:31:47 +0200 From: "Udo A. 
Steinberg" To: Thomas Gleixner Subject: [PATCH]: Force enable HPET on VT8235/8237 chipsets Message-ID: <20070612143147.2a6199c2@laptop.hypervisor.org> X-Mailer: X-Mailer 5.0 Gold Mime-Version: 1.0 Content-Type: multipart/signed; boundary=Sig_48zpQdK28xw1yvtSEbZ9tfp; protocol="application/pgp-signature"; micalg=PGP-SHA1 X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ --Sig_48zpQdK28xw1yvtSEbZ9tfp Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: quoted-printable This patch adds quirks to force enable HPET on Via VT8235 and VT8237 chipse= ts. The datasheet for 8237 documents HPET functionality (although wrongly) wher= eas HPET is undocumented for 8235. Tested on A7V880 (8237) and K7VT4A+ (8235) boards. Signed-off-by: Udo A. Steinberg --- quirks.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++= +++- 1 file changed, 66 insertions(+), 1 deletion(-) --- arch/i386/kernel/quirks.c | 68 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/quirks.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/quirks.c +++ linux-rt.q/arch/i386/kernel/quirks.c @@ -56,7 +56,8 @@ unsigned long force_hpet_address; static enum { NONE_FORCE_HPET_RESUME, OLD_ICH_FORCE_HPET_RESUME, - ICH_FORCE_HPET_RESUME + ICH_FORCE_HPET_RESUME, + VT8237_FORCE_HPET_RESUME } force_hpet_resume_type; static void __iomem *rcba_base; @@ -245,6 +246,68 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, old_ich_force_enable_hpet); + +static void vt8237_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address || !cached_dev) + return; + + val = 0xfed00000 | 0x80; + pci_write_config_dword(cached_dev, 0x68, val); + + pci_read_config_dword(cached_dev, 0x68, &val); + if (val & 0x80) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + 
+static void vt8237_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0x68, &val); + /* + * Bit 7 is HPET enable bit. + * Bit 31:10 is HPET base address (contrary to what datasheet claims) + */ + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + printk(KERN_DEBUG "HPET at base address 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + val = 0xfed00000 | 0x80; + pci_write_config_dword(dev, 0x68, val); + + pci_read_config_dword(dev, 0x68, &val); + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; + return; + } + + printk(KERN_DEBUG "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, + vt8237_force_enable_hpet); + void force_hpet_resume(void) { switch (force_hpet_resume_type) { @@ -254,6 +317,9 @@ void force_hpet_resume(void) case OLD_ICH_FORCE_HPET_RESUME: return old_ich_force_hpet_resume(); + case VT8237_FORCE_HPET_RESUME: + return vt8237_force_hpet_resume(); + default: break; } patches/lockdep_fixup_annotate.patch0000664000077200007720000000504610646635217017306 0ustar mingomingo--- kernel/lockdep.c | 2 +- net/core/sock.c | 23 +++++++++++++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -1335,7 +1335,7 @@ look_up_lock_class(struct lockdep_map *l */ list_for_each_entry(class, hash_head, hash_entry) { if (class->key == key) { - WARN_ON(class->name != lock->name); + 
WARN_ON_ONCE(class->name != lock->name); return class; } } Index: linux-rt.q/net/core/sock.c =================================================================== --- linux-rt.q.orig/net/core/sock.c +++ linux-rt.q/net/core/sock.c @@ -171,6 +171,19 @@ static const char *af_family_slock_key_s "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_MAX" }; +static const char *af_family_clock_key_strings[AF_MAX+1] = { + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , + "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", + "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , + "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , + "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , + "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , + "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , + "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" , + "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , + "clock-27" , "clock-28" , "clock-29" , + "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_MAX" +}; #endif /* @@ -940,8 +953,9 @@ struct sock *sk_clone(const struct sock rwlock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); - lockdep_set_class(&newsk->sk_callback_lock, - af_callback_keys + newsk->sk_family); + lockdep_set_class_and_name(&newsk->sk_callback_lock, + af_callback_keys + newsk->sk_family, + af_family_clock_key_strings[newsk->sk_family]); newsk->sk_dst_cache = NULL; newsk->sk_wmem_queued = 0; @@ -1529,8 +1543,9 @@ void sock_init_data(struct socket *sock, rwlock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); - lockdep_set_class(&sk->sk_callback_lock, - af_callback_keys + sk->sk_family); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; 
patches/tick-management-spread-timer-interrupt.patch0000664000077200007720000000303010646635210022231 0ustar mingomingoFrom: john stultz After discussing w/ Thomas over IRC, it seems the issue is the sched tick fires on every cpu at the same time, causing extra lock contention. This smaller change, adds an extra offset per cpu so the ticks don't line up. This patch also drops the idle latency from 40us down to under 20us. Signed-off-by: john stultz Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton --- kernel/time/tick-sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) Index: linux-rt.q/kernel/time/tick-sched.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-sched.c +++ linux-rt.q/kernel/time/tick-sched.c @@ -572,6 +572,7 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); + u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -580,8 +581,12 @@ void tick_setup_sched_timer(void) ts->sched_timer.function = tick_sched_timer; ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; - /* Get the next period */ + /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); + offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, NR_CPUS); + offset *= smp_processor_id(); + ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); patches/ich-force-hpet-late-initialization-of-hpet-after-quirk.patch0000664000077200007720000000473210646635211025111 0ustar mingomingoFrom: Venki Pallipadi Enable HPET later during boot, after the force detect in PCI quirks. Also add a call to repeat the force enabling at resume time. 
Signed-off-by: Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andi Kleen Cc: john stultz Cc: Greg KH Signed-off-by: Andrew Morton --- arch/i386/kernel/hpet.c | 27 +++++++++++++++++++++++++-- include/asm-i386/hpet.h | 1 + 2 files changed, 26 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -164,6 +164,7 @@ static struct clock_event_device hpet_cl .set_next_event = hpet_legacy_next_event, .shift = 32, .irq = 0, + .rating = 50, }; static void hpet_start_counter(void) @@ -178,6 +179,17 @@ static void hpet_start_counter(void) hpet_writel(cfg, HPET_CFG); } +static void hpet_resume_device(void) +{ + ich_force_hpet_resume(); +} + +static void hpet_restart_counter(void) +{ + hpet_resume_device(); + hpet_start_counter(); +} + static void hpet_enable_legacy_int(void) { unsigned long cfg = hpet_readl(HPET_CFG); @@ -299,7 +311,7 @@ static struct clocksource clocksource_hp .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .resume = hpet_start_counter, + .resume = hpet_restart_counter, #ifdef CONFIG_X86_64 .vread = vread_hpet, #endif @@ -412,10 +424,21 @@ out_nohpet: */ static __init int hpet_late_init(void) { - if (!is_hpet_capable()) + if (boot_hpet_disable) return -ENODEV; + if (!hpet_address) { + if (!force_hpet_address) + return -ENODEV; + + hpet_address = force_hpet_address; + hpet_enable(); + if (!hpet_virt_address) + return -ENODEV; + } + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + return 0; } fs_initcall(hpet_late_init); Index: linux-rt.q/include/asm-i386/hpet.h =================================================================== --- linux-rt.q.orig/include/asm-i386/hpet.h +++ linux-rt.q/include/asm-i386/hpet.h @@ -64,6 +64,7 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; +extern unsigned long force_hpet_address; 
extern int is_hpet_enabled(void); extern int hpet_enable(void); extern unsigned long hpet_readl(unsigned long a); patches/write-try-lock-irqsave.patch0000664000077200007720000000115410646635211017113 0ustar mingomingo--- include/linux/spinlock.h | 7 +++++++ 1 file changed, 7 insertions(+) Index: linux-rt.q/include/linux/spinlock.h =================================================================== --- linux-rt.q.orig/include/linux/spinlock.h +++ linux-rt.q/include/linux/spinlock.h @@ -282,6 +282,13 @@ do { \ 1 : ({ local_irq_restore(flags); 0; }); \ }) +#define write_trylock_irqsave(lock, flags) \ +({ \ + local_irq_save(flags); \ + write_trylock(lock) ? \ + 1 : ({ local_irq_restore(flags); 0; }); \ +}) + /* * Locks two spinlocks l1 and l2. * l1_first indicates if spinlock l1 should be taken first. patches/lockdep-prove-locking.patch0000664000077200007720000000662110646635217016757 0ustar mingomingoEnsure that all of the lock dependency tracking code is under CONFIG_PROVE_LOCKING. This allows us to use the held lock tracking code for other purposes. Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron --- kernel/lockdep.c | 13 ++++++++++++- kernel/spinlock.c | 4 ++-- 2 files changed, 14 insertions(+), 3 deletions(-) Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -95,6 +95,7 @@ static int lockdep_initialized; unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +#ifdef CONFIG_PROVE_LOCKING /* * Allocate a lockdep entry. (assumes the graph_lock held, returns * with NULL on failure) @@ -111,6 +112,7 @@ static struct lock_list *alloc_list_entr } return list_entries + nr_list_entries++; } +#endif /* * All data structures here are protected by the global debug_lock. 
@@ -140,7 +142,9 @@ LIST_HEAD(all_lock_classes); static struct list_head classhash_table[CLASSHASH_SIZE]; unsigned long nr_lock_chains; +#ifdef CONFIG_PROVE_LOCKING static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +#endif /* * We put the lock dependency chains into a hash-table as well, to cache @@ -486,6 +490,7 @@ static void print_lock_dependencies(stru } } +#ifdef CONFIG_PROVE_LOCKING /* * Add a new dependency to the head of the list: */ @@ -545,6 +550,7 @@ print_circular_bug_entry(struct lock_lis return 0; } +#endif static void print_kernel_version(void) { @@ -553,6 +559,7 @@ static void print_kernel_version(void) init_utsname()->version); } +#ifdef CONFIG_PROVE_LOCKING /* * When a circular dependency is detected, print the * header first: @@ -643,6 +650,7 @@ check_noncircular(struct lock_class *sou } return 1; } +#endif static int very_verbose(struct lock_class *class) { @@ -827,6 +835,7 @@ check_usage(struct task_struct *curr, st #endif +#ifdef CONFIG_PROVE_LOCKING static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) @@ -1091,7 +1100,7 @@ out_bug: return 0; } - +#endif /* * Is this the address of a static object: @@ -1311,6 +1320,7 @@ out_unlock_set: return class; } +#ifdef CONFIG_PROVE_LOCKING /* * Look up a dependency chain. 
If the key is not present yet then * add it and return 1 - in this case the new dependency chain is @@ -1385,6 +1395,7 @@ cache_hit: return 1; } +#endif /* * We are building curr_chain_key incrementally, so double-check Index: linux-rt.q/kernel/spinlock.c =================================================================== --- linux-rt.q.orig/kernel/spinlock.c +++ linux-rt.q/kernel/spinlock.c @@ -139,7 +139,7 @@ unsigned long __lockfunc __spin_lock_irq * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_PROVE_LOCKING +#ifdef CONFIG_LOCKDEP _raw_spin_lock(lock); #else _raw_spin_lock_flags(lock, &flags); @@ -359,7 +359,7 @@ __spin_lock_irqsave_nested(raw_spinlock_ * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_PROVE_SPIN_LOCKING +#ifdef CONFIG_LOCKDEP _raw_spin_lock(lock); #else _raw_spin_lock_flags(lock, &flags); patches/x86_64-consolidate-tsc-calibration.patch0000664000077200007720000001540110646635211021063 0ustar mingomingoSubject: x86_64: Consolidate tsc calibration Move the TSC calibration code to tsc.c. Reimplement it so the pm timer can be used as a reference as well. 
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/hpet.c | 49 ------------------------- arch/x86_64/kernel/time.c | 33 +--------------- arch/x86_64/kernel/tsc.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-i386/tsc.h | 4 ++ 4 files changed, 96 insertions(+), 80 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/hpet.c +++ linux-rt.q/arch/x86_64/kernel/hpet.c @@ -184,55 +184,6 @@ int hpet_reenable(void) return hpet_timer_stop_set_go(hpet_tick); } -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 -#define SMI_THRESHOLD 50000 -#define MAX_TRIES 5 - -/* - * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none - * occurs between the reads of the hpet & TSC. - */ -static void __init read_hpet_tsc(int *hpet, int *tsc) -{ - int tsc1, tsc2, hpet1, i; - - for (i = 0; i < MAX_TRIES; i++) { - tsc1 = get_cycles_sync(); - hpet1 = hpet_readl(HPET_COUNTER); - tsc2 = get_cycles_sync(); - if ((tsc2 - tsc1) < SMI_THRESHOLD) - break; - } - *hpet = hpet1; - *tsc = tsc2; -} - -unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - - read_hpet_tsc(&hpet_start, &tsc_start); - - do { - local_irq_disable(); - read_hpet_tsc(&hpet_now, &tsc_now); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - #ifdef CONFIG_HPET_EMULATE_RTC /* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET * is enabled, we support RTC interrupt functionality in software. 
Index: linux-rt.q/arch/x86_64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/time.c +++ linux-rt.q/arch/x86_64/kernel/time.c @@ -291,35 +291,6 @@ static unsigned int __init tsc_calibrate return pmc_now * tsc_khz / (tsc_now - tsc_start); } -/* - * pit_calibrate_tsc() uses the speaker output (channel 2) of - * the PIT. This is better than using the timer interrupt output, - * because we can read the value of the speaker with just one inb(), - * where we need three i/o operations for the interrupt channel. - * We count how many ticks the TSC does in 50 ms. - */ - -static unsigned int __init pit_calibrate_tsc(void) -{ - unsigned long start, end; - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - outb(0xb0, 0x43); - outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); - start = get_cycles_sync(); - while ((inb(0x61) & 0x20) == 0); - end = get_cycles_sync(); - - spin_unlock_irqrestore(&i8253_lock, flags); - - return (end - start) / 50; -} - #define PIT_MODE 0x43 #define PIT_CH0 0x40 @@ -375,14 +346,14 @@ void __init time_init(void) if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; - tsc_khz = hpet_calibrate_tsc(); timename = "HPET"; } else { pit_init(); - tsc_khz = pit_calibrate_tsc(); timename = "PIT"; } + tsc_calibrate(); + cpu_khz = tsc_khz; if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && boot_cpu_data.x86_vendor == X86_VENDOR_AMD && Index: linux-rt.q/arch/x86_64/kernel/tsc.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/tsc.c +++ linux-rt.q/arch/x86_64/kernel/tsc.c @@ -6,7 +6,9 @@ #include #include #include +#include +#include #include static int notsc __initdata = 0; @@ -118,6 +120,94 @@ core_initcall(cpufreq_tsc); #endif +#define MAX_RETRIES 5 +#define SMI_TRESHOLD 
50000 + +/* + * Read TSC and the reference counters. Take care of SMI disturbance + */ +static unsigned long __init tsc_read_refs(unsigned long *pm, + unsigned long *hpet) +{ + unsigned long t1, t2; + int i; + + for (i = 0; i < MAX_RETRIES; i++) { + t1 = get_cycles_sync(); + if (hpet) + *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + else + *pm = acpi_pm_read_early(); + t2 = get_cycles_sync(); + if ((t2 - t1) < SMI_TRESHOLD) + return t2; + } + return ULONG_MAX; +} + +/** + * tsc_calibrate - calibrate the tsc on boot + */ +void __init tsc_calibrate(void) +{ + unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; + int hpet = is_hpet_enabled(); + + local_irq_save(flags); + + tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + outb(0xb0, 0x43); + outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); + outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); + tr1 = get_cycles_sync(); + while ((inb(0x61) & 0x20) == 0); + tr2 = get_cycles_sync(); + + tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + + local_irq_restore(flags); + + /* + * Preset the result with the raw and inaccurate PIT + * calibration value + */ + tsc_khz = (tr2 - tr1) / 50; + + /* hpet or pmtimer available ? 
*/ + if (!hpet && !pm1 && !pm2) { + printk(KERN_INFO "TSC calibrated against PIT\n"); + return; + } + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { + printk(KERN_WARNING "TSC calibration disturbed by SMI, " + "using PIT calibration result\n"); + return; + } + + tsc2 = (tsc2 - tsc1) * 1000000L; + + if (hpet) { + printk(KERN_INFO "TSC calibrated against HPET\n"); + if (hpet2 < hpet1) + hpet2 += 0x100000000; + hpet2 -= hpet1; + tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000; + } else { + printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); + if (pm2 < pm1) + pm2 += ACPI_PM_OVRRUN; + pm2 -= pm1; + tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC; + } + + tsc_khz = tsc2 / tsc1; +} + /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. Index: linux-rt.q/include/asm-i386/tsc.h =================================================================== --- linux-rt.q.orig/include/asm-i386/tsc.h +++ linux-rt.q/include/asm-i386/tsc.h @@ -71,4 +71,8 @@ extern void init_tsc_clocksource(void); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); +#ifdef CONFIG_X86_64 +extern void tsc_calibrate(void); +#endif + #endif patches/nmi-profiling-base.patch0000664000077200007720000003055210646635212016236 0ustar mingomingoSubject: [patch] nmi-driven profiling for /proc/profile From: Ingo Molnar nmi-driven profiling for /proc/profile Signed-off-by: Ingo Molnar --- arch/i386/kernel/crash.c | 8 --- arch/i386/kernel/nmi.c | 91 +++++++++++++++++++++++++++++++++++++++++---- arch/x86_64/kernel/crash.c | 5 -- arch/x86_64/kernel/irq.c | 2 arch/x86_64/kernel/nmi.c | 67 +++++++++++++++++++++++++++++++-- include/asm-i386/apic.h | 2 include/asm-x86_64/apic.h | 2 include/linux/profile.h | 1 kernel/profile.c | 9 ++-- kernel/time/tick-common.c | 1 kernel/time/tick-sched.c | 2 11 files changed, 159 insertions(+), 31 deletions(-) Index: linux-rt.q/arch/i386/kernel/crash.c 
=================================================================== --- linux-rt.q.orig/arch/i386/kernel/crash.c +++ linux-rt.q/arch/i386/kernel/crash.c @@ -70,14 +70,6 @@ static int crash_nmi_callback(struct not return 1; } -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); -} - static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; Index: linux-rt.q/arch/i386/kernel/nmi.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/nmi.c +++ linux-rt.q/arch/i386/kernel/nmi.c @@ -28,6 +28,8 @@ #include #include +#include + #include "mach_traps.h" int unknown_nmi_panic; @@ -44,7 +46,7 @@ static cpumask_t backtrace_mask = CPU_MA atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static DEFINE_PER_CPU(short, wd_enabled); @@ -95,7 +97,7 @@ static int __init check_nmi_watchdog(voi for_each_possible_cpu(cpu) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks + mdelay((100*1000)/nmi_hz); /* wait 100 ticks */ for_each_possible_cpu(cpu) { #ifdef CONFIG_SMP @@ -317,9 +319,48 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) { + int i; + + if (system_state == SYSTEM_BOOTING) + return; + + printk(KERN_WARNING "nmi_show_all_regs(): start on CPU#%d.\n", + raw_smp_processor_id()); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +static DEFINE_SPINLOCK(nmi_print_lock); + +void 
irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return; + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu); + printk(KERN_WARNING "apic_timer_irqs: %d\n", + per_cpu(irq_stat, cpu).apic_timer_irqs); + show_regs(regs); + spin_unlock(&nmi_print_lock); +} + +__kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) +{ /* * Since current_thread_info()-> is always on the stack, and we * always switch the stack NMI-atomically, it's safe to use @@ -330,6 +371,8 @@ __kprobes int nmi_watchdog_tick(struct p int cpu = smp_processor_id(); int rc=0; + __profile_tick(CPU_PROFILING, regs); + /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { @@ -353,6 +396,9 @@ __kprobes int nmi_watchdog_tick(struct p */ sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + irq_show_regs_callback(cpu, regs); + + /* if the apic timer isn't firing, this cpu isn't doing much */ /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { /* @@ -360,11 +406,30 @@ __kprobes int nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. 
- */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI watchdog detected lockup on " + "CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], + 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } + printk(KERN_WARNING "NMI watchdog running again ...\n"); + for_each_online_cpu(i) + alert_counter[i] = 0; + + + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; @@ -462,5 +527,15 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + cpumask_t mask = cpu_online_map; + cpu_clear(safe_smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, NMI_VECTOR); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); Index: linux-rt.q/arch/x86_64/kernel/crash.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/crash.c +++ linux-rt.q/arch/x86_64/kernel/crash.c @@ -62,11 +62,6 @@ static int crash_nmi_callback(struct not return 1; } -static void smp_send_nmi_allbutself(void) -{ - send_IPI_allbutself(NMI_VECTOR); -} - /* * This code is a best effort heuristic to get the * other cpus to stop executing. 
So races with Index: linux-rt.q/arch/x86_64/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/irq.c +++ linux-rt.q/arch/x86_64/kernel/irq.c @@ -111,6 +111,8 @@ asmlinkage unsigned int do_IRQ(struct pt unsigned vector = ~regs->orig_rax; unsigned irq; + irq_show_regs_callback(smp_processor_id(), regs); + exit_idle(); irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; Index: linux-rt.q/arch/x86_64/kernel/nmi.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/nmi.c +++ linux-rt.q/arch/x86_64/kernel/nmi.c @@ -22,11 +22,13 @@ #include #include #include +#include #include #include #include #include +#include int unknown_nmi_panic; int nmi_watchdog_enabled; @@ -44,7 +46,7 @@ atomic_t nmi_active = ATOMIC_INIT(0); / int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static DEFINE_PER_CPU(short, wd_enabled); @@ -302,7 +304,7 @@ void touch_nmi_watchdog (void) unsigned cpu; /* - * Tell other CPUs to reset their alert counters. We cannot + * Tell other CPUs to reset their alert counters. We cannot * do it ourselves because the alert count increase is not * atomic. 
*/ @@ -310,7 +312,42 @@ void touch_nmi_watchdog (void) per_cpu(nmi_touch, cpu) = 1; } - touch_softlockup_watchdog(); + touch_softlockup_watchdog(); +} + +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (system_state == SYSTEM_BOOTING) + return; + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +static DEFINE_SPINLOCK(nmi_print_lock); + +void irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return; + + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu); + printk(KERN_WARNING "apic_timer_irqs: %d\n", read_pda(apic_timer_irqs)); + show_regs(regs); + spin_unlock(&nmi_print_lock); } int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) @@ -320,6 +357,9 @@ int __kprobes nmi_watchdog_tick(struct p int cpu = smp_processor_id(); int rc = 0; + irq_show_regs_callback(cpu, regs); + __profile_tick(CPU_PROFILING, regs); + /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { @@ -328,6 +368,7 @@ int __kprobes nmi_watchdog_tick(struct p } sum = read_pda(apic_timer_irqs); + if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -356,9 +397,20 @@ int __kprobes nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... 
*/ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, panic_on_timeout); + } } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -461,6 +513,13 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + send_IPI_allbutself(NMI_VECTOR); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); Index: linux-rt.q/include/asm-i386/apic.h =================================================================== --- linux-rt.q.orig/include/asm-i386/apic.h +++ linux-rt.q/include/asm-i386/apic.h @@ -116,6 +116,8 @@ extern void enable_NMI_through_LVT0 (voi extern int timer_over_8254; extern int local_apic_timer_c2_ok; +extern void smp_send_nmi_allbutself(void); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } Index: linux-rt.q/include/asm-x86_64/apic.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/apic.h +++ linux-rt.q/include/asm-x86_64/apic.h @@ -85,6 +85,8 @@ extern void setup_APIC_extended_lvt(unsi extern int apic_is_clustered_box(void); +extern void smp_send_nmi_allbutself(void); + #define K8_APIC_EXT_LVT_BASE 0x500 #define K8_APIC_EXT_INT_MSG_FIX 0x0 #define K8_APIC_EXT_INT_MSG_SMI 0x2 Index: linux-rt.q/include/linux/profile.h =================================================================== --- linux-rt.q.orig/include/linux/profile.h +++ linux-rt.q/include/linux/profile.h @@ -23,6 +23,7 @@ struct notifier_block; /* init basic kernel profiler */ void __init profile_init(void); +void __profile_tick(int type, struct pt_regs *regs); void 
profile_tick(int); /* Index: linux-rt.q/kernel/profile.c =================================================================== --- linux-rt.q.orig/kernel/profile.c +++ linux-rt.q/kernel/profile.c @@ -407,16 +407,19 @@ void profile_hits(int type, void *__pc, EXPORT_SYMBOL_GPL(profile_hits); -void profile_tick(int type) +void __profile_tick(int type, struct pt_regs *regs) { - struct pt_regs *regs = get_irq_regs(); - if (type == CPU_PROFILING && timer_hook) timer_hook(regs); if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } +void profile_tick(int type) +{ + return __profile_tick(type, get_irq_regs()); +} + #ifdef CONFIG_PROC_FS #include #include Index: linux-rt.q/kernel/time/tick-common.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-common.c +++ linux-rt.q/kernel/time/tick-common.c @@ -68,7 +68,6 @@ static void tick_periodic(int cpu) } update_process_times(user_mode(get_irq_regs())); - profile_tick(CPU_PROFILING); } /* Index: linux-rt.q/kernel/time/tick-sched.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-sched.c +++ linux-rt.q/kernel/time/tick-sched.c @@ -438,7 +438,6 @@ static void tick_nohz_handler(struct clo } update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); /* Do not restart, when we are in the idle loop */ if (ts->tick_stopped) @@ -552,7 +551,6 @@ static enum hrtimer_restart tick_sched_t */ spin_unlock(&base->lock); update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); spin_lock(&base->lock); } patches/arm-preempt-config.patch0000664000077200007720000000176210646635213016252 0ustar mingomingo arch/arm/Kconfig | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) Index: linux-rt.q/arch/arm/Kconfig =================================================================== --- linux-rt.q.orig/arch/arm/Kconfig +++ linux-rt.q/arch/arm/Kconfig 
@@ -595,18 +595,7 @@ config LOCAL_TIMERS accounting to be spread across the timer interval, preventing a "thundering herd" at every timer tick. -config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +source kernel/Kconfig.preempt config NO_IDLE_HZ bool "Dynamic tick timer" patches/latency-tracer-variable-threshold.patch0000664000077200007720000000746710646635212021257 0ustar mingomingoFrom ce@ceag.ch Sun Jun 3 17:30:11 2007 Return-Path: Received: from toro.web-alm.net (toro.web-alm.net [62.245.132.31]) by mail.tglx.de (Postfix) with ESMTP id DC0AF65C065 for ; Sun, 3 Jun 2007 17:30:11 +0200 (CEST) Received: from toro.web-alm.net (localhost.localdomain [127.0.0.1]) by toro.web-alm.net (8.12.11.20060308/8.12.11/Web-Alm-2003112001) with ESMTP id l53FU9Dp010764 for ; Sun, 3 Jun 2007 17:30:09 +0200 Received: from thllin.ceag.ch (uucp@localhost) by toro.web-alm.net (8.12.11.20060308/8.12.10/Submit/Web-Alm-2003112001) with bsmtp id l53FU8ol010731 for tglx@linutronix.de; Sun, 3 Jun 2007 17:30:08 +0200 Received: from [192.168.255.76] (thlblade.ceag.ch [192.168.255.76]) by thllin.ceag.ch (8.12.11.20060308/8.12.11/CE-2005091901) with ESMTP id l53FMsUX003540 for ; Sun, 3 Jun 2007 17:22:55 +0200 Message-ID: <4662DCCE.8070002@ceag.ch> Date: Sun, 03 Jun 2007 17:22:54 +0200 From: Carsten Emde Organization: CE Computer Experts AG User-Agent: Mozilla/5.0 (X11; U; SunOS sun4u; en-US; rv:1.8.1.2) Gecko/20070301 SeaMonkey/1.1.1 MIME-Version: 1.0 To: Thomas Gleixner Subject: [PATCH] Make threshold to print '!' 
in latency trace variable Content-Type: multipart/mixed; boundary="------------020807010006040805040904" X-Virus-Scanned: ClamAV 0.90.1/3340/Sun Jun 3 00:40:38 2007 on thllin.ceag.ch X-Virus-Status: Clean X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ This is a multi-part message in MIME format. --------------020807010006040805040904 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 8bit Thomas, this patch introduces a variable threshold to print the exclamation mark in the latency_trace output instead of the constant 100 microseconds. --cbe --------------020807010006040805040904 Content-Type: text/plain; name="linux-2.6.21.3-rt9-mark_thresh.patch" Content-Disposition: inline; filename="linux-2.6.21.3-rt9-mark_thresh.patch" Content-Transfer-Encoding: 8bit --- include/linux/clocksource.h | 1 + kernel/latency_trace.c | 4 +++- kernel/sysctl.c | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) Index: linux-rt.q/include/linux/clocksource.h =================================================================== --- linux-rt.q.orig/include/linux/clocksource.h +++ linux-rt.q/include/linux/clocksource.h @@ -23,6 +23,7 @@ struct clocksource; extern unsigned long preempt_max_latency; extern unsigned long preempt_thresh; +extern unsigned long preempt_mark_thresh; /** * struct clocksource - hardware abstraction for a free running counter Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -1296,11 +1296,13 @@ static void notrace l_stop(struct seq_fi up(&out_mutex); } +unsigned long preempt_mark_thresh = 100; + static void print_timestamp(struct seq_file *m, unsigned long abs_usecs, unsigned long rel_usecs) { seq_printf(m, " %4ldus", abs_usecs); - if (rel_usecs > 100) + if (rel_usecs > preempt_mark_thresh) seq_puts(m, "!: "); else if (rel_usecs > 1) seq_puts(m, "+: "); Index: 
linux-rt.q/kernel/sysctl.c =================================================================== --- linux-rt.q.orig/kernel/sysctl.c +++ linux-rt.q/kernel/sysctl.c @@ -324,6 +324,14 @@ static ctl_table kern_table[] = { #ifdef CONFIG_EVENT_TRACE { .ctl_name = CTL_UNNUMBERED, + .procname = "preempt_mark_thresh", + .data = &preempt_mark_thresh, + .maxlen = sizeof(preempt_mark_thresh), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "trace_enabled", .data = &trace_enabled, .maxlen = sizeof(int), patches/rt-time-starvation-fix.patch0000664000077200007720000002212110646635216017106 0ustar mingomingoHey Ingo, Noticed -rt has been updated a few times and this is still missing so I figured I'd resend it just in case you missed it: We've worked around this before, but it's cropped up again. Since update_wall_time is now called from a softirq, it can be preempted by a high priority process. If it's preempted for long enough, the clocksource can wrap, causing time to stop incrementing, which if the preempting process is checking the time, can cause a hard lockup. This patch forces the clocksource to be read each tick, and accumulate only the cycle count. This allows the update_wall_time to be deferred w/o fear of hardware overflow. 
thanks -john arch/x86_64/kernel/vsyscall.c | 5 ++++- include/linux/clocksource.h | 40 ++++++++++++++++++++++++++++++++++++++-- include/linux/time.h | 1 + kernel/time/timekeeping.c | 34 ++++++++++++++++++---------------- kernel/timer.c | 1 + 5 files changed, 62 insertions(+), 19 deletions(-) linux-2.6.21-rc5_cycles-accumulated_C7.patch ============================================ Index: linux-rt.q/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/vsyscall.c +++ linux-rt.q/arch/x86_64/kernel/vsyscall.c @@ -93,6 +93,7 @@ void update_vsyscall(struct timespec *wa vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = clock->mult; vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.clock.cycle_accumulated = clock->cycle_accumulated; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.sys_tz = sys_tz; @@ -128,7 +129,7 @@ static __always_inline long time_syscall static __always_inline void do_vgettimeofday(struct timeval * tv) { - cycle_t now, base, mask, cycle_delta; + cycle_t now, base, accumulated, mask, cycle_delta; unsigned seq; unsigned long mult, shift, nsec; cycle_t (*vread)(void); @@ -161,6 +162,7 @@ static __always_inline void do_vgettimeo } now = vread(); base = __vsyscall_gtod_data.clock.cycle_last; + accumulated = __vsyscall_gtod_data.clock.cycle_accumulated; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; shift = __vsyscall_gtod_data.clock.shift; @@ -171,6 +173,7 @@ static __always_inline void do_vgettimeo /* calculate interval: */ cycle_delta = (now - base) & mask; + cycle_delta += accumulated; /* convert to nsecs: */ nsec += (cycle_delta * mult) >> shift; Index: linux-rt.q/include/linux/clocksource.h =================================================================== --- linux-rt.q.orig/include/linux/clocksource.h +++ 
linux-rt.q/include/linux/clocksource.h @@ -54,8 +54,12 @@ extern unsigned long preempt_mark_thresh * @flags: flags describing special properties * @vread: vsyscall based read * @resume: resume function for the clocksource, if necessary + * @cycle_last: Used internally by timekeeping core, please ignore. + * @cycle_accumulated: Used internally by timekeeping core, please ignore. * @cycle_interval: Used internally by timekeeping core, please ignore. * @xtime_interval: Used internally by timekeeping core, please ignore. + * @xtime_nsec: Used internally by timekeeping core, please ignore. + * @error: Used internally by timekeeping core, please ignore. */ struct clocksource { /* @@ -73,7 +77,7 @@ struct clocksource { void (*resume)(void); /* timekeeping specific data, ignore */ - cycle_t cycle_interval; + cycle_t cycle_accumulated, cycle_interval; u64 xtime_interval; /* * Second part is written at each timer interrupt @@ -166,11 +170,43 @@ static inline cycle_t clocksource_read(s } /** + * clocksource_get_cycles: - Access the clocksource's accumulated cycle value + * @cs: pointer to clocksource being read + * @now: current cycle value + * + * Uses the clocksource to return the current cycle_t value. + * NOTE!!!: This is different from clocksource_read, because it + * returns the accumulated cycle value! Must hold xtime lock! + */ +static inline cycle_t clocksource_get_cycles(struct clocksource *cs, cycle_t now) +{ + cycle_t offset = (now - cs->cycle_last) & cs->mask; + offset += cs->cycle_accumulated; + return offset; +} + +/** + * clocksource_accumulate: - Accumulates clocksource cycles + * @cs: pointer to clocksource being read + * @now: current cycle value + * + * Used to avoid clocksource hardware overflow by periodically + * accumulating the current cycle delta. Must hold xtime write lock! 
+ */ +static inline void clocksource_accumulate(struct clocksource *cs, cycle_t now) +{ + cycle_t offset = (now - cs->cycle_last) & cs->mask; + cs->cycle_last = now; + cs->cycle_accumulated += offset; +} + +/** * cyc2ns - converts clocksource cycles to nanoseconds * @cs: Pointer to clocksource * @cycles: Cycles * * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds. + * Must hold xtime lock! * * XXX - This could use some mult_lxl_ll() asm optimization */ @@ -200,7 +236,7 @@ static inline cycle_t ns2cyc(struct cloc * @length_nsec: Desired interval length in nanoseconds. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment - * pair and interval request. + * pair and interval request. Must hold xtime_lock! * * Unless you're the timekeeping code, you should not be using this! */ Index: linux-rt.q/include/linux/time.h =================================================================== --- linux-rt.q.orig/include/linux/time.h +++ linux-rt.q/include/linux/time.h @@ -97,6 +97,7 @@ extern unsigned long read_persistent_clo extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); +extern void timekeeping_accumulate(void); static inline unsigned long get_seconds(void) { Index: linux-rt.q/kernel/time/timekeeping.c =================================================================== --- linux-rt.q.orig/kernel/time/timekeeping.c +++ linux-rt.q/kernel/time/timekeeping.c @@ -56,16 +56,10 @@ static struct clocksource *clock; /* poi */ s64 __get_nsec_offset(void) { - cycle_t cycle_now, cycle_delta; + cycle_t cycle_delta; s64 ns_offset; - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ + cycle_delta = clocksource_get_cycles(clock, clocksource_read(clock)); ns_offset = cyc2ns(clock, 
cycle_delta); return ns_offset; @@ -247,7 +241,7 @@ static void change_clocksource(void) clock = new; clock->cycle_last = now; - + clock->cycle_accumulated = 0; clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -259,8 +253,14 @@ static void change_clocksource(void) clock->name); #endif } + +void timekeeping_accumulate(void) +{ + clocksource_accumulate(clock, clocksource_read(clock)); +} #else static inline void change_clocksource(void) { } +void timekeeping_accumulate(void) { } #endif /** @@ -349,6 +349,7 @@ static int timekeeping_resume(struct sys } /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); + clock->cycle_accumulated = 0; clock->error = 0; timekeeping_suspended = 0; warp_check_clock_was_changed(); @@ -497,27 +498,28 @@ static void clocksource_adjust(s64 offse */ void update_wall_time(void) { - cycle_t offset; + cycle_t cycle_now; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return; #ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; + cycle_now = (clocksource_read(clock) - clock->cycle_last) & clock->mask; #else - offset = clock->cycle_interval; + cycle_now = clock->cycle_interval; #endif + clocksource_accumulate(clock, cycle_now); + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. 
*/ - while (offset >= clock->cycle_interval) { + while (clock->cycle_accumulated >= clock->cycle_interval) { /* accumulate one interval */ clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; - offset -= clock->cycle_interval; + clock->cycle_accumulated -= clock->cycle_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; @@ -535,7 +537,7 @@ void update_wall_time(void) } /* correct the clock when NTP error is too big */ - clocksource_adjust(offset); + clocksource_adjust(clock->cycle_accumulated); /* store full nanoseconds into xtime */ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; Index: linux-rt.q/kernel/timer.c =================================================================== --- linux-rt.q.orig/kernel/timer.c +++ linux-rt.q/kernel/timer.c @@ -1017,6 +1017,7 @@ static void run_timer_softirq(struct sof void do_timer(unsigned long ticks) { jiffies_64 += ticks; + timekeeping_accumulate(); } #ifdef __ARCH_WANT_SYS_ALARM patches/ntp-move-the-cmos-update-code-into-ntpc-fix.patch0000664000077200007720000000151510646635210022716 0ustar mingomingoFrom: Andrew Morton alpha: include/linux/time.h:97: error: expected '=', ',', ';', 'asm' or '__attribute__' before '__read_mostly' Cc: Thomas Gleixner Cc: Chris Wright Cc: Ingo Molnar Cc: john stultz Cc: David Miller Cc: Roman Zippel Signed-off-by: Andrew Morton --- include/linux/time.h | 1 + 1 file changed, 1 insertion(+) Index: linux-rt.q/include/linux/time.h =================================================================== --- linux-rt.q.orig/include/linux/time.h +++ linux-rt.q/include/linux/time.h @@ -2,6 +2,7 @@ #define _LINUX_TIME_H #include +#include #ifdef __KERNEL__ # include patches/pagefault-disable-cleanup.patch0000664000077200007720000001337210646635216017557 0ustar mingomingoSubject: [patch] clean up the page fault disabling logic From: Ingo Molnar decouple the pagefault-disabled logic from 
the preempt count. Signed-off-by: Ingo Molnar --- arch/arm/mm/fault.c | 2 +- arch/i386/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/x86_64/mm/fault.c | 2 +- include/linux/sched.h | 1 + include/linux/uaccess.h | 33 +++------------------------------ kernel/fork.c | 1 + mm/memory.c | 22 ++++++++++++++++++++++ 9 files changed, 32 insertions(+), 35 deletions(-) Index: linux-rt.q/arch/arm/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/fault.c +++ linux-rt.q/arch/arm/mm/fault.c @@ -229,7 +229,7 @@ do_page_fault(unsigned long addr, unsign * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* Index: linux-rt.q/arch/i386/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/i386/mm/fault.c +++ linux-rt.q/arch/i386/mm/fault.c @@ -351,7 +351,7 @@ fastcall notrace void __kprobes do_page_ * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; /* When running in the kernel we expect faults to occur only to Index: linux-rt.q/arch/mips/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/mips/mm/fault.c +++ linux-rt.q/arch/mips/mm/fault.c @@ -68,7 +68,7 @@ asmlinkage void do_page_fault(struct pt_ * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); Index: linux-rt.q/arch/powerpc/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/powerpc/mm/fault.c +++ linux-rt.q/arch/powerpc/mm/fault.c @@ -184,7 +184,7 @@ int __kprobes notrace do_page_fault(stru } #endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ - if (in_atomic() || mm == NULL) { + if (in_atomic() || mm == NULL || current->pagefault_disabled) { if (!user_mode(regs)) return SIGSEGV; /* in_atomic() in user mode is really bad, Index: linux-rt.q/arch/x86_64/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/x86_64/mm/fault.c +++ linux-rt.q/arch/x86_64/mm/fault.c @@ -381,7 +381,7 @@ asmlinkage void __kprobes do_page_fault( * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (unlikely(in_atomic() || !mm)) + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) goto bad_area_nosemaphore; again: Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -1229,6 +1229,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; int hardirqs_enabled; Index: linux-rt.q/include/linux/uaccess.h =================================================================== --- linux-rt.q.orig/include/linux/uaccess.h +++ linux-rt.q/include/linux/uaccess.h @@ -6,37 +6,10 @@ /* * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. 
- * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. + * it will not take any MM locks and go straight to the fixup table. */ -static inline void pagefault_disable(void) -{ - inc_preempt_count(); - /* - * make sure to have issued the store before a pagefault - * can hit. - */ - barrier(); -} - -static inline void pagefault_enable(void) -{ - /* - * make sure to issue those last loads/stores before enabling - * the pagefault handler again. - */ - barrier(); - dec_preempt_count(); - /* - * make sure we do.. - */ - barrier(); - preempt_check_resched(); -} +extern void pagefault_disable(void); +extern void pagefault_enable(void); #ifndef ARCH_HAS_NOCACHE_UACCESS Index: linux-rt.q/kernel/fork.c =================================================================== --- linux-rt.q.orig/kernel/fork.c +++ linux-rt.q/kernel/fork.c @@ -1133,6 +1133,7 @@ static struct task_struct *copy_process( p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; Index: linux-rt.q/mm/memory.c =================================================================== --- linux-rt.q.orig/mm/memory.c +++ linux-rt.q/mm/memory.c @@ -2618,6 +2618,28 @@ unlock: return VM_FAULT_MINOR; } +void pagefault_disable(void) +{ + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. 
+ */ + barrier(); + current->pagefault_disabled--; +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore */ patches/x86_64-apic-remove-bogus-pit-synchronization.patch0000664000077200007720000000271010646635211023057 0ustar mingomingoSubject: x86_64: remove pit synchronization The APIC timer setup code synchronizes the local APIC timer to the PIT/HPET. This is pointless as the PIT and the local APIC timer frequency are not correlated and the APIC timer calibration can never be accurate enough to avoid that the local APIC timer and the PIT/HPET drift apart. Simply remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 20 -------------------- 1 file changed, 20 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -791,26 +791,6 @@ static void setup_APIC_timer(unsigned in local_irq_save(flags); - /* wait for irq slice */ - if (hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { - int c1, c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - do { - c1 = c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - } while (c2 - c1 < 300); - } - irqen = ! 
cpu_isset(smp_processor_id(), timer_interrupt_broadcast_ipi_mask); __setup_APIC_LVTT(clocks, 0, irqen); patches/smaller-trace.patch0000664000077200007720000000073410646635212015306 0ustar mingomingo--- kernel/latency_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -190,7 +190,7 @@ static int report_latency(cycle_t delta) /* * Number of per-CPU trace entries: */ -#define MAX_TRACE (65536UL*16UL) +#define MAX_TRACE (65536UL) #define CMDLINE_BYTES 16 patches/i386-prepare-sharing-hpet-code.patch0000664000077200007720000000516210646635211020176 0ustar mingomingoSubject: i386: prepare sharing the hpet code with x86_64 The hpet implementations of i386 and x8664 has been mostly the same before the clock events conversion of i386. The clock events conversion of i386 hpet is already done. So it makes sense to share the code for the x86_64 clock events conversion. Abstract out the mapping functions. 
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/i386/kernel/hpet.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -3,14 +3,14 @@ #include #include #include +#include #include #include #include +#include #include -extern struct clock_event_device *global_clock_event; - #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 @@ -21,7 +21,7 @@ extern struct clock_event_device *global * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; -static void __iomem * hpet_virt_address; +static void __iomem *hpet_virt_address; static inline unsigned long hpet_readl(unsigned long a) { @@ -33,6 +33,17 @@ static inline void hpet_writel(unsigned writel(d, hpet_virt_address + a); } +static inline void hpet_set_mapping(void) +{ + hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); +} + +static inline void hpet_clear_mapping(void) +{ + iounmap(hpet_virt_address); + hpet_virt_address = NULL; +} + /* * HPET command line enable / disable */ @@ -82,7 +93,7 @@ static void hpet_reserve_platform_timers memset(&hd, 0, sizeof (hd)); hd.hd_phys_address = hpet_address; - hd.hd_address = hpet_virt_address; + hd.hd_address = hpet; hd.hd_nirqs = nrtimers; hd.hd_flags = HPET_DATA_PLATFORM; hpet_reserve_timer(&hd, 0); @@ -237,7 +248,7 @@ int __init hpet_enable(void) if (!is_hpet_capable()) return 0; - hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_set_mapping(); /* * Read the period and check for a sane value: @@ -333,13 +344,11 @@ int __init hpet_enable(void) return 0; out_nohpet: - iounmap(hpet_virt_address); - hpet_virt_address = NULL; + hpet_clear_mapping(); boot_hpet_disable = 1; return 0; } - #ifdef CONFIG_HPET_EMULATE_RTC /* HPET in 
LegacyReplacement Mode eats up RTC interrupt line. When, HPET patches/hrtimer-speedup-hrtimer_enqueue.patch0000664000077200007720000000277510646635210021074 0ustar mingomingoFrom: Ingo Molnar Speedup hrtimer_enqueue by evaluating the rbtree insertion result. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton --- kernel/hrtimer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) Index: linux-rt.q/kernel/hrtimer.c =================================================================== --- linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -686,6 +686,7 @@ static void enqueue_hrtimer(struct hrtim struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; struct hrtimer *entry; + int leftmost = 1; /* * Find the right place in the rbtree: @@ -697,18 +698,19 @@ static void enqueue_hrtimer(struct hrtim * We dont care about collisions. Nodes with * the same expiry time stay together. */ - if (timer->expires.tv64 < entry->expires.tv64) + if (timer->expires.tv64 < entry->expires.tv64) { link = &(*link)->rb_left; - else + } else { link = &(*link)->rb_right; + leftmost = 0; + } } /* * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + if (leftmost) { /* * Reprogram the clock event device. 
When the timer is already * expired hrtimer_enqueue_reprogram has either called the patches/undo-latency-tracing-raw-spinlock-hack.patch0000664000077200007720000000071010646635213022110 0ustar mingomingo--- kernel/latency_trace.c | 3 --- 1 file changed, 3 deletions(-) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -40,9 +40,6 @@ int trace_use_raw_cycles = 0; -#define __raw_spinlock_t raw_spinlock_t -#define need_resched_delayed() 0 - #ifdef CONFIG_EVENT_TRACE /* * Convert raw cycles to usecs. patches/x86-64-tscless-vgettimeofday.patch0000664000077200007720000000311010646635216017745 0ustar mingomingoSubject: [patch] x86_64 GTOD: offer scalable vgettimeofday From: Ingo Molnar offer scalable vgettimeofday independently of whether the TSC is synchronous or not. Off by default. this patch also fixes an SMP bug in sys_vtime(): we should read __vsyscall_gtod_data.wall_time_tv.tv_sec only once. 
Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/vsyscall.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/vsyscall.c +++ linux-rt.q/arch/x86_64/kernel/vsyscall.c @@ -68,7 +68,7 @@ struct vsyscall_gtod_data_t { struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); - cycle_t cycle_last; + cycle_t cycle_last, cycle_accumulated; cycle_t mask; u32 mult; u32 shift; @@ -132,6 +132,25 @@ static __always_inline void do_vgettimeo unsigned seq; unsigned long mult, shift, nsec; cycle_t (*vread)(void); + + if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timeval tmp; + + do { + barrier(); + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec; + barrier(); + tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec; + tmp.tv_usec = __vsyscall_gtod_data.wall_time_nsec; + + } while (tmp.tv_usec != tv->tv_usec || + tmp.tv_sec != tv->tv_sec); + + tv->tv_usec /= NSEC_PER_USEC; + return; + } + do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); patches/hrtimer-no-getnstimeofday.patch0000664000077200007720000000545410646635213017664 0ustar mingomingo--- include/linux/time.h | 3 +++ kernel/hrtimer.c | 15 +++++++++++---- kernel/time/timekeeping.c | 2 +- kernel/timer.c | 7 ++++++- 4 files changed, 21 insertions(+), 6 deletions(-) Index: linux-rt.q/include/linux/time.h =================================================================== --- linux-rt.q.orig/include/linux/time.h +++ linux-rt.q/include/linux/time.h @@ -119,6 +119,9 @@ extern int do_setitimer(int which, struc extern unsigned int alarm_setitimer(unsigned int seconds); extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); +#ifdef CONFIG_GENERIC_TIME +s64 __get_nsec_offset(void); +#endif extern 
struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_is_continuous(void); Index: linux-rt.q/kernel/hrtimer.c =================================================================== --- linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -116,10 +116,13 @@ void ktime_get_ts(struct timespec *ts) { struct timespec tomono; unsigned long seq; + s64 nsecs; do { seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); + *ts = xtime; + nsecs = __get_nsec_offset(); + timespec_add_ns(ts, nsecs); tomono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); @@ -141,10 +144,14 @@ static void hrtimer_get_softirq_time(str do { seq = read_seqbegin(&xtime_lock); -#ifdef CONFIG_NO_HZ - getnstimeofday(&xts); -#else xts = xtime; +#ifdef CONFIG_NO_HZ + { + s64 nsecs; + + nsecs = __get_nsec_offset(); + timespec_add_ns(&xts, nsecs); + } #endif tom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); Index: linux-rt.q/kernel/time/timekeeping.c =================================================================== --- linux-rt.q.orig/kernel/time/timekeeping.c +++ linux-rt.q/kernel/time/timekeeping.c @@ -54,7 +54,7 @@ static struct clocksource *clock; /* poi * called. Returns the number of nanoseconds since the * last call to update_wall_time() (adjusted by NTP scaling) */ -static inline s64 __get_nsec_offset(void) +s64 __get_nsec_offset(void) { cycle_t cycle_now, cycle_delta; s64 ns_offset; Index: linux-rt.q/kernel/timer.c =================================================================== --- linux-rt.q.orig/kernel/timer.c +++ linux-rt.q/kernel/timer.c @@ -1102,6 +1102,8 @@ int do_sysinfo(struct sysinfo *info) do { struct timespec tp; + s64 nsecs; + seq = read_seqbegin(&xtime_lock); /* @@ -1111,7 +1113,10 @@ int do_sysinfo(struct sysinfo *info) * too. 
*/ - getnstimeofday(&tp); + tp = xtime; + nsecs = __get_nsec_offset(); + timespec_add_ns(&tp, nsecs); + tp.tv_sec += wall_to_monotonic.tv_sec; tp.tv_nsec += wall_to_monotonic.tv_nsec; if (tp.tv_nsec - NSEC_PER_SEC >= 0) { patches/preempt-realtime-arm-bagde4.patch0000664000077200007720000000224610646635214017732 0ustar mingomingo--- arch/arm/mach-sa1100/badge4.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/arm/mach-sa1100/badge4.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-sa1100/badge4.c +++ linux-rt.q/arch/arm/mach-sa1100/badge4.c @@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); GPCR = BADGE4_GPIO_PCMEN5V; } local_irq_restore(flags); + + /* detect on->off and off->on transitions */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); + } } EXPORT_SYMBOL(badge4_set_5V); patches/preempt-realtime-sh.patch0000664000077200007720000010117610646635215016444 0ustar mingomingoFrom lethal@linux-sh.org Fri Apr 27 10:21:47 2007 Date: Fri, 27 Apr 2007 10:21:47 +0900 From: Paul Mundt To: Thomas Gleixner , Ingo Molnar Subject: [PATCH] preempt-rt: Preliminary SH support Hi Thomas, Ingo, Here's preliminary preempt-rt support for SH. It was written against 2.6.21-rc5, but still applies cleanly. 
I've kept the clock events stuff out of this patch, since I'm planning on overhauling the timer stuff on SH first, but this should trickle in through 2.6.22-rc. Feel free to either merge this in to preempt-rt or hold off until the timer stuff gets done. Patch from Matsubara-san. Signed-off-by: Katsuya MATSUBARA Signed-off-by: Paul Mundt -- arch/sh/kernel/cpu/clock.c | 2 - arch/sh/kernel/cpu/sh4/sq.c | 2 - arch/sh/kernel/entry-common.S | 8 ++-- arch/sh/kernel/irq.c | 2 - arch/sh/kernel/process.c | 10 +++--- arch/sh/kernel/semaphore.c | 14 ++++++-- arch/sh/kernel/sh_ksyms.c | 9 ++--- arch/sh/kernel/signal.c | 7 ++++ arch/sh/kernel/time.c | 2 - arch/sh/kernel/traps.c | 2 - arch/sh/mm/cache-sh4.c | 12 +++---- arch/sh/mm/init.c | 2 - arch/sh/mm/pg-sh4.c | 8 ++-- arch/sh/mm/tlb-flush.c | 20 ++++++------ arch/sh/mm/tlb-sh4.c | 4 +- include/asm-sh/atomic-irq.h | 24 +++++++------- include/asm-sh/atomic.h | 8 ++-- include/asm-sh/bitops.h | 24 +++++++------- include/asm-sh/pgalloc.h | 2 - include/asm-sh/rwsem.h | 46 ++++++++++++++-------------- include/asm-sh/semaphore-helper.h | 8 ++-- include/asm-sh/semaphore.h | 61 +++++++++++++++++++++++--------------- include/asm-sh/system.h | 12 +++---- include/asm-sh/thread_info.h | 2 + 24 files changed, 160 insertions(+), 131 deletions(-) Index: linux-rt.q/arch/sh/kernel/cpu/clock.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/cpu/clock.c +++ linux-rt.q/arch/sh/kernel/cpu/clock.c @@ -28,7 +28,7 @@ #include static LIST_HEAD(clock_list); -static DEFINE_SPINLOCK(clock_lock); +static DEFINE_RAW_SPINLOCK(clock_lock); static DEFINE_MUTEX(clock_list_sem); /* Index: linux-rt.q/arch/sh/kernel/cpu/sh4/sq.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/cpu/sh4/sq.c +++ linux-rt.q/arch/sh/kernel/cpu/sh4/sq.c @@ -37,7 +37,7 @@ struct sq_mapping { }; static struct sq_mapping *sq_mapping_list; -static DEFINE_SPINLOCK(sq_mapping_lock); 
+static DEFINE_RAW_SPINLOCK(sq_mapping_lock); static struct kmem_cache *sq_cache; static unsigned long *sq_bitmap; Index: linux-rt.q/arch/sh/kernel/entry-common.S =================================================================== --- linux-rt.q.orig/arch/sh/kernel/entry-common.S +++ linux-rt.q/arch/sh/kernel/entry-common.S @@ -157,7 +157,7 @@ ENTRY(resume_userspace) mov.l @(TI_FLAGS,r8), r0 ! current_thread_info->flags tst #_TIF_WORK_MASK, r0 bt/s __restore_all - tst #_TIF_NEED_RESCHED, r0 + tst #_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED, r0 .align 2 work_pending: @@ -209,10 +209,10 @@ work_resched: tst #_TIF_WORK_MASK, r0 bt __restore_all bra work_pending - tst #_TIF_NEED_RESCHED, r0 + tst #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED, r0 .align 2 -1: .long schedule +1: .long __schedule 2: .long do_notify_resume 3: .long restore_all #ifdef CONFIG_TRACE_IRQFLAGS @@ -226,7 +226,7 @@ syscall_exit_work: ! r8: current_thread_info tst #_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP, r0 bt/s work_pending - tst #_TIF_NEED_RESCHED, r0 + tst #_TIF_NEED_RESCHED| _TIF_NEED_RESCHED_DELAYED, r0 #ifdef CONFIG_TRACE_IRQFLAGS mov.l 5f, r0 jsr @r0 Index: linux-rt.q/arch/sh/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/irq.c +++ linux-rt.q/arch/sh/kernel/irq.c @@ -82,7 +82,7 @@ static union irq_ctx *hardirq_ctx[NR_CPU static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; #endif -asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage notrace int do_IRQ(unsigned int irq, struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); #ifdef CONFIG_4KSTACKS Index: linux-rt.q/arch/sh/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/process.c +++ linux-rt.q/arch/sh/kernel/process.c @@ -62,7 +62,7 @@ void default_idle(void) clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); set_bl_bit(); - while 
(!need_resched()) + while (!need_resched() && !need_resched_delayed()) cpu_sleep(); clear_bl_bit(); set_thread_flag(TIF_POLLING_NRFLAG); @@ -83,13 +83,15 @@ void cpu_idle(void) idle = default_idle; tick_nohz_stop_sched_tick(); - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); check_pgt_cache(); } } Index: linux-rt.q/arch/sh/kernel/semaphore.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/semaphore.c +++ linux-rt.q/arch/sh/kernel/semaphore.c @@ -46,7 +46,7 @@ DEFINE_SPINLOCK(semaphore_wake_lock); * critical part is the inline stuff in * where we want to avoid any extra jumps and calls. */ -void __up(struct semaphore *sem) +void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_one_more(sem); wake_up(&sem->wait); @@ -104,7 +104,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __sched __down(struct semaphore * sem) +void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -114,7 +114,7 @@ void __sched __down(struct semaphore * s DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __sched __down_interruptible(struct semaphore * sem) +int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; DOWN_VAR @@ -133,7 +133,13 @@ int __sched __down_interruptible(struct return ret; } -int __down_trylock(struct semaphore * sem) +int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { return waking_non_zero_trylock(sem); } + +fastcall int __sched compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + Index: linux-rt.q/arch/sh/kernel/sh_ksyms.c 
=================================================================== --- linux-rt.q.orig/arch/sh/kernel/sh_ksyms.c +++ linux-rt.q/arch/sh/kernel/sh_ksyms.c @@ -26,7 +26,6 @@ EXPORT_SYMBOL(sh_mv); /* platform dependent support */ EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(no_irq_type); EXPORT_SYMBOL(strlen); @@ -50,9 +49,9 @@ EXPORT_SYMBOL(get_vm_area); #endif /* semaphore exports */ -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_up); +EXPORT_SYMBOL(__compat_down); +EXPORT_SYMBOL(__compat_down_interruptible); EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); @@ -98,7 +97,7 @@ EXPORT_SYMBOL(__flush_purge_region); EXPORT_SYMBOL(clear_user_page); #endif -EXPORT_SYMBOL(__down_trylock); +EXPORT_SYMBOL(__compat_down_trylock); #ifdef CONFIG_SMP EXPORT_SYMBOL(synchronize_irq); Index: linux-rt.q/arch/sh/kernel/signal.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/signal.c +++ linux-rt.q/arch/sh/kernel/signal.c @@ -565,6 +565,13 @@ static void do_signal(struct pt_regs *re struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-rt.q/arch/sh/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/time.c +++ linux-rt.q/arch/sh/kernel/time.c @@ -24,7 +24,7 @@ struct sys_timer *sys_timer; /* Move this somewhere more sensible.. 
*/ -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); /* Dummy RTC ops */ Index: linux-rt.q/arch/sh/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/traps.c +++ linux-rt.q/arch/sh/kernel/traps.c @@ -77,7 +77,7 @@ static void dump_mem(const char *str, un } } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * regs, long err) { Index: linux-rt.q/arch/sh/mm/cache-sh4.c =================================================================== --- linux-rt.q.orig/arch/sh/mm/cache-sh4.c +++ linux-rt.q/arch/sh/mm/cache-sh4.c @@ -203,7 +203,7 @@ void flush_cache_sigtramp(unsigned long index = CACHE_IC_ADDRESS_ARRAY | (v & current_cpu_data.icache.entry_mask); - local_irq_save(flags); + raw_local_irq_save(flags); jump_to_P2(); for (i = 0; i < current_cpu_data.icache.ways; @@ -212,7 +212,7 @@ void flush_cache_sigtramp(unsigned long back_to_P1(); wmb(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void flush_cache_4096(unsigned long start, @@ -228,10 +228,10 @@ static inline void flush_cache_4096(unsi (start < CACHE_OC_ADDRESS_ARRAY)) exec_offset = 0x20000000; - local_irq_save(flags); + raw_local_irq_save(flags); __flush_cache_4096(start | SH_CACHE_ASSOC, P1SEGADDR(phys), exec_offset); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -259,7 +259,7 @@ static inline void flush_icache_all(void { unsigned long flags, ccr; - local_irq_save(flags); + raw_local_irq_save(flags); jump_to_P2(); /* Flush I-cache */ @@ -273,7 +273,7 @@ static inline void flush_icache_all(void */ back_to_P1(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void flush_dcache_all(void) Index: linux-rt.q/arch/sh/mm/init.c =================================================================== --- linux-rt.q.orig/arch/sh/mm/init.c +++ linux-rt.q/arch/sh/mm/init.c @@ -20,7 +20,7 @@ #include 
#include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); pgd_t swapper_pg_dir[PTRS_PER_PGD]; void (*copy_page)(void *from, void *to); Index: linux-rt.q/arch/sh/mm/pg-sh4.c =================================================================== --- linux-rt.q.orig/arch/sh/mm/pg-sh4.c +++ linux-rt.q/arch/sh/mm/pg-sh4.c @@ -39,9 +39,9 @@ void clear_user_page(void *to, unsigned entry = pfn_pte(phys_addr >> PAGE_SHIFT, PAGE_KERNEL); mutex_lock(&p3map_mutex[(address & CACHE_ALIAS)>>12]); set_pte(pte, entry); - local_irq_save(flags); + raw_local_irq_save(flags); flush_tlb_one(get_asid(), p3_addr); - local_irq_restore(flags); + raw_local_irq_restore(flags); update_mmu_cache(NULL, p3_addr, entry); __clear_user_page((void *)p3_addr, to); pte_clear(&init_mm, p3_addr, pte); @@ -75,9 +75,9 @@ void copy_user_page(void *to, void *from entry = pfn_pte(phys_addr >> PAGE_SHIFT, PAGE_KERNEL); mutex_lock(&p3map_mutex[(address & CACHE_ALIAS)>>12]); set_pte(pte, entry); - local_irq_save(flags); + raw_local_irq_save(flags); flush_tlb_one(get_asid(), p3_addr); - local_irq_restore(flags); + raw_local_irq_restore(flags); update_mmu_cache(NULL, p3_addr, entry); __copy_user_page((void *)p3_addr, from, to); pte_clear(&init_mm, p3_addr, pte); Index: linux-rt.q/arch/sh/mm/tlb-flush.c =================================================================== --- linux-rt.q.orig/arch/sh/mm/tlb-flush.c +++ linux-rt.q/arch/sh/mm/tlb-flush.c @@ -24,7 +24,7 @@ void local_flush_tlb_page(struct vm_area asid = cpu_asid(cpu, vma->vm_mm); page &= PAGE_MASK; - local_irq_save(flags); + raw_local_irq_save(flags); if (vma->vm_mm != current->mm) { saved_asid = get_asid(); set_asid(asid); @@ -32,7 +32,7 @@ void local_flush_tlb_page(struct vm_area local_flush_tlb_one(asid, page); if (saved_asid != MMU_NO_ASID) set_asid(saved_asid); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -46,7 +46,7 @@ void local_flush_tlb_range(struct vm_are unsigned long 
flags; int size; - local_irq_save(flags); + raw_local_irq_save(flags); size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; if (size > (MMU_NTLB_ENTRIES/4)) { /* Too many TLB to flush */ cpu_context(cpu, mm) = NO_CONTEXT; @@ -71,7 +71,7 @@ void local_flush_tlb_range(struct vm_are if (saved_asid != MMU_NO_ASID) set_asid(saved_asid); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -81,7 +81,7 @@ void local_flush_tlb_kernel_range(unsign unsigned long flags; int size; - local_irq_save(flags); + raw_local_irq_save(flags); size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT; if (size > (MMU_NTLB_ENTRIES/4)) { /* Too many TLB to flush */ local_flush_tlb_all(); @@ -100,7 +100,7 @@ void local_flush_tlb_kernel_range(unsign } set_asid(saved_asid); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_mm(struct mm_struct *mm) @@ -112,11 +112,11 @@ void local_flush_tlb_mm(struct mm_struct if (cpu_context(cpu, mm) != NO_CONTEXT) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); cpu_context(cpu, mm) = NO_CONTEXT; if (mm == current->mm) activate_context(mm, cpu); - local_irq_restore(flags); + raw_local_irq_restore(flags); } } @@ -131,10 +131,10 @@ void local_flush_tlb_all(void) * TF-bit for SH-3, TI-bit for SH-4. * It's same position, bit #2. 
*/ - local_irq_save(flags); + raw_local_irq_save(flags); status = ctrl_inl(MMUCR); status |= 0x04; ctrl_outl(status, MMUCR); ctrl_barrier(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } Index: linux-rt.q/arch/sh/mm/tlb-sh4.c =================================================================== --- linux-rt.q.orig/arch/sh/mm/tlb-sh4.c +++ linux-rt.q/arch/sh/mm/tlb-sh4.c @@ -51,7 +51,7 @@ void update_mmu_cache(struct vm_area_str } } - local_irq_save(flags); + raw_local_irq_save(flags); /* Set PTEH register */ vpn = (address & MMU_VPN_MASK) | get_asid(); @@ -74,7 +74,7 @@ void update_mmu_cache(struct vm_area_str /* Load the TLB */ asm volatile("ldtlb": /* no output */ : /* no input */ : "memory"); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void local_flush_tlb_one(unsigned long asid, unsigned long page) Index: linux-rt.q/include/asm-sh/atomic-irq.h =================================================================== --- linux-rt.q.orig/include/asm-sh/atomic-irq.h +++ linux-rt.q/include/asm-sh/atomic-irq.h @@ -10,29 +10,29 @@ static inline void atomic_add(int i, ato { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *(long *)v += i; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void atomic_sub(int i, atomic_t *v) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *(long *)v -= i; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline int atomic_add_return(int i, atomic_t *v) { unsigned long temp, flags; - local_irq_save(flags); + raw_local_irq_save(flags); temp = *(long *)v; temp += i; *(long *)v = temp; - local_irq_restore(flags); + raw_local_irq_restore(flags); return temp; } @@ -41,11 +41,11 @@ static inline int atomic_sub_return(int { unsigned long temp, flags; - local_irq_save(flags); + raw_local_irq_save(flags); temp = *(long *)v; temp -= i; *(long *)v = temp; - local_irq_restore(flags); + raw_local_irq_restore(flags); return 
temp; } @@ -54,18 +54,18 @@ static inline void atomic_clear_mask(uns { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *(long *)v &= ~mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void atomic_set_mask(unsigned int mask, atomic_t *v) { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); *(long *)v |= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } #endif /* __ASM_SH_ATOMIC_IRQ_H */ Index: linux-rt.q/include/asm-sh/atomic.h =================================================================== --- linux-rt.q.orig/include/asm-sh/atomic.h +++ linux-rt.q/include/asm-sh/atomic.h @@ -49,11 +49,11 @@ static inline int atomic_cmpxchg(atomic_ int ret; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); ret = v->counter; if (likely(ret == old)) v->counter = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret; } @@ -65,11 +65,11 @@ static inline int atomic_add_unless(atom int ret; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); ret = v->counter; if (ret != u) v->counter += a; - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret != u; } Index: linux-rt.q/include/asm-sh/bitops.h =================================================================== --- linux-rt.q.orig/include/asm-sh/bitops.h +++ linux-rt.q/include/asm-sh/bitops.h @@ -14,9 +14,9 @@ static inline void set_bit(int nr, volat a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); *a |= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -32,9 +32,9 @@ static inline void clear_bit(int nr, vol a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); *a &= ~mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline void change_bit(int nr, volatile void * addr) @@ -45,9 +45,9 @@ static inline void change_bit(int nr, 
vo a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); *a ^= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline int test_and_set_bit(int nr, volatile void * addr) @@ -58,10 +58,10 @@ static inline int test_and_set_bit(int n a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); retval = (mask & *a) != 0; *a |= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } @@ -74,10 +74,10 @@ static inline int test_and_clear_bit(int a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); retval = (mask & *a) != 0; *a &= ~mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } @@ -90,10 +90,10 @@ static inline int test_and_change_bit(in a += nr >> 5; mask = 1 << (nr & 0x1f); - local_irq_save(flags); + raw_local_irq_save(flags); retval = (mask & *a) != 0; *a ^= mask; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } Index: linux-rt.q/include/asm-sh/pgalloc.h =================================================================== --- linux-rt.q.orig/include/asm-sh/pgalloc.h +++ linux-rt.q/include/asm-sh/pgalloc.h @@ -13,7 +13,7 @@ static inline void pmd_populate_kernel(s set_pmd(pmd, __pmd((unsigned long)pte)); } -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, +static inline void notrace pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { set_pmd(pmd, __pmd((unsigned long)page_address(pte))); Index: linux-rt.q/include/asm-sh/rwsem.h =================================================================== --- linux-rt.q.orig/include/asm-sh/rwsem.h +++ linux-rt.q/include/asm-sh/rwsem.h @@ -15,7 +15,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct compat_rw_semaphore { long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -23,7 +23,7 @@ struct rw_semaphore { #define 
RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -41,25 +41,25 @@ struct rw_semaphore { LIST_HEAD_INIT((name).wait_list) \ __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem); -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define compat_init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __compat_init_rwsem((sem), #sem, &__key); \ } while (0) -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void compat_init_rwsem(struct rw_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -69,7 +69,7 @@ static inline void init_rwsem(struct rw_ /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void 
__down_read(struct compat_rw_semaphore *sem) { if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) smp_wmb(); @@ -77,7 +77,7 @@ static inline void __down_read(struct rw rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -94,7 +94,7 @@ static inline int __down_read_trylock(st /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct compat_rw_semaphore *sem) { int tmp; @@ -106,7 +106,7 @@ static inline void __down_write(struct r rwsem_down_write_failed(sem); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -119,7 +119,7 @@ static inline int __down_write_trylock(s /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct compat_rw_semaphore *sem) { int tmp; @@ -132,7 +132,7 @@ static inline void __up_read(struct rw_s /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct compat_rw_semaphore *sem) { smp_wmb(); if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, @@ -143,7 +143,7 @@ static inline void __up_write(struct rw_ /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -151,7 +151,7 @@ static inline void rwsem_atomic_add(int /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct compat_rw_semaphore *sem) { int tmp; @@ -161,7 +161,7 @@ static inline void __downgrade_write(str rwsem_downgrade_wake(sem); } -static inline void __down_write_nested(struct 
rw_semaphore *sem, int subclass) +static inline void __down_write_nested(struct compat_rw_semaphore *sem, int subclass) { __down_write(sem); } @@ -169,13 +169,13 @@ static inline void __down_write_nested(s /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) { smp_mb(); return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->count != 0); } Index: linux-rt.q/include/asm-sh/semaphore-helper.h =================================================================== --- linux-rt.q.orig/include/asm-sh/semaphore-helper.h +++ linux-rt.q/include/asm-sh/semaphore-helper.h @@ -14,12 +14,12 @@ * This is trivially done with load_locked/store_cond, * which we have. Let the rest of the losers suck eggs. */ -static __inline__ void wake_one_more(struct semaphore * sem) +static __inline__ void wake_one_more(struct compat_semaphore * sem) { atomic_inc((atomic_t *)&sem->sleepers); } -static __inline__ int waking_non_zero(struct semaphore *sem) +static __inline__ int waking_non_zero(struct compat_semaphore *sem) { unsigned long flags; int ret = 0; @@ -43,7 +43,7 @@ static __inline__ int waking_non_zero(st * protected by the spinlock in order to make atomic this atomic_inc() with the * atomic_read() in wake_one_more(), otherwise we can race. -arca */ -static __inline__ int waking_non_zero_interruptible(struct semaphore *sem, +static __inline__ int waking_non_zero_interruptible(struct compat_semaphore *sem, struct task_struct *tsk) { unsigned long flags; @@ -70,7 +70,7 @@ static __inline__ int waking_non_zero_in * protected by the spinlock in order to make atomic this atomic_inc() with the * atomic_read() in wake_one_more(), otherwise we can race. 
-arca */ -static __inline__ int waking_non_zero_trylock(struct semaphore *sem) +static __inline__ int waking_non_zero_trylock(struct compat_semaphore *sem) { unsigned long flags; int ret = 1; Index: linux-rt.q/include/asm-sh/semaphore.h =================================================================== --- linux-rt.q.orig/include/asm-sh/semaphore.h +++ linux-rt.q/include/asm-sh/semaphore.h @@ -20,29 +20,36 @@ #include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) -#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,0) -static inline void sema_init (struct semaphore *sem, int val) +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { /* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); + * *sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val); * * i'd rather use the more flexible initialization above, but sadly * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. 
@@ -52,14 +59,14 @@ static inline void sema_init (struct sem init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } #if 0 @@ -69,36 +76,36 @@ asmlinkage int __down_failed_trylock(vo asmlinkage void __up_wakeup(void /* special register calling convention */); #endif -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); +asmlinkage void __compat_down(struct compat_semaphore * sem); +asmlinkage int __compat_down_interruptible(struct compat_semaphore * sem); +asmlinkage int __compat_down_trylock(struct compat_semaphore * sem); +asmlinkage void __compat_up(struct compat_semaphore * sem); extern spinlock_t semaphore_wake_lock; -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); if (atomic_dec_return(&sem->count) < 0) - __down(sem); + __compat_down(sem); } -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; might_sleep(); if (atomic_dec_return(&sem->count) < 0) - ret = __down_interruptible(sem); + ret = __compat_down_interruptible(sem); return ret; } -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { int ret = 0; if (atomic_dec_return(&sem->count) < 0) - ret = __down_trylock(sem); + ret = __compat_down_trylock(sem); return ret; } @@ -106,11 +113,17 @@ static inline int down_trylock(struct se * Note! This is subtle. 
We jump to wake people up only if * the semaphore was negative (== somebody was waiting on it). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { if (atomic_inc_return(&sem->count) <= 0) - __up(sem); + __compat_up(sem); } +extern int compat_sem_is_locked(struct compat_semaphore *sem); + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif #endif /* __ASM_SH_SEMAPHORE_H */ Index: linux-rt.q/include/asm-sh/system.h =================================================================== --- linux-rt.q.orig/include/asm-sh/system.h +++ linux-rt.q/include/asm-sh/system.h @@ -164,10 +164,10 @@ static inline unsigned long xchg_u32(vol { unsigned long flags, retval; - local_irq_save(flags); + raw_local_irq_save(flags); retval = *m; *m = val; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } @@ -175,10 +175,10 @@ static inline unsigned long xchg_u8(vola { unsigned long flags, retval; - local_irq_save(flags); + raw_local_irq_save(flags); retval = *m; *m = val & 0xff; - local_irq_restore(flags); + raw_local_irq_restore(flags); return retval; } @@ -213,11 +213,11 @@ static inline unsigned long __cmpxchg_u3 __u32 retval; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); retval = *m; if (retval == old) *m = new; - local_irq_restore(flags); /* implies memory barrier */ + raw_local_irq_restore(flags); /* implies memory barrier */ return retval; } Index: linux-rt.q/include/asm-sh/thread_info.h =================================================================== --- linux-rt.q.orig/include/asm-sh/thread_info.h +++ linux-rt.q/include/asm-sh/thread_info.h @@ -112,6 +112,7 @@ static inline struct thread_info *curren #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ #define TIF_SINGLESTEP 5 /* singlestepping active */ +#define TIF_NEED_RESCHED_DELAYED 6 /* reschedule on return 
to userspace */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 18 @@ -123,6 +124,7 @@ static inline struct thread_info *curren #define _TIF_NEED_RESCHED (1<s_files iteration semantics Ensure no new files will be added when we're inspecting 'all' files. Without this, files could be added in front while we're iterating and we'd miss those. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- fs/file_table.c | 9 +++++++++ fs/super.c | 3 +++ include/linux/fs.h | 2 ++ security/selinux/selinuxfs.c | 7 ++++--- 4 files changed, 18 insertions(+), 3 deletions(-) Index: linux-rt.q/fs/file_table.c =================================================================== --- linux-rt.q.orig/fs/file_table.c +++ linux-rt.q/fs/file_table.c @@ -353,6 +353,12 @@ EXPORT_SYMBOL_GPL(filevec_add_drain_all) void file_kill(struct file *file) { + if (file && file->f_mapping && file->f_mapping->host) { + struct super_block *sb = file->f_mapping->host->i_sb; + if (sb) + barrier_sync(&sb->s_barrier); + } + if (file_flag(file, F_SUPERBLOCK)) { void **ptr; @@ -411,6 +417,7 @@ int fs_may_remount_ro(struct super_block struct file *file; /* Check that no files are currently opened for writing. */ + barrier_lock(&sb->s_barrier); filevec_add_drain_all(); lock_list_for_each_entry(file, &sb->s_files, f_u.fu_llist) { struct inode *inode = file->f_path.dentry->d_inode; @@ -423,9 +430,11 @@ int fs_may_remount_ro(struct super_block if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) goto too_bad; } + barrier_unlock(&sb->s_barrier); return 1; /* Tis' cool bro. 
*/ too_bad: lock_list_for_each_entry_stop(file, f_u.fu_llist); + barrier_unlock(&sb->s_barrier); return 0; } Index: linux-rt.q/fs/super.c =================================================================== --- linux-rt.q.orig/fs/super.c +++ linux-rt.q/fs/super.c @@ -68,6 +68,7 @@ static struct super_block *alloc_super(s INIT_LIST_HEAD(&s->s_dirty); INIT_LIST_HEAD(&s->s_io); INIT_LOCK_LIST_HEAD(&s->s_files); + init_barrier(&s->s_barrier); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -569,11 +570,13 @@ static void mark_files_ro(struct super_b { struct file *f; + barrier_lock(&sb->s_barrier); filevec_add_drain_all(); lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) { if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f)) f->f_mode &= ~FMODE_WRITE; } + barrier_unlock(&sb->s_barrier); } /** Index: linux-rt.q/include/linux/fs.h =================================================================== --- linux-rt.q.orig/include/linux/fs.h +++ linux-rt.q/include/linux/fs.h @@ -284,6 +284,7 @@ extern int dir_notify_enable; #include #include #include +#include #include #include @@ -957,6 +958,7 @@ struct super_block { struct list_head s_io; /* parked for writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ struct lock_list_head s_files; + struct barrier s_barrier; struct block_device *s_bdev; struct mtd_info *s_mtd; Index: linux-rt.q/security/selinux/selinuxfs.c =================================================================== --- linux-rt.q.orig/security/selinux/selinuxfs.c +++ linux-rt.q/security/selinux/selinuxfs.c @@ -967,9 +967,9 @@ static void sel_remove_bools(struct dent spin_unlock(&dcache_lock); - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file * filp = list_entry(p, struct file, f_u.fu_list); + barrier_lock(&sb->s_barrier); + filevec_add_drain_all(); + lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) { struct dentry * dentry = 
filp->f_path.dentry; if (dentry->d_parent != de) { @@ -977,6 +977,7 @@ static void sel_remove_bools(struct dent } filp->f_op = NULL; } + barrier_unlock(&sb->s_barrier); } #define BOOL_DIR_NAME "booleans" patches/rt-mutex-spinlock-nested-export-fix.patch0000664000077200007720000000543510646635214021550 0ustar mingomingoFrom linux-kernel-owner@vger.kernel.org Wed May 23 03:41:16 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.1 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by mail.tglx.de (Postfix) with ESMTP id 9CF7665C065 for ; Wed, 23 May 2007 03:41:16 +0200 (CEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1760713AbXEWBkE (ORCPT ); Tue, 22 May 2007 21:40:04 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1757805AbXEWBj4 (ORCPT ); Tue, 22 May 2007 21:39:56 -0400 Received: from rwcrmhc15.comcast.net ([204.127.192.85]:46662 "EHLO rwcrmhc15.comcast.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1757432AbXEWBjz (ORCPT ); Tue, 22 May 2007 21:39:55 -0400 Received: from sx.thebigcorporation.com ([69.181.45.228]) by comcast.net (rwcrmhc15) with ESMTP id <20070523013954m1500dteb1e>; Wed, 23 May 2007 01:39:55 +0000 Received: from sx.thebigcorporation.com (localhost.localdomain [127.0.0.1]) by sx.thebigcorporation.com (8.13.8/8.13.8) with ESMTP id l4N1dsjb030388; Tue, 22 May 2007 18:39:54 -0700 Received: (from sven@localhost) by sx.thebigcorporation.com (8.13.8/8.13.8/Submit) id l4N1drBv030387; Tue, 22 May 2007 18:39:53 -0700 X-Authentication-Warning: sx.thebigcorporation.com: sven set sender to sven@thebigcorporation.com using -f Subject: [PATCH 2.6.21-rt6] From: Sven-Thorsten Dietrich To: LKML Cc: Ingo Molnar Content-Type: text/plain Organization: The Big Corporation Date: Tue, 22 May 2007 18:39:53 -0700 Message-Id: 
<1179884393.25500.86.camel@sx.thebigcorporation.com> Mime-Version: 1.0 X-Mailer: Evolution 2.8.3 (2.8.3-2.fc6) Sender: linux-kernel-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org X-Filter-To: .Kernel.LKML X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit This patch properly exports __spin_lock_irqsave_nested. signed-off-by: Sven-Thorsten Dietrich --- kernel/spinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/kernel/spinlock.c =================================================================== --- linux-rt.q.orig/kernel/spinlock.c +++ linux-rt.q/kernel/spinlock.c @@ -366,7 +366,7 @@ __spin_lock_irqsave_nested(raw_spinlock_ #endif return flags; } -EXPORT_SYMBOL(_spin_lock_irqsave_nested); +EXPORT_SYMBOL(__spin_lock_irqsave_nested); #endif patches/preempt-realtime-powerpc-add-raw-relax-macros.patch0000664000077200007720000000213210646635215023411 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Mon May 14 15:26:25 2007 Date: Mon, 14 May 2007 15:26:25 +0900 From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [patch 1/4] powerpc 2.6.21-rt1: fix a build breakage by adding __raw_*_relax() macros Add missing macros to fix a build breakage for PREEMPT_DESKTOP. 
Signed-off-by: Tsutomu OWA -- owa --- include/asm-powerpc/spinlock.h | 4 ++++ 1 file changed, 4 insertions(+) Index: linux-rt.q/include/asm-powerpc/spinlock.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/spinlock.h +++ linux-rt.q/include/asm-powerpc/spinlock.h @@ -289,5 +289,9 @@ static __inline__ void __raw_write_unloc #define _raw_read_relax(lock) __rw_yield(lock) #define _raw_write_relax(lock) __rw_yield(lock) +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ patches/ppc-add-mcount.patch0000664000077200007720000001013510646635212015362 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Mon May 14 10:15:30 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=UNPARSEABLE_RELAY autolearn=ham version=3.1.7-deb Received: from imx12.toshiba.co.jp (imx12.toshiba.co.jp [61.202.160.132]) by mail.tglx.de (Postfix) with ESMTP id 7006365C065 for ; Mon, 14 May 2007 10:15:30 +0200 (CEST) Received: from wall11.toshiba.co.jp (wall11 [133.199.90.149]) by imx12.toshiba.co.jp with ESMTP id l4E8FKmi007480; Mon, 14 May 2007 17:15:20 +0900 (JST) Received: (from root@localhost) by wall11.toshiba.co.jp id l4E8FKaH003434; Mon, 14 May 2007 17:15:20 +0900 (JST) Received: from ovp11.toshiba.co.jp [133.199.90.148] by wall11.toshiba.co.jp with ESMTP id TAA03430; Mon, 14 May 2007 17:15:20 +0900 Received: from mx2.toshiba.co.jp (localhost [127.0.0.1]) by ovp11.toshiba.co.jp with ESMTP id l4E8FJCq025717; Mon, 14 May 2007 17:15:19 +0900 (JST) Received: from rdcgw.rdc.toshiba.co.jp by toshiba.co.jp id l4E8FJ3Y013473; Mon, 14 May 2007 17:15:19 +0900 (JST) Received: from island.swc.toshiba.co.jp by rdcgw.rdc.toshiba.co.jp (8.8.8p2+Sun/3.7W) with ESMTP id RAA01521; Mon, 14 May 2007 17:15:18 +0900 (JST) 
Received: from forest.toshiba.co.jp (forest [133.196.122.2]) by island.swc.toshiba.co.jp (Postfix) with ESMTP id 87FCB40002; Mon, 14 May 2007 17:15:10 +0900 (JST) Date: Mon, 14 May 2007 17:15:10 +0900 Message-ID: From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [patch 1/5] powerpc 2.6.21-rt1: add mcount() and _mcount() In-Reply-To: References: User-Agent: Wanderlust/2.8.1 (Something) Emacs/20.7 Mule/4.0 (HANANOEN) Organization: Software Engineering Center, TOSHIBA. MIME-Version: 1.0 (generated by SEMI 1.14.4 - "Hosorogi") Content-Type: text/plain; charset=US-ASCII X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit add mcount() and _mcount() for latency trace support. Signed-off-by: Tsutomu OWA -- owa --- arch/powerpc/kernel/entry_64.S | 60 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) Index: linux-rt.q/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/entry_64.S +++ linux-rt.q/arch/powerpc/kernel/entry_64.S @@ -826,3 +826,63 @@ _GLOBAL(enter_prom) ld r0,16(r1) mtlr r0 blr + +#ifdef CONFIG_MCOUNT +/* + * code almost taken from entry_32.S + */ +#define MCOUNT_FRAME_SIZE 32 +_GLOBAL(mcount) + stdu r1,-MCOUNT_FRAME_SIZE(r1) + mflr r3 + + LOAD_REG_ADDR(r5,mcount_enabled) + lwz r5,0(r5) + std r3,MCOUNT_FRAME_SIZE+16(r1) + cmpwi r5,0 + beq 1f + + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + ld r4,MCOUNT_FRAME_SIZE(r1) + ld r4,16(r4) + bl .__trace + nop +1: + ld r0,MCOUNT_FRAME_SIZE+16(r1) + mtlr r0 + addi r1,r1,MCOUNT_FRAME_SIZE + blr + +/* + * Based on glibc-2.4/sysdeps/powerpc/powerpc64/ppc-mcount.S + * + * We don't need to save the parameter-passing registers as gcc takes + * care of that for us. Thus this function looks fairly normal. + * In fact, the generic code would work for us. 
+ */ +_GLOBAL(_mcount) + /* return if we're in real mode. */ + mfmsr r3 + andi. r0,r3,MSR_IR|MSR_DR /* see if relocation is on? */ + beqlr /* if not, do nothing. */ + /* we're in translation mode. keep going. */ + mflr r3 + ld r11,0(r1) /* load back chain ptr */ + stdu r1,-STACK_FRAME_OVERHEAD(r1) + std r3,STACK_FRAME_OVERHEAD+16(r1) + ld r4,16(r11) /* LR in back chain */ + LOAD_REG_ADDR(r5,mcount_enabled) + lwz r5,0(r5) + cmpwi r5,0 /* see if mcount_enabled? */ + beq 1f /* if disabled, then skip */ + + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + bl .__trace + nop +1: + ld r0,STACK_FRAME_OVERHEAD+16(r1) /* restore saved LR */ + mtlr r0 + addi r1,r1,STACK_FRAME_OVERHEAD + blr + +#endif /* CONFIG_MCOUNT */ patches/add-notrace.patch0000664000077200007720000000066010646635212014732 0ustar mingomingo--- include/linux/linkage.h | 2 ++ 1 file changed, 2 insertions(+) Index: linux-rt.q/include/linux/linkage.h =================================================================== --- linux-rt.q.orig/include/linux/linkage.h +++ linux-rt.q/include/linux/linkage.h @@ -3,6 +3,8 @@ #include +#define notrace __attribute ((no_instrument_function)) + #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else patches/preempt-realtime-powerpc.patch0000664000077200007720000003755310646635214017517 0ustar mingomingo--- arch/powerpc/kernel/smp.c | 12 ++++++++- arch/powerpc/kernel/traps.c | 9 +++++- arch/powerpc/platforms/cell/smp.c | 2 - arch/powerpc/platforms/chrp/smp.c | 2 - arch/powerpc/platforms/chrp/time.c | 2 - arch/powerpc/platforms/powermac/feature.c | 2 - arch/powerpc/platforms/powermac/nvram.c | 2 - arch/powerpc/platforms/powermac/pic.c | 2 - arch/powerpc/platforms/pseries/smp.c | 2 - arch/ppc/8260_io/enet.c | 2 - arch/ppc/8260_io/fcc_enet.c | 2 - arch/ppc/8xx_io/commproc.c | 2 - arch/ppc/8xx_io/enet.c | 2 - arch/ppc/8xx_io/fec.c | 2 - arch/ppc/kernel/smp.c | 12 ++++++++- arch/ppc/kernel/traps.c | 6 +++- arch/ppc/platforms/hdpu.c | 2 - 
arch/ppc/platforms/sbc82xx.c | 2 - arch/ppc/syslib/cpm2_common.c | 2 - arch/ppc/syslib/open_pic.c | 2 - arch/ppc/syslib/open_pic2.c | 2 - include/asm-powerpc/hw_irq.h | 40 ++++++++++++++++++------------ 22 files changed, 76 insertions(+), 37 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/smp.c +++ linux-rt.q/arch/powerpc/kernel/smp.c @@ -126,6 +126,16 @@ void smp_send_reschedule(int cpu) smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_DEBUGGER void smp_send_debugger_break(int cpu) { @@ -162,7 +172,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. 
*/ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux-rt.q/arch/powerpc/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/traps.c +++ linux-rt.q/arch/powerpc/kernel/traps.c @@ -97,11 +97,11 @@ static inline void pmac_backlight_unblan int die(const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock = _RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -177,6 +177,11 @@ void _exception(int signr, struct pt_reg return; } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; Index: linux-rt.q/arch/powerpc/platforms/cell/smp.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/cell/smp.c +++ linux-rt.q/arch/powerpc/platforms/cell/smp.c @@ -133,7 +133,7 @@ static void __devinit smp_iic_setup_cpu( iic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit cell_give_timebase(void) Index: linux-rt.q/arch/powerpc/platforms/chrp/smp.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/chrp/smp.c +++ linux-rt.q/arch/powerpc/platforms/chrp/smp.c @@ -44,7 +44,7 @@ static void __devinit smp_chrp_setup_cpu mpic_setup_this_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit smp_chrp_give_timebase(void) Index: 
linux-rt.q/arch/powerpc/platforms/chrp/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/chrp/time.c +++ linux-rt.q/arch/powerpc/platforms/chrp/time.c @@ -27,7 +27,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = NVRAM_AS0; Index: linux-rt.q/arch/powerpc/platforms/powermac/feature.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/powermac/feature.c +++ linux-rt.q/arch/powerpc/platforms/powermac/feature.c @@ -59,7 +59,7 @@ extern struct device_node *k2_skiplist[2 * We use a single global lock to protect accesses. Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: linux-rt.q/arch/powerpc/platforms/powermac/nvram.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/powermac/nvram.c +++ linux-rt.q/arch/powerpc/platforms/powermac/nvram.c @@ -80,7 +80,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); Index: linux-rt.q/arch/powerpc/platforms/powermac/pic.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/powermac/pic.c +++ linux-rt.q/arch/powerpc/platforms/powermac/pic.c @@ -63,7 +63,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static 
unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; Index: linux-rt.q/arch/powerpc/platforms/pseries/smp.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/pseries/smp.c +++ linux-rt.q/arch/powerpc/platforms/pseries/smp.c @@ -154,7 +154,7 @@ static void __devinit smp_xics_setup_cpu } #endif /* CONFIG_XICS */ -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit pSeries_give_timebase(void) Index: linux-rt.q/arch/ppc/8260_io/enet.c =================================================================== --- linux-rt.q.orig/arch/ppc/8260_io/enet.c +++ linux-rt.q/arch/ppc/8260_io/enet.c @@ -115,7 +115,7 @@ struct scc_enet_private { scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux-rt.q/arch/ppc/8260_io/fcc_enet.c =================================================================== --- linux-rt.q.orig/arch/ppc/8260_io/fcc_enet.c +++ linux-rt.q/arch/ppc/8260_io/fcc_enet.c @@ -375,7 +375,7 @@ struct fcc_enet_private { volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux-rt.q/arch/ppc/8xx_io/commproc.c =================================================================== --- linux-rt.q.orig/arch/ppc/8xx_io/commproc.c +++ linux-rt.q/arch/ppc/8xx_io/commproc.c @@ -370,7 +370,7 @@ cpm_setbrg(uint brg, uint rate) /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... 
Index: linux-rt.q/arch/ppc/8xx_io/enet.c =================================================================== --- linux-rt.q.orig/arch/ppc/8xx_io/enet.c +++ linux-rt.q/arch/ppc/8xx_io/enet.c @@ -142,7 +142,7 @@ struct scc_enet_private { unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux-rt.q/arch/ppc/8xx_io/fec.c =================================================================== --- linux-rt.q.orig/arch/ppc/8xx_io/fec.c +++ linux-rt.q/arch/ppc/8xx_io/fec.c @@ -164,7 +164,7 @@ struct fec_enet_private { struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux-rt.q/arch/ppc/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/ppc/kernel/smp.c +++ linux-rt.q/arch/ppc/kernel/smp.c @@ -136,6 +136,16 @@ void smp_send_reschedule(int cpu) smp_message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0); +} + #ifdef CONFIG_XMON void smp_send_xmon_break(int cpu) { @@ -160,7 +170,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. 
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux-rt.q/arch/ppc/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/ppc/kernel/traps.c +++ linux-rt.q/arch/ppc/kernel/traps.c @@ -72,7 +72,7 @@ void (*debugger_fault_handler)(struct pt * Trap & Exception support */ -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); int die(const char * str, struct pt_regs * fp, long err) { @@ -107,6 +107,10 @@ void _exception(int signr, struct pt_reg debugger(regs); die("Exception in kernel mode", regs, signr); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif info.si_signo = signr; info.si_errno = 0; info.si_code = code; Index: linux-rt.q/arch/ppc/platforms/hdpu.c =================================================================== --- linux-rt.q.orig/arch/ppc/platforms/hdpu.c +++ linux-rt.q/arch/ppc/platforms/hdpu.c @@ -55,7 +55,7 @@ static void parse_bootinfo(unsigned long static void hdpu_set_l1pe(void); static void hdpu_cpustate_set(unsigned char new_state); #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; extern int smp_tb_synchronized; Index: linux-rt.q/arch/ppc/platforms/sbc82xx.c =================================================================== --- linux-rt.q.orig/arch/ppc/platforms/sbc82xx.c +++ linux-rt.q/arch/ppc/platforms/sbc82xx.c @@ -65,7 +65,7 @@ static void sbc82xx_time_init(void) static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { Index: linux-rt.q/arch/ppc/syslib/cpm2_common.c =================================================================== --- 
linux-rt.q.orig/arch/ppc/syslib/cpm2_common.c +++ linux-rt.q/arch/ppc/syslib/cpm2_common.c @@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... */ static rh_block_t cpm_boot_dpmem_rh_block[16]; Index: linux-rt.q/arch/ppc/syslib/open_pic.c =================================================================== --- linux-rt.q.orig/arch/ppc/syslib/open_pic.c +++ linux-rt.q/arch/ppc/syslib/open_pic.c @@ -526,7 +526,7 @@ void openpic_reset_processor_phys(u_int } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP Index: linux-rt.q/arch/ppc/syslib/open_pic2.c =================================================================== --- linux-rt.q.orig/arch/ppc/syslib/open_pic2.c +++ linux-rt.q/arch/ppc/syslib/open_pic2.c @@ -380,7 +380,7 @@ static void openpic2_set_spurious(u_int vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic2_setup_lock); /* * Initialize a timer interrupt (and disable it) Index: linux-rt.q/include/asm-powerpc/hw_irq.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/hw_irq.h +++ linux-rt.q/include/asm-powerpc/hw_irq.h @@ -20,8 +20,8 @@ static inline unsigned long local_get_fl { unsigned long flags; - __asm__ __volatile__("lbz %0,%1(13)" - : "=r" (flags) +<<<<<<< delete extern unsigned long local_get_flags(void); +<<<<<<< delete extern unsigned long local_irq_disable(void); : "i" (offsetof(struct paca_struct, soft_enabled))); return flags; @@ -39,14 +39,19 @@ static inline unsigned long local_irq_di return flags; } -extern void local_irq_restore(unsigned long); + extern void iseries_handle_interrupts(void); +extern unsigned long 
raw_local_get_flags(void); +extern unsigned long raw_local_irq_disable(void); +extern void raw_local_irq_restore(unsigned long); + +#define raw_local_irq_enable() raw_local_irq_restore(1) +#define raw_local_save_flags(flags) ((flags) = raw_local_get_flags()) +#define raw_local_irq_save(flags) ((flags) = raw_local_irq_disable()) -#define local_irq_enable() local_irq_restore(1) -#define local_save_flags(flags) ((flags) = local_get_flags()) -#define local_irq_save(flags) ((flags) = local_irq_disable()) +#define raw_irqs_disabled() (raw_local_get_flags() == 0) +#define raw_irqs_disabled_flags(flags) ((flags) == 0) -#define irqs_disabled() (local_get_flags() == 0) #define __hard_irq_enable() __mtmsrd(mfmsr() | MSR_EE, 1) #define __hard_irq_disable() __mtmsrd(mfmsr() & ~MSR_EE, 1) @@ -62,13 +67,15 @@ extern void iseries_handle_interrupts(vo #if defined(CONFIG_BOOKE) #define SET_MSR_EE(x) mtmsr(x) -#define local_irq_restore(flags) __asm__ __volatile__("wrtee %0" : : "r" (flags) : "memory") +#define raw_local_irq_restore(flags) __asm__ __volatile__("wrtee %0" : : "r" (flags) : "memory") +<<<<<<< delete #define local_irq_restore(flags) do { \ +#define raw_local_irq_restore(flags) do { \ #else #define SET_MSR_EE(x) mtmsr(x) -#define local_irq_restore(flags) mtmsr(flags) +#define raw_local_irq_restore(flags) mtmsr(flags) #endif -static inline void local_irq_disable(void) +static inline void raw_local_irq_disable(void) { #ifdef CONFIG_BOOKE __asm__ __volatile__("wrteei 0": : :"memory"); @@ -80,7 +87,7 @@ static inline void local_irq_disable(voi #endif } -static inline void local_irq_enable(void) +static inline void raw_local_irq_enable(void) { #ifdef CONFIG_BOOKE __asm__ __volatile__("wrteei 1": : :"memory"); @@ -92,7 +99,7 @@ static inline void local_irq_enable(void #endif } -static inline void local_irq_save_ptr(unsigned long *flags) +static inline void raw_local_irq_save_ptr(unsigned long *flags) { unsigned long msr; msr = mfmsr(); @@ -105,13 +112,16 @@ static inline void 
local_irq_save_ptr(un __asm__ __volatile__("": : :"memory"); } -#define local_save_flags(flags) ((flags) = mfmsr()) -#define local_irq_save(flags) local_irq_save_ptr(&flags) -#define irqs_disabled() ((mfmsr() & MSR_EE) == 0) +#define raw_local_save_flags(flags) ((flags) = mfmsr()) +#define raw_local_irq_save(flags) raw_local_irq_save_ptr(&flags) +#define raw_irqs_disabled() ((mfmsr() & MSR_EE) == 0) +#define raw_irqs_disabled_flags(flags) ((flags & MSR_EE) == 0) #define hard_irq_enable() local_irq_enable() #define hard_irq_disable() local_irq_disable() +#include + #endif /* CONFIG_PPC64 */ /* patches/ep93xx-clockevents.patch0000664000077200007720000001440710646635211016232 0ustar mingomingoclockevent support for the EP93xx platform clockevent support for the EP93xx platform (by tglx) Only added a fix for clockevent_ep93xx.mult, which was using the wrong clock tickrate) --- arch/arm/mach-ep93xx/core.c | 125 ++++++++++++++++++++---------- include/asm-arm/arch-ep93xx/ep93xx-regs.h | 6 + 2 files changed, 91 insertions(+), 40 deletions(-) Index: linux-rt.q/arch/arm/mach-ep93xx/core.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-ep93xx/core.c +++ linux-rt.q/arch/arm/mach-ep93xx/core.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include @@ -50,7 +52,6 @@ #include - /************************************************************************* * Static I/O mappings that are needed for all EP93xx platforms *************************************************************************/ @@ -93,39 +94,58 @@ void __init ep93xx_map_io(void) * to use this timer for something else. We also use timer 4 for keeping * track of lost jiffies. 
*/ -static unsigned int last_jiffy_time; -static unsigned int next_jiffy_time; -static unsigned int accumulator; +static struct clock_event_device clockevent_ep93xx; + +static int ep93xx_timer_interrupt(int irq, void *dev_id) +{ + __raw_writel(EP93XX_TC_CLEAR, EP93XX_TIMER1_CLEAR); -#define TIMER4_TICKS_PER_JIFFY (983040 / HZ) -#define TIMER4_TICKS_MOD_JIFFY (983040 % HZ) + clockevent_ep93xx.event_handler(&clockevent_ep93xx); -static int after_eq(unsigned long a, unsigned long b) + return IRQ_HANDLED; +} + +static int ep93xx_set_next_event(unsigned long evt, + struct clock_event_device *unused) { - return ((signed long)(a - b)) >= 0; + __raw_writel(evt, EP93XX_TIMER1_LOAD); + return 0; } -static int ep93xx_timer_interrupt(int irq, void *dev_id) +static void ep93xx_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) { - write_seqlock(&xtime_lock); + u32 tmode = EP93XX_TC123_SEL_508KHZ; - __raw_writel(1, EP93XX_TIMER1_CLEAR); - while (after_eq(__raw_readl(EP93XX_TIMER4_VALUE_LOW), next_jiffy_time)) { - timer_tick(); - - last_jiffy_time = next_jiffy_time; - next_jiffy_time += TIMER4_TICKS_PER_JIFFY; - accumulator += TIMER4_TICKS_MOD_JIFFY; - if (accumulator >= HZ) { - next_jiffy_time++; - accumulator -= HZ; - } + /* Disable timer */ + __raw_writel(tmode, EP93XX_TIMER1_CONTROL); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* Set timer period */ + __raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD); + tmode |= EP93XX_TC123_PERIODIC; + + case CLOCK_EVT_MODE_ONESHOT: + tmode |= EP93XX_TC123_ENABLE; + __raw_writel(tmode, EP93XX_TIMER1_CONTROL); + break; + + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_RESUME: + return; } +} - write_sequnlock(&xtime_lock); +static struct clock_event_device clockevent_ep93xx = { + .name = "ep93xx-timer1", + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, + .shift = 32, + .set_mode = ep93xx_set_mode, + .set_next_event = ep93xx_set_next_event, +}; - return 
IRQ_HANDLED; -} static struct irqaction ep93xx_timer_irq = { .name = "ep93xx timer", @@ -133,32 +153,58 @@ static struct irqaction ep93xx_timer_irq .handler = ep93xx_timer_interrupt, }; -static void __init ep93xx_timer_init(void) +static void __init ep93xx_clockevent_init(void) { - /* Enable periodic HZ timer. */ - __raw_writel(0x48, EP93XX_TIMER1_CONTROL); - __raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD); - __raw_writel(0xc8, EP93XX_TIMER1_CONTROL); + setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq); - /* Enable lost jiffy timer. */ - __raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH); + clockevent_ep93xx.mult = div_sc(508469, NSEC_PER_SEC, + clockevent_ep93xx.shift); + clockevent_ep93xx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_ep93xx); + clockevent_ep93xx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_ep93xx); + clockevent_ep93xx.cpumask = cpumask_of_cpu(0); + clockevents_register_device(&clockevent_ep93xx); +} - setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq); +/* + * timer4 is a 40 Bit timer, separated in a 32bit and a 8 bit + * register, EP93XX_TIMER4_VALUE_LOW stores 32 bit word. The + * controlregister is in EP93XX_TIMER4_VALUE_HIGH + */ + +cycle_t ep93xx_get_cycles(void) +{ + return __raw_readl(EP93XX_TIMER4_VALUE_LOW); } -static unsigned long ep93xx_gettimeoffset(void) +static struct clocksource clocksource_ep93xx = { + .name = "ep93xx_timer4", + .rating = 200, + .read = ep93xx_get_cycles, + .mask = 0xFFFFFFFF, + .shift = 20, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static void __init ep93xx_clocksource_init(void) { - int offset; + /* Reset time-stamp counter */ + __raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH); - offset = __raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time; + clocksource_ep93xx.mult = + clocksource_hz2mult(983040, clocksource_ep93xx.shift); + clocksource_register(&clocksource_ep93xx); +} - /* Calculate (1000000 / 983040) * offset. 
*/ - return offset + (53 * offset / 3072); +static void __init ep93xx_timer_init(void) +{ + ep93xx_clocksource_init(); + ep93xx_clockevent_init(); } struct sys_timer ep93xx_timer = { - .init = ep93xx_timer_init, - .offset = ep93xx_gettimeoffset, + .init = ep93xx_timer_init, }; @@ -510,7 +556,6 @@ static struct platform_device ep93xx_ohc .resource = ep93xx_ohci_resources, }; - void __init ep93xx_init_devices(void) { unsigned int v; Index: linux-rt.q/include/asm-arm/arch-ep93xx/ep93xx-regs.h =================================================================== --- linux-rt.q.orig/include/asm-arm/arch-ep93xx/ep93xx-regs.h +++ linux-rt.q/include/asm-arm/arch-ep93xx/ep93xx-regs.h @@ -67,6 +67,12 @@ #define EP93XX_TIMER3_CONTROL EP93XX_TIMER_REG(0x88) #define EP93XX_TIMER3_CLEAR EP93XX_TIMER_REG(0x8c) +#define EP93XX_TC_CLEAR 0x00000001 +#define EP93XX_TC123_ENABLE 0x00000080 +#define EP93XX_TC123_PERIODIC 0x00000040 +#define EP93XX_TC123_SEL_508KHZ 0x00000008 +#define EP93XX_TC4_ENABLE 0x00000100 + #define EP93XX_I2S_BASE (EP93XX_APB_VIRT_BASE + 0x00020000) #define EP93XX_SECURITY_BASE (EP93XX_APB_VIRT_BASE + 0x00030000) patches/pcspkr-use-the-global-pit-lock.patch0000664000077200007720000000472410646635210020404 0ustar mingomingoFrom: Thomas Gleixner Replace the pcspkr private PIT lock by the global PIT lock to serialize the PIT access all over the place. 
Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Cc: Dmitry Torokhov Signed-off-by: Andrew Morton --- arch/x86_64/kernel/time.c | 2 ++ drivers/input/misc/pcspkr.c | 11 ++++++++--- include/asm-x86_64/i8253.h | 6 ++++++ 3 files changed, 16 insertions(+), 3 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/time.c +++ linux-rt.q/arch/x86_64/kernel/time.c @@ -33,6 +33,7 @@ #include #endif #include +#include #include #include #include @@ -50,6 +51,7 @@ static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; Index: linux-rt.q/drivers/input/misc/pcspkr.c =================================================================== --- linux-rt.q.orig/drivers/input/misc/pcspkr.c +++ linux-rt.q/drivers/input/misc/pcspkr.c @@ -24,7 +24,12 @@ MODULE_AUTHOR("Vojtech Pavlik +#else +static DEFINE_SPINLOCK(i8253_lock); +#endif static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { @@ -43,7 +48,7 @@ static int pcspkr_event(struct input_dev if (value > 20 && value < 32767) count = PIT_TICK_RATE / value; - spin_lock_irqsave(&i8253_beep_lock, flags); + spin_lock_irqsave(&i8253_lock, flags); if (count) { /* enable counter 2 */ @@ -58,7 +63,7 @@ static int pcspkr_event(struct input_dev outb(inb_p(0x61) & 0xFC, 0x61); } - spin_unlock_irqrestore(&i8253_beep_lock, flags); + spin_unlock_irqrestore(&i8253_lock, flags); return 0; } Index: linux-rt.q/include/asm-x86_64/i8253.h =================================================================== --- /dev/null +++ linux-rt.q/include/asm-x86_64/i8253.h @@ -0,0 +1,6 @@ +#ifndef __ASM_I8253_H__ +#define __ASM_I8253_H__ + +extern spinlock_t i8253_lock; + +#endif /* __ASM_I8253_H__ */ 
patches/preempt-realtime-x86_64.patch0000664000077200007720000002751210646635214016770 0ustar mingomingo arch/x86_64/kernel/early_printk.c | 2 +- arch/x86_64/kernel/head64.c | 6 +++++- arch/x86_64/kernel/i8259.c | 2 +- arch/x86_64/kernel/io_apic.c | 13 +++++++------ arch/x86_64/kernel/nmi.c | 2 ++ arch/x86_64/kernel/process.c | 23 +++++++++++++---------- arch/x86_64/kernel/signal.c | 7 +++++++ arch/x86_64/kernel/smp.c | 14 ++++++++++++-- arch/x86_64/kernel/traps.c | 5 ++--- include/asm-x86_64/acpi.h | 4 ++-- include/asm-x86_64/hw_irq.h | 2 +- include/asm-x86_64/io_apic.h | 2 +- include/asm-x86_64/spinlock.h | 6 +++--- include/asm-x86_64/tlbflush.h | 8 +++++++- 14 files changed, 64 insertions(+), 32 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/early_printk.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/early_printk.c +++ linux-rt.q/arch/x86_64/kernel/early_printk.c @@ -202,7 +202,7 @@ static int early_console_initialized = 0 void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; Index: linux-rt.q/arch/x86_64/kernel/head64.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/head64.c +++ linux-rt.q/arch/x86_64/kernel/head64.c @@ -26,7 +26,11 @@ static void __init zap_identity_mappings { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb(); + /* + * preempt_disable/enable does not work this early in the + * bootup yet: + */ + write_cr3(read_cr3()); } /* Don't add a printk in there. 
printk relies on the PDA which is not initialized Index: linux-rt.q/arch/x86_64/kernel/i8259.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/i8259.c +++ linux-rt.q/arch/x86_64/kernel/i8259.c @@ -96,8 +96,8 @@ static void (*interrupt[NR_VECTORS - FIR */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +DEFINE_RAW_SPINLOCK(i8259A_lock); static struct irq_chip i8259A_chip = { .name = "XT-PIC", Index: linux-rt.q/arch/x86_64/kernel/io_apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/io_apic.c +++ linux-rt.q/arch/x86_64/kernel/io_apic.c @@ -90,8 +90,8 @@ int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -178,6 +178,9 @@ static inline void io_apic_sync(unsigned reg ACTION; \ io_apic_modify(entry->apic, reg); \ FINAL; \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -322,10 +325,8 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ DO_ACTION( __pcix_mask, 0, &= 0xffff7fff, ) /* edge */ DO_ACTION( __pcix_unmask, 0, = (reg & 0xfffeffff) | 0x00008000, ) /* level */ Index: linux-rt.q/arch/x86_64/kernel/nmi.c =================================================================== 
--- linux-rt.q.orig/arch/x86_64/kernel/nmi.c +++ linux-rt.q/arch/x86_64/kernel/nmi.c @@ -70,7 +70,9 @@ static int endflag __initdata = 0; */ static __init void nmi_cpu_busy(void *data) { +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the Index: linux-rt.q/arch/x86_64/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/process.c +++ linux-rt.q/arch/x86_64/kernel/process.c @@ -116,7 +116,7 @@ static void default_idle(void) */ smp_mb(); local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { /* Enables interrupts one instruction before HLT. x86 special cases this so there is no race. */ safe_halt(); @@ -202,7 +202,7 @@ void cpu_idle (void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -230,12 +230,14 @@ void cpu_idle (void) __exit_idle(); } - trace_preempt_exit_idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + trace_preempt_exit_idle(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); trace_preempt_enter_idle(); + local_irq_enable(); } } @@ -251,10 +253,10 @@ void cpu_idle (void) */ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(eax, ecx); } } @@ -262,10 +264,10 @@ void mwait_idle_with_hints(unsigned long /* Default MONITOR/MWAIT with no hints, used for default C1 state */ 
static void mwait_idle(void) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)&current_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { trace_hardirqs_on(); __sti_mwait(0, 0); } else @@ -374,7 +376,7 @@ void exit_thread(void) struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; @@ -382,6 +384,7 @@ void exit_thread(void) /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); Index: linux-rt.q/arch/x86_64/kernel/signal.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/signal.c +++ linux-rt.q/arch/x86_64/kernel/signal.c @@ -395,6 +395,13 @@ static void do_signal(struct pt_regs *re int signr; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-rt.q/arch/x86_64/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/smp.c +++ linux-rt.q/arch/x86_64/kernel/smp.c @@ -56,7 +56,7 @@ union smp_flush_state { struct mm_struct *flush_mm; unsigned long flush_va; #define FLUSH_ALL -1ULL - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; } ____cacheline_aligned; @@ -295,10 +295,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. 
+ * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); Index: linux-rt.q/arch/x86_64/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/traps.c +++ linux-rt.q/arch/x86_64/kernel/traps.c @@ -215,7 +215,7 @@ void dump_trace(struct task_struct *tsk, unsigned long *stack, struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); + const unsigned cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; @@ -306,7 +306,6 @@ void dump_trace(struct task_struct *tsk, tinfo = task_thread_info(tsk); HANDLE_STACK (valid_stack_ptr(tinfo, stack)); #undef HANDLE_STACK - put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -355,7 +354,7 @@ _show_stack(struct task_struct *tsk, str { unsigned long *stack; int i; - const int cpu = smp_processor_id(); + const int cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); Index: linux-rt.q/include/asm-x86_64/acpi.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/acpi.h +++ linux-rt.q/include/asm-x86_64/acpi.h @@ -50,8 +50,8 @@ #define ACPI_ASM_MACROS #define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_DISABLE_IRQS() local_irq_disable_nort() +#define ACPI_ENABLE_IRQS() local_irq_enable_nort() #define 
ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); Index: linux-rt.q/include/asm-x86_64/hw_irq.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/hw_irq.h +++ linux-rt.q/include/asm-x86_64/hw_irq.h @@ -98,7 +98,7 @@ typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); extern void __setup_vector_irq(int cpu); -extern spinlock_t vector_lock; +extern raw_spinlock_t vector_lock; /* * Various low-level irq details needed by irq.c, process.c, Index: linux-rt.q/include/asm-x86_64/io_apic.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/io_apic.h +++ linux-rt.q/include/asm-x86_64/io_apic.h @@ -125,6 +125,6 @@ extern int sis_apic_bug; /* dummy */ void enable_NMI_through_LVT0 (void * dummy); -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; #endif Index: linux-rt.q/include/asm-x86_64/spinlock.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/spinlock.h +++ linux-rt.q/include/asm-x86_64/spinlock.h @@ -160,8 +160,8 @@ static inline void __raw_write_unlock(__ : "=m" (rw->lock) : : "memory"); } -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() #endif /* __ASM_SPINLOCK_H */ Index: linux-rt.q/include/asm-x86_64/tlbflush.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/tlbflush.h +++ linux-rt.q/include/asm-x86_64/tlbflush.h @@ -8,14 +8,20 @@ static inline void __flush_tlb(void) { + preempt_disable(); write_cr3(read_cr3()); + preempt_enable(); } static inline void __flush_tlb_all(void) { - unsigned long cr4 = read_cr4(); + unsigned long cr4; + + preempt_disable(); 
+ cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); /* clear PGE */ write_cr4(cr4); /* write old PGE again and flush TLBs */ + preempt_enable(); } #define __flush_tlb_one(addr) \ patches/ppc-clockevents-fix.patch0000664000077200007720000001012510646635213016433 0ustar mingomingoFrom linux-kernel-owner@vger.kernel.org Thu May 24 20:24:54 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by mail.tglx.de (Postfix) with ESMTP id B0D2F65C3E9 for ; Thu, 24 May 2007 20:24:54 +0200 (CEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751886AbXEXSYQ (ORCPT ); Thu, 24 May 2007 14:24:16 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1750768AbXEXSYE (ORCPT ); Thu, 24 May 2007 14:24:04 -0400 Received: from gateway-1237.mvista.com ([63.81.120.155]:2175 "EHLO imap.sh.mvista.com" rhost-flags-OK-FAIL-OK-FAIL) by vger.kernel.org with ESMTP id S1750741AbXEXSYD (ORCPT ); Thu, 24 May 2007 14:24:03 -0400 Received: from wasted.dev.rtsoft.ru (unknown [10.150.0.9]) by imap.sh.mvista.com (Postfix) with ESMTP id 767D13ECA; Thu, 24 May 2007 11:23:59 -0700 (PDT) From: Sergei Shtylyov Organization: MontaVista Software Inc. 
To: tglx@linutronix.de, mingo@elte.hu Subject: [PATCH 2.6.21-rt7] PowerPC: fix clockevents for classic CPUs Date: Thu, 24 May 2007 22:25:30 +0400 User-Agent: KMail/1.5 Cc: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org References: <200705172142.26739.sshtylyov@ru.mvista.com> In-Reply-To: <200705172142.26739.sshtylyov@ru.mvista.com> MIME-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1" Content-Disposition: inline Message-Id: <200705242225.30225.sshtylyov@ru.mvista.com> Sender: linux-kernel-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org X-Filter-To: .Kernel.LKML X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Unconditionally set a maximum positive value to the decrementer before calling an event handler for all "classic" PPC CPUs (although this is only necessary to clear the interrupt on POWER4+, I've been asked to do it this way) -- otherwise it wouldn't have been done for an offline CPU in periodic mode since the event reprogramming has been delegated to the timer subsystem. Also, as the classic decrementer doesn't have periodic mode, make the set_mode() method for this case completely empty. While at it, add a switch case for CLOCK_EVT_MODE_RESUME to hush the warning. Signed-off-by: Sergei Shtylyov --- Testing on "classic" CPUs is still needed (used to work atop of 2.6.18-rt7). 
arch/powerpc/kernel/time.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -166,11 +166,14 @@ static void decrementer_set_mode(enum cl case CLOCK_EVT_MODE_SHUTDOWN: tcr &= ~TCR_DIE; break; + case CLOCK_EVT_MODE_RESUME: + break; } mtspr(SPRN_TCR, tcr); -#endif + if (mode == CLOCK_EVT_MODE_PERIODIC) decrementer_set_next_event(tb_ticks_per_jiffy, dev); +#endif } static struct clock_event_device decrementer_clockevent = { @@ -549,16 +552,12 @@ void timer_interrupt(struct pt_regs * re irq_enter(); #ifdef CONFIG_GENERIC_CLOCKEVENTS -#ifdef CONFIG_PPC_MULTIPLATFORM +#if !defined(CONFIG_40x) && !defined(CONFIG_BOOKE) /* * We must write a positive value to the decrementer to clear - * the interrupt on the IBM 970 CPU series. In periodic mode, - * this happens when the decrementer gets reloaded later, but - * in one-shot mode, we have to do it here since an event handler - * may skip loading the new value... + * the interrupt on POWER4+ compatible CPUs. */ - if (per_cpu(decrementers, cpu).mode != CLOCK_EVT_MODE_PERIODIC) - set_dec(DECREMENTER_MAX); + set_dec(DECREMENTER_MAX); #endif /* * We can't disable the decrementer, so in the period between patches/arm-cmpxchg-support-armv6.patch0000664000077200007720000000315210646635213017522 0ustar mingomingo[PATCH -rt] cmpxchg support on ARMv6 The current rt patch doesn't support cmpxchg on ARMv6. This patch adds cmpxchg support for ARMv6. It was tested on OMAP2 (apollon board). Signed-off-by: Kyungmin Park P.S. Please cc me, I'm not a subscriber to this mailing list. 
-- --- include/asm-arm/atomic.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) Index: linux-rt.q/include/asm-arm/atomic.h =================================================================== --- linux-rt.q.orig/include/asm-arm/atomic.h +++ linux-rt.q/include/asm-arm/atomic.h @@ -114,6 +114,46 @@ static inline void atomic_clear_mask(uns : "cc"); } +/* + * Atomic compare and exchange. + */ +#define __HAVE_ARCH_CMPXCHG 1 + +extern unsigned long wrong_size_cmpxchg(volatile void *ptr); + +static inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + volatile unsigned long *p = ptr; + + if (size == 4) { + unsigned long oldval, res; + + do { + __asm__ __volatile__("@ atomic_cmpxchg\n" + "ldrex %1, [%2]\n" + "mov %0, #0\n" + "teq %1, %3\n" + "strexeq %0, %4, [%2]\n" + : "=&r" (res), "=&r" (oldval) + : "r" (p), "Ir" (old), "r" (new) + : "cc"); + } while (res); + + return oldval; + } else + return wrong_size_cmpxchg(ptr); +} + +#define cmpxchg(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ +}) + #else /* ARM_ARCH_6 */ #include patches/nf_conntrack-weird-crash-fix.patch0000664000077200007720000000221110646635216020206 0ustar mingomingo--- net/netfilter/nf_conntrack_core.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) Index: linux-rt.q/net/netfilter/nf_conntrack_core.c =================================================================== --- linux-rt.q.orig/net/netfilter/nf_conntrack_core.c +++ linux-rt.q/net/netfilter/nf_conntrack_core.c @@ -1278,6 +1278,24 @@ int __init nf_conntrack_init(void) /* - and look it like as a confirmed connection */ set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + /* + * There's something really weird (read: crash) going on in + * this module when lockdep and rt is enabled - the locks are + * not 
initialized in the per-CPU area properly - or they might + * be initialized by getting a copy of the first CPU's per-cpu + * area? Only seems to happen when things are modular. Maybe + * per-cpu-alloc does not zero buffers properly? Needs + * investigating. Reported and fixed by Mike. + */ +#if defined(CONFIG_NF_CONNTRACK_EVENTS) && defined(CONFIG_SMP) + { + int cpu; + + for_each_possible_cpu(cpu) + spin_lock_init(&per_cpu_lock(nf_conntrack_ecache, cpu)); + } +#endif + return ret; out_free_expect_slab: patches/ich-force-hpet-ich5-quirk-to-force-detect-enable-fix.patch0000664000077200007720000000266610646635211024326 0ustar mingomingoFrom: Andrew Morton arch/i386/kernel/quirks.c: In function 'old_ich_force_enable_hpet': arch/i386/kernel/quirks.c:196: warning: 'gen_cntl' is used uninitialized in this function arch/i386/kernel/quirks.c: In function 'force_hpet_resume': arch/i386/kernel/quirks.c:171: warning: 'gen_cntl' is used uninitialized in this function Cc: Andi Kleen Cc: Greg KH Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Venkatesh Pallipadi Cc: Venki Pallipadi Cc: john stultz Signed-off-by: Andrew Morton --- arch/i386/kernel/quirks.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/i386/kernel/quirks.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/quirks.c +++ linux-rt.q/arch/i386/kernel/quirks.c @@ -161,7 +161,8 @@ static struct pci_dev *cached_dev; static void old_ich_force_hpet_resume(void) { - u32 val, gen_cntl; + u32 val; + u32 uninitialized_var(gen_cntl); if (!force_hpet_address || !cached_dev) return; @@ -182,7 +183,8 @@ static void old_ich_force_hpet_resume(vo static void old_ich_force_enable_hpet(struct pci_dev *dev) { - u32 val, gen_cntl; + u32 val; + u32 uninitialized_var(gen_cntl); if (hpet_address || force_hpet_address) return; patches/s_files.patch0000664000077200007720000002675610646635216014217 0ustar mingomingoSubject: fs: break the file_list_lock for 
sb->s_files Break the protection of sb->s_files out from under the global file_list_lock. sb->s_files is converted to a lock_list. furthermore to prevent the lock_list_head of getting too contended with concurrent add operations the add is buffered in per cpu filevecs. This would ordinarily require a flush before a delete operation - to ensure the to be deleted entry is indeed added to the list. This is avoided by storing a pointer to the filevec location in the not yet used list_head. This pointer can then be used to clear the filevec entry before its actually added. The file_flag mess is a bit unfortunate - this can be removed by also converting tty->tty_files to a lock_list (TODO). Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- fs/file_table.c | 170 +++++++++++++++++++++++++++++++++++++++---- fs/open.c | 2 fs/proc/generic.c | 8 -- fs/super.c | 7 - include/linux/fs.h | 23 +++++ mm/readahead.c | 2 security/selinux/selinuxfs.c | 4 - 7 files changed, 188 insertions(+), 28 deletions(-) Index: linux-rt.q/fs/file_table.c =================================================================== --- linux-rt.q.orig/fs/file_table.c +++ linux-rt.q/fs/file_table.c @@ -112,7 +112,7 @@ struct file *get_empty_filp(void) goto fail_sec; tsk = current; - INIT_LIST_HEAD(&f->f_u.fu_list); + INIT_LOCK_LIST_HEAD(&f->f_u.fu_llist); atomic_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); f->f_uid = tsk->fsuid; @@ -244,32 +244,175 @@ void put_filp(struct file *file) } } -void file_move(struct file *file, struct list_head *list) +enum { + FILEVEC_SIZE = 15 +}; + +struct filevec { + unsigned long nr; + struct file *files[FILEVEC_SIZE]; +}; + +static DEFINE_PER_CPU(struct filevec, sb_fvec); + +static inline unsigned int filevec_size(struct filevec *fvec) { - if (!list) - return; - file_list_lock(); - list_move(&file->f_u.fu_list, list); - file_list_unlock(); + return FILEVEC_SIZE - fvec->nr; +} + +static inline unsigned int filevec_count(struct filevec *fvec) +{ + return 
fvec->nr; +} + +static inline void filevec_reinit(struct filevec *fvec) +{ + fvec->nr = 0; +} + +static inline unsigned int filevec_add(struct filevec *fvec, struct file *filp) +{ + rcu_assign_pointer(fvec->files[fvec->nr], filp); + + /* + * Here we do icky stuff in order to avoid flushing the per cpu filevec + * on list removal. + * + * We store the location on the per cpu filevec in the as of yet unused + * fu_llist.next field and toggle bit 0 to indicate we done so. This + * allows the removal code to set the filevec entry to NULL, thereby + * avoiding the list add. + * + * Abuse the fu_llist.lock for protection. + */ + spin_lock(&filp->f_u.fu_llist.lock); + filp->f_u.fu_llist.next = (void *)&fvec->files[fvec->nr]; + __set_bit(0, (void *)&filp->f_u.fu_llist.next); + spin_unlock(&filp->f_u.fu_llist.lock); + + fvec->nr++; + return filevec_size(fvec); +} + +static void __filevec_add(struct filevec *fvec) +{ + int i; + + for (i = 0; i < filevec_count(fvec); i++) { + struct file *filp; + + /* + * see the comment in filevec_add(); + * need RCU because a concurrent remove might have deleted + * the entry from under us. + */ + rcu_read_lock(); + filp = rcu_dereference(fvec->files[i]); + /* + * the simple case, its gone - NEXT! + */ + if (!filp) { + rcu_read_unlock(); + continue; + } + + spin_lock(&filp->f_u.fu_llist.lock); + /* + * If the entry really is still there, add it! 
+ */ + if (rcu_dereference(fvec->files[i])) { + struct super_block *sb = + filp->f_mapping->host->i_sb; + + __lock_list_add(&filp->f_u.fu_llist, &sb->s_files); + } + spin_unlock(&filp->f_u.fu_llist.lock); + rcu_read_unlock(); + } + filevec_reinit(fvec); +} + +static void filevec_add_drain(void) +{ + struct filevec *fvec = &get_cpu_var(sb_fvec, &cpu); + if (filevec_count(fvec)) + __filevec_add(fvec); + put_cpu_var(sb_fvec, cpu); } +static void filevec_add_drain_per_cpu(struct work_struct *dummy) +{ + filevec_add_drain(); +} + +int filevec_add_drain_all(void) +{ + return schedule_on_each_cpu(filevec_add_drain_per_cpu); +} +EXPORT_SYMBOL_GPL(filevec_add_drain_all); + void file_kill(struct file *file) { - if (!list_empty(&file->f_u.fu_list)) { + if (file_flag(file, F_SUPERBLOCK)) { + void **ptr; + + file_flag_clear(file, F_SUPERBLOCK); + + /* + * If bit 0 of the fu_llist.next pointer is set we're still + * enqueued on a per cpu filevec, in that case clear the entry + * and we're done. + */ + spin_lock(&file->f_u.fu_llist.lock); + ptr = (void **)file->f_u.fu_llist.next; + if (__test_and_clear_bit(0, (void *)&ptr)) { + rcu_assign_pointer(*ptr, NULL); + INIT_LIST_HEAD(&file->f_u.fu_llist.head); + spin_unlock(&file->f_u.fu_llist.lock); + return; + } + spin_unlock(&file->f_u.fu_llist.lock); + + if (!list_empty(&file->f_u.fu_list)) + lock_list_del_init(&file->f_u.fu_llist); + + } else if (!list_empty(&file->f_u.fu_list)) { file_list_lock(); list_del_init(&file->f_u.fu_list); file_list_unlock(); } } +void file_move(struct file *file, struct list_head *list) +{ + struct super_block *sb; + + if (!list) + return; + + file_kill(file); + + sb = file->f_mapping->host->i_sb; + if (list == &sb->s_files.head) { + struct filevec *fvec = &get_cpu_var(sb_fvec, &cpu); + file_flag_set(file, F_SUPERBLOCK); + if (!filevec_add(fvec, file)) + __filevec_add(fvec); + put_cpu_var(sb_fvec, cpu); + } else { + file_list_lock(); + list_add(&file->f_u.fu_list, list); + file_list_unlock(); + } +} + int 
fs_may_remount_ro(struct super_block *sb) { - struct list_head *p; + struct file *file; /* Check that no files are currently opened for writing. */ - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file *file = list_entry(p, struct file, f_u.fu_list); + filevec_add_drain_all(); + lock_list_for_each_entry(file, &sb->s_files, f_u.fu_llist) { struct inode *inode = file->f_path.dentry->d_inode; /* File with pending delete? */ @@ -280,10 +423,9 @@ int fs_may_remount_ro(struct super_block if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) goto too_bad; } - file_list_unlock(); return 1; /* Tis' cool bro. */ too_bad: - file_list_unlock(); + lock_list_for_each_entry_stop(file, f_u.fu_llist); return 0; } Index: linux-rt.q/fs/open.c =================================================================== --- linux-rt.q.orig/fs/open.c +++ linux-rt.q/fs/open.c @@ -694,7 +694,7 @@ static struct file *__dentry_open(struct f->f_path.mnt = mnt; f->f_pos = 0; f->f_op = fops_get(inode->i_fop); - file_move(f, &inode->i_sb->s_files); + file_move(f, &inode->i_sb->s_files.head); if (!open && f->f_op) open = f->f_op->open; Index: linux-rt.q/fs/proc/generic.c =================================================================== --- linux-rt.q.orig/fs/proc/generic.c +++ linux-rt.q/fs/proc/generic.c @@ -558,15 +558,14 @@ static int proc_register(struct proc_dir */ static void proc_kill_inodes(struct proc_dir_entry *de) { - struct list_head *p; + struct file *filp; struct super_block *sb = proc_mnt->mnt_sb; /* * Actually it's a partial revoke(). 
*/ - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file * filp = list_entry(p, struct file, f_u.fu_list); + filevec_add_drain_all(); + lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) { struct dentry * dentry = filp->f_path.dentry; struct inode * inode; const struct file_operations *fops; @@ -580,7 +579,6 @@ static void proc_kill_inodes(struct proc filp->f_op = NULL; fops_put(fops); } - file_list_unlock(); } static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent, Index: linux-rt.q/fs/super.c =================================================================== --- linux-rt.q.orig/fs/super.c +++ linux-rt.q/fs/super.c @@ -67,7 +67,7 @@ static struct super_block *alloc_super(s } INIT_LIST_HEAD(&s->s_dirty); INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_files); + INIT_LOCK_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -569,12 +569,11 @@ static void mark_files_ro(struct super_b { struct file *f; - file_list_lock(); - list_for_each_entry(f, &sb->s_files, f_u.fu_list) { + filevec_add_drain_all(); + lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) { if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f)) f->f_mode &= ~FMODE_WRITE; } - file_list_unlock(); } /** Index: linux-rt.q/include/linux/fs.h =================================================================== --- linux-rt.q.orig/include/linux/fs.h +++ linux-rt.q/include/linux/fs.h @@ -278,6 +278,7 @@ extern int dir_notify_enable; #include #include #include +#include #include #include #include @@ -713,9 +714,13 @@ struct file { /* * fu_list becomes invalid after file_free is called and queued via * fu_rcuhead for RCU freeing + * fu_llist is used for the superblock s_files list; its crucial that + * the spinlock contained therein is not clobbered by other uses of + * the union. 
*/ union { struct list_head fu_list; + struct lock_list_head fu_llist; struct rcu_head fu_rcuhead; } f_u; struct path f_path; @@ -748,9 +753,25 @@ extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); +/* + * steal the upper 8 bits from the read-a-head flags + */ +#define F_SHIFT 24 + +#define F_SUPERBLOCK 0 + +#define file_flag_set(file, flag) \ + __set_bit((flag) + F_SHIFT, &(file)->f_ra.flags) +#define file_flag_clear(file, flag) \ + __clear_bit((flag) + F_SHIFT, &(file)->f_ra.flags) +#define file_flag(file, flag) \ + test_bit((flag) + F_SHIFT, &(file)->f_ra.flags) + #define get_file(x) atomic_inc(&(x)->f_count) #define file_count(x) atomic_read(&(x)->f_count) +extern int filevec_add_drain_all(void); + #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes @@ -935,7 +956,7 @@ struct super_block { struct list_head s_dirty; /* dirty inodes */ struct list_head s_io; /* parked for writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ - struct list_head s_files; + struct lock_list_head s_files; struct block_device *s_bdev; struct mtd_info *s_mtd; Index: linux-rt.q/mm/readahead.c =================================================================== --- linux-rt.q.orig/mm/readahead.c +++ linux-rt.q/mm/readahead.c @@ -69,7 +69,7 @@ static inline void reset_ahead_window(st static inline void ra_off(struct file_ra_state *ra) { ra->start = 0; - ra->flags = 0; + ra->flags &= (~0UL) << F_SHIFT; ra->size = 0; reset_ahead_window(ra); return; Index: linux-rt.q/security/selinux/selinuxfs.c =================================================================== --- linux-rt.q.orig/security/selinux/selinuxfs.c +++ linux-rt.q/security/selinux/selinuxfs.c @@ -944,7 +944,8 @@ static const struct file_operations sel_ * fs/proc/generic.c proc_kill_inodes */ static void sel_remove_bools(struct dentry *de) { - struct list_head *p, 
*node; + struct list_head *node; + struct file *filp; struct super_block *sb = de->d_sb; spin_lock(&dcache_lock); @@ -976,7 +977,6 @@ static void sel_remove_bools(struct dent } filp->f_op = NULL; } - file_list_unlock(); } #define BOOL_DIR_NAME "booleans" patches/lock_list.patch0000664000077200007720000001170710646635216014544 0ustar mingomingoSubject: lock_list - a fine grain locked double linked list Provide a simple fine grain locked doubly linked list. It builds upon the regular doubly linked list primitives, spinlocks and RCU. In order to avoid deadlocks, a prev -> next locking order is observed. This prevents reverse iteration. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lock_list.h | 88 ++++++++++++++++++++++++++++++++++++++++++++++ lib/Makefile | 2 - lib/lock_list.c | 55 ++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) Index: linux-rt.q/include/linux/lock_list.h =================================================================== --- /dev/null +++ linux-rt.q/include/linux/lock_list.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2006, Red Hat, Inc., Peter Zijlstra + * Licenced under the GPLv2. + * + * Simple fine grain locked double linked list. 
+ */ +#ifndef _LINUX_LOCK_LIST_H +#define _LINUX_LOCK_LIST_H + +#ifdef __KERNEL__ + +#include +#include +#include + +struct lock_list_head { + union { + struct list_head head; + struct { + struct lock_list_head *next, *prev; + }; + }; + spinlock_t lock; +}; + +enum { + LOCK_LIST_NESTING_PREV = 1, + LOCK_LIST_NESTING_CUR, + LOCK_LIST_NESTING_NEXT, +}; + +static inline void INIT_LOCK_LIST_HEAD(struct lock_list_head *list) +{ + INIT_LIST_HEAD(&list->head); + spin_lock_init(&list->lock); +} + +/* + * Passed pointers are assumed stable by external means (refcount, rcu) + */ +extern void __lock_list_add(struct lock_list_head *new, + struct lock_list_head *list); + +static inline void lock_list_add(struct lock_list_head *new, + struct lock_list_head *list) +{ + spin_lock(&new->lock); + __lock_list_add(new, list); + spin_unlock(&new->lock); +} + +extern void lock_list_del_init(struct lock_list_head *entry); + +static inline +struct lock_list_head *lock_list_next_entry(struct lock_list_head *list, + struct lock_list_head *entry) +{ + struct lock_list_head *next = entry->next; + if (likely(next != list)) { + lock_set_subclass(&entry->lock.dep_map, + LOCK_LIST_NESTING_CUR, _THIS_IP_); + spin_lock_nested(&next->lock, LOCK_LIST_NESTING_NEXT); + BUG_ON(entry->next != next); + } else + next = NULL; + spin_unlock(&entry->lock); + return next; +} + +static inline +struct lock_list_head *lock_list_first_entry(struct lock_list_head *list) +{ + spin_lock(&list->lock); + return lock_list_next_entry(list, list); +} + +#define lock_list_for_each_entry(pos, list, member) \ + for (pos = list_entry(lock_list_first_entry(list), \ + typeof(*pos), member); \ + pos; \ + pos = list_entry(lock_list_next_entry(list, &pos->member), \ + typeof(*pos), member)) + +#define lock_list_for_each_entry_stop(pos, member) \ + spin_unlock(&(pos->member.lock)) + +#endif /* __KERNEL__ */ +#endif /* _LINUX_LOCK_LIST_H */ Index: linux-rt.q/lib/Makefile 
=================================================================== --- linux-rt.q.orig/lib/Makefile +++ linux-rt.q/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for some libs needed in the kernel. # -lib-y := ctype.o string.o vsprintf.o cmdline.o \ +lib-y := ctype.o string.o vsprintf.o cmdline.o lock_list.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o Index: linux-rt.q/lib/lock_list.c =================================================================== --- /dev/null +++ linux-rt.q/lib/lock_list.c @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2006, Red Hat, Inc., Peter Zijlstra + * Licenced under the GPLv2. + * + * Simple fine grain locked double linked list. + * + * Locking order is from prev -> next. + * Edges are locked not nodes; that is, cur->lock protects: + * - cur->next, + * - cur->next->prev. + * + * Passed pointers are assumed to be stable by external means such as + * refcounts or RCU. The individual list entries are assumed to be RCU + * freed (requirement of __lock_list_del). 
+ */ + +#include + +void __lock_list_add(struct lock_list_head *new, + struct lock_list_head *list) +{ + struct lock_list_head *next; + + spin_lock_nested(&list->lock, LOCK_LIST_NESTING_PREV); + next = list->next; + __list_add(&new->head, &list->head, &next->head); + spin_unlock(&list->lock); +} + +void lock_list_del_init(struct lock_list_head *entry) +{ + struct lock_list_head *prev, *next; + + rcu_read_lock(); +again: + prev = entry->prev; + if (prev == entry) + goto out; + spin_lock_nested(&prev->lock, LOCK_LIST_NESTING_PREV); + if (unlikely(entry->prev != prev)) { + /* + * we lost + */ + spin_unlock(&prev->lock); + goto again; + } + spin_lock_nested(&entry->lock, LOCK_LIST_NESTING_CUR); + next = entry->next; + __list_del(&prev->head, &next->head); + INIT_LIST_HEAD(&entry->head); + spin_unlock(&entry->lock); + spin_unlock(&prev->lock); +out: + rcu_read_unlock(); +} patches/rcu-1.patch0000664000077200007720000013477710646635213013522 0ustar mingomingo This patch re-organizes the RCU code to enable multiple implementations of RCU. Users of RCU continue to include rcupdate.h and the RCU interfaces remain the same. This is in preparation for subsequently merging the preemptible RCU implementation. 
Signed-off-by: Dipankar Sarma --- --- include/linux/rcuclassic.h | 148 +++++++++++ include/linux/rcupdate.h | 154 +++--------- kernel/Makefile | 2 kernel/rcuclassic.c | 561 +++++++++++++++++++++++++++++++++++++++++++++ kernel/rcupdate.c | 558 ++------------------------------------------ 5 files changed, 782 insertions(+), 641 deletions(-) Index: linux-rt.q/include/linux/rcuclassic.h =================================================================== --- /dev/null +++ linux-rt.q/include/linux/rcuclassic.h @@ -0,0 +1,148 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (classic version) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2001 + * + * Author: Dipankar Sarma + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
+ * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rcupdate.html + * + */ + +#ifndef __LINUX_RCUCLASSIC_H +#define __LINUX_RCUCLASSIC_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + long cur; /* Current batch number. */ + long completed; /* Number of the last completed batch */ + int next_pending; /* Is the next batch already waiting? */ + + int signaled; + + spinlock_t lock ____cacheline_internodealigned_in_smp; + cpumask_t cpumask; /* CPUs that need to switch in order */ + /* for current batch to proceed. */ +} ____cacheline_internodealigned_in_smp; + +/* Is batch a before batch b ? */ +static inline int rcu_batch_before(long a, long b) +{ + return (a - b) < 0; +} + +/* Is batch a after batch b ? */ +static inline int rcu_batch_after(long a, long b) +{ + return (a - b) > 0; +} + +/* + * Per-CPU data for Read-Copy UPdate. + * nxtlist - new callbacks are added here + * curlist - current batch for which quiescent cycle started if any + */ +struct rcu_data { + /* 1) quiescent state handling : */ + long quiescbatch; /* Batch # for grace period */ + int passed_quiesc; /* User-mode/idle loop etc. 
*/ + int qs_pending; /* core waits for quiesc state */ + + /* 2) batch handling */ + long batch; /* Batch # for current RCU batch */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail; + long qlen; /* # of queued callbacks */ + struct rcu_head *curlist; + struct rcu_head **curtail; + struct rcu_head *donelist; + struct rcu_head **donetail; + long blimit; /* Upper limit on a processed batch */ + int cpu; +}; + +DECLARE_PER_CPU(struct rcu_data, rcu_data); +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +static inline void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; +} +static inline void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; +} + +extern int rcu_pending(int cpu); +extern int rcu_needs_cpu(int cpu); + +#define __rcu_read_lock() \ + do { \ + preempt_disable(); \ + __acquire(RCU); \ + } while(0) +#define __rcu_read_unlock() \ + do { \ + __release(RCU); \ + preempt_enable(); \ + } while(0) + +#define __rcu_read_lock_bh() \ + do { \ + local_bh_disable(); \ + __acquire(RCU_BH); \ + } while(0) +#define __rcu_read_unlock_bh() \ + do { \ + __release(RCU_BH); \ + local_bh_enable(); \ + } while(0) + +#define __synchronize_sched() synchronize_rcu() + +extern void __rcu_init(void); +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_restart_cpu(int cpu); +extern long rcu_batches_completed(void); + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUCLASSIC_H */ Index: linux-rt.q/include/linux/rcupdate.h =================================================================== --- linux-rt.q.orig/include/linux/rcupdate.h +++ linux-rt.q/include/linux/rcupdate.h @@ -1,5 +1,5 @@ /* - * Read-Copy Update 
mechanism for mutual exclusion + * Read-Copy Update mechanism for mutual exclusion * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,8 +18,8 @@ * Copyright (C) IBM Corporation, 2001 * * Author: Dipankar Sarma - * - * Based on the original work by Paul McKenney + * + * Based on the original work by Paul McKenney * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf @@ -41,6 +41,7 @@ #include #include #include +#include /** * struct rcu_head - callback structure for use with RCU @@ -59,80 +60,6 @@ struct rcu_head { } while (0) - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - long cur; /* Current batch number. */ - long completed; /* Number of the last completed batch */ - int next_pending; /* Is the next batch already waiting? */ - - int signaled; - - spinlock_t lock ____cacheline_internodealigned_in_smp; - cpumask_t cpumask; /* CPUs that need to switch in order */ - /* for current batch to proceed. */ -} ____cacheline_internodealigned_in_smp; - -/* Is batch a before batch b ? */ -static inline int rcu_batch_before(long a, long b) -{ - return (a - b) < 0; -} - -/* Is batch a after batch b ? */ -static inline int rcu_batch_after(long a, long b) -{ - return (a - b) > 0; -} - -/* - * Per-CPU data for Read-Copy UPdate. - * nxtlist - new callbacks are added here - * curlist - current batch for which quiescent cycle started if any - */ -struct rcu_data { - /* 1) quiescent state handling : */ - long quiescbatch; /* Batch # for grace period */ - int passed_quiesc; /* User-mode/idle loop etc. 
*/ - int qs_pending; /* core waits for quiesc state */ - - /* 2) batch handling */ - long batch; /* Batch # for current RCU batch */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail; - long qlen; /* # of queued callbacks */ - struct rcu_head *curlist; - struct rcu_head **curtail; - struct rcu_head *donelist; - struct rcu_head **donetail; - long blimit; /* Upper limit on a processed batch */ - int cpu; - struct rcu_head barrier; -}; - -DECLARE_PER_CPU(struct rcu_data, rcu_data); -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} -static inline void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} - -extern int rcu_pending(int cpu); -extern int rcu_needs_cpu(int cpu); - /** * rcu_read_lock - mark the beginning of an RCU read-side critical section. * @@ -162,22 +89,14 @@ extern int rcu_needs_cpu(int cpu); * * It is illegal to block while in an RCU read-side critical section. */ -#define rcu_read_lock() \ - do { \ - preempt_disable(); \ - __acquire(RCU); \ - } while(0) +#define rcu_read_lock() __rcu_read_lock() /** * rcu_read_unlock - marks the end of an RCU read-side critical section. * * See rcu_read_lock() for more information. */ -#define rcu_read_unlock() \ - do { \ - __release(RCU); \ - preempt_enable(); \ - } while(0) +#define rcu_read_unlock() __rcu_read_unlock() /* * So where is rcu_write_lock()? It does not exist, as there is no @@ -200,22 +119,14 @@ extern int rcu_needs_cpu(int cpu); * can use just rcu_read_lock(). 
* */ -#define rcu_read_lock_bh() \ - do { \ - local_bh_disable(); \ - __acquire(RCU_BH); \ - } while(0) +#define rcu_read_lock_bh() __rcu_read_lock_bh() -/* +/** * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ -#define rcu_read_unlock_bh() \ - do { \ - __release(RCU_BH); \ - local_bh_enable(); \ - } while(0) +#define rcu_read_unlock_bh() __rcu_read_unlock_bh() /** * rcu_dereference - fetch an RCU-protected pointer in an @@ -267,22 +178,49 @@ extern int rcu_needs_cpu(int cpu); * In "classic RCU", these two guarantees happen to be one and * the same, but can differ in realtime RCU implementations. */ -#define synchronize_sched() synchronize_rcu() - -extern void rcu_init(void); -extern void rcu_check_callbacks(int cpu, int user); -extern void rcu_restart_cpu(int cpu); -extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); +#define synchronize_sched() __synchronize_sched() -/* Exported interfaces */ -extern void FASTCALL(call_rcu(struct rcu_head *head, +/** + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +extern void FASTCALL(call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head))); + + +/** + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. 
+ * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() + * and rcu_read_unlock_bh(), if in process context. These may be nested. + */ extern void FASTCALL(call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head))); + +/* Exported common interfaces */ extern void synchronize_rcu(void); -void synchronize_idle(void); extern void rcu_barrier(void); +/* Internal to kernel */ +extern void rcu_init(void); +extern void rcu_check_callbacks(int cpu, int user); + #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPDATE_H */ Index: linux-rt.q/kernel/Makefile =================================================================== --- linux-rt.q.orig/kernel/Makefile +++ linux-rt.q/kernel/Makefile @@ -6,7 +6,7 @@ obj-y = sched.o fork.o exec_domain.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ - rcupdate.o extable.o params.o posix-timers.o \ + rcupdate.o rcuclassic.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o Index: linux-rt.q/kernel/rcuclassic.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/rcuclassic.c @@ -0,0 +1,561 @@ +/* + * Read-Copy Update mechanism for 
mutual exclusion, classic implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2001 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Definition for rcupdate control block. 
*/ +static struct rcu_ctrlblk rcu_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = SPIN_LOCK_UNLOCKED, + .cpumask = CPU_MASK_NONE, +}; +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = SPIN_LOCK_UNLOCKED, + .cpumask = CPU_MASK_NONE, +}; + +DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; + +/* Fake initialization required by compiler */ +static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; +static int blimit = 10; +static int qhimark = 10000; +static int qlowmark = 100; + +#ifdef CONFIG_SMP +static void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + int cpu; + cpumask_t cpumask; + set_need_resched(); + if (unlikely(!rcp->signaled)) { + rcp->signaled = 1; + /* + * Don't send IPI to itself. With irqs disabled, + * rdp->cpu is the current cpu. + */ + cpumask = rcp->cpumask; + cpu_clear(rdp->cpu, cpumask); + for_each_cpu_mask(cpu, cpumask) + smp_send_reschedule(cpu); + } +} +#else +static inline void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + set_need_resched(); +} +#endif + +/* + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. 
+ */ +void fastcall call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } + local_irq_restore(flags); +} + +/* + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() + * and rcu_read_unlock_bh(), if in process context. These may be nested. + */ +void fastcall call_rcu_bh(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_bh_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + + local_irq_restore(flags); +} + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. 
+ */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_ctrlblk.completed; +} + +/* + * Invoke the completed RCU callbacks. They are expected to be in + * a per-cpu list. + */ +static void rcu_do_batch(struct rcu_data *rdp) +{ + struct rcu_head *next, *list; + int count = 0; + + list = rdp->donelist; + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + if (++count >= rdp->blimit) + break; + } + rdp->donelist = list; + + local_irq_disable(); + rdp->qlen -= count; + local_irq_enable(); + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; + + if (!rdp->donelist) + rdp->donetail = &rdp->donelist; + else + tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); +} + +/* + * Grace period handling: + * The grace period handling consists out of two steps: + * - A new grace period is started. + * This is done by rcu_start_batch. The start is not broadcasted to + * all cpus, they must pick this up by comparing rcp->cur with + * rdp->quiescbatch. All cpus are recorded in the + * rcu_ctrlblk.cpumask bitmap. + * - All cpus must go through a quiescent state. + * Since the start of the grace period is not broadcasted, at least two + * calls to rcu_check_quiescent_state are required: + * The first call just notices that a new grace period is running. The + * following calls check if there was a quiescent state since the beginning + * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If + * the bitmap is empty, then the grace period is completed. + * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace + * period (if necessary). + */ +/* + * Register a new batch of callbacks, and start it up if there is currently no + * active batch and the batch to be registered has not already occurred. 
+ * Caller must hold rcu_ctrlblk.lock. + */ +static void rcu_start_batch(struct rcu_ctrlblk *rcp) +{ + if (rcp->next_pending && + rcp->completed == rcp->cur) { + rcp->next_pending = 0; + /* + * next_pending == 0 must be visible in + * __rcu_process_callbacks() before it can see new value of cur. + */ + smp_wmb(); + rcp->cur++; + + /* + * Accessing nohz_cpu_mask before incrementing rcp->cur needs a + * Barrier Otherwise it can cause tickless idle CPUs to be + * included in rcp->cpumask, which will extend graceperiods + * unnecessarily. + */ + smp_mb(); + cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + + rcp->signaled = 0; + } +} + +/* + * cpu went through a quiescent state since the beginning of the grace period. + * Clear it from the cpu mask and complete the grace period if it was the last + * cpu. Start another grace period if someone has further entries pending + */ +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) +{ + cpu_clear(cpu, rcp->cpumask); + if (cpus_empty(rcp->cpumask)) { + /* batch completed ! */ + rcp->completed = rcp->cur; + rcu_start_batch(rcp); + } +} + +/* + * Check if the cpu has gone through a quiescent state (say context + * switch). If so and if it already hasn't done so in this RCU + * quiescent cycle, then indicate that it has done so. + */ +static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + if (rdp->quiescbatch != rcp->cur) { + /* start new grace period: */ + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + rdp->quiescbatch = rcp->cur; + return; + } + + /* Grace period already completed for this cpu? + * qs_pending is checked instead of the actual bitmap to avoid + * cacheline trashing. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. 
+ */ + if (!rdp->passed_quiesc) + return; + rdp->qs_pending = 0; + + spin_lock(&rcp->lock); + /* + * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync + * during cpu startup. Ignore the quiescent state. + */ + if (likely(rdp->quiescbatch == rcp->cur)) + cpu_quiet(rdp->cpu, rcp); + + spin_unlock(&rcp->lock); +} + + +#ifdef CONFIG_HOTPLUG_CPU + +/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing + * locking requirements, the list it's pulling from has to belong to a cpu + * which is dead and hence not processing interrupts. + */ +static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, + struct rcu_head **tail) +{ + local_irq_disable(); + *this_rdp->nxttail = list; + if (list) + this_rdp->nxttail = tail; + local_irq_enable(); +} + +static void __rcu_offline_cpu(struct rcu_data *this_rdp, + struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* if the cpu going offline owns the grace period + * we can block indefinitely waiting for it, so flush + * it here + */ + spin_lock_bh(&rcp->lock); + if (rcp->cur != rcp->completed) + cpu_quiet(rdp->cpu, rcp); + spin_unlock_bh(&rcp->lock); + rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); +} + +static void rcu_offline_cpu(int cpu) +{ + struct rcu_data *this_rdp = &get_cpu_var(rcu_data); + struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); + + __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, + &per_cpu(rcu_data, cpu)); + __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, + &per_cpu(rcu_bh_data, cpu)); + put_cpu_var(rcu_data); + put_cpu_var(rcu_bh_data); + tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); +} + +#else + +static void rcu_offline_cpu(int cpu) +{ +} + +#endif + +/* + * This does the RCU processing work from tasklet context. 
+ */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { + *rdp->donetail = rdp->curlist; + rdp->donetail = rdp->curtail; + rdp->curlist = NULL; + rdp->curtail = &rdp->curlist; + } + + if (rdp->nxtlist && !rdp->curlist) { + local_irq_disable(); + rdp->curlist = rdp->nxtlist; + rdp->curtail = rdp->nxttail; + rdp->nxtlist = NULL; + rdp->nxttail = &rdp->nxtlist; + local_irq_enable(); + + /* + * start the next batch of callbacks + */ + + /* determine batch number */ + rdp->batch = rcp->cur + 1; + /* see the comment and corresponding wmb() in + * the rcu_start_batch() + */ + smp_rmb(); + + if (!rcp->next_pending) { + /* and start it/schedule start if it's a new batch */ + spin_lock(&rcp->lock); + rcp->next_pending = 1; + rcu_start_batch(rcp); + spin_unlock(&rcp->lock); + } + } + + rcu_check_quiescent_state(rcp, rdp); + if (rdp->donelist) + rcu_do_batch(rdp); +} + +static void rcu_process_callbacks(unsigned long unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); +} + +static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) + return 1; + + /* This cpu has no pending entries, but there are new entries */ + if (!rdp->curlist && rdp->nxtlist) + return 1; + + /* This cpu has finished callbacks to invoke */ + if (rdp->donelist) + return 1; + + /* The rcu core waits for a quiescent state from the cpu */ + if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) + return 1; + + /* nothing to do */ + return 0; +} + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. 
This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. + */ +int rcu_pending(int cpu) +{ + return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || + __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); +} + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); + + return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); +} + +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); + tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); +} + +static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + memset(rdp, 0, sizeof(*rdp)); + rdp->curtail = &rdp->curlist; + rdp->nxttail = &rdp->nxtlist; + rdp->donetail = &rdp->donelist; + rdp->quiescbatch = rcp->completed; + rdp->qs_pending = 0; + rdp->cpu = cpu; + rdp->blimit = blimit; +} + +static void __devinit rcu_online_cpu(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); + + rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); + rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); + tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); +} + +static int __devinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + rcu_online_cpu(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + 
rcu_offline_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + +/* + * Initializes rcu mechanism. Assumed to be called early. + * That is before local timer(SMP) or jiffie timer (uniproc) is setup. + * Note that rcu_qsctr and friends are implicitly + * initialized due to the choice of ``0'' for RCU_CTR_INVALID. + */ +void __init __rcu_init(void) +{ + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + /* Register notifier for non-boot CPUs */ + register_cpu_notifier(&rcu_nb); +} + +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); + +EXPORT_SYMBOL_GPL(rcu_batches_completed); +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); Index: linux-rt.q/kernel/rcupdate.c =================================================================== --- linux-rt.q.orig/kernel/rcupdate.c +++ linux-rt.q/kernel/rcupdate.c @@ -19,7 +19,7 @@ * * Authors: Dipankar Sarma * Manfred Spraul - * + * * Based on the original work by Paul McKenney * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: @@ -40,152 +40,53 @@ #include #include #include -#include #include -#include #include -#include -#include #include #include +#include -/* Definition for rcupdate control block. 
*/ -static struct rcu_ctrlblk rcu_ctrlblk = { - .cur = -300, - .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, -}; -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .cur = -300, - .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; }; -DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; - -/* Fake initialization required by compiler */ -static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -static int blimit = 10; -static int qhimark = 10000; -static int qlowmark = 100; - +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head); static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -#ifdef CONFIG_SMP -static void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - int cpu; - cpumask_t cpumask; - set_need_resched(); - if (unlikely(!rcp->signaled)) { - rcp->signaled = 1; - /* - * Don't send IPI to itself. With irqs disabled, - * rdp->cpu is the current cpu. - */ - cpumask = rcp->cpumask; - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask(cpu, cpumask) - smp_send_reschedule(cpu); - } -} -#else -static inline void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) +/* Because of FASTCALL declaration of complete, we use this wrapper */ +static void wakeme_after_rcu(struct rcu_head *head) { - set_need_resched(); + struct rcu_synchronize *rcu; + + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); } -#endif /** - * call_rcu - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * synchronize_rcu - wait until a grace period has elapsed. 
* - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU * read-side critical sections have completed. RCU read-side critical * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. - */ -void fastcall call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } - local_irq_restore(flags); -} - -/** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by rcu_read_lock() and - * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() - * and rcu_read_unlock_bh(), if in process context. These may be nested. + * If your read-side code is not protected by rcu_read_lock(), do -not- + * use synchronize_rcu(). 
*/ -void fastcall call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) +void synchronize_rcu(void) { - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - - local_irq_restore(flags); -} + struct rcu_synchronize rcu; -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} + init_completion(&rcu.completion); + /* Will wake me after RCU finished */ + call_rcu(&rcu.head, wakeme_after_rcu); -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_ctrlblk.completed; + /* Wait for it */ + wait_for_completion(&rcu.completion); } static void rcu_barrier_callback(struct rcu_head *notused) @@ -200,10 +101,8 @@ static void rcu_barrier_callback(struct static void rcu_barrier_func(void *notused) { int cpu = smp_processor_id(); - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_head *head; + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - head = &rdp->barrier; atomic_inc(&rcu_barrier_cpu_count); call_rcu(head, rcu_barrier_callback); } @@ -224,414 +123,9 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Invoke the completed RCU callbacks. They are expected to be in - * a per-cpu list. 
- */ -static void rcu_do_batch(struct rcu_data *rdp) -{ - struct rcu_head *next, *list; - int count = 0; - - list = rdp->donelist; - while (list) { - next = list->next; - prefetch(next); - list->func(list); - list = next; - if (++count >= rdp->blimit) - break; - } - rdp->donelist = list; - - local_irq_disable(); - rdp->qlen -= count; - local_irq_enable(); - if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - if (!rdp->donelist) - rdp->donetail = &rdp->donelist; - else - tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); -} - -/* - * Grace period handling: - * The grace period handling consists out of two steps: - * - A new grace period is started. - * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcp->cur with - * rdp->quiescbatch. All cpus are recorded in the - * rcu_ctrlblk.cpumask bitmap. - * - All cpus must go through a quiescent state. - * Since the start of the grace period is not broadcasted, at least two - * calls to rcu_check_quiescent_state are required: - * The first call just notices that a new grace period is running. The - * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If - * the bitmap is empty, then the grace period is completed. - * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace - * period (if necessary). - */ -/* - * Register a new batch of callbacks, and start it up if there is currently no - * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_ctrlblk.lock. - */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp) -{ - if (rcp->next_pending && - rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. 
- */ - smp_wmb(); - rcp->cur++; - - /* - * Accessing nohz_cpu_mask before incrementing rcp->cur needs a - * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rcp->cpumask, which will extend graceperiods - * unnecessarily. - */ - smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); - - rcp->signaled = 0; - } -} - -/* - * cpu went through a quiescent state since the beginning of the grace period. - * Clear it from the cpu mask and complete the grace period if it was the last - * cpu. Start another grace period if someone has further entries pending - */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) -{ - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp); - } -} - -/* - * Check if the cpu has gone through a quiescent state (say context - * switch). If so and if it already hasn't done so in this RCU - * quiescent cycle, then indicate that it has done so. - */ -static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - if (rdp->quiescbatch != rcp->cur) { - /* start new grace period: */ - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->quiescbatch = rcp->cur; - return; - } - - /* Grace period already completed for this cpu? - * qs_pending is checked instead of the actual bitmap to avoid - * cacheline trashing. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesc) - return; - rdp->qs_pending = 0; - - spin_lock(&rcp->lock); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp); - - spin_unlock(&rcp->lock); -} - - -#ifdef CONFIG_HOTPLUG_CPU - -/* warning! helper for rcu_offline_cpu. 
do not use elsewhere without reviewing - * locking requirements, the list it's pulling from has to belong to a cpu - * which is dead and hence not processing interrupts. - */ -static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) -{ - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); -} - -static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* if the cpu going offline owns the grace period - * we can block indefinitely waiting for it, so flush - * it here - */ - spin_lock_bh(&rcp->lock); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); -} - -static void rcu_offline_cpu(int cpu) -{ - struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, - &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, - &per_cpu(rcu_bh_data, cpu)); - put_cpu_var(rcu_data); - put_cpu_var(rcu_bh_data); - tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); -} - -#else - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif - -/* - * This does the RCU processing work from tasklet context. 
- */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } - - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); - - /* - * start the next batch of callbacks - */ - - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() - */ - smp_rmb(); - - if (!rcp->next_pending) { - /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); - } - } - - rcu_check_quiescent_state(rcp, rdp); - if (rdp->donelist) - rcu_do_batch(rdp); -} - -static void rcu_process_callbacks(unsigned long unused) -{ - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); -} - -static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; - - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; - - /* This cpu has finished callbacks to invoke */ - if (rdp->donelist) - return 1; - - /* The rcu core waits for a quiescent state from the cpu */ - if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) - return 1; - - /* nothing to do */ - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. 
This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. - */ -int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); -} - -void rcu_check_callbacks(int cpu, int user) -{ - if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); - } else if (!in_softirq()) - rcu_bh_qsctr_inc(cpu); - tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); -} - -static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; - rdp->donetail = &rdp->donelist; - rdp->quiescbatch = rcp->completed; - rdp->qs_pending = 0; - rdp->cpu = cpu; - rdp->blimit = blimit; -} - -static void __devinit rcu_online_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - 
rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -/* - * Initializes rcu mechanism. Assumed to be called early. - * That is before local timer(SMP) or jiffie timer (uniproc) is setup. - * Note that rcu_qsctr and friends are implicitly - * initialized due to the choice of ``0'' for RCU_CTR_INVALID. - */ void __init rcu_init(void) { - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); -} - -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* Because of FASTCALL declaration of complete, we use this wrapper */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - * - * If your read-side code is not protected by rcu_read_lock(), do -not- - * use synchronize_rcu(). 
- */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished */ - call_rcu(&rcu.head, wakeme_after_rcu); - - /* Wait for it */ - wait_for_completion(&rcu.completion); + __rcu_init(); } -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); -EXPORT_SYMBOL_GPL(call_rcu); -EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); patches/preempt-realtime-mm.patch0000664000077200007720000001655510646635215016451 0ustar mingomingo--- include/linux/pagevec.h | 2 +- include/linux/vmstat.h | 10 ++++++++++ mm/bounce.c | 4 ++-- mm/memory.c | 11 +++++++++-- mm/mmap.c | 10 ++++++++-- mm/vmscan.c | 10 ++++++++-- mm/vmstat.c | 38 ++++++++++++++++++++++++++++++++------ 7 files changed, 70 insertions(+), 15 deletions(-) Index: linux-rt.q/include/linux/pagevec.h =================================================================== --- linux-rt.q.orig/include/linux/pagevec.h +++ linux-rt.q/include/linux/pagevec.h @@ -9,7 +9,7 @@ #define _LINUX_PAGEVEC_H /* 14 pointers + two long's align the pagevec structure to a power of two */ -#define PAGEVEC_SIZE 14 +#define PAGEVEC_SIZE 8 struct page; struct address_space; Index: linux-rt.q/include/linux/vmstat.h =================================================================== --- linux-rt.q.orig/include/linux/vmstat.h +++ linux-rt.q/include/linux/vmstat.h @@ -59,7 +59,12 @@ DECLARE_PER_CPU(struct vm_event_state, v static inline void __count_vm_event(enum vm_event_item item) { +#ifdef CONFIG_PREEMPT_RT + get_cpu_var(vm_event_states).event[item]++; + put_cpu(); +#else __get_cpu_var(vm_event_states).event[item]++; +#endif } static inline void count_vm_event(enum vm_event_item item) @@ -70,7 +75,12 @@ static inline void count_vm_event(enum v static inline void __count_vm_events(enum vm_event_item item, long delta) { +#ifdef 
CONFIG_PREEMPT_RT + get_cpu_var(vm_event_states).event[item] += delta; + put_cpu(); +#else __get_cpu_var(vm_event_states).event[item] += delta; +#endif } static inline void count_vm_events(enum vm_event_item item, long delta) Index: linux-rt.q/mm/bounce.c =================================================================== --- linux-rt.q.orig/mm/bounce.c +++ linux-rt.q/mm/bounce.c @@ -48,11 +48,11 @@ static void bounce_copy_vec(struct bio_v unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ Index: linux-rt.q/mm/memory.c =================================================================== --- linux-rt.q.orig/mm/memory.c +++ linux-rt.q/mm/memory.c @@ -283,7 +283,9 @@ void free_pgtables(struct mmu_gather **t if (!vma) /* Sometimes when exiting after an oops */ return; +#ifndef CONFIG_PREEMPT_RT if (vma->vm_next) +#endif tlb_finish_mmu(*tlb, tlb_start_addr(*tlb), tlb_end_addr(*tlb)); /* * Hide vma from rmap and vmtruncate before freeeing pgtables, @@ -294,7 +296,9 @@ void free_pgtables(struct mmu_gather **t unlink_file_vma(unlink); unlink = unlink->vm_next; } +#ifndef CONFIG_PREEMPT_RT if (vma->vm_next) +#endif *tlb = tlb_gather_mmu(vma->vm_mm, fullmm); #endif while (vma) { @@ -807,10 +811,13 @@ static unsigned long unmap_page_range(st return addr; } -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT) # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) #else -/* No preempt: go for improved straight-line efficiency */ +/* + * No preempt: go for improved straight-line efficiency + * on PREEMPT_RT this is not a critical latency-path. 
+ */ # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) #endif Index: linux-rt.q/mm/mmap.c =================================================================== --- linux-rt.q.orig/mm/mmap.c +++ linux-rt.q/mm/mmap.c @@ -1855,10 +1855,16 @@ asmlinkage long sys_munmap(unsigned long static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM - if (unlikely(down_read_trylock(&mm->mmap_sem))) { +# ifdef CONFIG_PREEMPT_RT + if (unlikely(!rt_rwsem_is_locked(&mm->mmap_sem))) { WARN_ON(1); - up_read(&mm->mmap_sem); } +# else + if (unlikely(down_read_trylock(&mm->mmap_sem))) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +# endif #endif } Index: linux-rt.q/mm/vmscan.c =================================================================== --- linux-rt.q.orig/mm/vmscan.c +++ linux-rt.q/mm/vmscan.c @@ -23,6 +23,7 @@ #include #include #include +#include #include /* for try_to_release_page(), buffer_heads_over_limit */ #include @@ -682,7 +683,7 @@ static unsigned long shrink_inactive_lis nr_scanned += nr_scan; nr_freed = shrink_page_list(&page_list, sc); nr_reclaimed += nr_freed; - local_irq_disable(); + local_irq_disable_nort(); if (current_is_kswapd()) { __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); __count_vm_events(KSWAPD_STEAL, nr_freed); @@ -713,9 +714,14 @@ static unsigned long shrink_inactive_lis } } } while (nr_scanned < max_scan); + /* + * Non-PREEMPT_RT relies on IRQs-off protecting the page_states + * per-CPU data. PREEMPT_RT has that data protected even in + * __mod_page_state(), so no need to keep IRQs disabled. 
+ */ spin_unlock(&zone->lru_lock); done: - local_irq_enable(); + local_irq_enable_nort(); pagevec_release(&pvec); return nr_reclaimed; } Index: linux-rt.q/mm/vmstat.c =================================================================== --- linux-rt.q.orig/mm/vmstat.c +++ linux-rt.q/mm/vmstat.c @@ -156,10 +156,14 @@ static void refresh_zone_stat_thresholds void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset *pcp; + int cpu; long x; + s8 *p; + cpu = get_cpu(); + pcp = zone_pcp(zone, cpu); + p = pcp->vm_stat_diff + item; x = delta + *p; if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { @@ -167,6 +171,7 @@ void __mod_zone_page_state(struct zone * x = 0; } *p = x; + put_cpu(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -209,9 +214,13 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset *pcp; + int cpu; + s8 *p; + cpu = get_cpu(); + pcp = zone_pcp(zone, cpu); + p = pcp->vm_stat_diff + item; (*p)++; if (unlikely(*p > pcp->stat_threshold)) { @@ -220,18 +229,34 @@ void __inc_zone_state(struct zone *zone, zone_page_state_add(*p + overstep, zone, item); *p = -overstep; } + put_cpu(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + struct zone *zone; + + zone = page_zone(page); + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +#else __inc_zone_state(page_zone(page), item); +#endif } EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; + struct 
per_cpu_pageset *pcp; + int cpu; + s8 *p; + + cpu = get_cpu(); + pcp = zone_pcp(zone, cpu); + p = pcp->vm_stat_diff + item; (*p)--; @@ -241,6 +266,7 @@ void __dec_zone_state(struct zone *zone, zone_page_state_add(*p - overstep, zone, item); *p = overstep; } + put_cpu(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) patches/nmi-profiling.patch0000664000077200007720000000644110646635216015332 0ustar mingomingo--- arch/i386/kernel/irq.c | 2 ++ arch/i386/kernel/nmi.c | 7 +++---- arch/x86_64/kernel/nmi.c | 6 +++--- drivers/char/sysrq.c | 2 +- include/asm-x86_64/apic.h | 2 ++ 5 files changed, 11 insertions(+), 8 deletions(-) Index: linux-rt.q/arch/i386/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/irq.c +++ linux-rt.q/arch/i386/kernel/irq.c @@ -79,7 +79,9 @@ fastcall notrace unsigned int do_IRQ(str u32 *isp; #endif +#ifdef CONFIG_X86_LOCAL_APIC irq_show_regs_callback(smp_processor_id(), regs); +#endif if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", Index: linux-rt.q/arch/i386/kernel/nmi.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/nmi.c +++ linux-rt.q/arch/i386/kernel/nmi.c @@ -348,9 +348,9 @@ void nmi_show_all_regs(void) } } -static DEFINE_SPINLOCK(nmi_print_lock); +static DEFINE_RAW_SPINLOCK(nmi_print_lock); -void irq_show_regs_callback(int cpu, struct pt_regs *regs) +notrace void irq_show_regs_callback(int cpu, struct pt_regs *regs) { if (!nmi_show_regs[cpu]) return; @@ -364,7 +364,7 @@ void irq_show_regs_callback(int cpu, str spin_unlock(&nmi_print_lock); } -__kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) +notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { /* * Since current_thread_info()-> is always on the stack, and we @@ -432,7 +432,6 @@ __kprobes int nmi_watchdog_tick(struct p for_each_online_cpu(i) 
alert_counter[i] = 0; - } } else { Index: linux-rt.q/arch/x86_64/kernel/nmi.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/nmi.c +++ linux-rt.q/arch/x86_64/kernel/nmi.c @@ -337,9 +337,9 @@ void nmi_show_all_regs(void) } } -static DEFINE_SPINLOCK(nmi_print_lock); +static DEFINE_RAW_SPINLOCK(nmi_print_lock); -void irq_show_regs_callback(int cpu, struct pt_regs *regs) +notrace void irq_show_regs_callback(int cpu, struct pt_regs *regs) { if (!nmi_show_regs[cpu]) return; @@ -352,7 +352,7 @@ void irq_show_regs_callback(int cpu, str spin_unlock(&nmi_print_lock); } -int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int notrace __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; Index: linux-rt.q/drivers/char/sysrq.c =================================================================== --- linux-rt.q.orig/drivers/char/sysrq.c +++ linux-rt.q/drivers/char/sysrq.c @@ -208,7 +208,7 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; -#if defined(__i386__) +#if defined(__i386__) || defined(__x86_64__) static void sysrq_handle_showallregs(int key, struct tty_struct *tty) { Index: linux-rt.q/include/asm-x86_64/apic.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/apic.h +++ linux-rt.q/include/asm-x86_64/apic.h @@ -94,6 +94,8 @@ extern void smp_send_nmi_allbutself(void #define K8_APIC_EXT_INT_MSG_EXT 0x7 #define K8_APIC_EXT_LVT_ENTRY_THRESHOLD 0 +extern void smp_send_nmi_allbutself(void); + #define ARCH_APICTIMER_STOPS_ON_C3 1 extern unsigned boot_cpu_id; patches/cputimer-thread-rt_A0.patch0000664000077200007720000002126710646635214016621 0ustar mingomingoIngo, This patch re-adds the posix-cpu-timer functionality by running it from a per-cpu RT thread. This allows cpu rlimits to be enforced against RT processes that would otherwise starve the system. 
thanks -john Signed-off-by: John Stultz include/linux/init_task.h | 1 include/linux/posix-timers.h | 2 include/linux/sched.h | 2 init/main.c | 2 kernel/fork.c | 2 kernel/posix-cpu-timers.c | 176 ++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 180 insertions(+), 5 deletions(-) Index: linux-rt.q/include/linux/init_task.h =================================================================== --- linux-rt.q.orig/include/linux/init_task.h +++ linux-rt.q/include/linux/init_task.h @@ -162,6 +162,7 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ + .posix_timer_list = NULL, \ .pi_lock = RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ Index: linux-rt.q/include/linux/posix-timers.h =================================================================== --- linux-rt.q.orig/include/linux/posix-timers.h +++ linux-rt.q/include/linux/posix-timers.h @@ -115,4 +115,6 @@ void set_process_cpu_timer(struct task_s long clock_nanosleep_restart(struct restart_block *restart_block); +int posix_cpu_thread_init(void); + #endif Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -1109,6 +1109,8 @@ struct task_struct { unsigned long long it_sched_expires; struct list_head cpu_timers[3]; + struct task_struct* posix_timer_list; + /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; Index: linux-rt.q/init/main.c =================================================================== --- linux-rt.q.orig/init/main.c +++ linux-rt.q/init/main.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -738,6 +739,7 @@ static void __init do_pre_smp_initcalls( migration_init(); #endif + posix_cpu_thread_init(); spawn_ksoftirqd(); spawn_softlockup_task(); } Index: 
linux-rt.q/kernel/fork.c =================================================================== --- linux-rt.q.orig/kernel/fork.c +++ linux-rt.q/kernel/fork.c @@ -1060,7 +1060,7 @@ static struct task_struct *copy_process( INIT_LIST_HEAD(&p->cpu_timers[0]); INIT_LIST_HEAD(&p->cpu_timers[1]); INIT_LIST_HEAD(&p->cpu_timers[2]); - + p->posix_timer_list = NULL; p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; Index: linux-rt.q/kernel/posix-cpu-timers.c =================================================================== --- linux-rt.q.orig/kernel/posix-cpu-timers.c +++ linux-rt.q/kernel/posix-cpu-timers.c @@ -578,7 +578,7 @@ static void arm_timer(struct k_itimer *t p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(&p->sighand->siglock); listpos = head; @@ -735,7 +735,7 @@ int posix_cpu_timer_set(struct k_itimer /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); ret = 0; spin_lock(&p->sighand->siglock); @@ -1287,12 +1287,11 @@ out: * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. 
*/ -void run_posix_cpu_timers(struct task_struct *tsk) +void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; - BUG_ON(!irqs_disabled()); #define UNEXPIRED(clock) \ (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ @@ -1355,6 +1354,169 @@ void run_posix_cpu_timers(struct task_st } } +#include +#include +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task,cpu) != current); + + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) { + goto wait_to_die; + } + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if(!per_cpu(posix_timer_task, cpu)) + return; + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, 
cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + } + /* XXX signal the thread somehow */ + wake_up_process(per_cpu(posix_timer_task,cpu)); +} + + + + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int posix_cpu_thread_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posix_cpu_timers/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, &param); + per_cpu(posix_timer_task,cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unnecessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task,cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(per_cpu(posix_timer_task,cpu), + any_online_cpu(cpu_online_map)); + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task,cpu)); + per_cpu(posix_timer_task,cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else.
+ */ +static struct notifier_block __devinitdata posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +int __init posix_cpu_thread_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} + + + /* * Set one of the process-wide special case CPU timers. * The tasklist_lock and tsk->sighand->siglock must be held by the caller. @@ -1620,6 +1782,12 @@ static __init int init_posix_cpu_timers( .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_cpu_mask(cpu, cpu_possible_map) { + per_cpu(posix_timer_tasklist, cpu) = NULL; + } register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); patches/disable-irqpoll.patch0000664000077200007720000000174610646635214015644 0ustar mingomingo kernel/irq/spurious.c | 10 ++++++++++ 1 file changed, 10 insertions(+) Index: linux-rt.q/kernel/irq/spurious.c =================================================================== --- linux-rt.q.orig/kernel/irq/spurious.c +++ linux-rt.q/kernel/irq/spurious.c @@ -229,6 +229,11 @@ __setup("noirqdebug", noirqdebug_setup); static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqfixup boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -240,6 +245,11 @@ __setup("irqfixup", irqfixup_setup); static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqpoll boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 
1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); patches/ntp-move-the-cmos-update-code-into-ntpc-fix-fix.patch0000664000077200007720000000153610646635210023505 0ustar mingomingoFrom: Andrew Morton grr, headers_check. Cc: Chris Wright Cc: David Miller Cc: Ingo Molnar Cc: Roman Zippel Cc: Thomas Gleixner Cc: john stultz Cc: Michal Piotrowski Signed-off-by: Andrew Morton --- include/linux/time.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/include/linux/time.h =================================================================== --- linux-rt.q.orig/include/linux/time.h +++ linux-rt.q/include/linux/time.h @@ -2,9 +2,9 @@ #define _LINUX_TIME_H #include -#include #ifdef __KERNEL__ +# include # include #endif patches/mm-lockless-preempt-rt-fixup.patch0000664000077200007720000000733310646635216020235 0ustar mingomingo speculative get for PREEMPT_RT Signed-off-by: Peter Zijlstra --- include/linux/pagemap.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++- mm/filemap.c | 17 ++-------------- 2 files changed, 53 insertions(+), 15 deletions(-) Index: linux-rt.q/include/linux/pagemap.h =================================================================== --- linux-rt.q.orig/include/linux/pagemap.h +++ linux-rt.q/include/linux/pagemap.h @@ -14,6 +14,8 @@ #include #include #include /* for in_interrupt() */ +#include +#include /* * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page @@ -64,6 +66,26 @@ static inline void mapping_set_gfp_mask( #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); +/* + * In order to wait for pages to become available there must be + * waitqueues associated with pages. 
By using a hash table of + * waitqueues where the bucket discipline is to maintain all + * waiters on the same queue and wake all when any of the pages + * become available, and for the woken contexts to check to be + * sure the appropriate page became available, this saves space + * at a cost of "thundering herd" phenomena during rare hash + * collisions. + */ +static inline wait_queue_head_t *page_waitqueue(struct page *page) +{ + const struct zone *zone = page_zone(page); + + return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; +} + +extern int __sleep_on_page(void *); + +#ifndef CONFIG_PREEMPT_RT static inline void set_page_no_new_refs(struct page *page) { VM_BUG_ON(PageNoNewRefs(page)); @@ -85,6 +107,33 @@ static inline void wait_on_new_refs(stru while (unlikely(PageNoNewRefs(page))) cpu_relax(); } +#else +static inline void set_page_no_new_refs(struct page *page) +{ + VM_BUG_ON(PageNoNewRefs(page)); + SetPageNoNewRefs(page); + smp_wmb(); +} + +static inline void end_page_no_new_refs(struct page *page) +{ + VM_BUG_ON(!PageNoNewRefs(page)); + smp_wmb(); + ClearPageNoNewRefs(page); + smp_mb__after_clear_bit(); + __wake_up_bit(page_waitqueue(page), &page->flags, PG_nonewrefs); +} + +static inline void wait_on_new_refs(struct page *page) +{ + might_sleep(); + if (unlikely(PageNoNewRefs(page))) { + DEFINE_WAIT_BIT(wait, &page->flags, PG_nonewrefs); + __wait_on_bit(page_waitqueue(page), &wait, __sleep_on_page, + TASK_UNINTERRUPTIBLE); + } +} +#endif /* * speculatively take a reference to a page. 
@@ -135,7 +184,7 @@ static inline int page_cache_get_specula { VM_BUG_ON(in_interrupt()); -#ifndef CONFIG_SMP +#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT) # ifdef CONFIG_PREEMPT VM_BUG_ON(!in_atomic()); # endif Index: linux-rt.q/mm/filemap.c =================================================================== --- linux-rt.q.orig/mm/filemap.c +++ linux-rt.q/mm/filemap.c @@ -486,21 +486,10 @@ static int __sleep_on_page_lock(void *wo return 0; } -/* - * In order to wait for pages to become available there must be - * waitqueues associated with pages. By using a hash table of - * waitqueues where the bucket discipline is to maintain all - * waiters on the same queue and wake all when any of the pages - * become available, and for the woken contexts to check to be - * sure the appropriate page became available, this saves space - * at a cost of "thundering herd" phenomena during rare hash - * collisions. - */ -static wait_queue_head_t *page_waitqueue(struct page *page) +int __sleep_on_page(void *word) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + schedule(); + return 0; } static inline void wake_up_page(struct page *page, int bit) patches/vsyscall-add-notrace.patch0000664000077200007720000000550210646635216016574 0ustar mingomingoFrom rostedt@goodmis.org Tue Jun 19 04:41:17 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from ms-smtp-01.nyroc.rr.com (ms-smtp-01.nyroc.rr.com [24.24.2.55]) by mail.tglx.de (Postfix) with ESMTP id 4C19265C3EC for ; Tue, 19 Jun 2007 04:41:17 +0200 (CEST) Received: from [192.168.23.10] (cpe-24-94-51-176.stny.res.rr.com [24.94.51.176]) by ms-smtp-01.nyroc.rr.com (8.13.6/8.13.6) with ESMTP id l5J2f9l0013971; Mon, 18 Jun 2007 22:41:10 -0400 (EDT) Subject: [PATCH RT] Don't call mcount from vsyscall_fn's From: 
Steven Rostedt To: Ingo Molnar Cc: Thomas Gleixner , LKML , RT Content-Type: text/plain Date: Mon, 18 Jun 2007 22:41:09 -0400 Message-Id: <1182220869.15228.10.camel@localhost.localdomain> Mime-Version: 1.0 X-Mailer: Evolution 2.6.3 X-Virus-Scanned: Symantec AntiVirus Scan Engine X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit This bit me in the butt. I couldn't understand why my init app was segfaulting, with a kernel address, but a user RIP and RSP. Well, the RIP I think was bogus, but the kernel address was always the start of "mcount". Looking deeper, I printed out what was in the RSP (even though it was a user stack). It ended up showing me that the calling address was from the VDSO area. Looking even further, I found the offending culprit, which was vread_hpet. Looking at the assembly dump, I saw the vread_hpet was calling mcount, but I could not see it in the code. Nor could I see it in hpet.i (-E option of compiling). Well, I guess Ingo is a magician when it comes to compiler tricks, and has the mcount being called by "every!!" function, unless you add the "notrace" option. This patch adds the notrace to vsyscall_fn, so that we don't have user land apps calling mcount and crashing! 
Signed-off-by: Steven Rostedt --- include/asm-x86_64/vsyscall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/include/asm-x86_64/vsyscall.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/vsyscall.h +++ linux-rt.q/include/asm-x86_64/vsyscall.h @@ -22,7 +22,7 @@ enum vsyscall_num { /* Definitions for CONFIG_GENERIC_TIME definitions */ #define __section_vsyscall_gtod_data __attribute__ \ ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) -#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) +#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 patches/preempt-realtime-powerpc-b3.patch0000664000077200007720000000374210646635215020013 0ustar mingomingo To fix the following runtime warning. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - BUG: using smp_processor_id() in preemptible [00000000] code: init/371 caller is .pgtable_free_tlb+0x2c/0x14c Call Trace: [C00000000FF6B770] [C00000000000FAAC] .show_stack+0x68/0x1b0 (unreliable) [C00000000FF6B810] [C0000000001F7190] .debug_smp_processor_id+0xc8/0xf8 [C00000000FF6B8A0] [C00000000002C52C] .pgtable_free_tlb+0x2c/0x14c [C00000000FF6B940] [C0000000000B6528] .free_pgd_range+0x234/0x3bc [C00000000FF6BA40] [C0000000000B6AB8] .free_pgtables+0x224/0x260 [C00000000FF6BB00] [C0000000000B7FE8] .exit_mmap+0x100/0x208 [C00000000FF6BBC0] [C000000000055FB0] .mmput+0x70/0x12c [C00000000FF6BC50] [C00000000005B728] .exit_mm+0x150/0x170 [C00000000FF6BCE0] [C00000000005D80C] .do_exit+0x28c/0x9bc [C00000000FF6BDA0] [C00000000005DFF0] .sys_exit_group+0x0/0x8 [C00000000FF6BE30] [C000000000008634] syscall_exit+0x0/0x40 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Would it be better to just use raw_smp_processor_id() rather than tlb->cpu? 
Signed-off-by: Tsutomu Owa -- owa --- arch/powerpc/mm/tlb_64.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/powerpc/mm/tlb_64.c =================================================================== --- linux-rt.q.orig/arch/powerpc/mm/tlb_64.c +++ linux-rt.q/arch/powerpc/mm/tlb_64.c @@ -94,8 +94,11 @@ static void pte_free_submit(struct pte_f void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) { - /* This is safe since tlb_gather_mmu has disabled preemption */ - cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); + /* + * This is safe since tlb_gather_mmu has disabled preemption. + * tlb->cpu is set by tlb_gather_mmu as well. + */ + cpumask_t local_cpumask = cpumask_of_cpu(tlb->cpu); struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); if (atomic_read(&tlb->mm->mm_users) < 2 || patches/rcu-various-fixups.patch0000664000077200007720000000516510646635216016355 0ustar mingomingo--- net/ipv4/multipath_wrandom.c | 2 ++ security/selinux/avc.c | 9 +++++++++ security/selinux/netif.c | 2 ++ 3 files changed, 13 insertions(+) Index: linux-rt.q/net/ipv4/multipath_wrandom.c =================================================================== --- linux-rt.q.orig/net/ipv4/multipath_wrandom.c +++ linux-rt.q/net/ipv4/multipath_wrandom.c @@ -289,6 +289,7 @@ static void wrandom_flush(void) for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) { struct multipath_route *r; + rcu_read_lock(); spin_lock_bh(&state[i].lock); list_for_each_entry_rcu(r, &state[i].head, list) { struct multipath_dest *d; @@ -303,6 +304,7 @@ static void wrandom_flush(void) } spin_unlock_bh(&state[i].lock); + rcu_read_unlock(); } } Index: linux-rt.q/security/selinux/avc.c =================================================================== --- linux-rt.q.orig/security/selinux/avc.c +++ linux-rt.q/security/selinux/avc.c @@ -312,6 +312,7 @@ static inline int avc_reclaim_node(void) if (!spin_trylock_irqsave(&avc_cache.slots_lock[hvalue], flags)) 
continue; + rcu_read_lock(); list_for_each_entry(node, &avc_cache.slots[hvalue], list) { if (atomic_dec_and_test(&node->ae.used)) { /* Recently Unused */ @@ -319,11 +320,13 @@ static inline int avc_reclaim_node(void) avc_cache_stats_incr(reclaims); ecx++; if (ecx >= AVC_CACHE_RECLAIM) { + rcu_read_unlock(); spin_unlock_irqrestore(&avc_cache.slots_lock[hvalue], flags); goto out; } } } + rcu_read_unlock(); spin_unlock_irqrestore(&avc_cache.slots_lock[hvalue], flags); } out: @@ -806,8 +809,14 @@ int avc_ss_reset(u32 seqno) for (i = 0; i < AVC_CACHE_SLOTS; i++) { spin_lock_irqsave(&avc_cache.slots_lock[i], flag); + /* + * On -rt the outer spinlock does not prevent RCU + * from being performed: + */ + rcu_read_lock(); list_for_each_entry(node, &avc_cache.slots[i], list) avc_node_delete(node); + rcu_read_unlock(); spin_unlock_irqrestore(&avc_cache.slots_lock[i], flag); } Index: linux-rt.q/security/selinux/netif.c =================================================================== --- linux-rt.q.orig/security/selinux/netif.c +++ linux-rt.q/security/selinux/netif.c @@ -209,6 +209,7 @@ static void sel_netif_flush(void) { int idx; + rcu_read_lock(); spin_lock_bh(&sel_netif_lock); for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) { struct sel_netif *netif; @@ -217,6 +218,7 @@ static void sel_netif_flush(void) sel_netif_destroy(netif); } spin_unlock_bh(&sel_netif_lock); + rcu_read_unlock(); } static int sel_netif_avc_callback(u32 event, u32 ssid, u32 tsid, patches/i386-mark-atomic-irq-ops-raw.patch0000664000077200007720000000113710646635211017622 0ustar mingomingo--- include/asm-i386/atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/include/asm-i386/atomic.h =================================================================== --- linux-rt.q.orig/include/asm-i386/atomic.h +++ linux-rt.q/include/asm-i386/atomic.h @@ -195,10 +195,10 @@ static __inline__ int atomic_add_return( #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ - 
local_irq_save(flags); + raw_local_irq_save(flags); __i = atomic_read(v); atomic_set(v, i + __i); - local_irq_restore(flags); + raw_local_irq_restore(flags); return i + __i; #endif } patches/x86_64-untangle-asm-hpeth-from-asm-timexh.patch0000664000077200007720000000451710646635210022224 0ustar mingomingoSubject: x86_64: Untangle asm/hpet.h from asm/timex.h From: Chris Wright When making changes to x86_64 timers, I noticed that touching hpet.h triggered an unreasonably large rebuild. Untangling it from timex.h quiets the extra rebuild quite a bit. Signed-off-by: Chris Wright Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: john stultz --- drivers/char/rtc.c | 2 +- include/asm-x86_64/apic.h | 2 ++ include/asm-x86_64/hpet.h | 1 - include/asm-x86_64/timex.h | 1 - 4 files changed, 3 insertions(+), 3 deletions(-) Index: linux-rt.q/drivers/char/rtc.c =================================================================== --- linux-rt.q.orig/drivers/char/rtc.c +++ linux-rt.q/drivers/char/rtc.c @@ -82,7 +82,7 @@ #include #include -#if defined(__i386__) +#ifdef CONFIG_X86 #include #endif Index: linux-rt.q/include/asm-x86_64/apic.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/apic.h +++ linux-rt.q/include/asm-x86_64/apic.h @@ -86,6 +86,8 @@ extern void setup_apic_routing(void); extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, unsigned char msg_type, unsigned char mask); +extern int apic_is_clustered_box(void); + #define K8_APIC_EXT_LVT_BASE 0x500 #define K8_APIC_EXT_INT_MSG_FIX 0x0 #define K8_APIC_EXT_INT_MSG_SMI 0x2 Index: linux-rt.q/include/asm-x86_64/hpet.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/hpet.h +++ linux-rt.q/include/asm-x86_64/hpet.h @@ -55,7 +55,6 @@ extern int is_hpet_enabled(void); extern int hpet_rtc_timer_init(void); -extern int apic_is_clustered_box(void); extern int hpet_arch_init(void); 
extern int hpet_timer_stop_set_go(unsigned long tick); extern int hpet_reenable(void); Index: linux-rt.q/include/asm-x86_64/timex.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/timex.h +++ linux-rt.q/include/asm-x86_64/timex.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include patches/preempt-realtime-rcu.patch0000664000077200007720000000222010646635215016611 0ustar mingomingo--- kernel/rcupreempt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) Index: linux-rt.q/kernel/rcupreempt.c =================================================================== --- linux-rt.q.orig/kernel/rcupreempt.c +++ linux-rt.q/kernel/rcupreempt.c @@ -55,7 +55,7 @@ */ struct rcu_data { - spinlock_t lock; + raw_spinlock_t lock; long completed; /* Number of last completed batch. */ struct rcu_head *nextlist; struct rcu_head **nexttail; @@ -68,12 +68,12 @@ struct rcu_data { #endif /* #ifdef CONFIG_RCU_TRACE */ }; struct rcu_ctrlblk { - spinlock_t fliplock; + raw_spinlock_t fliplock; long completed; /* Number of last completed batch. 
*/ }; static struct rcu_data rcu_data; static struct rcu_ctrlblk rcu_ctrlblk = { - .fliplock = SPIN_LOCK_UNLOCKED, + .fliplock = RAW_SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, }; static DEFINE_PER_CPU(atomic_t [2], rcu_flipctr) = @@ -348,7 +348,7 @@ int rcu_needs_cpu(int cpu) return !!rcu_data.waitlist || rcu_pending(cpu); } -int rcu_pending(int cpu) +int notrace rcu_pending(int cpu) { return (rcu_data.donelist != NULL || rcu_data.waitlist != NULL || patches/x86_64-apic-add-clockevents-functions.patch0000664000077200007720000000730410646635211021474 0ustar mingomingoSubject: x86_64: Add (not yet used) clock event functions Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/Kconfig | 6 +++ arch/x86_64/kernel/apic.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) Index: linux-rt.q/arch/x86_64/Kconfig =================================================================== --- linux-rt.q.orig/arch/x86_64/Kconfig +++ linux-rt.q/arch/x86_64/Kconfig @@ -28,6 +28,10 @@ config GENERIC_TIME bool default y +config GENERIC_CLOCKEVENTS_MIGR + bool + default y + config GENERIC_TIME_VSYSCALL bool default y @@ -130,6 +134,8 @@ source "init/Kconfig" menu "Processor type and features" +source "kernel/time/Kconfig" + choice prompt "Subarchitecture Type" default X86_PC Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,77 @@ static struct resource lapic_resource = static unsigned int calibration_result; +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); + +static void lapic_timer_broadcast(cpumask_t mask); + +static void 
__setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen); + +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write(APIC_TMICT, delta); + return 0; +} + +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + unsigned int v; + + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return; + + local_irq_save(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; + } + + local_irq_restore(flags); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(cpumask_t mask) +{ +#ifdef CONFIG_SMP + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#endif +} + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -867,6 +939,13 @@ static void __init calibrate_APIC_clock( printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", result / 1000 / 1000, result / 1000 % 1000); + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + 
lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + calibration_result = result / HZ; } patches/highmem-redo-mainline.patch0000664000077200007720000000100610646635216016707 0ustar mingomingo--- mm/highmem.c | 8 ++++++++ 1 file changed, 8 insertions(+) Index: linux-rt.q/mm/highmem.c =================================================================== --- linux-rt.q.orig/mm/highmem.c +++ linux-rt.q/mm/highmem.c @@ -209,6 +209,14 @@ static unsigned long pkmap_insert(struct return vaddr; } +/* + * Flush all unused kmap mappings in order to remove stray mappings. + */ +void kmap_flush_unused(void) +{ + WARN_ON_ONCE(1); +} + fastcall void *kmap_high(struct page *page) { unsigned long vaddr; patches/preempt-realtime-debug-sysctl.patch0000664000077200007720000000765410646635215020445 0ustar mingomingo--- drivers/char/sysrq.c | 18 ++++++++++++++- drivers/char/tty_io.c | 1 kernel/panic.c | 1 kernel/sysctl.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) Index: linux-rt.q/drivers/char/sysrq.c =================================================================== --- linux-rt.q.orig/drivers/char/sysrq.c +++ linux-rt.q/drivers/char/sysrq.c @@ -208,6 +208,22 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; +#if defined(__i386__) + +static void sysrq_handle_showallregs(int key, struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; +#else +#define sysrq_showallregs_op (*(struct sysrq_key_op *)0) +#endif + static void sysrq_handle_showstate(int key, struct tty_struct *tty) { show_state(); @@ -340,7 +356,7 @@ static struct sysrq_key_op *sysrq_key_ta &sysrq_kill_op, /* i */ NULL, /* j */ &sysrq_SAK_op, /* k */ - NULL, /* l */ + &sysrq_showallregs_op, /* l */ &sysrq_showmem_op, /* m */ &sysrq_unrt_op, /* n 
*/ /* o: This will often be registered as 'Off' at init time */ Index: linux-rt.q/drivers/char/tty_io.c =================================================================== --- linux-rt.q.orig/drivers/char/tty_io.c +++ linux-rt.q/drivers/char/tty_io.c @@ -257,6 +257,7 @@ static int check_tty_count(struct tty_st printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif Index: linux-rt.q/kernel/panic.c =================================================================== --- linux-rt.q.orig/kernel/panic.c +++ linux-rt.q/kernel/panic.c @@ -80,6 +80,7 @@ NORET_TYPE void panic(const char * fmt, vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + dump_stack(); bust_spinlocks(0); /* Index: linux-rt.q/kernel/sysctl.c =================================================================== --- linux-rt.q.orig/kernel/sysctl.c +++ linux-rt.q/kernel/sysctl.c @@ -293,6 +293,54 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = KERN_PANIC, + .procname = "prof_pid", + .data = &prof_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_PREEMPT + { + .ctl_name = KERN_PANIC, + .procname = "kernel_preemption", + .data = &kernel_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY + { + .ctl_name = KERN_PANIC, + .procname = "voluntary_preemption", + .data = &voluntary_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = KERN_PANIC, + .procname = "softirq_preemption", + .data = &softirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_HARDIRQS) && 
!defined(CONFIG_PREEMPT_RT) + { + .ctl_name = KERN_PANIC, + .procname = "hardirq_preemption", + .data = &hardirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_WAKEUP_TIMING { .ctl_name = CTL_UNNUMBERED, @@ -427,6 +475,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_GENERIC_HARDIRQS + { + .ctl_name = KERN_PANIC, + .procname = "debug_direct_keyboard", + .data = &debug_direct_keyboard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", patches/preempt-irqs-hrtimer.patch0000664000077200007720000001023610646635213016652 0ustar mingomingo include/linux/hrtimer.h | 10 ++++++++++ kernel/hrtimer.c | 35 ++++++++++++++++++++++++++++++++++- kernel/itimer.c | 1 + kernel/posix-timers.c | 3 +++ 4 files changed, 48 insertions(+), 1 deletion(-) Index: linux-rt.q/include/linux/hrtimer.h =================================================================== --- linux-rt.q.orig/include/linux/hrtimer.h +++ linux-rt.q/include/linux/hrtimer.h @@ -200,6 +200,9 @@ struct hrtimer_cpu_base { struct list_head cb_pending; unsigned long nr_events; #endif +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; +#endif }; #ifdef CONFIG_HIGH_RES_TIMERS @@ -277,6 +280,13 @@ static inline int hrtimer_restart(struct return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); Index: linux-rt.q/kernel/hrtimer.c =================================================================== --- 
linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -922,7 +922,7 @@ int hrtimer_cancel(struct hrtimer *timer if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1033,6 +1033,32 @@ int hrtimer_get_res(const clockid_t whic } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_PREEMPT_SOFTIRQS +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait for a running timer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. + */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1168,6 +1194,8 @@ static void run_hrtimer_softirq(struct s } } spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1218,6 +1246,8 @@ static inline void run_hrtimer_queue(str } } spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); } /* @@ -1393,6 +1423,9 @@ static void __devinit init_hrtimers_cpu( cpu_base->clock_base[i].cpu_base = cpu_base; hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU Index: linux-rt.q/kernel/itimer.c =================================================================== --- linux-rt.q.orig/kernel/itimer.c +++ linux-rt.q/kernel/itimer.c @@ -170,6 +170,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + 
hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); Index: linux-rt.q/kernel/posix-timers.c =================================================================== --- linux-rt.q.orig/kernel/posix-timers.c +++ linux-rt.q/kernel/posix-timers.c @@ -805,6 +805,7 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + hrtimer_wait_for_timer(&timr->it.real.timer); rtn = NULL; // We already got the old time... goto retry; } @@ -844,6 +845,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } @@ -876,6 +878,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } list_del(&timer->list); patches/arm-imx.patch0000664000077200007720000001173410646635211014126 0ustar mingomingo--- arch/arm/mach-imx/time.c | 121 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 107 insertions(+), 14 deletions(-) Index: linux-rt.q/arch/arm/mach-imx/time.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-imx/time.c +++ linux-rt.q/arch/arm/mach-imx/time.c @@ -3,6 +3,7 @@ * * Copyright (C) 2000-2001 Deep Blue Solutions * Copyright (C) 2002 Shane Nay (shane@minirl.com) + * Copyright (C) 2006-2007 Pavel Pisa (ppisa@pikron.com) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -15,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,7 +27,8 @@ /* Use timer 1 as system timer */ #define TIMER_BASE IMX_TIM1_BASE -static unsigned long evt_diff; +static struct clock_event_device clockevent_imx; +static enum clock_event_mode clockevent_mode = CLOCK_EVT_MODE_UNUSED; /* * IRQ handler for the timer @@ -33,25 +36,20 @@ static unsigned long evt_diff; static 
irqreturn_t imx_timer_interrupt(int irq, void *dev_id) { + struct clock_event_device *evt = &clockevent_imx; uint32_t tstat; + irqreturn_t ret = IRQ_NONE; /* clear the interrupt */ tstat = IMX_TSTAT(TIMER_BASE); IMX_TSTAT(TIMER_BASE) = 0; if (tstat & TSTAT_COMP) { - do { - - write_seqlock(&xtime_lock); - timer_tick(); - write_sequnlock(&xtime_lock); - IMX_TCMP(TIMER_BASE) += evt_diff; - - } while (unlikely((int32_t)(IMX_TCMP(TIMER_BASE) - - IMX_TCN(TIMER_BASE)) < 0)); + evt->event_handler(evt); + ret = IRQ_HANDLED; } - return IRQ_HANDLED; + return ret; } static struct irqaction imx_timer_irq = { @@ -70,10 +68,8 @@ static void __init imx_timer_hardware_in */ IMX_TCTL(TIMER_BASE) = 0; IMX_TPRER(TIMER_BASE) = 0; - IMX_TCMP(TIMER_BASE) = LATCH - 1; - IMX_TCTL(TIMER_BASE) = TCTL_FRR | TCTL_CLK_PCLK1 | TCTL_IRQEN | TCTL_TEN; - evt_diff = LATCH; + IMX_TCTL(TIMER_BASE) = TCTL_FRR | TCTL_CLK_PCLK1 | TCTL_TEN; } cycle_t imx_get_cycles(void) @@ -99,11 +95,108 @@ static int __init imx_clocksource_init(v return 0; } +static int imx_set_next_event(unsigned long evt, + struct clock_event_device *unused) +{ + unsigned long tcmp; + + tcmp = IMX_TCN(TIMER_BASE) + evt; + IMX_TCMP(TIMER_BASE) = tcmp; + + return (int32_t)(tcmp - IMX_TCN(TIMER_BASE)) < 0 ? 
-ETIME : 0; +} + +#ifdef DEBUG +static const char *clock_event_mode_label[]={ + [CLOCK_EVT_MODE_PERIODIC] = "CLOCK_EVT_MODE_PERIODIC", + [CLOCK_EVT_MODE_ONESHOT] = "CLOCK_EVT_MODE_ONESHOT", + [CLOCK_EVT_MODE_SHUTDOWN] = "CLOCK_EVT_MODE_SHUTDOWN", + [CLOCK_EVT_MODE_UNUSED] = "CLOCK_EVT_MODE_UNUSED" +}; +#endif /*DEBUG*/ + +static void imx_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) +{ + unsigned long flags; + + /* + * The timer interrupt generation is disabled at least + * for enough time to call imx_set_next_event() + */ + local_irq_save(flags); + /* Disable interrupt in GPT module */ + IMX_TCTL(TIMER_BASE) &= ~TCTL_IRQEN; + if (mode != clockevent_mode) { + /* Set event time into far-far future */ + IMX_TCMP(TIMER_BASE) = IMX_TCN(TIMER_BASE) - 3; + /* Clear pending interrupt */ + IMX_TSTAT(TIMER_BASE) &= ~TSTAT_COMP; + } + +#ifdef DEBUG + printk(KERN_INFO "imx_set_mode: changing mode from %s to %s\n", + clock_event_mode_label[clockevent_mode], clock_event_mode_label[mode]); +#endif /*DEBUG*/ + + /* Remember timer mode */ + clockevent_mode = mode; + local_irq_restore(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + printk(KERN_ERR "imx_set_mode: Periodic mode is not supported for i.MX\n"); + break; + case CLOCK_EVT_MODE_ONESHOT: + /* + * Do not put overhead of interrupt enable/disable into + * imx_set_next_event(), the core has about 4 minutes + * to call imx_set_next_event() or shutdown clock after + * mode switching + */ + local_irq_save(flags); + IMX_TCTL(TIMER_BASE) |= TCTL_IRQEN; + local_irq_restore(flags); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + /* Left event sources disabled, no more interrupts appears */ + break; + } +} + +static struct clock_event_device clockevent_imx = { + .name = "imx_timer1", + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 32, + .set_mode = imx_set_mode, + .set_next_event = imx_set_next_event, + .rating = 200, +}; + +static int __init imx_clockevent_init(void) +{ + 
clockevent_imx.mult = div_sc(imx_get_perclk1(), NSEC_PER_SEC, + clockevent_imx.shift); + clockevent_imx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_imx); + clockevent_imx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_imx); + + clockevent_imx.cpumask = cpumask_of_cpu(0); + + clockevents_register_device(&clockevent_imx); + + return 0; +} + + static void __init imx_timer_init(void) { imx_timer_hardware_init(); imx_clocksource_init(); + imx_clockevent_init(); + /* * Make irqs happen for the system timer */ patches/lockstat_class_name.patch0000664000077200007720000000240110646635217016562 0ustar mingomingoSubject: lockstat: better class name representation optionally add class->name_server and class->subclass to the class name Signed-off-by: Peter Zijlstra --- kernel/lockdep_proc.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) Index: linux-rt.q/kernel/lockdep_proc.c =================================================================== --- linux-rt.q.orig/kernel/lockdep_proc.c +++ linux-rt.q/kernel/lockdep_proc.c @@ -419,8 +419,30 @@ static void seq_stats(struct seq_file *m class = data->class; stats = &data->stats; - snprintf(name, 38, "%s", class->name); + namelen = 38; + if (class->name_version > 1) + namelen -= 2; /* XXX */ + if (class->subclass) + namelen -= 2; + + if (!class->name) { + char str[KSYM_NAME_LEN]; + const char *key_name; + + key_name = __get_key_name(class->key, str); + snprintf(name, namelen, "%s", key_name); + } else { + snprintf(name, namelen, "%s", class->name); + } namelen = strlen(name); + if (class->name_version > 1) { + snprintf(name+namelen, 3, "#%d", class->name_version); + namelen += 2; + } + if (class->subclass) { + snprintf(name+namelen, 3, "/%d", class->subclass); + namelen += 2; + } if (stats->write_holdtime.nr) { if (stats->read_holdtime.nr) patches/random-driver-latency-fix.patch0000664000077200007720000000171510646635213017546 0ustar mingomingo drivers/char/random.c | 10 
+++++----- 1 file changed, 5 insertions(+), 5 deletions(-) Index: linux-rt.q/drivers/char/random.c =================================================================== --- linux-rt.q.orig/drivers/char/random.c +++ linux-rt.q/drivers/char/random.c @@ -580,8 +580,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -626,9 +629,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, patches/i386-pit-stop-only-when-in-periodic-or-oneshot-mode.patch0000664000077200007720000000204510646635210024147 0ustar mingomingoFrom: Thomas Gleixner The patch is necessary on one of my boxen, where programming the stop sequence twice leads to PIT malfunction. Sigh ! 
Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/i8253.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) Index: linux-rt.q/arch/i386/kernel/i8253.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/i8253.c +++ linux-rt.q/arch/i386/kernel/i8253.c @@ -47,9 +47,12 @@ static void init_pit_timer(enum clock_ev case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: - outb_p(0x30, PIT_MODE); - outb_p(0, PIT_CH0); /* LSB */ - outb_p(0, PIT_CH0); /* MSB */ + if (evt->mode == CLOCK_EVT_MODE_PERIODIC || + evt->mode == CLOCK_EVT_MODE_ONESHOT) { + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } break; case CLOCK_EVT_MODE_ONESHOT: patches/ppc-mcount-dummy-functions.patch0000664000077200007720000000247210646635212020000 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Mon May 14 17:16:37 2007 Date: Mon, 14 May 2007 17:16:37 +0900 From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [patch 2/5] powerpc 2.6.21-rt1: dummy functions and export _mcount to compile add dummy functions save_stack_trace(), early_printk() for now and export _mcount to compile. Signed-off-by: Tsutomu OWA -- owa --- arch/powerpc/kernel/setup_64.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) Index: linux-rt.q/arch/powerpc/kernel/setup_64.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/setup_64.c +++ linux-rt.q/arch/powerpc/kernel/setup_64.c @@ -605,3 +605,22 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif /* CONFIG_PPC_INDIRECT_IO */ +#ifdef CONFIG_STACKTRACE +#include +void notrace save_stack_trace(struct stack_trace *trace, + struct task_struct *task) +{ +} +#endif /* CONFIG_STACKTRACE */ + +#ifdef CONFIG_EARLY_PRINTK +void notrace early_printk(const char *fmt, ...) 
+{ + BUG(); +} +#endif /* CONFIG_EARLY_PRINTK */ + +#ifdef CONFIG_MCOUNT +extern void _mcount(void); +EXPORT_SYMBOL(_mcount); +#endif /* CONFIG_MCOUNT */ patches/preempt-realtime-ipc.patch0000664000077200007720000000565710646635215016614 0ustar mingomingo--- ipc/mqueue.c | 5 +++++ ipc/msg.c | 25 +++++++++++++++++++------ ipc/sem.c | 6 ++++++ 3 files changed, 30 insertions(+), 6 deletions(-) Index: linux-rt.q/ipc/mqueue.c =================================================================== --- linux-rt.q.orig/ipc/mqueue.c +++ linux-rt.q/ipc/mqueue.c @@ -783,12 +783,17 @@ static inline void pipelined_send(struct struct msg_msg *message, struct ext_wait_queue *receiver) { + /* + * Keep them in one critical section for PREEMPT_RT: + */ + preempt_disable(); receiver->msg = message; list_del(&receiver->list); receiver->state = STATE_PENDING; wake_up_process(receiver->task); smp_wmb(); receiver->state = STATE_READY; + preempt_enable(); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() Index: linux-rt.q/ipc/msg.c =================================================================== --- linux-rt.q.orig/ipc/msg.c +++ linux-rt.q/ipc/msg.c @@ -215,12 +215,19 @@ static void expunge_all(struct msg_queue while (tmp != &msq->q_receivers) { struct msg_receiver *msr; + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. (on PREEMPT_RT) + */ + preempt_disable(); + msr = list_entry(tmp, struct msg_receiver, r_list); tmp = tmp->next; msr->r_msg = NULL; - wake_up_process(msr->r_tsk); - smp_mb(); + wake_up_process(msr->r_tsk); /* serializes */ msr->r_msg = ERR_PTR(res); + + preempt_enable(); } } @@ -605,22 +612,28 @@ static inline int pipelined_send(struct !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { + /* + * Make sure that the wakeup doesnt preempt + * this CPU prematurely. 
(on PREEMPT_RT) + */ + preempt_disable(); + list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { msr->r_msg = NULL; - wake_up_process(msr->r_tsk); - smp_mb(); + wake_up_process(msr->r_tsk); /* serializes */ msr->r_msg = ERR_PTR(-E2BIG); } else { msr->r_msg = NULL; msq->q_lrpid = msr->r_tsk->pid; msq->q_rtime = get_seconds(); - wake_up_process(msr->r_tsk); - smp_mb(); + wake_up_process(msr->r_tsk); /* serializes */ msr->r_msg = msg; + preempt_enable(); return 1; } + preempt_enable(); } } return 0; Index: linux-rt.q/ipc/sem.c =================================================================== --- linux-rt.q.orig/ipc/sem.c +++ linux-rt.q/ipc/sem.c @@ -414,6 +414,11 @@ static void update_queue (struct sem_arr if (error <= 0) { struct sem_queue *n; remove_from_queue(sma,q); + /* + * make sure that the wakeup doesnt preempt + * _this_ cpu prematurely. (on preempt_rt) + */ + preempt_disable(); q->status = IN_WAKEUP; /* * Continue scanning. The next operation @@ -436,6 +441,7 @@ static void update_queue (struct sem_arr */ smp_wmb(); q->status = error; + preempt_enable(); q = n; } else { q = q->next; patches/disable-gtod-functions-if-gtod-is-not-there.patch0000664000077200007720000001323710646635216022770 0ustar mingomingoFrom matsu@igel.co.jp Mon Jun 18 18:14:51 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=none autolearn=unavailable version=3.1.7-deb Received: from mailhost.igel.co.jp (mailhost.igel.co.jp [219.106.231.130]) by mail.tglx.de (Postfix) with ESMTP id 7B29365C065 for ; Mon, 18 Jun 2007 18:14:51 +0200 (CEST) Received: from localhost (vpn1.hq.igel.co.jp [192.168.1.113]) by mailhost.igel.co.jp (Postfix) with ESMTP id 2115968069; Tue, 19 Jun 2007 01:14:42 +0900 (JST) Date: Tue, 19 Jun 2007 01:14:40 +0900 (JST) Message-Id: <20070619.011440.260207451.matsu@igel.co.jp> To: tglx@linutronix.de Cc: nelsoneci@gmail.com, mingo@elte.hu, linux-kernel@vger.kernel.org, 
linux-rt-users@vger.kernel.org Subject: Re: v2.6.21.4-rt11 From: Katsuya MATSUBARA In-Reply-To: <1182099558.8176.436.camel@chaos> References: <1182098605.8176.422.camel@chaos> <2accc2ff0706170949j26c391aek7ed32e0e55d9a3d1@mail.gmail.com> <1182099558.8176.436.camel@chaos> X-Mailer: Mew version 5.2 on Emacs 21.3.50 / Mule 5.0 (SAKAKI) Mime-Version: 1.0 Content-Type: Text/Plain; charset=us-ascii X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit From: Thomas Gleixner Date: Sun, 17 Jun 2007 18:59:18 +0200 > On Sun, 2007-06-17 at 11:49 -0500, Nelson Castillo wrote: > > > > There are many choices and > > > > I don't know what is the more friendly. By friendly I mean the one that > > > > is likely to be merged and that cooperate with you. > > > > > > Which choices do you mean ? > > > > I mean implementations. I've seen lot of them but i don't know which one > > to try (I'm new to RT and the implementation in this thread seems to > > be very nice). > > Thanks :) > > > > > http://people.redhat.com/mingo/realtime-preempt/patch-2.6.21.4-rt14 > > > > > > > > : undefined reference to `usecs_to_cycles' > > > > make: *** [.tmp_vmlinux1] Error 1 > > > > > > Which ARM sub arch ? > > > > sub arch AT91 -- (Atmel AT91RM9200 processor). > > It lacks support for the generic timeofday and clock event layers, which > causes the compile breakage. I am working on Renesas SuperH platforms. I faced the similar compile errors because 2.6.21.X in SH does not support GENERIC_TIME yet. I made a workaround patch. Is this correct? 
Thanks, --- Katsuya Matsubara @ Igel Co., Ltd matsu@igel.co.jp diff -cr linux-2.6.21.5-rt14/kernel/hrtimer.c linux-2.6.21.5-rt14-nogt/kernel/hrtimer.c --- kernel/hrtimer.c | 4 ++++ kernel/time.c | 4 ++++ kernel/time/ntp.c | 4 ++++ kernel/time/timekeeping.c | 2 ++ kernel/timer.c | 4 ++++ 5 files changed, 18 insertions(+) Index: linux-rt.q/kernel/hrtimer.c =================================================================== --- linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -120,9 +120,13 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqbegin(&xtime_lock); +#ifdef CONFIG_GENERIC_TIME *ts = xtime; nsecs = __get_nsec_offset(); timespec_add_ns(ts, nsecs); +#else + getnstimeofday(ts); +#endif tomono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); Index: linux-rt.q/kernel/time.c =================================================================== --- linux-rt.q.orig/kernel/time.c +++ linux-rt.q/kernel/time.c @@ -137,7 +137,9 @@ static inline void warp_clock(void) wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; time_interpolator_reset(); +#ifdef CONFIG_GENERIC_TIME warp_check_clock_was_changed(); +#endif write_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -352,7 +354,9 @@ int do_settimeofday (struct timespec *tv time_esterror = NTP_PHASE_LIMIT; time_interpolator_reset(); } +#ifdef CONFIG_GENERIC_TIME warp_check_clock_was_changed(); +#endif write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; Index: linux-rt.q/kernel/time/ntp.c =================================================================== --- linux-rt.q.orig/kernel/time/ntp.c +++ linux-rt.q/kernel/time/ntp.c @@ -123,7 +123,9 @@ void second_overflow(void) */ time_interpolator_update(-NSEC_PER_SEC); time_state = TIME_OOP; +#ifdef CONFIG_GENERIC_TIME warp_check_clock_was_changed(); +#endif printk(KERN_NOTICE "Clock: inserting leap second " "23:59:60 UTC\n"); } @@ -138,7 +140,9 @@ void second_overflow(void) */ 
time_interpolator_update(NSEC_PER_SEC); time_state = TIME_WAIT; +#ifdef CONFIG_GENERIC_TIME warp_check_clock_was_changed(); +#endif printk(KERN_NOTICE "Clock: deleting leap second " "23:59:59 UTC\n"); } Index: linux-rt.q/kernel/time/timekeeping.c =================================================================== --- linux-rt.q.orig/kernel/time/timekeeping.c +++ linux-rt.q/kernel/time/timekeeping.c @@ -352,7 +352,9 @@ static int timekeeping_resume(struct sys clock->cycle_accumulated = 0; clock->error = 0; timekeeping_suspended = 0; +#ifdef CONFIG_GENERIC_TIME warp_check_clock_was_changed(); +#endif write_sequnlock_irqrestore(&xtime_lock, flags); touch_softlockup_watchdog(); Index: linux-rt.q/kernel/timer.c =================================================================== --- linux-rt.q.orig/kernel/timer.c +++ linux-rt.q/kernel/timer.c @@ -1225,9 +1225,13 @@ int do_sysinfo(struct sysinfo *info) * too. */ +#ifdef CONFIG_GENERIC_TIME tp = xtime; nsecs = __get_nsec_offset(); timespec_add_ns(&tp, nsecs); +#else + getnstimeofday(&tp); +#endif tp.tv_sec += wall_to_monotonic.tv_sec; tp.tv_nsec += wall_to_monotonic.tv_nsec; patches/barrier.patch0000664000077200007720000000451010646635216014201 0ustar mingomingoSubject: barrier - a scalable synchronisation barrier This barrier thing is constructed so that it will not write in the sync() condition (the hot path) when there are no active lock sections; thus avoiding cacheline bouncing. -- I'm just not sure how this will work out in relation to PI. We might track those in the barrier scope and boost those by the max prio of the blockers.
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/barrier.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) Index: linux-rt.q/include/linux/barrier.h =================================================================== --- /dev/null +++ linux-rt.q/include/linux/barrier.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2006, Red Hat, Inc., Peter Zijlstra + * Licenced under the GPLv2. + * + * simple synchonisation barrier + * + * The sync() operation will wait for completion of all lock sections if any. + * + * The lock sections are intended to be rare and the sync operation frequent. + * This construct is created to be scalable and does only 1 read in the fast + * path (sync), hence avoiding cacheline bounces. + * + * NOTE: it _synchronisation_ only, so if there are serialisation requirements + * those must be met by something external to this construct. + */ +#ifndef _LINUX_BARRIER_H +#define _LINUX_BARRIER_H + +#ifdef __KERNEL__ + +#include +#include +#include + +struct barrier { + atomic_t count; + wait_queue_head_t wait; +}; + +static inline void init_barrier(struct barrier *b) +{ + atomic_set(&b->count, 0); + init_waitqueue_head(&b->wait); + __acquire(b); +} + +static inline void barrier_lock(struct barrier *b) +{ + __release(b); + atomic_inc(&b->count); + smp_wmb(); +} + +static inline void barrier_unlock(struct barrier *b) +{ + smp_wmb(); + if (atomic_dec_and_test(&b->count)) + __wake_up(&b->wait, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 0, b); +} + +static inline void barrier_sync(struct barrier *b) +{ + might_sleep(); + + if (unlikely(atomic_read(&b->count))) { + DEFINE_WAIT(wait); + prepare_to_wait(&b->wait, &wait, TASK_UNINTERRUPTIBLE); + while (atomic_read(&b->count)) + schedule(); + finish_wait(&b->wait, &wait); + } +} + +#endif /* __KERNEL__ */ +#endif /* _LINUX_BARRIER_H */ patches/preempt-irqs-ppc-fix-b5.patch0000664000077200007720000000314110646635214017050 0ustar mingomingo To fix the following 
boot time error by removing ack member added by the rt patch. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Processor 1 found. Brought up 2 CPUs ------------[ cut here ]------------ kernel BUG at arch/powerpc/platforms/cell/interrupt.c:86! pu 0x1: Vector: 700 (Program Check) at [c00000000fff3c80] pc: c000000000033f9c: .iic_eoi+0x58/0x64 lr: c00000000009add8: .handle_percpu_irq+0xd4/0xf4 sp: c00000000fff3f00 msr: 9000000000021032 current = 0xc000000000fee040 paca = 0xc000000000509e80 pid = 0, comm = swapper kernel BUG at arch/powerpc/platforms/cell/interrupt.c:86! enter ? for help [link register ] c00000000009add8 .handle_percpu_irq+0xd4/0xf4 [c00000000fff3f00] c00000000009ada8 .handle_percpu_irq+0xa4/0xf4 (unreliable) [c00000000fff3f90] c000000000023bb8 .call_handle_irq+0x1c/0x2c [c000000000ff7950] c00000000000c910 .do_IRQ+0xf8/0x1b8 [c000000000ff79f0] c000000000034f34 .cbe_system_reset_exception+0x74/0xb4 [c000000000ff7a70] c000000000022610 .system_reset_exception+0x40/0xe0 [c000000000ff7af0] c000000000003378 system_reset_common+0xf8/0x100 --- arch/powerpc/platforms/cell/interrupt.c | 1 - 1 file changed, 1 deletion(-) Index: linux-rt.q/arch/powerpc/platforms/cell/interrupt.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/cell/interrupt.c +++ linux-rt.q/arch/powerpc/platforms/cell/interrupt.c @@ -90,7 +90,6 @@ static struct irq_chip iic_chip = { .typename = " CELL-IIC ", .mask = iic_mask, .unmask = iic_unmask, - .ack = iic_eoi, .eoi = iic_eoi, }; patches/latency-tracing-ppc.patch0000664000077200007720000001051410646635212016414 0ustar mingomingo arch/powerpc/boot/Makefile | 10 +++++++++- arch/powerpc/kernel/time.c | 3 ++- arch/powerpc/mm/fault.c | 4 ++-- arch/ppc/boot/Makefile | 9 +++++++++ arch/ppc/kernel/time.c | 2 +- arch/ppc/mm/fault.c | 2 +- 6 files changed, 24 insertions(+), 6 deletions(-) Index: linux-rt.q/arch/powerpc/boot/Makefile 
=================================================================== --- linux-rt.q.orig/arch/powerpc/boot/Makefile +++ linux-rt.q/arch/powerpc/boot/Makefile @@ -31,6 +31,14 @@ endif BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + $(obj)/44x.o: BOOTCFLAGS += -mcpu=440 $(obj)/ebony.o: BOOTCFLAGS += -mcpu=440 @@ -55,7 +63,7 @@ obj-wlib := $(addsuffix .o, $(basename $ obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat)))) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = sed "s@__attribute_used__@@;s@]*\).*@\"\1\"@" $< > $@ + cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@]*\).*@\"\1\"@" $< > $@ quiet_cmd_copy_zlibheader = COPY $@ cmd_copy_zlibheader = sed "s@]*\).*@\"\1\"@" $< > $@ Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -530,7 +530,7 @@ static __inline__ void timer_recalc_offs } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -906,6 +906,7 @@ void __init time_init(void) tb_ticks_per_jiffy = ppc_tb_freq / HZ; tb_ticks_per_sec = ppc_tb_freq; tb_ticks_per_usec = ppc_tb_freq / 1000000; + cpu_khz = ppc_tb_freq / 1000; tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); calc_cputime_factors(); Index: linux-rt.q/arch/powerpc/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/powerpc/mm/fault.c +++ linux-rt.q/arch/powerpc/mm/fault.c @@ -138,8 +138,8 @@ static void do_dabr(struct pt_regs *regs * The return value is 0 if the fault was handled, or the 
signal * number if this is a kernel fault that can't be handled here. */ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +int __kprobes notrace do_page_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; Index: linux-rt.q/arch/ppc/boot/Makefile =================================================================== --- linux-rt.q.orig/arch/ppc/boot/Makefile +++ linux-rt.q/arch/ppc/boot/Makefile @@ -14,6 +14,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux-rt.q/arch/ppc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/ppc/kernel/time.c +++ linux-rt.q/arch/ppc/kernel/time.c @@ -102,7 +102,7 @@ static inline int tb_delta(unsigned *jif } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); Index: linux-rt.q/arch/ppc/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/ppc/mm/fault.c +++ linux-rt.q/arch/ppc/mm/fault.c @@ -89,7 +89,7 @@ static int store_updates_sp(struct pt_re * the error_code parameter is ESR for a data fault, 0 for an instruction * fault. 
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, +int notrace do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; patches/acpi-move-timer-broadcast-and-pmtimer-access-before-c3-arbiter-shutdown.patch0000664000077200007720000000276210646635210030226 0ustar mingomingoFrom: Udo A. Steinberg The chipset doc for ICH4 tells us: 1.In general, software should not attempt any non-posted accesses during arbiter disable except to the ICH4's power management registers. This implies that interrupt handlers for any unmasked hardware interrupts and SMI/NMI should check ARB_DIS status before reading from ICH devices. So it's not a good idea to access ICH devices after arbiter shut down. Signed-off-by: Udo A. Steinberg Signed-off-by: Thomas Gleixner Cc: Len Brown Signed-off-by: Andrew Morton --- drivers/acpi/processor_idle.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) Index: linux-rt.q/drivers/acpi/processor_idle.c =================================================================== --- linux-rt.q.orig/drivers/acpi/processor_idle.c +++ linux-rt.q/drivers/acpi/processor_idle.c @@ -988,6 +988,12 @@ static int acpi_idle_enter_c3(struct cpu return 0; } + /* + * Must be done before busmaster disable as we might need to + * access HPET ! + */ + acpi_state_timer_broadcast(pr, cx, 1); + /* disable bus master */ if (pr->flags.bm_check) { spin_lock(&c3_lock); @@ -1007,7 +1013,6 @@ static int acpi_idle_enter_c3(struct cpu /* Get start time (ticks) */ t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); - acpi_state_timer_broadcast(pr, cx, 1); acpi_idle_do_entry(cx); t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); patches/preempt-realtime-prevent-idle-boosting.patch0000664000077200007720000000340210646635215022243 0ustar mingomingoSubject: Preempt-RT: Prevent boosting of idle task Idle task boosting is a nono in general.
There is one exception, when NOHZ is active: The idle task calls get_next_timer_interrupt() and holds the timer wheel base->lock on the CPU and another CPU wants to access the timer (probably to cancel it). We can safely ignore the boosting request, as the idle CPU runs this code with interrupts disabled and will complete the lock protected section without being interrupted. So there is no real need to boost. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -4283,6 +4283,25 @@ void rt_mutex_setprio(struct task_struct BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. 
+ */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + now = rq_clock(rq); oldprio = p->prio; @@ -4316,6 +4335,7 @@ void rt_mutex_setprio(struct task_struct } trace_special(prev_resched, _need_resched(), 0); +out_unlock: task_rq_unlock(rq, &flags); } patches/sched-cfs-v2.6.22.1-v19.patch0000664000077200007720000066712210646635210016111 0ustar mingomingo--- Documentation/kernel-parameters.txt | 43 Documentation/sched-design-CFS.txt | 119 + Makefile | 2 arch/i386/kernel/smpboot.c | 12 arch/i386/kernel/tsc.c | 9 arch/ia64/kernel/setup.c | 6 arch/mips/kernel/smp.c | 11 arch/sparc/kernel/smp.c | 10 arch/sparc64/kernel/smp.c | 27 block/cfq-iosched.c | 3 fs/proc/array.c | 59 fs/proc/base.c | 71 include/asm-generic/bitops/sched.h | 21 include/linux/hardirq.h | 13 include/linux/sched.h | 258 ++- include/linux/topology.h | 14 init/main.c | 5 kernel/delayacct.c | 10 kernel/exit.c | 5 kernel/fork.c | 5 kernel/posix-cpu-timers.c | 34 kernel/sched.c | 2980 +++++++++++++----------------------- kernel/sched_debug.c | 276 +++ kernel/sched_fair.c | 1107 +++++++++++++ kernel/sched_idletask.c | 71 kernel/sched_rt.c | 255 +++ kernel/sched_stats.h | 235 ++ kernel/softirq.c | 1 kernel/sysctl.c | 76 kernel/time.c | 27 lib/Kconfig.debug | 9 31 files changed, 3662 insertions(+), 2112 deletions(-) Index: linux-rt.q/Documentation/kernel-parameters.txt =================================================================== --- linux-rt.q.orig/Documentation/kernel-parameters.txt +++ linux-rt.q/Documentation/kernel-parameters.txt @@ -1014,49 +1014,6 @@ and is between 256 and 4096 characters. mga= [HW,DRM] - migration_cost= - [KNL,SMP] debug: override scheduler migration costs - Format: ,,... - This debugging option can be used to override the - default scheduler migration cost matrix. The numbers - are indexed by 'CPU domain distance'. - E.g. 
migration_cost=1000,2000,3000 on an SMT NUMA - box will set up an intra-core migration cost of - 1 msec, an inter-core migration cost of 2 msecs, - and an inter-node migration cost of 3 msecs. - - WARNING: using the wrong values here can break - scheduler performance, so it's only for scheduler - development purposes, not production environments. - - migration_debug= - [KNL,SMP] migration cost auto-detect verbosity - Format=<0|1|2> - If a system's migration matrix reported at bootup - seems erroneous then this option can be used to - increase verbosity of the detection process. - We default to 0 (no extra messages), 1 will print - some more information, and 2 will be really - verbose (probably only useful if you also have a - serial console attached to the system). - - migration_factor= - [KNL,SMP] multiply/divide migration costs by a factor - Format= - This debug option can be used to proportionally - increase or decrease the auto-detected migration - costs for all entries of the migration matrix. - E.g. migration_factor=150 will increase migration - costs by 50%. (and thus the scheduler will be less - eager migrating cache-hot tasks) - migration_factor=80 will decrease migration costs - by 20%. (thus the scheduler will be more eager to - migrate tasks) - - WARNING: using the wrong values here can break - scheduler performance, so it's only for scheduler - development purposes, not production environments. - mousedev.tap_time= [MOUSE] Maximum time between finger touching and leaving touchpad surface for touch to be considered Index: linux-rt.q/Documentation/sched-design-CFS.txt =================================================================== --- /dev/null +++ linux-rt.q/Documentation/sched-design-CFS.txt @@ -0,0 +1,119 @@ + +This is the CFS scheduler. + +80% of CFS's design can be summed up in a single sentence: CFS basically +models an "ideal, precise multi-tasking CPU" on real hardware. 
+ +"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% +physical power and which can run each task at precise equal speed, in +parallel, each at 1/nr_running speed. For example: if there are 2 tasks +running then it runs each at 50% physical power - totally in parallel. + +On real hardware, we can run only a single task at once, so while that +one task runs, the other tasks that are waiting for the CPU are at a +disadvantage - the current task gets an unfair amount of CPU time. In +CFS this fairness imbalance is expressed and tracked via the per-task +p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of +time the task should now run on the CPU for it to become completely fair +and balanced. + +( small detail: on 'ideal' hardware, the p->wait_runtime value would + always be zero - no task would ever get 'out of balance' from the + 'ideal' share of CPU time. ) + +CFS's task picking logic is based on this p->wait_runtime value and it +is thus very simple: it always tries to run the task with the largest +p->wait_runtime value. In other words, CFS tries to run the task with +the 'gravest need' for more CPU time. So CFS always tries to split up +CPU time between runnable tasks as close to 'ideal multitasking +hardware' as possible. + +Most of the rest of CFS's design just falls out of this really simple +concept, with a few add-on embellishments like nice levels, +multiprocessing and various algorithm variants to recognize sleepers. + +In practice it works like this: the system runs a task a bit, and when +the task schedules (or a scheduler tick happens) the task's CPU usage is +'accounted for': the (small) time it just spent using the physical CPU +is deducted from p->wait_runtime. [minus the 'fair share' it would have +gotten anyway]. 
Once p->wait_runtime gets low enough so that another +task becomes the 'leftmost task' of the time-ordered rbtree it maintains +(plus a small amount of 'granularity' distance relative to the leftmost +task so that we do not over-schedule tasks and trash the cache) then the +new leftmost task is picked and the current task is preempted. + +The rq->fair_clock value tracks the 'CPU time a runnable task would have +fairly gotten, had it been runnable during that time'. So by using +rq->fair_clock values we can accurately timestamp and measure the +'expected CPU time' a task should have gotten. All runnable tasks are +sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and +CFS picks the 'leftmost' task and sticks to it. As the system progresses +forwards, newly woken tasks are put into the tree more and more to the +right - slowly but surely giving a chance for every task to become the +'leftmost task' and thus get on the CPU within a deterministic amount of +time. + +Some implementation details: + + - the introduction of Scheduling Classes: an extensible hierarchy of + scheduler modules. These modules encapsulate scheduling policy + details and are handled by the scheduler core without the core + code assuming about them too much. + + - sched_fair.c implements the 'CFS desktop scheduler': it is a + replacement for the vanilla scheduler's SCHED_OTHER interactivity + code. + + I'd like to give credit to Con Kolivas for the general approach here: + he has proven via RSDL/SD that 'fair scheduling' is possible and that + it results in better desktop scheduling. Kudos Con! + + The CFS patch uses a completely different approach and implementation + from RSDL/SD. My goal was to make CFS's interactivity quality exceed + that of RSDL/SD, which is a high standard to meet :-) Testing + feedback is welcome to decide this one way or another. 
[ and, in any + case, all of SD's logic could be added via a kernel/sched_sd.c module + as well, if Con is interested in such an approach. ] + + CFS's design is quite radical: it does not use runqueues, it uses a + time-ordered rbtree to build a 'timeline' of future task execution, + and thus has no 'array switch' artifacts (by which both the vanilla + scheduler and RSDL/SD are affected). + + CFS uses nanosecond granularity accounting and does not rely on any + jiffies or other HZ detail. Thus the CFS scheduler has no notion of + 'timeslices' and has no heuristics whatsoever. There is only one + central tunable: + + /proc/sys/kernel/sched_granularity_ns + + which can be used to tune the scheduler from 'desktop' (low + latencies) to 'server' (good batching) workloads. It defaults to a + setting suitable for desktop workloads. SCHED_BATCH is handled by the + CFS scheduler module too. + + Due to its design, the CFS scheduler is not prone to any of the + 'attacks' that exist today against the heuristics of the stock + scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all + work fine and do not impact interactivity and produce the expected + behavior. + + The CFS scheduler has a much stronger handling of nice levels and + SCHED_BATCH: both types of workloads should be isolated much more + aggressively than under the vanilla scheduler. + + ( another detail: due to nanosec accounting and timeline sorting, + sched_yield() support is very simple under CFS, and in fact under + CFS sched_yield() behaves much better than under any other + scheduler I have tested so far. ) + + - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler + way than the vanilla scheduler does. It uses 100 runqueues (for all + 100 RT priority levels, instead of 140 in the vanilla scheduler) + and it needs no expired array.
+ + - reworked/sanitized SMP load-balancing: the runqueue-walking + assumptions are gone from the load-balancing code now, and + iterators of the scheduling modules are used. The balancing code got + quite a bit simpler as a result. + Index: linux-rt.q/Makefile =================================================================== --- linux-rt.q.orig/Makefile +++ linux-rt.q/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 22 -EXTRAVERSION = .1 +EXTRAVERSION = .1-cfs-v19 NAME = Holy Dancing Manatees, Batman! # *DOCUMENTATION* Index: linux-rt.q/arch/i386/kernel/smpboot.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/smpboot.c +++ linux-rt.q/arch/i386/kernel/smpboot.c @@ -941,17 +941,6 @@ exit: } #endif -static void smp_tune_scheduling(void) -{ - if (cpu_khz) { - /* cache size in kB */ - long cachesize = boot_cpu_data.x86_cache_size; - - if (cachesize > 0) - max_cache_size = cachesize * 1024; - } -} - /* * Cycle through the processors sending APIC IPIs to boot each. */ @@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigne x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; current_thread_info()->cpu = 0; - smp_tune_scheduling(); set_cpu_sibling_map(0); Index: linux-rt.q/arch/i386/kernel/tsc.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/tsc.c +++ linux-rt.q/arch/i386/kernel/tsc.c @@ -4,6 +4,7 @@ * See comments there for proper credits. */ +#include #include #include #include @@ -106,8 +107,13 @@ unsigned long long sched_clock(void) /* * Fall back to jiffies if there's no TSC available: + * ( But note that we still use it if the TSC is marked + * unstable. We do this because unlike Time Of Day, + * the scheduler clock tolerates small errors and it's + * very important for it to be as fast as the platform + * can achieve it.
) */ - if (unlikely(!tsc_enabled)) + if (unlikely(!tsc_enabled && !tsc_unstable)) /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); @@ -277,6 +283,7 @@ static struct clocksource clocksource_ts void mark_tsc_unstable(char *reason) { + sched_clock_unstable_event(); if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; Index: linux-rt.q/arch/ia64/kernel/setup.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/setup.c +++ linux-rt.q/arch/ia64/kernel/setup.c @@ -805,7 +805,6 @@ static void __cpuinit get_max_cacheline_size (void) { unsigned long line_size, max = 1; - unsigned int cache_size = 0; u64 l, levels, unique_caches; pal_cache_config_info_t cci; s64 status; @@ -835,8 +834,6 @@ get_max_cacheline_size (void) line_size = 1 << cci.pcci_line_size; if (line_size > max) max = line_size; - if (cache_size < cci.pcci_cache_size) - cache_size = cci.pcci_cache_size; if (!cci.pcci_unified) { status = ia64_pal_cache_config_info(l, /* cache_type (instruction)= */ 1, @@ -853,9 +850,6 @@ get_max_cacheline_size (void) ia64_i_cache_stride_shift = cci.pcci_stride; } out: -#ifdef CONFIG_SMP - max_cache_size = max(max_cache_size, cache_size); -#endif if (max > ia64_max_cacheline_size) ia64_max_cacheline_size = max; } Index: linux-rt.q/arch/mips/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/smp.c +++ linux-rt.q/arch/mips/kernel/smp.c @@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS]; /* Map EXPORT_SYMBOL(phys_cpu_present_map); EXPORT_SYMBOL(cpu_online_map); -/* This happens early in bootup, can't really do it better */ -static void smp_tune_scheduling (void) -{ - struct cache_desc *cd = &current_cpu_data.scache; - unsigned long cachesize = cd->linesz * cd->sets * cd->ways; - - if (cachesize > max_cache_size) - max_cache_size = cachesize; -} - extern void __init calibrate_delay(void); extern 
ATTRIB_NORET void cpu_idle(void); @@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned in { init_new_context(current, &init_mm); current_thread_info()->cpu = 0; - smp_tune_scheduling(); plat_prepare_cpus(max_cpus); #ifndef CONFIG_HOTPLUG_CPU cpu_present_map = cpu_possible_map; Index: linux-rt.q/arch/sparc/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/sparc/kernel/smp.c +++ linux-rt.q/arch/sparc/kernel/smp.c @@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id cpu_data(id).prom_node = cpu_node; cpu_data(id).mid = cpu_get_hwmid(cpu_node); - /* this is required to tune the scheduler correctly */ - /* is it possible to have CPUs with different cache sizes? */ - if (id == boot_cpu_id) { - int cache_line,cache_nlines; - cache_line = 0x20; - cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line); - cache_nlines = 0x8000; - cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines); - max_cache_size = cache_line * cache_nlines; - } if (cpu_data(id).mid < 0) panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); } Index: linux-rt.q/arch/sparc64/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/sparc64/kernel/smp.c +++ linux-rt.q/arch/sparc64/kernel/smp.c @@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int m return -EINVAL; } -static void __init smp_tune_scheduling(void) -{ - unsigned int smallest = ~0U; - int i; - - for (i = 0; i < NR_CPUS; i++) { - unsigned int val = cpu_data(i).ecache_size; - - if (val && val < smallest) - smallest = val; - } - - /* Any value less than 256K is nonsense. 
*/ - if (smallest < (256U * 1024U)) - smallest = 256 * 1024; - - max_cache_size = smallest; - - if (smallest < 1U * 1024U * 1024U) - printk(KERN_INFO "Using max_cache_size of %uKB\n", - smallest / 1024U); - else - printk(KERN_INFO "Using max_cache_size of %uMB\n", - smallest / 1024U / 1024U); -} - /* Constrain the number of cpus to max_cpus. */ void __init smp_prepare_cpus(unsigned int max_cpus) { @@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned in } cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy; - smp_tune_scheduling(); } void __devinit smp_prepare_boot_cpu(void) Index: linux-rt.q/block/cfq-iosched.c =================================================================== --- linux-rt.q.orig/block/cfq-iosched.c +++ linux-rt.q/block/cfq-iosched.c @@ -1278,6 +1278,8 @@ static void cfq_init_prio_data(struct cf /* * no prio set, place us in the middle of the BE classes */ + if (tsk->policy == SCHED_IDLE) + goto set_class_idle; cfqq->ioprio = task_nice_ioprio(tsk); cfqq->ioprio_class = IOPRIO_CLASS_BE; break; @@ -1290,6 +1292,7 @@ static void cfq_init_prio_data(struct cf cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: + set_class_idle: cfqq->ioprio_class = IOPRIO_CLASS_IDLE; cfqq->ioprio = 7; cfq_clear_cfqq_idle_window(cfqq); Index: linux-rt.q/fs/proc/array.c =================================================================== --- linux-rt.q.orig/fs/proc/array.c +++ linux-rt.q/fs/proc/array.c @@ -165,7 +165,6 @@ static inline char * task_state(struct t rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -173,7 +172,6 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? 
rcu_dereference(p->parent)->pid : 0, @@ -312,6 +310,41 @@ int proc_pid_status(struct task_struct * return buffer - orig; } +static clock_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + return utime; +} + +static clock_t task_stime(struct task_struct *p) +{ + clock_t stime = cputime_to_clock_t(p->stime); + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p); + + return stime; +} + + static int do_task_stat(struct task_struct *task, char * buffer, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; @@ -326,7 +359,8 @@ static int do_task_stat(struct task_stru unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; - cputime_t cutime, cstime, utime, stime; + cputime_t cutime, cstime; + clock_t utime, stime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -344,7 +378,8 @@ static int do_task_stat(struct task_stru sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = cputime_zero; + cutime = cstime = cputime_zero; + utime = stime = 0; rcu_read_lock(); if (lock_task_sighand(task, &flags)) { @@ -370,15 +405,15 @@ static int do_task_stat(struct task_stru do { min_flt += t->min_flt; maj_flt += t->maj_flt; - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); + utime += task_utime(t); + stime += task_stime(t); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; - utime = cputime_add(utime, sig->utime); - stime = 
cputime_add(stime, sig->stime); + utime += cputime_to_clock_t(sig->utime); + stime += cputime_to_clock_t(sig->stime); } sid = signal_session(sig); @@ -394,8 +429,8 @@ static int do_task_stat(struct task_stru if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - utime = task->utime; - stime = task->stime; + utime = task_utime(task); + stime = task_stime(task); } /* scale priority and nice values from timeslices to -20..20 */ @@ -426,8 +461,8 @@ static int do_task_stat(struct task_stru cmin_flt, maj_flt, cmaj_flt, - cputime_to_clock_t(utime), - cputime_to_clock_t(stime), + utime, + stime, cputime_to_clock_t(cutime), cputime_to_clock_t(cstime), priority, Index: linux-rt.q/fs/proc/base.c =================================================================== --- linux-rt.q.orig/fs/proc/base.c +++ linux-rt.q/fs/proc/base.c @@ -296,7 +296,7 @@ static int proc_pid_wchan(struct task_st */ static int proc_pid_schedstat(struct task_struct *task, char *buffer) { - return sprintf(buffer, "%lu %lu %lu\n", + return sprintf(buffer, "%llu %llu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, task->sched_info.pcnt); @@ -929,6 +929,69 @@ static const struct file_operations proc }; #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * Print out various scheduling related per-task fields: + */ +static int sched_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + WARN_ON(!inode); + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + + WARN_ON(!inode); + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_set_task(p); + + put_task_struct(p); + + return count; +} + +static int sched_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = 
single_open(filp, sched_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1963,6 +2026,9 @@ static const struct pid_entry tgid_base_ INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, pid_sched), +#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tgid_stat), INF("statm", S_IRUGO, pid_statm), @@ -2247,6 +2313,9 @@ static const struct pid_entry tid_base_s INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, pid_sched), +#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tid_stat), INF("statm", S_IRUGO, pid_statm), Index: linux-rt.q/include/asm-generic/bitops/sched.h =================================================================== --- linux-rt.q.orig/include/asm-generic/bitops/sched.h +++ linux-rt.q/include/asm-generic/bitops/sched.h @@ -6,28 +6,23 @@ /* * Every architecture must define this function. It's the fastest - * way of searching a 140-bit bitmap where the first 100 bits are - * unlikely to be set. It's guaranteed that at least one of the 140 - * bits is cleared. + * way of searching a 100-bit bitmap. It's guaranteed that at least + * one of the 100 bits is cleared. 
*/ static inline int sched_find_first_bit(const unsigned long *b) { #if BITS_PER_LONG == 64 - if (unlikely(b[0])) + if (b[0]) return __ffs(b[0]); - if (likely(b[1])) - return __ffs(b[1]) + 64; - return __ffs(b[2]) + 128; + return __ffs(b[1]) + 64; #elif BITS_PER_LONG == 32 - if (unlikely(b[0])) + if (b[0]) return __ffs(b[0]); - if (unlikely(b[1])) + if (b[1]) return __ffs(b[1]) + 32; - if (unlikely(b[2])) + if (b[2]) return __ffs(b[2]) + 64; - if (b[3]) - return __ffs(b[3]) + 96; - return __ffs(b[4]) + 128; + return __ffs(b[3]) + 96; #else #error BITS_PER_LONG not defined #endif Index: linux-rt.q/include/linux/hardirq.h =================================================================== --- linux-rt.q.orig/include/linux/hardirq.h +++ linux-rt.q/include/linux/hardirq.h @@ -79,6 +79,19 @@ #endif #ifdef CONFIG_PREEMPT +# define PREEMPT_CHECK_OFFSET 1 +#else +# define PREEMPT_CHECK_OFFSET 0 +#endif + +/* + * Check whether we were atomic before we did preempt_disable(): + * (used by the scheduler) + */ +#define in_atomic_preempt_off() \ + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + +#ifdef CONFIG_PREEMPT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -2,7 +2,6 @@ #define _LINUX_SCHED_H #include /* For AT_VECTOR_SIZE */ - /* * cloning flags: */ @@ -34,9 +33,13 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 +#define SCHED_ISO 4 +#define SCHED_IDLE 5 #ifdef __KERNEL__ +#include /* For run_node */ + struct sched_param { int sched_priority; }; @@ -130,6 +133,26 @@ extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); extern unsigned long weighted_cpuload(const int cpu); +struct seq_file; +struct cfs_rq; +#ifdef CONFIG_SCHED_DEBUG +extern void 
proc_sched_show_task(struct task_struct *p, struct seq_file *m); +extern void proc_sched_set_task(struct task_struct *p); +extern void +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now); +#else +static inline void +proc_sched_show_task(struct task_struct *p, struct seq_file *m) +{ +} +static inline void proc_sched_set_task(struct task_struct *p) +{ +} +static inline void +print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) +{ +} +#endif /* * Task state bitmask. NOTE! These bits are also @@ -193,6 +216,7 @@ struct task_struct; extern void sched_init(void); extern void sched_init_smp(void); extern void init_idle(struct task_struct *idle, int cpu); +extern void init_idle_bootup_task(struct task_struct *idle); extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) @@ -479,7 +503,7 @@ struct signal_struct { * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ - unsigned long long sched_time; + unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, @@ -521,31 +545,6 @@ struct signal_struct { #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ - -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 
- */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO - -#define MAX_PRIO (MAX_RT_PRIO + 40) - -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -#define rt_task(p) rt_prio((p)->prio) -#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) -#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) - /* * Some day this will be a full-fledged user tracking system.. */ @@ -583,13 +582,13 @@ struct reclaim_state; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ - unsigned long cpu_time, /* time spent on the cpu */ - run_delay, /* time spent waiting on a runqueue */ - pcnt; /* # of timeslices run on this cpu */ + unsigned long pcnt; /* # of times run on this cpu */ + unsigned long long cpu_time, /* time spent on the cpu */ + run_delay; /* time spent waiting on a runqueue */ /* timestamps */ - unsigned long last_arrival, /* when we last ran on a cpu */ - last_queued; /* when we were last queued to run */ + unsigned long long last_arrival,/* when we last ran on a cpu */ + last_queued; /* when we were last queued to run */ }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -639,18 +638,24 @@ static inline int sched_info_on(void) #endif } -enum idle_type -{ - SCHED_IDLE, - NOT_IDLE, - NEWLY_IDLE, - MAX_IDLE_TYPES +enum cpu_idle_type { + CPU_IDLE, + CPU_NOT_IDLE, + CPU_NEWLY_IDLE, + CPU_MAX_IDLE_TYPES }; /* * sched-domains (multiprocessor balancing) declarations: */ -#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ + +/* + * Increase resolution of nice-level calculations: + */ +#define SCHED_LOAD_SHIFT 10 +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + +#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 5) #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. 
*/ @@ -719,14 +724,14 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_cnt[MAX_IDLE_TYPES]; - unsigned long lb_failed[MAX_IDLE_TYPES]; - unsigned long lb_balanced[MAX_IDLE_TYPES]; - unsigned long lb_imbalance[MAX_IDLE_TYPES]; - unsigned long lb_gained[MAX_IDLE_TYPES]; - unsigned long lb_hot_gained[MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[MAX_IDLE_TYPES]; + unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; + unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; + unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; + unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; + unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; + unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; + unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; /* Active load balancing */ unsigned long alb_cnt; @@ -753,12 +758,6 @@ struct sched_domain { extern int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2); -/* - * Maximum cache size the migration-costs auto-tuning code will - * search from: - */ -extern unsigned int max_cache_size; - #endif /* CONFIG_SMP */ @@ -809,14 +808,86 @@ struct mempolicy; struct pipe_inode_info; struct uts_namespace; -enum sleep_type { - SLEEP_NORMAL, - SLEEP_NONINTERACTIVE, - SLEEP_INTERACTIVE, - SLEEP_INTERRUPTED, +struct rq; +struct sched_domain; + +struct sched_class { + struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, + int wakeup, u64 now); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, + int sleep, u64 now); + void (*yield_task) (struct rq *rq, struct task_struct *p); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); + + struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); + void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); + + int (*load_balance) (struct rq *this_rq, int this_cpu, + struct rq *busiest, + unsigned long max_nr_move, 
unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *total_load_moved); + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p); + void (*task_new) (struct rq *rq, struct task_struct *p); }; -struct prio_array; +struct load_weight { + unsigned long weight, inv_weight; +}; + +/* + * CFS stats for a schedulable entity (task, task-group etc) + * + * Current field usage histogram: + * + * 4 se->block_start + * 4 se->run_node + * 4 se->sleep_start + * 4 se->sleep_start_fair + * 6 se->load.weight + * 7 se->delta_fair + * 15 se->wait_runtime + */ +struct sched_entity { + long wait_runtime; + unsigned long delta_fair_run; + unsigned long delta_fair_sleep; + unsigned long delta_exec; + s64 fair_key; + struct load_weight load; /* for load-balancing */ + struct rb_node run_node; + unsigned int on_rq; + + u64 wait_start_fair; + u64 wait_start; + u64 exec_start; + u64 sleep_start; + u64 sleep_start_fair; + u64 block_start; + u64 sleep_max; + u64 block_max; + u64 exec_max; + u64 wait_max; + u64 last_ran; + + u64 sum_exec_runtime; + s64 sum_wait_runtime; + s64 sum_sleep_runtime; + unsigned long wait_runtime_overruns; + unsigned long wait_runtime_underruns; +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *parent; + /* rq on which this entity is (to be) queued: */ + struct cfs_rq *cfs_rq; + /* rq "owned" by this entity/group: */ + struct cfs_rq *my_q; +#endif +}; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -832,23 +903,20 @@ struct task_struct { int oncpu; #endif #endif - int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; struct list_head run_list; - struct prio_array *array; + struct sched_entity se; unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif - unsigned long sleep_avg; - unsigned long long timestamp, last_ran; - unsigned long long 
sched_time; /* sched_clock time spent running */ - enum sleep_type sleep_type; unsigned int policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + unsigned int time_slice; + struct sched_class *sched_class; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -1078,6 +1146,37 @@ struct task_struct { #endif }; +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) +#define DEFAULT_PRIO (MAX_RT_PRIO + 20) + +static inline int rt_prio(int prio) +{ + if (unlikely(prio < MAX_RT_PRIO)) + return 1; + return 0; +} + +static inline int rt_task(struct task_struct *p) +{ + return rt_prio(p->prio); +} + static inline pid_t process_group(struct task_struct *tsk) { return tsk->signal->pgrp; @@ -1222,8 +1321,9 @@ static inline int set_cpus_allowed(struc #endif extern unsigned long long sched_clock(void); +extern void sched_clock_unstable_event(void); extern unsigned long long -current_sched_time(const struct task_struct *current_task); +task_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@ -1239,6 +1339,15 @@ static inline void idle_task_exit(void) #endif extern void sched_idle_next(void); +extern char * sched_print_task_state(struct task_struct *p, char *buffer); + +extern unsigned int sysctl_sched_granularity; +extern unsigned int 
sysctl_sched_wakeup_granularity; +extern unsigned int sysctl_sched_batch_wakeup_granularity; +extern unsigned int sysctl_sched_stat_granularity; +extern unsigned int sysctl_sched_runtime_limit; +extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_features; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); @@ -1317,8 +1426,8 @@ extern void FASTCALL(wake_up_new_task(st #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags)); -extern void FASTCALL(sched_exit(struct task_struct * p)); +extern void sched_fork(struct task_struct * p, int clone_flags); +extern void sched_dead(struct task_struct * p); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); @@ -1406,7 +1515,7 @@ extern struct mm_struct * mm_alloc(void) extern void FASTCALL(__mmdrop(struct mm_struct *)); static inline void mmdrop(struct mm_struct * mm) { - if (atomic_dec_and_test(&mm->mm_count)) + if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } @@ -1638,10 +1747,7 @@ static inline unsigned int task_cpu(cons return task_thread_info(p)->cpu; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - task_thread_info(p)->cpu = cpu; -} +extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else @@ -1650,6 +1756,10 @@ static inline unsigned int task_cpu(cons return 0; } +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} + static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } Index: linux-rt.q/include/linux/topology.h =================================================================== --- linux-rt.q.orig/include/linux/topology.h +++ linux-rt.q/include/linux/topology.h @@ -50,10 +50,10 @@ for_each_online_node(node) \ if (nr_cpus_node(node)) -#ifndef node_distance /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define 
REMOTE_DISTANCE 20 +#ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE @@ -98,7 +98,7 @@ .cache_nice_tries = 0, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 1, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -128,14 +128,15 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ + .idle_idx = 0, \ + .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ @@ -158,14 +159,15 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ + .idle_idx = 0, \ + .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ | BALANCE_FOR_PKG_POWER,\ .last_balance = jiffies, \ .balance_interval = 1, \ Index: linux-rt.q/init/main.c =================================================================== --- linux-rt.q.orig/init/main.c +++ linux-rt.q/init/main.c @@ -436,15 +436,16 @@ static void noinline __init_refok rest_i /* * The boot idle thread must execute schedule() - * at least one to get things moving: + * at least once to get things moving: */ + init_idle_bootup_task(current); preempt_enable_no_resched(); schedule(); preempt_disable(); /* Call into cpu_idle with preempt disabled */ cpu_idle(); -} +} /* Check for early params. 
*/ static int __init do_early_param(char *param, char *val) Index: linux-rt.q/kernel/delayacct.c =================================================================== --- linux-rt.q.orig/kernel/delayacct.c +++ linux-rt.q/kernel/delayacct.c @@ -99,9 +99,10 @@ void __delayacct_blkio_end(void) int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { s64 tmp; - struct timespec ts; - unsigned long t1,t2,t3; + unsigned long t1; + unsigned long long t2,t3; unsigned long flags; + struct timespec ts; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -124,11 +125,10 @@ int __delayacct_add_tsk(struct taskstats d->cpu_count += t1; - jiffies_to_timespec(t2, &ts); - tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); + tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; - tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; + tmp = (s64)d->cpu_run_virtual_total + t3; d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; Index: linux-rt.q/kernel/exit.c =================================================================== --- linux-rt.q.orig/kernel/exit.c +++ linux-rt.q/kernel/exit.c @@ -122,9 +122,9 @@ static void __exit_signal(struct task_st sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; - sig->sched_time += tsk->sched_time; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. 
*/ } @@ -182,7 +182,6 @@ repeat: zap_leader = (leader->exit_signal == -1); } - sched_exit(p); write_unlock_irq(&tasklist_lock); proc_flush_task(p); release_thread(p); @@ -291,7 +290,7 @@ static void reparent_to_kthreadd(void) /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if (!has_rt_policy(current) && (task_nice(current) < 0)) + if (task_nice(current) < 0) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ Index: linux-rt.q/kernel/fork.c =================================================================== --- linux-rt.q.orig/kernel/fork.c +++ linux-rt.q/kernel/fork.c @@ -117,6 +117,7 @@ void __put_task_struct(struct task_struc WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + sched_dead(tsk); security_task_free(tsk); free_uid(tsk->user); @@ -877,7 +878,7 @@ static inline int copy_signal(unsigned l sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->sched_time = 0; + sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); @@ -1040,7 +1041,7 @@ static struct task_struct *copy_process( p->utime = cputime_zero; p->stime = cputime_zero; - p->sched_time = 0; + #ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ p->wchar = 0; /* I/O counter: bytes written */ Index: linux-rt.q/kernel/posix-cpu-timers.c =================================================================== --- linux-rt.q.orig/kernel/posix-cpu-timers.c +++ linux-rt.q/kernel/posix-cpu-timers.c @@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc } static inline unsigned long long sched_ns(struct task_struct *p) { - return (p == current) ? 
current_sched_time(p) : p->sched_time; + return task_sched_runtime(p); } int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) @@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked } while (t != p); break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sched_time; + cpu->sched = p->signal->sum_sched_runtime; /* Add in each other live thread. */ while ((t = next_thread(t)) != p) { - cpu->sched += t->sched_time; + cpu->sched += t->se.sum_exec_runtime; } cpu->sched += sched_ns(p); break; @@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer */ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, - unsigned long long sched_time) + unsigned long long sum_exec_runtime) { struct cpu_timer_list *timer, *next; cputime_t ptime = cputime_add(utime, stime); @@ -451,10 +451,10 @@ static void cleanup_timers(struct list_h ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.sched < sched_time) { + if (timer->expires.sched < sum_exec_runtime) { timer->expires.sched = 0; } else { - timer->expires.sched -= sched_time; + timer->expires.sched -= sum_exec_runtime; } } } @@ -467,7 +467,7 @@ static void cleanup_timers(struct list_h void posix_cpu_timers_exit(struct task_struct *tsk) { cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->sched_time); + tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); } void posix_cpu_timers_exit_group(struct task_struct *tsk) @@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct cleanup_timers(tsk->signal->cpu_timers, cputime_add(tsk->utime, tsk->signal->utime), cputime_add(tsk->stime, tsk->signal->stime), - tsk->sched_time + tsk->signal->sched_time); + tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); } @@ -536,7 +536,7 @@ static void process_timer_rebalance(stru nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { - ns = t->sched_time + nsleft; + ns = 
t->se.sum_exec_runtime + nsleft; if (t->it_sched_expires == 0 || t->it_sched_expires > ns) { t->it_sched_expires = ns; @@ -1004,7 +1004,7 @@ static void check_thread_timers(struct t struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->sched_time < t->expires.sched) { + if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { tsk->it_sched_expires = t->expires.sched; break; } @@ -1024,7 +1024,7 @@ static void check_process_timers(struct int maxfire; struct signal_struct *const sig = tsk->signal; cputime_t utime, stime, ptime, virt_expires, prof_expires; - unsigned long long sched_time, sched_expires; + unsigned long long sum_sched_runtime, sched_expires; struct task_struct *t; struct list_head *timers = sig->cpu_timers; @@ -1044,12 +1044,12 @@ static void check_process_timers(struct */ utime = sig->utime; stime = sig->stime; - sched_time = sig->sched_time; + sum_sched_runtime = sig->sum_sched_runtime; t = tsk; do { utime = cputime_add(utime, t->utime); stime = cputime_add(stime, t->stime); - sched_time += t->sched_time; + sum_sched_runtime += t->se.sum_exec_runtime; t = next_thread(t); } while (t != tsk); ptime = cputime_add(utime, stime); @@ -1090,7 +1090,7 @@ static void check_process_timers(struct struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sched_time < t->expires.sched) { + if (!--maxfire || sum_sched_runtime < t->expires.sched) { sched_expires = t->expires.sched; break; } @@ -1182,7 +1182,7 @@ static void check_process_timers(struct virt_left = cputime_sub(virt_expires, utime); virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { - sched_left = sched_expires - sched_time; + sched_left = sched_expires - sum_sched_runtime; do_div(sched_left, nthreads); sched_left = max_t(unsigned long long, sched_left, 1); } else { @@ -1208,7 +1208,7 @@ static void check_process_timers(struct t->it_virt_expires = ticks; } - sched = 
t->sched_time + sched_left; + sched = t->se.sum_exec_runtime + sched_left; if (sched_expires && (t->it_sched_expires == 0 || t->it_sched_expires > sched)) { t->it_sched_expires = sched; @@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_st if (UNEXPIRED(prof) && UNEXPIRED(virt) && (tsk->it_sched_expires == 0 || - tsk->sched_time < tsk->it_sched_expires)) + tsk->se.sum_exec_runtime < tsk->it_sched_expires)) return; #undef UNEXPIRED Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -16,6 +16,11 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith */ #include @@ -100,131 +105,70 @@ unsigned long long __attribute__((weak)) */ #define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 
- * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -static unsigned int static_prio_timeslice(int static_prio) +static inline int rt_policy(int policy) { - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); + if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + return 1; + return 0; } -#ifdef CONFIG_SMP -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. - */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +static inline int task_has_rt_policy(struct task_struct *p) { - return reciprocal_divide(load, sg->reciprocal_cpu_power); + return rt_policy(p->policy); } /* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value + * This is the priority-queue data structure of the RT scheduling class: */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif +struct prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 
5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - */ +struct load_stat { + struct load_weight load; + u64 load_update_start, load_update_last; + unsigned long delta_fair, delta_exec, delta_stat; +}; -static inline unsigned int task_timeslice(struct task_struct *p) -{ - return static_prio_timeslice(p->static_prio); -} +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running; -/* - * These are the runqueue data structures: - */ + s64 fair_clock; + u64 exec_clock; + s64 wait_runtime; + u64 sleeper_bonus; + unsigned long wait_runtime_overruns, wait_runtime_underruns; + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + struct rb_node *rb_load_balance_curr; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr; + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ +#endif +}; -struct prio_array { - unsigned int nr_active; - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_PRIO]; +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct prio_array active; + int rt_load_balance_idx; + struct list_head *rt_load_balance_head, *rt_load_balance_curr; }; /* @@ -242,15 +186,21 @@ struct rq { * remote CPUs use both these fields when doing load calculation. 
*/ unsigned long nr_running; - unsigned long raw_weighted_load; -#ifdef CONFIG_SMP - unsigned long cpu_load[3]; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif + struct load_stat ls; /* capture load from *all* tasks on this cpu */ + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; +#ifdef CONFIG_FAIR_GROUP_SCHED + struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ #endif - unsigned long long nr_switches; + struct rt_rq rt; /* * This is part of a global counter where only the total sum @@ -260,14 +210,18 @@ struct rq { */ unsigned long nr_uninterruptible; - unsigned long expired_timestamp; - /* Cached timestamp set by update_cpu_clock() */ - unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - struct prio_array *active, *expired, arrays[2]; - int best_expired_prio; + + u64 clock, prev_clock_raw; + s64 clock_max_delta; + + unsigned int clock_warps, clock_overflows; + unsigned int clock_unstable_events; + + struct sched_class *load_balance_class; + atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -307,6 +261,29 @@ struct rq { static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; static DEFINE_MUTEX(sched_hotcpu_mutex); +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +{ + rq->curr->sched_class->check_preempt_curr(rq, p); +} + +#define SCALE_PRIO(x, prio) \ + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) + +/* + * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 
5ms] + */ +static unsigned int static_prio_timeslice(int static_prio) +{ + if (static_prio == NICE_TO_PRIO(19)) + return 1; + + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, static_prio); +} + static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -316,6 +293,72 @@ static inline int cpu_of(struct rq *rq) #endif } +#ifdef CONFIG_SMP +/* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} +#endif +/* + * Per-runqueue clock, as finegrained as the platform can give us: + */ +static unsigned long long __rq_clock(struct rq *rq) +{ + u64 prev_raw = rq->prev_clock_raw; + u64 now = sched_clock(); + s64 delta = now - prev_raw; + u64 clock = rq->clock; + + /* + * Protect against sched_clock() occasionally going backwards: + */ + if (unlikely(delta < 0)) { + clock++; + rq->clock_warps++; + } else { + /* + * Catch too large forward jumps too: + */ + if (unlikely(delta > 2*TICK_NSEC)) { + clock++; + rq->clock_overflows++; + } else { + if (unlikely(delta > rq->clock_max_delta)) + rq->clock_max_delta = delta; + clock += delta; + } + } + + rq->prev_clock_raw = now; + rq->clock = clock; + + return clock; +} + +static inline unsigned long long rq_clock(struct rq *rq) +{ + int this_cpu = smp_processor_id(); + + if (this_cpu == cpu_of(rq)) + return __rq_clock(rq); + + return rq->clock; +} + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 
* See detach_destroy_domains: synchronize_sched for details. @@ -331,6 +374,18 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#ifdef CONFIG_FAIR_GROUP_SCHED +/* Change a task's ->cfs_rq if it moves across CPUs */ +static inline void set_task_cfs_rq(struct task_struct *p) +{ + p->se.cfs_rq = &task_rq(p)->cfs; +} +#else +static inline void set_task_cfs_rq(struct task_struct *p) +{ +} +#endif + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -460,134 +515,6 @@ static inline void task_rq_unlock(struct spin_unlock_irqrestore(&rq->lock, *flags); } -#ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 14 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcnt = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, rq->ttwu_local, - rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - preempt_disable(); - for_each_domain(cpu, sd) { - enum idle_type itype; - char mask_str[NR_CPUS]; - - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " - "%lu", - sd->lb_cnt[itype], - sd->lb_balanced[itype], - 
sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" - " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - preempt_enable(); -#endif - } - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) -{ - if (rq) { - rq->rq_sched_info.run_delay += delta_jiffies; - rq->rq_sched_info.pcnt++; - } -} - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) -{ - if (rq) - rq->rq_sched_info.cpu_time += delta_jiffies; -} -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -#else /* !CONFIG_SCHEDSTATS */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) -{} -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) -{} -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do 
{ } while (0) -#endif - /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -603,177 +530,176 @@ static inline struct rq *this_rq_lock(vo return rq; } -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. + * CPU frequency is/was unstable - start new by setting prev_clock_raw: */ -static inline void sched_info_dequeued(struct task_struct *t) +void sched_clock_unstable_event(void) { - t->sched_info.last_queued = 0; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(current, &flags); + rq->prev_clock_raw = sched_clock(); + rq->clock_unstable_events++; + task_rq_unlock(rq, &flags); } +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + /* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. 
*/ -static void sched_info_arrive(struct task_struct *t) +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +static void resched_task(struct task_struct *p) { - unsigned long now = jiffies, delta_jiffies = 0; + int cpu; + + assert_spin_locked(&task_rq(p)->lock); + + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; - if (t->sched_info.last_queued) - delta_jiffies = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += delta_jiffies; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} - rq_sched_info_arrive(task_rq(t), delta_jiffies); +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); } -/* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. 
- */ -static inline void sched_info_queued(struct task_struct *t) +#else +static inline void resched_task(struct task_struct *p) { - if (unlikely(sched_info_on())) - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); } +#endif -/* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. - */ -static inline void sched_info_depart(struct task_struct *t) +static u64 div64_likely32(u64 divident, unsigned long divisor) { - unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; +#if BITS_PER_LONG == 32 + if (likely(divident <= 0xffffffffULL)) + return (u32)divident / divisor; + do_div(divident, divisor); - t->sched_info.cpu_time += delta_jiffies; - rq_sched_info_depart(task_rq(t), delta_jiffies); + return divident; +#else + return divident / divisor; +#endif } -/* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. - */ -static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 + +static inline unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) { - struct rq *rq = task_rq(prev); + u64 tmp; + + if (unlikely(!lw->inv_weight)) + lw->inv_weight = WMULT_CONST / lw->weight; + tmp = (u64)delta_exec * weight; /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. 
+ * Check whether we'd overflow the 64-bit multiplication: */ - if (prev != rq->idle) - sched_info_depart(prev); + if (unlikely(tmp > WMULT_CONST)) { + tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) + >> (WMULT_SHIFT/2); + } else { + tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; + } - if (next != rq->idle) - sched_info_arrive(next); + return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); } -static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) -{ - if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); -} -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ -/* - * Adding/removing a task to/from a priority array: - */ -static void dequeue_task(struct task_struct *p, struct prio_array *array) +static inline unsigned long +calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) { - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); } -static void enqueue_task(struct task_struct *p, struct prio_array *array) +static void update_load_add(struct load_weight *lw, unsigned long inc) { - sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + lw->weight += inc; + lw->inv_weight = 0; } -/* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. 
- */ -static void requeue_task(struct task_struct *p, struct prio_array *array) +static void update_load_sub(struct load_weight *lw, unsigned long dec) { - list_move_tail(&p->run_list, array->queue + p->prio); + lw->weight -= dec; + lw->inv_weight = 0; } -static inline void -enqueue_task_head(struct task_struct *p, struct prio_array *array) +static void __update_curr_load(struct rq *rq, struct load_stat *ls) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + if (rq->curr != rq->idle && ls->load.weight) { + ls->delta_exec += ls->delta_stat; + ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); + ls->delta_stat = 0; + } } /* - * __normal_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. + * Update delta_exec, delta_fair fields for rq. * - * We use 25% of the full 0...39 priority range so that: + * delta_fair clock advances at a rate inversely proportional to + * total load (rq->ls.load.weight) on the runqueue, while + * delta_exec advances at the same rate as wall-clock (provided + * cpu is not idle). * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * delta_exec / delta_fair is a measure of the (smoothened) load on this + * runqueue over any given interval. This (smoothened) load is used + * during load balance. * - * Both properties are important to certain workloads. + * This function is called /before/ updating rq->ls.load + * and when switching tasks. 
*/ - -static inline int __normal_prio(struct task_struct *p) +static void update_curr_load(struct rq *rq, u64 now) { - int bonus, prio; + struct load_stat *ls = &rq->ls; + u64 start; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; + start = ls->load_update_start; + ls->load_update_start = now; + ls->delta_stat += now - start; + /* + * Stagger updates to ls->delta_fair. Very frequent updates + * can be expensive. + */ + if (ls->delta_stat >= sysctl_sched_stat_granularity) + __update_curr_load(rq, ls); } /* @@ -791,53 +717,141 @@ static inline int __normal_prio(struct t * this code will need modification */ #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define LOAD_WEIGHT(lp) \ +#define load_weight(lp) \ (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - LOAD_WEIGHT(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) +#define PRIO_TO_load_weight(prio) \ + load_weight(static_prio_timeslice(prio)) +#define RTPRIO_TO_load_weight(rp) \ + (PRIO_TO_load_weight(MAX_RT_PRIO) + load_weight(rp)) + +#define WEIGHT_IDLEPRIO 2 +#define WMULT_IDLEPRIO (1 << 31) + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. 
+ */ +static const int prio_to_weight[40] = { +/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, +/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, +/* 0 */ NICE_0_LOAD /* 1024 */, +/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, +/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, +}; -static void set_load_weight(struct task_struct *p) -{ - if (has_rt_policy(p)) { -#ifdef CONFIG_SMP - if (p == task_rq(p)->migration_thread) - /* - * The migration thread does the actual balancing. - * Giving its load any weight will skew balancing - * adversely. - */ - p->load_weight = 0; - else -#endif - p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); - } else - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); -} +static const u32 prio_to_wmult[40] = { + 48356, 60446, 75558, 94446, 118058, 147573, + 184467, 230589, 288233, 360285, 450347, + 562979, 703746, 879575, 1099582, 1374389, + 717986, 2147483, 2684354, 3355443, 4194304, + 244160, 6557201, 8196502, 10250518, 12782640, + 16025997, 19976592, 24970740, 31350126, 39045157, + 49367440, 61356675, 76695844, 95443717, 119304647, + 148102320, 186737708, 238609294, 286331153, +}; static inline void -inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) +inc_load(struct rq *rq, const struct task_struct *p, u64 now) { - rq->raw_weighted_load += p->load_weight; + update_curr_load(rq, now); + update_load_add(&rq->ls.load, p->se.load.weight); } static inline void -dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) +dec_load(struct rq *rq, const struct task_struct *p, u64 now) { - rq->raw_weighted_load -= p->load_weight; + update_curr_load(rq, now); + update_load_sub(&rq->ls.load, p->se.load.weight); } -static inline void inc_nr_running(struct task_struct *p, struct rq *rq) +static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) { rq->nr_running++; - inc_raw_weighted_load(rq, p); + inc_load(rq, p, now); } -static inline void 
dec_nr_running(struct task_struct *p, struct rq *rq) +static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) { rq->nr_running--; - dec_raw_weighted_load(rq, p); + dec_load(rq, p, now); +} + +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); + +struct rq_iterator { + void *arg; + struct task_struct *(*start)(void *); + struct task_struct *(*next)(void *); +}; + +static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *load_moved, + int this_best_prio, int best_prio, int best_prio_seen, + struct rq_iterator *iterator); + +#include "sched_stats.h" +#include "sched_rt.c" +#include "sched_fair.c" +#include "sched_idletask.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + +#define sched_class_highest (&rt_sched_class) + +static void set_load_weight(struct task_struct *p) +{ + task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; + p->se.wait_runtime = 0; + + if (task_has_rt_policy(p)) { + p->se.load.weight = prio_to_weight[0] * 2; + p->se.load.inv_weight = prio_to_wmult[0] >> 1; + return; + } + + /* + * SCHED_IDLE tasks get minimal weight: + */ + if (p->policy == SCHED_IDLE) { + p->se.load.weight = WEIGHT_IDLEPRIO; + p->se.load.inv_weight = WMULT_IDLEPRIO; + return; + } + + p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; + p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; +} + +static void +enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +{ + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, wakeup, now); + p->se.on_rq = 1; +} + +static void +dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) +{ + p->sched_class->dequeue_task(rq, p, sleep, now); + p->se.on_rq = 0; +} + +/* + * __normal_prio - return the priority that is based on the static prio + */ 
+static inline int __normal_prio(struct task_struct *p) +{ + return p->static_prio; } /* @@ -851,7 +865,7 @@ static inline int normal_prio(struct tas { int prio; - if (has_rt_policy(p)) + if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -879,221 +893,46 @@ static int effective_prio(struct task_st } /* - * __activate_task - move a task to the runqueue. - */ -static void __activate_task(struct task_struct *p, struct rq *rq) -{ - struct prio_array *target = rq->active; - - if (batch_task(p)) - target = rq->expired; - enqueue_task(p, target); - inc_nr_running(p, rq); -} - -/* - * __activate_idle_task - move idle task to the _front_ of runqueue. - */ -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) -{ - enqueue_task_head(p, rq->active); - inc_nr_running(p, rq); -} - -/* - * Recalculate p->normal_prio and p->prio after having slept, - * updating the sleep-average too: + * activate_task - move a task to the runqueue. */ -static int recalc_task_prio(struct task_struct *p, unsigned long long now) +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long sleep_time = now - p->timestamp; - - if (batch_task(p)) - sleep_time = 0; - - if (likely(sleep_time > 0)) { - /* - * This ceiling is set to the lowest priority that would allow - * a task to be reinserted into the active array on timeslice - * completion. - */ - unsigned long ceiling = INTERACTIVE_SLEEP(p); - - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { - /* - * Prevents user tasks from achieving best priority - * with one single large enough sleep. - */ - p->sleep_avg = ceiling; - /* - * Using INTERACTIVE_SLEEP() as a ceiling places a - * nice(0) task 1ms sleep away from promotion, and - * gives it 700ms to round-robin with no chance of - * being demoted. 
This is more than generous, so - * mark this sleep as non-interactive to prevent the - * on-runqueue bonus logic from intervening should - * this task not receive cpu immediately. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else { - /* - * Tasks waking from uninterruptible sleep are - * limited in their sleep_avg rise as they - * are likely to be waiting on I/O - */ - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { - if (p->sleep_avg >= ceiling) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - ceiling) { - p->sleep_avg = ceiling; - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; + u64 now = rq_clock(rq); - } - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; - return effective_prio(p); + enqueue_task(rq, p, wakeup, now); + inc_nr_running(p, rq, now); } /* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) + * activate_idle_task - move idle task to the _front_ of runqueue. 
*/ -static void activate_task(struct task_struct *p, struct rq *rq, int local) +static inline void activate_idle_task(struct task_struct *p, struct rq *rq) { - unsigned long long now; - - if (rt_task(p)) - goto out; - - now = sched_clock(); -#ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ - struct rq *this_rq = this_rq(); - now = (now - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; - } -#endif - - /* - * Sleep time is in units of nanosecs, so shift by 20 to get a - * milliseconds-range estimation of the amount of time that the task - * spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - if (p->state == TASK_UNINTERRUPTIBLE) - profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (now - p->timestamp) >> 20); - } + u64 now = rq_clock(rq); - p->prio = recalc_task_prio(p, now); + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (p->sleep_type == SLEEP_NORMAL) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->sleep_type = SLEEP_INTERRUPTED; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->sleep_type = SLEEP_INTERACTIVE; - } - } - p->timestamp = now; -out: - __activate_task(p, rq); + enqueue_task(rq, p, 0, now); + inc_nr_running(p, rq, now); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, struct rq *rq) -{ - dec_nr_running(p, rq); - dequeue_task(p, p->array); - p->array = NULL; -} - -/* - * resched_task - mark a task 'to be rescheduled now'. 
- * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -static void resched_task(struct task_struct *p) +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { - int cpu; - - assert_spin_locked(&task_rq(p)->lock); - - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) - return; + u64 now = rq_clock(rq); - set_tsk_thread_flag(p, TIF_NEED_RESCHED); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); -} - -static void resched_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; - if (!spin_trylock_irqsave(&rq->lock, flags)) - return; - resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); -} -#else -static inline void resched_task(struct task_struct *p) -{ - assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); + dequeue_task(rq, p, sleep, now); + dec_nr_running(p, rq, now); } -#endif /** * task_curr - is this task currently executing on a CPU? 
@@ -1107,10 +946,42 @@ inline int task_curr(const struct task_s /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->raw_weighted_load; + return cpu_rq(cpu)->ls.load.weight; } #ifdef CONFIG_SMP + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + task_thread_info(p)->cpu = cpu; + set_task_cfs_rq(p); +} + +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) +{ + int old_cpu = task_cpu(p); + struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + u64 clock_offset, fair_clock_offset; + + clock_offset = old_rq->clock - new_rq->clock; + fair_clock_offset = old_rq->cfs.fair_clock - + new_rq->cfs.fair_clock; + if (p->se.wait_start) + p->se.wait_start -= clock_offset; + if (p->se.wait_start_fair) + p->se.wait_start_fair -= fair_clock_offset; + if (p->se.sleep_start) + p->se.sleep_start -= clock_offset; + if (p->se.block_start) + p->se.block_start -= clock_offset; + if (p->se.sleep_start_fair) + p->se.sleep_start_fair -= fair_clock_offset; + + task_thread_info(p)->cpu = new_cpu; + + set_task_cfs_rq(p); +} + struct migration_req { struct list_head list; @@ -1133,7 +1004,7 @@ migrate_task(struct task_struct *p, int * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. 
*/ - if (!p->array && !task_running(rq, p)) { + if (!p->se.on_rq && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -1158,9 +1029,8 @@ migrate_task(struct task_struct *p, int void wait_task_inactive(struct task_struct *p) { unsigned long flags; + int running, on_rq; struct rq *rq; - struct prio_array *array; - int running; repeat: /* @@ -1192,7 +1062,7 @@ repeat: */ rq = task_rq_lock(p, &flags); running = task_running(rq, p); - array = p->array; + on_rq = p->se.on_rq; task_rq_unlock(rq, &flags); /* @@ -1215,7 +1085,7 @@ repeat: * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(array)) { + if (unlikely(on_rq)) { yield(); goto repeat; } @@ -1261,11 +1131,12 @@ void kick_process(struct task_struct *p) static inline unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); if (type == 0) - return rq->raw_weighted_load; + return total; - return min(rq->cpu_load[type-1], rq->raw_weighted_load); + return min(rq->cpu_load[type-1], total); } /* @@ -1275,11 +1146,12 @@ static inline unsigned long source_load( static inline unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); if (type == 0) - return rq->raw_weighted_load; + return total; - return max(rq->cpu_load[type-1], rq->raw_weighted_load); + return max(rq->cpu_load[type-1], total); } /* @@ -1288,9 +1160,10 @@ static inline unsigned long target_load( static inline unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); unsigned long n = rq->nr_running; - return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; + return n ? total / n : SCHED_LOAD_SCALE; } /* @@ -1392,9 +1265,9 @@ static int sched_balance_self(int cpu, i struct sched_domain *tmp, *sd = NULL; for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. 
- */ + /* + * If power savings logic is enabled for a domain, stop there. + */ if (tmp->flags & SD_POWERSAVINGS_BALANCE) break; if (tmp->flags & flag) @@ -1521,7 +1394,7 @@ static int try_to_wake_up(struct task_st if (!(old_state & state)) goto out; - if (p->array) + if (p->se.on_rq) goto out_running; cpu = task_cpu(p); @@ -1576,11 +1449,11 @@ static int try_to_wake_up(struct task_st * of the current CPU: */ if (sync) - tl -= current->load_weight; + tl -= current->se.load.weight; if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->load_weight) <= imbalance*load) { + 100*(tl + p->se.load.weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1614,7 +1487,7 @@ out_set_cpu: old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (p->se.on_rq) goto out_running; this_cpu = smp_processor_id(); @@ -1623,25 +1496,7 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else - - /* - * Tasks that have marked their sleep as noninteractive get - * woken up with their sleep average not weighted in an - * interactive way. - */ - if (old_state & TASK_NONINTERACTIVE) - p->sleep_type = SLEEP_NONINTERACTIVE; - - - activate_task(p, rq, cpu == this_cpu); + activate_task(rq, p, 1); /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) @@ -1650,10 +1505,8 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) 
*/ - if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } + if (!sync || cpu != this_cpu) + check_preempt_curr(rq, p); success = 1; out_running: @@ -1676,19 +1529,36 @@ int fastcall wake_up_state(struct task_s return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. + * + * __sched_fork() is basic setup used by init_idle() too: */ -void fastcall sched_fork(struct task_struct *p, int clone_flags) +static void __sched_fork(struct task_struct *p) { - int cpu = get_cpu(); + p->se.wait_start_fair = 0; + p->se.wait_start = 0; + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.delta_exec = 0; + p->se.delta_fair_run = 0; + p->se.delta_fair_sleep = 0; + p->se.wait_runtime = 0; + p->se.sum_wait_runtime = 0; + p->se.sum_sleep_runtime = 0; + p->se.sleep_start = 0; + p->se.sleep_start_fair = 0; + p->se.block_start = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.wait_max = 0; + p->se.wait_runtime_overruns = 0; + p->se.wait_runtime_underruns = 0; -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); + INIT_LIST_HEAD(&p->run_list); + p->se.on_rq = 0; /* * We mark the process as running here, but have not actually @@ -1697,16 +1567,29 @@ void fastcall sched_fork(struct task_str * event cannot wake it up and insert it on the runqueue either. 
*/ p->state = TASK_RUNNING; +} + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p, int clone_flags) +{ + int cpu = get_cpu(); + + __sched_fork(p); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + __set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; - INIT_LIST_HEAD(&p->run_list); - p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (unlikely(sched_info_on())) + if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -1716,34 +1599,16 @@ void fastcall sched_fork(struct task_str /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ - local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); - } - local_irq_enable(); put_cpu(); } /* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +unsigned int __read_mostly sysctl_sched_child_runs_first = 1; + +/* * wake_up_new_task - wake up a newly created task for the first time. 
* * This function will do some initial scheduler statistics housekeeping @@ -1752,108 +1617,33 @@ void fastcall sched_fork(struct task_str */ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { - struct rq *rq, *this_rq; unsigned long flags; - int this_cpu, cpu; + struct rq *rq; + int this_cpu; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + this_cpu = smp_processor_id(); /* parent's CPU */ p->prio = effective_prio(p); - if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. - */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - p->normal_prio = current->normal_prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - inc_nr_running(p, rq); - } - set_need_resched(); - } else - /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; + if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || + task_cpu(p) != this_cpu || !current->se.on_rq) { + activate_task(rq, p, 0); } else { - this_rq = cpu_rq(this_cpu); - - /* - * Not the local CPU - must adjust timestamp. This should - * get optimised away in the !CONFIG_SMP case. 
- */ - p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; - __activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: + * Let the scheduling class do new task startup + * management (if any): */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); + p->sched_class->task_new(rq, p); } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); + check_preempt_curr(rq, p); + task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ -void fastcall sched_exit(struct task_struct *p) +void sched_dead(struct task_struct *p) { - unsigned long flags; - struct rq *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. 
- */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); + WARN_ON_ONCE(p->se.on_rq); } /** @@ -1911,13 +1701,13 @@ static inline void finish_task_switch(st prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); - if (mm) + if (likely(mm)) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this * task and put them back on the free list. - */ + */ kprobe_flush_task(prev); put_task_struct(prev); } @@ -1945,13 +1735,15 @@ asmlinkage void schedule_tail(struct tas * context_switch - switch to the new MM and the new * thread's register state. */ -static inline struct task_struct * +static inline void context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; + struct mm_struct *mm, *oldmm; + prepare_task_switch(rq, next); + mm = next->mm; + oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -1959,16 +1751,15 @@ context_switch(struct rq *rq, struct tas */ arch_enter_lazy_cpu_mode(); - if (!mm) { + if (unlikely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (!prev->mm) { + if (unlikely(!prev->mm)) { prev->active_mm = NULL; - WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } /* @@ -1984,7 +1775,13 @@ context_switch(struct rq *rq, struct tas /* Here we just switch the register state and the stack. 
*/ switch_to(prev, next, prev); - return prev; + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); } /* @@ -2039,36 +1836,84 @@ unsigned long nr_iowait(void) for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); - return sum; -} + return sum; +} + +unsigned long nr_active(void) +{ + unsigned long i, running = 0, uninterruptible = 0; + + for_each_online_cpu(i) { + running += cpu_rq(i)->nr_running; + uninterruptible += cpu_rq(i)->nr_uninterruptible; + } + + if (unlikely((long)uninterruptible < 0)) + uninterruptible = 0; + + return running + uninterruptible; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). + */ +static void update_cpu_load(struct rq *this_rq) +{ + u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; + unsigned long total_load = this_rq->ls.load.weight; + unsigned long this_load = total_load; + struct load_stat *ls = &this_rq->ls; + u64 now = __rq_clock(this_rq); + int i, scale; + + this_rq->nr_load_updates++; + if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) + goto do_avg; + + /* Update delta_fair/delta_exec fields first */ + update_curr_load(this_rq, now); + + fair_delta64 = ls->delta_fair + 1; + ls->delta_fair = 0; + + exec_delta64 = ls->delta_exec + 1; + ls->delta_exec = 0; + + sample_interval64 = now - ls->load_update_last; + ls->load_update_last = now; + + if ((s64)sample_interval64 < (s64)TICK_NSEC) + sample_interval64 = TICK_NSEC; + + if (exec_delta64 > sample_interval64) + exec_delta64 = sample_interval64; + + idle_delta64 = sample_interval64 - exec_delta64; + + tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); + tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); + + this_load = (unsigned long)tmp64; -unsigned long nr_active(void) -{ - 
unsigned long i, running = 0, uninterruptible = 0; +do_avg: - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } + /* Update our load: */ + for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; + /* scale is effectively 1 << i now, and >> i divides by scale */ - return running + uninterruptible; + old_load = this_rq->cpu_load[i]; + new_load = this_load; + + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + } } #ifdef CONFIG_SMP /* - * Is this task likely cache-hot: - */ -static inline int -task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) -{ - return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; -} - -/* * double_rq_lock - safely lock two runqueues * * Note this does not disable interrupts like task_rq_lock, @@ -2184,23 +2029,17 @@ void sched_exec(void) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct prio_array *src_array, - struct task_struct *p, struct rq *this_rq, - struct prio_array *this_array, int this_cpu) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { - dequeue_task(p, src_array); - dec_nr_running(p, src_rq); + deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); - enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) - + this_rq->most_recent_timestamp; + activate_task(this_rq, p, 0); /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. 
*/ - if (TASK_PREEMPTS_CURR(p, this_rq)) - resched_task(this_rq->curr); + check_preempt_curr(this_rq, p); } /* @@ -2208,7 +2047,7 @@ static void pull_task(struct rq *src_rq, */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, + struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { /* @@ -2225,132 +2064,67 @@ int can_migrate_task(struct task_struct return 0; /* - * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. + * Aggressive migration if too many balance attempts have failed: */ - - if (sd->nr_balance_failed > sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->most_recent_timestamp, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif + if (sd->nr_balance_failed > sd->cache_nice_tries) return 1; - } - if (task_hot(p, rq->most_recent_timestamp, sd)) - return 0; return 1; } -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) - -/* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted - * load from busiest to this_rq, as part of a balancing operation within - * "domain". Returns the number of tasks moved. - * - * Called with both runqueues locked. 
- */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, +static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum idle_type idle, - int *all_pinned) + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *load_moved, + int this_best_prio, int best_prio, int best_prio_seen, + struct rq_iterator *iterator) { - int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, - best_prio_seen, skip_for_load; - struct prio_array *array, *dst_array; - struct list_head *head, *curr; - struct task_struct *tmp; - long rem_load_move; + int pulled = 0, pinned = 0, skip_for_load; + struct task_struct *p; + long rem_load_move = max_load_move; if (max_nr_move == 0 || max_load_move == 0) goto out; - rem_load_move = max_load_move; pinned = 1; - this_best_prio = rq_best_prio(this_rq); - best_prio = rq_best_prio(busiest); - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) of - * any task we find with that prio. - */ - best_prio_seen = best_prio == busiest->curr->prio; - - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. 
- */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } + /* + * Start the load-balancing iterator: + */ + p = iterator->start(iterator->arg); +next: + if (!p) goto out; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - /* * To help distribute high priority tasks accross CPUs we don't * skip a task if it will be the highest priority task (i.e. smallest * prio value) on its new queue regardless of its load weight */ - skip_for_load = tmp->load_weight > rem_load_move; - if (skip_for_load && idx < this_best_prio) - skip_for_load = !best_prio_seen && idx == best_prio; + skip_for_load = (p->se.load.weight >> 1) > rem_load_move + + SCHED_LOAD_SCALE_FUZZ; + if (skip_for_load && p->prio < this_best_prio) + skip_for_load = !best_prio_seen && p->prio == best_prio; if (skip_for_load || - !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - best_prio_seen |= idx == best_prio; - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + best_prio_seen |= p->prio == best_prio; + p = iterator->next(iterator->arg); + goto next; } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pull_task(busiest, p, this_rq, this_cpu); pulled++; - rem_load_move -= tmp->load_weight; + rem_load_move -= p->se.load.weight; /* * We only want to steal up to the prescribed number of tasks * and the prescribed amount of weighted 
load. */ if (pulled < max_nr_move && rem_load_move > 0) { - if (idx < this_best_prio) - this_best_prio = idx; - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + if (p->prio < this_best_prio) + this_best_prio = p->prio; + p = iterator->next(iterator->arg); + goto next; } out: /* @@ -2362,18 +2136,48 @@ out: if (all_pinned) *all_pinned = pinned; + *load_moved = max_load_move - rem_load_move; return pulled; } /* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted + * load from busiest to this_rq, as part of a balancing operation within + * "domain". Returns the number of tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + struct sched_class *class = sched_class_highest; + unsigned long load_moved, total_nr_moved = 0, nr_moved; + long rem_load_move = max_load_move; + + do { + nr_moved = class->load_balance(this_rq, this_cpu, busiest, + max_nr_move, (unsigned long)rem_load_move, + sd, idle, all_pinned, &load_moved); + total_nr_moved += nr_moved; + max_nr_move -= nr_moved; + rem_load_move -= load_moved; + class = class->next; + } while (class && max_nr_move && rem_load_move > 0); + + return total_nr_moved; +} + +/* * find_busiest_group finds and returns the busiest CPU group within the * domain. It calculates and returns the amount of weighted load which * should be moved to restore balance via the imbalance parameter. 
*/ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle, int *sd_idle, - cpumask_t *cpus, int *balance) + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2391,9 +2195,9 @@ find_busiest_group(struct sched_domain * max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; - if (idle == NOT_IDLE) + if (idle == CPU_NOT_IDLE) load_idx = sd->busy_idx; - else if (idle == NEWLY_IDLE) + else if (idle == CPU_NEWLY_IDLE) load_idx = sd->newidle_idx; else load_idx = sd->idle_idx; @@ -2437,7 +2241,7 @@ find_busiest_group(struct sched_domain * avg_load += load; sum_nr_running += rq->nr_running; - sum_weighted_load += rq->raw_weighted_load; + sum_weighted_load += weighted_cpuload(i); } /* @@ -2477,8 +2281,9 @@ find_busiest_group(struct sched_domain * * Busy processors will not participate in power savings * balance. */ - if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto group_next; + if (idle == CPU_NOT_IDLE || + !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; /* * If the local group is idle or completely loaded @@ -2488,42 +2293,42 @@ find_busiest_group(struct sched_domain * !this_nr_running)) power_savings_balance = 0; - /* + /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations - */ - if (!power_savings_balance || sum_nr_running >= group_capacity + */ + if (!power_savings_balance || sum_nr_running >= group_capacity || !sum_nr_running) - goto group_next; + goto group_next; - /* + /* * Calculate the group which has the least non-idle load. 
- * This is the group from where we need to pick up the load - * for saving power - */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && first_cpu(group->cpumask) < first_cpu(group_min->cpumask))) { - group_min = group; - min_nr_running = sum_nr_running; + group_min = group; + min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / sum_nr_running; - } + } - /* + /* * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) { + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } } group_next: #endif @@ -2578,7 +2383,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < busiest_load_per_task) { + if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2592,7 +2397,8 @@ small_imbalance: } else this_load_per_task = SCHED_LOAD_SCALE; - if (max_load - this_load >= busiest_load_per_task * imbn) { + if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; } @@ -2639,7 +2445,7 @@ 
small_imbalance: out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; if (this == group_leader && group_leader != group_min) { @@ -2656,7 +2462,7 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum idle_type idle, +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long imbalance, cpumask_t *cpus) { struct rq *busiest = NULL, *rq; @@ -2664,17 +2470,19 @@ find_busiest_queue(struct sched_group *g int i; for_each_cpu_mask(i, group->cpumask) { + unsigned long wl; if (!cpu_isset(i, *cpus)) continue; rq = cpu_rq(i); + wl = weighted_cpuload(i); - if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) + if (rq->nr_running == 1 && wl > imbalance) continue; - if (rq->raw_weighted_load > max_load) { - max_load = rq->raw_weighted_load; + if (wl > max_load) { + max_load = wl; busiest = rq; } } @@ -2698,7 +2506,7 @@ static inline unsigned long minus_1_or_z * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle, + struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; @@ -2711,10 +2519,10 @@ static int load_balance(int this_cpu, st /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as NOT_IDLE. + * let the state of idle sibling percolate up as CPU_IDLE, instead of + * portraying it as CPU_NOT_IDLE. 
*/ - if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; @@ -2848,7 +2656,7 @@ out_one_pinned: * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * - * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). * this_rq is locked. */ static int @@ -2865,31 +2673,31 @@ load_balance_newidle(int this_cpu, struc * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as NOT_IDLE. + * portraying it as CPU_NOT_IDLE. */ if (sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, + group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, &cpus, NULL); if (!group) { - schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, &cpus); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; } BUG_ON(busiest == this_rq); - schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); nr_moved = 0; if (busiest->nr_running > 1) { @@ -2897,7 +2705,7 @@ redo: double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), - imbalance, sd, NEWLY_IDLE, NULL); + 
imbalance, sd, CPU_NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); if (!nr_moved) { @@ -2908,7 +2716,7 @@ redo: } if (!nr_moved) { - schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ -2918,7 +2726,7 @@ redo: return nr_moved; out_balanced: - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ -2934,8 +2742,8 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = 0; - unsigned long next_balance = jiffies + 60 * HZ; + int pulled_task = -1; + unsigned long next_balance = jiffies + HZ; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2954,12 +2762,13 @@ static void idle_balance(int this_cpu, s if (pulled_task) break; } - if (!pulled_task) + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on * a busy processor. So reset next_balance. 
*/ this_rq->next_balance = next_balance; + } } /* @@ -3003,7 +2812,7 @@ static void active_load_balance(struct r schedstat_inc(sd, alb_cnt); if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, + RTPRIO_TO_load_weight(100), sd, CPU_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else @@ -3012,32 +2821,6 @@ static void active_load_balance(struct r spin_unlock(&target_rq->lock); } -static void update_load(struct rq *this_rq) -{ - unsigned long this_load; - unsigned int i, scale; - - this_load = this_rq->raw_weighted_load; - - /* Update our load: */ - for (i = 0, scale = 1; i < 3; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; - } -} - #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; @@ -3120,7 +2903,7 @@ static DEFINE_SPINLOCK(balancing); * * Balancing parameters are set up in arch_init_sched_domains. */ -static inline void rebalance_domains(int cpu, enum idle_type idle) +static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) { int balance = 1; struct rq *rq = cpu_rq(cpu); @@ -3134,13 +2917,16 @@ static inline void rebalance_domains(int continue; interval = sd->balance_interval; - if (idle != SCHED_IDLE) + if (idle != CPU_IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; + if (interval > HZ*NR_CPUS/10) + interval = HZ*NR_CPUS/10; + if (sd->flags & SD_SERIALIZE) { if (!spin_trylock(&balancing)) @@ -3154,7 +2940,7 @@ static inline void rebalance_domains(int * longer idle, or one of our SMT siblings is * not idle. 
*/ - idle = NOT_IDLE; + idle = CPU_NOT_IDLE; } sd->last_balance = jiffies; } @@ -3184,7 +2970,8 @@ static void run_rebalance_domains(struct { int local_cpu = smp_processor_id(); struct rq *local_rq = cpu_rq(local_cpu); - enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; + enum cpu_idle_type idle = local_rq->idle_at_tick ? + CPU_IDLE : CPU_NOT_IDLE; rebalance_domains(local_cpu, idle); @@ -3227,9 +3014,8 @@ static void run_rebalance_domains(struct * idle load balancing owner or decide to stop the periodic load balancing, * if the whole system is idle. */ -static inline void trigger_load_balance(int cpu) +static inline void trigger_load_balance(struct rq *rq, int cpu) { - struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_NO_HZ /* * If we were in the nohz mode recently and busy at the current @@ -3281,68 +3067,58 @@ static inline void trigger_load_balance( if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); } -#else + +#else /* CONFIG_SMP */ + /* * on UP we do not need to balance between CPUs: */ static inline void idle_balance(int cpu, struct rq *rq) { } -#endif + +/* Avoid "used but not defined" warning on UP */ +static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *load_moved, + int this_best_prio, int best_prio, int best_prio_seen, + struct rq_iterator *iterator) +{ + *load_moved = 0; + + return 0; +} + +#endif /* CONFIG_SMP */ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * This is called on clock ticks and on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. 
- */ -static inline void -update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) -{ - p->sched_time += now - p->last_ran; - p->last_ran = rq->most_recent_timestamp = now; -} - -/* - * Return current->sched_time plus any more ns on the sched_clock - * that have not yet been banked. + * Return p->sum_exec_runtime plus any more ns on the sched_clock + * that have not yet been banked in case the task is currently running. */ -unsigned long long current_sched_time(const struct task_struct *p) +unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long long ns; unsigned long flags; + u64 ns, delta_exec; + struct rq *rq; - local_irq_save(flags); - ns = p->sched_time + sched_clock() - p->last_ran; - local_irq_restore(flags); + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime; + if (rq->curr == p) { + delta_exec = rq_clock(rq) - p->se.exec_start; + if ((s64)delta_exec > 0) + ns += delta_exec; + } + task_rq_unlock(rq, &flags); return ns; } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -static inline int expired_starving(struct rq *rq) -{ - if (rq->curr->static_prio > rq->best_expired_prio) - return 1; - if (!STARVATION_LIMIT || !rq->expired_timestamp) - return 0; - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) - return 1; - return 0; -} - -/* * Account user cpu time to a process. 
* @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3415,81 +3191,6 @@ void account_steal_time(struct task_stru cpustat->steal = cputime64_add(cpustat->steal, tmp); } -static void task_running_tick(struct rq *rq, struct task_struct *p) -{ - if (p->array != rq->active) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); - return; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - requeue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) 
We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - requeue_task(p, rq->active); - set_tsk_need_resched(p); - } - } -out_unlock: - spin_unlock(&rq->lock); -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3499,20 +3200,19 @@ out_unlock: */ void scheduler_tick(void) { - unsigned long long now = sched_clock(); - struct task_struct *p = current; int cpu = smp_processor_id(); - int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; - update_cpu_clock(p, rq, now); + spin_lock(&rq->lock); + if (curr != rq->idle) /* FIXME: needed? */ + curr->sched_class->task_tick(rq, curr); + update_cpu_load(rq); + spin_unlock(&rq->lock); - if (!idle_at_tick) - task_running_tick(rq, p); #ifdef CONFIG_SMP - update_load(rq); - rq->idle_at_tick = idle_at_tick; - trigger_load_balance(cpu); + rq->idle_at_tick = idle_cpu(cpu); + trigger_load_balance(rq, cpu); #endif } @@ -3554,170 +3254,129 @@ EXPORT_SYMBOL(sub_preempt_count); #endif -static inline int interactive_sleep(enum sleep_type sleep_type) +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) { - return (sleep_type == SLEEP_INTERACTIVE || - sleep_type == SLEEP_INTERRUPTED); + printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", + prev->comm, preempt_count(), prev->pid); + debug_show_held_locks(prev); + if (irqs_disabled()) + print_irqtrace_events(prev); + dump_stack(); } /* - * schedule() is the main scheduler function. 
+ * Various schedule()-time debugging checks and statistics: */ -asmlinkage void __sched schedule(void) +static inline void schedule_debug(struct task_struct *prev) { - struct task_struct *prev, *next; - struct prio_array *array; - struct list_head *queue; - unsigned long long now; - unsigned long run_time; - int cpu, idx, new_prio; - long *switch_count; - struct rq *rq; - /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic() && !current->exit_state)) { - printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) + __schedule_bug(prev); + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -need_resched: - preempt_disable(); - prev = current; - release_kernel_lock(prev); -need_resched_nonpreemptible: - rq = this_rq(); + schedstat_inc(this_rq(), sched_cnt); +} + +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) +{ + struct sched_class *class; + struct task_struct *p; /* - * The idle thread is not allowed to schedule! - * Remove this check after it has been exercised a bit. 
+ * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: */ - if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); + if (likely(rq->nr_running == rq->cfs.nr_running)) { + p = fair_sched_class.pick_next_task(rq, now); + if (likely(p)) + return p; } - schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { - run_time = now - prev->timestamp; - if (unlikely((long long)(now - prev->timestamp) < 0)) - run_time = 0; - } else - run_time = NS_MAX_SLEEP_AVG; + class = sched_class_highest; + for (;;) { + p = class->pick_next_task(rq, now); + if (p) + return p; + /* + * Will never be NULL as the idle class always + * returns a non-NULL p: + */ + class = class->next; + } +} - /* - * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? : 1); +/* + * schedule() is the main scheduler function. 
+ */ +asmlinkage void __sched schedule(void) +{ + struct task_struct *prev, *next; + long *switch_count; + struct rq *rq; + u64 now; + int cpu; + +need_resched: + preempt_disable(); + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_qsctr_inc(cpu); + prev = rq->curr; + switch_count = &prev->nivcsw; + + release_kernel_lock(prev); +need_resched_nonpreemptible: + + schedule_debug(prev); spin_lock_irq(&rq->lock); + clear_tsk_need_resched(prev); - switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) + unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; - else { - if (prev->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; - deactivate_task(prev, rq); + } else { + deactivate_task(rq, prev, 1); } + switch_count = &prev->nvcsw; } - cpu = smp_processor_id(); - if (unlikely(!rq->nr_running)) { + if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - goto switch_tasks; - } - } - - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. 
- */ - schedstat_inc(rq, sched_switch); - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); - - if (!rt_task(next) && interactive_sleep(next->sleep_type)) { - unsigned long long delta = now - next->timestamp; - if (unlikely((long long)(now - next->timestamp) < 0)) - delta = 0; - - if (next->sleep_type == SLEEP_INTERACTIVE) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - new_prio = recalc_task_prio(next, next->timestamp + delta); - - if (unlikely(next->prio != new_prio)) { - dequeue_task(next, array); - next->prio = new_prio; - enqueue_task(next, array); - } - } - next->sleep_type = SLEEP_NORMAL; -switch_tasks: - if (next == rq->idle) - schedstat_inc(rq, sched_goidle); - prefetch(next); - prefetch_stack(next); - clear_tsk_need_resched(prev); - rcu_qsctr_inc(task_cpu(prev)); - - update_cpu_clock(prev, rq, now); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; - prev->timestamp = prev->last_ran = now; + now = __rq_clock(rq); + prev->sched_class->put_prev_task(rq, prev, now); + next = pick_next_task(rq, prev, now); sched_info_switch(prev, next); + if (likely(prev != next)) { - next->timestamp = next->last_ran = now; rq->nr_switches++; rq->curr = next; ++*switch_count; - prepare_task_switch(rq, next); - prev = context_switch(rq, prev, next); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. 
- */ - finish_task_switch(this_rq(), prev); + context_switch(rq, prev, next); /* unlocks the rq */ } else spin_unlock_irq(&rq->lock); - prev = current; - if (unlikely(reacquire_kernel_lock(prev) < 0)) + if (unlikely(reacquire_kernel_lock(current) < 0)) { + cpu = smp_processor_id(); + rq = cpu_rq(cpu); goto need_resched_nonpreemptible; + } preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -4129,29 +3788,30 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - struct prio_array *array; unsigned long flags; + int oldprio, on_rq; struct rq *rq; - int oldprio; + u64 now; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + now = rq_clock(rq); oldprio = p->prio; - array = p->array; - if (array) - dequeue_task(p, array); + on_rq = p->se.on_rq; + if (on_rq) + dequeue_task(rq, p, 0, now); + + if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + p->prio = prio; - if (array) { - /* - * If changing to an RT priority then queue it - * in the active array! 
- */ - if (rt_task(p)) - array = rq->active; - enqueue_task(p, array); + if (on_rq) { + enqueue_task(rq, p, 0, now); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4160,8 +3820,9 @@ void rt_mutex_setprio(struct task_struct if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else { + check_preempt_curr(rq, p); + } } task_rq_unlock(rq, &flags); } @@ -4170,10 +3831,10 @@ void rt_mutex_setprio(struct task_struct void set_user_nice(struct task_struct *p, long nice) { - struct prio_array *array; - int old_prio, delta; + int old_prio, delta, on_rq; unsigned long flags; struct rq *rq; + u64 now; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -4182,20 +3843,21 @@ void set_user_nice(struct task_struct *p * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + now = rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL/SCHED_BATCH: + * SCHED_FIFO/SCHED_RR: */ - if (has_rt_policy(p)) { + if (task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; - if (array) { - dequeue_task(p, array); - dec_raw_weighted_load(rq, p); + on_rq = p->se.on_rq; + if (on_rq) { + dequeue_task(rq, p, 0, now); + dec_load(rq, p, now); } p->static_prio = NICE_TO_PRIO(nice); @@ -4204,9 +3866,9 @@ void set_user_nice(struct task_struct *p p->prio = effective_prio(p); delta = p->prio - old_prio; - if (array) { - enqueue_task(p, array); - inc_raw_weighted_load(rq, p); + if (on_rq) { + enqueue_task(rq, p, 0, now); + inc_load(rq, p, now); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4326,20 +3988,29 @@ 
static inline struct task_struct *find_p } /* Actually do priority change: must hold rq lock. */ -static void __setscheduler(struct task_struct *p, int policy, int prio) +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(p->se.on_rq); p->policy = policy; + switch (p->policy) { + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLE: + p->sched_class = &fair_sched_class; + break; + case SCHED_FIFO: + case SCHED_RR: + p->sched_class = &rt_sched_class; + break; + } + p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; set_load_weight(p); } @@ -4354,8 +4025,7 @@ static void __setscheduler(struct task_s int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1; - struct prio_array *array; + int retval, oldprio, oldpolicy = -1, on_rq; unsigned long flags; struct rq *rq; @@ -4366,27 +4036,27 @@ recheck: if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH) + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_ISO && policy != SCHED_IDLE) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and - * SCHED_BATCH is 0. + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH, SCHED_ISO and SCHED_IDLE is 0. 
*/ if (param->sched_priority < 0 || (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (is_rt_policy(policy) != (param->sched_priority != 0)) + if (rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - if (is_rt_policy(policy)) { + if (rt_policy(policy)) { unsigned long rlim_rtprio; - unsigned long flags; if (!lock_task_sighand(p, &flags)) return -ESRCH; @@ -4402,6 +4072,12 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* + * Like positive nice levels, dont allow tasks to + * move out of SCHED_IDLE either: + */ + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) + return -EPERM; /* can't change other user's priorities */ if ((current->euid != p->euid) && @@ -4429,13 +4105,13 @@ recheck: spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - array = p->array; - if (array) - deactivate_task(p, rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); - if (array) { - __activate_task(p, rq); + __setscheduler(rq, p, policy, param->sched_priority); + if (on_rq) { + activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4444,8 +4120,9 @@ recheck: if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else { + check_preempt_curr(rq, p); + } } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4717,41 +4394,18 @@ asmlinkage long sys_sched_getaffinity(pi /** * sys_sched_yield - yield the current processor to other threads. * - * This function yields the current CPU by moving the calling thread - * to the expired array. 
If there are no other threads running on this - * CPU then this function will return. + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - struct prio_array *array = current->array, *target = rq->expired; schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (rt_task(current)) - target = rq->active; - - if (array->nr_active == 1) { + if (unlikely(rq->nr_running == 1)) schedstat_inc(rq, yld_act_empty); - if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); - } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); - - if (array != target) { - dequeue_task(current, array); - enqueue_task(current, target); - } else - /* - * requeue_task is cheaper so perform that if possible. - */ - requeue_task(current, array); + else + current->sched_class->yield_task(rq, current); /* * Since we are going to call schedule() anyway, there's @@ -4902,6 +4556,8 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLE: ret = 0; break; } @@ -4926,6 +4582,8 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLE: ret = 0; } return ret; @@ -4960,7 +4618,7 @@ long sys_sched_rr_get_interval(pid_t pid goto out_unlock; jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : task_timeslice(p), &t); + 0 : static_prio_timeslice(p->static_prio), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; out_nounlock: @@ -5041,6 +4699,14 @@ void show_state_filter(unsigned long sta */ if (state_filter == -1) debug_show_all_locks(); +#ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); +#endif +} + +void __cpuinit init_idle_bootup_task(struct task_struct *idle) +{ + idle->sched_class = &idle_sched_class; } /** @@ -5056,13 +4722,12 @@ void __cpuinit init_idle(struct task_str struct rq *rq = cpu_rq(cpu); unsigned long flags; - idle->timestamp = sched_clock(); - idle->sleep_avg = 0; - idle->array = NULL; + __sched_fork(idle); + idle->se.exec_start = sched_clock(); + idle->prio = idle->normal_prio = MAX_PRIO; - idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); - set_task_cpu(idle, cpu); + __set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; @@ -5077,6 +4742,10 @@ void __cpuinit init_idle(struct task_str #else task_thread_info(idle)->preempt_count = 0; #endif + /* + * The idle tasks have their own, simple scheduling class: + */ + idle->sched_class = &idle_sched_class; } /* @@ -5088,6 +4757,28 @@ void __cpuinit init_idle(struct task_str */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. 
+ * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static inline void sched_init_granularity(void) +{ + unsigned int factor = 1 + ilog2(num_online_cpus()); + const unsigned long gran_limit = 10000000; + + sysctl_sched_granularity *= factor; + if (sysctl_sched_granularity > gran_limit) + sysctl_sched_granularity = gran_limit; + + sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; + sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; +} + #ifdef CONFIG_SMP /* * This is how migration works: @@ -5161,7 +4852,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0; + int ret = 0, on_rq; if (unlikely(cpu_is_offline(dest_cpu))) return ret; @@ -5177,20 +4868,13 @@ static int __migrate_task(struct task_st if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - if (p->array) { - /* - * Sync timestamp with rq_dest's before activating. - * The same thing could be achieved by doing this step - * afterwards, and pretending it was a local activate. - * This way is cleaner and logically correct. - */ - p->timestamp = p->timestamp - rq_src->most_recent_timestamp - + rq_dest->most_recent_timestamp; - deactivate_task(p, rq_src); - __activate_task(p, rq_dest); - if (TASK_PREEMPTS_CURR(p, rq_dest)) - resched_task(rq_dest->curr); + if (on_rq) { + activate_task(rq_dest, p, 0); + check_preempt_curr(rq_dest, p); } ret = 1; out: @@ -5342,7 +5026,8 @@ static void migrate_live_tasks(int src_c write_unlock_irq(&tasklist_lock); } -/* Schedules idle task to be the next runnable task on current CPU. +/* + * Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of the runqueue. Used by CPU offline code. 
*/ @@ -5362,10 +5047,10 @@ void sched_idle_next(void) */ spin_lock_irqsave(&rq->lock, flags); - __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); /* Add idle task to the _front_ of its priority queue: */ - __activate_idle_task(p, rq); + activate_idle_task(p, rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -5415,16 +5100,15 @@ static void migrate_dead(unsigned int de static void migrate_dead_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - unsigned int arr, i; + struct task_struct *next; - for (arr = 0; arr < 2; arr++) { - for (i = 0; i < MAX_PRIO; i++) { - struct list_head *list = &rq->arrays[arr].queue[i]; - - while (!list_empty(list)) - migrate_dead(dead_cpu, list_entry(list->next, - struct task_struct, run_list)); - } + for (;;) { + if (!rq->nr_running) + break; + next = pick_next_task(rq, rq->curr, rq_clock(rq)); + if (!next) + break; + migrate_dead(dead_cpu, next); } } #endif /* CONFIG_HOTPLUG_CPU */ @@ -5455,7 +5139,7 @@ migration_call(struct notifier_block *nf kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. 
*/ rq = task_rq_lock(p, &flags); - __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; @@ -5486,9 +5170,10 @@ migration_call(struct notifier_block *nf rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq->idle, rq); + deactivate_task(rq, rq->idle, 0); rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq->idle, SCHED_NORMAL, 0); + __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); + rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); task_rq_unlock(rq, &flags); migrate_nr_uninterruptible(rq); @@ -5797,483 +5482,6 @@ init_sched_build_groups(cpumask_t span, #define SD_NODES_PER_DOMAIN 16 -/* - * Self-tuning task migration cost measurement between source and target CPUs. - * - * This is done by measuring the cost of manipulating buffers of varying - * sizes. For a given buffer-size here are the steps that are taken: - * - * 1) the source CPU reads+dirties a shared buffer - * 2) the target CPU reads+dirties the same shared buffer - * - * We measure how long they take, in the following 4 scenarios: - * - * - source: CPU1, target: CPU2 | cost1 - * - source: CPU2, target: CPU1 | cost2 - * - source: CPU1, target: CPU1 | cost3 - * - source: CPU2, target: CPU2 | cost4 - * - * We then calculate the cost3+cost4-cost1-cost2 difference - this is - * the cost of migration. - * - * We then start off from a small buffer-size and iterate up to larger - * buffer sizes, in 5% steps - measuring each buffer-size separately, and - * doing a maximum search for the cost. (The maximum cost for a migration - * normally occurs when the working set size is around the effective cache - * size.) 
- */ -#define SEARCH_SCOPE 2 -#define MIN_CACHE_SIZE (64*1024U) -#define DEFAULT_CACHE_SIZE (5*1024*1024U) -#define ITERATIONS 1 -#define SIZE_THRESH 130 -#define COST_THRESH 130 - -/* - * The migration cost is a function of 'domain distance'. Domain - * distance is the number of steps a CPU has to iterate down its - * domain tree to share a domain with the other CPU. The farther - * two CPUs are from each other, the larger the distance gets. - * - * Note that we use the distance only to cache measurement results, - * the distance value is not used numerically otherwise. When two - * CPUs have the same distance it is assumed that the migration - * cost is the same. (this is a simplification but quite practical) - */ -#define MAX_DOMAIN_DISTANCE 32 - -static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = - { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -/* - * Architectures may override the migration cost and thus avoid - * boot-time calibration. Unit is nanoseconds. Mostly useful for - * virtualized hardware: - */ -#ifdef CONFIG_DEFAULT_MIGRATION_COST - CONFIG_DEFAULT_MIGRATION_COST -#else - -1LL -#endif -}; - -/* - * Allow override of migration cost - in units of microseconds. - * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost - * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: - */ -static int __init migration_cost_setup(char *str) -{ - int ints[MAX_DOMAIN_DISTANCE+1], i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - - printk("#ints: %d\n", ints[0]); - for (i = 1; i <= ints[0]; i++) { - migration_cost[i-1] = (unsigned long long)ints[i]*1000; - printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); - } - return 1; -} - -__setup ("migration_cost=", migration_cost_setup); - -/* - * Global multiplier (divisor) for migration-cutoff values, - * in percentiles. E.g. use a value of 150 to get 1.5 times - * longer cache-hot cutoff times. - * - * (We scale it from 100 to 128 to long long handling easier.) 
- */ - -#define MIGRATION_FACTOR_SCALE 128 - -static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; - -static int __init setup_migration_factor(char *str) -{ - get_option(&str, &migration_factor); - migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; - return 1; -} - -__setup("migration_factor=", setup_migration_factor); - -/* - * Estimated distance of two CPUs, measured via the number of domains - * we have to pass for the two CPUs to be in the same span: - */ -static unsigned long domain_distance(int cpu1, int cpu2) -{ - unsigned long distance = 0; - struct sched_domain *sd; - - for_each_domain(cpu1, sd) { - WARN_ON(!cpu_isset(cpu1, sd->span)); - if (cpu_isset(cpu2, sd->span)) - return distance; - distance++; - } - if (distance >= MAX_DOMAIN_DISTANCE) { - WARN_ON(1); - distance = MAX_DOMAIN_DISTANCE-1; - } - - return distance; -} - -static unsigned int migration_debug; - -static int __init setup_migration_debug(char *str) -{ - get_option(&str, &migration_debug); - return 1; -} - -__setup("migration_debug=", setup_migration_debug); - -/* - * Maximum cache-size that the scheduler should try to measure. - * Architectures with larger caches should tune this up during - * bootup. Gets used in the domain-setup code (i.e. during SMP - * bootup). - */ -unsigned int max_cache_size; - -static int __init setup_max_cache_size(char *str) -{ - get_option(&str, &max_cache_size); - return 1; -} - -__setup("max_cache_size=", setup_max_cache_size); - -/* - * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. 
This - * is the operation that is timed, so we try to generate unpredictable - * cachemisses that still end up filling the L2 cache: - */ -static void touch_cache(void *__cache, unsigned long __size) -{ - unsigned long size = __size / sizeof(long); - unsigned long chunk1 = size / 3; - unsigned long chunk2 = 2 * size / 3; - unsigned long *cache = __cache; - int i; - - for (i = 0; i < size/6; i += 8) { - switch (i % 6) { - case 0: cache[i]++; - case 1: cache[size-1-i]++; - case 2: cache[chunk1-i]++; - case 3: cache[chunk1+i]++; - case 4: cache[chunk2-i]++; - case 5: cache[chunk2+i]++; - } - } -} - -/* - * Measure the cache-cost of one task migration. Returns in units of nsec. - */ -static unsigned long long -measure_one(void *cache, unsigned long size, int source, int target) -{ - cpumask_t mask, saved_mask; - unsigned long long t0, t1, t2, t3, cost; - - saved_mask = current->cpus_allowed; - - /* - * Flush source caches to RAM and invalidate them: - */ - sched_cacheflush(); - - /* - * Migrate to the source CPU: - */ - mask = cpumask_of_cpu(source); - set_cpus_allowed(current, mask); - WARN_ON(smp_processor_id() != source); - - /* - * Dirty the working set: - */ - t0 = sched_clock(); - touch_cache(cache, size); - t1 = sched_clock(); - - /* - * Migrate to the target CPU, dirty the L2 cache and access - * the shared buffer. (which represents the working set - * of a migrated task.) - */ - mask = cpumask_of_cpu(target); - set_cpus_allowed(current, mask); - WARN_ON(smp_processor_id() != target); - - t2 = sched_clock(); - touch_cache(cache, size); - t3 = sched_clock(); - - cost = t1-t0 + t3-t2; - - if (migration_debug >= 2) - printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", - source, target, t1-t0, t1-t0, t3-t2, cost); - /* - * Flush target caches to RAM and invalidate them: - */ - sched_cacheflush(); - - set_cpus_allowed(current, saved_mask); - - return cost; -} - -/* - * Measure a series of task migrations and return the average - * result. 
Since this code runs early during bootup the system - * is 'undisturbed' and the average latency makes sense. - * - * The algorithm in essence auto-detects the relevant cache-size, - * so it will properly detect different cachesizes for different - * cache-hierarchies, depending on how the CPUs are connected. - * - * Architectures can prime the upper limit of the search range via - * max_cache_size, otherwise the search range defaults to 20MB...64K. - */ -static unsigned long long -measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) -{ - unsigned long long cost1, cost2; - int i; - - /* - * Measure the migration cost of 'size' bytes, over an - * average of 10 runs: - * - * (We perturb the cache size by a small (0..4k) - * value to compensate size/alignment related artifacts. - * We also subtract the cost of the operation done on - * the same CPU.) - */ - cost1 = 0; - - /* - * dry run, to make sure we start off cache-cold on cpu1, - * and to get any vmalloc pagefaults in advance: - */ - measure_one(cache, size, cpu1, cpu2); - for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); - - measure_one(cache, size, cpu2, cpu1); - for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); - - /* - * (We measure the non-migrating [cached] cost on both - * cpu1 and cpu2, to handle CPUs with different speeds) - */ - cost2 = 0; - - measure_one(cache, size, cpu1, cpu1); - for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); - - measure_one(cache, size, cpu2, cpu2); - for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); - - /* - * Get the per-iteration migration cost: - */ - do_div(cost1, 2 * ITERATIONS); - do_div(cost2, 2 * ITERATIONS); - - return cost1 - cost2; -} - -static unsigned long long measure_migration_cost(int cpu1, int cpu2) -{ - unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; - unsigned int 
max_size, size, size_found = 0; - long long cost = 0, prev_cost; - void *cache; - - /* - * Search from max_cache_size*5 down to 64K - the real relevant - * cachesize has to lie somewhere inbetween. - */ - if (max_cache_size) { - max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); - size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); - } else { - /* - * Since we have no estimation about the relevant - * search range - */ - max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; - size = MIN_CACHE_SIZE; - } - - if (!cpu_online(cpu1) || !cpu_online(cpu2)) { - printk("cpu %d and %d not both online!\n", cpu1, cpu2); - return 0; - } - - /* - * Allocate the working set: - */ - cache = vmalloc(max_size); - if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); - return 1000000; /* return 1 msec on very small boxen */ - } - - while (size <= max_size) { - prev_cost = cost; - cost = measure_cost(cpu1, cpu2, cache, size); - - /* - * Update the max: - */ - if (cost > 0) { - if (max_cost < cost) { - max_cost = cost; - size_found = size; - } - } - /* - * Calculate average fluctuation, we use this to prevent - * noise from triggering an early break out of the loop: - */ - fluct = abs(cost - prev_cost); - avg_fluct = (avg_fluct + fluct)/2; - - if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " - "(%8Ld %8Ld)\n", - cpu1, cpu2, size, - (long)cost / 1000000, - ((long)cost / 100000) % 10, - (long)max_cost / 1000000, - ((long)max_cost / 100000) % 10, - domain_distance(cpu1, cpu2), - cost, avg_fluct); - - /* - * If we iterated at least 20% past the previous maximum, - * and the cost has dropped by more than 20% already, - * (taking fluctuations into account) then we assume to - * have found the maximum and break out of the loop early: - */ - if (size_found && (size*100 > size_found*SIZE_THRESH)) - if (cost+avg_fluct <= 0 || - max_cost*100 > (cost+avg_fluct)*COST_THRESH) { - - if (migration_debug) - printk("-> found max.\n"); - 
break; - } - /* - * Increase the cachesize in 10% steps: - */ - size = size * 10 / 9; - } - - if (migration_debug) - printk("[%d][%d] working set size found: %d, cost: %Ld\n", - cpu1, cpu2, size_found, max_cost); - - vfree(cache); - - /* - * A task is considered 'cache cold' if at least 2 times - * the worst-case cost of migration has passed. - * - * (this limit is only listened to if the load-balancing - * situation is 'nice' - if there is a large imbalance we - * ignore it for the sake of CPU utilization and - * processing fairness.) - */ - return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; -} - -static void calibrate_migration_costs(const cpumask_t *cpu_map) -{ - int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); - unsigned long j0, j1, distance, max_distance = 0; - struct sched_domain *sd; - - j0 = jiffies; - - /* - * First pass - calculate the cacheflush times: - */ - for_each_cpu_mask(cpu1, *cpu_map) { - for_each_cpu_mask(cpu2, *cpu_map) { - if (cpu1 == cpu2) - continue; - distance = domain_distance(cpu1, cpu2); - max_distance = max(max_distance, distance); - /* - * No result cached yet? 
- */ - if (migration_cost[distance] == -1LL) - migration_cost[distance] = - measure_migration_cost(cpu1, cpu2); - } - } - /* - * Second pass - update the sched domain hierarchy with - * the new cache-hot-time estimations: - */ - for_each_cpu_mask(cpu, *cpu_map) { - distance = 0; - for_each_domain(cpu, sd) { - sd->cache_hot_time = migration_cost[distance]; - distance++; - } - } - /* - * Print the matrix: - */ - if (migration_debug) - printk("migration: max_cache_size: %d, cpu: %d MHz:\n", - max_cache_size, -#ifdef CONFIG_X86 - cpu_khz/1000 -#else - -1 -#endif - ); - if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); - } - printk("\n"); - } - j1 = jiffies; - if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0) / HZ); - - /* - * Move back to the original CPU. NUMA-Q gets confused - * if we migrate to another quad during bootup. 
- */ - if (raw_smp_processor_id() != orig_cpu) { - cpumask_t mask = cpumask_of_cpu(orig_cpu), - saved_mask = current->cpus_allowed; - - set_cpus_allowed(current, mask); - set_cpus_allowed(current, saved_mask); - } -} - #ifdef CONFIG_NUMA /** @@ -6574,7 +5782,6 @@ static void init_sched_groups_power(int static int build_sched_domains(const cpumask_t *cpu_map) { int i; - struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6719,6 +5926,7 @@ static int build_sched_domains(const cpu sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; + sd = &per_cpu(node_domains, j); sd->groups = sg; } @@ -6763,19 +5971,22 @@ static int build_sched_domains(const cpu /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(cpu_domains, i); + struct sched_domain *sd = &per_cpu(cpu_domains, i); + init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(core_domains, i); + struct sched_domain *sd = &per_cpu(core_domains, i); + init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(phys_domains, i); + struct sched_domain *sd = &per_cpu(phys_domains, i); + init_sched_groups_power(i, sd); } @@ -6803,10 +6014,6 @@ static int build_sched_domains(const cpu #endif cpu_attach_domain(sd, i); } - /* - * Tune cache-hot values: - */ - calibrate_migration_costs(cpu_map); return 0; @@ -7013,10 +6220,12 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); + sched_init_granularity(); } #else void __init sched_init_smp(void) { + sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -7030,10 +6239,27 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } +static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +{ + 
cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->fair_clock = 1; +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->rq = rq; +#endif +} + void __init sched_init(void) { - int i, j, k; + u64 now = sched_clock(); int highest_cpu = 0; + int i, j; + + /* + * Link up the scheduling class hierarchy: + */ + rt_sched_class.next = &fair_sched_class; + fair_sched_class.next = &idle_sched_class; + idle_sched_class.next = NULL; for_each_possible_cpu(i) { struct prio_array *array; @@ -7043,15 +6269,21 @@ void __init sched_init(void) spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; + rq->clock = 1; + init_cfs_rq(&rq->cfs, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); +#endif + rq->ls.load_update_last = now; + rq->ls.load_update_start = now; + for (j = 0; j < CPU_LOAD_IDX_MAX; j++) + rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; - for (j = 1; j < 3; j++) - rq->cpu_load[j] = 0; rq->active_balance = 0; + rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->migration_thread = NULL; @@ -7059,16 +6291,14 @@ void __init sched_init(void) #endif atomic_set(&rq->nr_iowait, 0); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + array = &rq->rt.active; + for (j = 0; j < MAX_RT_PRIO; j++) { + INIT_LIST_HEAD(array->queue + j); + __clear_bit(j, array->bitmap); } highest_cpu = i; + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); @@ -7095,6 +6325,10 @@ void __init sched_init(void) * when this runqueue becomes "idle". 
*/ init_idle(current, smp_processor_id()); + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -7125,29 +6359,55 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { - struct prio_array *array; struct task_struct *g, *p; unsigned long flags; struct rq *rq; + int on_rq; read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { - if (!rt_task(p)) + p->se.fair_key = 0; + p->se.wait_runtime = 0; + p->se.wait_start_fair = 0; + p->se.wait_start = 0; + p->se.exec_start = 0; + p->se.sleep_start = 0; + p->se.sleep_start_fair = 0; + p->se.block_start = 0; + task_rq(p)->cfs.fair_clock = 0; + task_rq(p)->clock = 0; + + if (!rt_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: + */ + if (TASK_NICE(p) < 0 && p->mm) + set_user_nice(p, 0); continue; + } spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); +#ifdef CONFIG_SMP + /* + * Do not touch the migration thread: + */ + if (p == rq->migration_thread) + goto out_unlock; +#endif - array = p->array; - if (array) - deactivate_task(p, task_rq(p)); - __setscheduler(p, SCHED_NORMAL, 0); - if (array) { - __activate_task(p, task_rq(p)); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(task_rq(p), p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(task_rq(p), p, 0); resched_task(rq->curr); } - +#ifdef CONFIG_SMP + out_unlock: +#endif __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); } while_each_thread(g, p); Index: linux-rt.q/kernel/sched_debug.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/sched_debug.c @@ -0,0 +1,276 @@ +/* + * kernel/time/sched_debug.c + * + * Print the CFS rbtree + * + * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public 
License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); + +/* + * This allows printing both to /proc/sched_debug and + * to the console + */ +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +static void +print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) +{ + if (rq->curr == p) + SEQ_printf(m, "R"); + else + SEQ_printf(m, " "); + + SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d " + "%15Ld %15Ld %15Ld %15Ld %15Ld\n", + p->comm, p->pid, + (long long)p->se.fair_key, + (long long)(p->se.fair_key - rq->cfs.fair_clock), + (long long)p->se.wait_runtime, + (long long)(p->nvcsw + p->nivcsw), + p->prio, + (long long)p->se.sum_exec_runtime, + (long long)p->se.sum_wait_runtime, + (long long)p->se.sum_sleep_runtime, + (long long)p->se.wait_runtime_overruns, + (long long)p->se.wait_runtime_underruns); +} + +static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) +{ + struct task_struct *g, *p; + + SEQ_printf(m, + "\nrunnable tasks:\n" + " task PID tree-key delta waiting" + " switches prio" + " sum-exec sum-wait sum-sleep" + " wait-overrun wait-underrun\n" + "------------------------------------------------------------------" + "----------------" + "------------------------------------------------" + "--------------------------------\n"); + + read_lock_irq(&tasklist_lock); + + do_each_thread(g, p) { + if (!p->se.on_rq || task_cpu(p) != rq_cpu) + continue; + + print_task(m, rq, p, now); + } while_each_thread(g, p); + + read_unlock_irq(&tasklist_lock); +} + +static void +print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +{ + s64 wait_runtime_rq_sum = 0; + struct task_struct *p; + struct rb_node *curr; + unsigned long flags; + struct rq *rq = &per_cpu(runqueues, cpu); + + spin_lock_irqsave(&rq->lock, 
flags); + curr = first_fair(cfs_rq); + while (curr) { + p = rb_entry(curr, struct task_struct, se.run_node); + wait_runtime_rq_sum += p->se.wait_runtime; + + curr = rb_next(curr); + } + spin_unlock_irqrestore(&rq->lock, flags); + + SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", + (long long)wait_runtime_rq_sum); +} + +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) +{ + SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) + + P(fair_clock); + P(exec_clock); + P(wait_runtime); + P(wait_runtime_overruns); + P(wait_runtime_underruns); + P(sleeper_bonus); +#undef P + + print_cfs_rq_runtime_sum(m, cpu, cfs_rq); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct rq *rq = &per_cpu(runqueues, cpu); + +#ifdef CONFIG_X86 + { + unsigned int freq = cpu_khz ? : 1; + + SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + cpu, freq / 1000, (freq % 1000)); + } +#else + SEQ_printf(m, "\ncpu#%d\n", cpu); +#endif + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) + + P(nr_running); + SEQ_printf(m, " .%-30s: %lu\n", "load", + rq->ls.load.weight); + P(ls.delta_fair); + P(ls.delta_exec); + P(nr_switches); + P(nr_load_updates); + P(nr_uninterruptible); + SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); + P(next_balance); + P(curr->pid); + P(clock); + P(prev_clock_raw); + P(clock_warps); + P(clock_overflows); + P(clock_unstable_events); + P(clock_max_delta); + P(cpu_load[0]); + P(cpu_load[1]); + P(cpu_load[2]); + P(cpu_load[3]); + P(cpu_load[4]); +#undef P + + print_cfs_stats(m, cpu, now); + + print_rq(m, rq, cpu, now); +} + +static int sched_debug_show(struct seq_file *m, void *v) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v19, %s %.*s\n", + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + SEQ_printf(m, "now at %Lu nsecs\n", 
(unsigned long long)now); + + for_each_online_cpu(cpu) + print_cpu(m, cpu, now); + + SEQ_printf(m, "\n"); + + return 0; +} + +void sysrq_sched_debug_show(void) +{ + sched_debug_show(NULL, NULL); +} + +static int sched_debug_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_debug_show, NULL); +} + +static struct file_operations sched_debug_fops = { + .open = sched_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init init_sched_debug_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("sched_debug", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &sched_debug_fops; + + return 0; +} + +__initcall(init_sched_debug_procfs); + +void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +{ + unsigned long flags; + int num_threads = 1; + + rcu_read_lock(); + if (lock_task_sighand(p, &flags)) { + num_threads = atomic_read(&p->signal->count); + unlock_task_sighand(p, &flags); + } + rcu_read_unlock(); + + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); + SEQ_printf(m, "----------------------------------------------\n"); +#define P(F) \ + SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) + + P(se.wait_start); + P(se.wait_start_fair); + P(se.exec_start); + P(se.sleep_start); + P(se.sleep_start_fair); + P(se.block_start); + P(se.sleep_max); + P(se.block_max); + P(se.exec_max); + P(se.wait_max); + P(se.wait_runtime); + P(se.wait_runtime_overruns); + P(se.wait_runtime_underruns); + P(se.sum_wait_runtime); + P(se.sum_exec_runtime); + SEQ_printf(m, "%-25s:%20Ld\n", + "nr_switches", (long long)(p->nvcsw + p->nivcsw)); + P(se.load.weight); + P(policy); + P(prio); +#undef P + + { + u64 t0, t1; + + t0 = sched_clock(); + t1 = sched_clock(); + SEQ_printf(m, "%-25s:%20Ld\n", "clock-delta", (long long)(t1-t0)); + } +} + +void proc_sched_set_task(struct task_struct *p) +{ + p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max 
= 0; + p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; + p->se.sum_exec_runtime = 0; +} Index: linux-rt.q/kernel/sched_fair.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/sched_fair.c @@ -0,0 +1,1107 @@ +/* + * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar + * + * Interactivity improvements by Mike Galbraith + * (C) 2007 Mike Galbraith + * + * Various enhancements by Dmitry Adamushko. + * (C) 2007 Dmitry Adamushko + * + * Group scheduling enhancements by Srivatsa Vaddagiri + * Copyright IBM Corporation, 2007 + * Author: Srivatsa Vaddagiri + * + * Scaled math optimizations by Thomas Gleixner + * Copyright (C) 2007, Thomas Gleixner + */ + +/* + * Preemption granularity: + * (default: 2 msec, units: nanoseconds) + * + * NOTE: this granularity value is not the same as the concept of + * 'timeslice length' - timeslices in CFS will typically be somewhat + * larger than this value. (to see the precise effective timeslice + * length of your workload, run vmstat and monitor the context-switches + * field) + * + * On SMP systems the value of this is multiplied by the log2 of the + * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way + * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) + */ +unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; + +/* + * SCHED_BATCH wake-up granularity. + * (default: 10 msec, units: nanoseconds) + * + * This option delays the preemption effects of decoupled workloads + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. + */ +unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = + 10000000000ULL/HZ; + +/* + * SCHED_OTHER wake-up granularity. 
+ * (default: 1 msec, units: nanoseconds) + * + * This option delays the preemption effects of decoupled workloads + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. + */ +unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; + +unsigned int sysctl_sched_stat_granularity __read_mostly; + +/* + * Initialized in sched_init_granularity(): + */ +unsigned int sysctl_sched_runtime_limit __read_mostly; + +/* + * Debugging: various feature bits + */ +enum { + SCHED_FEAT_FAIR_SLEEPERS = 1, + SCHED_FEAT_SLEEPER_AVG = 2, + SCHED_FEAT_SLEEPER_LOAD_AVG = 4, + SCHED_FEAT_PRECISE_CPU_LOAD = 8, + SCHED_FEAT_START_DEBIT = 16, + SCHED_FEAT_SKIP_INITIAL = 32, +}; + +unsigned int sysctl_sched_features __read_mostly = + SCHED_FEAT_FAIR_SLEEPERS *1 | + SCHED_FEAT_SLEEPER_AVG *1 | + SCHED_FEAT_SLEEPER_LOAD_AVG *1 | + SCHED_FEAT_PRECISE_CPU_LOAD *1 | + SCHED_FEAT_START_DEBIT *1 | + SCHED_FEAT_SKIP_INITIAL *0; + +extern struct sched_class fair_sched_class; + +/************************************************************** + * CFS operations on generic schedulable entities: + */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* cpu runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +/* currently running entity (if any) on this cfs_rq */ +static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) +{ + return cfs_rq->curr; +} + +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) + +static inline void +set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->curr = se; +} + +#else /* CONFIG_FAIR_GROUP_SCHED */ + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} + +static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (unlikely(rq->curr->sched_class 
!= &fair_sched_class)) + return NULL; + + return &rq->curr->se; +} + +#define entity_is_task(se) 1 + +static inline void +set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + + +/************************************************************** + * Scheduling class tree data structure manipulation methods: + */ + +/* + * Enqueue an entity into the rb-tree: + */ +static inline void +__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node *parent = NULL; + struct sched_entity *entry; + s64 key = se->fair_key; + int leftmost = 1; + + /* + * Find the right place in the rbtree: + */ + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_entity, run_node); + /* + * We dont care about collisions. Nodes with + * the same key stay together. 
+ */ + if (key - entry->fair_key < 0) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + /* + * Maintain a cache of leftmost tree entries (it is frequently + * used): + */ + if (leftmost) + cfs_rq->rb_leftmost = &se->run_node; + + rb_link_node(&se->run_node, parent, link); + rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + update_load_add(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running++; + se->on_rq = 1; +} + +static inline void +__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->rb_leftmost == &se->run_node) + cfs_rq->rb_leftmost = rb_next(&se->run_node); + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + update_load_sub(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running--; + se->on_rq = 0; +} + +static inline struct rb_node * first_fair(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rb_leftmost; +} + +static struct sched_entity * __pick_next_entity(struct cfs_rq *cfs_rq) +{ + return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); +} + +/************************************************************** + * Scheduling class statistics methods: + */ + +/* + * We rescale the rescheduling granularity of tasks according to their + * nice level, but only linearly, not exponentially: + */ +static long +niced_granularity(struct sched_entity *curr, unsigned long granularity) +{ + u64 tmp; + + /* + * Negative nice levels get the same granularity as nice-0: + */ + if (likely(curr->load.weight >= NICE_0_LOAD)) + return granularity; + /* + * Positive nice level tasks get linearly finer + * granularity: + */ + tmp = curr->load.weight * (u64)granularity; + + /* + * It will always fit into 'long': + */ + return (long) (tmp >> NICE_0_SHIFT); +} + +static inline void +limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + long limit = sysctl_sched_runtime_limit; + + /* + * Niced tasks have the same history dynamic range as + * non-niced tasks: + */ + if 
(unlikely(se->wait_runtime > limit)) { + se->wait_runtime = limit; + schedstat_inc(se, wait_runtime_overruns); + schedstat_inc(cfs_rq, wait_runtime_overruns); + } + if (unlikely(se->wait_runtime < -limit)) { + se->wait_runtime = -limit; + schedstat_inc(se, wait_runtime_underruns); + schedstat_inc(cfs_rq, wait_runtime_underruns); + } +} + +static inline void +__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) +{ + se->wait_runtime += delta; + schedstat_add(se, sum_wait_runtime, delta); + limit_wait_runtime(cfs_rq, se); +} + +static void +add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) +{ + schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); + __add_wait_runtime(cfs_rq, se, delta); + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); +} + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. + */ +static inline void +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) +{ + unsigned long delta, delta_exec, delta_fair; + long delta_mine; + struct load_weight *lw = &cfs_rq->load; + unsigned long load = lw->weight; + + if (unlikely(!load)) + return; + + delta_exec = curr->delta_exec; +#ifdef CONFIG_SCHEDSTATS + if (unlikely(delta_exec > curr->exec_max)) + curr->exec_max = delta_exec; +#endif + + curr->sum_exec_runtime += delta_exec; + cfs_rq->exec_clock += delta_exec; + + delta_fair = calc_delta_fair(delta_exec, lw); + delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); + + if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) { + delta = calc_delta_mine(cfs_rq->sleeper_bonus, + curr->load.weight, lw); + if (unlikely(delta > cfs_rq->sleeper_bonus)) + delta = cfs_rq->sleeper_bonus; + + cfs_rq->sleeper_bonus -= delta; + delta_mine -= delta; + } + + cfs_rq->fair_clock += delta_fair; + /* + * We executed delta_exec amount of time on the CPU, + * but we were only entitled to delta_mine amount of + * time during 
that period (if nr_running == 1 then + * the two values are equal) + * [Note: delta_mine - delta_exec is negative]: + */ + add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); +} + +static void update_curr(struct cfs_rq *cfs_rq, u64 now) +{ + struct sched_entity *curr = cfs_rq_curr(cfs_rq); + unsigned long delta_exec; + + if (unlikely(!curr)) + return; + + /* + * Get the amount of time the current task was running + * since the last time we changed load (this cannot + * overflow on 32 bits): + */ + delta_exec = (unsigned long)(now - curr->exec_start); + + curr->delta_exec += delta_exec; + + if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { + __update_curr(cfs_rq, curr, now); + curr->delta_exec = 0; + } + curr->exec_start = now; +} + +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + se->wait_start_fair = cfs_rq->fair_clock; + se->wait_start = now; +} + +/* + * Task is being enqueued - update stats: + */ +static void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + s64 key; + + /* + * Are we enqueueing a waiting task? (for current tasks + * a dequeue/enqueue event is a NOP) + */ + if (se != cfs_rq_curr(cfs_rq)) + update_stats_wait_start(cfs_rq, se, now); + /* + * Update the key: + */ + key = cfs_rq->fair_clock; + + /* + * Optimize the common nice 0 case: + */ + if (likely(se->load.weight == NICE_0_LOAD)) { + key -= se->wait_runtime; + } else { + u64 tmp; + + if (se->wait_runtime < 0) { + tmp = -se->wait_runtime; + key += (tmp * se->load.inv_weight) >> + (WMULT_SHIFT - NICE_0_SHIFT); + } else { + tmp = se->wait_runtime; + key -= (tmp * se->load.weight) >> NICE_0_SHIFT; + } + } + + se->fair_key = key; +} + +/* + * Note: must be called with a freshly updated rq->fair_clock. 
+ */ +static inline void +__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + unsigned long delta_fair = se->delta_fair_run; + +#ifdef CONFIG_SCHEDSTATS + { + s64 delta_wait = now - se->wait_start; + if (unlikely(delta_wait > se->wait_max)) + se->wait_max = delta_wait; + } +#endif + + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta_fair = (u64)delta_fair * se->load.weight >> NICE_0_SHIFT; + + add_wait_runtime(cfs_rq, se, delta_fair); +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + unsigned long delta_fair; + + delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), + (u64)(cfs_rq->fair_clock - se->wait_start_fair)); + + se->delta_fair_run += delta_fair; + if (unlikely(abs(se->delta_fair_run) >= + sysctl_sched_stat_granularity)) { + __update_stats_wait_end(cfs_rq, se, now); + se->delta_fair_run = 0; + } + + se->wait_start_fair = 0; + se->wait_start = 0; +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + update_curr(cfs_rq, now); + /* + * Mark the end of the wait period if dequeueing a + * waiting task: + */ + if (se != cfs_rq_curr(cfs_rq)) + update_stats_wait_end(cfs_rq, se, now); +} + +/* + * We are picking a new current task - update its stats: + */ +static inline void +update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + /* + * We are starting a new run period: + */ + se->exec_start = now; +} + +/* + * We are descheduling a task - update its stats: + */ +static inline void +update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + se->exec_start = 0; +} + +/************************************************** + * Scheduling class queueing methods: + */ + +static void +__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + unsigned long load = cfs_rq->load.weight, delta_fair; + long prev_runtime; + + if 
(sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) + load = rq_of(cfs_rq)->cpu_load[2]; + + delta_fair = se->delta_fair_sleep; + + /* + * Fix up delta_fair with the effect of us running + * during the whole sleep period: + */ + if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) + delta_fair = div64_likely32((u64)delta_fair * load, + load + se->load.weight); + + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta_fair = (u64)delta_fair * se->load.weight >> NICE_0_SHIFT; + + prev_runtime = se->wait_runtime; + __add_wait_runtime(cfs_rq, se, delta_fair); + delta_fair = se->wait_runtime - prev_runtime; + + /* + * Track the amount of bonus we've given to sleepers: + */ + cfs_rq->sleeper_bonus += delta_fair; + + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); +} + +static void +enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + struct task_struct *tsk = task_of(se); + unsigned long delta_fair; + + if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || + !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) + return; + + delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), + (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); + + se->delta_fair_sleep += delta_fair; + if (unlikely(abs(se->delta_fair_sleep) >= + sysctl_sched_stat_granularity)) { + __enqueue_sleeper(cfs_rq, se, now); + se->delta_fair_sleep = 0; + } + + se->sleep_start_fair = 0; + +#ifdef CONFIG_SCHEDSTATS + if (se->sleep_start) { + u64 delta = now - se->sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->sleep_max)) + se->sleep_max = delta; + + se->sleep_start = 0; + se->sum_sleep_runtime += delta; + } + if (se->block_start) { + u64 delta = now - se->block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->block_max)) + se->block_max = delta; + + se->block_start = 0; + se->sum_sleep_runtime += delta; + } +#endif +} + +static void +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int 
wakeup, u64 now) +{ + /* + * Update the fair clock. + */ + update_curr(cfs_rq, now); + + if (wakeup) + enqueue_sleeper(cfs_rq, se, now); + + update_stats_enqueue(cfs_rq, se, now); + __enqueue_entity(cfs_rq, se); +} + +static void +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int sleep, u64 now) +{ + update_stats_dequeue(cfs_rq, se, now); + if (sleep) { + se->sleep_start_fair = cfs_rq->fair_clock; +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->sleep_start = now; + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->block_start = now; + } + cfs_rq->wait_runtime -= se->wait_runtime; +#endif + } + __dequeue_entity(cfs_rq, se); +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void +__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, + struct sched_entity *curr, unsigned long granularity) +{ + s64 __delta = curr->fair_key - se->fair_key; + + /* + * Take scheduling granularity into account - do not + * preempt the current task unless the best task has + * a larger than sched_granularity fairness advantage: + */ + if (__delta > niced_granularity(curr, granularity)) + resched_task(rq_of(cfs_rq)->curr); +} + +static inline void +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +{ + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. 
(note, here we rely on pick_next_task() having + * done a put_prev_task_fair() shortly before this, which + * updated rq->fair_clock - used by update_stats_wait_end()) + */ + update_stats_wait_end(cfs_rq, se, now); + update_stats_curr_start(cfs_rq, se, now); + set_cfs_rq_curr(cfs_rq, se); +} + +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) +{ + struct sched_entity *se = __pick_next_entity(cfs_rq); + + set_next_entity(cfs_rq, se, now); + + return se; +} + +static void +put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) +{ + /* + * If still on the runqueue then deactivate_task() + * was not called and update_curr() has to be done: + */ + if (prev->on_rq) + update_curr(cfs_rq, now); + + update_stats_curr_end(cfs_rq, prev, now); + + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev, now); + set_cfs_rq_curr(cfs_rq, NULL); +} + +static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *next; + u64 now = __rq_clock(rq); + + /* + * Dequeue and enqueue the task to update its + * position within the tree: + */ + dequeue_entity(cfs_rq, curr, 0, now); + enqueue_entity(cfs_rq, curr, 0, now); + + /* + * Reschedule if another task tops the current one. 
+ */ + next = __pick_next_entity(cfs_rq); + if (next == curr) + return; + + __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); +} + +/************************************************** + * CFS operations on tasks: + */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ + for (; se; se = se->parent) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on + * another cpu ('this_cpu') + */ +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + /* A later patch will take group into account */ + return &cpu_rq(this_cpu)->cfs; +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +/* Do the two (enqueued) tasks belong to the same group ? 
*/ +static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +{ + if (curr->se.cfs_rq == p->se.cfs_rq) + return 1; + + return 0; +} + +#else /* CONFIG_FAIR_GROUP_SCHED */ + +#define for_each_sched_entity(se) \ + for (; se; se = NULL) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} + +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return &cpu_rq(this_cpu)->cfs; +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) + +static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +{ + return 1; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* + * The enqueue_task method is called before nr_running is + * increased. Here we update the fair scheduling stats and + * then put the task into the rbtree: + */ +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + for_each_sched_entity(se) { + if (se->on_rq) + break; + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, wakeup, now); + } +} + +/* + * The dequeue_task method is called before nr_running is + * decreased. 
We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static void +dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + dequeue_entity(cfs_rq, se, sleep, now); + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) + break; + } +} + +/* + * sched_yield() support is very simple - we dequeue and enqueue + */ +static void yield_task_fair(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + u64 now = __rq_clock(rq); + + /* + * Dequeue and enqueue the task to update its + * position within the tree: + */ + dequeue_entity(cfs_rq, &p->se, 0, now); + enqueue_entity(cfs_rq, &p->se, 0, now); +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + unsigned long gran; + + if (unlikely(rt_prio(p->prio))) { + update_curr(cfs_rq, rq_clock(rq)); + resched_task(curr); + return; + } + + gran = sysctl_sched_wakeup_granularity; + /* + * Batch tasks prefer throughput over latency: + */ + if (unlikely(p->policy == SCHED_BATCH)) + gran = sysctl_sched_batch_wakeup_granularity; + + if (is_same_group(curr, p)) + __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); +} + +static struct task_struct * pick_next_task_fair(struct rq *rq, u64 now) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + + if (unlikely(!cfs_rq->nr_running)) + return NULL; + + do { + se = pick_next_entity(cfs_rq, now); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); +} + +/* + * Account for a descheduled task: + */ +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) +{ + struct sched_entity *se = &prev->se; + struct 
cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + put_prev_entity(cfs_rq, se, now); + } +} + +/************************************************** + * Fair scheduling class load-balancing methods: + */ + +/* + * Load-balancing iterator. Note: while the runqueue stays locked + * during the whole iteration, the current task might be + * dequeued so the iterator has to be dequeue-safe. Here we + * achieve that by always pre-iterating before returning + * the current task: + */ +static inline struct task_struct * +__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) +{ + struct task_struct *p; + + if (!curr) + return NULL; + + p = rb_entry(curr, struct task_struct, se.run_node); + cfs_rq->rb_load_balance_curr = rb_next(curr); + + return p; +} + +static struct task_struct *load_balance_start_fair(void *arg) +{ + struct cfs_rq *cfs_rq = arg; + + return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); +} + +static struct task_struct *load_balance_next_fair(void *arg) +{ + struct cfs_rq *cfs_rq = arg; + + return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); +} + +static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr; + struct task_struct *p; + + if (!cfs_rq->nr_running) + return MAX_PRIO; + + curr = __pick_next_entity(cfs_rq); + p = task_of(curr); + + return p->prio; +} + +static int +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *total_load_moved) +{ + struct cfs_rq *busy_cfs_rq; + unsigned long load_moved, total_nr_moved = 0, nr_moved; + long rem_load_move = max_load_move; + struct rq_iterator cfs_rq_iterator; + + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + + for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { + struct cfs_rq *this_cfs_rq; + long imbalance; + unsigned 
long maxload; + int this_best_prio, best_prio, best_prio_seen = 0; + + this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + + imbalance = busy_cfs_rq->load.weight - + this_cfs_rq->load.weight; + /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ + if (imbalance <= 0) + continue; + + /* Don't pull more than imbalance/2 */ + imbalance /= 2; + maxload = min(rem_load_move, imbalance); + + this_best_prio = cfs_rq_best_prio(this_cfs_rq); + best_prio = cfs_rq_best_prio(busy_cfs_rq); + + /* + * Enable handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) + * of any task we find with that prio. + */ + if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) + best_prio_seen = 1; + + /* pass busy_cfs_rq argument into + * load_balance_[start|next]_fair iterators + */ + cfs_rq_iterator.arg = busy_cfs_rq; + nr_moved = balance_tasks(this_rq, this_cpu, busiest, + max_nr_move, maxload, sd, idle, all_pinned, + &load_moved, this_best_prio, best_prio, + best_prio_seen, &cfs_rq_iterator); + + total_nr_moved += nr_moved; + max_nr_move -= nr_moved; + rem_load_move -= load_moved; + + if (max_nr_move <= 0 || rem_load_move <= 0) + break; + } + + *total_load_moved = max_load_move - rem_load_move; + + return total_nr_moved; +} + +/* + * scheduler tick hitting a task of our scheduling class: + */ +static void task_tick_fair(struct rq *rq, struct task_struct *curr) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &curr->se; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + entity_tick(cfs_rq, se); + } +} + +/* + * Share the fairness runtime between parent and child, thus the + * total amount of pressure for CPU stays equal - new tasks + * get a chance to run but frequent forkers are not allowed to + * monopolize the CPU. 
Note: the parent runqueue is locked, + * the child is not running yet. + */ +static void task_new_fair(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct sched_entity *se = &p->se; + u64 now = rq_clock(rq); + + sched_info_queued(p); + + update_stats_enqueue(cfs_rq, se, now); + /* + * Child runs first: we let it run before the parent + * until it reschedules once. We set up the key so that + * it will preempt the parent: + */ + p->se.fair_key = current->se.fair_key - + niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; + /* + * The first wait is dominated by the child-runs-first logic, + * so do not credit it with that waiting time yet: + */ + if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) + p->se.wait_start_fair = 0; + + /* + * The statistical average of wait_runtime is about + * -granularity/2, so initialize the task with that: + */ + if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) + p->se.wait_runtime = -(sysctl_sched_granularity / 2); + + __enqueue_entity(cfs_rq, se); + inc_nr_running(p, rq, now); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. 
+ */ +static void set_curr_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se; + u64 now = rq_clock(rq); + struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + set_next_entity(cfs_rq, se, now); + } +} +#else +static void set_curr_task_fair(struct rq *rq) +{ +} +#endif + +/* + * All the scheduling class methods: + */ +struct sched_class fair_sched_class __read_mostly = { + .enqueue_task = enqueue_task_fair, + .dequeue_task = dequeue_task_fair, + .yield_task = yield_task_fair, + + .check_preempt_curr = check_preempt_curr_fair, + + .pick_next_task = pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + + .load_balance = load_balance_fair, + + .set_curr_task = set_curr_task_fair, + .task_tick = task_tick_fair, + .task_new = task_new_fair, +}; + +#ifdef CONFIG_SCHED_DEBUG +void print_cfs_stats(struct seq_file *m, int cpu, u64 now) +{ + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) + print_cfs_rq(m, cpu, cfs_rq, now); +} +#endif Index: linux-rt.q/kernel/sched_idletask.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/sched_idletask.c @@ -0,0 +1,71 @@ +/* + * idle-task scheduling class. 
+ * + * (NOTE: these are not related to SCHED_IDLE tasks which are + * handled in sched_fair.c) + */ + +/* + * Idle tasks are unconditionally rescheduled: + */ +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +{ + resched_task(rq->idle); +} + +static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) +{ + schedstat_inc(rq, sched_goidle); + + return rq->idle; +} + +/* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +static void +dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) +{ + spin_unlock_irq(&rq->lock); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + spin_lock_irq(&rq->lock); +} + +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) +{ +} + +static int +load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *total_load_moved) +{ + return 0; +} + +static void task_tick_idle(struct rq *rq, struct task_struct *curr) +{ +} + +/* + * Simple, special scheduling class for the per-CPU idle tasks: + */ +static struct sched_class idle_sched_class __read_mostly = { + /* no enqueue/yield_task for idle tasks */ + + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + + .check_preempt_curr = check_preempt_curr_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, + + .load_balance = load_balance_idle, + + .task_tick = task_tick_idle, + /* no .task_new for idle tasks */ +}; Index: linux-rt.q/kernel/sched_rt.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/sched_rt.c @@ -0,0 +1,255 @@ +/* + * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR + * policies) + */ + +/* + * Update the 
current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. + */ +static inline void update_curr_rt(struct rq *rq, u64 now) +{ + struct task_struct *curr = rq->curr; + u64 delta_exec; + + if (!task_has_rt_policy(curr)) + return; + + delta_exec = now - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + if (unlikely(delta_exec > curr->se.exec_max)) + curr->se.exec_max = delta_exec; + + curr->se.sum_exec_runtime += delta_exec; + curr->se.exec_start = now; +} + +static void +enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +{ + struct prio_array *array = &rq->rt.active; + + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void +dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) +{ + struct prio_array *array = &rq->rt.active; + + update_curr_rt(rq, now); + + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. 
+ */ +static void requeue_task_rt(struct rq *rq, struct task_struct *p) +{ + struct prio_array *array = &rq->rt.active; + + list_move_tail(&p->run_list, array->queue + p->prio); +} + +static void +yield_task_rt(struct rq *rq, struct task_struct *p) +{ + requeue_task_rt(rq, p); +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +{ + if (p->prio < rq->curr->prio) + resched_task(rq->curr); +} + +static struct task_struct * pick_next_task_rt(struct rq *rq, u64 now) +{ + struct prio_array *array = &rq->rt.active; + struct task_struct *next; + struct list_head *queue; + int idx; + + idx = sched_find_first_bit(array->bitmap); + if (idx >= MAX_RT_PRIO) + return NULL; + + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + + next->se.exec_start = now; + + return next; +} + +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) +{ + update_curr_rt(rq, now); + p->se.exec_start = 0; +} + +/* + * Load-balancing iterator. Note: while the runqueue stays locked + * during the whole iteration, the current task might be + * dequeued so the iterator has to be dequeue-safe. 
Here we + * achieve that by always pre-iterating before returning + * the current task: + */ +static struct task_struct *load_balance_start_rt(void *arg) +{ + struct rq *rq = arg; + struct prio_array *array = &rq->rt.active; + struct list_head *head, *curr; + struct task_struct *p; + int idx; + + idx = sched_find_first_bit(array->bitmap); + if (idx >= MAX_RT_PRIO) + return NULL; + + head = array->queue + idx; + curr = head->prev; + + p = list_entry(curr, struct task_struct, run_list); + + curr = curr->prev; + + rq->rt.rt_load_balance_idx = idx; + rq->rt.rt_load_balance_head = head; + rq->rt.rt_load_balance_curr = curr; + + return p; +} + +static struct task_struct *load_balance_next_rt(void *arg) +{ + struct rq *rq = arg; + struct prio_array *array = &rq->rt.active; + struct list_head *head, *curr; + struct task_struct *p; + int idx; + + idx = rq->rt.rt_load_balance_idx; + head = rq->rt.rt_load_balance_head; + curr = rq->rt.rt_load_balance_curr; + + /* + * If we arrived back to the head again then + * iterate to the next queue (if any): + */ + if (unlikely(head == curr)) { + int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + + if (next_idx >= MAX_RT_PRIO) + return NULL; + + idx = next_idx; + head = array->queue + idx; + curr = head->prev; + + rq->rt.rt_load_balance_idx = idx; + rq->rt.rt_load_balance_head = head; + } + + p = list_entry(curr, struct task_struct, run_list); + + curr = curr->prev; + + rq->rt.rt_load_balance_curr = curr; + + return p; +} + +static int +load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, unsigned long *load_moved) +{ + int this_best_prio, best_prio, best_prio_seen = 0; + int nr_moved; + struct rq_iterator rt_rq_iterator; + + best_prio = sched_find_first_bit(busiest->rt.active.bitmap); + this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); + + /* + * Enable 
handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) + * of any task we find with that prio. + */ + if (busiest->curr->prio == best_prio) + best_prio_seen = 1; + + rt_rq_iterator.start = load_balance_start_rt; + rt_rq_iterator.next = load_balance_next_rt; + /* pass 'busiest' rq argument into + * load_balance_[start|next]_rt iterators + */ + rt_rq_iterator.arg = busiest; + + nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, + max_load_move, sd, idle, all_pinned, load_moved, + this_best_prio, best_prio, best_prio_seen, + &rt_rq_iterator); + + return nr_moved; +} + +static void task_tick_rt(struct rq *rq, struct task_struct *p) +{ + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if (p->policy != SCHED_RR) + return; + + if (--p->time_slice) + return; + + p->time_slice = static_prio_timeslice(p->static_prio); + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task_rt(rq, p); +} + +/* + * No parent/child timeslice management necessary for RT tasks, + * just activate them: + */ +static void task_new_rt(struct rq *rq, struct task_struct *p) +{ + activate_task(rq, p, 1); +} + +static struct sched_class rt_sched_class __read_mostly = { + .enqueue_task = enqueue_task_rt, + .dequeue_task = dequeue_task_rt, + .yield_task = yield_task_rt, + + .check_preempt_curr = check_preempt_curr_rt, + + .pick_next_task = pick_next_task_rt, + .put_prev_task = put_prev_task_rt, + + .load_balance = load_balance_rt, + + .task_tick = task_tick_rt, + .task_new = task_new_rt, +}; Index: linux-rt.q/kernel/sched_stats.h =================================================================== --- /dev/null +++ linux-rt.q/kernel/sched_stats.h @@ -0,0 +1,235 @@ + +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing 
the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 14 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcnt = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", + cpu, rq->yld_both_empty, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, + rq->sched_switch, rq->sched_cnt, rq->sched_goidle, + rq->ttwu_cnt, rq->ttwu_local, + rq->rq_sched_info.cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + preempt_disable(); + for_each_domain(cpu, sd) { + enum cpu_idle_type itype; + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", + sd->lb_cnt[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", + sd->alb_cnt, sd->alb_failed, sd->alb_pushed, + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); + } + preempt_enable(); +#endif + } + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return 
-ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +const struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcnt++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta; +} +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{} +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +#endif + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +/* + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). 
Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct task_struct *t) +{ + unsigned long long now = sched_clock(), delta = 0; + + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; + + rq_sched_info_arrive(task_rq(t), delta); +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. 
+ */ +static inline void sched_info_queued(struct task_struct *t) +{ + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = sched_clock(); +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(struct task_struct *t) +{ + unsigned long long delta = sched_clock() - t->sched_info.last_arrival; + + t->sched_info.cpu_time += delta; + rq_sched_info_depart(task_rq(t), delta); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + struct rq *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
+ */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -488,7 +488,6 @@ void __init softirq_init(void) static int ksoftirqd(void * __bind_cpu) { - set_user_nice(current, 19); current->flags |= PF_NOFREEZE; set_current_state(TASK_INTERRUPTIBLE); Index: linux-rt.q/kernel/sysctl.c =================================================================== --- linux-rt.q.orig/kernel/sysctl.c +++ linux-rt.q/kernel/sysctl.c @@ -206,8 +206,84 @@ static ctl_table root_table[] = { { .ctl_name = 0 } }; +static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ +static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ +static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ +static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ + static ctl_table kern_table[] = { { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_granularity_ns", + .data = &sysctl_sched_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_wakeup_granularity_ns", + .data = &sysctl_sched_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = 
&max_wakeup_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_batch_wakeup_granularity_ns", + .data = &sysctl_sched_batch_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_stat_granularity_ns", + .data = &sysctl_sched_stat_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_runtime_limit_ns", + .data = &sysctl_sched_runtime_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_features", + .data = &sysctl_sched_features, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = KERN_PANIC, .procname = "panic", .data = &panic_timeout, Index: linux-rt.q/kernel/time.c =================================================================== --- linux-rt.q.orig/kernel/time.c +++ linux-rt.q/kernel/time.c @@ -57,14 +57,17 @@ EXPORT_SYMBOL(sys_tz); */ asmlinkage long sys_time(time_t __user * tloc) { - time_t i; - struct timeval tv; + /* + * We read xtime.tv_sec atomically - it's updated + * atomically by update_wall_time(), so no need to + * even read-lock the xtime seqlock: + */ + time_t i = xtime.tv_sec; - do_gettimeofday(&tv); - i = tv.tv_sec; + 
smp_rmb(); /* sys_time() results are coherent */ if (tloc) { - if (put_user(i,tloc)) + if (put_user(i, tloc)) i = -EFAULT; } return i; @@ -373,6 +376,20 @@ void do_gettimeofday (struct timeval *tv tv->tv_sec = sec; tv->tv_usec = usec; + + /* + * Make sure xtime.tv_sec [returned by sys_time()] always + * follows the gettimeofday() result precisely. This + * condition is extremely unlikely, it can hit at most + * once per second: + */ + if (unlikely(xtime.tv_sec != tv->tv_sec)) { + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock); + update_wall_time(); + write_seqlock_irqrestore(&xtime_lock); + } } EXPORT_SYMBOL(do_gettimeofday); Index: linux-rt.q/lib/Kconfig.debug =================================================================== --- linux-rt.q.orig/lib/Kconfig.debug +++ linux-rt.q/lib/Kconfig.debug @@ -105,6 +105,15 @@ config DETECT_SOFTLOCKUP can be detected via the NMI-watchdog, on platforms that support it.) +config SCHED_DEBUG + bool "Collect scheduler debugging info" + depends on DEBUG_KERNEL && PROC_FS + default y + help + If you say Y here, the /proc/sched_debug file will be provided + that can help debug the scheduler. The runtime overhead of this + option is minimal. 
+ config SCHEDSTATS bool "Collect scheduler statistics" depends on DEBUG_KERNEL && PROC_FS patches/lockstat-rt-hooks.patch0000664000077200007720000001223310646635217016145 0ustar mingomingo--- include/linux/lockdep.h | 28 ++++++++++++++++++++++++++++ kernel/rt.c | 25 ++++++++++++++++--------- kernel/rtmutex.c | 4 ++-- 3 files changed, 46 insertions(+), 11 deletions(-) Index: linux-rt.q/include/linux/lockdep.h =================================================================== --- linux-rt.q.orig/include/linux/lockdep.h +++ linux-rt.q/include/linux/lockdep.h @@ -327,6 +327,28 @@ do { \ } \ } while (0) +#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ +do { \ + if (!f_try(&(_lock)->lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + f_lock(&(_lock)->lock); \ + lock_acquired(&(_lock)->dep_map); \ + } \ +} while (0) + + +#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \ +({ \ + int ret = 0; \ + if (!f_try(&(_lock)->lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + ret = f_lock(&(_lock)->lock); \ + if (!ret) \ + lock_acquired(&(_lock)->dep_map); \ + } \ + ret; \ +}) + #else /* CONFIG_LOCK_STAT */ #define lock_contended(lock, ip) do { } while (0) @@ -335,6 +357,12 @@ do { \ #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) +#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ + f_lock(&(_lock)->lock) + +#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \ + f_lock(&(_lock)->lock) + #endif /* CONFIG_LOCK_STAT */ #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) Index: linux-rt.q/kernel/rt.c =================================================================== --- linux-rt.q.orig/kernel/rt.c +++ linux-rt.q/kernel/rt.c @@ -98,16 +98,22 @@ EXPORT_SYMBOL(_mutex_init); void __lockfunc _mutex_lock(struct mutex *lock) { mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - rt_mutex_lock(&lock->lock); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, rt_mutex_lock); } EXPORT_SYMBOL(_mutex_lock); +static int __lockfunc 
__rt_mutex_lock_interruptible(struct rt_mutex *lock) +{ + return rt_mutex_lock_interruptible(lock, 0); +} + int __lockfunc _mutex_lock_interruptible(struct mutex *lock) { int ret; mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_lock_interruptible(&lock->lock, 0); + ret = LOCK_CONTENDED_RT_RET(lock, rt_mutex_trylock, + __rt_mutex_lock_interruptible); if (ret) mutex_release(&lock->dep_map, 1, _RET_IP_); return ret; @@ -118,7 +124,7 @@ EXPORT_SYMBOL(_mutex_lock_interruptible) void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) { mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - rt_mutex_lock(&lock->lock); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, rt_mutex_lock); } EXPORT_SYMBOL(_mutex_lock_nested); @@ -127,7 +133,8 @@ int __lockfunc _mutex_lock_interruptible int ret; mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - ret = rt_mutex_lock_interruptible(&lock->lock, 0); + ret = LOCK_CONTENDED_RT_RET(lock, rt_mutex_trylock, + __rt_mutex_lock_interruptible); if (ret) mutex_release(&lock->dep_map, 1, _RET_IP_); return ret; @@ -203,7 +210,7 @@ EXPORT_SYMBOL(rt_read_trylock); void __lockfunc rt_write_lock(rwlock_t *rwlock) { rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); - __rt_spin_lock(&rwlock->lock); + LOCK_CONTENDED_RT(rwlock, rt_mutex_trylock, __rt_spin_lock); } EXPORT_SYMBOL(rt_write_lock); @@ -223,7 +230,7 @@ void __lockfunc rt_read_lock(rwlock_t *r return; } spin_unlock_irqrestore(&lock->wait_lock, flags); - __rt_spin_lock(lock); + LOCK_CONTENDED_RT(rwlock, rt_mutex_trylock, __rt_spin_lock); } EXPORT_SYMBOL(rt_read_lock); @@ -359,14 +366,14 @@ EXPORT_SYMBOL(rt_down_write_trylock); void fastcall rt_down_write(struct rw_semaphore *rwsem) { rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); - rt_mutex_lock(&rwsem->lock); + LOCK_CONTENDED_RT(rwsem, rt_mutex_trylock, rt_mutex_lock); } EXPORT_SYMBOL(rt_down_write); void fastcall rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) { rwsem_acquire(&rwsem->dep_map, 
subclass, 0, _RET_IP_); - rt_mutex_lock(&rwsem->lock); + LOCK_CONTENDED_RT(rwsem, rt_mutex_trylock, rt_mutex_lock); } EXPORT_SYMBOL(rt_down_write_nested); @@ -411,7 +418,7 @@ static void __rt_down_read(struct rw_sem return; } spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); - rt_mutex_lock(&rwsem->lock); + LOCK_CONTENDED_RT(rwsem, rt_mutex_trylock, rt_mutex_lock); } void fastcall rt_down_read(struct rw_semaphore *rwsem) Index: linux-rt.q/kernel/rtmutex.c =================================================================== --- linux-rt.q.orig/kernel/rtmutex.c +++ linux-rt.q/kernel/rtmutex.c @@ -785,8 +785,8 @@ rt_spin_lock_slowunlock(struct rt_mutex void __lockfunc rt_spin_lock(spinlock_t *lock) { - rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, __rt_spin_lock); } EXPORT_SYMBOL(rt_spin_lock); @@ -800,8 +800,8 @@ EXPORT_SYMBOL(__rt_spin_lock); void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) { - rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, __rt_spin_lock); } EXPORT_SYMBOL(rt_spin_lock_nested); patches/preempt-realtime-fs-block.patch0000664000077200007720000003053610646635215017533 0ustar mingomingo--- block/ll_rw_blk.c | 6 ++-- fs/aio.c | 6 +++- fs/block_dev.c | 34 +++++++++++++++++++++------ fs/dcache.c | 5 ++-- fs/dnotify.c | 2 - fs/exec.c | 10 ++++++-- fs/file.c | 5 ++-- fs/lockd/svc.c | 8 +----- fs/pipe.c | 12 +++++++++ fs/proc/proc_misc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ fs/proc/task_mmu.c | 4 ++- fs/xfs/linux-2.6/mrlock.h | 4 +-- fs/xfs/xfs_mount.h | 2 - include/linux/genhd.h | 11 +++++++-- 14 files changed, 133 insertions(+), 32 deletions(-) Index: linux-rt.q/block/ll_rw_blk.c =================================================================== --- linux-rt.q.orig/block/ll_rw_blk.c +++ 
linux-rt.q/block/ll_rw_blk.c @@ -1549,7 +1549,7 @@ static int ll_merge_requests_fn(request_ */ void blk_plug_device(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -1572,7 +1572,7 @@ EXPORT_SYMBOL(blk_plug_device); */ int blk_remove_plug(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) return 0; @@ -1664,7 +1664,7 @@ static void blk_unplug_timeout(unsigned **/ void blk_start_queue(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); Index: linux-rt.q/fs/aio.c =================================================================== --- linux-rt.q.orig/fs/aio.c +++ linux-rt.q/fs/aio.c @@ -582,13 +582,15 @@ static void use_mm(struct mm_struct *mm) tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); - tsk->mm = mm; - tsk->active_mm = mm; + local_irq_disable(); // FIXME /* * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise * it won't work. Update it accordingly if you change it here */ switch_mm(active_mm, mm, tsk); + tsk->mm = mm; + tsk->active_mm = mm; + local_irq_enable(); task_unlock(tsk); mmdrop(active_mm); Index: linux-rt.q/fs/block_dev.c =================================================================== --- linux-rt.q.orig/fs/block_dev.c +++ linux-rt.q/fs/block_dev.c @@ -1212,14 +1212,32 @@ static int __blkdev_get(struct block_dev * For now, block device ->open() routine must _not_ * examine anything in 'inode' argument except ->i_rdev. 
*/ - struct file fake_file = {}; - struct dentry fake_dentry = {}; - fake_file.f_mode = mode; - fake_file.f_flags = flags; - fake_file.f_path.dentry = &fake_dentry; - fake_dentry.d_inode = bdev->bd_inode; - - return do_open(bdev, &fake_file, for_part); + struct file *fake_file; + struct dentry *fake_dentry; + int err = -ENOMEM; + + fake_file = kmalloc(sizeof(*fake_file), GFP_KERNEL); + if (!fake_file) + goto out; + memset(fake_file, 0, sizeof(*fake_file)); + + fake_dentry = kmalloc(sizeof(*fake_dentry), GFP_KERNEL); + if (!fake_dentry) + goto out_free_file; + memset(fake_dentry, 0, sizeof(*fake_dentry)); + + fake_file->f_mode = mode; + fake_file->f_flags = flags; + fake_file->f_path.dentry = fake_dentry; + fake_dentry->d_inode = bdev->bd_inode; + + err = do_open(bdev, fake_file, for_part); + + kfree(fake_dentry); +out_free_file: + kfree(fake_file); +out: + return err; } int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags) Index: linux-rt.q/fs/dcache.c =================================================================== --- linux-rt.q.orig/fs/dcache.c +++ linux-rt.q/fs/dcache.c @@ -708,8 +708,9 @@ void shrink_dcache_for_umount(struct sup { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); +// -rt: this might succeed there ... 
+// if (down_read_trylock(&sb->s_umount)) +// BUG(); dentry = sb->s_root; sb->s_root = NULL; Index: linux-rt.q/fs/dnotify.c =================================================================== --- linux-rt.q.orig/fs/dnotify.c +++ linux-rt.q/fs/dnotify.c @@ -162,7 +162,7 @@ void dnotify_parent(struct dentry *dentr spin_lock(&dentry->d_lock); parent = dentry->d_parent; - if (parent->d_inode->i_dnotify_mask & event) { + if (unlikely(parent->d_inode->i_dnotify_mask & event)) { dget(parent); spin_unlock(&dentry->d_lock); __inode_dir_notify(parent->d_inode, event); Index: linux-rt.q/fs/exec.c =================================================================== --- linux-rt.q.orig/fs/exec.c +++ linux-rt.q/fs/exec.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -555,11 +556,16 @@ static int exec_mmap(struct mm_struct *m } } task_lock(tsk); + + local_irq_disable(); active_mm = tsk->active_mm; + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); + task_unlock(tsk); + arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); @@ -681,7 +687,7 @@ static int de_thread(struct task_struct */ leader = tsk->group_leader; while (leader->exit_state != EXIT_ZOMBIE) - yield(); + msleep(1); /* * The only record we have of the real-time age of a Index: linux-rt.q/fs/file.c =================================================================== --- linux-rt.q.orig/fs/file.c +++ linux-rt.q/fs/file.c @@ -96,14 +96,15 @@ void free_fdtable_rcu(struct rcu_head *r kfree(fdt->open_fds); kfree(fdt); } else { - fddef = &get_cpu_var(fdtable_defer_list); + + fddef = &per_cpu(fdtable_defer_list, raw_smp_processor_id()); + spin_lock(&fddef->lock); fdt->next = fddef->next; fddef->next = fdt; /* vmallocs are handled from the workqueue context */ schedule_work(&fddef->wq); spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); } } Index: linux-rt.q/fs/lockd/svc.c 
=================================================================== --- linux-rt.q.orig/fs/lockd/svc.c +++ linux-rt.q/fs/lockd/svc.c @@ -340,16 +340,12 @@ lockd_down(void) * Wait for the lockd process to exit, but since we're holding * the lockd semaphore, we can't wait around forever ... */ - clear_thread_flag(TIF_SIGPENDING); - interruptible_sleep_on_timeout(&lockd_exit, HZ); - if (nlmsvc_pid) { + if (wait_event_interruptible_timeout(lockd_exit, + nlmsvc_pid == 0, HZ) <= 0) { printk(KERN_WARNING "lockd_down: lockd failed to exit, clearing pid\n"); nlmsvc_pid = 0; } - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); out: mutex_unlock(&nlmsvc_mutex); } Index: linux-rt.q/fs/pipe.c =================================================================== --- linux-rt.q.orig/fs/pipe.c +++ linux-rt.q/fs/pipe.c @@ -326,8 +326,14 @@ redo: wake_up_interruptible(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_accessed(filp); +#endif return ret; } @@ -499,8 +505,14 @@ out: wake_up_interruptible(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? 
+ */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_update_time(filp); +#endif return ret; } Index: linux-rt.q/fs/proc/proc_misc.c =================================================================== --- linux-rt.q.orig/fs/proc/proc_misc.c +++ linux-rt.q/fs/proc/proc_misc.c @@ -96,6 +96,27 @@ static int loadavg_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_PREEMPT_RT +static int loadavg_rt_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + extern unsigned long avenrun_rt[]; + extern unsigned long rt_nr_running(void); + int a, b, c; + int len; + + a = avenrun_rt[0] + (FIXED_1/200); + b = avenrun_rt[1] + (FIXED_1/200); + c = avenrun_rt[2] + (FIXED_1/200); + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + rt_nr_running(), nr_threads, current->nsproxy->pid_ns->last_pid); + return proc_calc_metrics(page, start, off, count, eof, len); +} +#endif + static int uptime_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -528,6 +549,38 @@ static int show_stat(struct seq_file *p, nr_running(), nr_iowait()); +#ifdef CONFIG_PREEMPT_RT + { + unsigned long nr_uninterruptible_cpu(int cpu); + extern int pi_initialized; + unsigned long rt_nr_running(void); + unsigned long rt_nr_running_cpu(int cpu); + unsigned long rt_nr_uninterruptible(void); + unsigned long rt_nr_uninterruptible_cpu(int cpu); + + int i; + + seq_printf(p, "pi_init: %d\n", pi_initialized); + seq_printf(p, "nr_running(): %ld\n", + nr_running()); + seq_printf(p, "nr_uninterruptible(): %ld\n", + nr_uninterruptible()); + for_each_online_cpu(i) + seq_printf(p, "nr_uninterruptible(%d): %ld\n", + i, nr_uninterruptible_cpu(i)); + seq_printf(p, "rt_nr_running(): %ld\n", + rt_nr_running()); + for_each_online_cpu(i) + seq_printf(p, "rt_nr_running(%d): %ld\n", + i, rt_nr_running_cpu(i)); + seq_printf(p, 
"nr_rt_uninterruptible(): %ld\n", + rt_nr_uninterruptible()); + for_each_online_cpu(i) + seq_printf(p, "nr_rt_uninterruptible(%d): %ld\n", + i, rt_nr_uninterruptible_cpu(i)); + } +#endif + return 0; } @@ -689,6 +742,9 @@ void __init proc_misc_init(void) int (*read_proc)(char*,char**,off_t,int,int*,void*); } *p, simple_ones[] = { {"loadavg", loadavg_read_proc}, +#ifdef CONFIG_PREEMPT_RT + {"loadavgrt", loadavg_rt_read_proc}, +#endif {"uptime", uptime_read_proc}, {"meminfo", meminfo_read_proc}, {"version", version_read_proc}, Index: linux-rt.q/fs/proc/task_mmu.c =================================================================== --- linux-rt.q.orig/fs/proc/task_mmu.c +++ linux-rt.q/fs/proc/task_mmu.c @@ -417,8 +417,10 @@ static void *m_start(struct seq_file *m, vma = NULL; if ((unsigned long)l < mm->map_count) { vma = mm->mmap; - while (l-- && vma) + while (l-- && vma) { vma = vma->vm_next; + cond_resched(); + } goto out; } Index: linux-rt.q/fs/xfs/linux-2.6/mrlock.h =================================================================== --- linux-rt.q.orig/fs/xfs/linux-2.6/mrlock.h +++ linux-rt.q/fs/xfs/linux-2.6/mrlock.h @@ -23,8 +23,8 @@ enum { MR_NONE, MR_ACCESS, MR_UPDATE }; typedef struct { - struct rw_semaphore mr_lock; - int mr_writer; + struct compat_rw_semaphore mr_lock; + int mr_writer; } mrlock_t; #define mrinit(mrp, name) \ Index: linux-rt.q/fs/xfs/xfs_mount.h =================================================================== --- linux-rt.q.orig/fs/xfs/xfs_mount.h +++ linux-rt.q/fs/xfs/xfs_mount.h @@ -376,7 +376,7 @@ typedef struct xfs_mount { uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct compat_rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ sema_t m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS 
*/ Index: linux-rt.q/include/linux/genhd.h =================================================================== --- linux-rt.q.orig/include/linux/genhd.h +++ linux-rt.q/include/linux/genhd.h @@ -157,15 +157,22 @@ struct disk_attribute { * variants disable/enable preemption. */ #ifdef CONFIG_SMP -#define __disk_stat_add(gendiskp, field, addnd) \ - (per_cpu_ptr(gendiskp->dkstats, smp_processor_id())->field += addnd) +#define __disk_stat_add(gendiskp, field, addnd) \ +do { \ + preempt_disable(); \ + (per_cpu_ptr(gendiskp->dkstats, \ + smp_processor_id())->field += addnd); \ + preempt_enable(); \ +} while (0) #define disk_stat_read(gendiskp, field) \ ({ \ typeof(gendiskp->dkstats->field) res = 0; \ int i; \ + preempt_disable(); \ for_each_possible_cpu(i) \ res += per_cpu_ptr(gendiskp->dkstats, i)->field; \ + preempt_enable(); \ res; \ }) patches/ppc-gtod-support.patch0000664000077200007720000002453310646635213016006 0ustar mingomingoEarly pass on powerpc conversion to generic timekeeping. Signed-off-by: John Stultz arch/powerpc/Kconfig | 4 arch/powerpc/kernel/time.c | 279 +++++---------------------------------------- 2 files changed, 37 insertions(+), 246 deletions(-) linux-2.6.18-rc6_timeofday-arch-ppc_C6.patch ============================================ Index: linux-rt.q/arch/powerpc/Kconfig =================================================================== --- linux-rt.q.orig/arch/powerpc/Kconfig +++ linux-rt.q/arch/powerpc/Kconfig @@ -31,6 +31,10 @@ config MMU bool default y +config GENERIC_TIME + bool + default y + config GENERIC_HARDIRQS bool default y Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -116,8 +116,6 @@ EXPORT_SYMBOL_GPL(rtc_lock); u64 tb_to_ns_scale; unsigned tb_to_ns_shift; -struct gettimeofday_struct do_gtod; - extern struct timezone sys_tz; static long timezone_offset; @@ -375,160 +373,6 @@ 
static __inline__ void timer_check_rtc(v } } -/* - * This version of gettimeofday has microsecond resolution. - */ -static inline void __do_gettimeofday(struct timeval *tv) -{ - unsigned long sec, usec; - u64 tb_ticks, xsec; - struct gettimeofday_vars *temp_varp; - u64 temp_tb_to_xs, temp_stamp_xsec; - - /* - * These calculations are faster (gets rid of divides) - * if done in units of 1/2^20 rather than microseconds. - * The conversion to microseconds at the end is done - * without a divide (and in fact, without a multiply) - */ - temp_varp = do_gtod.varp; - - /* Sampling the time base must be done after loading - * do_gtod.varp in order to avoid racing with update_gtod. - */ - data_barrier(temp_varp); - tb_ticks = get_tb() - temp_varp->tb_orig_stamp; - temp_tb_to_xs = temp_varp->tb_to_xs; - temp_stamp_xsec = temp_varp->stamp_xsec; - xsec = temp_stamp_xsec + mulhdu(tb_ticks, temp_tb_to_xs); - sec = xsec / XSEC_PER_SEC; - usec = (unsigned long)xsec & (XSEC_PER_SEC - 1); - usec = SCALE_XSEC(usec, 1000000); - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -void do_gettimeofday(struct timeval *tv) -{ - if (__USE_RTC()) { - /* do this the old way */ - unsigned long flags, seq; - unsigned int sec, nsec, usec; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec + tb_ticks_since(tb_last_jiffy); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - usec = nsec / 1000; - while (usec >= 1000000) { - usec -= 1000000; - ++sec; - } - tv->tv_sec = sec; - tv->tv_usec = usec; - return; - } - __do_gettimeofday(tv); -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * There are two copies of tb_to_xs and stamp_xsec so that no - * lock is needed to access and use these values in - * do_gettimeofday. We alternate the copies and as long as a - * reasonable time elapses between changes, there will never - * be inconsistent values. ntpd has a minimum of one minute - * between updates. 
- */ -static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, - u64 new_tb_to_xs) -{ - unsigned temp_idx; - struct gettimeofday_vars *temp_varp; - - temp_idx = (do_gtod.var_idx == 0); - temp_varp = &do_gtod.vars[temp_idx]; - - temp_varp->tb_to_xs = new_tb_to_xs; - temp_varp->tb_orig_stamp = new_tb_stamp; - temp_varp->stamp_xsec = new_stamp_xsec; - smp_mb(); - do_gtod.varp = temp_varp; - do_gtod.var_idx = temp_idx; - - /* - * tb_update_count is used to allow the userspace gettimeofday code - * to assure itself that it sees a consistent view of the tb_to_xs and - * stamp_xsec variables. It reads the tb_update_count, then reads - * tb_to_xs and stamp_xsec and then reads tb_update_count again. If - * the two values of tb_update_count match and are even then the - * tb_to_xs and stamp_xsec values are consistent. If not, then it - * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. - */ - vdso_data->tb_orig_stamp = new_tb_stamp; - vdso_data->stamp_xsec = new_stamp_xsec; - vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; - smp_wmb(); - ++(vdso_data->tb_update_count); -} - -/* - * When the timebase - tb_orig_stamp gets too big, we do a manipulation - * between tb_orig_stamp and stamp_xsec. The goal here is to keep the - * difference tb - tb_orig_stamp small enough to always fit inside a - * 32 bits number. This is a requirement of our fast 32 bits userland - * implementation in the vdso. If we "miss" a call to this function - * (interrupt latency, CPU locked in a spinlock, ...) 
and we end up - * with a too big difference, then the vdso will fallback to calling - * the syscall - */ -static __inline__ void timer_recalc_offset(u64 cur_tb) -{ - unsigned long offset; - u64 new_stamp_xsec; - u64 tlen, t2x; - u64 tb, xsec_old, xsec_new; - struct gettimeofday_vars *varp; - - if (__USE_RTC()) - return; - tlen = current_tick_length(); - offset = cur_tb - do_gtod.varp->tb_orig_stamp; - if (tlen == last_tick_len && offset < 0x80000000u) - return; - if (tlen != last_tick_len) { - t2x = mulhdu(tlen << TICKLEN_SHIFT, ticklen_to_xs); - last_tick_len = tlen; - } else - t2x = do_gtod.varp->tb_to_xs; - new_stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; - - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Make sure time doesn't go backwards for userspace gettimeofday. - */ - tb = get_tb(); - varp = do_gtod.varp; - xsec_old = mulhdu(tb - varp->tb_orig_stamp, varp->tb_to_xs) - + varp->stamp_xsec; - xsec_new = mulhdu(tb - cur_tb, t2x) + new_stamp_xsec; - if (xsec_new < xsec_old) - new_stamp_xsec += xsec_old - xsec_new; - - update_gtod(cur_tb, new_stamp_xsec, t2x); -} - #ifdef CONFIG_SMP unsigned long notrace profile_pc(struct pt_regs *regs) { @@ -578,11 +422,7 @@ static void iSeries_tb_recal(void) tb_ticks_per_sec = new_tb_ticks_per_sec; calc_cputime_factors(); div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; - do_gtod.varp->tb_to_xs = tb_to_xs; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->tb_to_xs = tb_to_xs; } else { printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" @@ -738,77 +578,6 @@ unsigned long long sched_clock(void) return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift; } -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, new_sec = tv->tv_sec; - long wtm_nsec, new_nsec = tv->tv_nsec; - unsigned long flags; - u64 new_xsec; 
- unsigned long tb_delta; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - /* - * Updating the RTC is not the job of this code. If the time is - * stepped under NTP, the RTC will be updated after STA_UNSYNC - * is cleared. Tools like clock/hwclock either copy the RTC - * to the system time, in which case there is no point in writing - * to the RTC again, or write to the RTC but then they don't call - * settimeofday to perform this operation. - */ -#ifdef CONFIG_PPC_ISERIES - if (firmware_has_feature(FW_FEATURE_ISERIES) && first_settimeofday) { - iSeries_tb_recal(); - first_settimeofday = 0; - } -#endif - - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Subtract off the number of nanoseconds since the - * beginning of the last tick. - */ - tb_delta = tb_ticks_since(tb_last_jiffy); - tb_delta = mulhdu(tb_delta, do_gtod.varp->tb_to_xs); /* in xsec */ - new_nsec -= SCALE_XSEC(tb_delta, 1000000000); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec); - - set_normalized_timespec(&xtime, new_sec, new_nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - /* In case of a large backwards jump in time with NTP, we want the - * clock to be updated as soon as the PLL is again in lock. 
- */ - last_rtc_update = new_sec - 658; - - ntp_clear(); - - new_xsec = xtime.tv_nsec; - if (new_xsec != 0) { - new_xsec *= XSEC_PER_SEC; - do_div(new_xsec, NSEC_PER_SEC); - } - new_xsec += (u64)xtime.tv_sec * XSEC_PER_SEC; - update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); - - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; - - write_sequnlock_irqrestore(&xtime_lock, flags); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int __init get_freq(char *name, int cells, unsigned long *val) { struct device_node *cpu; @@ -974,20 +743,6 @@ void __init time_init(void) xtime.tv_sec = tm; xtime.tv_nsec = 0; - do_gtod.varp = &do_gtod.vars[0]; - do_gtod.var_idx = 0; - do_gtod.varp->tb_orig_stamp = tb_last_jiffy; - __get_cpu_var(last_jiffy) = tb_last_jiffy; - do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - do_gtod.varp->tb_to_xs = tb_to_xs; - do_gtod.tb_to_us = tb_to_us; - - vdso_data->tb_orig_stamp = tb_last_jiffy; - vdso_data->tb_update_count = 0; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - vdso_data->tb_to_xs = tb_to_xs; time_freq = 0; @@ -1000,7 +755,6 @@ void __init time_init(void) set_dec(tb_ticks_per_jiffy); } - #define FEBRUARY 2 #define STARTOFTIME 1970 #define SECDAY 86400L @@ -1145,3 +899,36 @@ void div128_by_32(u64 dividend_high, u64 dr->result_low = ((u64)y << 32) + z; } + + +/* powerpc clocksource code */ + +#include +static cycle_t timebase_read(void) +{ + return (cycle_t)get_tb(); +} + +struct clocksource clocksource_timebase = { + .name = "timebase", + .rating = 200, + .read = timebase_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 22, +}; + + +/* XXX - this should be calculated or properly externed! 
*/ +static int __init init_timebase_clocksource(void) +{ + if (__USE_RTC()) + return -ENODEV; + + clocksource_timebase.mult = clocksource_hz2mult(tb_ticks_per_sec, + clocksource_timebase.shift); + return clocksource_register(&clocksource_timebase); +} + +module_init(init_timebase_clocksource); + patches/jiffies-remove-unused-macros.patch0000664000077200007720000000266610646635211020255 0ustar mingomingoSubject: jiffies: remove unused macros From: Chris Wright The x86 hpet cleanups allow removal of some unused macros. Signed-off-by: Chris Wright Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 6 ------ 1 file changed, 6 deletions(-) Index: linux-rt.q/include/linux/jiffies.h =================================================================== --- linux-rt.q.orig/include/linux/jiffies.h +++ linux-rt.q/include/linux/jiffies.h @@ -36,8 +36,6 @@ /* LATCH is used in the interval timer and ftape setup. */ #define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ -#define LATCH_HPET ((HPET_TICK_RATE + HZ/2) / HZ) - /* Suppose we want to devide two numbers NOM and DEN: NOM/DEN, the we can * improve accuracy by shifting LSH bits, hence calculating: * (NOM << LSH) / DEN @@ -53,13 +51,9 @@ /* HZ is the requested value. 
ACTHZ is actual HZ ("<< 8" is for accuracy) */ #define ACTHZ (SH_DIV (CLOCK_TICK_RATE, LATCH, 8)) -#define ACTHZ_HPET (SH_DIV (HPET_TICK_RATE, LATCH_HPET, 8)) - /* TICK_NSEC is the time between ticks in nsec assuming real ACTHZ */ #define TICK_NSEC (SH_DIV (1000000UL * 1000, ACTHZ, 8)) -#define TICK_NSEC_HPET (SH_DIV(1000000UL * 1000, ACTHZ_HPET, 8)) - /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) patches/rt-mutex-core.patch0000664000077200007720000055302110646635214015272 0ustar mingomingo--- drivers/acpi/processor_idle.c | 6 drivers/input/ff-memless.c | 1 fs/proc/array.c | 28 + include/linux/bit_spinlock.h | 4 include/linux/init_task.h | 3 include/linux/irqflags.h | 37 + include/linux/mutex.h | 57 ++ include/linux/plist.h | 4 include/linux/preempt.h | 25 + include/linux/rt_lock.h | 339 ++++++++++++++++ include/linux/rtmutex.h | 4 include/linux/rwsem-spinlock.h | 35 - include/linux/rwsem.h | 108 ++++- include/linux/sched.h | 99 ++++ include/linux/semaphore.h | 50 ++ include/linux/seqlock.h | 195 ++++++++- include/linux/smp.h | 2 include/linux/spinlock.h | 794 +++++++++++++++++++++++++++++--------- include/linux/spinlock_api_smp.h | 91 ++-- include/linux/spinlock_api_up.h | 74 ++- include/linux/spinlock_types.h | 63 ++- include/linux/spinlock_types_up.h | 15 include/linux/spinlock_up.h | 8 init/main.c | 2 kernel/Makefile | 6 kernel/fork.c | 7 kernel/futex.c | 4 kernel/hrtimer.c | 4 kernel/lockdep.c | 2 kernel/rt.c | 571 +++++++++++++++++++++++++++ kernel/rtmutex-debug.c | 113 +---- kernel/rtmutex.c | 433 ++++++++++++++++++-- kernel/rwsem.c | 44 +- kernel/sched.c | 127 ++++-- kernel/softirq.c | 6 kernel/spinlock.c | 267 ++++++++---- kernel/stop_machine.c | 2 lib/debug_locks.c | 1 lib/dec_and_lock.c | 4 lib/kernel_lock.c | 4 lib/locking-selftest.c | 6 lib/plist.c | 2 lib/rwsem-spinlock.c | 29 - lib/rwsem.c | 6 lib/semaphore-sleepers.c | 16 lib/spinlock_debug.c | 64 +-- net/ipv4/route.c | 
2 net/ipv4/tcp.c | 4 48 files changed, 3056 insertions(+), 712 deletions(-) Index: linux-rt.q/drivers/acpi/processor_idle.c =================================================================== --- linux-rt.q.orig/drivers/acpi/processor_idle.c +++ linux-rt.q/drivers/acpi/processor_idle.c @@ -884,7 +884,7 @@ static int acpi_idle_enter_c1(struct cpu * NEED_RESCHED: */ smp_mb(); - if (!need_resched()) + if (!need_resched() || !need_resched_delayed()) safe_halt(); current_thread_info()->status |= TS_POLLING; @@ -920,7 +920,7 @@ static int acpi_idle_enter_c2(struct cpu */ smp_mb(); - if (unlikely(need_resched())) { + if (unlikely(need_resched() || need_resched_delayed())) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return 0; @@ -978,7 +978,7 @@ static int acpi_idle_enter_c3(struct cpu */ smp_mb(); - if (unlikely(need_resched())) { + if (unlikely(need_resched() || need_resched_delayed())) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return 0; Index: linux-rt.q/drivers/input/ff-memless.c =================================================================== --- linux-rt.q.orig/drivers/input/ff-memless.c +++ linux-rt.q/drivers/input/ff-memless.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include Index: linux-rt.q/fs/proc/array.c =================================================================== --- linux-rt.q.orig/fs/proc/array.c +++ linux-rt.q/fs/proc/array.c @@ -130,17 +130,19 @@ static inline char * task_name(struct ta */ static const char *task_state_array[] = { "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "M (running-mutex)", /* 1 */ + "S (sleeping)", /* 2 */ + "D (disk sleep)", /* 4 */ + "T (stopped)", /* 8 */ + "T (tracing stop)", /* 16 */ + "Z (zombie)", /* 32 */ + "X (dead)" /* 64 */ }; static inline const char * get_task_state(struct task_struct *tsk) { unsigned int 
state = (tsk->state & (TASK_RUNNING | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_STOPPED | @@ -289,6 +291,19 @@ static inline char *task_cap(struct task cap_t(p->cap_effective)); } +#define get_blocked_on(t) (-1) + +static char *show_blocked_on(struct task_struct *task, char *buffer) +{ + pid_t pid = get_blocked_on(task); + + if (pid < 0) + return buffer; + + return buffer + sprintf(buffer,"BlckOn: %d\n",pid); +} + + int proc_pid_status(struct task_struct *task, char * buffer) { char * orig = buffer; @@ -307,6 +322,7 @@ int proc_pid_status(struct task_struct * #if defined(CONFIG_S390) buffer = task_show_regs(task, buffer); #endif + buffer = show_blocked_on(task,buffer); return buffer - orig; } Index: linux-rt.q/include/linux/bit_spinlock.h =================================================================== --- linux-rt.q.orig/include/linux/bit_spinlock.h +++ linux-rt.q/include/linux/bit_spinlock.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BIT_SPINLOCK_H #define __LINUX_BIT_SPINLOCK_H +#if 0 + /* * bit-based spin_lock() * @@ -73,5 +75,7 @@ static inline int bit_spin_is_locked(int #endif } +#endif + #endif /* __LINUX_BIT_SPINLOCK_H */ Index: linux-rt.q/include/linux/init_task.h =================================================================== --- linux-rt.q.orig/include/linux/init_task.h +++ linux-rt.q/include/linux/init_task.h @@ -8,6 +8,7 @@ #include #include #include +#include #define INIT_FDTABLE \ { \ @@ -161,7 +162,7 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ - .pi_lock = __SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ + .pi_lock = RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ Index: linux-rt.q/include/linux/irqflags.h =================================================================== --- linux-rt.q.orig/include/linux/irqflags.h +++ 
linux-rt.q/include/linux/irqflags.h @@ -11,6 +11,12 @@ #ifndef _LINUX_TRACE_IRQFLAGS_H #define _LINUX_TRACE_IRQFLAGS_H +#define BUILD_CHECK_IRQ_FLAGS(flags) \ + do { \ + BUILD_BUG_ON(sizeof(flags) != sizeof(unsigned long)); \ + typecheck(unsigned long, flags); \ + } while (0) + #ifdef CONFIG_TRACE_IRQFLAGS extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); @@ -59,10 +65,15 @@ #define local_irq_disable() \ do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0) #define local_irq_save(flags) \ - do { raw_local_irq_save(flags); trace_hardirqs_off(); } while (0) + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_local_irq_save(flags); \ + trace_hardirqs_off(); \ + } while (0) #define local_irq_restore(flags) \ do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ if (raw_irqs_disabled_flags(flags)) { \ raw_local_irq_restore(flags); \ trace_hardirqs_off(); \ @@ -78,8 +89,16 @@ */ # define raw_local_irq_disable() local_irq_disable() # define raw_local_irq_enable() local_irq_enable() -# define raw_local_irq_save(flags) local_irq_save(flags) -# define raw_local_irq_restore(flags) local_irq_restore(flags) +# define raw_local_irq_save(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + local_irq_save(flags); \ + } while (0) +# define raw_local_irq_restore(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + local_irq_restore(flags); \ + } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT @@ -89,7 +108,11 @@ raw_safe_halt(); \ } while (0) -#define local_save_flags(flags) raw_local_save_flags(flags) +#define local_save_flags(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_local_save_flags(flags); \ + } while (0) #define irqs_disabled() \ ({ \ @@ -99,7 +122,11 @@ raw_irqs_disabled_flags(flags); \ }) -#define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags) +#define irqs_disabled_flags(flags) \ +({ \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_irqs_disabled_flags(flags); \ +}) #endif /* CONFIG_X86 */ 
#endif Index: linux-rt.q/include/linux/mutex.h =================================================================== --- linux-rt.q.orig/include/linux/mutex.h +++ linux-rt.q/include/linux/mutex.h @@ -12,11 +12,66 @@ #include #include +#include #include #include #include +#ifdef CONFIG_PREEMPT_RT + +#include + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void +_mutex_init(struct mutex *lock, char *name, struct lock_class_key *key); + +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + _mutex_init((mutex), #mutex, &__key); \ +} while (0) + +#else /* * Simple, straightforward mutexes with strict semantics: * @@ -140,3 +195,5 @@ extern 
int fastcall mutex_trylock(struct extern void fastcall mutex_unlock(struct mutex *lock); #endif + +#endif Index: linux-rt.q/include/linux/plist.h =================================================================== --- linux-rt.q.orig/include/linux/plist.h +++ linux-rt.q/include/linux/plist.h @@ -81,7 +81,7 @@ struct plist_head { struct list_head prio_list; struct list_head node_list; #ifdef CONFIG_DEBUG_PI_LIST - spinlock_t *lock; + raw_spinlock_t *lock; #endif }; @@ -125,7 +125,7 @@ struct plist_node { * @lock: list spinlock, remembered for debugging */ static inline void -plist_head_init(struct plist_head *head, spinlock_t *lock) +plist_head_init(struct plist_head *head, raw_spinlock_t *lock) { INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); Index: linux-rt.q/include/linux/preempt.h =================================================================== --- linux-rt.q.orig/include/linux/preempt.h +++ linux-rt.q/include/linux/preempt.h @@ -8,6 +8,7 @@ #include #include +#include #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) extern void notrace add_preempt_count(unsigned int val); @@ -34,11 +35,12 @@ #define inc_preempt_count() add_preempt_count(1) #define dec_preempt_count() sub_preempt_count(1) -#define preempt_count() (current_thread_info()->preempt_count) +#define preempt_count() (current_thread_info()->preempt_count) #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); +asmlinkage void preempt_schedule_irq(void); #define preempt_disable() \ do { \ @@ -46,21 +48,34 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define __preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) + +#ifdef CONFIG_DEBUG_PREEMPT +extern void notrace preempt_enable_no_resched(void); +#else +# define preempt_enable_no_resched() __preempt_enable_no_resched() +#endif + #define preempt_check_resched() \ do { \ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ preempt_schedule(); \ } 
while (0) +#define preempt_check_resched_delayed() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED))) \ + preempt_schedule(); \ +} while (0) + #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ + __preempt_enable_no_resched(); \ barrier(); \ preempt_check_resched(); \ } while (0) @@ -69,8 +84,12 @@ do { \ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) +#define __preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) #define preempt_check_resched() do { } while (0) +#define preempt_check_resched_delayed() do { } while (0) + +#define preempt_schedule_irq() do { } while (0) #endif Index: linux-rt.q/include/linux/rt_lock.h =================================================================== --- /dev/null +++ linux-rt.q/include/linux/rt_lock.h @@ -0,0 +1,339 @@ +#ifndef __LINUX_RT_LOCK_H +#define __LINUX_RT_LOCK_H + +/* + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar + * + * This file contains the main data structure definitions. 
+ */ +#include +#include +#include + +#ifdef CONFIG_PREEMPT_RT +/* + * spinlocks - an RT mutex plus lock-break field: + */ +typedef struct { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __SPIN_LOCK_UNLOCKED(name) \ + (spinlock_t) { { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name) \ + , .save_state = 1, .file = __FILE__, .line = __LINE__ }, SPIN_DEP_MAP_INIT(name) } +#else +# define __SPIN_LOCK_UNLOCKED(name) \ + (spinlock_t) { { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name) }, SPIN_DEP_MAP_INIT(name) } +#endif +# define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style) +#else /* !PREEMPT_RT */ + typedef raw_spinlock_t spinlock_t; +# ifdef CONFIG_DEBUG_SPINLOCK +# define _SPIN_LOCK_UNLOCKED \ + { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ + .magic = SPINLOCK_MAGIC, \ + .owner = SPINLOCK_OWNER_INIT, \ + .owner_cpu = -1 } +# else +# define _SPIN_LOCK_UNLOCKED \ + { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED } +# endif +# define SPIN_LOCK_UNLOCKED _SPIN_LOCK_UNLOCKED +# define __SPIN_LOCK_UNLOCKED(name) _SPIN_LOCK_UNLOCKED +#endif + +#define __DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) + +#ifdef CONFIG_PREEMPT_RT + +/* + * RW-semaphores are a spinlock plus a reader-depth count. + * + * Note that the semantics are different from the usual + * Linux rw-sems, in PREEMPT_RT mode we do not allow + * multiple readers to hold the lock at once, we only allow + * a read-lock owner to read-lock recursively. 
This is + * better for latency, makes the implementation inherently + * fair and makes it simpler as well: + */ +struct rw_semaphore { + struct rt_mutex lock; + int read_depth; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +/* + * rwlocks - an RW semaphore plus lock-break field: + */ +typedef struct { + struct rt_mutex lock; + int read_depth; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +# ifdef CONFIG_DEBUG_RT_MUTEXES +# define __RW_LOCK_UNLOCKED(name) (rwlock_t) \ + { .lock = { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name), \ + .save_state = 1, .file = __FILE__, .line = __LINE__ } } +# else +# define __RW_LOCK_UNLOCKED(name) (rwlock_t) \ + { .lock = { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name) } } +# endif +#else /* !PREEMPT_RT */ + + typedef raw_rwlock_t rwlock_t; +# ifdef CONFIG_DEBUG_SPINLOCK +# define _RW_LOCK_UNLOCKED \ + (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ + .magic = RWLOCK_MAGIC, \ + .owner = SPINLOCK_OWNER_INIT, \ + .owner_cpu = -1 } +# else +# define _RW_LOCK_UNLOCKED \ + (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED } +# endif +# define __RW_LOCK_UNLOCKED(name) _RW_LOCK_UNLOCKED +#endif + +#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(rw_old_style) + +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) + +#ifdef CONFIG_PREEMPT_RT + +/* + * Semaphores - a spinlock plus the semaphore count: + */ +struct semaphore { + atomic_t count; + struct rt_mutex lock; +}; + +#define DECLARE_MUTEX(name) \ +struct semaphore name = \ + { .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) } + +/* + * DECLARE_MUTEX_LOCKED() is deprecated: very hard to initialize properly + * and it also often signals abuse of semaphores. 
So we redirect it to + * compat semaphores: + */ +#define DECLARE_MUTEX_LOCKED COMPAT_DECLARE_MUTEX_LOCKED + +extern void fastcall __sema_init(struct semaphore *sem, int val, char *name, char *file, int line); + +#define rt_sema_init(sem, val) \ + __sema_init(sem, val, #sem, __FILE__, __LINE__) + +extern void fastcall __init_MUTEX(struct semaphore *sem, char *name, char *file, int line); +#define rt_init_MUTEX(sem) \ + __init_MUTEX(sem, #sem, __FILE__, __LINE__) + +extern void there_is_no_init_MUTEX_LOCKED_for_RT_semaphores(void); + +/* + * No locked initialization for RT semaphores + */ +#define rt_init_MUTEX_LOCKED(sem) \ + there_is_no_init_MUTEX_LOCKED_for_RT_semaphores() +extern void fastcall rt_down(struct semaphore *sem); +extern int fastcall rt_down_interruptible(struct semaphore *sem); +extern int fastcall rt_down_trylock(struct semaphore *sem); +extern void fastcall rt_up(struct semaphore *sem); + +#define rt_sem_is_locked(s) rt_mutex_is_locked(&(s)->lock) +#define rt_sema_count(s) atomic_read(&(s)->count) + +extern int __bad_func_type(void); + +#undef TYPE_EQUAL +#define TYPE_EQUAL(var, type) \ + __builtin_types_compatible_p(typeof(var), type *) + +#define PICK_FUNC_1ARG(type1, type2, func1, func2, arg) \ +do { \ + if (TYPE_EQUAL((arg), type1)) \ + func1((type1 *)(arg)); \ + else if (TYPE_EQUAL((arg), type2)) \ + func2((type2 *)(arg)); \ + else __bad_func_type(); \ +} while (0) + +#define PICK_FUNC_1ARG_RET(type1, type2, func1, func2, arg) \ +({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((arg), type1)) \ + __ret = func1((type1 *)(arg)); \ + else if (TYPE_EQUAL((arg), type2)) \ + __ret = func2((type2 *)(arg)); \ + else __ret = __bad_func_type(); \ + \ + __ret; \ +}) + +#define PICK_FUNC_2ARG(type1, type2, func1, func2, arg0, arg1) \ +do { \ + if (TYPE_EQUAL((arg0), type1)) \ + func1((type1 *)(arg0), arg1); \ + else if (TYPE_EQUAL((arg0), type2)) \ + func2((type2 *)(arg0), arg1); \ + else __bad_func_type(); \ +} while (0) + +#define sema_init(sem, val) 
\ + PICK_FUNC_2ARG(struct compat_semaphore, struct semaphore, \ + compat_sema_init, rt_sema_init, sem, val) + +#define init_MUTEX(sem) \ + PICK_FUNC_1ARG(struct compat_semaphore, struct semaphore, \ + compat_init_MUTEX, rt_init_MUTEX, sem) + +#define init_MUTEX_LOCKED(sem) \ + PICK_FUNC_1ARG(struct compat_semaphore, struct semaphore, \ + compat_init_MUTEX_LOCKED, rt_init_MUTEX_LOCKED, sem) + +#define down(sem) \ + PICK_FUNC_1ARG(struct compat_semaphore, struct semaphore, \ + compat_down, rt_down, sem) + +#define down_interruptible(sem) \ + PICK_FUNC_1ARG_RET(struct compat_semaphore, struct semaphore, \ + compat_down_interruptible, rt_down_interruptible, sem) + +#define down_trylock(sem) \ + PICK_FUNC_1ARG_RET(struct compat_semaphore, struct semaphore, \ + compat_down_trylock, rt_down_trylock, sem) + +#define up(sem) \ + PICK_FUNC_1ARG(struct compat_semaphore, struct semaphore, \ + compat_up, rt_up, sem) + +#define sem_is_locked(sem) \ + PICK_FUNC_1ARG_RET(struct compat_semaphore, struct semaphore, \ + compat_sem_is_locked, rt_sem_is_locked, sem) + +#define sema_count(sem) \ + PICK_FUNC_1ARG_RET(struct compat_semaphore, struct semaphore, \ + compat_sema_count, rt_sema_count, sem) + +/* + * rwsems: + */ + +#define __RWSEM_INITIALIZER(name) \ + { .lock = __RT_MUTEX_INITIALIZER(name.lock) } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void fastcall __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key); + +# define rt_init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwsem_init((sem), #sem, &__key); \ +} while (0) + +extern void fastcall rt_down_write(struct rw_semaphore *rwsem); +extern void fastcall +rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); +extern void fastcall +rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); +extern void fastcall rt_down_read(struct rw_semaphore *rwsem); +#ifdef CONFIG_DEBUG_LOCK_ALLOC 
+extern void fastcall rt_down_read_non_owner(struct rw_semaphore *rwsem); +#else +# define rt_down_read_non_owner(rwsem) rt_down_read(rwsem) +#endif +extern int fastcall rt_down_write_trylock(struct rw_semaphore *rwsem); +extern int fastcall rt_down_read_trylock(struct rw_semaphore *rwsem); +extern void fastcall rt_up_read(struct rw_semaphore *rwsem); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void fastcall rt_up_read_non_owner(struct rw_semaphore *rwsem); +#else +# define rt_up_read_non_owner(rwsem) rt_up_read(rwsem) +#endif +extern void fastcall rt_up_write(struct rw_semaphore *rwsem); +extern void fastcall rt_downgrade_write(struct rw_semaphore *rwsem); + +# define rt_rwsem_is_locked(rws) (rt_mutex_is_locked(&(rws)->lock)) + +#define init_rwsem(rwsem) \ + PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_init_rwsem, rt_init_rwsem, rwsem) + +#define down_read(rwsem) \ + PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_down_read, rt_down_read, rwsem) + +#define down_read_non_owner(rwsem) \ + PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_down_read_non_owner, rt_down_read_non_owner, rwsem) + +#define down_read_trylock(rwsem) \ + PICK_FUNC_1ARG_RET(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_down_read_trylock, rt_down_read_trylock, rwsem) + +#define down_write(rwsem) \ + PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_down_write, rt_down_write, rwsem) + +#define down_read_nested(rwsem, subclass) \ + PICK_FUNC_2ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_down_read_nested, rt_down_read_nested, rwsem, subclass) + + +#define down_write_nested(rwsem, subclass) \ + PICK_FUNC_2ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_down_write_nested, rt_down_write_nested, rwsem, subclass) + +#define down_write_trylock(rwsem) \ + PICK_FUNC_1ARG_RET(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_down_write_trylock, 
rt_down_write_trylock, rwsem) + +#define up_read(rwsem) \ + PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_up_read, rt_up_read, rwsem) + +#define up_read_non_owner(rwsem) \ + PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_up_read_non_owner, rt_up_read_non_owner, rwsem) + +#define up_write(rwsem) \ + PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_up_write, rt_up_write, rwsem) + +#define downgrade_write(rwsem) \ + PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_downgrade_write, rt_downgrade_write, rwsem) + +#define rwsem_is_locked(rwsem) \ + PICK_FUNC_1ARG_RET(struct compat_rw_semaphore, struct rw_semaphore, \ + compat_rwsem_is_locked, rt_rwsem_is_locked, rwsem) + +#endif /* CONFIG_PREEMPT_RT */ + +#endif + Index: linux-rt.q/include/linux/rtmutex.h =================================================================== --- linux-rt.q.orig/include/linux/rtmutex.h +++ linux-rt.q/include/linux/rtmutex.h @@ -24,7 +24,7 @@ * @owner: the mutex owner */ struct rt_mutex { - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct plist_head wait_list; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -63,7 +63,7 @@ struct hrtimer_sleeper; #endif #define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ + { .wait_lock = RAW_SPIN_LOCK_UNLOCKED(mutexname) \ , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} Index: linux-rt.q/include/linux/rwsem-spinlock.h =================================================================== --- linux-rt.q.orig/include/linux/rwsem-spinlock.h +++ linux-rt.q/include/linux/rwsem-spinlock.h @@ -28,7 +28,7 @@ struct rwsem_waiter; * - if activity is -1 then there is one active writer * - if wait_list is not empty, then there are processes waiting for the semaphore */ -struct rw_semaphore { +struct 
compat_rw_semaphore { __s32 activity; spinlock_t wait_lock; struct list_head wait_list; @@ -43,33 +43,32 @@ struct rw_semaphore { # define __RWSEM_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ -{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } +#define __COMPAT_RWSEM_INITIALIZER(name) \ +{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define compat_init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __compat_init_rwsem((sem), #sem, &__key); \ } while (0) -extern void FASTCALL(__down_read(struct rw_semaphore *sem)); -extern int FASTCALL(__down_read_trylock(struct rw_semaphore *sem)); -extern void FASTCALL(__down_write(struct rw_semaphore *sem)); -extern void FASTCALL(__down_write_nested(struct rw_semaphore *sem, int subclass)); -extern int FASTCALL(__down_write_trylock(struct rw_semaphore *sem)); -extern void FASTCALL(__up_read(struct rw_semaphore *sem)); -extern void FASTCALL(__up_write(struct rw_semaphore *sem)); -extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem)); +extern void FASTCALL(__down_read(struct compat_rw_semaphore *sem)); +extern int FASTCALL(__down_read_trylock(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__down_write(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__down_write_nested(struct compat_rw_semaphore *sem, int subclass)); +extern int FASTCALL(__down_write_trylock(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__up_read(struct 
compat_rw_semaphore *sem)); +extern void FASTCALL(__up_write(struct compat_rw_semaphore *sem)); +extern void FASTCALL(__downgrade_write(struct compat_rw_semaphore *sem)); -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->activity != 0); } Index: linux-rt.q/include/linux/rwsem.h =================================================================== --- linux-rt.q.orig/include/linux/rwsem.h +++ linux-rt.q/include/linux/rwsem.h @@ -9,6 +9,10 @@ #include +#ifdef CONFIG_PREEMPT_RT +# include +#endif + #ifdef __KERNEL__ #include @@ -16,48 +20,59 @@ #include #include -struct rw_semaphore; +#ifndef CONFIG_PREEMPT_RT +/* + * On !PREEMPT_RT all rw-semaphores are compat: + */ +#define compat_rw_semaphore rw_semaphore +#endif + +struct compat_rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK -#include /* use a generic implementation */ +# include /* use a generic implementation */ +# ifndef CONFIG_PREEMPT_RT +# define __RWSEM_INITIALIZER __COMPAT_RWSEM_INITIALIZER +# define DECLARE_RWSEM COMPAT_DECLARE_RWSEM +# endif #else -#include /* use an arch-specific implementation */ +# include /* use an arch-specific implementation */ #endif /* * lock for reading */ -extern void down_read(struct rw_semaphore *sem); +extern void compat_down_read(struct compat_rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -extern int down_read_trylock(struct rw_semaphore *sem); +extern int compat_down_read_trylock(struct compat_rw_semaphore *sem); /* * lock for writing */ -extern void down_write(struct rw_semaphore *sem); +extern void compat_down_write(struct compat_rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -extern int down_write_trylock(struct rw_semaphore *sem); +extern int compat_down_write_trylock(struct compat_rw_semaphore *sem); /* * release a read lock */ -extern void up_read(struct rw_semaphore *sem); 
+extern void compat_up_read(struct compat_rw_semaphore *sem); /* * release a write lock */ -extern void up_write(struct rw_semaphore *sem); +extern void compat_up_write(struct compat_rw_semaphore *sem); /* * downgrade write lock to read lock */ -extern void downgrade_write(struct rw_semaphore *sem); +extern void compat_downgrade_write(struct compat_rw_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -73,22 +88,79 @@ extern void downgrade_write(struct rw_se * lockdep_set_class() at lock initialization time. * See Documentation/lockdep-design.txt for more details.) */ -extern void down_read_nested(struct rw_semaphore *sem, int subclass); -extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void +compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass); +extern void +compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass); /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. 
] */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); +extern void +compat_down_read_non_owner(struct compat_rw_semaphore *sem); +extern void +compat_up_read_non_owner(struct compat_rw_semaphore *sem); #else -# define down_read_nested(sem, subclass) down_read(sem) -# define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) +# define compat_down_read_nested(sem, subclass) compat_down_read(sem) +# define compat_down_write_nested(sem, subclass) compat_down_write(sem) +# define compat_down_read_non_owner(sem) compat_down_read(sem) +# define compat_up_read_non_owner(sem) compat_up_read(sem) #endif +#ifndef CONFIG_PREEMPT_RT + +#define DECLARE_RWSEM COMPAT_DECLARE_RWSEM + +/* + * NOTE, lockdep: this has to be a macro, so that separate class-keys + * get generated by the compiler, if the same function does multiple + * init_rwsem() calls to different rwsems. 
+ */ +#define init_rwsem(rwsem) compat_init_rwsem(rwsem) + +static inline void down_read(struct compat_rw_semaphore *rwsem) +{ + compat_down_read(rwsem); +} +static inline int down_read_trylock(struct compat_rw_semaphore *rwsem) +{ + return compat_down_read_trylock(rwsem); +} +static inline void down_write(struct compat_rw_semaphore *rwsem) +{ + compat_down_write(rwsem); +} +static inline int down_write_trylock(struct compat_rw_semaphore *rwsem) +{ + return compat_down_write_trylock(rwsem); +} +static inline void up_read(struct compat_rw_semaphore *rwsem) +{ + compat_up_read(rwsem); +} +static inline void up_write(struct compat_rw_semaphore *rwsem) +{ + compat_up_write(rwsem); +} +static inline void downgrade_write(struct compat_rw_semaphore *rwsem) +{ + compat_downgrade_write(rwsem); +} +static inline int rwsem_is_locked(struct compat_rw_semaphore *sem) +{ + return compat_rwsem_is_locked(sem); +} +# define down_read_nested(sem, subclass) \ + compat_down_read_nested(sem, subclass) +# define down_write_nested(sem, subclass) \ + compat_down_write_nested(sem, subclass) +# define down_read_non_owner(sem) \ + compat_down_read_non_owner(sem) +# define up_read_non_owner(sem) \ + compat_up_read_non_owner(sem) +#endif /* !CONFIG_PREEMPT_RT */ + #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_H */ Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -176,16 +176,17 @@ print_cfs_rq(struct seq_file *m, int cpu * mistake. 
*/ #define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define TASK_STOPPED 4 -#define TASK_TRACED 8 +#define TASK_RUNNING_MUTEX 1 +#define TASK_INTERRUPTIBLE 2 +#define TASK_UNINTERRUPTIBLE 4 +#define TASK_STOPPED 8 +#define TASK_TRACED 16 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 64 /* in tsk->state again */ -#define TASK_NONINTERACTIVE 64 -#define TASK_DEAD 128 +#define TASK_NONINTERACTIVE 128 +#define TASK_DEAD 256 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -293,6 +294,10 @@ static inline void touch_all_softlockup_ } #endif +#ifdef CONFIG_PREEMPT_BKL +extern struct semaphore kernel_sem; +#endif + #if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE) extern void print_traces(struct task_struct *task); #else @@ -1169,7 +1174,7 @@ struct task_struct { spinlock_t alloc_lock; /* Protection of the PI data structures: */ - spinlock_t pi_lock; + raw_spinlock_t pi_lock; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ @@ -1205,13 +1210,32 @@ struct task_struct { unsigned int lockdep_recursion; #endif -#define MAX_PREEMPT_TRACE 16 +#define MAX_PREEMPT_TRACE 25 #ifdef CONFIG_PREEMPT_TRACE unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE]; unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE]; #endif +#define MAX_LOCK_STACK MAX_PREEMPT_TRACE +#ifdef CONFIG_DEBUG_PREEMPT + int lock_count; +# ifdef CONFIG_PREEMPT_RT + struct rt_mutex *owned_lock[MAX_LOCK_STACK]; +# endif +#endif +#ifdef CONFIG_DETECT_SOFTLOCKUP + unsigned long softlockup_count; /* Count to keep track how long the + * thread is in the kernel without + * sleeping. 
+ */ +#endif + /* realtime bits */ + +#ifdef CONFIG_DEBUG_RT_MUTEXES + void *last_kernel_lock; +#endif + /* journalling filesystem info */ void *journal_info; @@ -1392,6 +1416,7 @@ static inline void put_task_struct(struc #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ +#define PF_NOSCHED 0x00000010 /* Userspace does not expect scheduling */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ @@ -1505,6 +1530,7 @@ extern struct task_struct *curr_task(int extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); +void __yield(void); /* * The default (Linux) execution domain. @@ -1551,6 +1577,9 @@ extern void do_timer(unsigned long ticks extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_mutex(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_sync(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_mutex_sync(struct task_struct * tsk)); extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, unsigned long clone_flags)); #ifdef CONFIG_SMP @@ -1821,11 +1850,32 @@ static inline int signal_pending(struct return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } -static inline int need_resched(void) +static inline int _need_resched(void) { return unlikely(test_thread_flag(TIF_NEED_RESCHED)); } +static inline int need_resched(void) +{ + touch_critical_timing(); + return _need_resched(); +} + +static inline void set_tsk_need_resched_delayed(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED); +} + +static inline void clear_tsk_need_resched_delayed(struct task_struct *tsk) +{ + 
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED); +} + +static inline int need_resched_delayed(void) +{ + return unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED)); +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @@ -1834,7 +1884,22 @@ static inline int need_resched(void) * cond_resched_softirq() will enable bhs before scheduling. */ extern int cond_resched(void); -extern int cond_resched_lock(spinlock_t * lock); +extern int __cond_resched_raw_spinlock(raw_spinlock_t *lock); +extern int __cond_resched_spinlock(spinlock_t *spinlock); + +#define cond_resched_lock(lock) \ +({ \ + int __ret; \ + \ + if (TYPE_EQUAL((lock), raw_spinlock_t)) \ + __ret = __cond_resched_raw_spinlock((raw_spinlock_t *)lock);\ + else if (TYPE_EQUAL(lock, spinlock_t)) \ + __ret = __cond_resched_spinlock((spinlock_t *)lock); \ + else __ret = __bad_spinlock_type(); \ + \ + __ret; \ +}) + extern int cond_resched_softirq(void); extern int cond_resched_softirq_context(void); extern int cond_resched_hardirq_context(void); @@ -1843,12 +1908,18 @@ extern int cond_resched_hardirq_context( * Does a critical section need to be broken due to another * task waiting?: */ -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) -# define need_lockbreak(lock) ((lock)->break_lock) +#if (defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)) || defined(CONFIG_PREEMPT_RT) +# define need_lockbreak(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; }) #else # define need_lockbreak(lock) 0 #endif +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +# define need_lockbreak_raw(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; }) +#else +# define need_lockbreak_raw(lock) 0 +#endif + /* * Does a critical section need to be broken due to another * task waiting or preemption being signalled: Index: linux-rt.q/include/linux/semaphore.h 
=================================================================== --- /dev/null +++ linux-rt.q/include/linux/semaphore.h @@ -0,0 +1,50 @@ +#ifndef _LINUX_SEMAPHORE_H +#define _LINUX_SEMAPHORE_H + +#ifdef CONFIG_PREEMPT_RT +# include +#else + +#define DECLARE_MUTEX COMPAT_DECLARE_MUTEX +#define DECLARE_MUTEX_LOCKED COMPAT_DECLARE_MUTEX_LOCKED + +static inline void sema_init(struct compat_semaphore *sem, int val) +{ + compat_sema_init(sem, val); +} +static inline void init_MUTEX(struct compat_semaphore *sem) +{ + compat_init_MUTEX(sem); +} +static inline void init_MUTEX_LOCKED(struct compat_semaphore *sem) +{ + compat_init_MUTEX_LOCKED(sem); +} +static inline void down(struct compat_semaphore *sem) +{ + compat_down(sem); +} +static inline int down_interruptible(struct compat_semaphore *sem) +{ + return compat_down_interruptible(sem); +} +static inline int down_trylock(struct compat_semaphore *sem) +{ + return compat_down_trylock(sem); +} +static inline void up(struct compat_semaphore *sem) +{ + compat_up(sem); +} +static inline int sem_is_locked(struct compat_semaphore *sem) +{ + return compat_sem_is_locked(sem); +} +static inline int sema_count(struct compat_semaphore *sem) +{ + return compat_sema_count(sem); +} + +#endif /* CONFIG_PREEMPT_RT */ + +#endif /* _LINUX_SEMAPHORE_H */ Index: linux-rt.q/include/linux/seqlock.h =================================================================== --- linux-rt.q.orig/include/linux/seqlock.h +++ linux-rt.q/include/linux/seqlock.h @@ -32,46 +32,72 @@ typedef struct { unsigned sequence; spinlock_t lock; -} seqlock_t; +} __seqlock_t; + +typedef struct { + unsigned sequence; + raw_spinlock_t lock; +} __raw_seqlock_t; + +#define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock) + +#ifdef CONFIG_PREEMPT_RT +typedef __seqlock_t seqlock_t; +#else +typedef __raw_seqlock_t seqlock_t; +#endif + +typedef __raw_seqlock_t raw_seqlock_t; /* * These macros triggered gcc-3.x compile-time problems. We think these are * OK now. 
Be cautious. */ -#define __SEQLOCK_UNLOCKED(lockname) \ - { 0, __SPIN_LOCK_UNLOCKED(lockname) } +#define __RAW_SEQLOCK_UNLOCKED(lockname) \ + { 0, RAW_SPIN_LOCK_UNLOCKED(lockname) } + +#ifdef CONFIG_PREEMPT_RT +# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) } +#else +# define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname) +#endif #define SEQLOCK_UNLOCKED \ __SEQLOCK_UNLOCKED(old_style_seqlock_init) -#define seqlock_init(x) \ - do { \ - (x)->sequence = 0; \ - spin_lock_init(&(x)->lock); \ - } while (0) +#define raw_seqlock_init(x) \ + do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0) + +#define seqlock_init(x) \ + do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0) #define DEFINE_SEQLOCK(x) \ seqlock_t x = __SEQLOCK_UNLOCKED(x) +#define DEFINE_RAW_SEQLOCK(name) \ + raw_seqlock_t name __cacheline_aligned_in_smp = \ + __RAW_SEQLOCK_UNLOCKED(name) + + /* Lock out other writers and update the count. * Acts like a normal spin_lock/unlock. * Don't need preempt_disable() because that is in the spin_lock already. 
*/ -static inline void write_seqlock(seqlock_t *sl) +static inline void __write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); ++sl->sequence; smp_wmb(); } -static inline void write_sequnlock(seqlock_t *sl) +static inline void __write_sequnlock(seqlock_t *sl) { smp_wmb(); sl->sequence++; spin_unlock(&sl->lock); } -static inline int write_tryseqlock(seqlock_t *sl) +static inline int __write_tryseqlock(seqlock_t *sl) { int ret = spin_trylock(&sl->lock); @@ -83,7 +109,7 @@ static inline int write_tryseqlock(seqlo } /* Start of read calculation -- fetch last complete writer token */ -static __always_inline unsigned read_seqbegin(const seqlock_t *sl) +static __always_inline unsigned __read_seqbegin(const seqlock_t *sl) { unsigned ret = sl->sequence; smp_rmb(); @@ -98,12 +124,118 @@ static __always_inline unsigned read_seq * * Using xor saves one conditional branch. */ -static __always_inline int read_seqretry(const seqlock_t *sl, unsigned iv) +static inline int __read_seqretry(seqlock_t *sl, unsigned iv) +{ + int ret; + + smp_rmb(); + ret = (iv & 1) | (sl->sequence ^ iv); + /* + * If invalid then serialize with the writer, to make sure we + * are not livelocking it: + */ + if (unlikely(ret)) { + unsigned long flags; + spin_lock_irqsave(&sl->lock, flags); + spin_unlock_irqrestore(&sl->lock, flags); + } + return ret; +} + +static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl) +{ + spin_lock(&sl->lock); + ++sl->sequence; + smp_wmb(); +} + +static __always_inline void __write_sequnlock_raw(raw_seqlock_t *sl) +{ + smp_wmb(); + sl->sequence++; + spin_unlock(&sl->lock); +} + +static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl) +{ + int ret = spin_trylock(&sl->lock); + + if (ret) { + ++sl->sequence; + smp_wmb(); + } + return ret; +} + +static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl) +{ + unsigned ret = sl->sequence; + smp_rmb(); + return ret; +} + +static __always_inline int __read_seqretry_raw(const raw_seqlock_t *sl, 
unsigned iv) { smp_rmb(); return (iv & 1) | (sl->sequence ^ iv); } +extern int __bad_seqlock_type(void); + +#define PICK_SEQOP(op, lock) \ +do { \ + if (TYPE_EQUAL((lock), raw_seqlock_t)) \ + op##_raw((raw_seqlock_t *)(lock)); \ + else if (TYPE_EQUAL((lock), seqlock_t)) \ + op((seqlock_t *)(lock)); \ + else __bad_seqlock_type(); \ +} while (0) + +#define PICK_SEQOP_RET(op, lock) \ +({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((lock), raw_seqlock_t)) \ + __ret = op##_raw((raw_seqlock_t *)(lock)); \ + else if (TYPE_EQUAL((lock), seqlock_t)) \ + __ret = op((seqlock_t *)(lock)); \ + else __ret = __bad_seqlock_type(); \ + \ + __ret; \ +}) + +#define PICK_SEQOP_CONST_RET(op, lock) \ +({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((lock), raw_seqlock_t)) \ + __ret = op##_raw((const raw_seqlock_t *)(lock));\ + else if (TYPE_EQUAL((lock), seqlock_t)) \ + __ret = op((seqlock_t *)(lock)); \ + else __ret = __bad_seqlock_type(); \ + \ + __ret; \ +}) + +#define PICK_SEQOP2_CONST_RET(op, lock, arg) \ + ({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((lock), raw_seqlock_t)) \ + __ret = op##_raw((const raw_seqlock_t *)(lock), (arg)); \ + else if (TYPE_EQUAL((lock), seqlock_t)) \ + __ret = op((seqlock_t *)(lock), (arg)); \ + else __ret = __bad_seqlock_type(); \ + \ + __ret; \ +}) + + +#define write_seqlock(sl) PICK_SEQOP(__write_seqlock, sl) +#define write_sequnlock(sl) PICK_SEQOP(__write_sequnlock, sl) +#define write_tryseqlock(sl) PICK_SEQOP_RET(__write_tryseqlock, sl) +#define read_seqbegin(sl) PICK_SEQOP_CONST_RET(__read_seqbegin, sl) +#define read_seqretry(sl, iv) PICK_SEQOP2_CONST_RET(__read_seqretry, sl, iv) /* * Version using sequence counter only. 
@@ -155,30 +287,51 @@ static inline void write_seqcount_end(se s->sequence++; } +#define PICK_IRQOP(op, lock) \ +do { \ + if (TYPE_EQUAL((lock), raw_seqlock_t)) \ + op(); \ + else if (TYPE_EQUAL((lock), seqlock_t)) \ + { /* nothing */ } \ + else __bad_seqlock_type(); \ +} while (0) + +#define PICK_IRQOP2(op, arg, lock) \ +do { \ + if (TYPE_EQUAL((lock), raw_seqlock_t)) \ + op(arg); \ + else if (TYPE_EQUAL(lock, seqlock_t)) \ + { /* nothing */ } \ + else __bad_seqlock_type(); \ +} while (0) + + + /* * Possible sw/hw IRQ protected versions of the interfaces. */ #define write_seqlock_irqsave(lock, flags) \ - do { local_irq_save(flags); write_seqlock(lock); } while (0) + do { PICK_IRQOP2(local_irq_save, flags, lock); write_seqlock(lock); } while (0) #define write_seqlock_irq(lock) \ - do { local_irq_disable(); write_seqlock(lock); } while (0) + do { PICK_IRQOP(local_irq_disable, lock); write_seqlock(lock); } while (0) #define write_seqlock_bh(lock) \ - do { local_bh_disable(); write_seqlock(lock); } while (0) + do { PICK_IRQOP(local_bh_disable, lock); write_seqlock(lock); } while (0) #define write_sequnlock_irqrestore(lock, flags) \ - do { write_sequnlock(lock); local_irq_restore(flags); } while(0) + do { write_sequnlock(lock); PICK_IRQOP2(local_irq_restore, flags, lock); preempt_check_resched(); } while(0) #define write_sequnlock_irq(lock) \ - do { write_sequnlock(lock); local_irq_enable(); } while(0) + do { write_sequnlock(lock); PICK_IRQOP(local_irq_enable, lock); preempt_check_resched(); } while(0) #define write_sequnlock_bh(lock) \ - do { write_sequnlock(lock); local_bh_enable(); } while(0) + do { write_sequnlock(lock); PICK_IRQOP(local_bh_enable, lock); } while(0) #define read_seqbegin_irqsave(lock, flags) \ - ({ local_irq_save(flags); read_seqbegin(lock); }) + ({ PICK_IRQOP2(local_irq_save, flags, lock); read_seqbegin(lock); }) #define read_seqretry_irqrestore(lock, iv, flags) \ ({ \ int ret = read_seqretry(lock, iv); \ - local_irq_restore(flags); \ + 
PICK_IRQOP2(local_irq_restore, flags, lock); \ + preempt_check_resched(); \ ret; \ }) Index: linux-rt.q/include/linux/smp.h =================================================================== --- linux-rt.q.orig/include/linux/smp.h +++ linux-rt.q/include/linux/smp.h @@ -131,7 +131,7 @@ static inline int smp_call_function_sing #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#define put_cpu_no_resched() preempt_enable_no_resched() +#define put_cpu_no_resched() __preempt_enable_no_resched() void smp_setup_processor_id(void); Index: linux-rt.q/include/linux/spinlock.h =================================================================== --- linux-rt.q.orig/include/linux/spinlock.h +++ linux-rt.q/include/linux/spinlock.h @@ -44,6 +44,42 @@ * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. + * + * + * Public types and naming conventions: + * ------------------------------------ + * spinlock_t: type: sleep-lock + * raw_spinlock_t: type: spin-lock (debug) + * + * spin_lock([raw_]spinlock_t): API: acquire lock, both types + * + * + * Internal types and naming conventions: + * ------------------------------------- + * __raw_spinlock_t: type: lowlevel spin-lock + * + * _spin_lock(struct rt_mutex): API: acquire sleep-lock + * __spin_lock(raw_spinlock_t): API: acquire spin-lock (highlevel) + * _raw_spin_lock(raw_spinlock_t): API: acquire spin-lock (debug) + * __raw_spin_lock(__raw_spinlock_t): API: acquire spin-lock (lowlevel) + * + * + * spin_lock(raw_spinlock_t) translates into the following chain of + * calls/inlines/macros, if spin-lock debugging is enabled: + * + * spin_lock() [include/linux/spinlock.h] + * -> __spin_lock() [kernel/spinlock.c] + * -> _raw_spin_lock() [lib/spinlock_debug.c] + * -> __raw_spin_lock() [include/asm/spinlock.h] + * + * spin_lock(spinlock_t) translates into the following chain of + * calls/inlines/macros: + * + * spin_lock() [include/linux/spinlock.h] + * -> 
_spin_lock() [include/linux/spinlock.h] + * -> rt_spin_lock() [kernel/rtmutex.c] + * -> rt_spin_lock_fastlock() [kernel/rtmutex.c] + * -> rt_spin_lock_slowlock() [kernel/rtmutex.c] */ #include @@ -51,29 +87,14 @@ #include #include #include +#include #include #include +#include #include /* - * Must define these before including other files, inline functions need them - */ -#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME - -#define LOCK_SECTION_START(extra) \ - ".subsection 1\n\t" \ - extra \ - ".ifndef " LOCK_SECTION_NAME "\n\t" \ - LOCK_SECTION_NAME ":\n\t" \ - ".endif\n" - -#define LOCK_SECTION_END \ - ".previous\n\t" - -#define __lockfunc fastcall __attribute__((section(".spinlock.text"))) - -/* * Pull the raw_spinlock_t and raw_rwlock_t definitions: */ #include @@ -89,42 +110,10 @@ extern int __lockfunc generic__raw_read_ # include #endif -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key); -# define spin_lock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __spin_lock_init((lock), #lock, &__key); \ -} while (0) - -#else -# define spin_lock_init(lock) \ - do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) -#endif - -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key); -# define rwlock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __rwlock_init((lock), #lock, &__key); \ -} while (0) -#else -# define rwlock_init(lock) \ - do { *(lock) = RW_LOCK_UNLOCKED; } while (0) -#endif - -#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) - -/** - * spin_unlock_wait - wait until the spinlock gets unlocked - * @lock: the spinlock in question. 
+/* + * Pull the RT types: */ -#define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) +#include /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: @@ -136,16 +125,16 @@ do { \ #endif #ifdef CONFIG_DEBUG_SPINLOCK - extern void _raw_spin_lock(spinlock_t *lock); + extern __lockfunc void _raw_spin_lock(raw_spinlock_t *lock); #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) - extern int _raw_spin_trylock(spinlock_t *lock); - extern void _raw_spin_unlock(spinlock_t *lock); - extern void _raw_read_lock(rwlock_t *lock); - extern int _raw_read_trylock(rwlock_t *lock); - extern void _raw_read_unlock(rwlock_t *lock); - extern void _raw_write_lock(rwlock_t *lock); - extern int _raw_write_trylock(rwlock_t *lock); - extern void _raw_write_unlock(rwlock_t *lock); + extern __lockfunc int _raw_spin_trylock(raw_spinlock_t *lock); + extern __lockfunc void _raw_spin_unlock(raw_spinlock_t *lock); + extern __lockfunc void _raw_read_lock(raw_rwlock_t *lock); + extern __lockfunc int _raw_read_trylock(raw_rwlock_t *lock); + extern __lockfunc void _raw_read_unlock(raw_rwlock_t *lock); + extern __lockfunc void _raw_write_lock(raw_rwlock_t *lock); + extern __lockfunc int _raw_write_trylock(raw_rwlock_t *lock); + extern __lockfunc void _raw_write_unlock(raw_rwlock_t *lock); #else # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) # define _raw_spin_lock_flags(lock, flags) \ @@ -160,141 +149,575 @@ do { \ # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif -#define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) -#define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) +extern int __bad_spinlock_type(void); +extern int __bad_rwlock_type(void); + +extern void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); 
+extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); +extern int __lockfunc +rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int _atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); /* + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly. + */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +#ifdef CONFIG_PREEMPT_RT +# define _spin_lock(l) rt_spin_lock(l) +# define _spin_lock_nested(l, s) rt_spin_lock_nested(l, s) +# define _spin_lock_bh(l) rt_spin_lock(l) +# define _spin_lock_irq(l) rt_spin_lock(l) +# define _spin_unlock(l) rt_spin_unlock(l) +# define _spin_unlock_no_resched(l) rt_spin_unlock(l) +# define _spin_unlock_bh(l) rt_spin_unlock(l) +# define _spin_unlock_irq(l) rt_spin_unlock(l) +# define _spin_unlock_irqrestore(l, f) rt_spin_unlock(l) +static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + rt_spin_lock(lock); + return 0; +} +static inline unsigned long __lockfunc +_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_nested(lock, subclass); + return 0; +} +#else +static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + return 0; +} +static inline unsigned long __lockfunc +_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + return 0; +} +# define _spin_lock(l) do { } while (0) +# define _spin_lock_nested(l, s) do { } while (0) +# define _spin_lock_bh(l) do { } while (0) +# define _spin_lock_irq(l) do { } while (0) +# define _spin_unlock(l) do { } while (0) +# define _spin_unlock_no_resched(l) do { } while (0) +# define _spin_unlock_bh(l) do { } while (0) +# define _spin_unlock_irq(l) do { } while (0) +# define _spin_unlock_irqrestore(l, f) do { } while (0) +#endif + +#define 
_spin_lock_init(sl, n, f, l) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_spin_lock_init(sl, n, &__key); \ +} while (0) + +# ifdef CONFIG_PREEMPT_RT +# define _spin_can_lock(l) (!rt_mutex_is_locked(&(l)->lock)) +# define _spin_is_locked(l) rt_mutex_is_locked(&(l)->lock) +# define _spin_unlock_wait(l) rt_spin_unlock_wait(l) + +# define _spin_trylock(l) rt_spin_trylock(l) +# define _spin_trylock_bh(l) rt_spin_trylock(l) +# define _spin_trylock_irq(l) rt_spin_trylock(l) +# define _spin_trylock_irqsave(l,f) rt_spin_trylock_irqsave(l, f) +# else + + extern int this_should_never_be_called_on_non_rt(spinlock_t *lock); +# define TSNBCONRT(l) this_should_never_be_called_on_non_rt(l) +# define _spin_can_lock(l) TSNBCONRT(l) +# define _spin_is_locked(l) TSNBCONRT(l) +# define _spin_unlock_wait(l) TSNBCONRT(l) + +# define _spin_trylock(l) TSNBCONRT(l) +# define _spin_trylock_bh(l) TSNBCONRT(l) +# define _spin_trylock_irq(l) TSNBCONRT(l) +# define _spin_trylock_irqsave(l,f) TSNBCONRT(l) +#endif + +#undef TYPE_EQUAL +#define TYPE_EQUAL(lock, type) \ + __builtin_types_compatible_p(typeof(lock), type *) + +#define PICK_OP(op, lock) \ +do { \ + if (TYPE_EQUAL((lock), raw_spinlock_t)) \ + __spin##op((raw_spinlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, spinlock_t)) \ + _spin##op((spinlock_t *)(lock)); \ + else __bad_spinlock_type(); \ +} while (0) + +#define PICK_OP_RET(op, lock...) 
\ +({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((lock), raw_spinlock_t)) \ + __ret = __spin##op((raw_spinlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, spinlock_t)) \ + __ret = _spin##op((spinlock_t *)(lock)); \ + else __ret = __bad_spinlock_type(); \ + \ + __ret; \ +}) + +#define PICK_OP2(op, lock, flags) \ +do { \ + if (TYPE_EQUAL((lock), raw_spinlock_t)) \ + __spin##op((raw_spinlock_t *)(lock), flags); \ + else if (TYPE_EQUAL(lock, spinlock_t)) \ + _spin##op((spinlock_t *)(lock), flags); \ + else __bad_spinlock_type(); \ +} while (0) + +#define PICK_OP2_RET(op, lock, flags) \ +({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((lock), raw_spinlock_t)) \ + __ret = __spin##op((raw_spinlock_t *)(lock), flags); \ + else if (TYPE_EQUAL(lock, spinlock_t)) \ + __ret = _spin##op((spinlock_t *)(lock), flags); \ + else __bad_spinlock_type(); \ + \ + __ret; \ +}) + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void +__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define _rwlock_init(rwl, n, f, l) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwlock_init(rwl, n, &__key); \ +} while (0) + +#ifdef CONFIG_PREEMPT_RT +# define rt_read_can_lock(rwl) (!rt_mutex_is_locked(&(rwl)->lock)) +# define rt_write_can_lock(rwl) (!rt_mutex_is_locked(&(rwl)->lock)) +#else + extern int rt_rwlock_can_lock_never_call_on_non_rt(rwlock_t *rwlock); +# define rt_read_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) +# define rt_write_can_lock(rwl) 
rt_rwlock_can_lock_never_call_on_non_rt(rwl) +#endif + +# define _read_can_lock(rwl) rt_read_can_lock(rwl) +# define _write_can_lock(rwl) rt_write_can_lock(rwl) + +# define _read_trylock(rwl) rt_read_trylock(rwl) +# define _write_trylock(rwl) rt_write_trylock(rwl) +#define _write_trylock_irqsave(rwl, flags) rt_write_trylock_irqsave(rwl, flags) + +# define _read_lock(rwl) rt_read_lock(rwl) +# define _write_lock(rwl) rt_write_lock(rwl) +# define _read_unlock(rwl) rt_read_unlock(rwl) +# define _write_unlock(rwl) rt_write_unlock(rwl) + +# define _read_lock_bh(rwl) rt_read_lock(rwl) +# define _write_lock_bh(rwl) rt_write_lock(rwl) +# define _read_unlock_bh(rwl) rt_read_unlock(rwl) +# define _write_unlock_bh(rwl) rt_write_unlock(rwl) + +# define _read_lock_irq(rwl) rt_read_lock(rwl) +# define _write_lock_irq(rwl) rt_write_lock(rwl) +# define _read_unlock_irq(rwl) rt_read_unlock(rwl) +# define _write_unlock_irq(rwl) rt_write_unlock(rwl) + +# define _read_lock_irqsave(rwl) rt_read_lock_irqsave(rwl) +# define _write_lock_irqsave(rwl) rt_write_lock_irqsave(rwl) + +# define _read_unlock_irqrestore(rwl, f) rt_read_unlock(rwl) +# define _write_unlock_irqrestore(rwl, f) rt_write_unlock(rwl) + +#define __PICK_RW_OP(optype, op, lock) \ +do { \ + if (TYPE_EQUAL((lock), raw_rwlock_t)) \ + __##optype##op((raw_rwlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, rwlock_t)) \ + ##op((rwlock_t *)(lock)); \ + else __bad_rwlock_type(); \ +} while (0) + +#define PICK_RW_OP(optype, op, lock) \ +do { \ + if (TYPE_EQUAL((lock), raw_rwlock_t)) \ + __##optype##op((raw_rwlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, rwlock_t)) \ + _##optype##op((rwlock_t *)(lock)); \ + else __bad_rwlock_type(); \ +} while (0) + +#define __PICK_RW_OP_RET(optype, op, lock...) 
\ +({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((lock), raw_rwlock_t)) \ + __ret = __##optype##op((raw_rwlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, rwlock_t)) \ + __ret = _##optype##op((rwlock_t *)(lock)); \ + else __ret = __bad_rwlock_type(); \ + \ + __ret; \ +}) + +#define PICK_RW_OP_RET(optype, op, lock...) \ +({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((lock), raw_rwlock_t)) \ + __ret = __##optype##op((raw_rwlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, rwlock_t)) \ + __ret = _##optype##op((rwlock_t *)(lock)); \ + else __ret = __bad_rwlock_type(); \ + \ + __ret; \ +}) + +#define PICK_RW_OP2(optype, op, lock, flags) \ +do { \ + if (TYPE_EQUAL((lock), raw_rwlock_t)) \ + __##optype##op((raw_rwlock_t *)(lock), flags); \ + else if (TYPE_EQUAL(lock, rwlock_t)) \ + _##optype##op((rwlock_t *)(lock), flags); \ + else __bad_rwlock_type(); \ +} while (0) + +#define PICK_RW_OP2_RET(optype, op, lock, flags) \ +({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL((lock), raw_rwlock_t)) \ + __ret = __##optype##op((raw_rwlock_t *)(lock), flags); \ + else if (TYPE_EQUAL(lock, rwlock_t)) \ + __ret = _##optype##op((rwlock_t *)(lock), flags); \ + else __bad_rwlock_type(); \ + \ + __ret; \ +}) + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key); +# define _raw_spin_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __raw_spin_lock_init((lock), #lock, &__key); \ +} while (0) + +#else +#define __raw_spin_lock_init(lock) \ + do { *(lock) = RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) +# define _raw_spin_lock_init(lock) __raw_spin_lock_init(lock) +#endif + +#define PICK_OP_INIT(op, lock) \ +do { \ + if (TYPE_EQUAL((lock), raw_spinlock_t)) \ + _raw_spin##op((raw_spinlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, spinlock_t)) \ + _spin##op((spinlock_t *)(lock), #lock, __FILE__, __LINE__); \ + else __bad_spinlock_type(); \ +} while (0) + + +#define spin_lock_init(lock) 
PICK_OP_INIT(_lock_init, lock) + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, + struct lock_class_key *key); +# define _raw_rwlock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __raw_rwlock_init((lock), #lock, &__key); \ +} while (0) +#else +#define __raw_rwlock_init(lock) \ + do { *(lock) = RAW_RW_LOCK_UNLOCKED(lock); } while (0) +# define _raw_rwlock_init(lock) __raw_rwlock_init(lock) +#endif + +#define __PICK_RW_OP_INIT(optype, op, lock) \ +do { \ + if (TYPE_EQUAL((lock), raw_rwlock_t)) \ + _raw_##optype##op((raw_rwlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, rwlock_t)) \ + _##optype##op((rwlock_t *)(lock), #lock, __FILE__, __LINE__);\ + else __bad_spinlock_type(); \ +} while (0) + +#define rwlock_init(lock) __PICK_RW_OP_INIT(rwlock, _init, lock) + +#define __spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) + +#define spin_is_locked(lock) PICK_OP_RET(_is_locked, lock) + +#define __spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) + +#define spin_unlock_wait(lock) PICK_OP(_unlock_wait, lock) +/* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various * methods are defined as nops in the case they are not required. 
*/ -#define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) -#define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) -#define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) +// #define spin_trylock(lock) _spin_trylock(lock) +#define spin_trylock(lock) __cond_lock(lock, PICK_OP_RET(_trylock, lock)) + +//#define read_trylock(lock) _read_trylock(lock) +#define read_trylock(lock) __cond_lock(lock, PICK_RW_OP_RET(read, _trylock, lock)) + +//#define write_trylock(lock) _write_trylock(lock) +#define write_trylock(lock) __cond_lock(lock, PICK_RW_OP_RET(write, _trylock, lock)) + +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, PICK_RW_OP2_RET(write, _trylock_irqsave, lock, &flags)) + +#define __spin_can_lock(lock) __raw_spin_can_lock(&(lock)->raw_lock) +#define __read_can_lock(lock) __raw_read_can_lock(&(lock)->raw_lock) +#define __write_can_lock(lock) __raw_write_can_lock(&(lock)->raw_lock) -#define spin_lock(lock) _spin_lock(lock) +#define spin_can_lock(lock) \ + __cond_lock(lock, PICK_OP_RET(_can_lock, lock)) + +#define read_can_lock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(read, _can_lock, lock)) + +#define write_can_lock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(write, _can_lock, lock)) + +// #define spin_lock(lock) _spin_lock(lock) +#define spin_lock(lock) PICK_OP(_lock, lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) +# define spin_lock_nested(lock, subclass) PICK_OP2(_lock_nested, lock, subclass) #else -# define spin_lock_nested(lock, subclass) _spin_lock(lock) +# define spin_lock_nested(lock, subclass) spin_lock(lock) #endif -#define write_lock(lock) _write_lock(lock) -#define read_lock(lock) _read_lock(lock) +//#define write_lock(lock) _write_lock(lock) +#define write_lock(lock) PICK_RW_OP(write, _lock, lock) -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +// #define read_lock(lock) _read_lock(lock) +#define read_lock(lock) 
PICK_RW_OP(read, _lock, lock) -#define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock) -#define read_lock_irqsave(lock, flags) flags = _read_lock_irqsave(lock) -#define write_lock_irqsave(lock, flags) flags = _write_lock_irqsave(lock) +# define spin_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_OP_RET(_lock_irqsave, lock); \ +} while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - flags = _spin_lock_irqsave_nested(lock, subclass) +# define spin_lock_irqsave_nested(lock, flags, subclass) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_OP2_RET(_lock_irqsave_nested, lock, subclass); \ +} while (0) #else -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - flags = _spin_lock_irqsave(lock) +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + spin_lock_irqsave(lock, flags) #endif -#else +# define read_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_RW_OP_RET(read, _lock_irqsave, lock); \ +} while (0) -#define spin_lock_irqsave(lock, flags) _spin_lock_irqsave(lock, flags) -#define read_lock_irqsave(lock, flags) _read_lock_irqsave(lock, flags) -#define write_lock_irqsave(lock, flags) _write_lock_irqsave(lock, flags) -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - spin_lock_irqsave(lock, flags) +# define write_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_RW_OP_RET(write, _lock_irqsave, lock); \ +} while (0) +// #define spin_lock_irq(lock) _spin_lock_irq(lock) +// #define spin_lock_bh(lock) _spin_lock_bh(lock) +#define spin_lock_irq(lock) PICK_OP(_lock_irq, lock) +#define spin_lock_bh(lock) PICK_OP(_lock_bh, lock) + +// #define read_lock_irq(lock) _read_lock_irq(lock) +// #define read_lock_bh(lock) _read_lock_bh(lock) +#define read_lock_irq(lock) PICK_RW_OP(read, _lock_irq, lock) +#define read_lock_bh(lock) PICK_RW_OP(read, _lock_bh, lock) + +// #define 
write_lock_irq(lock) _write_lock_irq(lock) +// #define write_lock_bh(lock) _write_lock_bh(lock) +#define write_lock_irq(lock) PICK_RW_OP(write, _lock_irq, lock) +#define write_lock_bh(lock) PICK_RW_OP(write, _lock_bh, lock) + +// #define spin_unlock(lock) _spin_unlock(lock) +// #define write_unlock(lock) _write_unlock(lock) +// #define read_unlock(lock) _read_unlock(lock) +#define spin_unlock(lock) PICK_OP(_unlock, lock) +#define read_unlock(lock) PICK_RW_OP(read, _unlock, lock) +#define write_unlock(lock) PICK_RW_OP(write, _unlock, lock) + +// #define spin_unlock(lock) _spin_unlock_no_resched(lock) +#define spin_unlock_no_resched(lock) \ + PICK_OP(_unlock_no_resched, lock) + +//#define spin_unlock_irqrestore(lock, flags) +// _spin_unlock_irqrestore(lock, flags) +//#define spin_unlock_irq(lock) _spin_unlock_irq(lock) +//#define spin_unlock_bh(lock) _spin_unlock_bh(lock) +#define spin_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_OP2(_unlock_irqrestore, lock, flags); \ +} while (0) + +#define spin_unlock_irq(lock) PICK_OP(_unlock_irq, lock) +#define spin_unlock_bh(lock) PICK_OP(_unlock_bh, lock) + +// #define read_unlock_irqrestore(lock, flags) +// _read_unlock_irqrestore(lock, flags) +// #define read_unlock_irq(lock) _read_unlock_irq(lock) +// #define read_unlock_bh(lock) _read_unlock_bh(lock) +#define read_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_RW_OP2(read, _unlock_irqrestore, lock, flags); \ +} while (0) + +#define read_unlock_irq(lock) PICK_RW_OP(read, _unlock_irq, lock) +#define read_unlock_bh(lock) PICK_RW_OP(read, _unlock_bh, lock) + +// #define write_unlock_irqrestore(lock, flags) +// _write_unlock_irqrestore(lock, flags) +// #define write_unlock_irq(lock) _write_unlock_irq(lock) +// #define write_unlock_bh(lock) _write_unlock_bh(lock) +#define write_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_RW_OP2(write, _unlock_irqrestore, lock, flags); \ 
+} while (0) +#define write_unlock_irq(lock) PICK_RW_OP(write, _unlock_irq, lock) +#define write_unlock_bh(lock) PICK_RW_OP(write, _unlock_bh, lock) + +// #define spin_trylock_bh(lock) _spin_trylock_bh(lock) +#define spin_trylock_bh(lock) __cond_lock(lock, PICK_OP_RET(_trylock_bh, lock)) + +// #define spin_trylock_irq(lock) + +#define spin_trylock_irq(lock) __cond_lock(lock, PICK_OP_RET(_trylock_irq, lock)) + +// #define spin_trylock_irqsave(lock, flags) + +#define spin_trylock_irqsave(lock, flags) \ + __cond_lock(lock, PICK_OP2_RET(_trylock_irqsave, lock, &flags)) + +/* "lock on reference count zero" */ +#ifndef ATOMIC_DEC_AND_LOCK +# include + extern int __atomic_dec_and_spin_lock(atomic_t *atomic, raw_spinlock_t *lock); #endif -#define spin_lock_irq(lock) _spin_lock_irq(lock) -#define spin_lock_bh(lock) _spin_lock_bh(lock) +#define atomic_dec_and_lock(atomic, lock) \ +__cond_lock(lock, ({ \ + unsigned long __ret; \ + \ + if (TYPE_EQUAL(lock, raw_spinlock_t)) \ + __ret = __atomic_dec_and_spin_lock(atomic, \ + (raw_spinlock_t *)(lock)); \ + else if (TYPE_EQUAL(lock, spinlock_t)) \ + __ret = _atomic_dec_and_spin_lock(atomic, \ + (spinlock_t *)(lock)); \ + else __ret = __bad_spinlock_type(); \ + \ + __ret; \ +})) -#define read_lock_irq(lock) _read_lock_irq(lock) -#define read_lock_bh(lock) _read_lock_bh(lock) -#define write_lock_irq(lock) _write_lock_irq(lock) -#define write_lock_bh(lock) _write_lock_bh(lock) +/* + * bit-based spin_lock() + * + * Don't use this unless you really need to: spin_lock() and spin_unlock() + * are significantly faster. + */ +static inline void bit_spin_lock(int bitnum, unsigned long *addr) +{ + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. 
+ */ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + while (test_and_set_bit(bitnum, addr)) + while (test_bit(bitnum, addr)) + cpu_relax(); +#endif + __acquire(bitlock); +} /* - * We inline the unlock functions in the nondebug case: + * Return true if it was acquired */ -#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ - !defined(CONFIG_SMP) -# define spin_unlock(lock) _spin_unlock(lock) -# define read_unlock(lock) _read_unlock(lock) -# define write_unlock(lock) _write_unlock(lock) -# define spin_unlock_irq(lock) _spin_unlock_irq(lock) -# define read_unlock_irq(lock) _read_unlock_irq(lock) -# define write_unlock_irq(lock) _write_unlock_irq(lock) -#else -# define spin_unlock(lock) \ - do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define read_unlock(lock) \ - do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define write_unlock(lock) \ - do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define spin_unlock_irq(lock) \ -do { \ - __raw_spin_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) -# define read_unlock_irq(lock) \ -do { \ - __raw_read_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) -# define write_unlock_irq(lock) \ -do { \ - __raw_write_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) +static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + if (test_and_set_bit(bitnum, addr)) + return 0; #endif + __acquire(bitlock); + return 1; +} -#define spin_unlock_irqrestore(lock, flags) \ - _spin_unlock_irqrestore(lock, flags) -#define spin_unlock_bh(lock) _spin_unlock_bh(lock) - -#define read_unlock_irqrestore(lock, flags) \ - _read_unlock_irqrestore(lock, flags) -#define read_unlock_bh(lock) _read_unlock_bh(lock) - -#define 
write_unlock_irqrestore(lock, flags) \ - _write_unlock_irqrestore(lock, flags) -#define write_unlock_bh(lock) _write_unlock_bh(lock) - -#define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) - -#define spin_trylock_irq(lock) \ -({ \ - local_irq_disable(); \ - spin_trylock(lock) ? \ - 1 : ({ local_irq_enable(); 0; }); \ -}) +/* + * bit-based spin_unlock() + */ +static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + BUG_ON(!test_bit(bitnum, addr)); + smp_mb__before_clear_bit(); + clear_bit(bitnum, addr); +#endif + __release(bitlock); +} -#define spin_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - spin_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) +/* + * Return true if the lock is held. + */ +static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + return test_bit(bitnum, addr); +#else + return 1; +#endif +} -#define write_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - write_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) +/** + * __raw_spin_can_lock - would __raw_spin_trylock() succeed? + * @lock: the spinlock in question. + */ +#define __raw_spin_can_lock(lock) (!__raw_spin_is_locked(lock)) /* * Locks two spinlocks l1 and l2. * l1_first indicates if spinlock l1 should be taken first. 
*/ -static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2, - bool l1_first) +static inline void +raw_double_spin_lock(raw_spinlock_t *l1, raw_spinlock_t *l2, bool l1_first) __acquires(l1) __acquires(l2) { @@ -307,13 +730,29 @@ static inline void double_spin_lock(spin } } +static inline void +double_spin_lock(spinlock_t *l1, spinlock_t *l2, bool l1_first) + __acquires(l1) + __acquires(l2) +{ + if (l1_first) { + spin_lock(l1); + spin_lock(l2); + } else { + spin_lock(l2); + spin_lock(l1); + } +} + + /* * Unlocks two spinlocks l1 and l2. * l1_taken_first indicates if spinlock l1 was taken first and therefore * should be released after spinlock l2. */ -static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2, - bool l1_taken_first) +static inline void +raw_double_spin_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2, + bool l1_taken_first) __releases(l1) __releases(l2) { @@ -326,24 +765,19 @@ static inline void double_spin_unlock(sp } } -/* - * Pull the atomic_t declaration: - * (asm-mips/atomic.h needs above definitions) - */ -#include -/** - * atomic_dec_and_lock - lock on reaching reference count zero - * @atomic: the atomic counter - * @lock: the spinlock in question - */ -extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); -#define atomic_dec_and_lock(atomic, lock) \ - __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) - -/** - * spin_can_lock - would spin_trylock() succeed? - * @lock: the spinlock in question. 
- */ -#define spin_can_lock(lock) (!spin_is_locked(lock)) +static inline void +double_spin_unlock(spinlock_t *l1, spinlock_t *l2, bool l1_taken_first) + __releases(l1) + __releases(l2) +{ + if (l1_taken_first) { + spin_unlock(l2); + spin_unlock(l1); + } else { + spin_unlock(l1); + spin_unlock(l2); + } +} #endif /* __LINUX_SPINLOCK_H */ + Index: linux-rt.q/include/linux/spinlock_api_smp.h =================================================================== --- linux-rt.q.orig/include/linux/spinlock_api_smp.h +++ linux-rt.q/include/linux/spinlock_api_smp.h @@ -19,43 +19,58 @@ int in_lock_functions(unsigned long addr #define assert_spin_locked(x) BUG_ON(!spin_is_locked(x)) -void __lockfunc _spin_lock(spinlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) - __acquires(lock); -void __lockfunc _read_lock(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock(rwlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_bh(spinlock_t *lock) __acquires(lock); -void __lockfunc _read_lock_bh(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock_bh(rwlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_irq(spinlock_t *lock) __acquires(lock); -void __lockfunc _read_lock_irq(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock_irq(rwlock_t *lock) __acquires(lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) - __acquires(lock); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) - __acquires(lock); -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) - __acquires(lock); -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) - __acquires(lock); -int __lockfunc _spin_trylock(spinlock_t *lock); -int __lockfunc _read_trylock(rwlock_t *lock); -int __lockfunc _write_trylock(rwlock_t *lock); -int __lockfunc _spin_trylock_bh(spinlock_t *lock); -void __lockfunc _spin_unlock(spinlock_t *lock) __releases(lock); -void __lockfunc 
_read_unlock(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) __releases(lock); -void __lockfunc _read_unlock_bh(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock_bh(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) __releases(lock); -void __lockfunc _read_unlock_irq(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock_irq(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) - __releases(lock); -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); +#define ACQUIRE_SPIN __acquires(lock) +#define ACQUIRE_RW __acquires(lock) +#define RELEASE_SPIN __releases(lock) +#define RELEASE_RW __releases(lock) + +void __lockfunc __spin_lock(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass) + ACQUIRE_SPIN; +void __lockfunc __read_lock(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __spin_lock_bh(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __read_lock_bh(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock_bh(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __spin_lock_irq(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __read_lock_irq(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock_irq(raw_rwlock_t *lock) ACQUIRE_RW; +unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock) + ACQUIRE_SPIN; +unsigned long __lockfunc +__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) ACQUIRE_SPIN; +unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock) + ACQUIRE_RW; +unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock) + ACQUIRE_RW; 
+int __lockfunc __spin_trylock(raw_spinlock_t *lock); +int __lockfunc +__spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags); +int __lockfunc __read_trylock(raw_rwlock_t *lock); +int __lockfunc __write_trylock(raw_rwlock_t *lock); +int __lockfunc +__write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags); +int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock); +int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock); +void __lockfunc __spin_unlock(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock) + RELEASE_SPIN; +void __lockfunc __read_unlock(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __read_unlock_bh(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock_bh(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __read_unlock_irq(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock_irq(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc +__spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) + RELEASE_SPIN; +void __lockfunc +__read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) + RELEASE_RW; +void +__lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) + RELEASE_RW; #endif /* __LINUX_SPINLOCK_API_SMP_H */ Index: linux-rt.q/include/linux/spinlock_api_up.h =================================================================== --- linux-rt.q.orig/include/linux/spinlock_api_up.h +++ linux-rt.q/include/linux/spinlock_api_up.h @@ -33,12 +33,22 @@ #define __LOCK_IRQ(lock) \ do { local_irq_disable(); __LOCK(lock); } while (0) -#define __LOCK_IRQSAVE(lock, flags) \ - do { local_irq_save(flags); __LOCK(lock); } while (0) +#define __LOCK_IRQSAVE(lock) \ + ({ unsigned long __flags; local_irq_save(__flags); __LOCK(lock); __flags; }) + +#define 
__TRYLOCK_IRQSAVE(lock, flags) \ + ({ local_irq_save(*(flags)); __LOCK(lock); 1; }) + +#define __spin_trylock_irqsave(lock, flags) __TRYLOCK_IRQSAVE(lock, flags) + +#define __write_trylock_irqsave(lock, flags) __TRYLOCK_IRQSAVE(lock, flags) #define __UNLOCK(lock) \ do { preempt_enable(); __release(lock); (void)(lock); } while (0) +#define __UNLOCK_NO_RESCHED(lock) \ + do { __preempt_enable_no_resched(); __release(lock); (void)(lock); } while (0) + #define __UNLOCK_BH(lock) \ do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0) @@ -48,34 +58,36 @@ #define __UNLOCK_IRQRESTORE(lock, flags) \ do { local_irq_restore(flags); __UNLOCK(lock); } while (0) -#define _spin_lock(lock) __LOCK(lock) -#define _spin_lock_nested(lock, subclass) __LOCK(lock) -#define _read_lock(lock) __LOCK(lock) -#define _write_lock(lock) __LOCK(lock) -#define _spin_lock_bh(lock) __LOCK_BH(lock) -#define _read_lock_bh(lock) __LOCK_BH(lock) -#define _write_lock_bh(lock) __LOCK_BH(lock) -#define _spin_lock_irq(lock) __LOCK_IRQ(lock) -#define _read_lock_irq(lock) __LOCK_IRQ(lock) -#define _write_lock_irq(lock) __LOCK_IRQ(lock) -#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _read_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _write_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _spin_trylock(lock) ({ __LOCK(lock); 1; }) -#define _read_trylock(lock) ({ __LOCK(lock); 1; }) -#define _write_trylock(lock) ({ __LOCK(lock); 1; }) -#define _spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) -#define _spin_unlock(lock) __UNLOCK(lock) -#define _read_unlock(lock) __UNLOCK(lock) -#define _write_unlock(lock) __UNLOCK(lock) -#define _spin_unlock_bh(lock) __UNLOCK_BH(lock) -#define _write_unlock_bh(lock) __UNLOCK_BH(lock) -#define _read_unlock_bh(lock) __UNLOCK_BH(lock) -#define _spin_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _read_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _write_unlock_irq(lock) 
__UNLOCK_IRQ(lock) -#define _spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) -#define _read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) -#define _write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __spin_lock(lock) __LOCK(lock) +#define __spin_lock_nested(lock, subclass) __LOCK(lock) +#define __read_lock(lock) __LOCK(lock) +#define __write_lock(lock) __LOCK(lock) +#define __spin_lock_bh(lock) __LOCK_BH(lock) +#define __read_lock_bh(lock) __LOCK_BH(lock) +#define __write_lock_bh(lock) __LOCK_BH(lock) +#define __spin_lock_irq(lock) __LOCK_IRQ(lock) +#define __read_lock_irq(lock) __LOCK_IRQ(lock) +#define __write_lock_irq(lock) __LOCK_IRQ(lock) +#define __spin_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __read_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __write_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __spin_trylock(lock) ({ __LOCK(lock); 1; }) +#define __read_trylock(lock) ({ __LOCK(lock); 1; }) +#define __write_trylock(lock) ({ __LOCK(lock); 1; }) +#define __spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) +#define __spin_trylock_irq(lock) ({ __LOCK_IRQ(lock); 1; }) +#define __spin_unlock(lock) __UNLOCK(lock) +#define __spin_unlock_no_resched(lock) __UNLOCK_NO_RESCHED(lock) +#define __read_unlock(lock) __UNLOCK(lock) +#define __write_unlock(lock) __UNLOCK(lock) +#define __spin_unlock_bh(lock) __UNLOCK_BH(lock) +#define __write_unlock_bh(lock) __UNLOCK_BH(lock) +#define __read_unlock_bh(lock) __UNLOCK_BH(lock) +#define __spin_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __read_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __write_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) #endif /* __LINUX_SPINLOCK_API_UP_H */ Index: 
linux-rt.q/include/linux/spinlock_types.h =================================================================== --- linux-rt.q.orig/include/linux/spinlock_types.h +++ linux-rt.q/include/linux/spinlock_types.h @@ -9,7 +9,22 @@ * Released under the General Public License (GPL). */ -#include +/* + * Must define these before including other files, inline functions need them + */ +#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME + +#define LOCK_SECTION_START(extra) \ + ".subsection 1\n\t" \ + extra \ + ".ifndef " LOCK_SECTION_NAME "\n\t" \ + LOCK_SECTION_NAME ":\n\t" \ + ".endif\n" + +#define LOCK_SECTION_END \ + ".previous\n\t" + +#define __lockfunc fastcall __attribute__((section(".spinlock.text"))) #if defined(CONFIG_SMP) # include @@ -17,8 +32,10 @@ # include #endif +#include + typedef struct { - raw_spinlock_t raw_lock; + __raw_spinlock_t raw_lock; #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) unsigned int break_lock; #endif @@ -29,12 +46,12 @@ typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif -} spinlock_t; +} raw_spinlock_t; #define SPINLOCK_MAGIC 0xdead4ead typedef struct { - raw_rwlock_t raw_lock; + __raw_rwlock_t raw_lock; #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) unsigned int break_lock; #endif @@ -45,7 +62,7 @@ typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif -} rwlock_t; +} raw_rwlock_t; #define RWLOCK_MAGIC 0xdeaf1eed @@ -64,24 +81,24 @@ typedef struct { #endif #ifdef CONFIG_DEBUG_SPINLOCK -# define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ +# define _RAW_SPIN_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ .magic = SPINLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1, \ SPIN_DEP_MAP_INIT(lockname) } -#define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ +#define _RAW_RW_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ .magic = 
RWLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1, \ RW_DEP_MAP_INIT(lockname) } #else -# define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ +# define _RAW_SPIN_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ SPIN_DEP_MAP_INIT(lockname) } -#define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ +# define _RAW_RW_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ RW_DEP_MAP_INIT(lockname) } #endif @@ -91,10 +108,22 @@ typedef struct { * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate. */ -#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) -#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) +# define RAW_SPIN_LOCK_UNLOCKED(lockname) \ + (raw_spinlock_t) _RAW_SPIN_LOCK_UNLOCKED(lockname) + +# define RAW_RW_LOCK_UNLOCKED(lockname) \ + (raw_rwlock_t) _RAW_RW_LOCK_UNLOCKED(lockname) + +#define DEFINE_RAW_SPINLOCK(name) \ + raw_spinlock_t name __cacheline_aligned_in_smp = \ + RAW_SPIN_LOCK_UNLOCKED(name) + +#define __DEFINE_RAW_SPINLOCK(name) \ + raw_spinlock_t name = RAW_SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_RAW_RWLOCK(name) \ + raw_rwlock_t name __cacheline_aligned_in_smp = \ + RAW_RW_LOCK_UNLOCKED(name) #endif /* __LINUX_SPINLOCK_TYPES_H */ Index: linux-rt.q/include/linux/spinlock_types_up.h =================================================================== --- linux-rt.q.orig/include/linux/spinlock_types_up.h +++ linux-rt.q/include/linux/spinlock_types_up.h @@ -12,21 +12,17 @@ * Released under the General Public License (GPL). 
*/ -#if defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_DEBUG_LOCK_ALLOC) +#ifdef CONFIG_DEBUG_SPINLOCK typedef struct { volatile unsigned int slock; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 1 } #else -typedef struct { } raw_spinlock_t; +typedef struct { } __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { } @@ -34,10 +30,7 @@ typedef struct { } raw_spinlock_t; typedef struct { /* no debug version on UP */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { } Index: linux-rt.q/include/linux/spinlock_up.h =================================================================== --- linux-rt.q.orig/include/linux/spinlock_up.h +++ linux-rt.q/include/linux/spinlock_up.h @@ -20,19 +20,19 @@ #ifdef CONFIG_DEBUG_SPINLOCK #define __raw_spin_is_locked(x) ((x)->slock == 0) -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { lock->slock = 0; } static inline void -__raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +__raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { local_irq_save(flags); lock->slock = 0; } -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { char oldval = lock->slock; @@ -41,7 +41,7 @@ static inline int __raw_spin_trylock(raw return oldval > 0; } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { lock->slock = 1; } Index: linux-rt.q/init/main.c =================================================================== --- linux-rt.q.orig/init/main.c +++ linux-rt.q/init/main.c @@ -440,7 +440,7 @@ static void noinline __init_refok rest_i * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); + 
__preempt_enable_no_resched(); schedule(); preempt_disable(); Index: linux-rt.q/kernel/Makefile =================================================================== --- linux-rt.q.orig/kernel/Makefile +++ linux-rt.q/kernel/Makefile @@ -7,12 +7,15 @@ obj-y = sched.o fork.o exec_domain.o sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +ifneq ($(CONFIG_PREEMPT_RT),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -24,6 +27,7 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_PREEMPT_RT) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o Index: linux-rt.q/kernel/fork.c =================================================================== --- linux-rt.q.orig/kernel/fork.c +++ linux-rt.q/kernel/fork.c @@ -941,6 +941,9 @@ static inline void rt_mutex_init_task(st #ifdef CONFIG_RT_MUTEXES plist_head_init(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + p->last_kernel_lock = NULL; +# endif #endif } @@ -1102,7 +1105,6 @@ static struct task_struct *copy_process( #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; @@ -1131,6 +1133,9 @@ static struct task_struct *copy_process( retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_namespaces; +#ifdef CONFIG_DEBUG_PREEMPT 
+ p->lock_count = 0; +#endif p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* Index: linux-rt.q/kernel/futex.c =================================================================== --- linux-rt.q.orig/kernel/futex.c +++ linux-rt.q/kernel/futex.c @@ -2089,7 +2089,11 @@ static int __init init(void) } for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { +#ifdef CONFIG_PREEMPT_RT + plist_head_init(&futex_queues[i].chain, NULL); +#else plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); +#endif spin_lock_init(&futex_queues[i].lock); } return 0; Index: linux-rt.q/kernel/hrtimer.c =================================================================== --- linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -1460,7 +1460,7 @@ static void migrate_hrtimers(int cpu) tick_cancel_sched_timer(cpu); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, + raw_double_spin_lock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1468,7 +1468,7 @@ static void migrate_hrtimers(int cpu) &new_base->clock_base[i]); } - double_spin_unlock(&new_base->lock, &old_base->lock, + raw_double_spin_unlock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); local_irq_enable(); put_cpu_var(hrtimer_bases); Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -50,7 +50,7 @@ * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... 
*/ -static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static __raw_spinlock_t lockdep_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static int graph_lock(void) { Index: linux-rt.q/kernel/rt.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/rt.c @@ -0,0 +1,571 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. 
+ * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +#ifdef CONFIG_PREEMPT_RT +/* + * Unlock these on crash: + */ +void zap_rt_locks(void) +{ + //trace_lock_init(); +} +#endif + +/* + * struct mutex functions + */ +void _mutex_init(struct mutex *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(_mutex_init); + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock); + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +int __lockfunc 
_mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_interruptible(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); +#endif + +int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/* + * rwlock_t functions + */ +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret = rt_mutex_trylock(&rwlock->lock); + + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) +{ + *flags = 0; + return rt_write_trylock(rwlock); +} + +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + unsigned long flags; + int ret; + + /* + * Read locks within the self-held write lock succeed. 
+ */ + spin_lock_irqsave(&lock->wait_lock, flags); + if (rt_mutex_real_owner(lock) == current) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + rwlock->read_depth++; + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + return 1; + } + spin_unlock_irqrestore(&lock->wait_lock, flags); + + ret = rt_mutex_trylock(lock); + if (ret) + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + unsigned long flags; + struct rt_mutex *lock = &rwlock->lock; + + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + /* + * Read locks within the write lock succeed. + */ + spin_lock_irqsave(&lock->wait_lock, flags); + if (rt_mutex_real_owner(lock) == current) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + rwlock->read_depth++; + return; + } + spin_unlock_irqrestore(&lock->wait_lock, flags); + __rt_spin_lock(lock); +} + +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + struct rt_mutex *lock = &rwlock->lock; + unsigned long flags; + + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + // TRACE_WARN_ON(lock->save_state != 1); + /* + * Read locks within the self-held write lock succeed. 
+ */ + spin_lock_irqsave(&lock->wait_lock, flags); + if (rt_mutex_real_owner(lock) == current && rwlock->read_depth) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + rwlock->read_depth--; + return; + } + spin_unlock_irqrestore(&lock->wait_lock, flags); + __rt_spin_unlock(&rwlock->lock); +} +EXPORT_SYMBOL(rt_read_unlock); + +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_write_lock_irqsave); + +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_read_lock_irqsave); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map(&rwlock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&rwlock->lock, name); + rwlock->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwlock_init); + +/* + * rw_semaphores + */ + +void fastcall rt_up_write(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_write); + +void fastcall rt_up_read(struct rw_semaphore *rwsem) +{ + unsigned long flags; + + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + /* + * Read locks within the self-held write lock succeed. + */ + spin_lock_irqsave(&rwsem->lock.wait_lock, flags); + if (rt_mutex_real_owner(&rwsem->lock) == current && rwsem->read_depth) { + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rwsem->read_depth--; + return; + } + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_read); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void fastcall rt_up_read_non_owner(struct rw_semaphore *rwsem) +{ + unsigned long flags; + /* + * Read locks within the self-held write lock succeed. 
+ */ + spin_lock_irqsave(&rwsem->lock.wait_lock, flags); + if (rt_mutex_real_owner(&rwsem->lock) == current && rwsem->read_depth) { + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rwsem->read_depth--; + return; + } + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rt_mutex_unlock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_up_read_non_owner); +#endif + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void fastcall rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG(); +} +EXPORT_SYMBOL(rt_downgrade_write); + +int fastcall rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + int ret = rt_mutex_trylock(&rwsem->lock); + + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_write_trylock); + +void fastcall rt_down_write(struct rw_semaphore *rwsem) +{ + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write); + +void fastcall rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_write_nested); + +int fastcall rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + unsigned long flags; + int ret; + + /* + * Read locks within the self-held write lock succeed. 
+ */ + spin_lock_irqsave(&rwsem->lock.wait_lock, flags); + if (rt_mutex_real_owner(&rwsem->lock) == current) { + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rwsem_acquire_read(&rwsem->dep_map, 0, 1, _RET_IP_); + rwsem->read_depth++; + return 1; + } + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + + ret = rt_mutex_trylock(&rwsem->lock); + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + unsigned long flags; + + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + + /* + * Read locks within the write lock succeed. + */ + spin_lock_irqsave(&rwsem->lock.wait_lock, flags); + + if (rt_mutex_real_owner(&rwsem->lock) == current) { + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rwsem->read_depth++; + return; + } + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rt_mutex_lock(&rwsem->lock); +} + +void fastcall rt_down_read(struct rw_semaphore *rwsem) +{ + __rt_down_read(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_read); + +void fastcall rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_read(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_read_nested); + + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +/* + * Same as rt_down_read() but no lockdep calls: + */ +void fastcall rt_down_read_non_owner(struct rw_semaphore *rwsem) +{ + unsigned long flags; + /* + * Read locks within the write lock succeed. 
+ */ + spin_lock_irqsave(&rwsem->lock.wait_lock, flags); + + if (rt_mutex_real_owner(&rwsem->lock) == current) { + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rwsem->read_depth++; + return; + } + spin_unlock_irqrestore(&rwsem->lock.wait_lock, flags); + rt_mutex_lock(&rwsem->lock); +} +EXPORT_SYMBOL(rt_down_read_non_owner); + +#endif + +void fastcall __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); + lockdep_init_map(&rwsem->dep_map, name, key, 0); +#endif + __rt_mutex_init(&rwsem->lock, name); + rwsem->read_depth = 0; +} +EXPORT_SYMBOL(__rt_rwsem_init); + +/* + * Semaphores + */ +/* + * Linux Semaphores implemented via RT-mutexes. + * + * In the down() variants we use the mutex as the semaphore blocking + * object: we always acquire it, decrease the counter and keep the lock + * locked if we did the 1->0 transition. The next down() will then block. + * + * In the up() path we atomically increase the counter and do the + * unlock if we were the one doing the 0->1 transition. + */ + +static inline void __down_complete(struct semaphore *sem) +{ + int count = atomic_dec_return(&sem->count); + + if (unlikely(count > 0)) + rt_mutex_unlock(&sem->lock); +} + +void fastcall rt_down(struct semaphore *sem) +{ + rt_mutex_lock(&sem->lock); + __down_complete(sem); +} +EXPORT_SYMBOL(rt_down); + +int fastcall rt_down_interruptible(struct semaphore *sem) +{ + int ret; + + ret = rt_mutex_lock_interruptible(&sem->lock, 0); + if (ret) + return ret; + __down_complete(sem); + return 0; +} +EXPORT_SYMBOL(rt_down_interruptible); + +/* + * try to down the semaphore, 0 on success and 1 on failure. 
(inverted) + */ +int fastcall rt_down_trylock(struct semaphore *sem) +{ + /* + * Here we are a tiny bit different from ordinary Linux semaphores, + * because we can get 'transient' locking-failures when say a + * process decreases the count from 9 to 8 and locks/releases the + * embedded mutex internally. It would be quite complex to remove + * these transient failures so lets try it the simple way first: + */ + if (rt_mutex_trylock(&sem->lock)) { + __down_complete(sem); + return 0; + } + return 1; +} +EXPORT_SYMBOL(rt_down_trylock); + +void fastcall rt_up(struct semaphore *sem) +{ + int count; + + /* + * Disable preemption to make sure a highprio trylock-er cannot + * preempt us here and get into an infinite loop: + */ + preempt_disable(); + count = atomic_inc_return(&sem->count); + /* + * If we did the 0 -> 1 transition then we are the ones to unlock it: + */ + if (likely(count == 1)) + rt_mutex_unlock(&sem->lock); + preempt_enable(); +} +EXPORT_SYMBOL(rt_up); + +void fastcall __sema_init(struct semaphore *sem, int val, + char *name, char *file, int line) +{ + atomic_set(&sem->count, val); + switch (val) { + case 0: + __rt_mutex_init(&sem->lock, name); + rt_mutex_lock(&sem->lock); + break; + default: + __rt_mutex_init(&sem->lock, name); + break; + } +} +EXPORT_SYMBOL(__sema_init); + +void fastcall __init_MUTEX(struct semaphore *sem, char *name, char *file, + int line) +{ + __sema_init(sem, 1, name, file, line); +} +EXPORT_SYMBOL(__init_MUTEX); + Index: linux-rt.q/kernel/rtmutex-debug.c =================================================================== --- linux-rt.q.orig/kernel/rtmutex-debug.c +++ linux-rt.q/kernel/rtmutex-debug.c @@ -16,6 +16,7 @@ * * See rt.c in preempt-rt for proper credits and further information */ +#include #include #include #include @@ -29,66 +30,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - 
console_verbose(); \ - if (spin_is_locked(&current->pi_lock)) \ - spin_unlock(&current->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -int rt_trace_on = 1; - -void deadlock_trace_off(void) -{ - rt_trace_on = 0; -} - static void printk_task(struct task_struct *p) { if (p) @@ -116,8 +57,8 @@ static void printk_lock(struct rt_mutex void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -130,7 +71,7 @@ void debug_rt_mutex_deadlock(int detect, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -144,14 +85,15 @@ void debug_rt_mutex_print_deadlock(struc { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; task = find_task_by_pid(waiter->deadlock_task_pid); if (!task) return; - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) + return; printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected!
]\n"); @@ -178,7 +120,6 @@ void debug_rt_mutex_print_deadlock(struc printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -187,7 +128,8 @@ void debug_rt_mutex_lock(struct rt_mutex void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + if (debug_locks) + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -197,7 +139,7 @@ debug_rt_mutex_proxy_lock(struct rt_mute void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -209,9 +151,9 @@ void debug_rt_mutex_init_waiter(struct r void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } @@ -227,9 +169,36 @@ void debug_rt_mutex_init(struct rt_mutex void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (task->lock_count >= MAX_LOCK_STACK) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count overflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[task->lock_count] = lock; +#endif + task->lock_count++; +#endif } void rt_mutex_deadlock_account_unlock(struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (!task->lock_count) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count underflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } + task->lock_count--; +#ifdef 
CONFIG_PREEMPT_RT + task->owned_lock[task->lock_count] = NULL; +#endif +#endif } - Index: linux-rt.q/kernel/rtmutex.c =================================================================== --- linux-rt.q.orig/kernel/rtmutex.c +++ linux-rt.q/kernel/rtmutex.c @@ -97,6 +97,22 @@ static inline void mark_rt_mutex_waiters } #endif +int pi_initialized; + +/* + * we initialize the wait_list runtime. (Could be done build-time and/or + * boot-time.) + */ +static inline void init_lists(struct rt_mutex *lock) +{ + if (unlikely(!lock->wait_list.prio_list.prev)) { + plist_head_init(&lock->wait_list, &lock->wait_lock); +#ifdef CONFIG_DEBUG_RT_MUTEXES + pi_initialized++; +#endif + } +} + /* * Calculate task priority from the waiter list priority * @@ -253,13 +269,13 @@ static int rt_mutex_adjust_prio_chain(st plist_add(&waiter->list_entry, &lock->wait_list); /* Release the task */ - spin_unlock_irqrestore(&task->pi_lock, flags); + spin_unlock(&task->pi_lock); put_task_struct(task); /* Grab the next task */ task = rt_mutex_owner(lock); get_task_struct(task); - spin_lock_irqsave(&task->pi_lock, flags); + spin_lock(&task->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ @@ -277,10 +293,10 @@ static int rt_mutex_adjust_prio_chain(st __rt_mutex_adjust_prio(task); } - spin_unlock_irqrestore(&task->pi_lock, flags); + spin_unlock(&task->pi_lock); top_waiter = rt_mutex_top_waiter(lock); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -304,7 +320,6 @@ static inline int try_to_steal_lock(stru { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; - unsigned long flags; if (!rt_mutex_owner_pending(lock)) return 0; @@ -312,9 +327,9 @@ static inline int try_to_steal_lock(stru if (pendowner == current) return 1; - spin_lock_irqsave(&pendowner->pi_lock, flags); + spin_lock(&pendowner->pi_lock); if (current->prio >= pendowner->prio) { - 
spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); return 0; } @@ -324,7 +339,7 @@ static inline int try_to_steal_lock(stru * priority. */ if (likely(!rt_mutex_has_waiters(lock))) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); return 1; } @@ -332,7 +347,7 @@ static inline int try_to_steal_lock(stru next = rt_mutex_top_waiter(lock); plist_del(&next->pi_list_entry, &pendowner->pi_waiters); __rt_mutex_adjust_prio(pendowner); - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); /* * We are going to steal the lock and a waiter was @@ -349,10 +364,10 @@ static inline int try_to_steal_lock(stru * might be current: */ if (likely(next->task != current)) { - spin_lock_irqsave(&current->pi_lock, flags); + spin_lock(&current->pi_lock); plist_add(&next->pi_list_entry, &current->pi_waiters); __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(&current->pi_lock, flags); + spin_unlock(&current->pi_lock); } return 1; } @@ -411,14 +426,13 @@ static int try_to_take_rt_mutex(struct r */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - int detect_deadlock) + int detect_deadlock, unsigned long flags) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(&current->pi_lock, flags); + spin_lock(&current->pi_lock); __rt_mutex_adjust_prio(current); waiter->task = current; waiter->lock = lock; @@ -432,17 +446,17 @@ static int task_blocks_on_rt_mutex(struc current->pi_blocked_on = waiter; - spin_unlock_irqrestore(&current->pi_lock, flags); + spin_unlock(&current->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { - spin_lock_irqsave(&owner->pi_lock, flags); + spin_lock(&owner->pi_lock); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1;
- spin_unlock_irqrestore(&owner->pi_lock, flags); + spin_unlock(&owner->pi_lock); } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) chain_walk = 1; @@ -457,12 +471,12 @@ static int task_blocks_on_rt_mutex(struc */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, current); - spin_lock(&lock->wait_lock); + spin_lock_irq(&lock->wait_lock); return res; } @@ -475,13 +489,12 @@ static int task_blocks_on_rt_mutex(struc * * Called with lock->wait_lock held. */ -static void wakeup_next_waiter(struct rt_mutex *lock) +static void wakeup_next_waiter(struct rt_mutex *lock, int savestate) { struct rt_mutex_waiter *waiter; struct task_struct *pendowner; - unsigned long flags; - spin_lock_irqsave(&current->pi_lock, flags); + spin_lock(&current->pi_lock); waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -498,7 +511,7 @@ static void wakeup_next_waiter(struct rt rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - spin_unlock_irqrestore(&current->pi_lock, flags); + spin_unlock(&current->pi_lock); /* * Clear the pi_blocked_on variable and enqueue a possible @@ -507,7 +520,7 @@ static void wakeup_next_waiter(struct rt * waiter with higher priority than pending-owner->normal_prio * is blocked on the unboosted (pending) owner.
*/ - spin_lock_irqsave(&pendowner->pi_lock, flags); + spin_lock(&pendowner->pi_lock); WARN_ON(!pendowner->pi_blocked_on); WARN_ON(pendowner->pi_blocked_on != waiter); @@ -521,9 +534,12 @@ static void wakeup_next_waiter(struct rt next = rt_mutex_top_waiter(lock); plist_add(&next->pi_list_entry, &pendowner->pi_waiters); } - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); - wake_up_process(pendowner); + if (savestate) + wake_up_process_mutex(pendowner); + else + wake_up_process(pendowner); } /* @@ -532,22 +548,22 @@ static void wakeup_next_waiter(struct rt * Must be called with lock->wait_lock held */ static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + unsigned long flags) { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - unsigned long flags; int chain_walk = 0; - spin_lock_irqsave(&current->pi_lock, flags); + spin_lock(&current->pi_lock); plist_del(&waiter->list_entry, &lock->wait_list); waiter->task = NULL; current->pi_blocked_on = NULL; - spin_unlock_irqrestore(&current->pi_lock, flags); + spin_unlock(&current->pi_lock); if (first && owner != current) { - spin_lock_irqsave(&owner->pi_lock, flags); + spin_lock(&owner->pi_lock); plist_del(&waiter->pi_list_entry, &owner->pi_waiters); @@ -562,7 +578,7 @@ static void remove_waiter(struct rt_mute if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + spin_unlock(&owner->pi_lock); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); @@ -573,11 +589,11 @@ static void remove_waiter(struct rt_mute /* gets dropped in rt_mutex_adjust_prio_chain()!
*/ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - spin_lock(&lock->wait_lock); + spin_lock_irq(&lock->wait_lock); } /* @@ -598,14 +614,307 @@ void rt_mutex_adjust_pi(struct task_stru return; } - spin_unlock_irqrestore(&task->pi_lock, flags); - /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } /* + * preemptible spin_lock functions: + */ + +#ifdef CONFIG_PREEMPT_RT + +static inline void +rt_spin_lock_fastlock(struct rt_mutex *lock, + void fastcall (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + rt_mutex_deadlock_account_lock(lock, current); + else + slowfn(lock); +} + +static inline void +rt_spin_lock_fastunlock(struct rt_mutex *lock, + void fastcall (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * The wakeup side uses wake_up_process_mutex, which, combined with + * the xchg code of this function is a transparent sleep/wakeup + * mechanism nested within any existing sleep/wakeup mechanism. This + * enables the seemless use of arbitrary (blocking) spinlocks within + * sleep/wakeup event loops. 
+ */ +static void fastcall noinline __sched +rt_spin_lock_slowlock(struct rt_mutex *lock) +{ + struct rt_mutex_waiter waiter; + unsigned long saved_state, state, flags; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock)) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + return; + } + + BUG_ON(rt_mutex_owner(lock) == current); + + /* + * Here we save whatever state the task was in originally, + * we'll restore it at the end of the function and we'll take + * any intermediate wakeup into account as well, independently + * of the lock sleep/wakeup mechanism. When we get a real + * wakeup the task->state is TASK_RUNNING and we change + * saved_state accordingly. If we did not get a real wakeup + * then we return with the saved state. + */ + saved_state = xchg(&current->state, TASK_UNINTERRUPTIBLE); + + for (;;) { + unsigned long saved_flags; + int saved_lock_depth = current->lock_depth; + + /* Try to acquire the lock */ + if (try_to_take_rt_mutex(lock)) + break; + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + task_blocks_on_rt_mutex(lock, &waiter, 0, flags); + /* Wakeup during boost ? */ + if (unlikely(!waiter.task)) + continue; + } + + /* + * Prevent schedule() to drop BKL, while waiting for + * the lock ! We restore lock_depth when we come back. 
+ */ + saved_flags = current->flags & PF_NOSCHED; + current->lock_depth = -1; + current->flags &= ~PF_NOSCHED; + spin_unlock_irqrestore(&lock->wait_lock, flags); + + debug_rt_mutex_print_deadlock(&waiter); + + schedule_rt_mutex(lock); + + spin_lock_irqsave(&lock->wait_lock, flags); + current->flags |= saved_flags; + current->lock_depth = saved_lock_depth; + state = xchg(&current->state, TASK_UNINTERRUPTIBLE); + if (unlikely(state == TASK_RUNNING)) + saved_state = TASK_RUNNING; + } + + state = xchg(&current->state, saved_state); + if (unlikely(state == TASK_RUNNING)) + current->state = TASK_RUNNING; + + /* + * Extremely rare case, if we got woken up by a non-mutex wakeup, + * and we managed to steal the lock despite us not being the + * highest-prio waiter (due to SCHED_OTHER changing prio), then we + * can end up with a non-NULL waiter.task: + */ + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter, flags); + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock_irqrestore(&lock->wait_lock, flags); + + debug_rt_mutex_free_waiter(&waiter); +} + +/* + * Slow path to release a rt_mutex spin_lock style + */ +static void fastcall noinline __sched +rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + unsigned long flags; + + spin_lock_irqsave(&lock->wait_lock, flags); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + spin_unlock_irqrestore(&lock->wait_lock, flags); + return; + } + + wakeup_next_waiter(lock, 1); + + spin_unlock_irqrestore(&lock->wait_lock, flags); + + /* Undo pi boosting when necessary */ + rt_mutex_adjust_prio(current); +} + +void __lockfunc rt_spin_lock(spinlock_t *lock) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock); + +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) 
+{ + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); +} +EXPORT_SYMBOL(__rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_unlock_wait); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + int ret; + + *flags = 0; + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_irqsave); + +int _atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + rt_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + rt_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_spin_lock); + +void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +#endif + +#ifdef CONFIG_PREEMPT_BKL + +static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags) +{ + int saved_lock_depth = current->lock_depth; + + current->lock_depth = -1; + /* + * try_to_take_lock set the waiters, make sure it's + * still correct. + */ + fixup_rt_mutex_waiters(lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); + + up(&kernel_sem); + + spin_lock_irq(&lock->wait_lock); + + return saved_lock_depth; +} + +static inline void rt_reacquire_bkl(int saved_lock_depth) +{ + down(&kernel_sem); + current->lock_depth = saved_lock_depth; +} + +#else +# define rt_release_bkl(lock, flags) (-1) +# define rt_reacquire_bkl(depth) do { } while (0) +#endif + +/* * Slow path lock function: */ static int __sched @@ -613,20 +922,29 @@ rt_mutex_slowlock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock) { + int ret = 0, saved_lock_depth = -1; struct rt_mutex_waiter waiter; - int ret = 0; + unsigned long flags; debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; - spin_lock(&lock->wait_lock); + spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } + /* + * We drop the BKL here before we go into the wait loop to avoid a + * possible deadlock in the 
scheduler. + */ + if (unlikely(current->lock_depth >= 0)) + saved_lock_depth = rt_release_bkl(lock, flags); + set_current_state(state); /* Setup the timer, when timeout != NULL */ @@ -635,6 +953,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, HRTIMER_MODE_ABS); for (;;) { + unsigned long saved_flags; + /* Try to acquire the lock: */ if (try_to_take_rt_mutex(lock)) break; @@ -660,7 +980,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, */ if (!waiter.task) { ret = task_blocks_on_rt_mutex(lock, &waiter, - detect_deadlock); + detect_deadlock, flags); /* * If we got woken up by the owner then start loop * all over without going into schedule to try @@ -679,22 +999,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, if (unlikely(ret)) break; } + saved_flags = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; - spin_unlock(&lock->wait_lock); + spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(&waiter); if (waiter.task) schedule_rt_mutex(lock); - spin_lock(&lock->wait_lock); + spin_lock_irq(&lock->wait_lock); + + current->flags |= saved_flags; set_current_state(state); } set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) - remove_waiter(lock, &waiter); + remove_waiter(lock, &waiter, flags); /* * try_to_take_rt_mutex() sets the waiter bit @@ -702,7 +1026,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -716,6 +1040,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, if (unlikely(ret)) rt_mutex_adjust_prio(current); + /* Must we reaquire the BKL? 
*/ + if (unlikely(saved_lock_depth >= 0)) + rt_reacquire_bkl(saved_lock_depth); + debug_rt_mutex_free_waiter(&waiter); return ret; @@ -727,12 +1055,15 @@ rt_mutex_slowlock(struct rt_mutex *lock, static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret = 0; - spin_lock(&lock->wait_lock); + spin_lock_irqsave(&lock->wait_lock, flags); if (likely(rt_mutex_owner(lock) != current)) { + init_lists(lock); + ret = try_to_take_rt_mutex(lock); /* * try_to_take_rt_mutex() sets the lock waiters @@ -741,7 +1072,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lo fixup_rt_mutex_waiters(lock); } - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -752,7 +1083,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lo static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { - spin_lock(&lock->wait_lock); + unsigned long flags; + + spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -760,13 +1093,13 @@ rt_mutex_slowunlock(struct rt_mutex *loc if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); return; } - wakeup_next_waiter(lock); + wakeup_next_waiter(lock, 0); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); /* Undo pi boosting if necessary: */ rt_mutex_adjust_prio(current); Index: linux-rt.q/kernel/rwsem.c =================================================================== --- linux-rt.q.orig/kernel/rwsem.c +++ linux-rt.q/kernel/rwsem.c @@ -15,7 +15,7 @@ /* * lock for reading */ -void down_read(struct rw_semaphore *sem) +void compat_down_read(struct compat_rw_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -23,12 +23,12 @@ void down_read(struct rw_semaphore *sem) __down_read(sem); } -EXPORT_SYMBOL(down_read); +EXPORT_SYMBOL(compat_down_read); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int 
down_read_trylock(struct rw_semaphore *sem) +int compat_down_read_trylock(struct compat_rw_semaphore *sem) { int ret = __down_read_trylock(sem); @@ -37,12 +37,12 @@ int down_read_trylock(struct rw_semaphor return ret; } -EXPORT_SYMBOL(down_read_trylock); +EXPORT_SYMBOL(compat_down_read_trylock); /* * lock for writing */ -void down_write(struct rw_semaphore *sem) +void compat_down_write(struct compat_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -50,12 +50,12 @@ void down_write(struct rw_semaphore *sem __down_write(sem); } -EXPORT_SYMBOL(down_write); +EXPORT_SYMBOL(compat_down_write); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int down_write_trylock(struct rw_semaphore *sem) +int compat_down_write_trylock(struct compat_rw_semaphore *sem) { int ret = __down_write_trylock(sem); @@ -64,36 +64,36 @@ int down_write_trylock(struct rw_semapho return ret; } -EXPORT_SYMBOL(down_write_trylock); +EXPORT_SYMBOL(compat_down_write_trylock); /* * release a read lock */ -void up_read(struct rw_semaphore *sem) +void compat_up_read(struct compat_rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_read(sem); } -EXPORT_SYMBOL(up_read); +EXPORT_SYMBOL(compat_up_read); /* * release a write lock */ -void up_write(struct rw_semaphore *sem) +void compat_up_write(struct compat_rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_write(sem); } -EXPORT_SYMBOL(up_write); +EXPORT_SYMBOL(compat_up_write); /* * downgrade write lock to read lock */ -void downgrade_write(struct rw_semaphore *sem) +void compat_downgrade_write(struct compat_rw_semaphore *sem) { /* * lockdep: a downgraded write will live on as a write @@ -102,11 +102,11 @@ void downgrade_write(struct rw_semaphore __downgrade_write(sem); } -EXPORT_SYMBOL(downgrade_write); +EXPORT_SYMBOL(compat_downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void down_read_nested(struct rw_semaphore *sem, int subclass) +void 
compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -114,18 +114,18 @@ void down_read_nested(struct rw_semaphor __down_read(sem); } -EXPORT_SYMBOL(down_read_nested); +EXPORT_SYMBOL(compat_down_read_nested); -void down_read_non_owner(struct rw_semaphore *sem) +void compat_down_read_non_owner(struct compat_rw_semaphore *sem) { might_sleep(); __down_read(sem); } -EXPORT_SYMBOL(down_read_non_owner); +EXPORT_SYMBOL(compat_down_read_non_owner); -void down_write_nested(struct rw_semaphore *sem, int subclass) +void compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -133,14 +133,14 @@ void down_write_nested(struct rw_semapho __down_write_nested(sem, subclass); } -EXPORT_SYMBOL(down_write_nested); +EXPORT_SYMBOL(compat_down_write_nested); -void up_read_non_owner(struct rw_semaphore *sem) +void compat_up_read_non_owner(struct compat_rw_semaphore *sem) { __up_read(sem); } -EXPORT_SYMBOL(up_read_non_owner); +EXPORT_SYMBOL(compat_up_read_non_owner); #endif Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -1392,7 +1392,8 @@ static inline int wake_idle(int cpu, str * * returns failure only if the task is already active. 
*/ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1534,17 +1535,51 @@ out: int fastcall wake_up_process(struct task_struct *p) { - int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + int ret; + + ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 0, 0); mcount(); return ret; } EXPORT_SYMBOL(wake_up_process); -int fastcall wake_up_state(struct task_struct *p, unsigned int state) +int fastcall wake_up_process_sync(struct task_struct * p) { - int ret = try_to_wake_up(p, state, 0); + int ret; + + ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 1, 0); + mcount(); + return ret; +} +EXPORT_SYMBOL(wake_up_process_sync); + +int fastcall wake_up_process_mutex(struct task_struct * p) +{ + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 0, 1); + mcount(); + return ret; +} +EXPORT_SYMBOL(wake_up_process_mutex); + +int fastcall wake_up_process_mutex_sync(struct task_struct * p) +{ + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 1, 1); + mcount(); + return ret; +} +EXPORT_SYMBOL(wake_up_process_mutex_sync); +int fastcall wake_up_state(struct task_struct *p, unsigned int state) +{ + int ret = try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0); mcount(); return ret; } @@ -3331,6 +3366,7 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + clear_tsk_need_resched_delayed(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && @@ -3367,8 +3403,9 @@ 
need_resched_nonpreemptible: rq = cpu_rq(cpu); goto need_resched_nonpreemptible; } - preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + __preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_DELAYED))) goto need_resched; } EXPORT_SYMBOL(schedule); @@ -3412,7 +3449,8 @@ need_resched: /* we could miss a preemption opportunity between schedule and now */ barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_DELAYED))) goto need_resched; } EXPORT_SYMBOL(preempt_schedule); @@ -3454,7 +3492,8 @@ need_resched: /* we could miss a preemption opportunity between schedule and now */ barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_DELAYED))) goto need_resched; } @@ -3463,7 +3502,8 @@ need_resched: int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode | TASK_RUNNING_MUTEX, + sync, 0); } EXPORT_SYMBOL(default_wake_function); @@ -3504,8 +3544,9 @@ void fastcall __wake_up(wait_queue_head_ unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); + __wake_up_common(q, mode, nr_exclusive, 1, key); spin_unlock_irqrestore(&q->lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(__wake_up); @@ -3555,8 +3596,9 @@ void fastcall complete(struct completion spin_lock_irqsave(&x->wait.lock, flags); x->done++; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0, NULL); + 1, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(complete); @@ -3567,11 +3609,18 @@ void fastcall complete_all(struct comple spin_lock_irqsave(&x->wait.lock, 
flags); x->done += UINT_MAX/2; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0, NULL); + 0, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(complete_all); +unsigned int fastcall completion_done(struct completion *x) +{ + return x->done; +} +EXPORT_SYMBOL(completion_done); + void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); @@ -4401,10 +4450,7 @@ asmlinkage long sys_sched_yield(void) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + spin_unlock_no_resched(&rq->lock); schedule(); @@ -4447,7 +4493,7 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_raw_spinlock(raw_spinlock_t *lock) { int ret = 0; @@ -4458,24 +4504,23 @@ int cond_resched_lock(spinlock_t *lock) spin_lock(lock); } if (need_resched() && system_state == SYSTEM_RUNNING) { - spin_release(&lock->dep_map, 1, _THIS_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); + spin_unlock_no_resched(lock); __cond_resched(); ret = 1; spin_lock(lock); } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_raw_spinlock); /* * Voluntarily preempt a process context that has softirqs disabled: */ int __sched cond_resched_softirq(void) { +#ifndef CONFIG_PREEMPT_RT WARN_ON_ONCE(!in_softirq()); - +#endif if (need_resched() && system_state == SYSTEM_RUNNING) { local_bh_enable(); __cond_resched(); @@ -4661,7 +4706,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = "RMSDTtZX"; static void show_task(struct task_struct *p) { @@ -4669,19 +4714,23 @@ static void show_task(struct task_struct 
unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk("%-13.13s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); + printk("%-13.13s %c [%p]", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?', p); #if (BITS_PER_LONG == 32) - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(" running "); else printk(" %08lX ", thread_saved_pc(p)); #else - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(" running task "); else printk(" %016lx ", thread_saved_pc(p)); #endif + if (task_curr(p)) + printk("[curr] "); + else if (p->se.on_rq) + printk("[on rq #%d] ", task_cpu(p)); #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long *n = end_of_stack(p); @@ -6358,7 +6407,7 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line) { #ifdef in_atomic @@ -6488,3 +6537,23 @@ void set_curr_task(int cpu, struct task_ } #endif + +#ifdef CONFIG_DEBUG_PREEMPT +void notrace preempt_enable_no_resched(void) +{ + static int once = 1; + + barrier(); + dec_preempt_count(); + + if (once && !preempt_count()) { + once = 0; + printk(KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n", + current->comm, current->pid); + dump_stack(); + } +} + +EXPORT_SYMBOL(preempt_enable_no_resched); +#endif + Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -413,7 +413,7 @@ void irq_exit(void) if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) tick_nohz_stop_sched_tick(); #endif - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } /* @@ -601,7 +601,7 @@ static int ksoftirqd(void * __data) while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending() & mask) { - 
preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); preempt_disable(); } @@ -616,7 +616,7 @@ static int ksoftirqd(void * __data) goto wait_to_die; local_irq_disable(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); set_softirq_pending(local_softirq_pending() & ~mask); local_bh_disable(); local_irq_enable(); Index: linux-rt.q/kernel/spinlock.c =================================================================== --- linux-rt.q.orig/kernel/spinlock.c +++ linux-rt.q/kernel/spinlock.c @@ -21,7 +21,7 @@ #include #include -int __lockfunc _spin_trylock(spinlock_t *lock) +int __lockfunc __spin_trylock(raw_spinlock_t *lock) { preempt_disable(); if (_raw_spin_trylock(lock)) { @@ -32,9 +32,46 @@ int __lockfunc _spin_trylock(spinlock_t preempt_enable(); return 0; } -EXPORT_SYMBOL(_spin_trylock); +EXPORT_SYMBOL(__spin_trylock); -int __lockfunc _read_trylock(rwlock_t *lock) +int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + + __preempt_enable_no_resched(); + local_irq_enable(); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(__spin_trylock_irq); + +int __lockfunc __spin_trylock_irqsave(raw_spinlock_t *lock, + unsigned long *flags) +{ + local_irq_save(*flags); + preempt_disable(); + + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + + __preempt_enable_no_resched(); + local_irq_restore(*flags); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(__spin_trylock_irqsave); + +int __lockfunc __read_trylock(raw_rwlock_t *lock) { preempt_disable(); if (_raw_read_trylock(lock)) { @@ -45,9 +82,9 @@ int __lockfunc _read_trylock(rwlock_t *l preempt_enable(); return 0; } -EXPORT_SYMBOL(_read_trylock); +EXPORT_SYMBOL(__read_trylock); -int __lockfunc _write_trylock(rwlock_t *lock) +int __lockfunc __write_trylock(raw_rwlock_t *lock) { 
preempt_disable(); if (_raw_write_trylock(lock)) { @@ -58,7 +95,21 @@ int __lockfunc _write_trylock(rwlock_t * preempt_enable(); return 0; } -EXPORT_SYMBOL(_write_trylock); +EXPORT_SYMBOL(__write_trylock); + +int __lockfunc __write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags) +{ + int ret; + + local_irq_save(*flags); + ret = __write_trylock(lock); + if (ret) + return ret; + + local_irq_restore(*flags); + return 0; +} +EXPORT_SYMBOL(__write_trylock_irqsave); /* * If lockdep is enabled then we use the non-preemption spin-ops @@ -66,17 +117,17 @@ EXPORT_SYMBOL(_write_trylock); * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ - defined(CONFIG_DEBUG_LOCK_ALLOC) + defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_PREEMPT_RT) -void __lockfunc _read_lock(rwlock_t *lock) +void __lockfunc __read_lock(raw_rwlock_t *lock) { preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock); +EXPORT_SYMBOL(__read_lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock) { unsigned long flags; @@ -95,27 +146,27 @@ unsigned long __lockfunc _spin_lock_irqs #endif return flags; } -EXPORT_SYMBOL(_spin_lock_irqsave); +EXPORT_SYMBOL(__spin_lock_irqsave); -void __lockfunc _spin_lock_irq(spinlock_t *lock) +void __lockfunc __spin_lock_irq(raw_spinlock_t *lock) { local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } -EXPORT_SYMBOL(_spin_lock_irq); +EXPORT_SYMBOL(__spin_lock_irq); -void __lockfunc _spin_lock_bh(spinlock_t *lock) +void __lockfunc __spin_lock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } -EXPORT_SYMBOL(_spin_lock_bh); +EXPORT_SYMBOL(__spin_lock_bh); -unsigned long __lockfunc _read_lock_irqsave(rwlock_t 
*lock) +unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; @@ -125,27 +176,27 @@ unsigned long __lockfunc _read_lock_irqs _raw_read_lock(lock); return flags; } -EXPORT_SYMBOL(_read_lock_irqsave); +EXPORT_SYMBOL(__read_lock_irqsave); -void __lockfunc _read_lock_irq(rwlock_t *lock) +void __lockfunc __read_lock_irq(raw_rwlock_t *lock) { local_irq_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock_irq); +EXPORT_SYMBOL(__read_lock_irq); -void __lockfunc _read_lock_bh(rwlock_t *lock) +void __lockfunc __read_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock_bh); +EXPORT_SYMBOL(__read_lock_bh); -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; @@ -155,43 +206,43 @@ unsigned long __lockfunc _write_lock_irq _raw_write_lock(lock); return flags; } -EXPORT_SYMBOL(_write_lock_irqsave); +EXPORT_SYMBOL(__write_lock_irqsave); -void __lockfunc _write_lock_irq(rwlock_t *lock) +void __lockfunc __write_lock_irq(raw_rwlock_t *lock) { local_irq_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } -EXPORT_SYMBOL(_write_lock_irq); +EXPORT_SYMBOL(__write_lock_irq); -void __lockfunc _write_lock_bh(rwlock_t *lock) +void __lockfunc __write_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } -EXPORT_SYMBOL(_write_lock_bh); +EXPORT_SYMBOL(__write_lock_bh); -void __lockfunc _spin_lock(spinlock_t *lock) +void __lockfunc __spin_lock(raw_spinlock_t *lock) { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } -EXPORT_SYMBOL(_spin_lock); +EXPORT_SYMBOL(__spin_lock); -void __lockfunc 
_write_lock(rwlock_t *lock) +void __lockfunc __write_lock(raw_rwlock_t *lock) { preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } -EXPORT_SYMBOL(_write_lock); +EXPORT_SYMBOL(__write_lock); #else /* CONFIG_PREEMPT: */ @@ -204,7 +255,7 @@ EXPORT_SYMBOL(_write_lock); */ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ +void __lockfunc __##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -214,15 +265,16 @@ void __lockfunc _##op##_lock(locktype##_ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!__raw_##op##_can_lock(&(lock)->raw_lock) && \ + (lock)->break_lock) \ + __raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ \ -EXPORT_SYMBOL(_##op##_lock); \ +EXPORT_SYMBOL(__##op##_lock); \ \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ +unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -236,23 +288,24 @@ unsigned long __lockfunc _##op##_lock_ir \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!__raw_##op##_can_lock(&(lock)->raw_lock) && \ + (lock)->break_lock) \ + __raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ +EXPORT_SYMBOL(__##op##_lock_irqsave); \ \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ +void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ { \ - _##op##_lock_irqsave(lock); \ + __##op##_lock_irqsave(lock); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irq); \ +EXPORT_SYMBOL(__##op##_lock_irq); \ \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ +void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -261,39 +314,40 @@ void 
__lockfunc _##op##_lock_bh(locktype /* irq-disabling. We use the generic preemption-aware */ \ /* function: */ \ /**/ \ - flags = _##op##_lock_irqsave(lock); \ + flags = __##op##_lock_irqsave(lock); \ local_bh_disable(); \ local_irq_restore(flags); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_bh) +EXPORT_SYMBOL(__##op##_lock_bh) /* * Build preemption-friendly versions of the following * lock-spinning functions: * - * _[spin|read|write]_lock() - * _[spin|read|write]_lock_irq() - * _[spin|read|write]_lock_irqsave() - * _[spin|read|write]_lock_bh() + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); +BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(read, raw_rwlock); +BUILD_LOCK_OPS(write, raw_rwlock); #endif /* CONFIG_PREEMPT */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); _raw_spin_lock(lock); } +EXPORT_SYMBOL(__spin_lock_nested); -EXPORT_SYMBOL(_spin_lock_nested); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +unsigned long __lockfunc +__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { unsigned long flags; @@ -312,117 +366,130 @@ unsigned long __lockfunc _spin_lock_irqs #endif return flags; } - EXPORT_SYMBOL(_spin_lock_irqsave_nested); #endif -void __lockfunc _spin_unlock(spinlock_t *lock) +void __lockfunc __spin_unlock(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(__spin_unlock); + +void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock) +{ + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); + 
__preempt_enable_no_resched(); +} +/* not exported */ -void __lockfunc _write_unlock(rwlock_t *lock) +void __lockfunc __write_unlock(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(__write_unlock); -void __lockfunc _read_unlock(rwlock_t *lock) +void __lockfunc __read_unlock(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_read_unlock); +EXPORT_SYMBOL(__read_unlock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +void __lockfunc __spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(__spin_unlock_irqrestore); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(__spin_unlock_irq); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_spin_unlock_bh); +EXPORT_SYMBOL(__spin_unlock_bh); -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc __read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - 
preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irqrestore); +EXPORT_SYMBOL(__read_unlock_irqrestore); -void __lockfunc _read_unlock_irq(rwlock_t *lock) +void __lockfunc __read_unlock_irq(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irq); +EXPORT_SYMBOL(__read_unlock_irq); -void __lockfunc _read_unlock_bh(rwlock_t *lock) +void __lockfunc __read_unlock_bh(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_read_unlock_bh); +EXPORT_SYMBOL(__read_unlock_bh); -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irqrestore); +EXPORT_SYMBOL(__write_unlock_irqrestore); -void __lockfunc _write_unlock_irq(rwlock_t *lock) +void __lockfunc __write_unlock_irq(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irq); +EXPORT_SYMBOL(__write_unlock_irq); -void __lockfunc _write_unlock_bh(rwlock_t *lock) +void __lockfunc __write_unlock_bh(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_write_unlock_bh); 
+EXPORT_SYMBOL(__write_unlock_bh); -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); @@ -431,18 +498,30 @@ int __lockfunc _spin_trylock_bh(spinlock return 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + return 0; } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(__spin_trylock_bh); -int in_lock_functions(unsigned long addr) +int notrace in_lock_functions(unsigned long addr) { /* Linker adds these: start and end of __lockfunc functions */ extern char __lock_text_start[], __lock_text_end[]; return addr >= (unsigned long)__lock_text_start - && addr < (unsigned long)__lock_text_end; + && addr < (unsigned long)__lock_text_end; } EXPORT_SYMBOL(in_lock_functions); + +void notrace __debug_atomic_dec_and_test(atomic_t *v) +{ + static int warn_once = 1; + + if (!atomic_read(v) && warn_once) { + warn_once = 0; + printk("BUG: atomic counter underflow!\n"); + WARN_ON(1); + } +} Index: linux-rt.q/kernel/stop_machine.c =================================================================== --- linux-rt.q.orig/kernel/stop_machine.c +++ linux-rt.q/kernel/stop_machine.c @@ -137,7 +137,7 @@ static void restart_machine(void) { stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } struct stop_machine_data Index: linux-rt.q/lib/debug_locks.c =================================================================== --- linux-rt.q.orig/lib/debug_locks.c +++ linux-rt.q/lib/debug_locks.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include Index: linux-rt.q/lib/dec_and_lock.c =================================================================== --- linux-rt.q.orig/lib/dec_and_lock.c +++ linux-rt.q/lib/dec_and_lock.c @@ -17,7 +17,7 @@ * because the spin-lock and the decrement must be * "atomic". 
*/ -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int __atomic_dec_and_spin_lock(atomic_t *atomic, raw_spinlock_t *lock) { #ifdef CONFIG_SMP /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ @@ -32,4 +32,4 @@ int _atomic_dec_and_lock(atomic_t *atomi return 0; } -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(__atomic_dec_and_spin_lock); Index: linux-rt.q/lib/kernel_lock.c =================================================================== --- linux-rt.q.orig/lib/kernel_lock.c +++ linux-rt.q/lib/kernel_lock.c @@ -24,7 +24,7 @@ * * Don't use in new code. */ -static DECLARE_MUTEX(kernel_sem); +DECLARE_MUTEX(kernel_sem); /* * Re-acquire the kernel semaphore. @@ -44,7 +44,7 @@ int __lockfunc __reacquire_kernel_lock(v BUG_ON(saved_lock_depth < 0); task->lock_depth = -1; - preempt_enable_no_resched(); + __preempt_enable_no_resched(); down(&kernel_sem); Index: linux-rt.q/lib/locking-selftest.c =================================================================== --- linux-rt.q.orig/lib/locking-selftest.c +++ linux-rt.q/lib/locking-selftest.c @@ -940,6 +940,9 @@ static void dotest(void (*testcase_fn)(v { unsigned long saved_preempt_count = preempt_count(); int expected_failure = 0; +#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES) + int saved_lock_count = current->lock_count; +#endif WARN_ON(irqs_disabled()); @@ -989,6 +992,9 @@ static void dotest(void (*testcase_fn)(v #endif reset_locks(); +#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES) + current->lock_count = saved_lock_count; +#endif } static inline void print_testname(const char *testname) Index: linux-rt.q/lib/plist.c =================================================================== --- linux-rt.q.orig/lib/plist.c +++ linux-rt.q/lib/plist.c @@ -53,7 +53,9 @@ static void plist_check_list(struct list static void plist_check_head(struct plist_head *head) { +#ifndef CONFIG_PREEMPT_RT WARN_ON(!head->lock); +#endif if (head->lock) 
WARN_ON_SMP(!spin_is_locked(head->lock)); plist_check_list(&head->prio_list); Index: linux-rt.q/lib/rwsem-spinlock.c =================================================================== --- linux-rt.q.orig/lib/rwsem-spinlock.c +++ linux-rt.q/lib/rwsem-spinlock.c @@ -20,7 +20,7 @@ struct rwsem_waiter { /* * initialise the semaphore */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, +void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -44,8 +44,8 @@ void __init_rwsem(struct rw_semaphore *s * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct compat_rw_semaphore * +__rwsem_do_wake(struct compat_rw_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -103,8 +103,8 @@ __rwsem_do_wake(struct rw_semaphore *sem /* * wake a single writer */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) +static inline struct compat_rw_semaphore * +__rwsem_wake_one_writer(struct compat_rw_semaphore *sem) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -125,7 +125,7 @@ __rwsem_wake_one_writer(struct rw_semaph /* * get a read lock on the semaphore */ -void fastcall __sched __down_read(struct rw_semaphore *sem) +void fastcall __sched __down_read(struct compat_rw_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -168,7 +168,7 @@ void fastcall __sched __down_read(struct /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int fastcall __down_read_trylock(struct rw_semaphore *sem) +int fastcall __down_read_trylock(struct compat_rw_semaphore *sem) { unsigned long flags; int ret = 0; @@ -191,7 +191,8 @@ int fastcall __down_read_trylock(struct * get a write lock on the 
semaphore * - we increment the waiting count anyway to indicate an exclusive lock */ -void fastcall __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +void fastcall __sched +__down_write_nested(struct compat_rw_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -231,7 +232,7 @@ void fastcall __sched __down_write_neste ; } -void fastcall __sched __down_write(struct rw_semaphore *sem) +void fastcall __sched __down_write(struct compat_rw_semaphore *sem) { __down_write_nested(sem, 0); } @@ -239,7 +240,7 @@ void fastcall __sched __down_write(struc /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int fastcall __down_write_trylock(struct rw_semaphore *sem) +int fastcall __down_write_trylock(struct compat_rw_semaphore *sem) { unsigned long flags; int ret = 0; @@ -260,7 +261,7 @@ int fastcall __down_write_trylock(struct /* * release a read lock on the semaphore */ -void fastcall __up_read(struct rw_semaphore *sem) +void fastcall __up_read(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -275,7 +276,7 @@ void fastcall __up_read(struct rw_semaph /* * release a write lock on the semaphore */ -void fastcall __up_write(struct rw_semaphore *sem) +void fastcall __up_write(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -292,7 +293,7 @@ void fastcall __up_write(struct rw_semap * downgrade a write lock into a read lock * - just wake up any readers at the front of the queue */ -void fastcall __downgrade_write(struct rw_semaphore *sem) +void fastcall __downgrade_write(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -305,7 +306,7 @@ void fastcall __downgrade_write(struct r spin_unlock_irqrestore(&sem->wait_lock, flags); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__compat_init_rwsem); EXPORT_SYMBOL(__down_read); EXPORT_SYMBOL(__down_read_trylock); EXPORT_SYMBOL(__down_write_nested); Index: linux-rt.q/lib/rwsem.c 
=================================================================== --- linux-rt.q.orig/lib/rwsem.c +++ linux-rt.q/lib/rwsem.c @@ -11,8 +11,8 @@ /* * Initialize an rwsem: */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -26,7 +26,7 @@ void __init_rwsem(struct rw_semaphore *s INIT_LIST_HEAD(&sem->wait_list); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__compat_init_rwsem); struct rwsem_waiter { struct list_head list; Index: linux-rt.q/lib/semaphore-sleepers.c =================================================================== --- linux-rt.q.orig/lib/semaphore-sleepers.c +++ linux-rt.q/lib/semaphore-sleepers.c @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -48,12 +49,12 @@ * we cannot lose wakeup events. */ -fastcall void __up(struct semaphore *sem) +fastcall void __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } -fastcall void __sched __down(struct semaphore * sem) +fastcall void __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -90,7 +91,7 @@ fastcall void __sched __down(struct sema tsk->state = TASK_RUNNING; } -fastcall int __sched __down_interruptible(struct semaphore * sem) +fastcall int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -153,7 +154,7 @@ fastcall int __sched __down_interruptibl * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. 
*/ -fastcall int __down_trylock(struct semaphore * sem) +fastcall int __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -174,3 +175,10 @@ fastcall int __down_trylock(struct semap spin_unlock_irqrestore(&sem->wait.lock, flags); return 1; } + +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-rt.q/lib/spinlock_debug.c =================================================================== --- linux-rt.q.orig/lib/spinlock_debug.c +++ linux-rt.q/lib/spinlock_debug.c @@ -13,8 +13,8 @@ #include #include -void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key) +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -23,16 +23,16 @@ void __spin_lock_init(spinlock_t *lock, debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->raw_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + lock->raw_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; lock->magic = SPINLOCK_MAGIC; lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } -EXPORT_SYMBOL(__spin_lock_init); +EXPORT_SYMBOL(__raw_spin_lock_init); -void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key) +void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -41,15 +41,15 @@ void __rwlock_init(rwlock_t *lock, const debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->raw_lock = (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; + lock->raw_lock = (__raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; lock->magic = RWLOCK_MAGIC; lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } -EXPORT_SYMBOL(__rwlock_init); +EXPORT_SYMBOL(__raw_rwlock_init); 
-static void spin_bug(spinlock_t *lock, const char *msg) +static void spin_bug(raw_spinlock_t *lock, const char *msg) { struct task_struct *owner = NULL; @@ -73,7 +73,7 @@ static void spin_bug(spinlock_t *lock, c #define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) static inline void -debug_spin_lock_before(spinlock_t *lock) +debug_spin_lock_before(raw_spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(lock->owner == current, lock, "recursion"); @@ -81,13 +81,13 @@ debug_spin_lock_before(spinlock_t *lock) lock, "cpu recursion"); } -static inline void debug_spin_lock_after(spinlock_t *lock) +static inline void debug_spin_lock_after(raw_spinlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); lock->owner = current; } -static inline void debug_spin_unlock(spinlock_t *lock) +static inline void debug_spin_unlock(raw_spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked"); @@ -98,7 +98,7 @@ static inline void debug_spin_unlock(spi lock->owner_cpu = -1; } -static void __spin_lock_debug(spinlock_t *lock) +static void __spin_lock_debug(raw_spinlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -125,7 +125,7 @@ static void __spin_lock_debug(spinlock_t } } -void _raw_spin_lock(spinlock_t *lock) +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); if (unlikely(!__raw_spin_trylock(&lock->raw_lock))) @@ -133,7 +133,7 @@ void _raw_spin_lock(spinlock_t *lock) debug_spin_lock_after(lock); } -int _raw_spin_trylock(spinlock_t *lock) +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { int ret = __raw_spin_trylock(&lock->raw_lock); @@ -148,13 +148,13 @@ int _raw_spin_trylock(spinlock_t *lock) return ret; } -void _raw_spin_unlock(spinlock_t *lock) +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { debug_spin_unlock(lock); __raw_spin_unlock(&lock->raw_lock); } -static void 
rwlock_bug(rwlock_t *lock, const char *msg) +static void rwlock_bug(raw_rwlock_t *lock, const char *msg) { if (!debug_locks_off()) return; @@ -167,8 +167,8 @@ static void rwlock_bug(rwlock_t *lock, c #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) -#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ -static void __read_lock_debug(rwlock_t *lock) +#if 1 /* __write_lock_debug() can lock up - maybe this can too? */ +static void __raw_read_lock_debug(raw_rwlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -193,13 +193,13 @@ static void __read_lock_debug(rwlock_t * } #endif -void _raw_read_lock(rwlock_t *lock) +void __lockfunc _raw_read_lock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - __raw_read_lock(&lock->raw_lock); + __raw_read_lock_debug(lock); } -int _raw_read_trylock(rwlock_t *lock) +int __lockfunc _raw_read_trylock(raw_rwlock_t *lock) { int ret = __raw_read_trylock(&lock->raw_lock); @@ -212,13 +212,13 @@ int _raw_read_trylock(rwlock_t *lock) return ret; } -void _raw_read_unlock(rwlock_t *lock) +void __lockfunc _raw_read_unlock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); __raw_read_unlock(&lock->raw_lock); } -static inline void debug_write_lock_before(rwlock_t *lock) +static inline void debug_write_lock_before(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); @@ -226,13 +226,13 @@ static inline void debug_write_lock_befo lock, "cpu recursion"); } -static inline void debug_write_lock_after(rwlock_t *lock) +static inline void debug_write_lock_after(raw_rwlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); lock->owner = current; } -static inline void debug_write_unlock(rwlock_t *lock) +static inline void debug_write_unlock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); 
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); @@ -242,8 +242,8 @@ static inline void debug_write_unlock(rw lock->owner_cpu = -1; } -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) +#if 1 /* This can cause lockups */ +static void __raw_write_lock_debug(raw_rwlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -268,14 +268,14 @@ static void __write_lock_debug(rwlock_t } #endif -void _raw_write_lock(rwlock_t *lock) +void __lockfunc _raw_write_lock(raw_rwlock_t *lock) { debug_write_lock_before(lock); - __raw_write_lock(&lock->raw_lock); + __raw_write_lock_debug(lock); debug_write_lock_after(lock); } -int _raw_write_trylock(rwlock_t *lock) +int __lockfunc _raw_write_trylock(raw_rwlock_t *lock) { int ret = __raw_write_trylock(&lock->raw_lock); @@ -290,7 +290,7 @@ int _raw_write_trylock(rwlock_t *lock) return ret; } -void _raw_write_unlock(rwlock_t *lock) +void __lockfunc _raw_write_unlock(raw_rwlock_t *lock) { debug_write_unlock(lock); __raw_write_unlock(&lock->raw_lock); Index: linux-rt.q/net/ipv4/route.c =================================================================== --- linux-rt.q.orig/net/ipv4/route.c +++ linux-rt.q/net/ipv4/route.c @@ -238,7 +238,7 @@ static spinlock_t *rt_hash_locks; spin_lock_init(&rt_hash_locks[i]); \ } #else -# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_addr(slot) ((spinlock_t *)NULL) # define rt_hash_lock_init() #endif Index: linux-rt.q/net/ipv4/tcp.c =================================================================== --- linux-rt.q.orig/net/ipv4/tcp.c +++ linux-rt.q/net/ipv4/tcp.c @@ -1144,10 +1144,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru preempt_disable(); if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else - preempt_enable_no_resched(); + preempt_enable(); #endif 
do { patches/pause-on-oops-head-tail.patch0000664000077200007720000000735310646635216017116 0ustar mingomingoSubject: [patch] introduce pause_on_oops_head/tail boot options From: Ingo Molnar if a system crashes with hard to debug oopses which scroll off the screen then it's useful to stop the crash right after the register info or right after the callback printout. Signed-off-by: Ingo Molnar --- arch/i386/kernel/traps.c | 6 +++++ arch/x86_64/kernel/traps.c | 2 + include/linux/kernel.h | 4 +++ kernel/panic.c | 49 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 60 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/traps.c +++ linux-rt.q/arch/i386/kernel/traps.c @@ -252,8 +252,14 @@ static void show_stack_log_lvl(struct ta printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } + + pause_on_oops_head(); + printk("\n%sCall Trace:\n", log_lvl); show_trace_log_lvl(task, regs, esp, log_lvl); + + pause_on_oops_tail(); + debug_show_held_locks(task); } Index: linux-rt.q/arch/x86_64/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/traps.c +++ linux-rt.q/arch/x86_64/kernel/traps.c @@ -343,9 +343,11 @@ static struct stacktrace_ops print_trace void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) { + pause_on_oops_head(); printk("\nCall Trace:\n"); dump_trace(tsk, regs, stack, &print_trace_ops, NULL); printk("\n"); + pause_on_oops_tail(); print_traces(tsk); } Index: linux-rt.q/include/linux/kernel.h =================================================================== --- linux-rt.q.orig/include/linux/kernel.h +++ linux-rt.q/include/linux/kernel.h @@ -202,6 +202,10 @@ extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; 
+ +extern void pause_on_oops_head(void); +extern void pause_on_oops_tail(void); + extern int panic_on_unrecovered_nmi; extern int tainted; extern const char *print_tainted(void); Index: linux-rt.q/kernel/panic.c =================================================================== --- linux-rt.q.orig/kernel/panic.c +++ linux-rt.q/kernel/panic.c @@ -26,7 +26,38 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -int panic_timeout; +/* + * Debugging helper: freeze all console output after printing the + * first oops's head (or tail): + */ +static int pause_on_oops_head_flag = 0; +static int pause_on_oops_tail_flag = 0; + +static void pause_on_oops_loop(int flag) +{ + switch (flag) { + default: + break; + case 1: + for (;;) + local_irq_disable(); + case 2: + for (;;) + local_irq_enable(); + } +} + +void pause_on_oops_head(void) +{ + pause_on_oops_loop(pause_on_oops_head_flag); +} + +void pause_on_oops_tail(void) +{ + pause_on_oops_loop(pause_on_oops_tail_flag); +} + +int panic_timeout __read_mostly; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -190,6 +221,22 @@ static int __init pause_on_oops_setup(ch } __setup("pause_on_oops=", pause_on_oops_setup); +static int __init pause_on_oops_head_setup(char *str) +{ + pause_on_oops_head_flag = simple_strtoul(str, NULL, 0); + printk(KERN_INFO "pause_on_oops_head: %d\n", pause_on_oops_head_flag); + return 1; +} +__setup("pause_on_oops_head=", pause_on_oops_head_setup); + +static int __init pause_on_oops_tail_setup(char *str) +{ + pause_on_oops_tail_flag = simple_strtoul(str, NULL, 0); + printk(KERN_INFO "pause_on_oops_tail: %d\n", pause_on_oops_tail_flag); + return 1; +} +__setup("pause_on_oops_tail=", pause_on_oops_tail_setup); + static void spin_msec(int msecs) { int i; patches/preempt-irqs-core.patch0000664000077200007720000012406010646635213016131 0ustar mingomingo--- include/linux/bottom_half.h | 1 include/linux/interrupt.h | 26 +++ include/linux/irq.h | 26 +++ 
include/linux/sched.h | 37 +++++ init/main.c | 7 + kernel/irq/autoprobe.c | 1 kernel/irq/chip.c | 33 ++++ kernel/irq/handle.c | 68 +++++++++- kernel/irq/internals.h | 4 kernel/irq/manage.c | 292 ++++++++++++++++++++++++++++++++++++++++++ kernel/irq/proc.c | 129 +++++++++++++----- kernel/irq/spurious.c | 11 + kernel/sched.c | 52 ++++++- kernel/softirq.c | 299 ++++++++++++++++++++++++++++++++++---------- 14 files changed, 863 insertions(+), 123 deletions(-) Index: linux-rt.q/include/linux/bottom_half.h =================================================================== --- linux-rt.q.orig/include/linux/bottom_half.h +++ linux-rt.q/include/linux/bottom_half.h @@ -2,7 +2,6 @@ #define _LINUX_BH_H extern void local_bh_disable(void); -extern void __local_bh_enable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); Index: linux-rt.q/include/linux/interrupt.h =================================================================== --- linux-rt.q.orig/include/linux/interrupt.h +++ linux-rt.q/include/linux/interrupt.h @@ -52,10 +52,12 @@ #define IRQF_SAMPLE_RANDOM 0x00000040 #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 -#define IRQF_TIMER 0x00000200 +#define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +#define IRQF_NODELAY 0x00002000 +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NODELAY) /* * Migration helpers. Scheduled for removal in 9/2007 @@ -89,7 +91,7 @@ struct irqaction { void *dev_id; struct irqaction *next; int irq; - struct proc_dir_entry *dir; + struct proc_dir_entry *dir, *threaded; }; extern irqreturn_t no_action(int cpl, void *dev_id); @@ -209,6 +211,7 @@ static inline int disable_irq_wake(unsig #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) +// FIXME: PREEMPT_RT: set_bit()? 
#define or_softirq_pending(x) (local_softirq_pending() |= (x)) #endif @@ -270,6 +273,8 @@ enum HRTIMER_SOFTIRQ, #endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ + /* Entries after this are ignored in split softirq mode */ + MAX_SOFTIRQ, }; /* softirq mask and active fields moved to irq_cpustat_t in @@ -285,10 +290,24 @@ struct softirq_action asmlinkage void do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); extern void softirq_init(void); -#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) + +#ifdef CONFIG_PREEMPT_HARDIRQS +# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr) +# define __do_raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +#else +# define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr) +#endif + extern void FASTCALL(raise_softirq_irqoff(unsigned int nr)); extern void FASTCALL(raise_softirq(unsigned int nr)); +extern void wakeup_irqd(void); +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern void wait_for_softirq(int softirq); +#else +# define wait_for_softirq(x) do {} while(0) +#endif /* Tasklets --- multithreaded analogue of BHs. 
@@ -400,6 +419,7 @@ extern void tasklet_kill(struct tasklet_ extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); +void takeover_tasklets(unsigned int cpu); /* * Autoprobing for irqs: Index: linux-rt.q/include/linux/irq.h =================================================================== --- linux-rt.q.orig/include/linux/irq.h +++ linux-rt.q/include/linux/irq.h @@ -19,10 +19,12 @@ #include #include #include +#include #include #include #include +#include struct irq_desc; typedef void fastcall (*irq_flow_handler_t)(unsigned int irq, @@ -61,6 +63,7 @@ typedef void fastcall (*irq_flow_handler #define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ #define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ +#define IRQ_NODELAY 0x40000000 /* IRQ must run immediately */ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -140,6 +143,9 @@ struct irq_chip { * @wake_depth: enable depth, for multiple set_irq_wake() callers * @irq_count: stats field to detect stalled irqs * @irqs_unhandled: stats field for spurious unhandled interrupts + * @thread: Thread pointer for threaded preemptible irq handling + * @wait_for_handler: Waitqueue to wait for a running preemptible handler + * @timestamp: Timestamp for stats and debugging * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@ -161,6 +167,9 @@ struct irq_desc { unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ unsigned int irqs_unhandled; + struct task_struct *thread; + wait_queue_head_t wait_for_handler; + cycles_t timestamp; spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; @@ -385,7 +394,22 @@ extern int set_irq_msi(unsigned int irq, #define get_irq_data(irq) 
(irq_desc[irq].handler_data) #define get_irq_msi(irq) (irq_desc[irq].msi_desc) -#endif /* CONFIG_GENERIC_HARDIRQS */ +/* Early initialization of irqs */ +extern void early_init_hardirqs(void); +extern cycles_t irq_timestamp(unsigned int irq); + +#if defined(CONFIG_PREEMPT_HARDIRQS) +extern void init_hardirqs(void); +#else +static inline void init_hardirqs(void) { } +#endif + +#else /* end GENERIC HARDIRQS */ + +static inline void early_init_hardirqs(void) { } +static inline void init_hardirqs(void) { } + +#endif /* !CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -89,6 +89,17 @@ struct sched_param { #include +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern int softirq_preemption; +#else +# define softirq_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_HARDIRQS +extern int hardirq_preemption; +#else +# define hardirq_preemption 0 +#endif + struct exec_domain; struct futex_pi_state; struct bio; @@ -256,6 +267,12 @@ extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); +#ifdef CONFIG_GENERIC_HARDIRQS +extern int debug_direct_keyboard; +#else +# define debug_direct_keyboard 0 +#endif + #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void spawn_softlockup_task(void); @@ -1393,6 +1410,8 @@ static inline void put_task_struct(struc #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_SOFTIRQ 0x04000000 /* softirq context */ +#define PF_HARDIRQ 0x08000000 /* hardirq context */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 
/* Freezer should not count it as freezeable */ @@ -1817,6 +1836,8 @@ static inline int need_resched(void) extern int cond_resched(void); extern int cond_resched_lock(spinlock_t * lock); extern int cond_resched_softirq(void); +extern int cond_resched_softirq_context(void); +extern int cond_resched_hardirq_context(void); /* * Does a critical section need to be broken due to another @@ -1832,10 +1853,20 @@ extern int cond_resched_softirq(void); * Does a critical section need to be broken due to another * task waiting or preemption being signalled: */ -static inline int lock_need_resched(spinlock_t *lock) +#define lock_need_resched(lock) \ + unlikely(need_lockbreak(lock) || need_resched()) + +static inline int softirq_need_resched(void) { - if (need_lockbreak(lock) || need_resched()) - return 1; + if (softirq_preemption && (current->flags & PF_SOFTIRQ)) + return need_resched(); + return 0; +} + +static inline int hardirq_need_resched(void) +{ + if (hardirq_preemption && (current->flags & PF_HARDIRQ)) + return need_resched(); return 0; } Index: linux-rt.q/init/main.c =================================================================== --- linux-rt.q.orig/init/main.c +++ linux-rt.q/init/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -540,8 +541,10 @@ asmlinkage void __init start_kernel(void * fragile until we cpu_idle() for the first time. */ preempt_disable(); + build_all_zonelists(); page_alloc_init(); + early_init_hardirqs(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, @@ -809,6 +812,8 @@ static int __init kernel_init(void * unu smp_prepare_cpus(max_cpus); + init_hardirqs(); + do_pre_smp_initcalls(); smp_init(); @@ -837,5 +842,7 @@ static int __init kernel_init(void * unu * initmem segments and start the user-mode stuff.. 
*/ init_post(); + WARN_ON(debug_direct_keyboard); + return 0; } Index: linux-rt.q/kernel/irq/autoprobe.c =================================================================== --- linux-rt.q.orig/kernel/irq/autoprobe.c +++ linux-rt.q/kernel/irq/autoprobe.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include Index: linux-rt.q/kernel/irq/chip.c =================================================================== --- linux-rt.q.orig/kernel/irq/chip.c +++ linux-rt.q/kernel/irq/chip.c @@ -269,8 +269,10 @@ static inline void mask_ack_irq(struct i if (desc->chip->mask_ack) desc->chip->mask_ack(irq); else { - desc->chip->mask(irq); - desc->chip->ack(irq); + if (desc->chip->mask) + desc->chip->mask(irq); + if (desc->chip->ack) + desc->chip->ack(irq); } } @@ -310,6 +312,11 @@ handle_simple_irq(unsigned int irq, stru desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); desc->status |= IRQ_INPROGRESS; + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); @@ -358,6 +365,11 @@ handle_level_irq(unsigned int irq, struc } desc->status |= IRQ_INPROGRESS; + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); @@ -411,6 +423,16 @@ handle_fasteoi_irq(unsigned int irq, str } desc->status |= IRQ_INPROGRESS; + + /* + * In the threaded case we fall back to a mask+eoi sequence: + */ + if (redirect_hardirq(desc)) { + if (desc->chip->mask) + desc->chip->mask(irq); + goto out; + } + desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); @@ -422,7 +444,6 @@ handle_fasteoi_irq(unsigned int irq, str desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - spin_unlock(&desc->lock); } @@ -471,6 +492,12 @@ handle_edge_irq(unsigned int irq, struct /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; + /* + * hardirq 
redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; + do { struct irqaction *action = desc->action; irqreturn_t action_ret; Index: linux-rt.q/kernel/irq/handle.c =================================================================== --- linux-rt.q.orig/kernel/irq/handle.c +++ linux-rt.q/kernel/irq/handle.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -131,26 +132,87 @@ irqreturn_t handle_IRQ_event(unsigned in irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; +#ifdef __i386__ + if (debug_direct_keyboard && irq == 1) + lockdep_off(); +#endif + handle_dynamic_tick(action); - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); + /* + * Unconditionally enable interrupts for threaded + * IRQ handlers: + */ + if (!hardirq_count() || !(action->flags & IRQF_DISABLED)) + local_irq_enable(); do { + unsigned int preempt_count = preempt_count(); + ret = action->handler(irq, action->dev_id); + if (preempt_count() != preempt_count) { + print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + dump_stack(); + preempt_count() = preempt_count; + } if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (status & IRQF_SAMPLE_RANDOM) { + local_irq_enable(); add_interrupt_randomness(irq); + } local_irq_disable(); +#ifdef __i386__ + if (debug_direct_keyboard && irq == 1) + lockdep_on(); +#endif return retval; } +/* + * Hack - used for development only. 
+ */ +int __read_mostly debug_direct_keyboard = 0; + +int __init debug_direct_keyboard_setup(char *str) +{ + debug_direct_keyboard = 1; + printk(KERN_INFO "Switching IRQ 1 (keyboard) to to direct!\n"); +#ifdef CONFIG_PREEMPT_RT + printk(KERN_INFO "WARNING: kernel may easily crash this way!\n"); +#endif + return 1; +} + +__setup("debug_direct_keyboard", debug_direct_keyboard_setup); + +int redirect_hardirq(struct irq_desc *desc) +{ + /* + * Direct execution: + */ + if (!hardirq_preemption || (desc->status & IRQ_NODELAY) || + !desc->thread) + return 0; + +#ifdef __i386__ + if (debug_direct_keyboard && (desc - irq_desc == 1)) + return 0; +#endif + + BUG_ON(!irqs_disabled()); + if (desc->thread && desc->thread->state != TASK_RUNNING) + wake_up_process(desc->thread); + + return 1; +} + #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ /** * __do_IRQ - original all in one highlevel IRQ handler Index: linux-rt.q/kernel/irq/internals.h =================================================================== --- linux-rt.q.orig/kernel/irq/internals.h +++ linux-rt.q/kernel/irq/internals.h @@ -10,6 +10,10 @@ extern void irq_chip_set_defaults(struct /* Set default handler: */ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); +extern int redirect_hardirq(struct irq_desc *desc); + +void recalculate_desc_flags(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq); extern void register_handler_proc(unsigned int irq, struct irqaction *action); Index: linux-rt.q/kernel/irq/manage.c =================================================================== --- linux-rt.q.orig/kernel/irq/manage.c +++ linux-rt.q/kernel/irq/manage.c @@ -8,8 +8,10 @@ */ #include -#include #include +#include +#include +#include #include #include "internals.h" @@ -33,8 +35,12 @@ void synchronize_irq(unsigned int irq) if (irq >= NR_IRQS) return; - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); + if (hardirq_preemption && !(desc->status & IRQ_NODELAY)) + 
wait_event(desc->wait_for_handler, + !(desc->status & IRQ_INPROGRESS)); + else + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); } EXPORT_SYMBOL(synchronize_irq); @@ -218,6 +224,21 @@ int set_irq_wake(unsigned int irq, unsig EXPORT_SYMBOL(set_irq_wake); /* + * If any action has IRQF_NODELAY then turn IRQ_NODELAY on: + */ +void recalculate_desc_flags(struct irq_desc *desc) +{ + struct irqaction *action; + + desc->status &= ~IRQ_NODELAY; + for (action = desc->action ; action; action = action->next) + if (action->flags & IRQF_NODELAY) + desc->status |= IRQ_NODELAY; +} + +static int start_irq_thread(int irq, struct irq_desc *desc); + +/* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. @@ -282,6 +303,9 @@ int setup_irq(unsigned int irq, struct i rand_initialize_irq(irq); } + if (!(new->flags & IRQF_NODELAY)) + if (start_irq_thread(irq, desc)) + return -ENOMEM; /* * The following block of code has to be executed atomically */ @@ -325,6 +349,11 @@ int setup_irq(unsigned int irq, struct i if (!shared) { irq_chip_set_defaults(desc->chip); + /* + * Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY: + */ + recalculate_desc_flags(desc); + #if defined(CONFIG_IRQ_PER_CPU) if (new->flags & IRQF_PERCPU) desc->status |= IRQ_PER_CPU; @@ -368,7 +397,7 @@ int setup_irq(unsigned int irq, struct i new->irq = irq; register_irq_proc(irq); - new->dir = NULL; + new->dir = new->threaded = NULL; register_handler_proc(irq, new); return 0; @@ -440,6 +469,7 @@ void free_irq(unsigned int irq, void *de else desc->chip->disable(irq); } + recalculate_desc_flags(desc); spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -563,3 +593,257 @@ int request_irq(unsigned int irq, irq_ha return retval; } EXPORT_SYMBOL(request_irq); + +#ifdef CONFIG_PREEMPT_HARDIRQS + +int hardirq_preemption = 1; + +EXPORT_SYMBOL(hardirq_preemption); + +static int __init 
hardirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + hardirq_preemption = 0; + else + get_option(&str, &hardirq_preemption); + if (!hardirq_preemption) + printk("turning off hardirq preemption!\n"); + + return 1; +} + +__setup("hardirq-preempt=", hardirq_preempt_setup); + + +/* + * threaded simple handler + */ +static void thread_simple_irq(irq_desc_t *desc) +{ + struct irqaction *action = desc->action; + unsigned int irq = desc - irq_desc; + irqreturn_t action_ret; + + if (action && !desc->depth) { + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + cond_resched_hardirq_context(); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } + desc->status &= ~IRQ_INPROGRESS; +} + +/* + * threaded level type irq handler + */ +static void thread_level_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + thread_simple_irq(desc); + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); +} + +/* + * threaded fasteoi type irq handler + */ +static void thread_fasteoi_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + thread_simple_irq(desc); + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); +} + +/* + * threaded edge type IRQ handler + */ +static void thread_edge_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->status &= ~IRQ_INPROGRESS; + desc->chip->mask(irq); + return; + } + + /* + * When another irq arrived while we were handling + * one, we could have masked the irq. + * Renable it, if it was not disabled in meantime. 
+ */ + if (unlikely(((desc->status & (IRQ_PENDING | IRQ_MASKED)) == + (IRQ_PENDING | IRQ_MASKED)) && !desc->depth)) + desc->chip->unmask(irq); + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + cond_resched_hardirq_context(); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while ((desc->status & IRQ_PENDING) && !desc->depth); + + desc->status &= ~IRQ_INPROGRESS; +} + +/* + * threaded fallback handler for all other flow types + */ +static void thread_do_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->status &= ~IRQ_INPROGRESS; + desc->chip->disable(irq); + return; + } + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + cond_resched_hardirq_context(); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while ((desc->status & IRQ_PENDING) && !desc->depth); + + desc->status &= ~IRQ_INPROGRESS; + desc->chip->end(irq); +} + +static void do_hardirq(struct irq_desc *desc) +{ + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + + if (!(desc->status & IRQ_INPROGRESS)) + goto out; + + if (desc->handle_irq == handle_simple_irq) + thread_simple_irq(desc); + else if (desc->handle_irq == handle_level_irq) + thread_level_irq(desc); + else if (desc->handle_irq == handle_fasteoi_irq) + thread_fasteoi_irq(desc); + else if (desc->handle_irq == handle_edge_irq) + thread_edge_irq(desc); + else + thread_do_irq(desc); + out: + spin_unlock_irqrestore(&desc->lock, flags); + + if (waitqueue_active(&desc->wait_for_handler)) + wake_up(&desc->wait_for_handler); +} + +extern asmlinkage void __do_softirq(void); + +static int do_irqd(void * __desc) +{ + struct sched_param param = { 0, }; + struct irq_desc *desc = __desc; + +#ifdef CONFIG_SMP + set_cpus_allowed(current, 
desc->affinity); +#endif + current->flags |= PF_NOFREEZE | PF_HARDIRQ; + + /* + * Set irq thread priority to SCHED_FIFO/50: + */ + param.sched_priority = MAX_USER_RT_PRIO/2; + + sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); + + while (!kthread_should_stop()) { + local_irq_disable(); + set_current_state(TASK_INTERRUPTIBLE); + irq_enter(); + do_hardirq(desc); + irq_exit(); + local_irq_enable(); + cond_resched(); +#ifdef CONFIG_SMP + /* + * Did IRQ affinities change? + */ + if (!cpus_equal(current->cpus_allowed, desc->affinity)) + set_cpus_allowed(current, desc->affinity); +#endif + schedule(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +static int ok_to_create_irq_threads; + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + if (desc->thread || !ok_to_create_irq_threads) + return 0; + + desc->thread = kthread_create(do_irqd, desc, "IRQ-%d", irq); + if (!desc->thread) { + printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq); + return -ENOMEM; + } + + /* + * An interrupt may have come in before the thread pointer was + * stored in desc->thread; make sure the thread gets woken up in + * such a case: + */ + smp_mb(); + wake_up_process(desc->thread); + + return 0; +} + +void __init init_hardirqs(void) +{ + int i; + ok_to_create_irq_threads = 1; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + + if (desc->action && !(desc->status & IRQ_NODELAY)) + start_irq_thread(i, desc); + } +} + +#else + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + return 0; +} + +#endif + +void __init early_init_hardirqs(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) + init_waitqueue_head(&irq_desc[i].wait_for_handler); +} Index: linux-rt.q/kernel/irq/proc.c =================================================================== --- linux-rt.q.orig/kernel/irq/proc.c +++ linux-rt.q/kernel/irq/proc.c @@ -7,6 +7,8 @@ */ #include +#include +#include #include #include @@ -67,44 +69,6 @@ static int 
irq_affinity_write_proc(struc #endif -#define MAX_NAMELEN 128 - -static int name_unique(unsigned int irq, struct irqaction *new_action) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *action; - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { - if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } - } - spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} - -void register_handler_proc(unsigned int irq, struct irqaction *action) -{ - char name [MAX_NAMELEN]; - - if (!irq_desc[irq].dir || action->dir || !action->name || - !name_unique(irq, action)) - return; - - memset(name, 0, MAX_NAMELEN); - snprintf(name, MAX_NAMELEN, "%s", action->name); - - /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); -} - -#undef MAX_NAMELEN - #define MAX_NAMELEN 10 void register_irq_proc(unsigned int irq) @@ -142,10 +106,96 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { + if (action->threaded) + remove_proc_entry(action->threaded->name, action->dir); if (action->dir) remove_proc_entry(action->dir->name, irq_desc[irq].dir); } +#ifndef CONFIG_PREEMPT_RT + +static int threaded_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return sprintf(page, "%c\n", + ((struct irqaction *)data)->flags & IRQF_NODELAY ? 
'0' : '1'); +} + +static int threaded_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + int c; + struct irqaction *action = data; + irq_desc_t *desc = irq_desc + action->irq; + + if (get_user(c, buffer)) + return -EFAULT; + if (c != '0' && c != '1') + return -EINVAL; + + spin_lock_irq(&desc->lock); + + if (c == '0') + action->flags |= IRQF_NODELAY; + if (c == '1') + action->flags &= ~IRQF_NODELAY; + recalculate_desc_flags(desc); + + spin_unlock_irq(&desc->lock); + + return count; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_desc[irq].dir || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_desc[irq].dir); + + if (!action->dir) + return; +#ifndef CONFIG_PREEMPT_RT + { + struct proc_dir_entry *entry; + /* create /proc/irq/1234/handler/threaded */ + entry = create_proc_entry("threaded", 0600, action->dir); + if (!entry) + return; + entry->nlink = 1; + entry->data = (void *)action; + entry->read_proc = threaded_read_proc; + entry->write_proc = threaded_write_proc; + action->threaded = entry; + } +#endif +} + +#undef MAX_NAMELEN + void init_irq_proc(void) { int i; @@ -155,6 +205,9 @@ void init_irq_proc(void) if (!root_irq_dir) return; + /* create /proc/irq/prof_cpu_mask */ + create_prof_cpu_mask(root_irq_dir); + /* * Create entries for all existing IRQs. 
*/ Index: linux-rt.q/kernel/irq/spurious.c =================================================================== --- linux-rt.q.orig/kernel/irq/spurious.c +++ linux-rt.q/kernel/irq/spurious.c @@ -10,6 +10,10 @@ #include #include #include +#ifdef CONFIG_X86_IO_APIC +# include +# include +#endif static int irqfixup __read_mostly; @@ -193,6 +197,12 @@ void note_interrupt(unsigned int irq, st * The interrupt is stuck */ __report_bad_irq(irq, desc, action_ret); +#ifdef CONFIG_X86_IO_APIC + if (!sis_apic_bug) { + sis_apic_bug = 1; + printk(KERN_ERR "turning off IO-APIC fast mode.\n"); + } +#else /* * Now kill the IRQ */ @@ -200,6 +210,7 @@ void note_interrupt(unsigned int irq, st desc->status |= IRQ_DISABLED; desc->depth = 1; desc->chip->disable(irq); +#endif } desc->irqs_unhandled = 0; } Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -3182,9 +3182,9 @@ void account_system_time(struct task_str /* Add system time to cpustat. 
*/ tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) + if (hardirq_count() - hardirq_offset || (p->flags & PF_HARDIRQ)) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (softirq_count() || (p->flags & PF_SOFTIRQ)) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else if (p != rq->idle) cpustat->system = cputime64_add(cpustat->system, tmp); @@ -3431,7 +3431,7 @@ asmlinkage void __sched preempt_schedule int saved_lock_depth; #endif /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + WARN_ON_ONCE(ti->preempt_count || !irqs_disabled()); need_resched: add_preempt_count(PREEMPT_ACTIVE); @@ -3694,7 +3694,6 @@ out: } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - #define SLEEP_ON_VAR \ unsigned long flags; \ wait_queue_t wait; \ @@ -4470,9 +4469,12 @@ int cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(cond_resched_lock); +/* + * Voluntarily preempt a process context that has softirqs disabled: + */ int __sched cond_resched_softirq(void) { - BUG_ON(!in_softirq()); + WARN_ON_ONCE(!in_softirq()); if (need_resched() && system_state == SYSTEM_RUNNING) { local_bh_enable(); @@ -4484,6 +4486,46 @@ int __sched cond_resched_softirq(void) } EXPORT_SYMBOL(cond_resched_softirq); +/* + * Voluntarily preempt a softirq context (possible with softirq threading): + */ +int __sched cond_resched_softirq_context(void) +{ + WARN_ON_ONCE(!in_softirq()); + + if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_softirq_context); + +/* + * Preempt a hardirq context if necessary (possible with hardirq threading): + */ +int cond_resched_hardirq_context(void) +{ + WARN_ON_ONCE(!in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (hardirq_need_resched()) { + irq_exit(); + 
local_irq_enable(); + __cond_resched(); + local_irq_disable(); + __irq_enter(); + + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_hardirq_context); + /** * yield - yield the current processor to other threads. * Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -4,9 +4,15 @@ * Copyright (C) 1992 Linus Torvalds * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Softirq-split implementation by + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar */ #include +#include +#include +#include #include #include #include @@ -45,7 +51,41 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +struct softirqdata { + int nr; + unsigned long cpu; + struct task_struct *tsk; +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; + int running; +#endif +}; + +static DEFINE_PER_CPU(struct softirqdata [MAX_SOFTIRQ], ksoftirqd); + +#ifdef CONFIG_PREEMPT_SOFTIRQS +/* + * Preempting the softirq causes cases that would not be a + * problem when the softirq is not preempted. That is a + * process may have code to spin while waiting for a softirq + * to finish on another CPU. But if it happens that the + * process has preempted the softirq, this could cause a + * deadlock. 
+ */ +void wait_for_softirq(int softirq) +{ + struct softirqdata *data = &__get_cpu_var(ksoftirqd)[softirq]; + if (data->running) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&data->wait, &wait); + if (data->running) + schedule(); + remove_wait_queue(&data->wait, &wait); + __set_current_state(TASK_RUNNING); + } +} +#endif /* * we cannot loop indefinitely here to avoid userspace starvation, @@ -53,16 +93,32 @@ static DEFINE_PER_CPU(struct task_struct * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -static inline void wakeup_softirqd(void) +static void wakeup_softirqd(int softirq) { /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); + struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; if (tsk && tsk->state != TASK_RUNNING) wake_up_process(tsk); } /* + * Wake up the softirq threads which have work + */ +static void trigger_softirqs(void) +{ + u32 pending = local_softirq_pending(); + int curr = 0; + + while (pending) { + if (pending & 1) + wakeup_softirqd(curr); + pending >>= 1; + curr++; + } +} + +/* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ @@ -97,20 +153,6 @@ void local_bh_disable(void) EXPORT_SYMBOL(local_bh_disable); -void __local_bh_enable(void) -{ - WARN_ON_ONCE(in_irq()); - - /* - * softirqs should never be enabled by __local_bh_enable(), - * it always nests inside local_bh_enable() sections: - */ - WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); - - sub_preempt_count(SOFTIRQ_OFFSET); -} -EXPORT_SYMBOL_GPL(__local_bh_enable); - /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), @@ -204,7 +246,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 -asmlinkage void __do_softirq(void) +asmlinkage void ___do_softirq(void) { struct softirq_action *h; __u32 pending; @@ -214,9 +256,6 @@ 
asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); - cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ @@ -228,8 +267,17 @@ restart: do { if (pending & 1) { - h->action(h); + { + u32 preempt_count = preempt_count(); + h->action(h); + if (preempt_count != preempt_count()) { + print_symbol("BUG: softirq exited %s with wrong preemption count!\n", (unsigned long) h->action); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; + } + } rcu_bh_qsctr_inc(cpu); + cond_resched_softirq_context(); } h++; pending >>= 1; @@ -242,12 +290,69 @@ restart: goto restart; if (pending) - wakeup_softirqd(); + trigger_softirqs(); +} + +asmlinkage void __do_softirq(void) +{ + unsigned long p_flags; + +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * 'preempt harder'. Push all softirq processing off to ksoftirqd. + */ + if (softirq_preemption) { + if (local_softirq_pending()) + trigger_softirqs(); + return; + } +#endif + /* + * 'immediate' softirq execution: + */ + __local_bh_disable((unsigned long)__builtin_return_address(0)); + trace_softirq_enter(); + p_flags = current->flags & PF_HARDIRQ; + current->flags &= ~PF_HARDIRQ; + + ___do_softirq(); + + trace_softirq_exit(); + + account_system_vtime(current); + _local_bh_enable(); + + current->flags |= p_flags; +} + +/* + * Process softirqs straight from hardirq context, + * without having to switch to a softirq thread. + * This can reduce the context-switch rate. + * + * NOTE: this is unused right now. 
+ */ +void do_softirq_from_hardirq(void) +{ + unsigned long p_flags; + + if (!local_softirq_pending()) + return; + /* + * 'immediate' softirq execution: + */ + __local_bh_disable((unsigned long)__builtin_return_address(0)); + p_flags = current->flags & PF_HARDIRQ; + current->flags &= ~PF_HARDIRQ; + + ___do_softirq(); trace_softirq_exit(); account_system_vtime(current); _local_bh_enable(); + + current->flags |= p_flags; } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -316,19 +421,9 @@ void irq_exit(void) */ inline fastcall void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + __do_raise_softirq_irqoff(nr); - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); + wakeup_softirqd(nr); } EXPORT_SYMBOL(raise_softirq_irqoff); @@ -413,7 +508,7 @@ static void tasklet_action(struct softir local_irq_disable(); t->next = __get_cpu_var(tasklet_vec).list; __get_cpu_var(tasklet_vec).list = t; - __raise_softirq_irqoff(TASKLET_SOFTIRQ); + __do_raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_enable(); } } @@ -446,7 +541,7 @@ static void tasklet_hi_action(struct sof local_irq_disable(); t->next = __get_cpu_var(tasklet_hi_vec).list; __get_cpu_var(tasklet_hi_vec).list = t; - __raise_softirq_irqoff(HI_SOFTIRQ); + __do_raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); } } @@ -486,15 +581,26 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } -static int ksoftirqd(void * __bind_cpu) +static int ksoftirqd(void * __data) { - current->flags |= PF_NOFREEZE; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 }; + struct softirqdata *data = __data; + u32 mask = (1 << data->nr); + struct softirq_action *h; + + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; + +#ifdef 
CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&data->wait); +#endif + + sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) { + if (!(local_softirq_pending() & mask)) { /* sleep unless this thread's own softirq bit is pending */ preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -502,19 +608,37 @@ static int ksoftirqd(void * __bind_cpu) __set_current_state(TASK_RUNNING); - while (local_softirq_pending()) { + while (local_softirq_pending() & mask) { /* Preempt disable stops cpu going offline. If already offline, we'll be on wrong CPU: don't process */ - if (cpu_is_offline((long)__bind_cpu)) + if (cpu_is_offline(data->cpu)) goto wait_to_die; - do_softirq(); + + local_irq_disable(); preempt_enable_no_resched(); + set_softirq_pending(local_softirq_pending() & ~mask); + local_bh_disable(); + local_irq_enable(); + + h = &softirq_vec[data->nr]; + if (h) + h->action(h); + rcu_bh_qsctr_inc(data->cpu); + + local_irq_disable(); + _local_bh_enable(); + local_irq_enable(); + cond_resched(); preempt_disable(); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); +#ifdef CONFIG_PREEMPT_SOFTIRQS + data->running = 0; + wake_up(&data->wait); +#endif } __set_current_state(TASK_RUNNING); return 0; @@ -561,7 +685,7 @@ void tasklet_kill_immediate(struct taskl BUG(); } -static void takeover_tasklets(unsigned int cpu) +void takeover_tasklets(unsigned int cpu) { struct tasklet_struct **i; @@ -583,45 +707,73 @@ static void takeover_tasklets(unsigned i } #endif /* CONFIG_HOTPLUG_CPU */ +static const char *softirq_names [] = +{ + [HI_SOFTIRQ] = "high", + [SCHED_SOFTIRQ] = "sched", + [TIMER_SOFTIRQ] = "timer", + [NET_TX_SOFTIRQ] = "net-tx", + [NET_RX_SOFTIRQ] = "net-rx", + [BLOCK_SOFTIRQ] = "block", + [TASKLET_SOFTIRQ] = "tasklet", +#ifdef CONFIG_HIGH_RES_TIMERS + [HRTIMER_SOFTIRQ] = "hrtimer", +#endif + [RCU_SOFTIRQ] = "rcu", +}; + static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long
action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; + int hotcpu = (unsigned long)hcpu, i; struct task_struct *p; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + for (i = 0; i < MAX_SOFTIRQ; i++) { + per_cpu(ksoftirqd, hotcpu)[i].nr = i; + per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; + p = kthread_create(ksoftirqd, + &per_cpu(ksoftirqd, hotcpu)[i], + "softirq-%s/%d", softirq_names[i], + hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd %d for %i failed\n", i, + hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd, hotcpu)[i].tsk = p; } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; + break; + break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); + for (i = 0; i < MAX_SOFTIRQ; i++) + wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. 
*/ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); + for (i = 0; i < MAX_SOFTIRQ; i++) { + if (!per_cpu(ksoftirqd, hotcpu)[i].tsk) + continue; + kthread_bind(per_cpu(ksoftirqd, hotcpu)[i].tsk, + any_online_cpu(cpu_online_map)); + } case CPU_DEAD: case CPU_DEAD_FROZEN: - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - kthread_stop(p); + for (i = 0; i < MAX_SOFTIRQ; i++) { + p = per_cpu(ksoftirqd, hotcpu)[i].tsk; + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + kthread_stop(p); + } takeover_tasklets(hotcpu); break; #endif /* CONFIG_HOTPLUG_CPU */ - } + } return NOTIFY_OK; } @@ -640,6 +792,29 @@ __init int spawn_ksoftirqd(void) return 0; } + +#ifdef CONFIG_PREEMPT_SOFTIRQS + +int softirq_preemption = 1; + +EXPORT_SYMBOL(softirq_preemption); + +static int __init softirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + softirq_preemption = 0; + else + get_option(&str, &softirq_preemption); + if (!softirq_preemption) + printk("turning off softirq preemption!\n"); + + return 1; +} + +__setup("softirq-preempt=", softirq_preempt_setup); + +#endif + #ifdef CONFIG_SMP /* * Call a function on all processors patches/loopback-revert.patch0000664000077200007720000000217010646635213015647 0ustar mingomingo revert this commit: commit 58f539740b1ccfc5ef4e509ec2efe82621b546e3 Author: Eric Dumazet Date: Fri Oct 20 00:32:41 2006 -0700 [NET]: Can use __get_cpu_var() instead of per_cpu() in loopback driver. As BHs are off in loopback_xmit(), preemption cannot occurs, so we can use __get_cpu_var() instead of per_cpu() (and avoid a preempt_enable()/preempt_disable() pair) Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/loopback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/drivers/net/loopback.c =================================================================== --- linux-rt.q.orig/drivers/net/loopback.c +++ linux-rt.q/drivers/net/loopback.c @@ -154,10 +154,10 @@ static int loopback_xmit(struct sk_buff #endif dev->last_rx = jiffies; - /* it's OK to use __get_cpu_var() because BHs are off */ - lb_stats = &__get_cpu_var(pcpu_lstats); + lb_stats = &per_cpu(pcpu_lstats, get_cpu()); lb_stats->bytes += skb->len; lb_stats->packets++; + put_cpu(); netif_rx(skb); patches/preempt-irqs-ppc-celleb-beatic-eoi.patch0000664000077200007720000000674210646635214021215 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Tue May 15 17:44:07 2007 Date: Tue, 15 May 2007 17:44:07 +0900 From: Tsutomu OWA To: linuxppc-dev@ozlabs.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [RFC] [patch 1/2] powerpc 2.6.21-rt1: fix kernel hang and/or panic > It occurs on 2.6.21 + patch-2.6.21-rt1 + series of patches that I posted > yesterday. When doing 'hdparm -t /dev/hda' several times, it silently hangs. I think it freezes since It does not response to ping as well. On the other hand, PREEMPT_NONE kernel works just fine. After looking into the rt interrupt handling code, I noticed that code path differs between PREEMPT_NONE and PREEMPT_RT; NONE: mask() -> unmask() -> eoi() RT: mask() -> eoi() -> unmask() The hypervisor underlying the linux on Celleb wants to be called in this "mask() -> unmask() -> eoi()" order. This patch mimics the behavior of PREEPT_NONE even if PREEMPT_RT is specified. Or, would it be better to create/add a new (threaded) irq handler? Any comments? 
Thanks in advance Signed-off-by: Tsutomu OWA -- owa --- arch/powerpc/platforms/celleb/interrupt.c | 39 +++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 6 deletions(-) Index: linux-rt.q/arch/powerpc/platforms/celleb/interrupt.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/celleb/interrupt.c +++ linux-rt.q/arch/powerpc/platforms/celleb/interrupt.c @@ -29,6 +29,10 @@ #include "interrupt.h" #include "beat_wrapper.h" +#ifdef CONFIG_PREEMPT_HARDIRQS +extern int hardirq_preemption; +#endif /* CONFIG_PREEMPT_HARDIRQS */ + #define MAX_IRQS NR_IRQS static DEFINE_SPINLOCK(beatic_irq_mask_lock); static uint64_t beatic_irq_mask_enable[(MAX_IRQS+255)/64]; @@ -71,12 +75,35 @@ static void beatic_mask_irq(unsigned int spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } +static void __beatic_eoi_irq(unsigned int irq_plug) +{ + s64 err; + + if ((err = beat_downcount_of_interrupt(irq_plug)) != 0) { + if ((err & 0xFFFFFFFF) != 0xFFFFFFF5) /* -11: wrong state */ + panic("Failed to downcount IRQ! 
Error = %16lx", err); + + printk(KERN_ERR "IRQ over-downcounted, plug %d\n", irq_plug); + } +} + static void beatic_unmask_irq(unsigned int irq_plug) { unsigned long flags; +#ifdef CONFIG_PREEMPT_HARDIRQS + if (hardirq_preemption) + __beatic_eoi_irq(irq_plug); +#endif /* CONFIG_PREEMPT_HARDIRQS */ + spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_enable[irq_plug/64] |= 1UL << (63 - (irq_plug%64)); + +#ifdef CONFIG_PREEMPT_HARDIRQS + if (hardirq_preemption) + beatic_irq_mask_ack[irq_plug/64] |= 1UL << (63 - (irq_plug%64)); +#endif /* CONFIG_PREEMPT_HARDIRQS */ + beatic_update_irq_mask(irq_plug); spin_unlock_irqrestore(&beatic_irq_mask_lock, flags); } @@ -93,15 +120,15 @@ static void beatic_ack_irq(unsigned int static void beatic_end_irq(unsigned int irq_plug) { - s64 err; unsigned long flags; - if ((err = beat_downcount_of_interrupt(irq_plug)) != 0) { - if ((err & 0xFFFFFFFF) != 0xFFFFFFF5) /* -11: wrong state */ - panic("Failed to downcount IRQ! Error = %16lx", err); +#ifdef CONFIG_PREEMPT_HARDIRQS + if (hardirq_preemption) + return; +#endif /* CONFIG_PREEMPT_HARDIRQS */ + + __beatic_eoi_irq(irq_plug); - printk(KERN_ERR "IRQ over-downcounted, plug %d\n", irq_plug); - } spin_lock_irqsave(&beatic_irq_mask_lock, flags); beatic_irq_mask_ack[irq_plug/64] |= 1UL << (63 - (irq_plug%64)); beatic_update_irq_mask(irq_plug); patches/realtime-lsm.patch0000664000077200007720000001371610646635216015156 0ustar mingomingo--- security/Kconfig | 9 +++ security/Makefile | 1 security/realcap.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+) Index: linux-rt.q/security/Kconfig =================================================================== --- linux-rt.q.orig/security/Kconfig +++ linux-rt.q/security/Kconfig @@ -80,6 +80,15 @@ config SECURITY_CAPABILITIES This enables the "default" Linux capabilities functionality. If you are unsure how to answer this question, answer Y. 
+config REALTIME_CAPABILITIES + tristate "Real-Time LSM (Obsolete)" + depends on SECURITY && EXPERIMENTAL + help + This is an obsolete LSM - use newer PAM and rt-limits + to manage your real-time apps. + + If you are unsure how to answer this question, answer N. + config SECURITY_ROOTPLUG tristate "Root Plug Support" depends on USB && SECURITY Index: linux-rt.q/security/Makefile =================================================================== --- linux-rt.q.orig/security/Makefile +++ linux-rt.q/security/Makefile @@ -15,4 +15,5 @@ obj-$(CONFIG_SECURITY) += security.o d # Must precede capability.o in order to stack properly. obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o obj-$(CONFIG_SECURITY_CAPABILITIES) += commoncap.o capability.o +obj-$(CONFIG_REALTIME_CAPABILITIES) += commoncap.o realcap.o obj-$(CONFIG_SECURITY_ROOTPLUG) += commoncap.o root_plug.o Index: linux-rt.q/security/realcap.c =================================================================== --- /dev/null +++ linux-rt.q/security/realcap.c @@ -0,0 +1,144 @@ +/* + * Realtime Capabilities Linux Security Module + * + * Copyright (C) 2003 Torben Hohn + * Copyright (C) 2003, 2004 Jack O'Quin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include +#include + +#define RT_LSM "Realtime LSM " /* syslog module name prefix */ +#define RT_ERR "Realtime: " /* syslog error message prefix */ + +/* module parameters + * + * These values could change at any time due to some process writing + * a new value in /sys/module/realtime/parameters. This is OK, + * because each is referenced only once in each function call. + * Nothing depends on parameters having the same value every time.
+ */ + +/* if TRUE, any process is realtime */ +static int rt_any; +module_param_named(any, rt_any, int, 0644); +MODULE_PARM_DESC(any, " grant realtime privileges to any process."); + +/* realtime group id, or NO_GROUP */ +static int rt_gid = -1; +module_param_named(gid, rt_gid, int, 0644); +MODULE_PARM_DESC(gid, " the group ID with access to realtime privileges."); + +/* enable mlock() privileges */ +static int rt_mlock = 1; +module_param_named(mlock, rt_mlock, int, 0644); +MODULE_PARM_DESC(mlock, " enable memory locking privileges."); + +/* helper function for testing group membership */ +static inline int gid_ok(int gid) +{ + if (gid == -1) + return 0; + + if (gid == current->gid) + return 1; + + return in_egroup_p(gid); +} + +static void realtime_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +{ + cap_bprm_apply_creds(bprm, unsafe); + + /* If a non-zero `any' parameter was specified, we grant + * realtime privileges to every process. If the `gid' + * parameter was specified and it matches the group id of the + * executable, of the current process or any supplementary + * groups, we grant realtime capabilities.
+ */ + + if (rt_any || gid_ok(rt_gid)) { + cap_raise(current->cap_effective, CAP_SYS_NICE); + if (rt_mlock) { + cap_raise(current->cap_effective, CAP_IPC_LOCK); + cap_raise(current->cap_effective, CAP_SYS_RESOURCE); + } + } +} + +static struct security_operations capability_ops = { + .ptrace = cap_ptrace, + .capget = cap_capget, + .capset_check = cap_capset_check, + .capset_set = cap_capset_set, + .capable = cap_capable, + .netlink_send = cap_netlink_send, + .netlink_recv = cap_netlink_recv, + .bprm_apply_creds = realtime_bprm_apply_creds, + .bprm_set_security = cap_bprm_set_security, + .bprm_secureexec = cap_bprm_secureexec, + .task_post_setuid = cap_task_post_setuid, + .task_reparent_to_init = cap_task_reparent_to_init, + .syslog = cap_syslog, + .vm_enough_memory = cap_vm_enough_memory, +}; + +#define MY_NAME __stringify(KBUILD_MODNAME) + +static int secondary; /* flag to keep track of how we were registered */ + +static int __init realtime_init(void) +{ + /* register ourselves with the security framework */ + if (register_security(&capability_ops)) { + + /* try registering with primary module */ + if (mod_reg_security(MY_NAME, &capability_ops)) { + printk(KERN_INFO RT_ERR "Failure registering " + "capabilities with primary security module.\n"); + printk(KERN_INFO RT_ERR "Is kernel configured " + "with CONFIG_SECURITY_CAPABILITIES=m?\n"); + return -EINVAL; + } + secondary = 1; + } + + if (rt_any) + printk(KERN_INFO RT_LSM + "initialized (all groups, mlock=%d)\n", rt_mlock); + else if (rt_gid == -1) + printk(KERN_INFO RT_LSM + "initialized (no groups, mlock=%d)\n", rt_mlock); + else + printk(KERN_INFO RT_LSM + "initialized (group %d, mlock=%d)\n", rt_gid, rt_mlock); + + return 0; +} + +static void __exit realtime_exit(void) +{ + /* remove ourselves from the security framework */ + if (secondary) { + if (mod_unreg_security(MY_NAME, &capability_ops)) + printk(KERN_INFO RT_ERR "Failure unregistering " + "capabilities with primary module.\n"); + + } else if 
(unregister_security(&capability_ops)) { + printk(KERN_INFO RT_ERR + "Failure unregistering capabilities with the kernel\n"); + } + printk(KERN_INFO "Realtime Capability LSM exiting\n"); +} + +late_initcall(realtime_init); +module_exit(realtime_exit); + +MODULE_DESCRIPTION("Realtime Capabilities Security Module"); +MODULE_LICENSE("GPL"); patches/latency-tracing-arm.patch0000664000077200007720000002743710646635212016425 0ustar mingomingo arch/arm/boot/compressed/head.S | 12 ++++ arch/arm/kernel/entry-common.S | 109 ++++++++++++++++++++++++++++++++++++++++ arch/arm/kernel/fiq.c | 4 - arch/arm/kernel/irq.c | 5 + arch/arm/kernel/traps.c | 1 arch/arm/mm/copypage-v4mc.c | 4 - arch/arm/mm/copypage-xscale.c | 4 - arch/arm/mm/fault.c | 14 ++--- include/asm-arm/pgalloc.h | 4 - include/asm-arm/timex.h | 10 +++ include/asm-arm/unistd.h | 4 + 11 files changed, 154 insertions(+), 17 deletions(-) Index: linux-rt.q/arch/arm/boot/compressed/head.S =================================================================== --- linux-rt.q.orig/arch/arm/boot/compressed/head.S +++ linux-rt.q/arch/arm/boot/compressed/head.S @@ -836,6 +836,18 @@ memdump: mov r12, r0 mov pc, r10 #endif +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + .ltorg reloc_end: Index: linux-rt.q/arch/arm/kernel/entry-common.S =================================================================== --- linux-rt.q.orig/arch/arm/kernel/entry-common.S +++ linux-rt.q/arch/arm/kernel/entry-common.S @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * FUNCTION_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
@@ -395,5 +397,112 @@ ENTRY(sys_oabi_call_table) #undef ABI #undef OBSOLETE +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc,lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl __trace +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). 
No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. + */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr + +#endif + #endif Index: linux-rt.q/arch/arm/kernel/fiq.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/fiq.c +++ linux-rt.q/arch/arm/kernel/fiq.c @@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (&regs->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux-rt.q/arch/arm/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/irq.c +++ linux-rt.q/arch/arm/kernel/irq.c @@ -108,11 +108,14 @@ static struct irq_desc bad_irq_desc = { * come via this function.
Instead, they should provide their * own 'handler' */ -asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage void __exception notrace asm_do_IRQ(unsigned int irq, + struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. Index: linux-rt.q/arch/arm/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/traps.c +++ linux-rt.q/arch/arm/kernel/traps.c @@ -182,6 +182,7 @@ static void dump_backtrace(struct pt_reg void dump_stack(void) { __backtrace(); + print_traces(current); } EXPORT_SYMBOL(dump_stack); Index: linux-rt.q/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/copypage-v4mc.c +++ linux-rt.q/arch/arm/mm/copypage-v4mc.c @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -88,7 +88,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-rt.q/arch/arm/mm/copypage-xscale.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/copypage-xscale.c +++ linux-rt.q/arch/arm/mm/copypage-xscale.c @@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. 
*/ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -110,7 +110,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-rt.q/arch/arm/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/fault.c +++ linux-rt.q/arch/arm/mm/fault.c @@ -215,7 +215,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -315,7 +315,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -358,7 +358,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { do_bad_area(addr, fsr, regs); @@ -368,7 +368,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -423,7 +423,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -437,7 +437,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. 
*/ -asmlinkage void __exception +asmlinkage void __exception notrace do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -456,7 +456,7 @@ do_DataAbort(unsigned long addr, unsigne arm_notify_die("", regs, &info, fsr, 0); } -asmlinkage void __exception +asmlinkage void __exception notrace do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux-rt.q/include/asm-arm/pgalloc.h =================================================================== --- linux-rt.q.orig/include/asm-arm/pgalloc.h +++ linux-rt.q/include/asm-arm/pgalloc.h @@ -109,7 +109,7 @@ static inline void __pmd_populate(pmd_t * * Ensure that we always set both PMD entries. */ -static inline void +static inline void notrace pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { unsigned long pte_ptr = (unsigned long)ptep; @@ -122,7 +122,7 @@ pmd_populate_kernel(struct mm_struct *mm __pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE); } -static inline void +static inline void notrace pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep) { __pmd_populate(pmdp, page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE); Index: linux-rt.q/include/asm-arm/timex.h =================================================================== --- linux-rt.q.orig/include/asm-arm/timex.h +++ linux-rt.q/include/asm-arm/timex.h @@ -16,9 +16,17 @@ typedef unsigned long cycles_t; +#ifndef mach_read_cycles + #define mach_read_cycles() (0) +#ifdef CONFIG_LATENCY_TIMING + #define mach_cycles_to_usecs(d) (d) + #define mach_usecs_to_cycles(d) (d) +#endif +#endif + static inline cycles_t get_cycles (void) { - return 0; + return mach_read_cycles(); } #endif Index: linux-rt.q/include/asm-arm/unistd.h =================================================================== --- linux-rt.q.orig/include/asm-arm/unistd.h +++ linux-rt.q/include/asm-arm/unistd.h @@ 
-379,6 +379,10 @@ #define __NR_timerfd (__NR_SYSCALL_BASE+350) #define __NR_eventfd (__NR_SYSCALL_BASE+351) +#ifndef __ASSEMBLY__ +#define NR_syscalls (__NR_eventfd + 1 - __NR_SYSCALL_BASE) +#endif + /* * The following SWIs are ARM private. */ patches/preempt-realtime-powerpc-celleb-raw-spinlocks.patch0000664000077200007720000000314010646635215023517 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Mon May 14 15:28:23 2007 Date: Mon, 14 May 2007 15:28:23 +0900 From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [patch 2/4] powerpc 2.6.21-rt1: convert spinlocks to raw ones for Celleb. Convert more spinlocks to raw ones for Celleb. Signed-off-by: Tsutomu OWA -- owa --- arch/powerpc/platforms/celleb/htab.c | 2 +- arch/powerpc/platforms/celleb/interrupt.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/powerpc/platforms/celleb/htab.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/celleb/htab.c +++ linux-rt.q/arch/powerpc/platforms/celleb/htab.c @@ -40,7 +40,7 @@ #define DBG_LOW(fmt...) 
do { } while(0) #endif -static DEFINE_SPINLOCK(beat_htab_lock); +static DEFINE_RAW_SPINLOCK(beat_htab_lock); static inline unsigned int beat_read_mask(unsigned hpte_group) { Index: linux-rt.q/arch/powerpc/platforms/celleb/interrupt.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/celleb/interrupt.c +++ linux-rt.q/arch/powerpc/platforms/celleb/interrupt.c @@ -34,7 +34,7 @@ extern int hardirq_preemption; #endif /* CONFIG_PREEMPT_HARDIRQS */ #define MAX_IRQS NR_IRQS -static DEFINE_SPINLOCK(beatic_irq_mask_lock); +static DEFINE_RAW_SPINLOCK(beatic_irq_mask_lock); static uint64_t beatic_irq_mask_enable[(MAX_IRQS+255)/64]; static uint64_t beatic_irq_mask_ack[(MAX_IRQS+255)/64]; patches/floppy-resume-fix.patch0000664000077200007720000000416110646635211016143 0ustar mingomingoSubject: [patch] floppy: suspend/resume fix From: Ingo Molnar introduce a floppy platform-driver and suspend/resume ops to stop/start the floppy driver. Bug reported by Mikael Pettersson. 
Signed-off-by: Ingo Molnar --- drivers/block/floppy.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) Index: linux-rt.q/drivers/block/floppy.c =================================================================== --- linux-rt.q.orig/drivers/block/floppy.c +++ linux-rt.q/drivers/block/floppy.c @@ -4157,6 +4157,28 @@ static void floppy_device_release(struct complete(&device_release); } +static int floppy_suspend(struct platform_device *dev, pm_message_t state) +{ + floppy_release_irq_and_dma(); + + return 0; +} + +static int floppy_resume(struct platform_device *dev) +{ + floppy_grab_irq_and_dma(); + + return 0; +} + +static struct platform_driver floppy_driver = { + .suspend = floppy_suspend, + .resume = floppy_resume, + .driver = { + .name = "floppy", + }, +}; + static struct platform_device floppy_device[N_DRIVE]; static struct kobject *floppy_find(dev_t dev, int *part, void *data) @@ -4205,10 +4227,14 @@ static int __init floppy_init(void) if (err) goto out_put_disk; + err = platform_driver_register(&floppy_driver); + if (err) + goto out_unreg_blkdev; + floppy_queue = blk_init_queue(do_fd_request, &floppy_lock); if (!floppy_queue) { err = -ENOMEM; - goto out_unreg_blkdev; + goto out_unreg_driver; } blk_queue_max_sectors(floppy_queue, 64); @@ -4357,6 +4383,8 @@ out_flush_work: out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); blk_cleanup_queue(floppy_queue); +out_unreg_driver: + platform_driver_unregister(&floppy_driver); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: @@ -4548,6 +4576,7 @@ void cleanup_module(void) init_completion(&device_release); blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); unregister_blkdev(FLOPPY_MAJOR, "fd"); + platform_driver_unregister(&floppy_driver); for (drive = 0; drive < N_DRIVE; drive++) { del_timer_sync(&motor_off_timer[drive]); patches/sched-cfs-latest.patch0000664000077200007720000000525310646635210015703 0ustar mingomingo--- kernel/sched.c | 22 
++++++++-------------- kernel/sched_debug.c | 2 +- 2 files changed, 9 insertions(+), 15 deletions(-) Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -750,7 +750,7 @@ static const u32 prio_to_wmult[40] = { 184467, 230589, 288233, 360285, 450347, 562979, 703746, 879575, 1099582, 1374389, 717986, 2147483, 2684354, 3355443, 4194304, - 244160, 6557201, 8196502, 10250518, 12782640, + 5244160, 6557201, 8196502, 10250518, 12782640, 16025997, 19976592, 24970740, 31350126, 39045157, 49367440, 61356675, 76695844, 95443717, 119304647, 148102320, 186737708, 238609294, 286331153, @@ -4657,11 +4657,7 @@ static void show_task(struct task_struct free = (unsigned long)n - (unsigned long)end_of_stack(p); } #endif - printk("%5lu %5d %6d", free, p->pid, p->parent->pid); - if (!p->mm) - printk(" (L-TLB)\n"); - else - printk(" (NOTLB)\n"); + printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); if (state != TASK_RUNNING) show_stack(p, NULL); @@ -4671,14 +4667,12 @@ void show_state_filter(unsigned long sta { struct task_struct *g, *p; -#if (BITS_PER_LONG == 32) - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); #else - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -4769,7 +4763,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 10000000; + const unsigned long gran_limit = 100000000; sysctl_sched_granularity *= factor; if (sysctl_sched_granularity > gran_limit) Index: linux-rt.q/kernel/sched_debug.c 
=================================================================== --- linux-rt.q.orig/kernel/sched_debug.c +++ linux-rt.q/kernel/sched_debug.c @@ -173,7 +173,7 @@ static int sched_debug_show(struct seq_f u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v19, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v20, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); patches/preempt-realtime-ppc-need-resched-delayed.patch0000664000077200007720000000214010646635214022533 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Mon May 14 15:29:17 2007 Date: Mon, 14 May 2007 15:29:17 +0900 From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [patch 3/4] powerpc 2.6.21-rt1: add a need_resched_delayed() check Add a need_resched_delayed() check. This was pointed by Sergei Shtylyov; http://ozlabs.org/pipermail/linuxppc-dev/2007-March/033148.html Signed-off-by: Tsutomu Owa -- owa --- arch/powerpc/kernel/idle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/powerpc/kernel/idle.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/idle.c +++ linux-rt.q/arch/powerpc/kernel/idle.c @@ -75,7 +75,9 @@ void cpu_idle(void) local_irq_disable(); /* check again after disabling irqs */ - if (!need_resched() && !cpu_should_die()) + if (!need_resched() && + !need_resched_delayed() && + !cpu_should_die()) ppc_md.power_save(); local_irq_enable(); patches/nf_conntrack-fix-smp-processor-id.patch0000664000077200007720000000126110646635216021210 0ustar mingomingo--- include/net/netfilter/nf_conntrack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/include/net/netfilter/nf_conntrack.h =================================================================== --- linux-rt.q.orig/include/net/netfilter/nf_conntrack.h +++ 
linux-rt.q/include/net/netfilter/nf_conntrack.h @@ -260,7 +260,7 @@ DECLARE_PER_CPU(struct ip_conntrack_stat #define NF_CT_STAT_INC_ATOMIC(count) \ do { \ local_bh_disable(); \ - __get_cpu_var(nf_conntrack_stat).count++; \ + __raw_get_cpu_var(nf_conntrack_stat).count++; \ local_bh_enable(); \ } while (0) #define NF_CT_STAT_INC(count) (__raw_get_cpu_var(nf_conntrack_stat).count++) patches/x86_64-use-generic-xtime-init.patch0000664000077200007720000000462210646635210020000 0ustar mingomingoSubject: x86_64: Use generic xtime init xtime can be initialized including the cmos update from the generic timekeeping code. Remove the arch specific implementation. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/time.c | 40 +--------------------------------------- 1 file changed, 1 insertion(+), 39 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/time.c +++ linux-rt.q/arch/x86_64/kernel/time.c @@ -194,7 +194,7 @@ static irqreturn_t timer_interrupt(int i return IRQ_HANDLED; } -static unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned int year, mon, day, hour, min, sec; unsigned long flags; @@ -368,11 +368,6 @@ void __init time_init(void) { if (nohpet) hpet_address = 0; - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = 0; - - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); if (hpet_arch_init()) hpet_address = 0; @@ -410,54 +405,21 @@ void __init time_init(void) setup_irq(0, &irq0); } - -static long clock_cmos_diff; -static unsigned long sleep_start; - /* * sysfs support for the timer. 
*/ static int timer_suspend(struct sys_device *dev, pm_message_t state) { - /* - * Estimate time zone so that set_time can update the clock - */ - long cmos_time = get_cmos_time(); - - clock_cmos_diff = -cmos_time; - clock_cmos_diff += get_seconds(); - sleep_start = cmos_time; return 0; } static int timer_resume(struct sys_device *dev) { - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - - if (sleep_length < 0) { - printk(KERN_WARNING "Time skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } if (hpet_address) hpet_reenable(); else i8254_timer_resume(); - - sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock,flags); - touch_softlockup_watchdog(); return 0; } patches/s_files-schedule_on_each_cpu_wq.patch0000664000077200007720000000610010646635216021020 0ustar mingomingo--- include/linux/workqueue.h | 1 kernel/workqueue.c | 66 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 11 deletions(-) Index: linux-rt.q/include/linux/workqueue.h =================================================================== --- linux-rt.q.orig/include/linux/workqueue.h +++ linux-rt.q/include/linux/workqueue.h @@ -144,6 +144,7 @@ extern int FASTCALL(schedule_delayed_wor unsigned long delay)); extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay); +extern int schedule_on_each_cpu_wq(struct workqueue_struct *wq, work_func_t func); extern int schedule_on_each_cpu(work_func_t func); extern int current_is_keventd(void); extern int keventd_up(void); Index: linux-rt.q/kernel/workqueue.c =================================================================== --- 
linux-rt.q.orig/kernel/workqueue.c +++ linux-rt.q/kernel/workqueue.c @@ -240,6 +240,20 @@ int queue_delayed_work_on(int cpu, struc } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +static void leak_check(void *func) +{ + if (!in_atomic() && lockdep_depth(current) <= 0) + return; + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), + current->pid); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)func); + debug_show_held_locks(current); + dump_stack(); +} + static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -261,18 +275,10 @@ static void run_workqueue(struct cpu_wor BUG_ON(get_wq_data(work) != cwq); work_clear_pending(work); - f(work); - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - current->pid); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); - } + leak_check(NULL); + f(work); + leak_check(f); spin_lock_irq(&cwq->lock); cwq->current_work = NULL; @@ -586,6 +592,44 @@ int schedule_on_each_cpu(work_func_t fun return 0; } +/** + * schedule_on_each_cpu_wq - call a function on each online CPU on a per-CPU wq + * @func: the function to call + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * Appears to be racy against CPU hotplug. + * + * schedule_on_each_cpu() is very slow. 
+ */ +int schedule_on_each_cpu_wq(struct workqueue_struct *wq, work_func_t func) +{ + int cpu; + struct work_struct *works; + + if (is_single_threaded(wq)) { + WARN_ON(1); + return -EINVAL; + } + works = alloc_percpu(struct work_struct); + if (!works) + return -ENOMEM; + + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, func); + set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + } + flush_workqueue(wq); + free_percpu(works); + + return 0; +} + + void flush_scheduled_work(void) { flush_workqueue(keventd_wq); patches/preempt-realtime-arm-footbridge.patch0000664000077200007720000000217610646635214020732 0ustar mingomingo--- arch/arm/mach-footbridge/netwinder-hw.c | 2 +- arch/arm/mach-footbridge/netwinder-leds.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/arm/mach-footbridge/netwinder-hw.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-footbridge/netwinder-hw.c +++ linux-rt.q/arch/arm/mach-footbridge/netwinder-hw.c @@ -67,7 +67,7 @@ static inline void wb977_ww(int reg, int /* * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE */ -DEFINE_SPINLOCK(gpio_lock); +DEFINE_RAW_SPINLOCK(gpio_lock); static unsigned int current_gpio_op; static unsigned int current_gpio_io; Index: linux-rt.q/arch/arm/mach-footbridge/netwinder-leds.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-footbridge/netwinder-leds.c +++ linux-rt.q/arch/arm/mach-footbridge/netwinder-leds.c @@ -32,7 +32,7 @@ static char led_state; static char hw_led_state; static DEFINE_SPINLOCK(leds_lock); -extern spinlock_t gpio_lock; +extern raw_spinlock_t gpio_lock; static void netwinder_leds_event(led_event_t evt) { patches/arm-trace-preempt-idle.patch0000664000077200007720000000522510646635214017015 0ustar mingomingoFrom linux-rt-users-owner@vger.kernel.org 
Fri Jul 13 20:13:14 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by mail.tglx.de (Postfix) with ESMTP id 5902865C3EB; Fri, 13 Jul 2007 20:13:14 +0200 (CEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933095AbXGMSNN (ORCPT + 1 other); Fri, 13 Jul 2007 14:13:13 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S933031AbXGMSNM (ORCPT ); Fri, 13 Jul 2007 14:13:12 -0400 Received: from deeprooted.net ([216.254.16.51]:38941 "EHLO paris.hilman.org" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1760089AbXGMSNH (ORCPT ); Fri, 13 Jul 2007 14:13:07 -0400 Received: by paris.hilman.org (Postfix, from userid 1000) id E61B1D2857A; Fri, 13 Jul 2007 10:52:28 -0700 (PDT) Message-Id: <20070713175228.623525155@mvista.com> References: <20070713175214.336577416@mvista.com> User-Agent: quilt/0.45-1 Date: Fri, 13 Jul 2007 10:52:18 -0700 From: Kevin Hilman To: tglx@linutronix.de, mingo@elte.hu Cc: linux-rt-users@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [PATCH -rt 4/6] Add trace_preempt_*_idle() support for ARM. Content-Disposition: inline; filename=arm-trace-preempt-idle.patch Sender: linux-rt-users-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-rt-users@vger.kernel.org X-Filter-To: .Kernel.rt-users X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Mime-Version: 1.0 Add trace functions to ARM idle loop and also move the tick_nohz_restart_sched_tick() after the local_irq_disable() as is done on x86. 
Signed-off-by: Kevin Hilman --- arch/arm/kernel/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/arm/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/process.c +++ linux-rt.q/arch/arm/kernel/process.c @@ -165,11 +165,13 @@ void cpu_idle(void) while (!need_resched() && !need_resched_delayed()) idle(); leds_event(led_idle_end); - tick_nohz_restart_sched_tick(); local_irq_disable(); + trace_preempt_exit_idle(); + tick_nohz_restart_sched_tick(); __preempt_enable_no_resched(); __schedule(); preempt_disable(); + trace_preempt_enter_idle(); local_irq_enable(); } } patches/x86-64-smpboot-whitespace.patch0000664000077200007720000000104310646635212017234 0ustar mingomingo--- arch/x86_64/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/smpboot.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/smpboot.c +++ linux-rt.q/arch/x86_64/kernel/smpboot.c @@ -331,8 +331,8 @@ void __cpuinit start_secondary(void) barrier(); /* - * Check TSC sync first: - */ + * Check TSC sync first: + */ check_tsc_sync_target(); Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); patches/lockstat_bounce_rt.patch0000664000077200007720000000153310646635217016442 0ustar mingomingo--- include/linux/lockdep.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) Index: linux-rt.q/include/linux/lockdep.h =================================================================== --- linux-rt.q.orig/include/linux/lockdep.h +++ linux-rt.q/include/linux/lockdep.h @@ -347,8 +347,8 @@ do { \ if (!f_try(&(_lock)->lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ f_lock(&(_lock)->lock); \ - lock_acquired(&(_lock)->dep_map); \ } \ + lock_acquired(&(_lock)->dep_map); \ } while (0) @@ -358,9 +358,9 @@ do { \ if (!f_try(&(_lock)->lock)) { \ lock_contended(&(_lock)->dep_map, 
_RET_IP_); \ ret = f_lock(&(_lock)->lock); \ - if (!ret) \ - lock_acquired(&(_lock)->dep_map); \ } \ + if (!ret) \ + lock_acquired(&(_lock)->dep_map); \ ret; \ }) patches/i386-remove-pit-interrupt-hook.patch0000664000077200007720000000367010646635210020321 0ustar mingomingoSubject: i386: remove pit_interrupt_hook From: Chris Wright Remove pit_interrupt_hook as it adds just an extra layer. Signed-off-by: Chris Wright Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/asm-i386/i8253.h | 11 ----------- include/asm-i386/mach-default/do_timer.h | 2 +- include/asm-i386/mach-voyager/do_timer.h | 2 +- 3 files changed, 2 insertions(+), 13 deletions(-) Index: linux-rt.q/include/asm-i386/i8253.h =================================================================== --- linux-rt.q.orig/include/asm-i386/i8253.h +++ linux-rt.q/include/asm-i386/i8253.h @@ -12,17 +12,6 @@ extern spinlock_t i8253_lock; extern struct clock_event_device *global_clock_event; -/** - * pit_interrupt_hook - hook into timer tick - * @regs: standard registers from interrupt - * - * Call the global clock event handler. 
- **/ -static inline void pit_interrupt_hook(void) -{ - global_clock_event->event_handler(global_clock_event); -} - extern void setup_pit_timer(void); #endif /* __ASM_I8253_H__ */ Index: linux-rt.q/include/asm-i386/mach-default/do_timer.h =================================================================== --- linux-rt.q.orig/include/asm-i386/mach-default/do_timer.h +++ linux-rt.q/include/asm-i386/mach-default/do_timer.h @@ -12,5 +12,5 @@ static inline void do_timer_interrupt_hook(void) { - pit_interrupt_hook(); + global_clock_event->event_handler(global_clock_event); } Index: linux-rt.q/include/asm-i386/mach-voyager/do_timer.h =================================================================== --- linux-rt.q.orig/include/asm-i386/mach-voyager/do_timer.h +++ linux-rt.q/include/asm-i386/mach-voyager/do_timer.h @@ -12,7 +12,7 @@ **/ static inline void do_timer_interrupt_hook(void) { - pit_interrupt_hook(); + global_clock_event->event_handler(global_clock_event); voyager_timer_interrupt(); } patches/timerc-cleanup-recently-introduced-whitespace-damage.patch0000664000077200007720000000576710646635210025023 0ustar mingomingoFrom: Thomas Gleixner Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/timer.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) Index: linux-rt.q/kernel/timer.c =================================================================== --- linux-rt.q.orig/kernel/timer.c +++ linux-rt.q/kernel/timer.c @@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_bas static inline void timer_set_deferrable(struct timer_list *timer) { timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + TBASE_DEFERRABLE_FLAG)); } static inline void timer_set_base(struct timer_list *timer, tvec_base_t *new_base) { timer->base = (tvec_base_t *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); + tbase_get_deferrable(timer->base)); } /** @@ -431,10 +431,10 @@ 
EXPORT_SYMBOL(__mod_timer); void add_timer_on(struct timer_list *timer, int cpu) { tvec_base_t *base = per_cpu(tvec_bases, cpu); - unsigned long flags; + unsigned long flags; timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); + BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); @@ -613,7 +613,7 @@ static inline void __run_timers(tvec_bas while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; + int index = base->timer_jiffies & TVR_MASK; /* * Cascade timers: @@ -630,8 +630,8 @@ static inline void __run_timers(tvec_bas unsigned long data; timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; + fn = timer->function; + data = timer->data; timer_stats_account_timer(timer); @@ -675,8 +675,8 @@ static unsigned long __next_timer_interr index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; + if (tbase_get_deferrable(nte->base)) + continue; found = 1; expires = nte->expires; @@ -820,7 +820,7 @@ void update_process_times(int user_tick) if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); scheduler_tick(); - run_posix_cpu_timers(p); + run_posix_cpu_timers(p); } /* @@ -895,7 +895,7 @@ static inline void update_times(unsigned update_wall_time(); calc_load(ticks); } - + /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. 
@@ -1091,7 +1091,7 @@ asmlinkage long sys_gettid(void) /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill - */ + */ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; patches/arm-leds-timer.patch0000664000077200007720000000127110646635211015371 0ustar mingomingoThe clockevent layer now handles everything done by the ARM timer_tick() call, except the LED stuff. Here we add an arch_tick_leds() to handle LED toggling which is called by do_timer(). --- arch/arm/kernel/time.c | 7 +++++++ 1 file changed, 7 insertions(+) Index: linux-rt.q/arch/arm/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/time.c +++ linux-rt.q/arch/arm/kernel/time.c @@ -236,6 +236,13 @@ static inline void do_leds(void) #define do_leds() #endif +void arch_tick_leds(void) +{ +#ifdef CONFIG_LEDS_TIMER + do_leds(); +#endif +} + #ifndef CONFIG_GENERIC_TIME void do_gettimeofday(struct timeval *tv) { patches/ppc-remove-last-cpukhz.patch0000664000077200007720000000177110646635212017075 0ustar mingomingoFrom sshtylyov@ru.mvista.com Thu May 24 06:02:00 2007 From: Sergei Shtylyov Subject: [PATCH 2.6.21-rt7] PowerPC: kill cpu_khz reference Date: Thu, 24 May 2007 06:02:00 +1000 X-Patchwork-ID: 11304 Remove forgotten reference to 'cpu_khz' which have been removed for PowerPC in 2.6.21-rt7... Signed-off-by: Sergei Shtylyov --- The irony here is that it was me who sent a patch to add that line. 
:-) --- --- arch/powerpc/kernel/time.c | 1 - 1 file changed, 1 deletion(-) Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -906,7 +906,6 @@ void __init time_init(void) tb_ticks_per_jiffy = ppc_tb_freq / HZ; tb_ticks_per_sec = ppc_tb_freq; tb_ticks_per_usec = ppc_tb_freq / 1000000; - cpu_khz = ppc_tb_freq / 1000; tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); calc_cputime_factors(); patches/rt-mutex-ppc.patch0000664000077200007720000006515310646635214015130 0ustar mingomingo--- arch/powerpc/Kconfig | 19 +++++++---- arch/powerpc/kernel/Makefile | 3 + arch/powerpc/kernel/ppc_ksyms.c | 1 arch/powerpc/kernel/semaphore.c | 20 +++++++----- arch/powerpc/lib/locks.c | 4 +- arch/ppc/Kconfig | 19 +++++++---- arch/ppc/kernel/entry.S | 4 +- arch/ppc/kernel/semaphore.c | 13 +++++-- arch/ppc/lib/locks.c | 38 +++++++++++------------ arch/ppc/syslib/ocp.c | 2 - drivers/macintosh/adb.c | 10 +++--- include/asm-powerpc/rwsem.h | 42 ++++++++++++++----------- include/asm-powerpc/semaphore.h | 57 ++++++++++++++++++++++------------- include/asm-powerpc/spinlock.h | 38 +++++++++++------------ include/asm-powerpc/spinlock_types.h | 4 +- include/asm-ppc/ocp.h | 2 - 16 files changed, 159 insertions(+), 117 deletions(-) Index: linux-rt.q/arch/powerpc/Kconfig =================================================================== --- linux-rt.q.orig/arch/powerpc/Kconfig +++ linux-rt.q/arch/powerpc/Kconfig @@ -43,13 +43,6 @@ config IRQ_PER_CPU bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default y @@ -422,6 +415,18 @@ config GENERIC_CLOCKEVENTS source kernel/time/Kconfig source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source 
"fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit Index: linux-rt.q/arch/powerpc/kernel/Makefile =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/Makefile +++ linux-rt.q/arch/powerpc/kernel/Makefile @@ -10,10 +10,11 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_ppc970.o \ Index: linux-rt.q/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/ppc_ksyms.c +++ linux-rt.q/arch/powerpc/kernel/ppc_ksyms.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include Index: linux-rt.q/arch/powerpc/kernel/semaphore.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/semaphore.c +++ linux-rt.q/arch/powerpc/kernel/semaphore.c @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ void __up(struct semaphore *sem) __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count 
from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-rt.q/arch/powerpc/lib/locks.c =================================================================== --- linux-rt.q.orig/arch/powerpc/lib/locks.c +++ linux-rt.q/arch/powerpc/lib/locks.c @@ -25,7 +25,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -82,7 +82,7 @@ void __rw_yield(raw_rwlock_t *rw) } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); Index: linux-rt.q/arch/ppc/Kconfig =================================================================== --- linux-rt.q.orig/arch/ppc/Kconfig +++ linux-rt.q/arch/ppc/Kconfig @@ -12,13 +12,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default y @@ -988,6 +981,18 @@ config ARCH_POPULATES_NODE_MAP source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config 
ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" Index: linux-rt.q/arch/ppc/kernel/entry.S =================================================================== --- linux-rt.q.orig/arch/ppc/kernel/entry.S +++ linux-rt.q/arch/ppc/kernel/entry.S @@ -863,7 +863,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -877,7 +877,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING beq restore_user Index: linux-rt.q/arch/ppc/kernel/semaphore.c =================================================================== --- linux-rt.q.orig/arch/ppc/kernel/semaphore.c +++ linux-rt.q/arch/ppc/kernel/semaphore.c @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} Index: linux-rt.q/arch/ppc/lib/locks.c =================================================================== --- linux-rt.q.orig/arch/ppc/lib/locks.c +++ linux-rt.q/arch/ppc/lib/locks.c @@ -42,7 +42,7 @@ static inline unsigned long __spin_trylo return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -62,9 +62,9 @@ void _raw_spin_lock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -72,9 +72,9 @@ int _raw_spin_trylock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -88,13 +88,13 @@ void _raw_spin_unlock(spinlock_t *lp) wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. 
*/ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -114,13 +114,13 @@ static __inline__ int __read_trylock(rwl return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(rwlock_t *rw) { unsigned int stuck; @@ -135,9 +135,9 @@ void _raw_read_lock(rwlock_t *rw) } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -146,9 +146,9 @@ void _raw_read_unlock(rwlock_t *rw) wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -164,18 +164,18 @@ void _raw_write_lock(rwlock_t *rw) } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -184,6 +184,6 @@ void _raw_write_unlock(rwlock_t *rw) wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: linux-rt.q/arch/ppc/syslib/ocp.c =================================================================== --- linux-rt.q.orig/arch/ppc/syslib/ocp.c +++ linux-rt.q/arch/ppc/syslib/ocp.c @@ -44,11 +44,11 @@ #include #include #include +#include #include #include #include -#include #include 
//#define DBG(x) printk x Index: linux-rt.q/drivers/macintosh/adb.c =================================================================== --- linux-rt.q.orig/drivers/macintosh/adb.c +++ linux-rt.q/drivers/macintosh/adb.c @@ -256,6 +256,8 @@ adb_probe_task(void *x) sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); + down(&adb_probe_mutex); + printk(KERN_INFO "adb: starting probe task...\n"); do_adb_reset_bus(); printk(KERN_INFO "adb: finished probe task...\n"); @@ -282,7 +284,9 @@ adb_reset_bus(void) return 0; } - down(&adb_probe_mutex); + if (adb_got_sleep) + return 0; + schedule_work(&adb_reset_work); return 0; } @@ -345,9 +349,8 @@ adb_notify_sleep(struct pmu_sleep_notifi { switch (when) { case PBOOK_SLEEP_REQUEST: + /* Signal to discontiue probing */ adb_got_sleep = 1; - /* We need to get a lock on the probe thread */ - down(&adb_probe_mutex); /* Stop autopoll */ if (adb_controller->autopoll) adb_controller->autopoll(0); @@ -356,7 +359,6 @@ adb_notify_sleep(struct pmu_sleep_notifi break; case PBOOK_WAKE: adb_got_sleep = 0; - up(&adb_probe_mutex); adb_reset_bus(); break; } Index: linux-rt.q/include/asm-powerpc/rwsem.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/rwsem.h +++ linux-rt.q/include/asm-powerpc/rwsem.h @@ -1,6 +1,10 @@ #ifndef _ASM_POWERPC_RWSEM_H #define _ASM_POWERPC_RWSEM_H +#ifndef _LINUX_RWSEM_H +#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" +#endif + #ifdef __KERNEL__ /* @@ -17,7 +21,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct compat_rw_semaphore { /* XXX this should be able to be an atomic_t -- paulus */ signed int count; #define RWSEM_UNLOCKED_VALUE 0x00000000 @@ -26,7 +30,7 @@ struct rw_semaphore { #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct 
list_head wait_list; }; @@ -34,15 +38,15 @@ struct rw_semaphore { { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ LIST_HEAD_INIT((name).wait_list) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem); -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void compat_init_rwsem(struct compat_rw_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -52,13 +56,13 @@ static inline void init_rwsem(struct rw_ /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct compat_rw_semaphore *sem) { if (unlikely(atomic_inc_return((atomic_t *)(&sem->count)) <= 0)) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -74,7 +78,7 @@ static inline int __down_read_trylock(st /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct compat_rw_semaphore *sem) { int tmp; @@ -84,7 +88,7 @@ static inline void __down_write(struct r rwsem_down_write_failed(sem); } -static inline int 
__down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -96,7 +100,7 @@ static inline int __down_write_trylock(s /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct compat_rw_semaphore *sem) { int tmp; @@ -108,7 +112,7 @@ static inline void __up_read(struct rw_s /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct compat_rw_semaphore *sem) { if (unlikely(atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)) < 0)) @@ -118,7 +122,7 @@ static inline void __up_write(struct rw_ /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -126,7 +130,7 @@ static inline void rwsem_atomic_add(int /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct compat_rw_semaphore *sem) { int tmp; @@ -138,12 +142,12 @@ static inline void __downgrade_write(str /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) { return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->count != 0); } Index: linux-rt.q/include/asm-powerpc/semaphore.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/semaphore.h +++ linux-rt.q/include/asm-powerpc/semaphore.h @@ -10,54 +10,65 @@ #ifdef __KERNEL__ +/*#include */ #include #include 
#include #include -struct semaphore { +/* + * On !PREEMPT_RT all sempahores are compat + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { /* * Note that any negative value of count is equivalent to 0, * but additionally indicates that some process(es) might be * sleeping on `wait'. */ atomic_t count; + int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) -#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name, 0) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) +#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 0) -static inline void sema_init (struct semaphore *sem, int val) +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern void __up(struct semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern 
void __compat_up(struct compat_semaphore * sem); + +extern int compat_sem_is_locked(struct compat_semaphore *sem); -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -65,31 +76,35 @@ static inline void down(struct semaphore * Try to get the semaphore, take the slow path if we fail. */ if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); + __compat_down(sem); } -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; might_sleep(); if (unlikely(atomic_dec_return(&sem->count) < 0)) - ret = __down_interruptible(sem); + ret = __compat_down_interruptible(sem); return ret; } -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { return atomic_dec_if_positive(&sem->count) < 0; } -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); + __compat_up(sem); } +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_SEMAPHORE_H */ Index: linux-rt.q/include/asm-powerpc/spinlock.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/spinlock.h +++ linux-rt.q/include/asm-powerpc/spinlock.h @@ -53,7 +53,7 @@ * This returns the old value in the lock, so we succeeded * in getting the lock if the return value is 0. 
*/ -static __inline__ unsigned long __spin_trylock(raw_spinlock_t *lock) +static __inline__ unsigned long ___raw_spin_trylock(__raw_spinlock_t *lock) { unsigned long tmp, token; @@ -72,10 +72,10 @@ static __inline__ unsigned long __spin_t return tmp; } -static int __inline__ __raw_spin_trylock(raw_spinlock_t *lock) +static int __inline__ __raw_spin_trylock(__raw_spinlock_t *lock) { CLEAR_IO_SYNC; - return __spin_trylock(lock) == 0; + return ___raw_spin_trylock(lock) == 0; } /* @@ -95,19 +95,19 @@ static int __inline__ __raw_spin_trylock #if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES) /* We only yield to the hypervisor if we are in shared processor mode */ #define SHARED_PROCESSOR (get_lppaca()->shared_proc) -extern void __spin_yield(raw_spinlock_t *lock); -extern void __rw_yield(raw_rwlock_t *lock); +extern void __spin_yield(__raw_spinlock_t *lock); +extern void __rw_yield(__raw_rwlock_t *lock); #else /* SPLPAR || ISERIES */ #define __spin_yield(x) barrier() #define __rw_yield(x) barrier() #define SHARED_PROCESSOR 0 #endif -static void __inline__ __raw_spin_lock(raw_spinlock_t *lock) +static void __inline__ __raw_spin_lock(__raw_spinlock_t *lock) { CLEAR_IO_SYNC; while (1) { - if (likely(__spin_trylock(lock) == 0)) + if (likely(___raw_spin_trylock(lock) == 0)) break; do { HMT_low(); @@ -118,13 +118,13 @@ static void __inline__ __raw_spin_lock(r } } -static void __inline__ __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +static void __inline__ __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { unsigned long flags_dis; CLEAR_IO_SYNC; while (1) { - if (likely(__spin_trylock(lock) == 0)) + if (likely(___raw_spin_trylock(lock) == 0)) break; local_save_flags(flags_dis); local_irq_restore(flags); @@ -138,7 +138,7 @@ static void __inline__ __raw_spin_lock_f } } -static __inline__ void __raw_spin_unlock(raw_spinlock_t *lock) +static __inline__ void __raw_spin_unlock(__raw_spinlock_t *lock) { SYNC_IO; __asm__ __volatile__("# 
__raw_spin_unlock\n\t" @@ -147,7 +147,7 @@ static __inline__ void __raw_spin_unlock } #ifdef CONFIG_PPC64 -extern void __raw_spin_unlock_wait(raw_spinlock_t *lock); +extern void __raw_spin_unlock_wait(__raw_spinlock_t *lock); #else #define __raw_spin_unlock_wait(lock) \ do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0) @@ -179,7 +179,7 @@ extern void __raw_spin_unlock_wait(raw_s * This returns the old value in the lock + 1, * so we got a read lock if the return value is > 0. */ -static long __inline__ __read_trylock(raw_rwlock_t *rw) +static long __inline__ __read_trylock(__raw_rwlock_t *rw) { long tmp; @@ -203,7 +203,7 @@ static long __inline__ __read_trylock(ra * This returns the old value in the lock, * so we got the write lock if the return value is 0. */ -static __inline__ long __write_trylock(raw_rwlock_t *rw) +static __inline__ long __write_trylock(__raw_rwlock_t *rw) { long tmp, token; @@ -223,7 +223,7 @@ static __inline__ long __write_trylock(r return tmp; } -static void __inline__ __raw_read_lock(raw_rwlock_t *rw) +static void __inline__ __raw_read_lock(__raw_rwlock_t *rw) { while (1) { if (likely(__read_trylock(rw) > 0)) @@ -237,7 +237,7 @@ static void __inline__ __raw_read_lock(r } } -static void __inline__ __raw_write_lock(raw_rwlock_t *rw) +static void __inline__ __raw_write_lock(__raw_rwlock_t *rw) { while (1) { if (likely(__write_trylock(rw) == 0)) @@ -251,17 +251,17 @@ static void __inline__ __raw_write_lock( } } -static int __inline__ __raw_read_trylock(raw_rwlock_t *rw) +static int __inline__ __raw_read_trylock(__raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -static int __inline__ __raw_write_trylock(raw_rwlock_t *rw) +static int __inline__ __raw_write_trylock(__raw_rwlock_t *rw) { return __write_trylock(rw) == 0; } -static void __inline__ __raw_read_unlock(raw_rwlock_t *rw) +static void __inline__ __raw_read_unlock(__raw_rwlock_t *rw) { long tmp; @@ -278,7 +278,7 @@ static void __inline__ __raw_read_unlock : "cr0", 
"memory"); } -static __inline__ void __raw_write_unlock(raw_rwlock_t *rw) +static __inline__ void __raw_write_unlock(__raw_rwlock_t *rw) { __asm__ __volatile__("# write_unlock\n\t" LWSYNC_ON_SMP: : :"memory"); Index: linux-rt.q/include/asm-powerpc/spinlock_types.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/spinlock_types.h +++ linux-rt.q/include/asm-powerpc/spinlock_types.h @@ -7,13 +7,13 @@ typedef struct { volatile unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } typedef struct { volatile signed int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { 0 } Index: linux-rt.q/include/asm-ppc/ocp.h =================================================================== --- linux-rt.q.orig/include/asm-ppc/ocp.h +++ linux-rt.q/include/asm-ppc/ocp.h @@ -27,10 +27,10 @@ #include #include #include +#include #include #include -#include #include #ifdef CONFIG_PPC_OCP patches/preempt-irqs-x86-64.patch0000664000077200007720000000352610646635213016060 0ustar mingomingo--- arch/x86_64/kernel/i8259.c | 3 ++- arch/x86_64/kernel/io_apic.c | 3 ++- arch/x86_64/kernel/time.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/i8259.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/i8259.c +++ linux-rt.q/arch/x86_64/kernel/i8259.c @@ -395,7 +395,8 @@ device_initcall(i8259A_init_sysfs); * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; + DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... 
IRQ0_VECTOR - 1] = -1, [IRQ0_VECTOR] = 0, Index: linux-rt.q/arch/x86_64/kernel/io_apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/io_apic.c +++ linux-rt.q/arch/x86_64/kernel/io_apic.c @@ -777,9 +777,10 @@ static void ioapic_register_intr(int irq if (trigger) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else + else { set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); + } } static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, Index: linux-rt.q/arch/x86_64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/time.c +++ linux-rt.q/arch/x86_64/kernel/time.c @@ -255,7 +255,8 @@ static unsigned int __init tsc_calibrate static struct irqaction irq0 = { .handler = timer_event_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | + IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; patches/ppc-add-ppc32-mcount.patch0000664000077200007720000000436410646635212016316 0ustar mingomingo--- arch/powerpc/kernel/entry_32.S | 82 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) Index: linux-rt.q/arch/powerpc/kernel/entry_32.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/entry_32.S +++ linux-rt.q/arch/powerpc/kernel/entry_32.S @@ -989,3 +989,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. 
+ */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ patches/preempt-realtime-sched.patch0000664000077200007720000011224210646635215017114 0ustar mingomingo--- include/linux/sched.h | 58 +++ kernel/sched.c | 761 ++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 712 insertions(+), 107 deletions(-) Index: linux-rt.q/include/linux/sched.h =================================================================== --- 
linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -89,6 +89,16 @@ struct sched_param { #include +#ifdef CONFIG_PREEMPT +extern int kernel_preemption; +#else +# define kernel_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY +extern int voluntary_preemption; +#else +# define voluntary_preemption 0 +#endif #ifdef CONFIG_PREEMPT_SOFTIRQS extern int softirq_preemption; #else @@ -193,6 +203,28 @@ print_cfs_rq(struct seq_file *m, int cpu #define set_task_state(tsk, state_value) \ set_mb((tsk)->state, (state_value)) +// #define PREEMPT_DIRECT + +#ifdef CONFIG_X86_LOCAL_APIC +extern void nmi_show_all_regs(void); +#else +# define nmi_show_all_regs() do { } while (0) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct exec_domain; + /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to @@ -408,6 +440,11 @@ extern signed long FASTCALL(schedule_tim extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +/* + * This one can be called with interrupts disabled, only + * to be used by lowlevel arch code! + */ +asmlinkage void __sched __schedule(void); struct nsproxy; @@ -512,6 +549,9 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + /* realtime bits */ + struct list_head delayed_drop; + /* Swap token stuff */ /* * Last value of global fault stamp as seen by this process. 
@@ -1402,6 +1442,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1409,6 +1458,7 @@ static inline void put_task_struct(struc if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif /* * Per process flags @@ -1676,12 +1726,20 @@ extern struct mm_struct * mm_alloc(void) /* mmdrop drops the mm and the page tables */ extern void FASTCALL(__mmdrop(struct mm_struct *)); +extern void FASTCALL(__mmdrop_delayed(struct mm_struct *)); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline void mmdrop_delayed(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop_delayed(mm); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,6 +17,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. 
* 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar * 2007-04-15 Work begun on replacing all interactivity tuning with a * fair scheduling design by Con Kolivas. * 2007-05-05 Load balancing (smp-nice) and other improvements @@ -54,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -100,6 +103,20 @@ unsigned long long __attribute__((weak)) #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#if (BITS_PER_LONG < 64) +#define JIFFIES_TO_NS64(TIME) \ + ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) + +#define NS64_TO_JIFFIES(TIME) \ + ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ + (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) +#else /* BITS_PER_LONG < 64 */ + +#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) +#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) + +#endif /* BITS_PER_LONG < 64 */ + /* * These are the 'tuning knobs' of the scheduler: * @@ -110,6 +127,32 @@ unsigned long long __attribute__((weak)) #define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * Tweaks for current + */ + +#ifdef CURRENT_PTR +struct task_struct * const ___current = &init_task; +struct task_struct ** const current_ptr = (struct task_struct ** const)&___current; +struct thread_info * const current_ti = &init_thread_union.thread_info; +struct thread_info ** const current_ti_ptr = (struct thread_info ** const)¤t_ti; + +EXPORT_SYMBOL(___current); +EXPORT_SYMBOL(current_ti); + +/* + * The scheduler itself doesnt want 'current' to be cached + * during context-switches: + */ +# undef current +# define current __current() +# undef current_thread_info +# define current_thread_info() __current_thread_info() +#endif + static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO) || 
unlikely(policy == SCHED_RR)) @@ -183,7 +226,7 @@ struct rt_rq { * acquire operations must be ordered by ascending &runqueue. */ struct rq { - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -214,6 +257,13 @@ struct rq { */ unsigned long nr_uninterruptible; +#ifdef CONFIG_PREEMPT_RT + unsigned long rt_nr_running; + unsigned long rt_nr_uninterruptible; +#endif + + unsigned long switch_timestamp; + unsigned long slice_avg; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; @@ -258,6 +308,11 @@ struct rq { /* try_to_wake_up() stats */ unsigned long ttwu_cnt; unsigned long ttwu_local; + + /* RT-overload stats: */ + unsigned long rto_schedule; + unsigned long rto_wakeup; + unsigned long rto_pulled; #endif struct lock_class_key rq_lock_key; }; @@ -390,11 +445,23 @@ static inline void set_task_cfs_rq(struc } #endif +/* + * We really dont want to do anything complex within switch_to() + * on PREEMPT_RT - this check enforces this. 
+ */ +#ifdef prepare_arch_switch +# ifdef CONFIG_PREEMPT_RT +# error FIXME +# else +# define _finish_arch_switch finish_arch_switch +# endif +#endif + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif #ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) +# define _finish_arch_switch(prev) do { } while (0) #endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW @@ -420,7 +487,7 @@ static inline void finish_lock_switch(st */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -461,8 +528,8 @@ static inline void finish_lock_switch(st smp_wmb(); prev->oncpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); #endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -534,6 +601,53 @@ static inline struct rq *this_rq_lock(vo return rq; } +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) +static __cacheline_aligned_in_smp atomic_t rt_overload; +#endif + +static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) +{ +#ifdef CONFIG_PREEMPT_RT + if (rt_task(p)) { + rq->rt_nr_running++; +# ifdef CONFIG_SMP + if (rq->rt_nr_running == 2) + atomic_inc(&rt_overload); +# endif + } +#endif +} + +static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) +{ +#ifdef CONFIG_PREEMPT_RT + if (rt_task(p)) { + WARN_ON(!rq->rt_nr_running); + rq->rt_nr_running--; +# ifdef CONFIG_SMP + if (rq->rt_nr_running == 1) + atomic_dec(&rt_overload); +# endif + } +#endif +} + +static inline void incr_rt_nr_uninterruptible(struct task_struct *p, struct rq *rq) +{ +#ifdef CONFIG_PREEMPT_RT + if (rt_task(p)) + rq->rt_nr_uninterruptible++; +#endif +} + +static inline void decr_rt_nr_uninterruptible(struct task_struct *p, struct rq *rq) +{ +#ifdef CONFIG_PREEMPT_RT + if (rt_task(p)) + rq->rt_nr_uninterruptible--; +#endif +} + /* * CPU frequency is/was 
unstable - start new by setting prev_clock_raw: */ @@ -551,12 +665,6 @@ void sched_clock_unstable_event(void) #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT -static inline void trace_start_sched_wakeup(struct task_struct *p, struct rq *rq) -{ - if (p != rq->curr) - __trace_start_sched_wakeup(p); -} - /* * resched_task - mark a task 'to be rescheduled now'. * @@ -574,8 +682,6 @@ static void resched_task(struct task_str { int cpu; - trace_start_sched_wakeup(p, task_rq(p)); - assert_spin_locked(&task_rq(p)->lock); if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) @@ -607,8 +713,6 @@ static void resched_cpu(int cpu) #else static inline void resched_task(struct task_struct *p) { - trace_start_sched_wakeup(p, task_rq(p)); - assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } @@ -883,6 +987,8 @@ static inline int normal_prio(struct tas prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); + + trace_special_pid(p->pid, PRIO(p), __PRIO(prio)); return prio; } @@ -906,6 +1012,13 @@ static int effective_prio(struct task_st return p->prio; } +static inline void trace_start_sched_wakeup(struct task_struct *p, + struct rq *rq) +{ + if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr)) + __trace_start_sched_wakeup(p); +} + /* * activate_task - move a task to the runqueue. 
*/ @@ -913,8 +1026,12 @@ static void activate_task(struct rq *rq, { u64 now = rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) + if (p->state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible--; + decr_rt_nr_uninterruptible(p, rq); + } + + trace_special_pid(p->pid, PRIO(p), rq->nr_running); enqueue_task(rq, p, wakeup, now); inc_nr_running(p, rq, now); @@ -927,8 +1044,12 @@ static inline void activate_idle_task(st { u64 now = rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) + if (p->state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible--; + decr_rt_nr_uninterruptible(p, rq); + } + + trace_special_pid(p->pid, PRIO(p), rq->nr_running); enqueue_task(rq, p, 0, now); inc_nr_running(p, rq, now); @@ -941,8 +1062,12 @@ static void deactivate_task(struct rq *r { u64 now = rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) + if (p->state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible++; + incr_rt_nr_uninterruptible(p, rq); + } + + trace_special_pid(p->pid, PRIO(p), rq->nr_running); dequeue_task(rq, p, sleep, now); dec_nr_running(p, rq, now); @@ -963,11 +1088,42 @@ unsigned long weighted_cpuload(const int return cpu_rq(cpu)->ls.load.weight; } +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) +{ + struct sched_class *class; + struct task_struct *p; + + /* + * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: + */ + if (likely(rq->nr_running == rq->cfs.nr_running)) { + p = fair_sched_class.pick_next_task(rq, now); + if (likely(p)) + return p; + } + + class = sched_class_highest; + for (;;) { + p = class->pick_next_task(rq, now); + if (p) + return p; + /* + * Will never be NULL as the idle class always + * returns a non-NULL p: + */ + class = class->next; + } +} + #ifdef CONFIG_SMP static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { - trace_change_sched_cpu(p, cpu); task_thread_info(p)->cpu = 
cpu; set_task_cfs_rq(p); } @@ -1237,6 +1393,119 @@ nextgroup: return idlest; } +#ifdef CONFIG_PREEMPT_RT + +static struct task_struct * pick_rt_task(struct rq *src_rq, int this_cpu) +{ + struct list_head *head, *curr; + struct prio_array *array = &src_rq->rt.active; + struct task_struct *tmp; + int idx; + + WARN_ON(!spin_is_locked(&src_rq->lock)); + + idx = sched_find_first_bit(array->bitmap); +next_in_bitmap: + /* + * Only non-RT tasks available - abort the search: + */ + if (idx >= MAX_RT_PRIO) + return NULL; + + head = array->queue + idx; + curr = head->next; +next_in_queue: + tmp = list_entry(curr, struct task_struct, run_list); + /* + * Return the highest-prio non-running RT task (if task + * may run on this CPU): + */ + if (!task_running(src_rq, tmp) && + cpu_isset(this_cpu, tmp->cpus_allowed)) + return tmp; + + curr = curr->next; + if (curr != head) + goto next_in_queue; + + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + goto next_in_bitmap; +} + +static int double_lock_balance(struct rq *this_rq, struct rq *busiest); + +/* + * Pull RT tasks from other CPUs in the RT-overload + * case. Interrupts are disabled, local rq is locked. 
+ */ +static void balance_rt_tasks(struct rq *this_rq, int this_cpu) +{ + struct task_struct *p, *next; + struct rq *src_rq; + int cpu; + + WARN_ON(!irqs_disabled()); + + /* + * No need to do array switching - there can be no + * RT tasks in the expired array and the idle task + * is more than enough for comparing against RT tasks: + */ + next = pick_next_task(this_rq, this_rq->curr, rq_clock(this_rq)); + + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + src_rq = cpu_rq(cpu); + if (src_rq->rt_nr_running <= 1) + continue; + + /* + * We can potentially drop this_rq's lock in + * double_lock_balance, and another CPU could + * steal our next task - hence we must cause + * the caller to recalculate the next task + * in that case: + */ + if (double_lock_balance(this_rq, src_rq)) + next = pick_next_task(this_rq, this_rq->curr, + rq_clock(this_rq)); + /* + * Are there still pullable RT tasks? + */ + if (src_rq->rt_nr_running <= 1) { + spin_unlock(&src_rq->lock); + continue; + } + + p = pick_rt_task(src_rq, this_cpu); + + /* + * Do we have an RT task that preempts + * the to-be-scheduled task? + */ + if (p && (p->prio < next->prio)) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->se.on_rq); + schedstat_inc(this_rq, rto_pulled); + + set_task_cpu(p, this_cpu); + + deactivate_task(src_rq, p, 0); + activate_task(this_rq, p, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. (low likelyhood + * but possible) + */ + } + spin_unlock(&src_rq->lock); + } +} + +#endif + /* * find_idlest_cpu - find the idlest cpu among the cpus in group. 
*/ @@ -1405,6 +1674,14 @@ try_to_wake_up(struct task_struct *p, un int new_cpu; #endif + trace_special_sym(); +#ifdef CONFIG_PREEMPT_RT + /* + * sync wakeups can increase wakeup latencies: + */ + if (rt_task(p)) + sync = 0; +#endif rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) @@ -1510,9 +1787,45 @@ out_set_cpu: cpu = task_cpu(p); } + /* + * If a newly woken up RT task cannot preempt the + * current (RT) task (on a target runqueue) then try + * to find another CPU it can preempt: + */ + if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) { + struct rq *this_rq = cpu_rq(this_cpu); + /* + * Special-case: the task on this CPU can be + * preempted. In that case there's no need to + * trigger reschedules on other CPUs, we can + * mark the current task for reschedule. + * + * (Note that it's safe to access this_rq without + * extra locking in this particular case, because + * we are on the current CPU.) + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + set_tsk_need_resched(this_rq->curr); + else + /* + * Neither the intended target runqueue + * nor the current CPU can take this task. + * Trigger a reschedule on all other CPUs + * nevertheless, maybe one of them can take + * this task: + */ + smp_send_reschedule_allbutself(); + + schedstat_inc(this_rq, rto_wakeup); + } + out_activate: #endif /* CONFIG_SMP */ + activate_task(rq, p, 1); + + trace_start_sched_wakeup(p, rq); + /* * Sync wakeups (i.e. 
those types of wakeups where the waker * has indicated that it will leave the CPU in short order) @@ -1523,10 +1836,20 @@ out_activate: */ if (!sync || cpu != this_cpu) check_preempt_curr(rq, p); + else { + if (TASK_PREEMPTS_CURR(p, rq)) + set_tsk_need_resched_delayed(rq->curr); + } + if (rq->curr && p && rq && _need_resched()) + trace_special_pid(p->pid, PRIO(p), PRIO(rq->curr)); + success = 1; out_running: - p->state = TASK_RUNNING; + if (mutex) + p->state = TASK_RUNNING_MUTEX; + else + p->state = TASK_RUNNING; out: task_rq_unlock(rq, &flags); @@ -1540,7 +1863,6 @@ int fastcall wake_up_process(struct task ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 0); - mcount(); return ret; } EXPORT_SYMBOL(wake_up_process); @@ -1552,7 +1874,6 @@ int fastcall wake_up_process_sync(struct ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 1, 0); - mcount(); return ret; } EXPORT_SYMBOL(wake_up_process_sync); @@ -1562,7 +1883,6 @@ int fastcall wake_up_process_mutex(struc int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 1); - mcount(); return ret; } EXPORT_SYMBOL(wake_up_process_mutex); @@ -1572,16 +1892,13 @@ int fastcall wake_up_process_mutex_sync( int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 1, 1); - mcount(); return ret; } EXPORT_SYMBOL(wake_up_process_mutex_sync); int fastcall wake_up_state(struct task_struct *p, unsigned int state) { - int ret = try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0); - mcount(); - return ret; + return try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0); } /* @@ -1753,12 +2070,27 @@ static inline void finish_task_switch(st * be dropped twice. 
* Manfred Spraul */ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + /* + * If we pushed an RT task off the runqueue, + * then kick other CPUs, they might run it: + */ + if (unlikely(rt_task(current) && prev->se.on_rq && rt_task(prev))) { + schedstat_inc(rq, rto_schedule); + smp_send_reschedule_allbutself(); + } +#endif prev_state = prev->state; - finish_arch_switch(prev); + _finish_arch_switch(prev); finish_lock_switch(rq, prev); trace_stop_sched_switched(current); - if (likely(mm)) - mmdrop(mm); + /* + * Delay the final freeing of the mm or task, so that we dont have + * to do complex work from within the scheduler: + */ + if (mm) + mmdrop_delayed(mm); + if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -1776,12 +2108,15 @@ static inline void finish_task_switch(st asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + preempt_disable(); // TODO: move this to fork setup + finish_task_switch(this_rq(), prev); + __preempt_enable_no_resched(); + local_irq_enable(); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); +#else + preempt_check_resched(); #endif if (current->set_child_tid) put_user(current->pid, current->set_child_tid); @@ -1830,6 +2165,11 @@ context_switch(struct rq *rq, struct tas trace_cmdline(); +#ifdef CURRENT_PTR + barrier(); + *current_ptr = next; + *current_ti_ptr = next->thread_info; +#endif /* Here we just switch the register state and the stack. 
*/ switch_to(prev, next, prev); @@ -1879,6 +2219,43 @@ unsigned long nr_uninterruptible(void) return sum; } +unsigned long nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_uninterruptible; +} + +#if defined(CONFIG_PREEMPT_RT) +unsigned long rt_nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt_nr_running; + + return sum; +} + +unsigned long rt_nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->rt_nr_running; +} + +unsigned long rt_nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt_nr_uninterruptible; + + return sum; +} + +unsigned long rt_nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->rt_nr_uninterruptible; +} +#endif + unsigned long long nr_context_switches(void) { int i; @@ -2019,7 +2396,7 @@ static void double_rq_unlock(struct rq * /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ -static void double_lock_balance(struct rq *this_rq, struct rq *busiest) +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) @@ -2034,9 +2411,12 @@ static void double_lock_balance(struct r spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); + + return 1; } else spin_lock(&busiest->lock); } + return 0; } /* @@ -3269,6 +3649,8 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + BUG_ON(!irqs_disabled()); + spin_lock(&rq->lock); if (curr != rq->idle) /* FIXME: needed? 
*/ curr->sched_class->task_tick(rq, curr); @@ -3286,8 +3668,11 @@ void scheduler_tick(void) */ static noinline void __schedule_bug(struct task_struct *prev) { - printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", - prev->comm, preempt_count(), prev->pid); + stop_trace(); + + printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n", + prev->comm, preempt_count(), prev->pid, smp_processor_id()); + debug_show_held_locks(prev); if (irqs_disabled()) print_irqtrace_events(prev); @@ -3299,6 +3684,8 @@ static noinline void __schedule_bug(stru */ static inline void schedule_debug(struct task_struct *prev) { + WARN_ON(system_state == SYSTEM_BOOTING); + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -3309,45 +3696,15 @@ static inline void schedule_debug(struct profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_cnt); -} + trace_special_sym(); -/* - * Pick up the highest-prio task: - */ -static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) -{ - struct sched_class *class; - struct task_struct *p; - - /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: - */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { - p = fair_sched_class.pick_next_task(rq, now); - if (likely(p)) - return p; - } - - class = sched_class_highest; - for (;;) { - p = class->pick_next_task(rq, now); - if (p) - return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; - } + schedstat_inc(this_rq(), sched_cnt); } /* * schedule() is the main scheduler function. 
*/ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; long *switch_count; @@ -3355,7 +3712,6 @@ asmlinkage void __sched schedule(void) u64 now; int cpu; -need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3364,24 +3720,34 @@ need_resched: switch_count = &prev->nivcsw; release_kernel_lock(prev); -need_resched_nonpreemptible: schedule_debug(prev); spin_lock_irq(&rq->lock); + cpu = smp_processor_id(); clear_tsk_need_resched(prev); clear_tsk_need_resched_delayed(prev); - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if ((prev->state & ~TASK_RUNNING_MUTEX) && + !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; } else { + touch_softlockup_watchdog(); deactivate_task(rq, prev, 1); } switch_count = &prev->nvcsw; } + if (preempt_count() & PREEMPT_ACTIVE) + sub_preempt_count(PREEMPT_ACTIVE); + +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (unlikely(atomic_read(&rt_overload))) + balance_rt_tasks(rq, cpu); +#endif + if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -3397,24 +3763,93 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ + __preempt_enable_no_resched(); } else { - spin_unlock_irq(&rq->lock); + __preempt_enable_no_resched(); + spin_unlock(&rq->lock); trace_stop_sched_switched(next); } - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - goto need_resched_nonpreemptible; + reacquire_kernel_lock(current); + if (!irqs_disabled()) { + static int once = 1; + if (once) { + once = 0; + print_irqtrace_events(current); + WARN_ON(1); + } } - __preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || - test_thread_flag(TIF_NEED_RESCHED_DELAYED))) - goto need_resched; +} + +/* + * schedule() is the main scheduler function. 
+ */ +asmlinkage void __sched schedule(void) +{ + WARN_ON(system_state == SYSTEM_BOOTING); + /* + * Test if we have interrupts disabled. + */ + if (unlikely(irqs_disabled())) { + stop_trace(); + printk(KERN_ERR "BUG: scheduling with irqs disabled: " + "%s/0x%08x/%d\n", current->comm, preempt_count(), + current->pid); + print_symbol("caller is %s\n", + (long)__builtin_return_address(0)); + dump_stack(); + } + + if (unlikely(current->flags & PF_NOSCHED)) { + current->flags &= ~PF_NOSCHED; + printk(KERN_ERR "%s:%d userspace BUG: scheduling in " + "user-atomic context!\n", current->comm, current->pid); + dump_stack(); + send_sig(SIGUSR2, current, 1); + } + + local_irq_disable(); + + do { + __schedule(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_DELAYED))); + + local_irq_enable(); } EXPORT_SYMBOL(schedule); + #ifdef CONFIG_PREEMPT + +/* + * Global flag to turn preemption off on a CONFIG_PREEMPT kernel: + */ +int kernel_preemption = 1; + +static int __init preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) { + if (kernel_preemption) { + printk(KERN_INFO "turning off kernel preemption!\n"); + kernel_preemption = 0; + } + return 1; + } + if (!strncmp(str, "on", 2)) { + if (!kernel_preemption) { + printk(KERN_INFO "turning on kernel preemption!\n"); + kernel_preemption = 1; + } + return 1; + } + get_option(&str, &kernel_preemption); + + return 1; +} + +__setup("preempt=", preempt_setup); + /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -3427,6 +3862,8 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; #endif + if (!kernel_preemption) + return; /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. 
@@ -3435,6 +3872,7 @@ asmlinkage void __sched preempt_schedule return; need_resched: + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* * We keep the big kernel semaphore locked, but we @@ -3445,25 +3883,25 @@ need_resched: saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - schedule(); + __schedule(); #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ barrier(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || test_thread_flag(TIF_NEED_RESCHED_DELAYED))) goto need_resched; + local_irq_enable(); } + EXPORT_SYMBOL(preempt_schedule); /* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * this is is the entry point for the IRQ return path. Called with + * interrupts disabled. To avoid infinite irq-entry recursion problems + * with fast-paced IRQ sources we do all of this carefully to never + * enable interrupts again. */ asmlinkage void __sched preempt_schedule_irq(void) { @@ -3472,10 +3910,18 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; #endif - /* Catch callers which need to be fixed */ - WARN_ON_ONCE(ti->preempt_count || !irqs_disabled()); + + if (!kernel_preemption) + return; + /* + * If there is a non-zero preempt_count then just return. 
+ * (interrupts are disabled) + */ + if (unlikely(ti->preempt_count)) + return; need_resched: + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* * We keep the big kernel semaphore locked, but we @@ -3486,14 +3932,13 @@ need_resched: saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - local_irq_enable(); - schedule(); + __schedule(); + local_irq_disable(); + #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ barrier(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || @@ -3831,7 +4276,7 @@ EXPORT_SYMBOL(sleep_on_timeout); void rt_mutex_setprio(struct task_struct *p, int prio) { unsigned long flags; - int oldprio, on_rq; + int oldprio, prev_resched, on_rq; struct rq *rq; u64 now; @@ -3852,6 +4297,9 @@ void rt_mutex_setprio(struct task_struct p->prio = prio; + trace_special_pid(p->pid, __PRIO(oldprio), PRIO(p)); + prev_resched = _need_resched(); + if (on_rq) { enqueue_task(rq, p, 0, now); /* @@ -3866,6 +4314,7 @@ void rt_mutex_setprio(struct task_struct check_preempt_curr(rq, p); } } + trace_special(prev_resched, _need_resched(), 0); task_rq_unlock(rq, &flags); } @@ -4455,15 +4904,19 @@ asmlinkage long sys_sched_yield(void) * no need to preempt or enable interrupts: */ spin_unlock_no_resched(&rq->lock); + rcu_read_unlock(); - schedule(); + __schedule(); + + local_irq_enable(); + preempt_check_resched(); return 0; } static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) __might_sleep(__FILE__, __LINE__); #endif /* @@ -4472,10 +4925,11 @@ static void __cond_resched(void) * cond_resched() call. 
*/ do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __schedule(); } while (need_resched()); + local_irq_enable(); } int __sched cond_resched(void) @@ -4501,7 +4955,7 @@ int __cond_resched_raw_spinlock(raw_spin { int ret = 0; - if (need_lockbreak(lock)) { + if (need_lockbreak_raw(lock)) { spin_unlock(lock); cpu_relax(); ret = 1; @@ -4517,6 +4971,25 @@ int __cond_resched_raw_spinlock(raw_spin } EXPORT_SYMBOL(__cond_resched_raw_spinlock); +#ifdef CONFIG_PREEMPT_RT + +int __cond_resched_spinlock(spinlock_t *lock) +{ +#if (defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)) || defined(CONFIG_PREEMPT_RT) + if (lock->break_lock) { + lock->break_lock = 0; + spin_unlock_no_resched(lock); + __cond_resched(); + spin_lock(lock); + return 1; + } +#endif + return 0; +} +EXPORT_SYMBOL(__cond_resched_spinlock); + +#endif + /* * Voluntarily preempt a process context that has softirqs disabled: */ @@ -4563,29 +5036,73 @@ int cond_resched_hardirq_context(void) WARN_ON_ONCE(!irqs_disabled()); if (hardirq_need_resched()) { +#ifndef CONFIG_PREEMPT_RT irq_exit(); +#endif local_irq_enable(); __cond_resched(); +#ifndef CONFIG_PREEMPT_RT local_irq_disable(); __irq_enter(); - +#endif return 1; } return 0; } EXPORT_SYMBOL(cond_resched_hardirq_context); +#ifdef CONFIG_PREEMPT_VOLUNTARY + +int voluntary_preemption = 1; + +EXPORT_SYMBOL(voluntary_preemption); + +static int __init voluntary_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + voluntary_preemption = 0; + else + get_option(&str, &voluntary_preemption); + if (!voluntary_preemption) + printk("turning off voluntary preemption!\n"); + + return 1; +} + +__setup("voluntary-preempt=", voluntary_preempt_setup); + +#endif + /** * yield - yield the current processor to other threads. * * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). 
*/ -void __sched yield(void) +void __sched __yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } + +void __sched yield(void) +{ + static int once = 1; + + /* + * it's a bug to rely on yield() with RT priorities. We print + * the first occurance after bootup ... this will still give + * us an idea about the scope of the problem, without spamming + * the syslog: + */ + if (once && rt_task(current)) { + once = 0; + printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", + current->comm, current->pid); + dump_stack(); + } + __yield(); +} EXPORT_SYMBOL(yield); /* @@ -4752,6 +5269,7 @@ static void show_task(struct task_struct void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; + int do_unlock = 1; #if BITS_PER_LONG == 32 printk(KERN_INFO @@ -4760,7 +5278,16 @@ void show_state_filter(unsigned long sta printk(KERN_INFO " task PC stack pid father\n"); #endif +#ifdef CONFIG_PREEMPT_RT + if (!read_trylock(&tasklist_lock)) { + printk("hm, tasklist_lock write-locked.\n"); + printk("ignoring ...\n"); + do_unlock = 0; + } +#else read_lock(&tasklist_lock); +#endif + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -4773,7 +5300,9 @@ void show_state_filter(unsigned long sta touch_all_softlockup_watchdogs(); - read_unlock(&tasklist_lock); + if (do_unlock) + read_unlock(&tasklist_lock); + /* * Only show locks if all tasks are dumped: */ @@ -4817,7 +5346,9 @@ void __cpuinit init_idle(struct task_str spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! 
*/ -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) +#if defined(CONFIG_PREEMPT) && \ + !defined(CONFIG_PREEMPT_BKL) && \ + !defined(CONFIG_PREEMPT_RT) task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); #else task_thread_info(idle)->preempt_count = 0; @@ -4932,11 +5463,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; + unsigned long flags; int ret = 0, on_rq; if (unlikely(cpu_is_offline(dest_cpu))) return ret; + /* + * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) + * disabling interrupts - which on PREEMPT_RT does not do: + */ + local_irq_save(flags); + rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -4951,6 +5489,7 @@ static int __migrate_task(struct task_st on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq_src, p, 0); + set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); @@ -4959,6 +5498,8 @@ static int __migrate_task(struct task_st ret = 1; out: double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + return ret; } @@ -6398,6 +6939,9 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); +#ifdef CONFIG_PREEMPT_RT + printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); +#endif /* * Make us the idle thread. 
Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -6419,14 +6963,17 @@ void __might_sleep(char *file, int line) if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (debug_direct_keyboard && hardirq_count()) + return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; stop_trace(); printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + " context %s(%d) at %s:%d\n", + current->comm, current->pid, file, line); + printk("in_atomic():%d [%08x], irqs_disabled():%d\n", + in_atomic(), preempt_count(), irqs_disabled()); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); patches/preempt-realtime-powerpc-missing-raw-spinlocks.patch0000664000077200007720000001050310646635215023743 0ustar mingomingoFrom sshtylyov@ru.mvista.com Thu Jun 21 22:24:22 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from imap.sh.mvista.com (unknown [63.81.120.155]) by mail.tglx.de (Postfix) with ESMTP id 2149065C065 for ; Thu, 21 Jun 2007 22:24:22 +0200 (CEST) Received: from wasted.dev.rtsoft.ru (unknown [10.150.0.9]) by imap.sh.mvista.com (Postfix) with ESMTP id D27113EC9; Thu, 21 Jun 2007 13:24:15 -0700 (PDT) From: Sergei Shtylyov Organization: MontaVista Software Inc. 
To: tglx@linutronix.de, bruce.ashfield@gmail.com, rostedt@goodmis.org Subject: [PATCH] (2.6.20-rt3) PowerPC: convert spinlocks into raw Date: Thu, 21 Jun 2007 23:25:58 +0300 User-Agent: KMail/1.5 MIME-Version: 1.0 Content-Disposition: inline Content-Type: text/plain; charset="iso-8859-1" Message-Id: <200706220025.58799.sshtylyov@ru.mvista.com> X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Convert the spinlocks in the PowerPC interrupt related code into the raw ones, also convert the PURR and PMC related spinlocks... Signed-off-by: Mark A. Greer Signed-off-by: Sergei Shtylyov --- Resending in hopes it still can apply -- if it doesn't, bug me again... :-) --- arch/powerpc/kernel/pmc.c | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/sysdev/i8259.c | 2 +- arch/powerpc/sysdev/ipic.c | 2 +- arch/powerpc/sysdev/mpic.c | 2 +- include/asm-powerpc/mpic.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/pmc.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/pmc.c +++ linux-rt.q/arch/powerpc/kernel/pmc.c @@ -37,7 +37,7 @@ static void dummy_perf(struct pt_regs *r } -static DEFINE_SPINLOCK(pmc_owner_lock); +static DEFINE_RAW_SPINLOCK(pmc_owner_lock); static void *pmc_owner_caller; /* mostly for debugging */ perf_irq_t perf_irq = dummy_perf; Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -301,7 +301,7 @@ struct cpu_purr_data { int initialized; /* thread is running */ u64 tb; /* last TB value read */ u64 purr; /* last PURR value read */ - spinlock_t lock; + raw_spinlock_t lock; }; static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data); Index: linux-rt.q/arch/powerpc/sysdev/i8259.c =================================================================== --- 
linux-rt.q.orig/arch/powerpc/sysdev/i8259.c +++ linux-rt.q/arch/powerpc/sysdev/i8259.c @@ -23,7 +23,7 @@ static unsigned char cached_8259[2] = { #define cached_A1 (cached_8259[0]) #define cached_21 (cached_8259[1]) -static DEFINE_SPINLOCK(i8259_lock); +static DEFINE_RAW_SPINLOCK(i8259_lock); static struct device_node *i8259_node; static struct irq_host *i8259_host; Index: linux-rt.q/arch/powerpc/sysdev/ipic.c =================================================================== --- linux-rt.q.orig/arch/powerpc/sysdev/ipic.c +++ linux-rt.q/arch/powerpc/sysdev/ipic.c @@ -30,7 +30,7 @@ #include "ipic.h" static struct ipic * primary_ipic; -static DEFINE_SPINLOCK(ipic_lock); +static DEFINE_RAW_SPINLOCK(ipic_lock); static struct ipic_info ipic_info[] = { [9] = { Index: linux-rt.q/arch/powerpc/sysdev/mpic.c =================================================================== --- linux-rt.q.orig/arch/powerpc/sysdev/mpic.c +++ linux-rt.q/arch/powerpc/sysdev/mpic.c @@ -46,7 +46,7 @@ static struct mpic *mpics; static struct mpic *mpic_primary; -static DEFINE_SPINLOCK(mpic_lock); +static DEFINE_RAW_SPINLOCK(mpic_lock); #ifdef CONFIG_PPC32 /* XXX for now */ #ifdef CONFIG_IRQ_ALL_CPUS Index: linux-rt.q/include/asm-powerpc/mpic.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/mpic.h +++ linux-rt.q/include/asm-powerpc/mpic.h @@ -280,7 +280,7 @@ struct mpic #ifdef CONFIG_MPIC_U3_HT_IRQS /* The fixup table */ struct mpic_irq_fixup *fixups; - spinlock_t fixup_lock; + raw_spinlock_t fixup_lock; #endif /* Register access method */ patches/ppc-fix-clocksource-timebase-shift.patch0000664000077200007720000000226210646635213021334 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Mon May 14 17:23:17 2007 Date: Mon, 14 May 2007 17:23:17 +0900 From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [patch 5/5] powerpc 2.6.21-rt1] fix clocksource_timebase.shift value 
Calculate clocksource_timebase.shift from tb_ticks_per_jiffy to get an accurate translation, though I don't understand why current version of clocksource_timebase.shift could be constant... Signed-off-by: Tsutomu OWA -- owa --- arch/powerpc/kernel/time.c | 3 +++ 1 file changed, 3 insertions(+) Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -926,6 +926,9 @@ static int __init init_timebase_clocksou if (__USE_RTC()) return -ENODEV; +#ifdef CONFIG_PPC64 + clocksource_timebase.shift = tb_ticks_per_jiffy / 1000000; +#endif clocksource_timebase.mult = clocksource_hz2mult(tb_ticks_per_sec, clocksource_timebase.shift); return clocksource_register(&clocksource_timebase); patches/preempt-realtime-loopback.patch0000664000077200007720000000065510646635215017624 0ustar mingomingo--- drivers/net/loopback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/drivers/net/loopback.c =================================================================== --- linux-rt.q.orig/drivers/net/loopback.c +++ linux-rt.q/drivers/net/loopback.c @@ -159,7 +159,7 @@ static int loopback_xmit(struct sk_buff lb_stats->packets++; put_cpu(); - netif_rx(skb); + netif_rx_ni(skb); return 0; } patches/acpi-remove-the-useless-ifdef-code.patch0000664000077200007720000000334710646635211021206 0ustar mingomingoSubject: ACPI: remove the now unused ifdef code The conversion of x86-64 to clock events makes the #ifdef CONFIG_GENERIC_CLOCKEVENTS n the timer broadcast functions useless. Remove it. 
Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Cc: john stultz Cc: Andi Kleen Signed-off-by: Andrew Morton --- drivers/acpi/processor_idle.c | 12 ------------ 1 file changed, 12 deletions(-) Index: linux-rt.q/drivers/acpi/processor_idle.c =================================================================== --- linux-rt.q.orig/drivers/acpi/processor_idle.c +++ linux-rt.q/drivers/acpi/processor_idle.c @@ -203,21 +203,12 @@ static void acpi_timer_check_state(int s static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) { -#ifdef CONFIG_GENERIC_CLOCKEVENTS unsigned long reason; reason = pr->power.timer_broadcast_on_state < INT_MAX ? CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF; clockevents_notify(reason, &pr->id); -#else - cpumask_t mask = cpumask_of_cpu(pr->id); - - if (pr->power.timer_broadcast_on_state < INT_MAX) - on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1); - else - on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1); -#endif } /* Power(C) State timer broadcast control */ @@ -225,8 +216,6 @@ static void acpi_state_timer_broadcast(s struct acpi_processor_cx *cx, int broadcast) { -#ifdef CONFIG_GENERIC_CLOCKEVENTS - int state = cx - pr->power.states; if (state >= pr->power.timer_broadcast_on_state) { @@ -236,7 +225,6 @@ static void acpi_state_timer_broadcast(s CLOCK_EVT_NOTIFY_BROADCAST_EXIT; clockevents_notify(reason, &pr->id); } -#endif } #else patches/rt-mutex-x86-64.patch0000664000077200007720000004176310646635214015223 0ustar mingomingo--- arch/x86_64/Kconfig | 15 ++++--- arch/x86_64/kernel/entry.S | 18 ++++----- arch/x86_64/kernel/tsc_sync.c | 2 - arch/x86_64/kernel/vsyscall.c | 4 +- arch/x86_64/kernel/x8664_ksyms.c | 10 +++-- arch/x86_64/lib/thunk.S | 12 +++--- include/asm-x86_64/semaphore.h | 69 +++++++++++++++++++++--------------- include/asm-x86_64/spinlock.h | 28 +++++++------- include/asm-x86_64/spinlock_types.h | 4 +- include/asm-x86_64/thread_info.h | 2 + 10 files changed, 92 insertions(+), 72 deletions(-) Index: 
linux-rt.q/arch/x86_64/Kconfig =================================================================== --- linux-rt.q.orig/arch/x86_64/Kconfig +++ linux-rt.q/arch/x86_64/Kconfig @@ -78,13 +78,6 @@ config ISA config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_HWEIGHT bool default y @@ -361,6 +354,14 @@ config NUMA If the system is EM64T, you should say N unless your system is EM64T NUMA. +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + bool + config K8_NUMA bool "Old style AMD Opteron NUMA detection" depends on NUMA && PCI Index: linux-rt.q/arch/x86_64/kernel/entry.S =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/entry.S +++ linux-rt.q/arch/x86_64/kernel/entry.S @@ -310,8 +310,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal TRACE_IRQS_ON sti pushq %rdi @@ -334,7 +334,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli @@ -389,8 +389,8 @@ int_with_check: /* First do a reschedule test. 
*/ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful TRACE_IRQS_ON sti pushq %rdi @@ -425,7 +425,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -629,8 +629,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal TRACE_IRQS_ON sti pushq %rdi @@ -656,7 +656,7 @@ retint_signal: RESTORE_REST cli TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check Index: linux-rt.q/arch/x86_64/kernel/tsc_sync.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/tsc_sync.c +++ linux-rt.q/arch/x86_64/kernel/tsc_sync.c @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata __raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; Index: linux-rt.q/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/vsyscall.c +++ linux-rt.q/arch/x86_64/kernel/vsyscall.c @@ -58,7 +58,7 @@ * Try to keep this structure as small as possible to avoid cache line ping pongs */ struct vsyscall_gtod_data_t { - seqlock_t lock; + raw_seqlock_t lock; /* open coded 'struct timespec' */ time_t 
wall_time_sec; @@ -78,7 +78,7 @@ int __vgetcpu_mode __section_vgetcpu_mod struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = { - .lock = SEQLOCK_UNLOCKED, + .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; Index: linux-rt.q/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/x8664_ksyms.c +++ linux-rt.q/arch/x86_64/kernel/x8664_ksyms.c @@ -11,10 +11,12 @@ EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); Index: linux-rt.q/arch/x86_64/lib/thunk.S =================================================================== --- linux-rt.q.orig/arch/x86_64/lib/thunk.S +++ linux-rt.q/arch/x86_64/lib/thunk.S @@ -40,11 +40,13 @@ thunk rwsem_wake_thunk,rwsem_wake thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif #ifdef CONFIG_TRACE_IRQFLAGS /* put return address in rdi (arg1) */ Index: linux-rt.q/include/asm-x86_64/semaphore.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/semaphore.h +++ linux-rt.q/include/asm-x86_64/semaphore.h @@ -5,6 
+5,10 @@ #ifdef __KERNEL__ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + /* * SMP- and interrupt-safe semaphores.. * @@ -43,29 +47,34 @@ #include #include -struct semaphore { +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) + +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) -#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,0) -static inline void sema_init (struct semaphore *sem, int val) +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { /* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); + * *sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val); * * i'd rather use the more flexible initialization above, but sadly * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. 
@@ -75,32 +84,33 @@ static inline void sema_init (struct sem init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); +asmlinkage void __compat_down_failed(void /* special register calling convention */); +asmlinkage int __compat_down_failed_interruptible(void /* params in registers */); +asmlinkage int __compat_down_failed_trylock(void /* params in registers */); +asmlinkage void __compat_up_wakeup(void /* special register calling convention */); -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); +asmlinkage void __compat_down(struct compat_semaphore * sem); +asmlinkage int __compat_down_interruptible(struct compat_semaphore * sem); +asmlinkage int __compat_down_trylock(struct compat_semaphore * sem); +asmlinkage void __compat_up(struct compat_semaphore * sem); +asmlinkage int compat_sem_is_locked(struct compat_semaphore *sem); /* * This is ugly, but we want the default case to fall through. * "__down_failed" is a special asm handler that calls the C * routine that actually waits. 
See arch/x86_64/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -108,7 +118,7 @@ static inline void down(struct semaphore "# atomic down operation\n\t" LOCK_PREFIX "decl %0\n\t" /* --sem->count */ "jns 1f\n\t" - "call __down_failed\n" + "call __compat_down_failed\n" "1:" :"=m" (sem->count) :"D" (sem) @@ -119,7 +129,7 @@ static inline void down(struct semaphore * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR */ -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int result; @@ -130,7 +140,7 @@ static inline int down_interruptible(str "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" - "call __down_failed_interruptible\n" + "call __compat_down_failed_interruptible\n" "2:\n" :"=&a" (result), "=m" (sem->count) :"D" (sem) @@ -142,7 +152,7 @@ static inline int down_interruptible(str * Non-blockingly attempt to down() a semaphore. * Returns zero if we acquired it */ -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { int result; @@ -151,7 +161,7 @@ static inline int down_trylock(struct se "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" - "call __down_failed_trylock\n\t" + "call __compat_down_failed_trylock\n\t" "2:\n" :"=&a" (result), "=m" (sem->count) :"D" (sem) @@ -165,17 +175,20 @@ static inline int down_trylock(struct se * The default case (no contention) will result in NO * jumps for both down() and up(). 
*/ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __asm__ __volatile__( "# atomic up operation\n\t" LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ "jg 1f\n\t" - "call __up_wakeup\n" + "call __compat_up_wakeup\n" "1:" :"=m" (sem->count) :"D" (sem) :"memory"); } + +#include + #endif /* __KERNEL__ */ #endif Index: linux-rt.q/include/asm-x86_64/spinlock.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/spinlock.h +++ linux-rt.q/include/asm-x86_64/spinlock.h @@ -17,12 +17,12 @@ * (the type definitions are in asm/spinlock_types.h) */ -static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +static inline int __raw_spin_is_locked(__raw_spinlock_t *lock) { return *(volatile signed int *)(&(lock)->slock) <= 0; } -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { asm volatile( "\n1:\t" @@ -40,7 +40,7 @@ static inline void __raw_spin_lock(raw_s * Same as __raw_spin_lock, but reenable interrupts during spinning. 
*/ #ifndef CONFIG_PROVE_LOCKING -static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +static inline void __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { asm volatile( "\n1:\t" @@ -65,7 +65,7 @@ static inline void __raw_spin_lock_flags } #endif -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { int oldval; @@ -77,12 +77,12 @@ static inline int __raw_spin_trylock(raw return oldval > 0; } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { asm volatile("movl $1,%0" :"=m" (lock->slock) :: "memory"); } -static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +static inline void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (__raw_spin_is_locked(lock)) cpu_relax(); @@ -102,17 +102,17 @@ static inline void __raw_spin_unlock_wai * with the high bit (sign) being the "contended" bit. */ -static inline int __raw_read_can_lock(raw_rwlock_t *lock) +static inline int __raw_read_can_lock(__raw_rwlock_t *lock) { return (int)(lock)->lock > 0; } -static inline int __raw_write_can_lock(raw_rwlock_t *lock) +static inline int __raw_write_can_lock(__raw_rwlock_t *lock) { return (lock)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void __raw_read_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t" "jns 1f\n" @@ -121,7 +121,7 @@ static inline void __raw_read_lock(raw_r ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void __raw_write_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "subl %1,(%0)\n\t" "jz 1f\n" @@ -130,7 +130,7 @@ static inline void __raw_write_lock(raw_ ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *lock) +static inline int __raw_read_trylock(__raw_rwlock_t *lock) { atomic_t 
*count = (atomic_t *)lock; atomic_dec(count); @@ -140,7 +140,7 @@ static inline int __raw_read_trylock(raw return 0; } -static inline int __raw_write_trylock(raw_rwlock_t *lock) +static inline int __raw_write_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; if (atomic_sub_and_test(RW_LOCK_BIAS, count)) @@ -149,12 +149,12 @@ static inline int __raw_write_trylock(ra return 0; } -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void __raw_read_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " ; incl %0" :"=m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void __raw_write_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " ; addl $" RW_LOCK_BIAS_STR ",%0" : "=m" (rw->lock) : : "memory"); Index: linux-rt.q/include/asm-x86_64/spinlock_types.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/spinlock_types.h +++ linux-rt.q/include/asm-x86_64/spinlock_types.h @@ -7,13 +7,13 @@ typedef struct { unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 1 } typedef struct { unsigned int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } Index: linux-rt.q/include/asm-x86_64/thread_info.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/thread_info.h +++ linux-rt.q/include/asm-x86_64/thread_info.h @@ -112,6 +112,7 @@ static inline struct thread_info *stack_ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_IRET 5 /* force IRET */ +#define TIF_NEED_RESCHED_DELAYED 6 /* reschedul on return to userspace */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ @@ -133,6 +134,7 @@ static inline 
struct thread_info *stack_ #define _TIF_SYSCALL_AUDIT (1< -- owa --- arch/powerpc/kernel/entry_64.S | 5 +++++ 1 file changed, 5 insertions(+) Index: linux-rt.q/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/entry_64.S +++ linux-rt.q/arch/powerpc/kernel/entry_64.S @@ -578,6 +578,11 @@ do_work: user_work: #endif + /* here we are preempting the current task */ + li r0,1 + stb r0,PACASOFTIRQEN(r13) + stb r0,PACAHARDIRQEN(r13) + /* Enable interrupts */ ori r10,r10,MSR_EE mtmsrd r10,1 patches/RT_utsname.patch0000664000077200007720000000247510646635217014645 0ustar mingomingo--- init/Makefile | 2 +- scripts/mkcompile_h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) Index: linux-rt.q/init/Makefile =================================================================== --- linux-rt.q.orig/init/Makefile +++ linux-rt.q/init/Makefile @@ -30,4 +30,4 @@ $(obj)/version.o: include/linux/compile. include/linux/compile.h: FORCE @echo ' CHK $@' $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(CFLAGS)" + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT)" "$(CC) $(CFLAGS)" Index: linux-rt.q/scripts/mkcompile_h =================================================================== --- linux-rt.q.orig/scripts/mkcompile_h +++ linux-rt.q/scripts/mkcompile_h @@ -2,7 +2,8 @@ TARGET=$1 ARCH=$2 SMP=$3 PREEMPT=$4 -CC=$5 +PREEMPT_RT=$5 +CC=$6 # If compile.h exists already and we don't own autoconf.h # (i.e. 
we're not the same user who did make *config), don't @@ -43,6 +44,7 @@ UTS_VERSION="#$VERSION" CONFIG_FLAGS="" if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi +if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" # Truncate to maximum length patches/Allocate-RTSJ-memory-for-TCK-conformance-test.patch0000664000077200007720000001163110646635216023037 0ustar mingomingoAllocate RTSJ memory for TCK conformance test. From: Theodore Ts'o This kernel message allocates memory which is required by the real-time TCK conformance test which tests the JVM's RTSJ implementation. Unfortunately, RTSJ requires that Java programs have direct access to physical memory. This kernel reserves memory which can then be used by an external /dev/rmem loadable kernel module. Signed-off-by: "Theodore Ts'o" --- drivers/char/Kconfig | 7 +++ drivers/char/Makefile | 2 drivers/char/alloc_rtsj_mem.c | 88 ++++++++++++++++++++++++++++++++++++++++++ init/main.c | 7 +++ 4 files changed, 104 insertions(+) Index: linux-rt.q/drivers/char/Kconfig =================================================================== --- linux-rt.q.orig/drivers/char/Kconfig +++ linux-rt.q/drivers/char/Kconfig @@ -1126,6 +1126,13 @@ config RMEM patching /dev/mem because we don't expect this functionality to ever be accepted into mainline. +config ALLOC_RTSJ_MEM + tristate "RTSJ-specific hack to reserve memory" + default m + help + The RTSJ TCK conformance test requires reserving some physical + memory for testing /dev/rmem. 
+ config DEVPORT bool depends on !M68K Index: linux-rt.q/drivers/char/Makefile =================================================================== --- linux-rt.q.orig/drivers/char/Makefile +++ linux-rt.q/drivers/char/Makefile @@ -108,6 +108,8 @@ obj-$(CONFIG_IPMI_HANDLER) += ipmi/ obj-$(CONFIG_HANGCHECK_TIMER) += hangcheck-timer.o obj-$(CONFIG_TCG_TPM) += tpm/ +obj-$(CONFIG_ALLOC_RTSJ_MEM) += alloc_rtsj_mem.o + # Files generated that shall be removed upon make clean clean-files := consolemap_deftbl.c defkeymap.c Index: linux-rt.q/drivers/char/alloc_rtsj_mem.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/char/alloc_rtsj_mem.c @@ -0,0 +1,88 @@ +/* + * alloc_rtsj_mem.c -- Hack to allocate some memory + * + * Copyright (C) 2005 by Theodore Ts'o + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +MODULE_AUTHOR("Theodore Tso"); +MODULE_DESCRIPTION("RTSJ alloc memory"); +MODULE_LICENSE("GPL"); + +static void *mem = 0; +int size = 0, addr = 0; + +module_param(size, int, 0444); +module_param(addr, int, 0444); + +static void __exit shutdown_module(void) +{ + kfree(mem); +} + +#ifndef MODULE +void __init alloc_rtsj_mem_early_setup(void) +{ + if (size > PAGE_SIZE*2) { + mem = alloc_bootmem(size); + if (mem) { + printk(KERN_INFO "alloc_rtsj_mem: got %d bytes " + "using alloc_bootmem\n", size); + } else { + printk(KERN_INFO "alloc_rtsj_mem: failed to " + "get %d bytes from alloc_bootmem\n", size); + } + } +} +#endif + +static int __init startup_module(void) +{ + static char test_string[] = "The BOFH: Servicing users the way the " + "military\n\tservices targets for 15 years.\n"; + + if (!size) + return 0; + + if (!mem) { + mem = kmalloc(size, GFP_KERNEL); + if (mem) { + printk(KERN_INFO "alloc_rtsj_mem: got %d bytes " + "using kmalloc\n", size); + } else { + printk(KERN_ERR "alloc_rtsj_mem: failed to get " + "%d bytes using kmalloc\n", size); + return -ENOMEM; + } + } + memcpy(mem, test_string, min(sizeof(test_string), (size_t) size)); + addr = virt_to_phys(mem); + return 0; +} + +module_init(startup_module); +module_exit(shutdown_module); + Index: linux-rt.q/init/main.c =================================================================== --- linux-rt.q.orig/init/main.c +++ linux-rt.q/init/main.c @@ -105,6 +105,12 @@ static inline void acpi_early_init(void) #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { } #endif +#ifdef CONFIG_ALLOC_RTSJ_MEM +extern void alloc_rtsj_mem_early_setup(void); +#else +static inline void alloc_rtsj_mem_early_setup(void) { } 
+#endif + #ifdef CONFIG_TC extern void tc_init(void); @@ -604,6 +610,7 @@ asmlinkage void __init start_kernel(void #endif vfs_caches_init_early(); cpuset_init_early(); + alloc_rtsj_mem_early_setup(); mem_init(); kmem_cache_init(); setup_per_cpu_pageset(); patches/i386-remove-volatile-in-apicc.patch0000664000077200007720000000230010646635210020022 0ustar mingomingoSubject: i386: remove volatile in apic.c Remove the volatile in apic. We have a cpu_relax() in the wait loop. Fix a coding style issue while at it. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/i386/kernel/apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/i386/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/apic.c +++ linux-rt.q/arch/i386/kernel/apic.c @@ -318,7 +318,7 @@ static void __devinit setup_APIC_timer(v #define LAPIC_CAL_LOOPS (HZ/10) -static __initdata volatile int lapic_cal_loops = -1; +static __initdata int lapic_cal_loops = -1; static __initdata long lapic_cal_t1, lapic_cal_t2; static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; @@ -488,7 +488,7 @@ void __init setup_boot_APIC_clock(void) /* Let the interrupts run */ local_irq_enable(); - while(lapic_cal_loops <= LAPIC_CAL_LOOPS) + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) cpu_relax(); local_irq_disable(); patches/preempt-realtime-compile-fixes.patch0000664000077200007720000000111410646635215020565 0ustar mingomingo--- drivers/block/paride/pseudo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/drivers/block/paride/pseudo.h =================================================================== --- linux-rt.q.orig/drivers/block/paride/pseudo.h +++ linux-rt.q/drivers/block/paride/pseudo.h @@ -43,7 +43,7 @@ static unsigned long ps_timeout; static int ps_tq_active = 0; static int ps_nice = 0; 
-static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int); patches/preempt-realtime-arm-integrator.patch0000664000077200007720000000204510646635214020757 0ustar mingomingo--- arch/arm/mach-integrator/core.c | 2 +- arch/arm/mach-integrator/pci_v3.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/arm/mach-integrator/core.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-integrator/core.c +++ linux-rt.q/arch/arm/mach-integrator/core.c @@ -164,7 +164,7 @@ static struct amba_pl010_data integrator #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_RAW_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. Index: linux-rt.q/arch/arm/mach-integrator/pci_v3.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-integrator/pci_v3.c +++ linux-rt.q/arch/arm/mach-integrator/pci_v3.c @@ -162,7 +162,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_RAW_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M patches/preempt-realtime-arm.patch0000664000077200007720000001467710646635214016621 0ustar mingomingo--- arch/arm/kernel/dma.c | 2 +- arch/arm/kernel/irq.c | 2 +- arch/arm/kernel/signal.c | 8 ++++++++ arch/arm/kernel/smp.c | 2 +- arch/arm/kernel/traps.c | 4 ++-- arch/arm/mm/consistent.c | 2 +- arch/arm/mm/copypage-v4mc.c | 2 +- arch/arm/mm/copypage-v6.c | 2 +- arch/arm/mm/copypage-xscale.c | 2 +- arch/arm/mm/mmu.c | 2 +- include/asm-arm/dma.h | 2 +- include/asm-arm/tlb.h | 9 ++++++--- 12 files changed, 25 insertions(+), 14 deletions(-) Index: linux-rt.q/arch/arm/kernel/dma.c =================================================================== --- 
linux-rt.q.orig/arch/arm/kernel/dma.c +++ linux-rt.q/arch/arm/kernel/dma.c @@ -20,7 +20,7 @@ #include -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_RAW_SPINLOCK(dma_spin_lock); EXPORT_SYMBOL(dma_spin_lock); static dma_t dma_chan[MAX_DMA_CHANNELS]; Index: linux-rt.q/arch/arm/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/irq.c +++ linux-rt.q/arch/arm/kernel/irq.c @@ -100,7 +100,7 @@ unlock: /* Handle bad interrupts */ static struct irq_desc bad_irq_desc = { .handle_irq = handle_bad_irq, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock) }; /* Index: linux-rt.q/arch/arm/kernel/signal.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/signal.c +++ linux-rt.q/arch/arm/kernel/signal.c @@ -623,6 +623,14 @@ static int do_signal(sigset_t *oldset, s siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux-rt.q/arch/arm/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/smp.c +++ linux-rt.q/arch/arm/kernel/smp.c @@ -521,7 +521,7 @@ static void ipi_call_function(unsigned i cpu_clear(cpu, data->unfinished); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() Index: linux-rt.q/arch/arm/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/traps.c +++ linux-rt.q/arch/arm/kernel/traps.c @@ -234,7 +234,7 @@ static void __die(const char *str, int e } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. 
@@ -276,7 +276,7 @@ void arm_notify_die(const char *str, str } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { Index: linux-rt.q/arch/arm/mm/consistent.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/consistent.c +++ linux-rt.q/arch/arm/mm/consistent.c @@ -40,7 +40,7 @@ * These are the page tables (2MB each) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux-rt.q/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/copypage-v4mc.c +++ linux-rt.q/arch/arm/mm/copypage-v4mc.c @@ -30,7 +30,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page Index: linux-rt.q/arch/arm/mm/copypage-v6.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/copypage-v6.c +++ linux-rt.q/arch/arm/mm/copypage-v6.c @@ -26,7 +26,7 @@ #define from_address (0xffff8000) #define to_address (0xffffc000) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_RAW_SPINLOCK(v6_lock); /* * Copy the user page. 
No aliasing to deal with so we can just Index: linux-rt.q/arch/arm/mm/copypage-xscale.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/copypage-xscale.c +++ linux-rt.q/arch/arm/mm/copypage-xscale.c @@ -32,7 +32,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page Index: linux-rt.q/arch/arm/mm/mmu.c =================================================================== --- linux-rt.q.orig/arch/arm/mm/mmu.c +++ linux-rt.q/arch/arm/mm/mmu.c @@ -25,7 +25,7 @@ #include "mm.h" -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern void _stext, _etext, __data_start, _end; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; Index: linux-rt.q/include/asm-arm/dma.h =================================================================== --- linux-rt.q.orig/include/asm-arm/dma.h +++ linux-rt.q/include/asm-arm/dma.h @@ -27,7 +27,7 @@ typedef unsigned int dmamode_t; #define DMA_MODE_CASCADE 2 #define DMA_AUTOINIT 4 -extern spinlock_t dma_spin_lock; +extern raw_spinlock_t dma_spin_lock; static inline unsigned long claim_dma_lock(void) { Index: linux-rt.q/include/asm-arm/tlb.h =================================================================== --- linux-rt.q.orig/include/asm-arm/tlb.h +++ linux-rt.q/include/asm-arm/tlb.h @@ -36,15 +36,18 @@ struct mmu_gather { struct mm_struct *mm; unsigned int fullmm; + int cpu; }; -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); static inline struct mmu_gather * tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + int cpu; + struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu); + tlb->cpu = cpu; tlb->mm = mm; tlb->fullmm = full_mm_flush; @@ -60,7 
+63,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + put_cpu_var_locked(mmu_gathers, tlb->cpu); } #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) patches/futex-tidy-up-the-code-v2.patch0000664000077200007720000002643010646635210017311 0ustar mingomingoFrom: Thomas Gleixner The recent PRIVATE and REQUEUE_PI changes to the futex code made it hard to read. Tidy it up. Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton --- kernel/futex.c | 138 ++++++++++++++++++++++-------------------------- kernel/rtmutex-debug.c | 6 -- kernel/rtmutex.c | 6 -- kernel/rtmutex_common.h | 9 ++- 4 files changed, 74 insertions(+), 85 deletions(-) Index: linux-rt.q/kernel/futex.c =================================================================== --- linux-rt.q.orig/kernel/futex.c +++ linux-rt.q/kernel/futex.c @@ -121,6 +121,24 @@ static struct futex_hash_bucket futex_qu static struct vfsmount *futex_mnt; /* + * Take mm->mmap_sem, when futex is shared + */ +static inline void futex_lock_mm(struct rw_semaphore *fshared) +{ + if (fshared) + down_read(fshared); +} + +/* + * Release mm->mmap_sem, when the futex is shared + */ +static inline void futex_unlock_mm(struct rw_semaphore *fshared) +{ + if (fshared) + up_read(fshared); +} + +/* * We hash on the keys returned from get_futex_key (see below). 
*/ static struct futex_hash_bucket *hash_futex(union futex_key *key) @@ -287,7 +305,18 @@ void drop_futex_key_refs(union futex_key } EXPORT_SYMBOL_GPL(drop_futex_key_refs); -static inline int get_futex_value_locked(u32 *dest, u32 __user *from) +static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +{ + u32 curval; + + pagefault_disable(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + pagefault_enable(); + + return curval; +} + +static int get_futex_value_locked(u32 *dest, u32 __user *from) { int ret; @@ -620,9 +649,7 @@ static int wake_futex_pi(u32 __user *uad newval = FUTEX_WAITERS | new_owner->pid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (curval == -EFAULT) ret = -EFAULT; @@ -659,9 +686,7 @@ static int unlock_futex_pi(u32 __user *u * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. 
We are the owner: */ - pagefault_disable(); - oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); - pagefault_enable(); + oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); if (oldval == -EFAULT) return oldval; @@ -700,8 +725,7 @@ static int futex_wake(u32 __user *uaddr, union futex_key key; int ret; - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -725,8 +749,7 @@ static int futex_wake(u32 __user *uaddr, spin_unlock(&hb->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -746,8 +769,7 @@ futex_wake_op(u32 __user *uaddr1, struct int ret, op_ret, attempt = 0; retryfull: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) @@ -793,7 +815,7 @@ retry: */ if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr2, - fshared, attempt); + fshared, attempt); if (ret) goto out; goto retry; @@ -803,8 +825,7 @@ retry: * If we would have faulted, release mmap_sem, * fault it in and start all over again. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(dummy, uaddr2); if (ret) @@ -841,8 +862,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); + return ret; } @@ -861,8 +882,7 @@ static int futex_requeue(u32 __user *uad int ret, drop_count = 0; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) @@ -890,8 +910,7 @@ static int futex_requeue(u32 __user *uad * If we would have faulted, release mmap_sem, fault * it in and start all over again. 
*/ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(curval, uaddr1); @@ -944,8 +963,7 @@ out_unlock: drop_futex_key_refs(&key1); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -1113,10 +1131,7 @@ static int fixup_pi_state_owner(u32 __us while (!ret) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (curval == -EFAULT) ret = -EFAULT; @@ -1134,6 +1149,7 @@ static int fixup_pi_state_owner(u32 __us #define ARG3_SHARED 1 static long futex_wait_restart(struct restart_block *restart); + static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, u32 val, ktime_t *abs_time) { @@ -1148,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, q.pi_state = NULL; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1186,8 +1201,7 @@ static int futex_wait(u32 __user *uaddr, * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); @@ -1206,8 +1220,7 @@ static int futex_wait(u32 __user *uaddr, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. 
*/ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); /* * There might have been scheduling since the queue_me(), as we @@ -1285,8 +1298,7 @@ static int futex_wait(u32 __user *uaddr, queue_unlock(&q, hb); out_release_sem: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; } @@ -1333,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uad q.pi_state = NULL; retry: - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1353,9 +1364,7 @@ static int futex_lock_pi(u32 __user *uad */ newval = current->pid; - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1398,9 +1407,7 @@ static int futex_lock_pi(u32 __user *uad lock_taken = 1; } - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - pagefault_enable(); + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1428,8 +1435,7 @@ static int futex_lock_pi(u32 __user *uad * exit to complete. */ queue_unlock(&q, hb); - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); cond_resched(); goto retry; @@ -1465,8 +1471,7 @@ static int futex_lock_pi(u32 __user *uad * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. */ - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); WARN_ON(!q.pi_state); /* @@ -1480,8 +1485,7 @@ static int futex_lock_pi(u32 __user *uad ret = ret ? 0 : -EWOULDBLOCK; } - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); spin_lock(q.lock_ptr); if (!ret) { @@ -1518,8 +1522,7 @@ static int futex_lock_pi(u32 __user *uad /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret != -EINTR ? 
ret : -ERESTARTNOINTR; @@ -1527,8 +1530,7 @@ static int futex_lock_pi(u32 __user *uad queue_unlock(&q, hb); out_release_sem: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; uaddr_faulted: @@ -1550,8 +1552,7 @@ static int futex_lock_pi(u32 __user *uad goto retry_unlocked; } - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1585,8 +1586,7 @@ retry: /* * First take all the futex related locks: */ - if (fshared) - down_read(fshared); + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -1601,11 +1601,9 @@ retry_unlocked: * again. If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) { - pagefault_disable(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - pagefault_enable(); - } + if (!(uval & FUTEX_OWNER_DIED)) + uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); + if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1647,8 +1645,7 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); out: - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); return ret; @@ -1671,8 +1668,7 @@ pi_faulted: goto retry_unlocked; } - if (fshared) - up_read(fshared); + futex_unlock_mm(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1729,8 +1725,8 @@ static int futex_fd(u32 __user *uaddr, i if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " - "will be removed from the kernel in June 2007\n", - current->comm); + "will be removed from the kernel in June 2007\n", + current->comm); } ret = -EINVAL; @@ -1908,10 +1904,8 @@ retry: * Wake robust non-PI futexes here. 
The wakeup of * PI futexes happens in exit_pi_state(): */ - if (!pi) { - if (uval & FUTEX_WAITERS) + if (!pi && (uval & FUTEX_WAITERS)) futex_wake(uaddr, &curr->mm->mmap_sem, 1); - } } return 0; } Index: linux-rt.q/kernel/rtmutex-debug.c =================================================================== --- linux-rt.q.orig/kernel/rtmutex-debug.c +++ linux-rt.q/kernel/rtmutex-debug.c @@ -29,12 +29,6 @@ #include "rtmutex_common.h" -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif - # define TRACE_WARN_ON(x) WARN_ON(x) # define TRACE_BUG_ON(x) BUG_ON(x) Index: linux-rt.q/kernel/rtmutex.c =================================================================== --- linux-rt.q.orig/kernel/rtmutex.c +++ linux-rt.q/kernel/rtmutex.c @@ -17,12 +17,6 @@ #include "rtmutex_common.h" -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif - /* * lock->owner state tracking: * Index: linux-rt.q/kernel/rtmutex_common.h =================================================================== --- linux-rt.q.orig/kernel/rtmutex_common.h +++ linux-rt.q/kernel/rtmutex_common.h @@ -103,7 +103,7 @@ static inline struct task_struct *rt_mut static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) { - return (struct task_struct *) + return (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } @@ -120,4 +120,11 @@ extern void rt_mutex_init_proxy_locked(s struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + #endif patches/ich-force-hpet-ich5-quirk-to-force-detect-enable.patch0000664000077200007720000001240210646635211023527 0ustar mingomingoFrom: Venki Pallipadi force_enable hpet for ICH5. 
Signed-off-by: Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andi Kleen Cc: john stultz Cc: Greg KH Signed-off-by: Andrew Morton --- arch/i386/kernel/hpet.c | 2 arch/i386/kernel/quirks.c | 101 +++++++++++++++++++++++++++++++++++++++++++++- include/asm-i386/hpet.h | 2 include/linux/pci_ids.h | 1 4 files changed, 103 insertions(+), 3 deletions(-) Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -181,7 +181,7 @@ static void hpet_start_counter(void) static void hpet_resume_device(void) { - ich_force_hpet_resume(); + force_hpet_resume(); } static void hpet_restart_counter(void) Index: linux-rt.q/arch/i386/kernel/quirks.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/quirks.c +++ linux-rt.q/arch/i386/kernel/quirks.c @@ -53,9 +53,15 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_IN #if defined(CONFIG_HPET_TIMER) unsigned long force_hpet_address; +static enum { + NONE_FORCE_HPET_RESUME, + OLD_ICH_FORCE_HPET_RESUME, + ICH_FORCE_HPET_RESUME +} force_hpet_resume_type; + static void __iomem *rcba_base; -void ich_force_hpet_resume(void) +static void ich_force_hpet_resume(void) { u32 val; @@ -133,6 +139,7 @@ static void ich_force_enable_hpet(struct iounmap(rcba_base); printk(KERN_DEBUG "Failed to force enable HPET\n"); } else { + force_hpet_resume_type = ICH_FORCE_HPET_RESUME; printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", force_hpet_address); } @@ -148,4 +155,96 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); + + +static struct pci_dev *cached_dev; + +static void old_ich_force_hpet_resume(void) +{ + u32 val, gen_cntl; + + if (!force_hpet_address || !cached_dev) + return; + + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + 
gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + + pci_write_config_dword(cached_dev, 0xD0, gen_cntl); + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + val = gen_cntl >> 15; + val &= 0x7; + if (val == 0x4) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void old_ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val, gen_cntl; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + /* + * Bit 17 is HPET enable bit. + * Bit 16:15 control the HPET base address. + */ + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "HPET at base address 0x%lx\n", + force_hpet_address); + cached_dev = dev; + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + pci_write_config_dword(dev, 0xD0, gen_cntl); + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + /* HPET is enabled in HPTC. 
Just not reported by BIOS */ + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; + return; + } + + printk(KERN_DEBUG "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, + old_ich_force_enable_hpet); + +void force_hpet_resume(void) +{ + switch (force_hpet_resume_type) { + case ICH_FORCE_HPET_RESUME: + return ich_force_hpet_resume(); + + case OLD_ICH_FORCE_HPET_RESUME: + return old_ich_force_hpet_resume(); + + default: + break; + } +} + #endif Index: linux-rt.q/include/asm-i386/hpet.h =================================================================== --- linux-rt.q.orig/include/asm-i386/hpet.h +++ linux-rt.q/include/asm-i386/hpet.h @@ -68,7 +68,7 @@ extern unsigned long force_hpet_address; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern unsigned long hpet_readl(unsigned long a); -extern void ich_force_hpet_resume(void); +extern void force_hpet_resume(void); #ifdef CONFIG_HPET_EMULATE_RTC Index: linux-rt.q/include/linux/pci_ids.h =================================================================== --- linux-rt.q.orig/include/linux/pci_ids.h +++ linux-rt.q/include/linux/pci_ids.h @@ -2221,6 +2221,7 @@ #define PCI_DEVICE_ID_INTEL_82801EB_5 0x24d5 #define PCI_DEVICE_ID_INTEL_82801EB_6 0x24d6 #define PCI_DEVICE_ID_INTEL_82801EB_11 0x24db +#define PCI_DEVICE_ID_INTEL_82801EB_12 0x24dc #define PCI_DEVICE_ID_INTEL_82801EB_13 0x24dd #define PCI_DEVICE_ID_INTEL_ESB_1 0x25a1 #define PCI_DEVICE_ID_INTEL_ESB_2 0x25a2 patches/preempt-realtime-powerpc-update.patch0000664000077200007720000000405510646635214020766 0ustar mingomingo--- arch/powerpc/Kconfig.debug | 4 ++++ arch/powerpc/kernel/idle.c | 2 +- include/asm-powerpc/hw_irq.h | 2 +- 
include/asm-powerpc/pmac_feature.h | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) Index: linux-rt.q/arch/powerpc/Kconfig.debug =================================================================== --- linux-rt.q.orig/arch/powerpc/Kconfig.debug +++ linux-rt.q/arch/powerpc/Kconfig.debug @@ -2,6 +2,10 @@ menu "Kernel hacking" source "lib/Kconfig.debug" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL Index: linux-rt.q/arch/powerpc/kernel/idle.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/idle.c +++ linux-rt.q/arch/powerpc/kernel/idle.c @@ -100,7 +100,7 @@ void cpu_idle(void) tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); preempt_disable(); } Index: linux-rt.q/include/asm-powerpc/hw_irq.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/hw_irq.h +++ linux-rt.q/include/asm-powerpc/hw_irq.h @@ -120,7 +120,7 @@ static inline void raw_local_irq_save_pt #define hard_irq_enable() local_irq_enable() #define hard_irq_disable() local_irq_disable() -#include +#include #endif /* CONFIG_PPC64 */ Index: linux-rt.q/include/asm-powerpc/pmac_feature.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/pmac_feature.h +++ linux-rt.q/include/asm-powerpc/pmac_feature.h @@ -378,7 +378,7 @@ extern struct macio_chip* macio_find(str * Those are exported by pmac feature for internal use by arch code * only like the platform function callbacks, do not use directly in drivers */ -extern spinlock_t feature_lock; +extern raw_spinlock_t feature_lock; extern struct device_node *uninorth_node; extern u32 __iomem *uninorth_base; patches/preempt-realtime-cfs-accounting-fix.patch0000664000077200007720000000214510646635215021515 0ustar mingomingoSubject: [Patch RT] Fix CFS 
load balancing for RT tasks From: Sébastien Dugué The RT overload mechanism of the O(1) scheduler has not been activated in the new CFS. This patch fixes that by inserting calls to inc_rt_tasks() and dec_rt_tasks() in enqueue_task_rt() and dequeue_task_rt() respectively, which enables the balance_rt_tasks() to be run in the rt_overload case. Signed-off-by: Sébastien Dugué --- kernel/sched_rt.c | 4 ++++ 1 file changed, 4 insertions(+) Index: linux-rt.q/kernel/sched_rt.c =================================================================== --- linux-rt.q.orig/kernel/sched_rt.c +++ linux-rt.q/kernel/sched_rt.c @@ -32,6 +32,8 @@ enqueue_task_rt(struct rq *rq, struct ta list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); + + inc_rt_tasks(p, rq); } /* @@ -44,6 +46,8 @@ dequeue_task_rt(struct rq *rq, struct ta update_curr_rt(rq, now); + dec_rt_tasks(p, rq); + list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); patches/rt-page_alloc.patch0000664000077200007720000001463310646635214015271 0ustar mingomingoSubject: rt-friendly per-cpu pages From: Ingo Molnar rt-friendly per-cpu pages: convert the irqs-off per-cpu locking method into a preemptible, explicit-per-cpu-locks method. 
Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 111 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 32 deletions(-) Index: linux-rt.q/mm/page_alloc.c =================================================================== --- linux-rt.q.orig/mm/page_alloc.c +++ linux-rt.q/mm/page_alloc.c @@ -136,6 +136,53 @@ static unsigned long __meminitdata dma_r #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_PREEMPT_RT +static DEFINE_PER_CPU_LOCKED(int, pcp_locks); +#endif + +static inline void __lock_cpu_pcp(unsigned long *flags, int cpu) +{ +#ifdef CONFIG_PREEMPT_RT + spin_lock(&__get_cpu_lock(pcp_locks, cpu)); + flags = 0; +#else + local_irq_save(*flags); +#endif +} + +static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + (void)get_cpu_var_locked(pcp_locks, this_cpu); + flags = 0; +#else + local_irq_save(*flags); + *this_cpu = smp_processor_id(); +#endif +} + +static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + put_cpu_var_locked(pcp_locks, this_cpu); +#else + local_irq_restore(flags); +#endif +} + +static struct per_cpu_pageset * +get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu) +{ + lock_cpu_pcp(flags, this_cpu); + return zone_pcp(zone, *this_cpu); +} + +static void +put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu) +{ + unlock_cpu_pcp(flags, this_cpu); +} + #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; EXPORT_SYMBOL(nr_node_ids); @@ -390,8 +437,8 @@ static inline int page_is_buddy(struct p * -- wli */ -static inline void __free_one_page(struct page *page, - struct zone *zone, unsigned int order) +static inline void +__free_one_page(struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; int order_size = 1 << order; @@ -501,8 +548,9 @@ static void free_one_page(struct zone *z static void 
__free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int i; int reserved = 0; + int this_cpu; + int i; for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); @@ -514,10 +562,10 @@ static void __free_pages_ok(struct page arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); - local_irq_save(flags); - __count_vm_events(PGFREE, 1 << order); + lock_cpu_pcp(&flags, &this_cpu); + count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); - local_irq_restore(flags); + unlock_cpu_pcp(flags, this_cpu); } /* @@ -685,23 +733,19 @@ static int rmqueue_bulk(struct zone *zon */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain; - local_irq_save(flags); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else to_drain = pcp->count; free_pages_bulk(zone, to_drain, &pcp->list, 0); pcp->count -= to_drain; - local_irq_restore(flags); } #endif static void __drain_pages(unsigned int cpu) { - unsigned long flags; struct zone *zone; int i; @@ -712,14 +756,16 @@ static void __drain_pages(unsigned int c continue; pset = zone_pcp(zone, cpu); + if (!pset) { + WARN_ON(1); + continue; + } for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - local_irq_save(flags); free_pages_bulk(zone, pcp->count, &pcp->list, 0); pcp->count = 0; - local_irq_restore(flags); } } } @@ -765,10 +811,11 @@ void mark_free_pages(struct zone *zone) void drain_local_pages(void) { unsigned long flags; + int this_cpu; - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); + lock_cpu_pcp(&flags, &this_cpu); + __drain_pages(this_cpu); + unlock_cpu_pcp(flags, this_cpu); } #endif /* CONFIG_PM */ @@ -778,8 +825,10 @@ void drain_local_pages(void) static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); + struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; 
unsigned long flags; + int this_cpu; if (PageAnon(page)) page->mapping = NULL; @@ -791,24 +840,25 @@ static void fastcall free_hot_cold_page( arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; - local_irq_save(flags); - __count_vm_event(PGFREE); + pset = get_zone_pcp(zone, &flags, &this_cpu); + pcp = &pset->pcp[cold]; + + count_vm_event(PGFREE); + list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) { free_pages_bulk(zone, pcp->batch, &pcp->list, 0); pcp->count -= pcp->batch; } - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); } void fastcall free_hot_page(struct page *page) { free_hot_cold_page(page, 0); } - + void fastcall free_cold_page(struct page *page) { free_hot_cold_page(page, 1); @@ -840,18 +890,17 @@ void split_page(struct page *page, unsig static struct page *buffered_rmqueue(struct zonelist *zonelist, struct zone *zone, int order, gfp_t gfp_flags) { + int cold = !!(gfp_flags & __GFP_COLD); + struct per_cpu_pageset *pset; unsigned long flags; struct page *page; - int cold = !!(gfp_flags & __GFP_COLD); - int cpu; + int this_cpu; again: - cpu = get_cpu(); + pset = get_zone_pcp(zone, &flags, &this_cpu); if (likely(order == 0)) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = &pset->pcp[cold]; - pcp = &zone_pcp(zone, cpu)->pcp[cold]; - local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); @@ -862,7 +911,7 @@ again: list_del(&page->lru); pcp->count--; } else { - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); page = __rmqueue(zone, order); spin_unlock(&zone->lock); if (!page) @@ -871,8 +920,7 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(zonelist, zone); - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -880,8 +928,7 @@ again: return page; 
failed: - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); return NULL; } patches/fix-emac-locking-2.6.16.patch0000664000077200007720000000551110646635214016420 0ustar mingomingo drivers/net/ibm_emac/ibm_emac_core.c | 11 +++++++++++ drivers/net/ibm_emac/ibm_emac_core.h | 2 ++ 2 files changed, 13 insertions(+) Index: linux-rt.q/drivers/net/ibm_emac/ibm_emac_core.c =================================================================== --- linux-rt.q.orig/drivers/net/ibm_emac/ibm_emac_core.c +++ linux-rt.q/drivers/net/ibm_emac/ibm_emac_core.c @@ -1059,6 +1059,8 @@ static inline int emac_xmit_finish(struc ++dev->stats.tx_packets; dev->stats.tx_bytes += len; + spin_unlock(&dev->tx_lock); + return 0; } @@ -1072,6 +1074,7 @@ static int emac_start_xmit(struct sk_buf u16 ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY | MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb); + spin_lock(&dev->tx_lock); slot = dev->tx_slot++; if (dev->tx_slot == NUM_TX_BUFF) { dev->tx_slot = 0; @@ -1134,6 +1137,8 @@ static int emac_start_xmit_sg(struct sk_ if (likely(!nr_frags && len <= MAL_MAX_TX_SIZE)) return emac_start_xmit(skb, ndev); + spin_lock(&dev->tx_lock); + len -= skb->data_len; /* Note, this is only an *estimation*, we can still run out of empty @@ -1202,6 +1207,7 @@ static int emac_start_xmit_sg(struct sk_ stop_queue: netif_stop_queue(ndev); DBG2("%d: stopped TX queue" NL, dev->def->index); + spin_unlock(&dev->tx_lock); return 1; } #else @@ -1241,6 +1247,7 @@ static void emac_poll_tx(void *param) DBG2("%d: poll_tx, %d %d" NL, dev->def->index, dev->tx_cnt, dev->ack_slot); + spin_lock(&dev->tx_lock); if (dev->tx_cnt) { u16 ctrl; int slot = dev->ack_slot, n = 0; @@ -1250,6 +1257,7 @@ static void emac_poll_tx(void *param) struct sk_buff *skb = dev->tx_skb[slot]; ++n; + spin_unlock(&dev->tx_lock); if (skb) { dev_kfree_skb(skb); dev->tx_skb[slot] = NULL; @@ -1259,6 +1267,7 @@ static void emac_poll_tx(void *param) if (unlikely(EMAC_IS_BAD_TX(ctrl))) 
emac_parse_tx_error(dev, ctrl); + spin_lock(&dev->tx_lock); if (--dev->tx_cnt) goto again; } @@ -1271,6 +1280,7 @@ static void emac_poll_tx(void *param) DBG2("%d: tx %d pkts" NL, dev->def->index, n); } } + spin_unlock(&dev->tx_lock); } static inline void emac_recycle_rx_skb(struct ocp_enet_private *dev, int slot, @@ -1963,6 +1973,7 @@ static int __init emac_probe(struct ocp_ dev->ldev = &ocpdev->dev; dev->def = ocpdev->def; SET_MODULE_OWNER(ndev); + spin_lock_init(&dev->tx_lock); /* Find MAL device we are connected to */ maldev = Index: linux-rt.q/drivers/net/ibm_emac/ibm_emac_core.h =================================================================== --- linux-rt.q.orig/drivers/net/ibm_emac/ibm_emac_core.h +++ linux-rt.q/drivers/net/ibm_emac/ibm_emac_core.h @@ -193,6 +193,8 @@ struct ocp_enet_private { struct ibm_emac_error_stats estats; struct net_device_stats nstats; + spinlock_t tx_lock; + struct device* ldev; }; patches/time-warp-detect.patch0000664000077200007720000001100410646635213015717 0ustar mingomingo include/linux/hrtimer.h | 1 + include/linux/time.h | 1 + kernel/time.c | 2 ++ kernel/time/ntp.c | 2 ++ kernel/time/timekeeping.c | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 42 insertions(+) Index: linux-rt.q/include/linux/hrtimer.h =================================================================== --- linux-rt.q.orig/include/linux/hrtimer.h +++ linux-rt.q/include/linux/hrtimer.h @@ -234,6 +234,7 @@ static inline ktime_t hrtimer_cb_get_tim * clock_was_set() is a NOP for non- high-resolution systems. The * time-sorted order guarantees that a timer does not expire early and * is expired in the next softirq when the clock was advanced. 
+ * (we still call the warp-check debugging code) */ static inline void clock_was_set(void) { } Index: linux-rt.q/include/linux/time.h =================================================================== --- linux-rt.q.orig/include/linux/time.h +++ linux-rt.q/include/linux/time.h @@ -126,6 +126,7 @@ s64 __get_nsec_offset(void); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_is_continuous(void); extern void update_wall_time(void); +extern void warp_check_clock_was_changed(void); /** * timespec_to_ns - Convert timespec to nanoseconds Index: linux-rt.q/kernel/time.c =================================================================== --- linux-rt.q.orig/kernel/time.c +++ linux-rt.q/kernel/time.c @@ -137,6 +137,7 @@ static inline void warp_clock(void) wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; time_interpolator_reset(); + warp_check_clock_was_changed(); write_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -351,6 +352,7 @@ int do_settimeofday (struct timespec *tv time_esterror = NTP_PHASE_LIMIT; time_interpolator_reset(); } + warp_check_clock_was_changed(); write_sequnlock_irq(&xtime_lock); clock_was_set(); return 0; Index: linux-rt.q/kernel/time/ntp.c =================================================================== --- linux-rt.q.orig/kernel/time/ntp.c +++ linux-rt.q/kernel/time/ntp.c @@ -123,6 +123,7 @@ void second_overflow(void) */ time_interpolator_update(-NSEC_PER_SEC); time_state = TIME_OOP; + warp_check_clock_was_changed(); printk(KERN_NOTICE "Clock: inserting leap second " "23:59:60 UTC\n"); } @@ -137,6 +138,7 @@ void second_overflow(void) */ time_interpolator_update(NSEC_PER_SEC); time_state = TIME_WAIT; + warp_check_clock_was_changed(); printk(KERN_NOTICE "Clock: deleting leap second " "23:59:59 UTC\n"); } Index: linux-rt.q/kernel/time/timekeeping.c =================================================================== --- 
linux-rt.q.orig/kernel/time/timekeeping.c +++ linux-rt.q/kernel/time/timekeeping.c @@ -98,6 +98,16 @@ cycle_t notrace usecs_to_cycles(unsigned return ns2cyc(clock, (u64)usecs * 1000); } +static DEFINE_PER_CPU(ktime_t, timestamp); + +void warp_check_clock_was_changed(void) +{ + int cpu; + + for_each_online_cpu(cpu) + per_cpu(timestamp, cpu).tv64 = 0; +} + /** * __get_realtime_clock_ts - Returns the time of day in a timespec * @ts: pointer to the timespec to be set @@ -109,7 +119,12 @@ static inline void __get_realtime_clock_ { unsigned long seq; s64 nsecs; + unsigned long flags; + static int once = 1; + ktime_t prev, now; + int cpu; + local_irq_save(flags); do { seq = read_seqbegin(&xtime_lock); @@ -119,6 +134,25 @@ static inline void __get_realtime_clock_ } while (read_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); + + now = timespec_to_ktime(*ts); + + cpu = raw_smp_processor_id(); + prev = per_cpu(timestamp, cpu); + per_cpu(timestamp, cpu) = now; + + if (once > 0 && prev.tv64 > now.tv64) { + once--; + stop_trace(); + user_trace_stop(); + local_irq_restore(flags); + + printk("BUG: time warp detected!\n"); + printk("prev > now, %016Lx > %016Lx:\n", prev.tv64, now.tv64); + printk("= %Ld delta, on CPU#%d\n", prev.tv64 - now.tv64, cpu); + dump_stack(); + } else + local_irq_restore(flags); } /** @@ -179,6 +213,7 @@ int do_settimeofday(struct timespec *tv) ntp_clear(); update_vsyscall(&xtime, clock); + warp_check_clock_was_changed(); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -314,6 +349,7 @@ static int timekeeping_resume(struct sys clock->cycle_last = clocksource_read(clock); clock->error = 0; timekeeping_suspended = 0; + warp_check_clock_was_changed(); write_sequnlock_irqrestore(&xtime_lock, flags); touch_softlockup_watchdog(); patches/softirq-per-cpu-assumptions-fixes.patch0000664000077200007720000001271310646635216021316 0ustar mingomingo--- kernel/hrtimer.c | 38 +++++++++++++++++++++----------------- kernel/sched.c | 2 +- kernel/softirq.c | 5 +++-- 
kernel/timer.c | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) Index: linux-rt.q/kernel/hrtimer.c =================================================================== --- linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -347,9 +347,9 @@ static inline int hrtimer_is_hres_enable /* * Is the high resolution mode active ? */ -static inline int hrtimer_hres_active(void) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { - return __get_cpu_var(hrtimer_bases).hres_active; + return cpu_base->hres_active; } /* @@ -426,11 +426,12 @@ static int hrtimer_reprogram(struct hrti */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base; + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct timespec realtime_offset; unsigned long seq; - if (!hrtimer_hres_active()) + if (!hrtimer_hres_active(base)) return; do { @@ -440,8 +441,6 @@ static void retrigger_next_event(void *a -wall_to_monotonic.tv_nsec); } while (read_seqretry(&xtime_lock, seq)); - base = &__get_cpu_var(hrtimer_bases); - /* Adjust CLOCK_REALTIME offset */ spin_lock(&base->lock); base->clock_base[CLOCK_REALTIME].offset = @@ -563,10 +562,8 @@ static inline int hrtimer_enqueue_reprog /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -577,7 +574,7 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", raw_smp_processor_id()); return 0; } base->hres_active = 1; @@ -595,9 +592,15 @@ static int hrtimer_switch_to_hres(void) #else -static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_hres_active(struct hrtimer_cpu_base 
*base) +{ + return 0; +} static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) @@ -789,7 +792,7 @@ static void __remove_hrtimer(struct hrti if (base->first == &timer->node) { base->first = rb_next(&timer->node); /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) + if (reprogram && hrtimer_hres_active(base->cpu_base)) hrtimer_force_reprogram(base->cpu_base); } rb_erase(&timer->node, &base->active); @@ -961,7 +964,7 @@ ktime_t hrtimer_get_next_event(void) spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { + if (!hrtimer_hres_active(cpu_base)) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -1258,10 +1261,11 @@ static inline void run_hrtimer_queue(str */ void hrtimer_run_queues(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base; int i; - if (hrtimer_hres_active()) + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); + if (hrtimer_hres_active(cpu_base)) return; /* @@ -1273,7 +1277,7 @@ void hrtimer_run_queues(void) * deadlock vs. xtime_lock. 
*/ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - if (hrtimer_switch_to_hres()) + if (hrtimer_switch_to_hres(cpu_base)) return; hrtimer_get_softirq_time(cpu_base); Index: linux-rt.q/kernel/sched.c =================================================================== --- linux-rt.q.orig/kernel/sched.c +++ linux-rt.q/kernel/sched.c @@ -3409,7 +3409,7 @@ out: */ static void run_rebalance_domains(struct softirq_action *h) { - int local_cpu = smp_processor_id(); + int local_cpu = raw_smp_processor_id(); struct rq *local_rq = cpu_rq(local_cpu); enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -412,12 +412,12 @@ void do_softirq_from_hardirq(void) { unsigned long p_flags; - if (!local_softirq_pending()) - return; /* * 'immediate' softirq execution, from hardirq context: */ local_irq_disable(); + if (!local_softirq_pending()) + goto out; __local_bh_disable((unsigned long)__builtin_return_address(0)); #ifndef CONFIG_PREEMPT_SOFTIRQS trace_softirq_enter(); @@ -437,6 +437,7 @@ void do_softirq_from_hardirq(void) current->flags &= ~PF_SOFTIRQ; _local_bh_enable(); +out: local_irq_enable(); } Index: linux-rt.q/kernel/timer.c =================================================================== --- linux-rt.q.orig/kernel/timer.c +++ linux-rt.q/kernel/timer.c @@ -999,7 +999,7 @@ static inline void update_times(void) */ static void run_timer_softirq(struct softirq_action *h) { - tvec_base_t *base = __get_cpu_var(tvec_bases); + tvec_base_t *base = per_cpu(tvec_bases, raw_smp_processor_id()); update_times(); hrtimer_run_queues(); patches/latency-tracer-disable-across-trace-cmdline.patch0000664000077200007720000000424510646635212023067 0ustar mingomingoFrom jan.altenberg@linutronix.de Tue Jun 19 16:07:25 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb 
(2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from [192.168.0.182] (unknown [91.89.185.36]) (using SSLv3 with cipher RC4-MD5 (128/128 bits)) (No client certificate requested) by mail.tglx.de (Postfix) with ESMTP id C80AC65C065; Tue, 19 Jun 2007 16:07:25 +0200 (CEST) Subject: freeze with mcount_enabled=1 From: Jan Altenberg To: Ingo Molnar Cc: tglx@linutronix.de Content-Type: text/plain Date: Tue, 19 Jun 2007 16:07:25 +0200 Message-Id: <1182262045.3793.111.camel@bender> Mime-Version: 1.0 X-Mailer: Evolution 2.8.3 (2.8.3-2.fc6) X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Hi Ingo, Hi Thomas, I've seen reproducible freezes on ARM for user triggered latency traces with mcount_enabled = 1: echo 1 > /proc/sys/kernel/mcount_enabled cyclictest -p80 -n -b 300 ends up in a frozen system. I added some instrumentation to the latency tracer code and it looks like the freeze happens when _trace_cmdline() is called from user_trace_start(). _trace_cmdline() calls ____trace() and after that we freeze... With the following Patch, the freezes seem to disappear (tested on a SAM926x and an EP93). I think this isn't the correct solution, but it might give you an idea of what goes wrong. Let me know if you have something new for testing. 
Regards, Jan --- --- kernel/latency_trace.c | 2 ++ 1 file changed, 2 insertions(+) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -2367,10 +2367,12 @@ long user_trace_start(void) #endif reset_trace_idx(cpu, tr); + atomic_inc(&tr->disabled); tr->critical_sequence = max_sequence; tr->preempt_timestamp = get_monotonic_cycles(); tr->critical_start = CALLER_ADDR0; _trace_cmdline(cpu, tr); + atomic_dec(&tr->disabled); mcount(); WARN_ON(!irqs_disabled()); patches/tasklet-redesign.patch0000664000077200007720000002105610646635214016022 0ustar mingomingoFrom: Ingo Molnar tasklet redesign: make it saner and make it easier to thread. Signed-off-by: Ingo Molnar ---- include/linux/interrupt.h | 39 ++++++----- kernel/softirq.c | 155 +++++++++++++++++++++++++++++++--------------- 2 files changed, 128 insertions(+), 66 deletions(-) Index: linux-rt.q/include/linux/interrupt.h =================================================================== --- linux-rt.q.orig/include/linux/interrupt.h +++ linux-rt.q/include/linux/interrupt.h @@ -322,8 +322,9 @@ extern void wait_for_softirq(int softirq to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its excecution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. 
@@ -348,15 +349,25 @@ struct tasklet_struct name = { NULL, 0, enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_clear_bit(); @@ -368,9 +379,10 @@ static inline void tasklet_unlock_wait(s while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } } #else -#define tasklet_trylock(t) 1 -#define tasklet_unlock_wait(t) do { } while (0) -#define tasklet_unlock(t) do { } while (0) +# define tasklet_trylock(t) 1 +# define tasklet_tryunlock(t) 1 +# define tasklet_unlock_wait(t) do { } while (0) +# define tasklet_unlock(t) do { } while (0) #endif extern void FASTCALL(__tasklet_schedule(struct tasklet_struct *t)); @@ -403,17 +415,8 @@ static inline void tasklet_disable(struc smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} +extern fastcall void tasklet_enable(struct tasklet_struct *t); +extern fastcall void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); Index: 
linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -454,14 +454,24 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { + WARN_ON(t->next != NULL); + t->next = head->list; + head->list = t; + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } +} + void fastcall __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -472,81 +482,130 @@ void fastcall __tasklet_hi_schedule(stru unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__tasklet_hi_schedule); -static void tasklet_action(struct softirq_action *a) +void fastcall tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} - local_irq_disable(); - list = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = NULL; - local_irq_enable(); +EXPORT_SYMBOL(tasklet_enable); + +void fastcall tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} + 
+EXPORT_SYMBOL(tasklet_hi_enable); + +static void +__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; + /* + * Should always succeed - after a tasklist got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; + } + + t->next = NULL; + + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. + */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * dont do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. + * We only allow the STATE_RUN -> 0 transition here. 
+ */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); tasklet_unlock(t); - continue; + break; } - tasklet_unlock(t); } - - local_irq_disable(); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; - __do_raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); } } -static void tasklet_hi_action(struct softirq_action *a) +static void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = NULL; + list = __get_cpu_var(tasklet_vec).list; + __get_cpu_var(tasklet_vec).list = NULL; local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; + __tasklet_action(a, list); +} - list = list->next; +static void tasklet_hi_action(struct softirq_action *a) +{ + struct tasklet_struct *list; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } + local_irq_disable(); + list = __get_cpu_var(tasklet_hi_vec).list; + __get_cpu_var(tasklet_hi_vec).list = NULL; + local_irq_enable(); - local_irq_disable(); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; - __do_raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } - void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { patches/tasklet-more-fixes.patch0000664000077200007720000001572510646635214016306 0ustar mingomingoFrom linux-kernel-owner@vger.kernel.org Thu Jun 14 23:21:31 2007 
Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=none autolearn=unavailable version=3.1.7-deb Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by mail.tglx.de (Postfix) with ESMTP id F2D8065C3D9 for ; Thu, 14 Jun 2007 23:21:31 +0200 (CEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756447AbXFNVVF (ORCPT ); Thu, 14 Jun 2007 17:21:05 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753441AbXFNVUw (ORCPT ); Thu, 14 Jun 2007 17:20:52 -0400 Received: from e33.co.us.ibm.com ([32.97.110.151]:53331 "EHLO e33.co.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752693AbXFNVUv (ORCPT ); Thu, 14 Jun 2007 17:20:51 -0400 Received: from d03relay02.boulder.ibm.com (d03relay02.boulder.ibm.com [9.17.195.227]) by e33.co.us.ibm.com (8.13.8/8.13.8) with ESMTP id l5ELKnM3030113 for ; Thu, 14 Jun 2007 17:20:49 -0400 Received: from d03av01.boulder.ibm.com (d03av01.boulder.ibm.com [9.17.195.167]) by d03relay02.boulder.ibm.com (8.13.8/8.13.8/NCO v8.3) with ESMTP id l5ELKniv268710 for ; Thu, 14 Jun 2007 15:20:49 -0600 Received: from d03av01.boulder.ibm.com (loopback [127.0.0.1]) by d03av01.boulder.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id l5ELKm9A010919 for ; Thu, 14 Jun 2007 15:20:49 -0600 Received: from [9.67.41.186] (wecm-9-67-41-186.wecm.ibm.com [9.67.41.186]) by d03av01.boulder.ibm.com (8.12.11.20060308/8.12.11) with ESMTP id l5ELKl3X010835; Thu, 14 Jun 2007 15:20:47 -0600 Subject: Re: [PATCH -rt] Fix TASKLET_STATE_SCHED WARN_ON() From: john stultz To: Ingo Molnar Cc: Thomas Gleixner , Steven Rostedt , "Paul E. 
McKenney" , lkml In-Reply-To: <1181096244.6018.20.camel@localhost> References: <1181096244.6018.20.camel@localhost> Content-Type: text/plain Date: Thu, 14 Jun 2007 14:20:20 -0700 Message-Id: <1181856020.6276.14.camel@localhost.localdomain> Mime-Version: 1.0 X-Mailer: Evolution 2.10.1 Sender: linux-kernel-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org X-Filter-To: .Kernel.LKML X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit On Tue, 2007-06-05 at 19:17 -0700, john stultz wrote: > Hey Ingo, > So we've been seeing the following trace fairly frequently on our SMP > boxes when running kernbench: > > BUG: at kernel/softirq.c:639 __tasklet_action() > > Call Trace: > [] dump_trace+0xaa/0x32a > [] show_trace+0x41/0x5c > [] dump_stack+0x15/0x17 > [] __tasklet_action+0xdf/0x12e > [] tasklet_action+0x27/0x29 > [] ksoftirqd+0x16c/0x271 > [] kthread+0xf5/0x128 > [] child_rip+0xa/0x12 > > > Paul also pointed this out awhile back: http://lkml.org/lkml/2007/2/25/1 > > > Anyway, I think I finally found the issue. Its a bit hard to explain, > but the idea is while __tasklet_action is running the tasklet function > on CPU1, if a call to tasklet_schedule() on CPU2 is made, and if right > after we mark the TASKLET_STATE_SCHED bit we are preempted, > __tasklet_action on CPU1 might be able to re-run the function, clear the > bit and unlock the tasklet before CPU2 enters __tasklet_common_schedule. > Once __tasklet_common_schedule locks the tasklet, we will add the > tasklet to the list with the TASKLET_STATE_SCHED *unset*. > > I've verified this race occurs w/ a WARN_ON in > __tasklet_common_schedule(). > > > This fix avoids this race by making sure *after* we've locked the > tasklet that the STATE_SCHED bit is set before adding it to the list. > > Does it look ok to you? 
> > thanks > -john > > Signed-off-by: John Stultz > > Index: 2.6-rt/kernel/softirq.c > =================================================================== > --- 2.6-rt.orig/kernel/softirq.c 2007-06-05 18:30:54.000000000 -0700 > +++ 2.6-rt/kernel/softirq.c 2007-06-05 18:36:44.000000000 -0700 > @@ -544,10 +544,17 @@ static void inline > __tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) > { > if (tasklet_trylock(t)) { > - WARN_ON(t->next != NULL); > - t->next = head->list; > - head->list = t; > - raise_softirq_irqoff(nr); > + /* We may have been preempted before tasklet_trylock > + * and __tasklet_action may have already run. > + * So double check the sched bit while the takslet > + * is locked before adding it to the list. > + */ > + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { > + WARN_ON(t->next != NULL); > + t->next = head->list; > + head->list = t; > + raise_softirq_irqoff(nr); > + } > tasklet_unlock(t); > } > } So while digging on a strange OOM issue we were seeing (which actually ended up being fixed by Steven's softirq patch), I noticed that the fix above is incomplete. With only the patch above, we may no longer have unscheduled tasklets added to the list, but we may end up with scheduled tasklets that are not on the list (and will stay that way!). The following additional patch should correct this issue. Although since we weren't actually hitting it, the issue is a bit theoretical, so I've not been able to prove it really fixes anything. 
thanks -john --- kernel/softirq.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -459,6 +459,7 @@ static void inline __tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) { if (tasklet_trylock(t)) { +again: /* We may have been preempted before tasklet_trylock * and __tasklet_action may have already run. * So double check the sched bit while the takslet @@ -469,8 +470,21 @@ __tasklet_common_schedule(struct tasklet t->next = head->list; head->list = t; raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle. If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear. 
Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; } - tasklet_unlock(t); } } patches/series0000644000077200007720000003421410646635217012752 0ustar mingomingo # # base tree: 2.6.22.1 # # # Linus latest # # # CFS queue: # sched-cfs-v2.6.22.1-v19.patch sched-cfs-latest.patch # # Futex-fixes in -mm # futex-tidy-up-the-code-v2.patch # # hrt: 2.6.22-hrt6 queue # # Basic cpuidle patches cpuidle-complete.patch # # Generic hrtimer/time/clockevent/source patches # i386-hpet-check-if-the-counter-works.patch clockevents-remove-unused-code.patch nohz-fix-nohz-x86-dyntick-idle-handling.patch acpi-move-timer-broadcast-and-pmtimer-access-before-c3-arbiter-shutdown.patch clockevents-fix-typo-in-acpi_pmc.patch timekeeping-fixup-shadow-variable-argument.patch timerc-cleanup-recently-introduced-whitespace-damage.patch clockevents-remove-prototypes-of-removed-functions.patch clockevents-fix-resume-logic.patch clockevents-fix-device-replacement.patch tick-management-spread-timer-interrupt.patch highres-improve-debug-output.patch highres-improve-debug-output-fix.patch hrtimer-speedup-hrtimer_enqueue.patch pcspkr-use-the-global-pit-lock.patch ntp-move-the-cmos-update-code-into-ntpc.patch ntp-move-the-cmos-update-code-into-ntpc-fix.patch ntp-move-the-cmos-update-code-into-ntpc-fix-fix.patch i386-pit-stop-only-when-in-periodic-or-oneshot-mode.patch i386-remove-volatile-in-apicc.patch i386-hpet-assumes-boot-cpu-is-0.patch i386-move-pit-function-declarations-and-constants-to-correct-header-file.patch # # x86-64 # clockevents-remove-unused-inline-function.patch clockevents-allow-build-without-runtime-use.patch i386-remove-pit-interrupt-hook.patch x86_64-hpet-tsc-calibration-fix-broken-smi-detection-logic.patch x86_64-untangle-asm-hpeth-from-asm-timexh.patch x86_64-use-generic-cmos-update.patch x86_64-use-generic-xtime-init.patch x86_64-remove-dead-code-and-other-janitor-work-in-tscc.patch 
x86_64-fix-apic-typo.patch x86_64-fix-irq-regs-leftovers.patch x86_64-share-hpet-h.patch x86_64-i8259-remove-useless-forward-declaration.patch x86_64-apic-whitespace-comment-and-remove-unused-code.patch x86_64-timec-fix-whitespace-wreckage.patch x86_64-consolidate-tsc-calibration.patch i386-prepare-sharing-hpet-code.patch i386-hpet-add-x8664-hpet-bits.patch i386-prepare-sharing-pit-code.patch x86_64-use-i386-i8253-h.patch x86_64-preparatory-apic-set-lvtt.patch x86_64-apic-remove-bogus-pit-synchronization.patch x86_64-apic-shuffle-calibration-around.patch x86_64-apic-calibration-remove-divisor.patch x86_64-apic-change-setup-calling-convention.patch x86_64-apic-remove-nested-irq-disable.patch x86_64-prep-idle-loop-for-dynticks.patch x86_64-apic-add-clockevents-functions.patch x86_64-convert-to-clockevents.patch x86_64-remove-unused-code.patch x86_64-cleanup-apic-c.patch jiffies-remove-unused-macros.patch acpi-remove-the-useless-ifdef-code.patch i386-pit-remove-the-useless-ifdefs.patch i386-hpet-sharing-optimize.patch # # Venki's HPET series # ich-force-hpet-make-generic-time-capable-of-switching-broadcast-timer.patch ich-force-hpet-restructure-hpet-generic-clock-code.patch ich-force-hpet-ich7-or-later-quirk-to-force-detect-enable.patch ich-force-hpet-ich7-or-later-quirk-to-force-detect-enable-fix.patch ich-force-hpet-late-initialization-of-hpet-after-quirk.patch ich-force-hpet-ich5-quirk-to-force-detect-enable.patch ich-force-hpet-ich5-quirk-to-force-detect-enable-fix.patch ich-force-hpet-ich5-fix-a-bug-with-suspend-resume.patch ich-force-hpet-add-ich7_0-pciid-to-quirk-list.patch hpet-force-enable-on-ich34.patch hpet-force-enable-on-vt8235-37-chipsets.patch # # end of the -hrt queue # # # ARM clock events & co # ep93xx-timer-accuracy.patch ep93xx-clockevents.patch ep93xx-clockevents-fix.patch arm-imx.patch # CHECKME arm-leds-timer.patch # Upstream submitted changes cdrom-use-mdelay-instead-of-jiffies-loop.patch spinlock-init-cleanup.patch # # Check what's in mainline 
/ mm or might be # upstream material. # spinlock-trylock-cleanup-sungem.patch x86_64-tsc-sync-irqflags-fix.patch neptune-no-at-keyboard.patch rtmutex-debug.h-cleanup.patch netpoll-8139too-fix.patch kprobes-preempt-fix.patch replace-bugon-by-warn-on.patch # Suspend / resume fixups i386-mark-atomic-irq-ops-raw.patch msi-suspend-resume-workaround.patch floppy-resume-fix.patch # # assorted fixlets from -mm: # # Check if they are really in -mm or should be submitted # hrtimers-overrun-api.patch slob-scale-no-bigblock-list.patch slob-scale-break-out-caches.patch mm-fix-latency.patch ioapic-fix-too-fast-clocks.patch fix-acpi-build-weirdness.patch write-try-lock-irqsave.patch use-write_trylock_irqsave-in-ptrace_attach.patch move-native-irq.patch dont-unmask-io_apic.patch # # misc build beautification patches: # x86-64-smpboot-whitespace.patch gcc-warnings-shut-up.patch # # Various fixlets # # # Debugging patches # apic-dumpstack.patch netfilter-more-debugging.patch # # Latency tracer # nmi-profiling-base.patch add-notrace.patch redo-regparm-option.patch latency-tracing.patch latency-tracing-remove-trace-array.patch latency-tracer-disable-across-trace-cmdline.patch ns2cyc-result-fix.patch latency-tracing-i386-paravirt-fastcall.patch latency-tracing-i386.patch latency-tracing-x86_64.patch latency-tracing-ppc.patch ppc-remove-last-cpukhz.patch ppc-rename-xmon-mcount.patch ppc-add-mcount.patch ppc-mcount-dummy-functions.patch ppc-mark-notrace-mainline.patch ppc-add-ppc32-mcount.patch latency-tracer-printk-fix.patch latency-tracing-arm.patch latency-tracing-exclude-printk.patch latency-tracing-prctl-api-hack.patch latency-tracing-raw-spinlock-hack.patch latency-tracer-one-off-fix.patch smaller-trace.patch trace-name-plus.patch trace-with-caller-addr.patch trace-sti-mwait.patch latency-tracer-optimize-a-bit.patch idle-stop-critical-timing.patch arm-latency-tracer-support.patch latency-tracer-variable-threshold.patch # Needs to be rewritten to trigger on the procfs variable ! 
reset-latency-histogram.patch # # x86-64 unwinder # # not applied - it's a large chunk of code and Linus is very sceptical about it # #redo-unwinder.patch #unwinder-fix.patch # # lockdep queue: # latency-trace-fix.patch trace-cpuidle.patch lockdep-show-held-locks.patch lockdep-lock_set_subclass.patch lockdep-prettify.patch lockdep-more-entries.patch # # Revert loopback bh assumption patch # loopback-revert.patch # # hrtimer # hrtimer-trace.patch hrtimer-no-getnstimeofday.patch time-warp-detect.patch # # PPC gtod and highres support # ppc-gtod-support.patch ppc-gtod-support-fix.patch ppc-a-2.patch ppc-fix-clocksource-timebase-shift.patch ppc-remove-broken-vsyscall.patch ppc-read-persistent-clock.patch ppc-gtod-notrace-fix.patch ppc-clockevents.patch ppc-clockevents-fix.patch ppc-highres-dyntick.patch # # -rt queue: # inet_hash_bits.patch # tracing inet-hash-bits-ipv6-fix.patch undo-latency-tracing-raw-spinlock-hack.patch random-driver-latency-fix.patch latency-measurement-drivers.patch latency-measurement-drivers-fix.patch # # RCU preempt patches from Paul: # rcu-1.patch rcu-2.patch rcu-3.patch rcu-4.patch rcu-preempt-fix-nmi-watchdog.patch rcu-preempt-fix-rcu-torture.patch rcu-hrt-fixups.patch dynticks-rcu-rt-fixlet.patch rcu-tasklet-softirq.patch rcu-classic-fixup.patch # # ARM preparatory patches # arm-cmpxchg.patch arm-fix-atomic-cmpxchg.patch arm-cmpxchg-support-armv6.patch arm-preempt-config.patch # # IRQ threading # preempt-irqs-core.patch preempt-irqs-timer.patch preempt-irqs-hrtimer.patch preempt-irqs-i386.patch preempt-irqs-mips.patch preempt-irqs-x86-64.patch preempt-irqs-x86-64-ioapic-mask-quirk.patch preempt-irqs-i386-ioapic-mask-quirk.patch preempt-irqs-arm.patch preempt-irqs-arm-fix-oprofile.patch preempt-irqs-ppc.patch preempt-irqs-ppc-ack-irq-fixups.patch preempt-irqs-ppc-fix-b5.patch preempt-irqs-ppc-fix-b6.patch preempt-irqs-ppc-celleb-beatic-eoi.patch preempt-irqs-ppc-fix-more-fasteoi.patch preempt-irqs-ppc-preempt-schedule-irq-entry-fix.patch 
preempt-irqs-Kconfig.patch # # Real real time stuff :) # rt-apis.patch rt-slab-new.patch rt-page_alloc.patch # # rt-mutexes # rt-mutex-core.patch rt-mutex-trylock-export.patch rt-mutex-spinlock-might-sleep.patch rt-mutex-spinlock-nested-export-fix.patch rt-mutex-i386.patch rt-mutex-mips.patch rt-mutex-ppc.patch rt-mtex-ppc-fix-a5.patch rt-mutex-x86-64.patch rt-mutex-arm.patch rt-mutex-arm-fix.patch rt-mutex-compat-semaphores.patch # # Per-CPU locking assumption cleanups: # percpu-locked-mm.patch percpu-locked-netfilter.patch percpu-locked-netfilter2.patch percpu-locked-powerpc-fixups.patch percpu-locked-powerpc-fixups-a6.patch # # Various preempt fixups # net-core-preempt-fix.patch bh-uptodate-lock.patch bh-state-lock.patch jbd_assertions_smp_only.patch # # Tasklet redesign # tasklet-redesign.patch tasklet-busy-loop-hack.patch # # Disable irq poll on -rt # tasklet-fix-preemption-race.patch tasklet-more-fixes.patch disable-irqpoll.patch # # Inaccurate -rt stats (should be replaced by CFS) # kstat-add-rt-stats.patch # # Posix-cpu-timers in a thread # cputimer-thread-rt_A0.patch cputimer-thread-rt-fix.patch posix-cpu-timers-fix.patch # # Various broken drivers # preempt-rt-cs5530-lock-ide-fix.patch vortex-fix.patch serial-locking-rt-cleanup.patch fix-emac-locking-2.6.16.patch # # Serial optimizing # serial-slow-machines.patch # # Realtime patches # # ARM: preempt-realtime-arm.patch arm-trace-preempt-idle.patch preempt-realtime-arm-bagde4.patch preempt-realtime-arm-footbridge.patch preempt-realtime-arm-integrator.patch preempt-realtime-arm-ixp4xx.patch preempt-realtime-arm-pxa.patch preempt-realtime-arm-shark.patch # MIPS: needs splitting preempt-realtime-mips.patch mips-gtod_clocksource.patch # X86-64: needs splitting preempt-realtime-x86_64.patch # IA64: needs splitting preempt-realtime-ia64.patch # PowerPC preempt-realtime-ppc-need-resched-delayed.patch preempt-realtime-ppc-more-resched-fixups.patch preempt-realtime-powerpc.patch preempt-realtime-powerpc-update.patch 
preempt-realtime-powerpc-a7.patch preempt-realtime-powerpc-b2.patch preempt-realtime-powerpc-b3.patch preempt-realtime-powerpc-b4.patch preempt-realtime-powerpc-add-raw-relax-macros.patch preempt-realtime-powerpc-tlb-batching.patch preempt-realtime-powerpc-celleb-raw-spinlocks.patch # # SuperH: needs splitting # preempt-realtime-powerpc-missing-raw-spinlocks.patch preempt-realtime-sh.patch # # i386 # preempt-realtime-i386.patch preempt-irqs-i386-idle-poll-loop-fix.patch # # Core patch # # Note this is a convenience split up it is not supposed to compile # step by step. Needs some care, but it is way easier to handle than # the previous touch all in one patch # preempt-realtime-sched.patch preempt-realtime-prevent-idle-boosting.patch preempt-realtime-cfs-accounting-fix.patch preempt-realtime-core.patch preempt-realtime-fs-block.patch preempt-realtime-acpi.patch preempt-realtime-ipc.patch preempt-realtime-sound.patch preempt-realtime-mm.patch preempt-realtime-init-show-enabled-debugs.patch preempt-realtime-compile-fixes.patch preempt-realtime-console.patch preempt-realtime-debug-sysctl.patch preempt-realtime-ide.patch preempt-realtime-input.patch preempt-realtime-irqs.patch preempt-realtime-net-drivers.patch preempt-realtime-netconsole.patch preempt-realtime-printk.patch preempt-realtime-profiling.patch preempt-realtime-rawlocks.patch preempt-realtime-rcu.patch preempt-realtime-timer.patch preempt-realtime-usb.patch preempt-realtime-warn-and-bug-on.patch preempt-realtime-warn-and-bug-on-fix.patch # # Various -rt fixups # preempt-realtime-gtod-fixups.patch preempt-realtime-supress-cpulock-warning.patch preempt-realtime-supress-nohz-softirq-warning.patch preempt-realtime-net.patch preempt-realtime-net-softirq-fixups.patch preempt-realtime-loopback.patch preempt-realtime-drivers-pci-hotplug.patch preempt-realtime-8139too-rt-irq-flags-fix.patch # # Utility patches (not for upstream inclusion): # preempt-realtime-supress-rtc-printk.patch softlockup-print-regs.patch 
hrtimer-no-printk.patch nmi-profiling.patch panic-dont-stop-box.patch nmi-watchdog-disable.patch # # Not yet reviewed # gtod-optimize.patch realtime-lsm.patch # # Futex updates # rcu-various-fixups.patch futex-performance-hack.patch futex-performance-hack-sysctl-fix.patch # # Pete's file locking scalability changes: # lock_list.patch barrier.patch s_files.patch s_files-proc-generic-fix.patch s_files-barrier.patch s_files-per_cpu-rt.patch s_files-schedule_on_each_cpu_wq.patch s_files-per_cpu-flush-fix.patch s_files-pipe-fix.patch # # Pete's lockless pagecache port: # radix-tree-use-indirect-bit.patch radix-tree-gang_lookup_slot.patch mm-speculative-get-page.patch mm-lockless-pagecache-lookups.patch mm-lockless-preempt-fixup.patch mm-lockless-preempt-rt-fixup.patch # # kmap atomic fixes # kmap-atomic-prepare.patch pagefault-disable-cleanup.patch kmap-atomic-i386-fix.patch # # Not yet reviewed # select-error-leak-fix.patch module-pde-race-fixes.patch fix-emergency-reboot.patch timer-freq-tweaks.patch # # Highmem modifications # highmem-revert-mainline.patch highmem_rewrite.patch highmem-redo-mainline.patch rt-kmap-scale-fix.patch # # Debug patches: # pause-on-oops-head-tail.patch i386-nmi-watchdog-show-regs.patch x86-64-traps-move-held-locks-output.patch # # x86-64 vsyscall modifications # x86-64-tscless-vgettimeofday.patch rt-time-starvation-fix.patch # # RT-Java testing stuff # Add-dev-rmem-device-driver-for-real-time-JVM-testing.patch Allocate-RTSJ-memory-for-TCK-conformance-test.patch # # Softirq modifications # new-softirq-code.patch new-softirq-code-fixlets.patch softirq-per-cpu-assumptions-fixes.patch smp-processor-id-fixups.patch fix-migrating-softirq.patch vsyscall-add-notrace.patch disable-gtod-functions-if-gtod-is-not-there.patch fix-softirq-checks-for-non-rt-preempt-hardirq.patch # # Weird crap unearthed by -rt which needs to be investigated # irda-fix.patch nf_conntrack-weird-crash-fix.patch # # Needs proper fix # nf_conntrack-fix-smp-processor-id.patch 
print-might-sleep-hack.patch # # Lockstat: # lockdep-prove-locking.patch lockdep-rt-mutex.patch lockstat-core.patch lockstat-output.patch lockstat-hooks.patch lockstat-rt-hooks.patch lockstat_bounce.patch lockstat_bounce_rt.patch lockdep_fixups.patch lockstat_class_name.patch lockdep_fixup_annotate.patch # # KVM: # kvm-rt.patch # # Add RT to uname and apply the version # RT_utsname.patch # # not yet backmerged tail patches: # hrt-rt-fix-merge-artifact.patch preempt-rt-no-slub.patch rfkill-input-fix.patch fork.c-takeover-tasklets-warning-fix.patch paravirt-function-pointer-fix.patch hpet-build-fix.patch rtc.c-build-fix.patch version.patch patches/rt-kmap-scale-fix.patch0000664000077200007720000001442410646635216016004 0ustar mingomingoHi Ingo, Apply on top of what is still in -rt. This seems to survive a kbuild -j64 & -j512 (although with the latter the machine goes off for a while, but does return with a kernel). If you can spare a cycle between hacking syslets and -rt, could you have a look at the logic this patch adds? --- Solve 2 deadlocks in the current kmap code. 1) akpm spotted a race in the waitqueue usage that could deadlock the machine. the very unlikely scenario was that we would not find a usable map in LAST_PKMAP tries but right before we hit schedule the very last returns. Solve this by keeping a free count. 2) akpm told about the kmap deadlock where multiple processes each require 2 maps (src, dst). When they deplete the maps for the src maps they will be stuck waiting for their dst maps. Solve this by tracking (and limiting) kmap users and account two maps for each. This all adds more atomic globals, this will bounce like mad on real large smp. 
(perhaps add some __cacheline_aligned_on_smp) Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 1 mm/highmem.c | 96 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 10 deletions(-) Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -1477,6 +1477,7 @@ static inline void put_task_struct(struc #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_KMAP 0x00004000 /* this context has a kmap */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ Index: linux-rt.q/mm/highmem.c =================================================================== --- linux-rt.q.orig/mm/highmem.c +++ linux-rt.q/mm/highmem.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -62,10 +63,12 @@ unsigned int nr_free_highpages (void) */ static atomic_t pkmap_count[LAST_PKMAP]; static atomic_t pkmap_hand; +static atomic_t pkmap_free; +static atomic_t pkmap_users; pte_t * pkmap_page_table; -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); +static DECLARE_WAIT_QUEUE_HEAD(pkmap_wait); /* * Try to free a given kmap slot. 
@@ -80,6 +83,7 @@ static int pkmap_try_free(int pos) if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1) return -1; + atomic_dec(&pkmap_free); /* * TODO: add a young bit to make it CLOCK */ @@ -108,7 +112,8 @@ static inline void pkmap_put(atomic_t *c BUG(); case 1: - wake_up(&pkmap_map_wait); + atomic_inc(&pkmap_free); + wake_up(&pkmap_wait); } } @@ -117,11 +122,10 @@ static inline void pkmap_put(atomic_t *c static int pkmap_get_free(void) { int i, pos, flush; - DECLARE_WAITQUEUE(wait, current); restart: for (i = 0; i < LAST_PKMAP; i++) { - pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP; + pos = atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK; flush = pkmap_try_free(pos); if (flush >= 0) goto got_one; @@ -130,10 +134,8 @@ restart: /* * wait for somebody else to unmap their entries */ - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); - schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); + if (likely(!in_interrupt())) + wait_event(pkmap_wait, atomic_read(&pkmap_free) != 0); goto restart; @@ -142,7 +144,7 @@ got_one: #if 0 flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1)); #else - int pos2 = (pos + 1) % LAST_PKMAP; + int pos2 = (pos + 1) & LAST_PKMAP_MASK; int nr; int entries[TLB_BATCH]; @@ -152,7 +154,7 @@ got_one: * Scan ahead of the hand to minimise search distances. */ for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH; - i++, pos2 = (pos2 + 1) % LAST_PKMAP) { + i++, pos2 = (pos2 + 1) & LAST_PKMAP_MASK) { flush = pkmap_try_free(pos2); if (flush < 0) @@ -217,9 +219,79 @@ void kmap_flush_unused(void) WARN_ON_ONCE(1); } +/* + * Avoid starvation deadlock by limiting the number of tasks that can obtain a + * kmap to (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2. 
+ */ +static void kmap_account(void) +{ + int weight; + +#ifndef CONFIG_PREEMPT_RT + if (in_interrupt()) { + /* irqs can always get them */ + weight = -1; + } else +#endif + if (current->flags & PF_KMAP) { + current->flags &= ~PF_KMAP; + /* we already accounted the second */ + weight = 0; + } else { + /* mark 1, account 2 */ + current->flags |= PF_KMAP; + weight = 2; + } + + if (weight > 0) { + /* + * reserve KM_TYPE_NR maps per CPU for interrupt context + */ + const int target = LAST_PKMAP +#ifndef CONFIG_PREEMPT_RT + - KM_TYPE_NR*NR_CPUS +#endif + ; + +again: + wait_event(pkmap_wait, + atomic_read(&pkmap_users) + weight <= target); + + if (atomic_add_return(weight, &pkmap_users) > target) { + atomic_sub(weight, &pkmap_users); + goto again; + } + } +} + +static void kunmap_account(void) +{ + int weight; + +#ifndef CONFIG_PREEMPT_RT + if (in_irq()) { + weight = -1; + } else +#endif + if (current->flags & PF_KMAP) { + /* there was only 1 kmap, un-account both */ + current->flags &= ~PF_KMAP; + weight = 2; + } else { + /* there were two kmaps, un-account per kunmap */ + weight = 1; + } + + if (weight > 0) + atomic_sub(weight, &pkmap_users); + wake_up(&pkmap_wait); +} + fastcall void *kmap_high(struct page *page) { unsigned long vaddr; + + kmap_account(); again: vaddr = (unsigned long)page_address(page); if (vaddr) { @@ -260,6 +332,7 @@ fastcall void kunmap_high(struct page *p unsigned long vaddr = (unsigned long)page_address(page); BUG_ON(!vaddr); pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]); + kunmap_account(); } EXPORT_SYMBOL(kunmap_high); @@ -404,6 +477,9 @@ void __init page_address_init(void) for (i = 0; i < ARRAY_SIZE(pkmap_count); i++) atomic_set(&pkmap_count[i], 1); + atomic_set(&pkmap_hand, 0); + atomic_set(&pkmap_free, LAST_PKMAP); + atomic_set(&pkmap_users, 0); #endif #ifdef HASHED_PAGE_VIRTUAL patches/highres-improve-debug-output.patch0000664000077200007720000000515710646635210020307 0ustar mingomingoFrom: Ingo Molnar Add some more debug information to the 
hrtimer and clock events code. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton --- arch/i386/kernel/apic.c | 3 +++ kernel/hrtimer.c | 5 ++++- kernel/time/tick-oneshot.c | 15 ++++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/i386/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/apic.c +++ linux-rt.q/arch/i386/kernel/apic.c @@ -524,6 +524,9 @@ void __init setup_boot_APIC_clock(void) */ if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); } /* Setup the lapic or request the broadcast */ Index: linux-rt.q/kernel/hrtimer.c =================================================================== --- linux-rt.q.orig/kernel/hrtimer.c +++ linux-rt.q/kernel/hrtimer.c @@ -558,7 +558,8 @@ static inline int hrtimer_enqueue_reprog */ static int hrtimer_switch_to_hres(void) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + int cpu = smp_processor_id(); + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -568,6 +569,8 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " + "mode on CPU %d\n", cpu); return 0; } base->hres_active = 1; Index: linux-rt.q/kernel/time/tick-oneshot.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-oneshot.c +++ linux-rt.q/kernel/time/tick-oneshot.c @@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handle struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || - !tick_device_is_functional(dev)) + !tick_device_is_functional(dev)) { + + printk(KERN_INFO "Clockevents: " + "could not switch to 
one-shot mode:"); + if (!dev) { + printk(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + printk(" %s is not functional.\n", dev->name); + else if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + printk(" %s does not support one-shot mode.\n", + dev->name); + } return -EINVAL; + } td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; patches/i386-hpet-add-x8664-hpet-bits.patch0000664000077200007720000000741410646635211017423 0ustar mingomingoSubject: i386: prepare sharing the hpet code with x86_64 Add the x8664 specific bits (mapping) to share the hpet code later. Move the reserve_platform_timer call to late init. This is necessary for x86_64, as hpet enable() is called before memory is setup. i386 calls it in late_time_init, but it does not hurt to do it later for both. Pull in the x8664 hpet disable command line option as well. Signed-off-by: Thomas Gleixner --- arch/i386/kernel/hpet.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -7,6 +8,7 @@ #include #include +#include #include #include #include @@ -23,6 +25,10 @@ unsigned long hpet_address; static void __iomem *hpet_virt_address; +/* Temporary hack. 
Cleanup after x86_64 clock events conversion */ +#undef hpet_readl +#undef hpet_writel + static inline unsigned long hpet_readl(unsigned long a) { return readl(hpet_virt_address + a); @@ -33,6 +39,24 @@ static inline void hpet_writel(unsigned writel(d, hpet_virt_address + a); } +#ifdef CONFIG_X86_64 + +#include + +static inline void hpet_set_mapping(void) +{ + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); +} + +static inline void hpet_clear_mapping(void) +{ + hpet_virt_address = NULL; +} + +#else + static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); @@ -43,6 +67,7 @@ static inline void hpet_clear_mapping(vo iounmap(hpet_virt_address); hpet_virt_address = NULL; } +#endif /* * HPET command line enable / disable @@ -59,6 +84,13 @@ static int __init hpet_setup(char* str) } __setup("hpet=", hpet_setup); +static int __init disable_hpet(char *str) +{ + boot_hpet_disable = 1; + return 1; +} +__setup("nohpet", disable_hpet); + static inline int is_hpet_capable(void) { return (!boot_hpet_disable && hpet_address); @@ -225,6 +257,13 @@ static cycle_t read_hpet(void) return (cycle_t)hpet_readl(HPET_COUNTER); } +#ifdef CONFIG_X86_64 +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} +#endif + static struct clocksource clocksource_hpet = { .name = "hpet", .rating = 250, @@ -233,6 +272,9 @@ static struct clocksource clocksource_hp .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_start_counter, +#ifdef CONFIG_X86_64 + .vread = vread_hpet, +#endif }; /* @@ -331,7 +373,6 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_enable_int(); - hpet_reserve_platform_timers(id); /* * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. 
@@ -349,6 +390,22 @@ out_nohpet: return 0; } +/* + * Needs to be late, as the reserve_timer code calls kalloc ! + * + * Not a problem on i386 as hpet_enable is called from late_time_init, + * but on x86_64 it is necessary ! + */ +static __init int hpet_late_init(void) +{ + if (!is_hpet_capable()) + return -ENODEV; + + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + return 0; +} +fs_initcall(hpet_late_init); + #ifdef CONFIG_HPET_EMULATE_RTC /* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET patches/trace-sti-mwait.patch0000664000077200007720000000723410646635212015567 0ustar mingomingo--- arch/i386/kernel/process.c | 2 ++ arch/x86_64/kernel/process.c | 7 +++++-- include/linux/irqflags.h | 9 +++++++++ kernel/latency_trace.c | 17 ++++++++++++++++- 4 files changed, 32 insertions(+), 3 deletions(-) Index: linux-rt.q/arch/i386/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/process.c +++ linux-rt.q/arch/i386/kernel/process.c @@ -200,10 +200,12 @@ void cpu_idle(void) __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } + trace_preempt_exit_idle(); tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); + trace_preempt_enter_idle(); } } Index: linux-rt.q/arch/x86_64/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/process.c +++ linux-rt.q/arch/x86_64/kernel/process.c @@ -230,10 +230,12 @@ void cpu_idle (void) __exit_idle(); } + trace_preempt_exit_idle(); tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); + trace_preempt_enter_idle(); } } @@ -263,9 +265,10 @@ static void mwait_idle(void) if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched()) { + trace_hardirqs_on(); __sti_mwait(0, 0); - else + } else local_irq_enable(); } else { 
local_irq_enable(); Index: linux-rt.q/include/linux/irqflags.h =================================================================== --- linux-rt.q.orig/include/linux/irqflags.h +++ linux-rt.q/include/linux/irqflags.h @@ -16,6 +16,13 @@ extern void trace_hardirqs_off(void); extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); +# ifdef CONFIG_CRITICAL_PREEMPT_TIMING + extern void trace_preempt_enter_idle(void); + extern void trace_preempt_exit_idle(void); +# else +# define trace_preempt_enter_idle() do { } while (0) +# define trace_preempt_exit_idle() do { } while (0) +# endif # define trace_hardirq_context(p) ((p)->hardirq_context) # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) @@ -26,6 +33,8 @@ # define trace_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else +# define trace_preempt_enter_idle() do { } while (0) +# define trace_preempt_exit_idle() do { } while (0) # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) # define trace_softirqs_on(ip) do { } while (0) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -90,7 +90,8 @@ static inline int DEBUG_WARN_ON(int cond #ifdef CONFIG_CRITICAL_IRQSOFF_TIMING # ifdef CONFIG_CRITICAL_PREEMPT_TIMING -# define irqs_off_preempt_count() preempt_count() + static DEFINE_PER_CPU(int, trace_cpu_idle); +# define irqs_off_preempt_count() (!__get_cpu_var(trace_cpu_idle) && preempt_count()) # else # define irqs_off_preempt_count() 0 # endif @@ -2153,6 +2154,20 @@ void notrace unmask_preempt_count(unsign } EXPORT_SYMBOL(unmask_preempt_count); +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + +/* Some archs do their cpu_idle with preemption on. 
Don't measure it */ +void notrace trace_preempt_enter_idle(void) +{ + __get_cpu_var(trace_cpu_idle) = 1; +} + +void notrace trace_preempt_exit_idle(void) +{ + __get_cpu_var(trace_cpu_idle) = 0; +} + +#endif /* CONFIG_CRITICAL_PREEMPT_TIMING */ #endif patches/ppc-clockevents.patch0000664000077200007720000002077210646635213015660 0ustar mingomingoFrom sshtylyov@ru.mvista.com Thu May 17 19:40:58 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from imap.sh.mvista.com (unknown [63.81.120.155]) by mail.tglx.de (Postfix) with ESMTP id 88FCD65C3EA for ; Thu, 17 May 2007 19:40:58 +0200 (CEST) Received: from wasted.dev.rtsoft.ru (unknown [10.150.0.9]) by imap.sh.mvista.com (Postfix) with ESMTP id 7571C3EC9; Thu, 17 May 2007 10:40:54 -0700 (PDT) From: Sergei Shtylyov Organization: MontaVista Software Inc. To: tglx@linutronix.de, mingo@elte.hu Subject: [PATCH 2.6.21-rt2] PowerPC: decrementer clockevent driver Date: Thu, 17 May 2007 21:42:26 +0400 User-Agent: KMail/1.5 Cc: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org MIME-Version: 1.0 Content-Disposition: inline Content-Type: text/plain; charset="iso-8859-1" Message-Id: <200705172142.26739.sshtylyov@ru.mvista.com> X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Add PowerPC decrementer clock event driver. Every effort has been made to support the different implementations of the decrementer: the classic one (with 970 series variation), 40x and Book E specific ones. I had to make CONFIG_GENERIC_CLOCKEVENTS option selectable for the compatibility reasons -- this option is not compatible with the PPC64 deterministic time accounting. Thanks to Daniel Walker and Thomas Gleixner for the suggestions they made... Signed-off-by: Sergei Shtylyov --- This patch has been reworked against the 2.6.21 clockevents framework. 
It has only been tested on the Book E 32-bit CPU this time, so re-testing on "classic" PowerPC CPUs is needed (used to work as of 2.6.18-rt7)... CONFIG_PPC_MULTIPLATFORM was the best option I was able to come up with to cover machines built on 970 series CPUs... arch/powerpc/Kconfig | 12 +++- arch/powerpc/kernel/time.c | 124 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 134 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/powerpc/Kconfig =================================================================== --- linux-rt.q.orig/arch/powerpc/Kconfig +++ linux-rt.q/arch/powerpc/Kconfig @@ -351,7 +351,7 @@ config PPC_MM_SLICES config VIRT_CPU_ACCOUNTING bool "Deterministic task and CPU time accounting" - depends on PPC64 + depends on PPC64 && !GENERIC_CLOCKEVENTS default y help Select this option to enable more accurate task and CPU time @@ -410,6 +410,16 @@ config HIGHMEM depends on PPC32 source kernel/Kconfig.hz + +config GENERIC_CLOCKEVENTS + bool "Clock event devices support" + default n + help + Enable support for the clock event devices necessary for the + high-resolution timers and the tickless system support. + NOTE: This is not compatible with the deterministic time accounting + option on PPC64. 
+ source kernel/Kconfig.preempt source "fs/Kconfig.binfmt" Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -125,6 +126,83 @@ unsigned long ppc_tb_freq; static u64 tb_last_jiffy __cacheline_aligned_in_smp; static DEFINE_PER_CPU(u64, last_jiffy); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + +#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +#define DECREMENTER_MAX 0xffffffff +#else +#define DECREMENTER_MAX 0x7fffffff /* setting MSB triggers an interrupt */ +#endif + +static int decrementer_set_next_event(unsigned long evt, + struct clock_event_device *dev) +{ +#if defined(CONFIG_40x) + mtspr(SPRN_PIT, evt); /* 40x has a hidden PIT auto-reload register */ +#elif defined(CONFIG_BOOKE) + mtspr(SPRN_DECAR, evt); /* Book E has separate auto-reload register */ + set_dec(evt); +#else + set_dec(evt - 1); /* Classic decrementer interrupts at -1 */ +#endif + return 0; +} + +static void decrementer_set_mode(enum clock_event_mode mode, + struct clock_event_device *dev) +{ +#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) + u32 tcr = mfspr(SPRN_TCR); + + tcr |= TCR_DIE; + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + tcr |= TCR_ARE; + break; + case CLOCK_EVT_MODE_ONESHOT: + tcr &= ~TCR_ARE; + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + tcr &= ~TCR_DIE; + break; + } + mtspr(SPRN_TCR, tcr); +#endif + if (mode == CLOCK_EVT_MODE_PERIODIC) + decrementer_set_next_event(tb_ticks_per_jiffy, dev); +} + +static struct clock_event_device decrementer_clockevent = { + .name = "decrementer", +#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, +#else + .features = CLOCK_EVT_FEAT_ONESHOT, +#endif + .shift = 32, + .rating = 200, + .irq = -1, + .set_next_event = 
decrementer_set_next_event, + .set_mode = decrementer_set_mode, +}; + +static DEFINE_PER_CPU(struct clock_event_device, decrementers); + +static void register_decrementer(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *decrementer = &per_cpu(decrementers, cpu); + + memcpy(decrementer, &decrementer_clockevent, sizeof(*decrementer)); + + decrementer->cpumask = cpumask_of_cpu(cpu); + + clockevents_register_device(decrementer); +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS */ + #ifdef CONFIG_VIRT_CPU_ACCOUNTING /* * Factors for converting from cputime_t (timebase ticks) to @@ -310,6 +388,9 @@ void snapshot_timebase(void) { __get_cpu_var(last_jiffy) = get_tb(); snapshot_purr(); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + register_decrementer(); +#endif } void __delay(unsigned long loops) @@ -467,7 +548,31 @@ void timer_interrupt(struct pt_regs * re old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_GENERIC_CLOCKEVENTS +#ifdef CONFIG_PPC_MULTIPLATFORM + /* + * We must write a positive value to the decrementer to clear + * the interrupt on the IBM 970 CPU series. In periodic mode, + * this happens when the decrementer gets reloaded later, but + * in one-shot mode, we have to do it here since an event handler + * may skip loading the new value... + */ + if (per_cpu(decrementers, cpu).mode != CLOCK_EVT_MODE_PERIODIC) + set_dec(DECREMENTER_MAX); +#endif + /* + * We can't disable the decrementer, so in the period between + * CPU being marked offline and calling stop-self, it's taking + * timer interrupts... 
+ */ + if (!cpu_is_offline(cpu)) { + struct clock_event_device *dev = &per_cpu(decrementers, cpu); + + dev->event_handler(dev); + } +#else profile_tick(CPU_PROFILING); +#endif calculate_steal_time(); #ifdef CONFIG_PPC_ISERIES @@ -483,6 +588,7 @@ void timer_interrupt(struct pt_regs * re if (__USE_RTC() && per_cpu(last_jiffy, cpu) >= 1000000000) per_cpu(last_jiffy, cpu) -= 1000000000; +#ifndef CONFIG_GENERIC_CLOCKEVENTS /* * We cannot disable the decrementer, so in the period * between this cpu's being marked offline in cpu_online_map @@ -492,6 +598,7 @@ void timer_interrupt(struct pt_regs * re */ if (!cpu_is_offline(cpu)) account_process_time(regs); +#endif /* * No need to check whether cpu is offline here; boot_cpuid @@ -504,15 +611,19 @@ void timer_interrupt(struct pt_regs * re tb_next_jiffy = tb_last_jiffy + tb_ticks_per_jiffy; if (per_cpu(last_jiffy, cpu) >= tb_next_jiffy) { tb_last_jiffy = tb_next_jiffy; +#ifndef CONFIG_GENERIC_CLOCKEVENTS do_timer(1); +#endif /*timer_recalc_offset(tb_last_jiffy);*/ timer_check_rtc(); } write_sequnlock(&xtime_lock); } - + +#ifndef CONFIG_GENERIC_CLOCKEVENTS next_dec = tb_ticks_per_jiffy - ticks; set_dec(next_dec); +#endif #ifdef CONFIG_PPC_ISERIES if (firmware_has_feature(FW_FEATURE_ISERIES) && hvlpevent_is_pending()) @@ -745,8 +856,19 @@ void __init time_init(void) tb_to_ns_scale = scale; tb_to_ns_shift = shift; +#ifdef CONFIG_GENERIC_CLOCKEVENTS + decrementer_clockevent.mult = div_sc(ppc_tb_freq, NSEC_PER_SEC, + decrementer_clockevent.shift); + decrementer_clockevent.max_delta_ns = + clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent); + decrementer_clockevent.min_delta_ns = + clockevent_delta2ns(0xf, &decrementer_clockevent); + + register_decrementer(); +#else /* Not exact, but the timer interrupt takes care of this */ set_dec(tb_ticks_per_jiffy); +#endif } #define FEBRUARY 2 patches/ich-force-hpet-add-ich7_0-pciid-to-quirk-list.patch0000664000077200007720000000231110646635211022745 0ustar mingomingoFrom: Venki 
Pallipadi Add another PCI ID for ICH7 force hpet. Signed-off-by: Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andi Kleen Cc: john stultz Cc: Greg KH Signed-off-by: Andrew Morton --- arch/i386/kernel/quirks.c | 2 ++ 1 file changed, 2 insertions(+) Index: linux-rt.q/arch/i386/kernel/quirks.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/quirks.c +++ linux-rt.q/arch/i386/kernel/quirks.c @@ -149,6 +149,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, patches/rt-apis.patch0000664000077200007720000000611710646635214014135 0ustar mingomingo add new, -rt specific IRQ API variants. Maps to the same as before on non-PREEMPT_RT. 
include/linux/bottom_half.h | 8 ++++++++ include/linux/interrupt.h | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) Index: linux-rt.q/include/linux/bottom_half.h =================================================================== --- linux-rt.q.orig/include/linux/bottom_half.h +++ linux-rt.q/include/linux/bottom_half.h @@ -1,9 +1,17 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H +#ifdef CONFIG_PREEMPT_RT +# define local_bh_disable() do { } while (0) +# define __local_bh_disable(ip) do { } while (0) +# define _local_bh_enable() do { } while (0) +# define local_bh_enable() do { } while (0) +# define local_bh_enable_ip(ip) do { } while (0) +#else extern void local_bh_disable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); +#endif #endif /* _LINUX_BH_H */ Index: linux-rt.q/include/linux/interrupt.h =================================================================== --- linux-rt.q.orig/include/linux/interrupt.h +++ linux-rt.q/include/linux/interrupt.h @@ -119,7 +119,7 @@ extern void devm_free_irq(struct device #ifdef CONFIG_LOCKDEP # define local_irq_enable_in_hardirq() do { } while (0) #else -# define local_irq_enable_in_hardirq() local_irq_enable() +# define local_irq_enable_in_hardirq() local_irq_enable_nort() #endif #ifdef CONFIG_GENERIC_HARDIRQS @@ -477,4 +477,37 @@ static inline void init_irq_proc(void) } #endif +#ifdef CONFIG_PREEMPT_RT +# define local_irq_disable_nort() do { } while (0) +# define local_irq_enable_nort() do { } while (0) +# define local_irq_enable_rt() local_irq_enable() +# define local_irq_save_nort(flags) do { local_save_flags(flags); } while (0) +# define local_irq_restore_nort(flags) do { (void)(flags); } while (0) +# define spin_lock_nort(lock) do { } while (0) +# define spin_unlock_nort(lock) do { } while (0) +# define spin_lock_bh_nort(lock) do { } while (0) +# define spin_unlock_bh_nort(lock) do { } while (0) +# define 
spin_lock_rt(lock) spin_lock(lock) +# define spin_unlock_rt(lock) spin_unlock(lock) +# define smp_processor_id_rt(cpu) (cpu) +# define in_atomic_rt() (!oops_in_progress && \ + (in_atomic() || irqs_disabled())) +# define read_trylock_rt(lock) ({read_lock(lock); 1; }) +#else +# define local_irq_disable_nort() local_irq_disable() +# define local_irq_enable_nort() local_irq_enable() +# define local_irq_enable_rt() do { } while (0) +# define local_irq_save_nort(flags) local_irq_save(flags) +# define local_irq_restore_nort(flags) local_irq_restore(flags) +# define spin_lock_rt(lock) do { } while (0) +# define spin_unlock_rt(lock) do { } while (0) +# define spin_lock_nort(lock) spin_lock(lock) +# define spin_unlock_nort(lock) spin_unlock(lock) +# define spin_lock_bh_nort(lock) spin_lock_bh(lock) +# define spin_unlock_bh_nort(lock) spin_unlock_bh(lock) +# define smp_processor_id_rt(cpu) smp_processor_id() +# define in_atomic_rt() 0 +# define read_trylock_rt(lock) read_trylock(lock) +#endif + #endif patches/cpuidle-complete.patch0000664000077200007720000026045210646635210016011 0ustar mingomingo--- Documentation/cpuidle/core.txt | 17 Documentation/cpuidle/driver.txt | 29 + Documentation/cpuidle/governor.txt | 28 + Documentation/cpuidle/sysfs.txt | 35 + arch/i386/Kconfig | 2 arch/i386/kernel/process.c | 3 arch/x86_64/Kconfig | 2 drivers/Makefile | 1 drivers/acpi/osl.c | 11 drivers/acpi/processor_core.c | 5 drivers/acpi/processor_idle.c | 904 ++++++++++++++----------------------- drivers/cpuidle/Kconfig | 39 + drivers/cpuidle/Makefile | 5 drivers/cpuidle/cpuidle.c | 306 ++++++++++++ drivers/cpuidle/cpuidle.h | 50 ++ drivers/cpuidle/driver.c | 276 +++++++++++ drivers/cpuidle/governor.c | 187 +++++++ drivers/cpuidle/governors/Makefile | 6 drivers/cpuidle/governors/ladder.c | 228 +++++++++ drivers/cpuidle/governors/menu.c | 181 +++++++ drivers/cpuidle/sysfs.c | 393 ++++++++++++++++ include/acpi/processor.h | 3 include/linux/acpi.h | 7 include/linux/cpuidle.h | 190 +++++++ 
include/linux/tick.h | 10 kernel/softirq.c | 5 kernel/time/tick-sched.c | 26 + 27 files changed, 2399 insertions(+), 550 deletions(-) Index: linux-rt.q/Documentation/cpuidle/core.txt =================================================================== --- /dev/null +++ linux-rt.q/Documentation/cpuidle/core.txt @@ -0,0 +1,17 @@ + + Supporting multiple CPU idle levels in kernel + + cpuidle + +General Information: + +Various CPUs today support multiple idle levels that are differentiated +by varying exit latencies and power consumption during idle. +cpuidle is a generic in-kernel infrastructure that separates +idle policy (governor) from idle mechanism (driver) and provides a +standardized infrastructure to support independent development of +governors and drivers. + +cpuidle resides under drivers/cpuidle. + + Index: linux-rt.q/Documentation/cpuidle/driver.txt =================================================================== --- /dev/null +++ linux-rt.q/Documentation/cpuidle/driver.txt @@ -0,0 +1,29 @@ + + + Supporting multiple CPU idle levels in kernel + + cpuidle drivers + + + + +cpuidle driver hooks into the cpuidle infrastructure and does the +architecture/platform dependent part of CPU idle states. Driver +provides the platform idle state detection capability and also +has mechanisms in place to support actual entry-exit into a CPU idle state. + +cpuidle driver supports capability detection for a platform using the +init and exit routines. They will be called for each online CPU, with a +percpu cpuidle_driver object and driver should fill in cpuidle_states +inside cpuidle_driver depending on the CPU capability. + +Driver can handle dynamic state changes (like battery<->AC), by calling +force_redetect interface. + +It is possible to have more than one driver registered at the same time and +user can switch between drivers using /sysfs interface (when enabled).
+ +Interfaces: +int cpuidle_register_driver(struct cpuidle_driver *drv); +void cpuidle_unregister_driver(struct cpuidle_driver *drv); +int cpuidle_force_redetect(struct cpuidle_device *dev); Index: linux-rt.q/Documentation/cpuidle/governor.txt =================================================================== --- /dev/null +++ linux-rt.q/Documentation/cpuidle/governor.txt @@ -0,0 +1,28 @@ + + + + Supporting multiple CPU idle levels in kernel + + cpuidle governors + + + + +cpuidle governor is a policy routine that decides what idle state to enter at +any given time. cpuidle core uses different callbacks to governor while +handling idle entry. +* select_state() callback where governor can determine next idle state to enter +* prepare_idle() callback is called before entering an idle state +* scan() callback is called after a driver forces redetection of the states + +More than one governor can be registered at the same time and +user can switch between governors using /sysfs interface (when supported). + +More than one governor part is supported for developers to easily experiment +with different governors. By default, most optimal governor based on your +kernel configuration and platform will be selected by cpuidle. + +Interfaces: +int cpuidle_register_governor(struct cpuidle_governor *gov); +void cpuidle_unregister_governor(struct cpuidle_governor *gov); + Index: linux-rt.q/Documentation/cpuidle/sysfs.txt =================================================================== --- /dev/null +++ linux-rt.q/Documentation/cpuidle/sysfs.txt @@ -0,0 +1,35 @@ + + + Supporting multiple CPU idle levels in kernel + + cpuidle sysfs + +System global cpuidle related information and tunables are under +/sys/devices/system/cpu/cpuidle + +The current interfaces in this directory have self-explanatory names: +* current_driver_ro +* current_governor_ro + +With cpuidle_sysfs_switch boot option (meant for developer testing) +following objects are visible instead.
+* available_drivers +* available_governors +* current_driver +* current_governor +In this case user can switch the driver, governor at run time by writing +onto current_driver and current_governor. + + +Per logical CPU specific cpuidle information are under +/sys/devices/system/cpu/cpuX/cpuidle +for each online cpu X + +Under this percpu directory, there is a directory for each idle state supported +by the driver, which in turn has +* latency : Latency to exit out of this idle state (in microseconds) +* power : Power consumed while in this idle state (in milliwatts) +* time : Total time spent in this idle state (in microseconds) +* usage : Number of times this state was entered (count) + + Index: linux-rt.q/arch/i386/Kconfig =================================================================== --- linux-rt.q.orig/arch/i386/Kconfig +++ linux-rt.q/arch/i386/Kconfig @@ -1053,6 +1053,8 @@ endif # APM source "arch/i386/kernel/cpu/cpufreq/Kconfig" +source "drivers/cpuidle/Kconfig" + endmenu menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" Index: linux-rt.q/arch/i386/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/process.c +++ linux-rt.q/arch/i386/kernel/process.c @@ -179,13 +179,14 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + tick_nohz_stop_sched_tick(); + check_pgt_cache(); rmb(); idle = pm_idle; Index: linux-rt.q/arch/x86_64/Kconfig =================================================================== --- linux-rt.q.orig/arch/x86_64/Kconfig +++ linux-rt.q/arch/x86_64/Kconfig @@ -698,6 +698,8 @@ source "drivers/acpi/Kconfig" source "arch/x86_64/kernel/cpufreq/Kconfig" +source "drivers/cpuidle/Kconfig" + endmenu menu "Bus options (PCI etc.)" Index: linux-rt.q/drivers/Makefile 
=================================================================== --- linux-rt.q.orig/drivers/Makefile +++ linux-rt.q/drivers/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_EDAC) += edac/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-$(CONFIG_MMC) += mmc/ obj-$(CONFIG_NEW_LEDS) += leds/ obj-$(CONFIG_INFINIBAND) += infiniband/ Index: linux-rt.q/drivers/acpi/osl.c =================================================================== --- linux-rt.q.orig/drivers/acpi/osl.c +++ linux-rt.q/drivers/acpi/osl.c @@ -1056,6 +1056,17 @@ unsigned int max_cstate = ACPI_PROCESSOR EXPORT_SYMBOL(max_cstate); +void (*acpi_do_set_cstate_limit)(void); +EXPORT_SYMBOL(acpi_do_set_cstate_limit); + +void acpi_set_cstate_limit(unsigned int new_limit) +{ + max_cstate = new_limit; + if (acpi_do_set_cstate_limit) + acpi_do_set_cstate_limit(); +} +EXPORT_SYMBOL(acpi_set_cstate_limit); + /* * Acquire a spinlock. * Index: linux-rt.q/drivers/acpi/processor_core.c =================================================================== --- linux-rt.q.orig/drivers/acpi/processor_core.c +++ linux-rt.q/drivers/acpi/processor_core.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -1022,11 +1023,15 @@ static int __init acpi_processor_init(vo acpi_processor_ppc_init(); + cpuidle_register_driver(&acpi_idle_driver); + acpi_do_set_cstate_limit = acpi_max_cstate_changed; return 0; } static void __exit acpi_processor_exit(void) { + acpi_do_set_cstate_limit = NULL; + cpuidle_unregister_driver(&acpi_idle_driver); acpi_processor_ppc_exit(); Index: linux-rt.q/drivers/acpi/processor_idle.c =================================================================== --- linux-rt.q.orig/drivers/acpi/processor_idle.c +++ linux-rt.q/drivers/acpi/processor_idle.c @@ -40,6 +40,7 @@ #include /* need_resched() */ #include #include +#include /* * Include the apic definitions for x86 to have the APIC timer related defines @@ -62,25 
+63,34 @@ #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_idle"); #define ACPI_PROCESSOR_FILE_POWER "power" -#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) -#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ -#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ -static void (*pm_idle_save) (void) __read_mostly; -module_param(max_cstate, uint, 0644); +#define PM_TIMER_TICKS_TO_US(p) (((p) * 1000)/(PM_TIMER_FREQUENCY/1000)) +#define C2_OVERHEAD 1 /* 1us */ +#define C3_OVERHEAD 1 /* 1us */ + +void acpi_max_cstate_changed(void) +{ + /* Driver will reset devices' max cstate limit */ + cpuidle_force_redetect_devices(&acpi_idle_driver); +} + +static int change_max_cstate(const char *val, struct kernel_param *kp) +{ + int max; + + max = simple_strtol(val, NULL, 0); + if (!max) + return -EINVAL; + max_cstate = max; + if (acpi_do_set_cstate_limit) + acpi_do_set_cstate_limit(); + return 0; +} + +module_param_call(max_cstate, change_max_cstate, param_get_uint, &max_cstate, 0644); static unsigned int nocst __read_mostly; module_param(nocst, uint, 0000); -/* - * bm_history -- bit-mask with a bit per jiffy of bus-master activity - * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms - * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms - * 100 HZ: 0x0000000F: 4 jiffies = 40ms - * reduce history for more aggressive entry into C3 - */ -static unsigned int bm_history __read_mostly = - (HZ >= 800 ? 
0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); -module_param(bm_history, uint, 0644); /* -------------------------------------------------------------------------- Power Management -------------------------------------------------------------------------- */ @@ -166,88 +176,6 @@ static struct dmi_system_id __cpuinitdat {}, }; -static inline u32 ticks_elapsed(u32 t1, u32 t2) -{ - if (t2 >= t1) - return (t2 - t1); - else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) - return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); - else - return ((0xFFFFFFFF - t1) + t2); -} - -static void -acpi_processor_power_activate(struct acpi_processor *pr, - struct acpi_processor_cx *new) -{ - struct acpi_processor_cx *old; - - if (!pr || !new) - return; - - old = pr->power.state; - - if (old) - old->promotion.count = 0; - new->demotion.count = 0; - - /* Cleanup from old state. */ - if (old) { - switch (old->type) { - case ACPI_STATE_C3: - /* Disable bus master reload */ - if (new->type != ACPI_STATE_C3 && pr->flags.bm_check) - acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - break; - } - } - - /* Prepare to use new state. */ - switch (new->type) { - case ACPI_STATE_C3: - /* Enable bus master reload */ - if (old->type != ACPI_STATE_C3 && pr->flags.bm_check) - acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1); - break; - } - - pr->power.state = new; - - return; -} - -static void acpi_safe_halt(void) -{ - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - if (!need_resched()) - safe_halt(); - current_thread_info()->status |= TS_POLLING; -} - -static atomic_t c3_cpu_count; - -/* Common C-state entry for C2, C3, .. 
*/ -static void acpi_cstate_enter(struct acpi_processor_cx *cstate) -{ - if (cstate->space_id == ACPI_CSTATE_FFH) { - /* Call into architectural FFH based C-state */ - acpi_processor_ffh_cstate_enter(cstate); - } else { - int unused; - /* IO port based C-state */ - inb(cstate->address); - /* Dummy wait op - must do something useless after P_LVL2 read - because chipsets cannot guarantee that STPCLK# signal - gets asserted in time to freeze execution properly. */ - unused = inl(acpi_gbl_FADT.xpm_timer_block.address); - } -} - #ifdef ARCH_APICTIMER_STOPS_ON_C3 /* @@ -323,378 +251,6 @@ static void acpi_state_timer_broadcast(s } #endif - -static void acpi_processor_idle(void) -{ - struct acpi_processor *pr = NULL; - struct acpi_processor_cx *cx = NULL; - struct acpi_processor_cx *next_state = NULL; - int sleep_ticks = 0; - u32 t1, t2 = 0; - - /* - * Interrupts must be disabled during bus mastering calculations and - * for C2/C3 transitions. - */ - local_irq_disable(); - - pr = processors[smp_processor_id()]; - if (!pr) { - local_irq_enable(); - return; - } - - /* - * Check whether we truly need to go idle, or should - * reschedule: - */ - if (unlikely(need_resched())) { - local_irq_enable(); - return; - } - - cx = pr->power.state; - if (!cx) { - if (pm_idle_save) - pm_idle_save(); - else - acpi_safe_halt(); - return; - } - - /* - * Check BM Activity - * ----------------- - * Check for bus mastering activity (if required), record, and check - * for demotion. 
- */ - if (pr->flags.bm_check) { - u32 bm_status = 0; - unsigned long diff = jiffies - pr->power.bm_check_timestamp; - - if (diff > 31) - diff = 31; - - pr->power.bm_activity <<= diff; - - acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); - if (bm_status) { - pr->power.bm_activity |= 0x1; - acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); - } - /* - * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect - * the true state of bus mastering activity; forcing us to - * manually check the BMIDEA bit of each IDE channel. - */ - else if (errata.piix4.bmisx) { - if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01) - || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01)) - pr->power.bm_activity |= 0x1; - } - - pr->power.bm_check_timestamp = jiffies; - - /* - * If bus mastering is or was active this jiffy, demote - * to avoid a faulty transition. Note that the processor - * won't enter a low-power state during this call (to this - * function) but should upon the next. - * - * TBD: A better policy might be to fallback to the demotion - * state (use it for this quantum only) istead of - * demoting -- and rely on duration as our sole demotion - * qualification. This may, however, introduce DMA - * issues (e.g. floppy DMA transfer overrun/underrun). - */ - if ((pr->power.bm_activity & 0x1) && - cx->demotion.threshold.bm) { - local_irq_enable(); - next_state = cx->demotion.state; - goto end; - } - } - -#ifdef CONFIG_HOTPLUG_CPU - /* - * Check for P_LVL2_UP flag before entering C2 and above on - * an SMP system. We do it here instead of doing it at _CST/P_LVL - * detection phase, to work cleanly with logical CPU hotplug. - */ - if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) && - !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) - cx = &pr->power.states[ACPI_STATE_C1]; -#endif - - /* - * Sleep: - * ------ - * Invoke the current Cx state to put the processor to sleep. 
- */ - if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - if (need_resched()) { - current_thread_info()->status |= TS_POLLING; - local_irq_enable(); - return; - } - } - - switch (cx->type) { - - case ACPI_STATE_C1: - /* - * Invoke C1. - * Use the appropriate idle routine, the one that would - * be used without acpi C-states. - */ - if (pm_idle_save) - pm_idle_save(); - else - acpi_safe_halt(); - - /* - * TBD: Can't get time duration while in C1, as resumes - * go to an ISR rather than here. Need to instrument - * base interrupt handler. - */ - sleep_ticks = 0xFFFFFFFF; - break; - - case ACPI_STATE_C2: - /* Get start time (ticks) */ - t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); - /* Invoke C2 */ - acpi_state_timer_broadcast(pr, cx, 1); - acpi_cstate_enter(cx); - /* Get end time (ticks) */ - t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); - -#ifdef CONFIG_GENERIC_TIME - /* TSC halts in C2, so notify users */ - mark_tsc_unstable("possible TSC halt in C2"); -#endif - /* Re-enable interrupts */ - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; - acpi_state_timer_broadcast(pr, cx, 0); - break; - - case ACPI_STATE_C3: - - if (pr->flags.bm_check) { - if (atomic_inc_return(&c3_cpu_count) == - num_online_cpus()) { - /* - * All CPUs are trying to go to C3 - * Disable bus master arbitration - */ - acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); - } - } else { - /* SMP with no shared cache... 
Invalidate cache */ - ACPI_FLUSH_CPU_CACHE(); - } - - /* Get start time (ticks) */ - t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); - /* Invoke C3 */ - acpi_state_timer_broadcast(pr, cx, 1); - acpi_cstate_enter(cx); - /* Get end time (ticks) */ - t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); - if (pr->flags.bm_check) { - /* Enable bus master arbitration */ - atomic_dec(&c3_cpu_count); - acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); - } - -#ifdef CONFIG_GENERIC_TIME - /* TSC halts in C3, so notify users */ - mark_tsc_unstable("TSC halts in C3"); -#endif - /* Re-enable interrupts */ - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; - acpi_state_timer_broadcast(pr, cx, 0); - break; - - default: - local_irq_enable(); - return; - } - cx->usage++; - if ((cx->type != ACPI_STATE_C1) && (sleep_ticks > 0)) - cx->time += sleep_ticks; - - next_state = pr->power.state; - -#ifdef CONFIG_HOTPLUG_CPU - /* Don't do promotion/demotion */ - if ((cx->type == ACPI_STATE_C1) && (num_online_cpus() > 1) && - !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) { - next_state = cx; - goto end; - } -#endif - - /* - * Promotion? - * ---------- - * Track the number of longs (time asleep is greater than threshold) - * and promote when the count threshold is reached. Note that bus - * mastering activity may prevent promotions. - * Do not promote above max_cstate. - */ - if (cx->promotion.state && - ((cx->promotion.state - pr->power.states) <= max_cstate)) { - if (sleep_ticks > cx->promotion.threshold.ticks && - cx->promotion.state->latency <= system_latency_constraint()) { - cx->promotion.count++; - cx->demotion.count = 0; - if (cx->promotion.count >= - cx->promotion.threshold.count) { - if (pr->flags.bm_check) { - if (! 
- (pr->power.bm_activity & cx-> - promotion.threshold.bm)) { - next_state = - cx->promotion.state; - goto end; - } - } else { - next_state = cx->promotion.state; - goto end; - } - } - } - } - - /* - * Demotion? - * --------- - * Track the number of shorts (time asleep is less than time threshold) - * and demote when the usage threshold is reached. - */ - if (cx->demotion.state) { - if (sleep_ticks < cx->demotion.threshold.ticks) { - cx->demotion.count++; - cx->promotion.count = 0; - if (cx->demotion.count >= cx->demotion.threshold.count) { - next_state = cx->demotion.state; - goto end; - } - } - } - - end: - /* - * Demote if current state exceeds max_cstate - * or if the latency of the current state is unacceptable - */ - if ((pr->power.state - pr->power.states) > max_cstate || - pr->power.state->latency > system_latency_constraint()) { - if (cx->demotion.state) - next_state = cx->demotion.state; - } - - /* - * New Cx State? - * ------------- - * If we're going to start using a new Cx state we must clean up - * from the previous and prepare to use the new. - */ - if (next_state != pr->power.state) - acpi_processor_power_activate(pr, next_state); -} - -static int acpi_processor_set_power_policy(struct acpi_processor *pr) -{ - unsigned int i; - unsigned int state_is_set = 0; - struct acpi_processor_cx *lower = NULL; - struct acpi_processor_cx *higher = NULL; - struct acpi_processor_cx *cx; - - - if (!pr) - return -EINVAL; - - /* - * This function sets the default Cx state policy (OS idle handler). - * Our scheme is to promote quickly to C2 but more conservatively - * to C3. We're favoring C2 for its characteristics of low latency - * (quick response), good power savings, and ability to allow bus - * mastering activity. Note that the Cx state policy is completely - * customizable and can be altered dynamically. 
- */ - - /* startup state */ - for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { - cx = &pr->power.states[i]; - if (!cx->valid) - continue; - - if (!state_is_set) - pr->power.state = cx; - state_is_set++; - break; - } - - if (!state_is_set) - return -ENODEV; - - /* demotion */ - for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { - cx = &pr->power.states[i]; - if (!cx->valid) - continue; - - if (lower) { - cx->demotion.state = lower; - cx->demotion.threshold.ticks = cx->latency_ticks; - cx->demotion.threshold.count = 1; - if (cx->type == ACPI_STATE_C3) - cx->demotion.threshold.bm = bm_history; - } - - lower = cx; - } - - /* promotion */ - for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) { - cx = &pr->power.states[i]; - if (!cx->valid) - continue; - - if (higher) { - cx->promotion.state = higher; - cx->promotion.threshold.ticks = cx->latency_ticks; - if (cx->type >= ACPI_STATE_C2) - cx->promotion.threshold.count = 4; - else - cx->promotion.threshold.count = 10; - if (higher->type == ACPI_STATE_C3) - cx->promotion.threshold.bm = bm_history; - } - - higher = cx; - } - - return 0; -} - static int acpi_processor_get_power_info_fadt(struct acpi_processor *pr) { @@ -912,7 +468,7 @@ static void acpi_processor_power_verify_ * Normalize the C2 latency to expidite policy */ cx->valid = 1; - cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); + cx->latency_ticks = cx->latency; return; } @@ -986,7 +542,7 @@ static void acpi_processor_power_verify_ * use this in our C3 policy */ cx->valid = 1; - cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); + cx->latency_ticks = cx->latency; return; } @@ -1052,18 +608,6 @@ static int acpi_processor_get_power_info pr->power.count = acpi_processor_power_verify(pr); /* - * Set Default Policy - * ------------------ - * Now that we know which states are supported, set the default - * policy. Note that this policy can be changed dynamically - * (e.g. encourage deeper sleeps to conserve battery life when - * not on AC). 
- */ - result = acpi_processor_set_power_policy(pr); - if (result) - return result; - - /* * if one state of type C2 or C3 is available, mark this * CPU as being "idle manageable" */ @@ -1080,9 +624,6 @@ static int acpi_processor_get_power_info int acpi_processor_cst_has_changed(struct acpi_processor *pr) { - int result = 0; - - if (!pr) return -EINVAL; @@ -1093,16 +634,9 @@ int acpi_processor_cst_has_changed(struc if (!pr->flags.power_setup_done) return -ENODEV; - /* Fall back to the default idle loop */ - pm_idle = pm_idle_save; - synchronize_sched(); /* Relies on interrupts forcing exit from idle. */ - - pr->flags.power = 0; - result = acpi_processor_get_power_info(pr); - if ((pr->flags.power == 1) && (pr->flags.power_setup_done)) - pm_idle = acpi_processor_idle; - - return result; + acpi_processor_get_power_info(pr); + return cpuidle_force_redetect(per_cpu(cpuidle_devices, pr->id), + &acpi_idle_driver); } /* proc interface */ @@ -1188,30 +722,6 @@ static const struct file_operations acpi .release = single_release, }; -#ifdef CONFIG_SMP -static void smp_callback(void *v) -{ - /* we already woke the CPU up, nothing more to do */ -} - -/* - * This function gets called when a part of the kernel has a new latency - * requirement. This means we need to get all processors out of their C-state, - * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that - * wakes them all right up. 
- */ -static int acpi_processor_latency_notify(struct notifier_block *b, - unsigned long l, void *v) -{ - smp_call_function(smp_callback, NULL, 0, 1); - return NOTIFY_OK; -} - -static struct notifier_block acpi_processor_latency_notifier = { - .notifier_call = acpi_processor_latency_notify, -}; -#endif - int __cpuinit acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device) { @@ -1228,9 +738,6 @@ int __cpuinit acpi_processor_power_init( "ACPI: processor limited to max C-state %d\n", max_cstate); first_run++; -#ifdef CONFIG_SMP - register_latency_notifier(&acpi_processor_latency_notifier); -#endif } if (!pr) @@ -1247,6 +754,7 @@ int __cpuinit acpi_processor_power_init( acpi_processor_get_power_info(pr); + /* * Install the idle handler if processor power management is supported. * Note that we use previously set idle handler will be used on @@ -1259,11 +767,6 @@ int __cpuinit acpi_processor_power_init( printk(" C%d[C%d]", i, pr->power.states[i].type); printk(")\n"); - - if (pr->id == 0) { - pm_idle_save = pm_idle; - pm_idle = acpi_processor_idle; - } } /* 'power' [R] */ @@ -1291,21 +794,344 @@ int acpi_processor_power_exit(struct acp if (acpi_device_dir(device)) remove_proc_entry(ACPI_PROCESSOR_FILE_POWER, acpi_device_dir(device)); + return 0; +} - /* Unregister the idle handler when processor #0 is removed. 
*/ - if (pr->id == 0) { - pm_idle = pm_idle_save; +/** + * ticks_elapsed_in_us - a helper function that determines how many ticks (in US) + * have elapsed between two PM Timer timestamps + * @t1: the start time + * @t2: the end time + */ +static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2) +{ + if (t2 >= t1) + return PM_TIMER_TICKS_TO_US(t2 - t1); + else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) + return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); + else + return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2); +} - /* - * We are about to unload the current idle thread pm callback - * (pm_idle), Wait for all processors to update cached/local - * copies of pm_idle before proceeding. - */ - cpu_idle_wait(); -#ifdef CONFIG_SMP - unregister_latency_notifier(&acpi_processor_latency_notifier); +static inline u32 ticks_elapsed(u32 t1, u32 t2) +{ + if (t2 >= t1) + return (t2 - t1); + else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) + return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); + else + return ((0xFFFFFFFF - t1) + t2); +} + +/** + * acpi_idle_update_bm_rld - updates the BM_RLD bit depending on target state + * @pr: the processor + * @target: the new target state + */ +static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr, + struct acpi_processor_cx *target) +{ + if (pr->flags.bm_rld_set && target->type != ACPI_STATE_C3) { + acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); + pr->flags.bm_rld_set = 0; + } + + if (!pr->flags.bm_rld_set && target->type == ACPI_STATE_C3) { + acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1); + pr->flags.bm_rld_set = 1; + } +} + +/** + * acpi_idle_do_entry - a helper function that does C2 and C3 type entry + * @cx: cstate data + */ +static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) +{ + if (cx->space_id == ACPI_CSTATE_FFH) { + /* Call into architectural FFH based C-state */ + acpi_processor_ffh_cstate_enter(cx); + } else { + int unused; + /* IO port based C-state */ +
inb(cx->address); + /* Dummy wait op - must do something useless after P_LVL2 read + because chipsets cannot guarantee that STPCLK# signal + gets asserted in time to freeze execution properly. */ + unused = inl(acpi_gbl_FADT.xpm_timer_block.address); + } +} + +/** + * acpi_idle_enter_c1 - enters an ACPI C1 state-type + * @dev: the target CPU + * @state: the state data + * + * This is equivalent to the HALT instruction. + */ +static int acpi_idle_enter_c1(struct cpuidle_device *dev, + struct cpuidle_state *state) +{ + struct acpi_processor *pr; + struct acpi_processor_cx *cx = cpuidle_get_statedata(state); + pr = processors[smp_processor_id()]; + + if (unlikely(!pr)) + return 0; + + if (pr->flags.bm_check) + acpi_idle_update_bm_rld(pr, cx); + + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we test + * NEED_RESCHED: + */ + smp_mb(); + if (!need_resched()) + safe_halt(); + current_thread_info()->status |= TS_POLLING; + + cx->usage++; + + return 0; +} + +/** + * acpi_idle_enter_c2 - enters an ACPI C2 state-type + * @dev: the target CPU + * @state: the state data + */ +static int acpi_idle_enter_c2(struct cpuidle_device *dev, + struct cpuidle_state *state) +{ + struct acpi_processor *pr; + struct acpi_processor_cx *cx = cpuidle_get_statedata(state); + u32 t1, t2; + pr = processors[smp_processor_id()]; + + if (unlikely(!pr)) + return 0; + + if (pr->flags.bm_check) + acpi_idle_update_bm_rld(pr, cx); + + local_irq_disable(); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we test + * NEED_RESCHED: + */ + smp_mb(); + + if (unlikely(need_resched())) { + current_thread_info()->status |= TS_POLLING; + local_irq_enable(); + return 0; + } + + t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + acpi_state_timer_broadcast(pr, cx, 1); + acpi_idle_do_entry(cx); + t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); + +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C2, so notify 
users */ + mark_tsc_unstable("possible TSC halt in C2"); #endif + + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; + + cx->usage++; + + acpi_state_timer_broadcast(pr, cx, 0); + cx->time += ticks_elapsed(t1, t2); + return ticks_elapsed_in_us(t1, t2); +} + +static int c3_cpu_count; +static DEFINE_SPINLOCK(c3_lock); + +/** + * acpi_idle_enter_c3 - enters an ACPI C3 state-type + * @dev: the target CPU + * @state: the state data + * + * Similar to C2 entry, except special bus master handling is needed. + */ +static int acpi_idle_enter_c3(struct cpuidle_device *dev, + struct cpuidle_state *state) +{ + struct acpi_processor *pr; + struct acpi_processor_cx *cx = cpuidle_get_statedata(state); + u32 t1, t2; + pr = processors[smp_processor_id()]; + + if (unlikely(!pr)) + return 0; + + if (pr->flags.bm_check) + acpi_idle_update_bm_rld(pr, cx); + + local_irq_disable(); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we test + * NEED_RESCHED: + */ + smp_mb(); + + if (unlikely(need_resched())) { + current_thread_info()->status |= TS_POLLING; + local_irq_enable(); + return 0; } + /* disable bus master */ + if (pr->flags.bm_check) { + spin_lock(&c3_lock); + c3_cpu_count++; + if (c3_cpu_count == num_online_cpus()) { + /* + * All CPUs are trying to go to C3 + * Disable bus master arbitration + */ + acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); + } + spin_unlock(&c3_lock); + } else { + /* SMP with no shared cache... 
Invalidate cache */ + ACPI_FLUSH_CPU_CACHE(); + } + + /* Get start time (ticks) */ + t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + acpi_state_timer_broadcast(pr, cx, 1); + acpi_idle_do_entry(cx); + t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); + + if (pr->flags.bm_check) { + spin_lock(&c3_lock); + /* Enable bus master arbitration */ + if (c3_cpu_count == num_online_cpus()) + acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); + c3_cpu_count--; + spin_unlock(&c3_lock); + } + +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C3, so notify users */ + mark_tsc_unstable("TSC halts in C3"); +#endif + + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; + + cx->usage++; + + acpi_state_timer_broadcast(pr, cx, 0); + cx->time += ticks_elapsed(t1, t2); + return ticks_elapsed_in_us(t1, t2); +} + +/** + * acpi_idle_bm_check - checks if bus master activity was detected + */ +static int acpi_idle_bm_check(void) +{ + u32 bm_status = 0; + + acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); + if (bm_status) + acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); + /* + * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect + * the true state of bus mastering activity; forcing us to + * manually check the BMIDEA bit of each IDE channel. 
+ */ + else if (errata.piix4.bmisx) { + if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01) + || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01)) + bm_status = 1; + } + return bm_status; +} + +/** + * acpi_idle_init - attaches the driver to a CPU + * @dev: the CPU + */ +static int acpi_idle_init(struct cpuidle_device *dev) +{ + int cpu = dev->cpu; + int i, count = 0; + struct acpi_processor_cx *cx; + struct cpuidle_state *state; + + struct acpi_processor *pr = processors[cpu]; + + if (!pr->flags.power_setup_done) + return -EINVAL; + + if (pr->flags.power == 0) { + return -EINVAL; + } + + for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) { + cx = &pr->power.states[i]; + state = &dev->states[count]; + + if (!cx->valid) + continue; + +#ifdef CONFIG_HOTPLUG_CPU + if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) && + !pr->flags.has_cst && + !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) + continue; +#endif + cpuidle_set_statedata(state, cx); + + state->exit_latency = cx->latency; + state->target_residency = cx->latency * 6; + state->power_usage = cx->power; + + state->flags = 0; + switch (cx->type) { + case ACPI_STATE_C1: + state->flags |= CPUIDLE_FLAG_SHALLOW; + state->enter = acpi_idle_enter_c1; + break; + + case ACPI_STATE_C2: + state->flags |= CPUIDLE_FLAG_BALANCED; + state->flags |= CPUIDLE_FLAG_TIME_VALID; + state->enter = acpi_idle_enter_c2; + break; + + case ACPI_STATE_C3: + state->flags |= CPUIDLE_FLAG_DEEP; + state->flags |= CPUIDLE_FLAG_TIME_VALID; + state->flags |= CPUIDLE_FLAG_CHECK_BM; + state->enter = acpi_idle_enter_c3; + break; + } + + count++; + } + + if (!count) + return -EINVAL; + + dev->state_count = count; return 0; } + +struct cpuidle_driver acpi_idle_driver = { + .name = "acpi_idle", + .init = acpi_idle_init, + .redetect = acpi_idle_init, + .bm_check = acpi_idle_bm_check, + .owner = THIS_MODULE, +}; Index: linux-rt.q/drivers/cpuidle/Kconfig =================================================================== --- /dev/null +++ 
linux-rt.q/drivers/cpuidle/Kconfig @@ -0,0 +1,39 @@ +menu "CPU idle PM support" + +config CPU_IDLE + bool "CPU idle PM support" + help + CPU idle is a generic framework for supporting software-controlled + idle processor power management. It includes modular cross-platform + governors that can be swapped during runtime. + + If you're using a mobile platform that supports CPU idle PM (e.g. + an ACPI-capable notebook), you should say Y here. + +if CPU_IDLE + +comment "Governors" + +config CPU_IDLE_GOV_LADDER + tristate "'ladder' governor" + depends on CPU_IDLE + default y + help + This cpuidle governor promotes and demotes through the supported idle + states using residency time and bus master activity as metrics. This + algorithm was originally introduced in the old ACPI processor driver. + +config CPU_IDLE_GOV_MENU + tristate "'menu' governor" + depends on CPU_IDLE && NO_HZ + default y + help + This cpuidle governor evaluates all available states and chooses the + deepest state that meets all of the following constraints: BM activity, + expected time until next timer interrupt, and last break event time + delta. It is designed to minimize power consumption. Currently + dynticks is required. + +endif # CPU_IDLE + +endmenu Index: linux-rt.q/drivers/cpuidle/Makefile =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for cpuidle. +# + +obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ Index: linux-rt.q/drivers/cpuidle/cpuidle.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/cpuidle.c @@ -0,0 +1,306 @@ +/* + * cpuidle.c - core cpuidle infrastructure + * + * (C) 2006-2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cpuidle.h" + +DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices); +EXPORT_PER_CPU_SYMBOL_GPL(cpuidle_devices); + +DEFINE_MUTEX(cpuidle_lock); +LIST_HEAD(cpuidle_detected_devices); +static void (*pm_idle_old)(void); + +/** + * cpuidle_idle_call - the main idle loop + * + * NOTE: no locks or semaphores should be used here + */ +static void cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __get_cpu_var(cpuidle_devices); + struct cpuidle_state *target_state; + int next_state; + + /* check if the device is ready */ + if (!dev || dev->status != CPUIDLE_STATUS_DOIDLE) { + if (pm_idle_old) + pm_idle_old(); + else + local_irq_enable(); + return; + } + + /* ask the governor for the next state */ + next_state = cpuidle_curr_governor->select(dev); + if (need_resched()) + return; + target_state = &dev->states[next_state]; + + /* enter the state and update stats */ + dev->last_residency = target_state->enter(dev, target_state); + dev->last_state = target_state; + target_state->time += dev->last_residency; + target_state->usage++; + + /* give the governor an opportunity to reflect on the outcome */ + if (cpuidle_curr_governor->reflect) + cpuidle_curr_governor->reflect(dev); +} + +/** + * cpuidle_install_idle_handler - installs the cpuidle idle loop handler + */ +void cpuidle_install_idle_handler(void) +{ + if (pm_idle != cpuidle_idle_call) { + /* Make sure all changes finished before we switch to new idle */ + smp_wmb(); + pm_idle = cpuidle_idle_call; + } +} + +/** + * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler + */ +void cpuidle_uninstall_idle_handler(void) +{ + if (pm_idle != pm_idle_old) { + pm_idle = pm_idle_old; + cpu_idle_wait(); + } +} + +/** + * cpuidle_rescan_device - prepares for a new state configuration + * @dev: the target device + * + * Must be called with cpuidle_lock aquired. 
+ */ +void cpuidle_rescan_device(struct cpuidle_device *dev) +{ + int i; + + if (cpuidle_curr_governor->scan) + cpuidle_curr_governor->scan(dev); + + for (i = 0; i < dev->state_count; i++) { + dev->states[i].usage = 0; + dev->states[i].time = 0; + } +} + +/** + * cpuidle_add_device - attaches the driver to a CPU instance + * @sys_dev: the system device (driver model CPU representation) + */ +static int cpuidle_add_device(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + struct cpuidle_device *dev; + + dev = per_cpu(cpuidle_devices, cpu); + + mutex_lock(&cpuidle_lock); + if (cpu_is_offline(cpu)) { + mutex_unlock(&cpuidle_lock); + return 0; + } + + if (!dev) { + dev = kzalloc(sizeof(struct cpuidle_device), GFP_KERNEL); + if (!dev) { + mutex_unlock(&cpuidle_lock); + return -ENOMEM; + } + init_completion(&dev->kobj_unregister); + per_cpu(cpuidle_devices, cpu) = dev; + } + dev->cpu = cpu; + + if (dev->status & CPUIDLE_STATUS_DETECTED) { + mutex_unlock(&cpuidle_lock); + return 0; + } + + cpuidle_add_sysfs(sys_dev); + + if (cpuidle_curr_driver) { + if (cpuidle_attach_driver(dev)) + goto err_ret; + } + + if (cpuidle_curr_governor) { + if (cpuidle_attach_governor(dev)) { + cpuidle_detach_driver(dev); + goto err_ret; + } + } + + if (cpuidle_device_can_idle(dev)) + cpuidle_install_idle_handler(); + + list_add(&dev->device_list, &cpuidle_detected_devices); + dev->status |= CPUIDLE_STATUS_DETECTED; + +err_ret: + mutex_unlock(&cpuidle_lock); + + return 0; +} + +/** + * __cpuidle_remove_device - detaches the driver from a CPU instance + * @sys_dev: the system device (driver model CPU representation) + * + * Must be called with cpuidle_lock aquired. 
+ */ +static int __cpuidle_remove_device(struct sys_device *sys_dev) +{ + struct cpuidle_device *dev; + + dev = per_cpu(cpuidle_devices, sys_dev->id); + + if (!(dev->status & CPUIDLE_STATUS_DETECTED)) { + return 0; + } + dev->status &= ~CPUIDLE_STATUS_DETECTED; + /* NOTE: we don't wait because the cpu is already offline */ + if (cpuidle_curr_governor) + cpuidle_detach_governor(dev); + if (cpuidle_curr_driver) + cpuidle_detach_driver(dev); + cpuidle_remove_sysfs(sys_dev); + list_del(&dev->device_list); + wait_for_completion(&dev->kobj_unregister); + per_cpu(cpuidle_devices, sys_dev->id) = NULL; + kfree(dev); + + return 0; +} + +/** + * cpuidle_remove_device - detaches the driver from a CPU instance + * @sys_dev: the system device (driver model CPU representation) + */ +static int cpuidle_remove_device(struct sys_device *sys_dev) +{ + int ret; + mutex_lock(&cpuidle_lock); + ret = __cpuidle_remove_device(sys_dev); + mutex_unlock(&cpuidle_lock); + + return ret; +} + +static struct sysdev_driver cpuidle_sysdev_driver = { + .add = cpuidle_add_device, + .remove = cpuidle_remove_device, +}; + +static int cpuidle_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev((unsigned long)hcpu); + + switch (action) { + case CPU_ONLINE: + cpuidle_add_device(sys_dev); + break; + case CPU_DOWN_PREPARE: + mutex_lock(&cpuidle_lock); + break; + case CPU_DEAD: + __cpuidle_remove_device(sys_dev); + mutex_unlock(&cpuidle_lock); + break; + case CPU_DOWN_FAILED: + mutex_unlock(&cpuidle_lock); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpuidle_cpu_notifier = +{ + .notifier_call = cpuidle_cpu_callback, +}; + +#ifdef CONFIG_SMP + +static void smp_callback(void *v) +{ + /* we already woke the CPU up, nothing more to do */ +} + +/* + * This function gets called when a part of the kernel has a new latency + * requirement. 
This means we need to get all processors out of their C-state, + * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that + * wakes them all right up. + */ +static int cpuidle_latency_notify(struct notifier_block *b, + unsigned long l, void *v) +{ + smp_call_function(smp_callback, NULL, 0, 1); + return NOTIFY_OK; +} + +static struct notifier_block cpuidle_latency_notifier = { + .notifier_call = cpuidle_latency_notify, +}; + +#define latency_notifier_init(x) do { register_latency_notifier(x); } while (0) + +#else /* CONFIG_SMP */ + +#define latency_notifier_init(x) do { } while (0) + +#endif /* CONFIG_SMP */ + +/** + * cpuidle_init - core initializer + */ +static int __init cpuidle_init(void) +{ + int ret; + + pm_idle_old = pm_idle; + + ret = cpuidle_add_class_sysfs(&cpu_sysdev_class); + if (ret) + return ret; + + register_hotcpu_notifier(&cpuidle_cpu_notifier); + + ret = sysdev_driver_register(&cpu_sysdev_class, &cpuidle_sysdev_driver); + + if (ret) { + cpuidle_remove_class_sysfs(&cpu_sysdev_class); + printk(KERN_ERR "cpuidle: failed to initialize\n"); + return ret; + } + + latency_notifier_init(&cpuidle_latency_notifier); + + return 0; +} + +core_initcall(cpuidle_init); Index: linux-rt.q/drivers/cpuidle/cpuidle.h =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/cpuidle.h @@ -0,0 +1,50 @@ +/* + * cpuidle.h - The internal header file + */ + +#ifndef __DRIVER_CPUIDLE_H +#define __DRIVER_CPUIDLE_H + +#include + +/* For internal use only */ +extern struct cpuidle_governor *cpuidle_curr_governor; +extern struct cpuidle_driver *cpuidle_curr_driver; +extern struct list_head cpuidle_drivers; +extern struct list_head cpuidle_governors; +extern struct list_head cpuidle_detected_devices; +extern struct mutex cpuidle_lock; + +/* idle loop */ +extern void cpuidle_install_idle_handler(void); +extern void cpuidle_uninstall_idle_handler(void); +extern void cpuidle_rescan_device(struct 
cpuidle_device *dev); + +/* drivers */ +extern int cpuidle_attach_driver(struct cpuidle_device *dev); +extern void cpuidle_detach_driver(struct cpuidle_device *dev); +extern int cpuidle_switch_driver(struct cpuidle_driver *drv); + +/* governors */ +extern int cpuidle_attach_governor(struct cpuidle_device *dev); +extern void cpuidle_detach_governor(struct cpuidle_device *dev); +extern int cpuidle_switch_governor(struct cpuidle_governor *gov); + +/* sysfs */ +extern int cpuidle_add_class_sysfs(struct sysdev_class *cls); +extern void cpuidle_remove_class_sysfs(struct sysdev_class *cls); +extern int cpuidle_add_driver_sysfs(struct cpuidle_device *device); +extern void cpuidle_remove_driver_sysfs(struct cpuidle_device *device); +extern int cpuidle_add_sysfs(struct sys_device *sysdev); +extern void cpuidle_remove_sysfs(struct sys_device *sysdev); + +/** + * cpuidle_device_can_idle - determines if a CPU can utilize the idle loop + * @dev: the target CPU + */ +static inline int cpuidle_device_can_idle(struct cpuidle_device *dev) +{ + return (dev->status == CPUIDLE_STATUS_DOIDLE); +} + +#endif /* __DRIVER_CPUIDLE_H */ Index: linux-rt.q/drivers/cpuidle/driver.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/driver.c @@ -0,0 +1,276 @@ +/* + * driver.c - driver support + * + * (C) 2006-2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. + */ + +#include +#include +#include + +#include "cpuidle.h" + +LIST_HEAD(cpuidle_drivers); +struct cpuidle_driver *cpuidle_curr_driver; + + +/** + * cpuidle_attach_driver - attaches a driver to a CPU + * @dev: the target CPU + * + * Must be called with cpuidle_lock aquired. 
+ */ +int cpuidle_attach_driver(struct cpuidle_device *dev) +{ + int ret; + + if (dev->status & CPUIDLE_STATUS_DRIVER_ATTACHED) + return -EIO; + + if (!try_module_get(cpuidle_curr_driver->owner)) + return -EINVAL; + + ret = cpuidle_curr_driver->init(dev); + if (ret) { + module_put(cpuidle_curr_driver->owner); + printk(KERN_INFO "cpuidle: driver %s failed to attach to " + "cpu %d\n", cpuidle_curr_driver->name, dev->cpu); + } else { + if (dev->status & CPUIDLE_STATUS_GOVERNOR_ATTACHED) + cpuidle_rescan_device(dev); + smp_wmb(); + dev->status |= CPUIDLE_STATUS_DRIVER_ATTACHED; + cpuidle_add_driver_sysfs(dev); + } + + return ret; +} + +/** + * cpuidle_detach_govenor - detaches a driver from a CPU + * @dev: the target CPU + * + * Must be called with cpuidle_lock aquired. + */ +void cpuidle_detach_driver(struct cpuidle_device *dev) +{ + if (dev->status & CPUIDLE_STATUS_DRIVER_ATTACHED) { + cpuidle_remove_driver_sysfs(dev); + dev->status &= ~CPUIDLE_STATUS_DRIVER_ATTACHED; + if (cpuidle_curr_driver->exit) + cpuidle_curr_driver->exit(dev); + module_put(cpuidle_curr_driver->owner); + } +} + +/** + * __cpuidle_find_driver - finds a driver of the specified name + * @str: the name + * + * Must be called with cpuidle_lock aquired. + */ +static struct cpuidle_driver * __cpuidle_find_driver(const char *str) +{ + struct cpuidle_driver *drv; + + list_for_each_entry(drv, &cpuidle_drivers, driver_list) + if (!strnicmp(str, drv->name, CPUIDLE_NAME_LEN)) + return drv; + + return NULL; +} + +/** + * cpuidle_switch_driver - changes the driver + * @drv: the new target driver + * + * NOTE: "drv" can be NULL to specify disabled + * Must be called with cpuidle_lock aquired. 
+ */ +int cpuidle_switch_driver(struct cpuidle_driver *drv) +{ + struct cpuidle_device *dev; + + if (drv == cpuidle_curr_driver) + return -EINVAL; + + cpuidle_uninstall_idle_handler(); + + if (cpuidle_curr_driver) + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_detach_driver(dev); + + cpuidle_curr_driver = drv; + + if (drv) { + int ret = 1; + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + if (cpuidle_attach_driver(dev) == 0) + ret = 0; + + /* If attach on all devices fail, switch to NULL driver */ + if (ret) + cpuidle_curr_driver = NULL; + + if (cpuidle_curr_driver && cpuidle_curr_governor) { + printk(KERN_INFO "cpuidle: using driver %s\n", + drv->name); + cpuidle_install_idle_handler(); + } + } + + return 0; +} + +/** + * cpuidle_register_driver - registers a driver + * @drv: the driver + */ +int cpuidle_register_driver(struct cpuidle_driver *drv) +{ + int ret = -EEXIST; + + if (!drv || !drv->init) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + if (__cpuidle_find_driver(drv->name) == NULL) { + ret = 0; + list_add_tail(&drv->driver_list, &cpuidle_drivers); + if (!cpuidle_curr_driver) + cpuidle_switch_driver(drv); + } + mutex_unlock(&cpuidle_lock); + + return ret; +} + +EXPORT_SYMBOL_GPL(cpuidle_register_driver); + +/** + * cpuidle_unregister_driver - unregisters a driver + * @drv: the driver + */ +void cpuidle_unregister_driver(struct cpuidle_driver *drv) +{ + if (!drv) + return; + + mutex_lock(&cpuidle_lock); + if (drv == cpuidle_curr_driver) + cpuidle_switch_driver(NULL); + list_del(&drv->driver_list); + mutex_unlock(&cpuidle_lock); +} + +EXPORT_SYMBOL_GPL(cpuidle_unregister_driver); + +static void __cpuidle_force_redetect(struct cpuidle_device *dev) +{ + cpuidle_remove_driver_sysfs(dev); + cpuidle_curr_driver->redetect(dev); + cpuidle_add_driver_sysfs(dev); +} + +/** + * cpuidle_force_redetect - redetects the idle states of a CPU + * + * @dev: the CPU to redetect + * @drv: the target driver + * + * Generally, 
the driver will call this when the supported states set has + * changed. (e.g. as the result of an ACPI transition to battery power) + */ +int cpuidle_force_redetect(struct cpuidle_device *dev, + struct cpuidle_driver *drv) +{ + int uninstalled = 0; + + mutex_lock(&cpuidle_lock); + + if (drv != cpuidle_curr_driver) { + mutex_unlock(&cpuidle_lock); + return 0; + } + + if (!(dev->status & CPUIDLE_STATUS_DRIVER_ATTACHED) || + !cpuidle_curr_driver->redetect) { + mutex_unlock(&cpuidle_lock); + return -EIO; + } + + if (cpuidle_device_can_idle(dev)) { + uninstalled = 1; + cpuidle_uninstall_idle_handler(); + } + + __cpuidle_force_redetect(dev); + + if (cpuidle_device_can_idle(dev)) { + cpuidle_rescan_device(dev); + cpuidle_install_idle_handler(); + } + + /* other devices are still ok */ + if (uninstalled) + cpuidle_install_idle_handler(); + + mutex_unlock(&cpuidle_lock); + + return 0; +} + +EXPORT_SYMBOL_GPL(cpuidle_force_redetect); + +/** + * cpuidle_force_redetect_devices - redetects the idle states of all CPUs + * + * @drv: the target driver + * + * Generally, the driver will call this when the supported states set has + * changed. (e.g. 
as the result of an ACPI transition to battery power) + */ +int cpuidle_force_redetect_devices(struct cpuidle_driver *drv) +{ + struct cpuidle_device *dev; + int ret = 0; + + mutex_lock(&cpuidle_lock); + + if (drv != cpuidle_curr_driver) + goto out; + + if (!cpuidle_curr_driver->redetect) { + ret = -EIO; + goto out; + } + + cpuidle_uninstall_idle_handler(); + + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + __cpuidle_force_redetect(dev); + + cpuidle_install_idle_handler(); +out: + mutex_unlock(&cpuidle_lock); + return ret; +} + +EXPORT_SYMBOL_GPL(cpuidle_force_redetect_devices); + +/** + * cpuidle_get_bm_activity - determines if BM activity has occured + */ +int cpuidle_get_bm_activity(void) +{ + if (cpuidle_curr_driver->bm_check) + return cpuidle_curr_driver->bm_check(); + else + return 0; +} +EXPORT_SYMBOL_GPL(cpuidle_get_bm_activity); + Index: linux-rt.q/drivers/cpuidle/governor.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/governor.c @@ -0,0 +1,187 @@ +/* + * governor.c - governor support + * + * (C) 2006-2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. + */ + +#include +#include +#include + +#include "cpuidle.h" + +LIST_HEAD(cpuidle_governors); +struct cpuidle_governor *cpuidle_curr_governor; + + +/** + * cpuidle_attach_governor - attaches a governor to a CPU + * @dev: the target CPU + * + * Must be called with cpuidle_lock aquired. 
+ */ +int cpuidle_attach_governor(struct cpuidle_device *dev) +{ + int ret = 0; + + if(dev->status & CPUIDLE_STATUS_GOVERNOR_ATTACHED) + return -EIO; + + if (!try_module_get(cpuidle_curr_governor->owner)) + return -EINVAL; + + if (cpuidle_curr_governor->init) + ret = cpuidle_curr_governor->init(dev); + if (ret) { + module_put(cpuidle_curr_governor->owner); + printk(KERN_ERR "cpuidle: governor %s failed to attach to cpu %d\n", + cpuidle_curr_governor->name, dev->cpu); + } else { + if (dev->status & CPUIDLE_STATUS_DRIVER_ATTACHED) + cpuidle_rescan_device(dev); + smp_wmb(); + dev->status |= CPUIDLE_STATUS_GOVERNOR_ATTACHED; + } + + return ret; +} + +/** + * cpuidle_detach_govenor - detaches a governor from a CPU + * @dev: the target CPU + * + * Must be called with cpuidle_lock aquired. + */ +void cpuidle_detach_governor(struct cpuidle_device *dev) +{ + if (dev->status & CPUIDLE_STATUS_GOVERNOR_ATTACHED) { + dev->status &= ~CPUIDLE_STATUS_GOVERNOR_ATTACHED; + if (cpuidle_curr_governor->exit) + cpuidle_curr_governor->exit(dev); + module_put(cpuidle_curr_governor->owner); + } +} + +/** + * __cpuidle_find_governor - finds a governor of the specified name + * @str: the name + * + * Must be called with cpuidle_lock aquired. + */ +static struct cpuidle_governor * __cpuidle_find_governor(const char *str) +{ + struct cpuidle_governor *gov; + + list_for_each_entry(gov, &cpuidle_governors, governor_list) + if (!strnicmp(str, gov->name, CPUIDLE_NAME_LEN)) + return gov; + + return NULL; +} + +/** + * cpuidle_switch_governor - changes the governor + * @gov: the new target governor + * + * NOTE: "gov" can be NULL to specify disabled + * Must be called with cpuidle_lock aquired. 
+ */ +int cpuidle_switch_governor(struct cpuidle_governor *gov) +{ + struct cpuidle_device *dev; + + if (gov == cpuidle_curr_governor) + return -EINVAL; + + cpuidle_uninstall_idle_handler(); + + if (cpuidle_curr_governor) + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_detach_governor(dev); + + cpuidle_curr_governor = gov; + + if (gov) { + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_attach_governor(dev); + if (cpuidle_curr_driver) + cpuidle_install_idle_handler(); + printk(KERN_INFO "cpuidle: using governor %s\n", gov->name); + } + + return 0; +} + +/** + * cpuidle_register_governor - registers a governor + * @gov: the governor + */ +int cpuidle_register_governor(struct cpuidle_governor *gov) +{ + int ret = -EEXIST; + + if (!gov || !gov->select) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + if (__cpuidle_find_governor(gov->name) == NULL) { + ret = 0; + list_add_tail(&gov->governor_list, &cpuidle_governors); + if (!cpuidle_curr_governor || + cpuidle_curr_governor->rating < gov->rating) + cpuidle_switch_governor(gov); + } + mutex_unlock(&cpuidle_lock); + + return ret; +} + +EXPORT_SYMBOL_GPL(cpuidle_register_governor); + +/** + * cpuidle_replace_governor - find a replacement governor + * @exclude_rating: the rating that will be skipped while looking for + * new governor. 
+ */ +struct cpuidle_governor *cpuidle_replace_governor(int exclude_rating) +{ + struct cpuidle_governor *gov; + struct cpuidle_governor *ret_gov = NULL; + unsigned int max_rating = 0; + + list_for_each_entry(gov, &cpuidle_governors, governor_list) { + if (gov->rating == exclude_rating) + continue; + if (gov->rating > max_rating) { + max_rating = gov->rating; + ret_gov = gov; + } + } + + return ret_gov; +} + +/** + * cpuidle_unregister_governor - unregisters a governor + * @gov: the governor + */ +void cpuidle_unregister_governor(struct cpuidle_governor *gov) +{ + if (!gov) + return; + + mutex_lock(&cpuidle_lock); + if (gov == cpuidle_curr_governor) { + struct cpuidle_governor *new_gov; + new_gov = cpuidle_replace_governor(gov->rating); + cpuidle_switch_governor(new_gov); + } + list_del(&gov->governor_list); + mutex_unlock(&cpuidle_lock); +} + +EXPORT_SYMBOL_GPL(cpuidle_unregister_governor); Index: linux-rt.q/drivers/cpuidle/governors/Makefile =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/governors/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for cpuidle governors. +# + +obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o +obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o Index: linux-rt.q/drivers/cpuidle/governors/ladder.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/governors/ladder.c @@ -0,0 +1,228 @@ +/* + * ladder.c - the residency ladder algorithm + * + * Copyright (C) 2001, 2002 Andy Grover + * Copyright (C) 2001, 2002 Paul Diefenbaugh + * Copyright (C) 2004, 2005 Dominik Brodowski + * + * (C) 2006-2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +#define PROMOTION_COUNT 4 +#define DEMOTION_COUNT 1 + +/* + * bm_history -- bit-mask with a bit per jiffy of bus-master activity + * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms + * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms + * 100 HZ: 0x0000000F: 4 jiffies = 40ms + * reduce history for more aggressive entry into C3 + */ +static unsigned int bm_history __read_mostly = + (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); +module_param(bm_history, uint, 0644); + +struct ladder_device_state { + struct { + u32 promotion_count; + u32 demotion_count; + u32 promotion_time; + u32 demotion_time; + u32 bm; + } threshold; + struct { + int promotion_count; + int demotion_count; + } stats; +}; + +struct ladder_device { + struct ladder_device_state states[CPUIDLE_STATE_MAX]; + unsigned int bm_check:1; + unsigned long bm_check_timestamp; + unsigned long bm_activity; /* FIXME: bm activity should be global */ + int last_state_idx; +}; + +/** + * ladder_do_selection - prepares private data for a state change + * @ldev: the ladder device + * @old_idx: the current state index + * @new_idx: the new target state index + */ +static inline void ladder_do_selection(struct ladder_device *ldev, + int old_idx, int new_idx) +{ + ldev->states[old_idx].stats.promotion_count = 0; + ldev->states[old_idx].stats.demotion_count = 0; + ldev->last_state_idx = new_idx; +} + +/** + * ladder_select_state - selects the next state to enter + * @dev: the CPU + */ +static int ladder_select_state(struct cpuidle_device *dev) +{ + struct ladder_device *ldev = dev->governor_data; + struct ladder_device_state *last_state; + int last_residency, last_idx = ldev->last_state_idx; + + if (unlikely(!ldev)) + return 0; + + last_state = &ldev->states[last_idx]; + + /* demote if within BM threshold */ + if (ldev->bm_check) { + unsigned long diff; + + diff = jiffies - ldev->bm_check_timestamp; + if (diff > 31) + diff = 31; + + ldev->bm_activity <<= 
diff; + if (cpuidle_get_bm_activity()) + ldev->bm_activity |= ((1 << diff) - 1); + + ldev->bm_check_timestamp = jiffies; + if ((last_idx > 0) && + (last_state->threshold.bm & ldev->bm_activity)) { + ladder_do_selection(ldev, last_idx, last_idx - 1); + return last_idx - 1; + } + } + + if (dev->states[last_idx].flags & CPUIDLE_FLAG_TIME_VALID) + last_residency = cpuidle_get_last_residency(dev) - dev->states[last_idx].exit_latency; + else + last_residency = last_state->threshold.promotion_time + 1; + + /* consider promotion */ + if (last_idx < dev->state_count - 1 && + last_residency > last_state->threshold.promotion_time && + dev->states[last_idx + 1].exit_latency <= system_latency_constraint()) { + last_state->stats.promotion_count++; + last_state->stats.demotion_count = 0; + if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { + ladder_do_selection(ldev, last_idx, last_idx + 1); + return last_idx + 1; + } + } + + /* consider demotion */ + if (last_idx > 0 && + last_residency < last_state->threshold.demotion_time) { + last_state->stats.demotion_count++; + last_state->stats.promotion_count = 0; + if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { + ladder_do_selection(ldev, last_idx, last_idx - 1); + return last_idx - 1; + } + } + + /* otherwise remain at the current state */ + return last_idx; +} + +/** + * ladder_scan_device - scans a CPU's states and does setup + * @dev: the CPU + */ +static void ladder_scan_device(struct cpuidle_device *dev) +{ + int i, bm_check = 0; + struct ladder_device *ldev = dev->governor_data; + struct ladder_device_state *lstate; + struct cpuidle_state *state; + + ldev->last_state_idx = 0; + ldev->bm_check_timestamp = 0; + ldev->bm_activity = 0; + + for (i = 0; i < dev->state_count; i++) { + state = &dev->states[i]; + lstate = &ldev->states[i]; + + lstate->stats.promotion_count = 0; + lstate->stats.demotion_count = 0; + + lstate->threshold.promotion_count = PROMOTION_COUNT; + 
lstate->threshold.demotion_count = DEMOTION_COUNT; + + if (i < dev->state_count - 1) + lstate->threshold.promotion_time = state->exit_latency; + if (i > 0) + lstate->threshold.demotion_time = state->exit_latency; + if (state->flags & CPUIDLE_FLAG_CHECK_BM) { + lstate->threshold.bm = bm_history; + bm_check = 1; + } else + lstate->threshold.bm = 0; + } + + ldev->bm_check = bm_check; +} + +/** + * ladder_init_device - initializes a CPU-instance + * @dev: the CPU + */ +static int ladder_init_device(struct cpuidle_device *dev) +{ + dev->governor_data = kmalloc(sizeof(struct ladder_device), GFP_KERNEL); + + return !dev->governor_data; +} + +/** + * ladder_exit_device - exits a CPU-instance + * @dev: the CPU + */ +static void ladder_exit_device(struct cpuidle_device *dev) +{ + kfree(dev->governor_data); +} + +static struct cpuidle_governor ladder_governor = { + .name = "ladder", + .rating = 10, + .init = ladder_init_device, + .exit = ladder_exit_device, + .scan = ladder_scan_device, + .select = ladder_select_state, + .owner = THIS_MODULE, +}; + +/** + * init_ladder - initializes the governor + */ +static int __init init_ladder(void) +{ + return cpuidle_register_governor(&ladder_governor); +} + +/** + * exit_ladder - exits the governor + */ +static void __exit exit_ladder(void) +{ + cpuidle_unregister_governor(&ladder_governor); +} + +MODULE_LICENSE("GPL"); +module_init(init_ladder); +module_exit(exit_ladder); Index: linux-rt.q/drivers/cpuidle/governors/menu.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/governors/menu.c @@ -0,0 +1,181 @@ +/* + * menu.c - the menu idle governor + * + * Copyright (C) 2006-2007 Adam Belay + * + * This code is licenced under the GPL. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define BM_HOLDOFF 20000 /* 20 ms */ +#define DEMOTION_THRESHOLD 5 +#define DEMOTION_TIMEOUT_MULTIPLIER 1000 + +struct menu_device { + int last_state_idx; + + int deepest_break_state; + struct timespec break_expire_time_ts; + int break_last_cnt; + + int deepest_bm_state; + int bm_elapsed_us; + int bm_holdoff_us; +}; + +static DEFINE_PER_CPU(struct menu_device, menu_devices); + +/** + * menu_select - selects the next idle state to enter + * @dev: the CPU + */ +static int menu_select(struct cpuidle_device *dev) +{ + struct menu_device *data = &__get_cpu_var(menu_devices); + int i, expected_us, max_state = dev->state_count; + + /* discard BM history because it is sticky */ + cpuidle_get_bm_activity(); + + /* determine the expected residency time */ + expected_us = (s32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000; + + /* determine the maximum state compatible with current BM status */ + if (cpuidle_get_bm_activity()) + data->bm_elapsed_us = 0; + if (data->bm_elapsed_us <= data->bm_holdoff_us) + max_state = data->deepest_bm_state + 1; + + /* determine the maximum state compatible with recent idle breaks */ + if (data->deepest_break_state >= 0) { + struct timespec now; + ktime_get_ts(&now); + if (timespec_compare(&data->break_expire_time_ts, &now) > 0) { + max_state = min(max_state, + data->deepest_break_state + 1); + } else { + data->deepest_break_state = -1; + } + } + + /* find the deepest idle state that satisfies our constraints */ + for (i = 1; i < max_state; i++) { + struct cpuidle_state *s = &dev->states[i]; + + if (s->target_residency > expected_us) + break; + + if (s->exit_latency > system_latency_constraint()) + break; + } + + if (data->last_state_idx != i - 1) + data->break_last_cnt = 0; + + data->last_state_idx = i - 1; + return i - 1; +} + +/** + * menu_reflect - attempts to guess what happened after entry + * @dev: the CPU + * + * NOTE: it's important to be fast here 
because this operation will add to + * the overall exit latency. + */ +static void menu_reflect(struct cpuidle_device *dev) +{ + struct menu_device *data = &__get_cpu_var(menu_devices); + int last_idx = data->last_state_idx; + int measured_us = cpuidle_get_last_residency(dev); + struct cpuidle_state *target = &dev->states[last_idx]; + + /* + * Ugh, this idle state doesn't support residency measurements, so we + * are basically lost in the dark. As a compromise, assume we slept + * for one full standard timer tick. However, be aware that this + * could potentially result in a suboptimal state transition. + */ + if (!(target->flags & CPUIDLE_FLAG_TIME_VALID)) + measured_us = USEC_PER_SEC / HZ; + + data->bm_elapsed_us += measured_us; + + if (data->last_state_idx == 0) + return; + + /* + * Did something other than the timer interrupt + * cause an early break event? + */ + if (unlikely(measured_us < target->target_residency)) { + if (data->break_last_cnt > DEMOTION_THRESHOLD) { + data->deepest_break_state = data->last_state_idx - 1; + ktime_get_ts(&data->break_expire_time_ts); + timespec_add_ns(&data->break_expire_time_ts, + target->target_residency * + DEMOTION_TIMEOUT_MULTIPLIER); + } else { + data->break_last_cnt++; + } + } else { + if (data->break_last_cnt > 0) + data->break_last_cnt--; + } +} + +/** + * menu_scan_device - scans a CPU's states and does setup + * @dev: the CPU + */ +static void menu_scan_device(struct cpuidle_device *dev) +{ + struct menu_device *data = &per_cpu(menu_devices, dev->cpu); + int i; + + data->last_state_idx = 0; + data->bm_elapsed_us = 0; + data->bm_holdoff_us = BM_HOLDOFF; + data->deepest_break_state = -1; + + for (i = 1; i < dev->state_count; i++) + if (dev->states[i].flags & CPUIDLE_FLAG_CHECK_BM) + break; + data->deepest_bm_state = i - 1; +} + +struct cpuidle_governor menu_governor = { + .name = "menu", + .rating = 20, + .scan = menu_scan_device, + .select = menu_select, + .reflect = menu_reflect, + .owner = THIS_MODULE, +}; + +/** + 
* init_menu - initializes the governor + */ +static int __init init_menu(void) +{ + return cpuidle_register_governor(&menu_governor); +} + +/** + * exit_menu - exits the governor + */ +static void __exit exit_menu(void) +{ + cpuidle_unregister_governor(&menu_governor); +} + +MODULE_LICENSE("GPL"); +module_init(init_menu); +module_exit(exit_menu); Index: linux-rt.q/drivers/cpuidle/sysfs.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/cpuidle/sysfs.c @@ -0,0 +1,393 @@ +/* + * sysfs.c - sysfs support + * + * (C) 2006-2007 Shaohua Li + * + * This code is licenced under the GPL. + */ + +#include +#include +#include +#include + +#include "cpuidle.h" + +static unsigned int sysfs_switch; +static int __init cpuidle_sysfs_setup(char *unused) +{ + sysfs_switch = 1; + return 1; +} +__setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup); + +static ssize_t show_available_drivers(struct sys_device *dev, char *buf) +{ + ssize_t i = 0; + struct cpuidle_driver *tmp; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_drivers, driver_list) { + if (i >= (ssize_t)((PAGE_SIZE/sizeof(char)) - CPUIDLE_NAME_LEN - 2)) + goto out; + i += scnprintf(&buf[i], CPUIDLE_NAME_LEN, "%s ", tmp->name); + } +out: + i+= sprintf(&buf[i], "\n"); + mutex_unlock(&cpuidle_lock); + return i; +} + +static ssize_t show_available_governors(struct sys_device *dev, char *buf) +{ + ssize_t i = 0; + struct cpuidle_governor *tmp; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_governors, governor_list) { + if (i >= (ssize_t)((PAGE_SIZE/sizeof(char)) - CPUIDLE_NAME_LEN - 2)) + goto out; + i += scnprintf(&buf[i], CPUIDLE_NAME_LEN, "%s ", tmp->name); + } + if (list_empty(&cpuidle_governors)) + i+= sprintf(&buf[i], "no governors"); +out: + i+= sprintf(&buf[i], "\n"); + mutex_unlock(&cpuidle_lock); + return i; +} + +static ssize_t show_current_driver(struct sys_device *dev, char *buf) +{ + ssize_t ret; + + mutex_lock(&cpuidle_lock); 
+ ret = sprintf(buf, "%s\n", cpuidle_curr_driver->name); + mutex_unlock(&cpuidle_lock); + return ret; +} + +static ssize_t store_current_driver(struct sys_device *dev, + const char *buf, size_t count) +{ + char str[CPUIDLE_NAME_LEN]; + int len = count; + struct cpuidle_driver *tmp, *found = NULL; + + if (len > CPUIDLE_NAME_LEN) + len = CPUIDLE_NAME_LEN; + + if (sscanf(buf, "%s", str) != 1) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_drivers, driver_list) { + if (strncmp(tmp->name, str, CPUIDLE_NAME_LEN) == 0) { + found = tmp; + break; + } + } + if (found) + cpuidle_switch_driver(found); + mutex_unlock(&cpuidle_lock); + + return count; +} + +static ssize_t show_current_governor(struct sys_device *dev, char *buf) +{ + ssize_t i; + + mutex_lock(&cpuidle_lock); + if (cpuidle_curr_governor) + i = sprintf(buf, "%s\n", cpuidle_curr_governor->name); + else + i = sprintf(buf, "no governor\n"); + mutex_unlock(&cpuidle_lock); + + return i; +} + +static ssize_t store_current_governor(struct sys_device *dev, + const char *buf, size_t count) +{ + char str[CPUIDLE_NAME_LEN]; + int len = count; + struct cpuidle_governor *tmp, *found = NULL; + + if (len > CPUIDLE_NAME_LEN) + len = CPUIDLE_NAME_LEN; + + if (sscanf(buf, "%s", str) != 1) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_governors, governor_list) { + if (strncmp(tmp->name, str, CPUIDLE_NAME_LEN) == 0) { + found = tmp; + break; + } + } + if (found) + cpuidle_switch_governor(found); + mutex_unlock(&cpuidle_lock); + + return count; +} + +static SYSDEV_ATTR(current_driver_ro, 0444, show_current_driver, NULL); +static SYSDEV_ATTR(current_governor_ro, 0444, show_current_governor, NULL); + +static struct attribute *cpuclass_default_attrs[] = { + &attr_current_driver_ro.attr, + &attr_current_governor_ro.attr, + NULL +}; + +static SYSDEV_ATTR(available_drivers, 0444, show_available_drivers, NULL); +static SYSDEV_ATTR(available_governors, 0444, 
show_available_governors, NULL); +static SYSDEV_ATTR(current_driver, 0644, show_current_driver, + store_current_driver); +static SYSDEV_ATTR(current_governor, 0644, show_current_governor, + store_current_governor); + +static struct attribute *cpuclass_switch_attrs[] = { + &attr_available_drivers.attr, + &attr_available_governors.attr, + &attr_current_driver.attr, + &attr_current_governor.attr, + NULL +}; + +static struct attribute_group cpuclass_attr_group = { + .attrs = cpuclass_default_attrs, + .name = "cpuidle", +}; + +/** + * cpuidle_add_class_sysfs - add CPU global sysfs attributes + */ +int cpuidle_add_class_sysfs(struct sysdev_class *cls) +{ + if (sysfs_switch) + cpuclass_attr_group.attrs = cpuclass_switch_attrs; + + return sysfs_create_group(&cls->kset.kobj, &cpuclass_attr_group); +} + +/** + * cpuidle_remove_class_sysfs - remove CPU global sysfs attributes + */ +void cpuidle_remove_class_sysfs(struct sysdev_class *cls) +{ + sysfs_remove_group(&cls->kset.kobj, &cpuclass_attr_group); +} + +struct cpuidle_attr { + struct attribute attr; + ssize_t (*show)(struct cpuidle_device *, char *); + ssize_t (*store)(struct cpuidle_device *, const char *, size_t count); +}; + +#define define_one_ro(_name, show) \ + static struct cpuidle_attr attr_##_name = __ATTR(_name, 0444, show, NULL) +#define define_one_rw(_name, show, store) \ + static struct cpuidle_attr attr_##_name = __ATTR(_name, 0644, show, store) + +#define kobj_to_cpuidledev(k) container_of(k, struct cpuidle_device, kobj) +#define attr_to_cpuidleattr(a) container_of(a, struct cpuidle_attr, attr) +static ssize_t cpuidle_show(struct kobject * kobj, struct attribute * attr ,char * buf) +{ + int ret = -EIO; + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + struct cpuidle_attr * cattr = attr_to_cpuidleattr(attr); + + if (cattr->show) { + mutex_lock(&cpuidle_lock); + ret = cattr->show(dev, buf); + mutex_unlock(&cpuidle_lock); + } + return ret; +} + +static ssize_t cpuidle_store(struct kobject * kobj, 
struct attribute * attr, + const char * buf, size_t count) +{ + int ret = -EIO; + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + struct cpuidle_attr * cattr = attr_to_cpuidleattr(attr); + + if (cattr->store) { + mutex_lock(&cpuidle_lock); + ret = cattr->store(dev, buf, count); + mutex_unlock(&cpuidle_lock); + } + return ret; +} + +static struct sysfs_ops cpuidle_sysfs_ops = { + .show = cpuidle_show, + .store = cpuidle_store, +}; + +static void cpuidle_sysfs_release(struct kobject *kobj) +{ + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + + complete(&dev->kobj_unregister); +} + +static struct kobj_type ktype_cpuidle = { + .sysfs_ops = &cpuidle_sysfs_ops, + .release = cpuidle_sysfs_release, +}; + +struct cpuidle_state_attr { + struct attribute attr; + ssize_t (*show)(struct cpuidle_state *, char *); + ssize_t (*store)(struct cpuidle_state *, const char *, size_t); +}; + +#define define_one_state_ro(_name, show) \ +static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0444, show, NULL) + +#define define_show_state_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \ +{ \ + return sprintf(buf, "%d\n", state->_name);\ +} + +define_show_state_function(exit_latency) +define_show_state_function(power_usage) +define_show_state_function(usage) +define_show_state_function(time) +define_one_state_ro(latency, show_state_exit_latency); +define_one_state_ro(power, show_state_power_usage); +define_one_state_ro(usage, show_state_usage); +define_one_state_ro(time, show_state_time); + +static struct attribute *cpuidle_state_default_attrs[] = { + &attr_latency.attr, + &attr_power.attr, + &attr_usage.attr, + &attr_time.attr, + NULL +}; + +#define kobj_to_state_obj(k) container_of(k, struct cpuidle_state_kobj, kobj) +#define kobj_to_state(k) (kobj_to_state_obj(k)->state) +#define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr) +static ssize_t cpuidle_state_show(struct kobject * kobj, + struct 
attribute * attr ,char * buf) +{ + int ret = -EIO; + struct cpuidle_state *state = kobj_to_state(kobj); + struct cpuidle_state_attr * cattr = attr_to_stateattr(attr); + + if (cattr->show) + ret = cattr->show(state, buf); + + return ret; +} + +static struct sysfs_ops cpuidle_state_sysfs_ops = { + .show = cpuidle_state_show, +}; + +static void cpuidle_state_sysfs_release(struct kobject *kobj) +{ + struct cpuidle_state_kobj *state_obj = kobj_to_state_obj(kobj); + + complete(&state_obj->kobj_unregister); +} + +static struct kobj_type ktype_state_cpuidle = { + .sysfs_ops = &cpuidle_state_sysfs_ops, + .default_attrs = cpuidle_state_default_attrs, + .release = cpuidle_state_sysfs_release, +}; + +static void inline cpuidle_free_state_kobj(struct cpuidle_device *device, int i) +{ + kobject_unregister(&device->kobjs[i]->kobj); + wait_for_completion(&device->kobjs[i]->kobj_unregister); + kfree(device->kobjs[i]); + device->kobjs[i] = NULL; +} + +/** + * cpuidle_add_driver_sysfs - adds driver-specific sysfs attributes + * @device: the target device + */ +int cpuidle_add_driver_sysfs(struct cpuidle_device *device) +{ + int i, ret = -ENOMEM; + struct cpuidle_state_kobj *kobj; + + /* state statistics */ + for (i = 0; i < device->state_count; i++) { + kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); + if (!kobj) + goto error_state; + kobj->state = &device->states[i]; + init_completion(&kobj->kobj_unregister); + + kobj->kobj.parent = &device->kobj; + kobj->kobj.ktype = &ktype_state_cpuidle; + kobject_set_name(&kobj->kobj, "state%d", i); + ret = kobject_register(&kobj->kobj); + if (ret) { + kfree(kobj); + goto error_state; + } + device->kobjs[i] = kobj; + } + + return 0; + +error_state: + for (i = i - 1; i >= 0; i--) + cpuidle_free_state_kobj(device, i); + return ret; +} + +/** + * cpuidle_remove_driver_sysfs - removes driver-specific sysfs attributes + * @device: the target device + */ +void cpuidle_remove_driver_sysfs(struct cpuidle_device *device) +{ + int i; + + for 
(i = 0; i < device->state_count; i++) + cpuidle_free_state_kobj(device, i); +} + +/** + * cpuidle_add_sysfs - creates a sysfs instance for the target device + * @sysdev: the target device + */ +int cpuidle_add_sysfs(struct sys_device *sysdev) +{ + int cpu = sysdev->id; + struct cpuidle_device *dev; + + dev = per_cpu(cpuidle_devices, cpu); + dev->kobj.parent = &sysdev->kobj; + dev->kobj.ktype = &ktype_cpuidle; + kobject_set_name(&dev->kobj, "%s", "cpuidle"); + return kobject_register(&dev->kobj); +} + +/** + * cpuidle_remove_sysfs - deletes a sysfs instance on the target device + * @sysdev: the target device + */ +void cpuidle_remove_sysfs(struct sys_device *sysdev) +{ + int cpu = sysdev->id; + struct cpuidle_device *dev; + + dev = per_cpu(cpuidle_devices, cpu); + kobject_unregister(&dev->kobj); +} Index: linux-rt.q/include/acpi/processor.h =================================================================== --- linux-rt.q.orig/include/acpi/processor.h +++ linux-rt.q/include/acpi/processor.h @@ -161,6 +161,7 @@ struct acpi_processor_flags { u8 bm_check:1; u8 has_cst:1; u8 power_setup_done:1; + u8 bm_rld_set:1; }; struct acpi_processor { @@ -279,6 +280,8 @@ int acpi_processor_power_init(struct acp int acpi_processor_cst_has_changed(struct acpi_processor *pr); int acpi_processor_power_exit(struct acpi_processor *pr, struct acpi_device *device); +extern struct cpuidle_driver acpi_idle_driver; +void acpi_max_cstate_changed(void); /* in processor_thermal.c */ int acpi_processor_get_limit_info(struct acpi_processor *pr); Index: linux-rt.q/include/linux/acpi.h =================================================================== --- linux-rt.q.orig/include/linux/acpi.h +++ linux-rt.q/include/linux/acpi.h @@ -206,11 +206,8 @@ static inline unsigned int acpi_get_csta { return max_cstate; } -static inline void acpi_set_cstate_limit(unsigned int new_limit) -{ - max_cstate = new_limit; - return; -} +extern void (*acpi_do_set_cstate_limit)(void); +extern void 
acpi_set_cstate_limit(unsigned int new_limit); #else static inline unsigned int acpi_get_cstate_limit(void) { return 0; } static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; } Index: linux-rt.q/include/linux/cpuidle.h =================================================================== --- /dev/null +++ linux-rt.q/include/linux/cpuidle.h @@ -0,0 +1,190 @@ +/* + * cpuidle.h - a generic framework for CPU idle power management + * + * (C) 2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. + */ + +#ifndef _LINUX_CPUIDLE_H +#define _LINUX_CPUIDLE_H + +#include +#include +#include +#include +#include + +#define CPUIDLE_STATE_MAX 8 +#define CPUIDLE_NAME_LEN 16 + +struct cpuidle_device; + + +/**************************** + * CPUIDLE DEVICE INTERFACE * + ****************************/ + +struct cpuidle_state { + char name[CPUIDLE_NAME_LEN]; + void *driver_data; + + unsigned int flags; + unsigned int exit_latency; /* in US */ + unsigned int power_usage; /* in mW */ + unsigned int target_residency; /* in US */ + + unsigned int usage; + unsigned int time; /* in US */ + + int (*enter) (struct cpuidle_device *dev, + struct cpuidle_state *state); +}; + +/* Idle State Flags */ +#define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? 
*/ +#define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ +#define CPUIDLE_FLAG_SHALLOW (0x10) /* low latency, minimal savings */ +#define CPUIDLE_FLAG_BALANCED (0x20) /* medium latency, moderate savings */ +#define CPUIDLE_FLAG_DEEP (0x40) /* high latency, large savings */ + +#define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) + +/** + * cpuidle_get_statedata - retrieves private driver state data + * @state: the state + */ +static inline void * cpuidle_get_statedata(struct cpuidle_state *state) +{ + return state->driver_data; +} + +/** + * cpuidle_set_statedata - stores private driver state data + * @state: the state + * @data: the private data + */ +static inline void +cpuidle_set_statedata(struct cpuidle_state *state, void *data) +{ + state->driver_data = data; +} + +struct cpuidle_state_kobj { + struct cpuidle_state *state; + struct completion kobj_unregister; + struct kobject kobj; +}; + +struct cpuidle_device { + unsigned int status; + int cpu; + + int last_residency; + int state_count; + struct cpuidle_state states[CPUIDLE_STATE_MAX]; + struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; + struct cpuidle_state *last_state; + + struct list_head device_list; + struct kobject kobj; + struct completion kobj_unregister; + void *governor_data; +}; + +DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); + +/* Device Status Flags */ +#define CPUIDLE_STATUS_DETECTED (0x1) +#define CPUIDLE_STATUS_DRIVER_ATTACHED (0x2) +#define CPUIDLE_STATUS_GOVERNOR_ATTACHED (0x4) +#define CPUIDLE_STATUS_DOIDLE (CPUIDLE_STATUS_DETECTED | \ + CPUIDLE_STATUS_DRIVER_ATTACHED | \ + CPUIDLE_STATUS_GOVERNOR_ATTACHED) + +/** + * cpuidle_get_last_residency - retrieves the last state's residency time + * @dev: the target CPU + * + * NOTE: this value is invalid if CPUIDLE_FLAG_TIME_VALID isn't set + */ +static inline int cpuidle_get_last_residency(struct cpuidle_device *dev) +{ + return dev->last_residency; +} + + +/**************************** + * CPUIDLE DRIVER INTERFACE * 
+ ****************************/ + +struct cpuidle_driver { + char name[CPUIDLE_NAME_LEN]; + struct list_head driver_list; + + int (*init) (struct cpuidle_device *dev); + void (*exit) (struct cpuidle_device *dev); + int (*redetect) (struct cpuidle_device *dev); + + int (*bm_check) (void); + + struct module *owner; +}; + +#ifdef CONFIG_CPU_IDLE + +extern int cpuidle_register_driver(struct cpuidle_driver *drv); +extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); +extern int cpuidle_force_redetect(struct cpuidle_device *dev, struct cpuidle_driver *drv); +extern int cpuidle_force_redetect_devices(struct cpuidle_driver *drv); + +#else + +static inline int cpuidle_register_driver(struct cpuidle_driver *drv) +{return 0;} +static inline void cpuidle_unregister_driver(struct cpuidle_driver *drv) { } +static inline int cpuidle_force_redetect(struct cpuidle_device *dev, struct cpuidle_driver *drv) +{return 0;} +static inline int cpuidle_force_redetect_devices(struct cpuidle_driver *drv) +{return 0;} + +#endif + +/****************************** + * CPUIDLE GOVERNOR INTERFACE * + ******************************/ + +struct cpuidle_governor { + char name[CPUIDLE_NAME_LEN]; + struct list_head governor_list; + unsigned int rating; + + int (*init) (struct cpuidle_device *dev); + void (*exit) (struct cpuidle_device *dev); + void (*scan) (struct cpuidle_device *dev); + + int (*select) (struct cpuidle_device *dev); + void (*reflect) (struct cpuidle_device *dev); + + struct module *owner; +}; + +#ifdef CONFIG_CPU_IDLE + +extern int cpuidle_register_governor(struct cpuidle_governor *gov); +extern void cpuidle_unregister_governor(struct cpuidle_governor *gov); +extern int cpuidle_get_bm_activity(void); + +#else + +static inline int cpuidle_register_governor(struct cpuidle_governor *gov) +{return 0;} +static inline void cpuidle_unregister_governor(struct cpuidle_governor *gov) { } +static inline int cpuidle_get_bm_activity(void) +{return 0;} + +#endif + +#endif /* 
_LINUX_CPUIDLE_H */ Index: linux-rt.q/include/linux/tick.h =================================================================== --- linux-rt.q.orig/include/linux/tick.h +++ linux-rt.q/include/linux/tick.h @@ -40,6 +40,7 @@ enum tick_nohz_mode { * @idle_sleeps: Number of idle calls, where the sched tick was stopped * @idle_entrytime: Time when the idle call was entered * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @sleep_length: Duration of the current idle sleep */ struct tick_sched { struct hrtimer sched_timer; @@ -52,6 +53,7 @@ struct tick_sched { unsigned long idle_sleeps; ktime_t idle_entrytime; ktime_t idle_sleeptime; + ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; ktime_t idle_expires; @@ -100,10 +102,18 @@ static inline int tick_check_oneshot_cha extern void tick_nohz_stop_sched_tick(void); extern void tick_nohz_restart_sched_tick(void); extern void tick_nohz_update_jiffies(void); +extern ktime_t tick_nohz_get_sleep_length(void); +extern unsigned long tick_nohz_get_idle_jiffies(void); # else static inline void tick_nohz_stop_sched_tick(void) { } static inline void tick_nohz_restart_sched_tick(void) { } static inline void tick_nohz_update_jiffies(void) { } +static inline ktime_t tick_nohz_get_sleep_length(void) +{ + ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; + + return len; +} # endif /* !NO_HZ */ #endif Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -303,11 +303,6 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -#ifdef CONFIG_NO_HZ - /* Make sure that timer wheel updates are propagated */ - if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) - tick_nohz_stop_sched_tick(); -#endif preempt_enable_no_resched(); } Index: linux-rt.q/kernel/time/tick-sched.c 
=================================================================== --- linux-rt.q.orig/kernel/time/tick-sched.c +++ linux-rt.q/kernel/time/tick-sched.c @@ -153,6 +153,7 @@ void tick_nohz_stop_sched_tick(void) unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; struct tick_sched *ts; ktime_t last_update, expires, now, delta; + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; int cpu; local_irq_save(flags); @@ -290,11 +291,36 @@ void tick_nohz_stop_sched_tick(void) out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; + ts->sleep_length = ktime_sub(dev->next_event, now); end: local_irq_restore(flags); } /** + * tick_nohz_get_sleep_length - return the length of the current sleep + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_sleep_length(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + return ts->sleep_length; +} +EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length); + +/** + * tick_nohz_get_idle_jiffies - returns the current idle jiffie count + */ +unsigned long tick_nohz_get_idle_jiffies(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + return ts->idle_jiffies; +} +EXPORT_SYMBOL_GPL(tick_nohz_get_idle_jiffies); + +/** * nohz_restart_sched_tick - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle patches/lockstat_bounce.patch0000664000077200007720000001565610646635217015750 0ustar mingomingo Subject: lockstat: measure lock bouncing __acquire | lock _____ | \ | __contended | | | wait | _______/ |/ | __acquired | __release | unlock We measure acquisition and contention bouncing. This is done by recording a cpu stamp in each lock instance. Contention bouncing requires the cpu stamp to be set on acquisition. Hence we move __acquired into the generic path. __acquired is then used to measure acquisition bouncing by comparing the current cpu with the old stamp before replacing it. 
__contended is used to measure contention bouncing (only useful for preemptable locks) Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar --- include/linux/lockdep.h | 17 ++++++++++++++++- kernel/lockdep.c | 39 +++++++++++++++++++++++++++------------ kernel/lockdep_proc.c | 22 ++++++++++++---------- kernel/mutex.c | 2 +- 4 files changed, 56 insertions(+), 24 deletions(-) Index: linux-rt.q/include/linux/lockdep.h =================================================================== --- linux-rt.q.orig/include/linux/lockdep.h +++ linux-rt.q/include/linux/lockdep.h @@ -128,12 +128,24 @@ struct lock_time { unsigned long nr; }; +enum bounce_type { + bounce_acquired_write, + bounce_acquired_read, + bounce_contended_write, + bounce_contended_read, + nr_bounce_types, + + bounce_acquired = bounce_acquired_write, + bounce_contended = bounce_contended_write, +}; + struct lock_class_stats { unsigned long contention_point[4]; struct lock_time read_waittime; struct lock_time write_waittime; struct lock_time read_holdtime; struct lock_time write_holdtime; + unsigned long bounces[nr_bounce_types]; }; struct lock_class_stats lock_stats(struct lock_class *class); @@ -148,6 +160,9 @@ struct lockdep_map { struct lock_class_key *key; struct lock_class *class_cache; const char *name; +#ifdef CONFIG_LOCK_STAT + int cpu; +#endif }; /* @@ -323,8 +338,8 @@ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ - lock_acquired(&(_lock)->dep_map); \ } \ + lock_acquired(&(_lock)->dep_map); \ } while (0) #define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -194,6 +194,9 @@ struct lock_class_stats lock_stats(struc lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + + for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) + 
stats.bounces[i] += pcs->bounces[i]; } return stats; @@ -2151,6 +2154,9 @@ void lockdep_init_map(struct lockdep_map lock->name = name; lock->key = key; lock->class_cache = NULL; +#ifdef CONFIG_LOCK_STAT + lock->cpu = raw_smp_processor_id(); +#endif if (subclass) register_lock_class(lock, subclass, 1); } @@ -2780,6 +2786,8 @@ found_it: stats = get_lock_stats(hlock->class); if (point < ARRAY_SIZE(stats->contention_point)) stats->contention_point[i]++; + if (lock->cpu != smp_processor_id()) + stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); } @@ -2791,8 +2799,8 @@ __lock_acquired(struct lockdep_map *lock struct lock_class_stats *stats; unsigned int depth; u64 now; - s64 waittime; - int i; + s64 waittime = 0; + int i, cpu; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) @@ -2814,19 +2822,25 @@ __lock_acquired(struct lockdep_map *lock return; found_it: - if (!hlock->waittime_stamp) - return; - - now = sched_clock(); - waittime = now - hlock->waittime_stamp; - hlock->holdtime_stamp = now; + cpu = smp_processor_id(); + if (hlock->waittime_stamp) { + now = sched_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + } stats = get_lock_stats(hlock->class); - if (hlock->read) - lock_time_inc(&stats->read_waittime, waittime); - else - lock_time_inc(&stats->write_waittime, waittime); + if (waittime) { + if (hlock->read) + lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, waittime); + } + if (lock->cpu != cpu) + stats->bounces[bounce_acquired + !!hlock->read]++; put_lock_stats(stats); + + lock->cpu = cpu; } void lock_contended(struct lockdep_map *lock, unsigned long ip) @@ -2866,6 +2880,7 @@ void lock_acquired(struct lockdep_map *l raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquired); + #endif /* Index: linux-rt.q/kernel/lockdep_proc.c =================================================================== --- linux-rt.q.orig/kernel/lockdep_proc.c +++ 
linux-rt.q/kernel/lockdep_proc.c @@ -428,17 +428,17 @@ static void seq_stats(struct seq_file *m seq_printf(m, "%40s:", name); seq_lock_time(m, &stats->write_waittime); - seq_puts(m, " "); + seq_printf(m, " %14lu ", stats->bounces[bounce_contended_write]); seq_lock_time(m, &stats->write_holdtime); - seq_puts(m, "\n"); + seq_printf(m, " %14lu\n", stats->bounces[bounce_acquired_write]); } if (stats->read_holdtime.nr) { seq_printf(m, "%38s-R:", name); seq_lock_time(m, &stats->read_waittime); - seq_puts(m, " "); + seq_printf(m, " %14lu ", stats->bounces[bounce_contended_read]); seq_lock_time(m, &stats->read_holdtime); - seq_puts(m, "\n"); + seq_printf(m, " %14lu\n", stats->bounces[bounce_acquired_read]); } if (stats->read_waittime.nr + stats->write_waittime.nr == 0) @@ -466,26 +466,28 @@ static void seq_stats(struct seq_file *m } if (i) { seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 8 * (14 + 1)); + seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); seq_puts(m, "\n"); } } static void seq_header(struct seq_file *m) { - seq_printf(m, "lock_stat version 0.1\n"); - seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s\n", + seq_printf(m, "lock_stat version 0.2\n"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s\n", "class name", "contentions", "waittime-min", "waittime-max", "waittime-total", + "con-bounces", "acquisitions", "holdtime-min", "holdtime-max", - "holdtime-total"); - seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1)); + "holdtime-total", + "acq-bounces"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); seq_printf(m, "\n"); } Index: linux-rt.q/kernel/mutex.c =================================================================== --- linux-rt.q.orig/kernel/mutex.c +++ linux-rt.q/kernel/mutex.c @@ -180,8 +180,8 @@ __mutex_lock_common(struct mutex *lock, spin_lock_mutex(&lock->wait_lock, flags); } - lock_acquired(&lock->dep_map); done: + 
lock_acquired(&lock->dep_map); /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); patches/x86_64-apic-remove-nested-irq-disable.patch0000664000077200007720000000330510646635211021364 0ustar mingomingoSubject: x86_64: remove nested irq disables setup_APIC_timer disables interrupts anyway. So no need to do the same in setup_boot_APIC_clock and setup_secondary_APIC_clock. Disable interrupts explicit in the calibration code. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -827,6 +827,9 @@ static void __init calibrate_APIC_clock( unsigned apic, apic_start; unsigned long tsc, tsc_start; int result; + + local_irq_disable(); + /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the @@ -856,6 +859,9 @@ static void __init calibrate_APIC_clock( result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start); } + + local_irq_enable(); + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", @@ -874,22 +880,16 @@ void __init setup_boot_APIC_clock (void) printk(KERN_INFO "Using local APIC timer interrupts.\n"); using_apic_timer = 1; - local_irq_disable(); - calibrate_APIC_clock(); /* * Now set up the timer for real. */ setup_APIC_timer(); - - local_irq_enable(); } void __cpuinit setup_secondary_APIC_clock(void) { - local_irq_disable(); /* FIXME: Do we need this? 
--RR */ setup_APIC_timer(); - local_irq_enable(); } void disable_APIC_timer(void) patches/x86_64-cleanup-apic-c.patch0000664000077200007720000000244410646635211016267 0ustar mingomingoSubject: x86_64: cleanup apic.c after clock events switch Make variables static. Signed-off-by: Chris Wright Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 6 +++--- include/asm-x86_64/apic.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -40,10 +40,10 @@ #include #include -int apic_mapped; int apic_verbosity; -int apic_calibrate_pmtmr __initdata; -int disable_apic_timer __initdata; +static int apic_mapped; +static int apic_calibrate_pmtmr __initdata; +static int disable_apic_timer __initdata; /* Local APIC timer works in C2? */ int local_apic_timer_c2_ok; Index: linux-rt.q/include/asm-x86_64/apic.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/apic.h +++ linux-rt.q/include/asm-x86_64/apic.h @@ -19,7 +19,6 @@ extern int apic_verbosity; extern int apic_runs_main_timer; extern int ioapic_force; -extern int apic_mapped; /* * Define the default level of output to be very little patches/preempt-irqs-ppc-preempt-schedule-irq-entry-fix.patch0000664000077200007720000001043710646635214023746 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Tue May 22 13:47:39 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=UNPARSEABLE_RELAY autolearn=unavailable version=3.1.7-deb Received: from inet-tsb5.toshiba.co.jp (inet-tsb5.toshiba.co.jp [202.33.96.24]) by mail.tglx.de (Postfix) with ESMTP id 57F7E65C065 for ; Tue, 22 May 2007 13:47:39 +0200 (CEST) Received: from tsb-wall.toshiba.co.jp 
([133.199.160.134]) by inet-tsb5.toshiba.co.jp with ESMTP id l4MBlERT003242; Tue, 22 May 2007 20:47:14 +0900 (JST) Received: (from root@localhost) by tsb-wall.toshiba.co.jp id l4MBlEQK014361; Tue, 22 May 2007 20:47:14 +0900 (JST) Received: from ovp1.toshiba.co.jp [133.199.192.124] by tsb-wall.toshiba.co.jp with ESMTP id WAA14360; Tue, 22 May 2007 20:47:14 +0900 Received: from mx2.toshiba.co.jp (localhost [127.0.0.1]) by ovp1.toshiba.co.jp with ESMTP id l4MBlEDs007674; Tue, 22 May 2007 20:47:14 +0900 (JST) Received: from rdcgw.rdc.toshiba.co.jp by toshiba.co.jp id l4MBlDm9015993; Tue, 22 May 2007 20:47:13 +0900 (JST) Received: from island.swc.toshiba.co.jp by rdcgw.rdc.toshiba.co.jp (8.8.8p2+Sun/3.7W) with ESMTP id UAA17003; Tue, 22 May 2007 20:47:13 +0900 (JST) Received: from forest.toshiba.co.jp (forest [133.196.122.2]) by island.swc.toshiba.co.jp (Postfix) with ESMTP id 6A26B40002; Tue, 22 May 2007 20:47:13 +0900 (JST) Date: Tue, 22 May 2007 20:47:13 +0900 Message-ID: From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: [PATCH] powerpc 2.6.21-rt6: replace preempt_schedule w/ preempt_schedule_irq User-Agent: Wanderlust/2.8.1 (Something) Emacs/20.7 Mule/4.0 (HANANOEN) Organization: Software Engineering Center, TOSHIBA. MIME-Version: 1.0 (generated by SEMI 1.14.4 - "Hosorogi") Content-Type: text/plain; charset=US-ASCII X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Hi Ingo and Thomas, Please apply. Replace preempt_schedule() w/ preempt_schedule_irq() in irq return path, to avoid irq-entry recursion and stack overflow problems for powerpc64. It hits when doing netperf from another machine to the machine running rt kernel. This patch applies on top of linux-2.6.21 + patch-2.6.21-rt6. Compile, boot and netperf tested on celleb. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ~ $ uname -a Linux Linux 2.6.21-rt6 #1 SMP PREEMPT RT Tue May 22 19:18:00 JST 2007 ppc64 unkn own ~ $ Unable to handle kernel paging request for data at address 0xc0000180004cd9b 0 Faulting instruction address: 0xc00000000003da48 cpu 0x0: Vector: 300 (Data Access) at [c00000000fffba00] pc: c00000000003da48: .resched_task+0x34/0xc4 lr: c0000000000410b4: .try_to_wake_up+0x4cc/0x5a8 sp: c00000000fffbc80 msr: 9000000000001032 dar: c0000180004cd9b0 dsisr: 40000000 current = 0xc00000000244ed20 paca = 0xc0000000004cd980 pid = 425, comm = netserver enter ? for help [c00000000fffbd00] c0000000000410b4 .try_to_wake_up+0x4cc/0x5a8 [c00000000fffbde0] c0000000000880c8 .redirect_hardirq+0x68/0x88 [c00000000fffbe60] c00000000008aec8 .handle_level_irq+0x13c/0x220 [c00000000fffbf00] c000000000032538 .spider_irq_cascade+0x98/0xec [c00000000fffbf90] c000000000022280 .call_handle_irq+0x1c/0x2c [c0000000025abea0] c00000000000c33c .do_IRQ+0xc8/0x17c [c0000000025abf30] c00000000000444c hardware_interrupt_entry+0x18/0x4c --- arch/powerpc/kernel/entry_64.S | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/entry_64.S +++ linux-rt.q/arch/powerpc/kernel/entry_64.S @@ -558,14 +558,9 @@ do_work: cmpdi r0,0 crandc eq,cr1*4+eq,eq bne restore - /* here we are preempting the current task */ 1: - li r0,1 - stb r0,PACASOFTIRQEN(r13) - stb r0,PACAHARDIRQEN(r13) - ori r10,r10,MSR_EE - mtmsrd r10,1 /* reenable interrupts */ - bl .preempt_schedule + /* preempt_schedule_irq() expects interrupts disabled. 
*/ + bl .preempt_schedule_irq mfmsr r10 clrrdi r9,r1,THREAD_SHIFT rldicl r10,r10,48,1 /* disable interrupts again */ patches/ppc-a-2.patch0000664000077200007720000000164610646635213013716 0ustar mingomingo To fix the following compile error by replacing the deleted structure member "is_continuous" with "flags". - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - arch/powerpc/kernel/time.c arch/powerpc/kernel/time.c:938: error: unknown field 'is_continuous' specified in initializer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Signed-off-by: Tsutomu Owa -- owa --- arch/powerpc/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -916,7 +916,7 @@ struct clocksource clocksource_timebase .mask = (cycle_t)-1, .mult = 0, .shift = 22, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; patches/timekeeping-fixup-shadow-variable-argument.patch0000664000077200007720000000230210646635210023062 0ustar mingomingoFrom: Thomas Gleixner clocksource_adjust() has a clock argument, which shadows the file global clock variable. Fix this up. Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/kernel/time/timekeeping.c =================================================================== --- linux-rt.q.orig/kernel/time/timekeeping.c +++ linux-rt.q/kernel/time/timekeeping.c @@ -391,7 +391,7 @@ static __always_inline int clocksource_b * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. 
*/ -static void clocksource_adjust(struct clocksource *clock, s64 offset) +static void clocksource_adjust(s64 offset) { s64 error, interval = clock->cycle_interval; int adj; @@ -466,7 +466,7 @@ void update_wall_time(void) } /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); + clocksource_adjust(offset); /* store full nanoseconds into xtime */ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; patches/rt-mutex-i386.patch0000664000077200007720000005533610646635214015041 0ustar mingomingo--- arch/i386/Kconfig.cpu | 8 +++- arch/i386/kernel/apm.c | 2 - arch/i386/kernel/entry.S | 4 +- arch/i386/kernel/i386_ksyms.c | 12 +++--- arch/i386/kernel/process.c | 10 ++--- arch/i386/lib/semaphore.S | 24 ++++++------- include/asm-i386/rwsem.h | 41 +++++++++++------------ include/asm-i386/semaphore.h | 67 ++++++++++++++++++++++---------------- include/asm-i386/spinlock.h | 36 ++++++++++---------- include/asm-i386/spinlock_types.h | 4 +- include/asm-i386/thread_info.h | 3 + 11 files changed, 117 insertions(+), 94 deletions(-) Index: linux-rt.q/arch/i386/Kconfig.cpu =================================================================== --- linux-rt.q.orig/arch/i386/Kconfig.cpu +++ linux-rt.q/arch/i386/Kconfig.cpu @@ -247,12 +247,16 @@ config X86_XADD config RWSEM_GENERIC_SPINLOCK bool - depends on !X86_XADD + depends on !X86_XADD || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool default y config RWSEM_XCHGADD_ALGORITHM bool - depends on X86_XADD + depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK default y config ARCH_HAS_ILOG2_U32 Index: linux-rt.q/arch/i386/kernel/apm.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/apm.c +++ linux-rt.q/arch/i386/kernel/apm.c @@ -782,7 +782,7 @@ static int apm_do_idle(void) */ smp_mb(); } - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); } Index: 
linux-rt.q/arch/i386/kernel/entry.S =================================================================== --- linux-rt.q.orig/arch/i386/kernel/entry.S +++ linux-rt.q/arch/i386/kernel/entry.S @@ -480,7 +480,7 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: call schedule @@ -492,7 +492,7 @@ work_resched: andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and Index: linux-rt.q/arch/i386/kernel/i386_ksyms.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/i386_ksyms.c +++ linux-rt.q/arch/i386/kernel/i386_ksyms.c @@ -2,10 +2,12 @@ #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. 
*/ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -20,7 +22,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); Index: linux-rt.q/arch/i386/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/process.c +++ linux-rt.q/arch/i386/kernel/process.c @@ -115,7 +115,7 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) safe_halt(); /* enables interrupts racelessly */ else local_irq_enable(); @@ -179,7 +179,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -202,7 +202,7 @@ void cpu_idle(void) } trace_preempt_exit_idle(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); preempt_disable(); trace_preempt_enter_idle(); @@ -251,10 +251,10 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(eax, ecx); } } Index: linux-rt.q/arch/i386/lib/semaphore.S =================================================================== --- linux-rt.q.orig/arch/i386/lib/semaphore.S +++ linux-rt.q/arch/i386/lib/semaphore.S @@ -30,7 +30,7 @@ * value or just clobbered.. 
*/ .section .sched.text -ENTRY(__down_failed) +ENTRY(__compat_down_failed) CFI_STARTPROC FRAME pushl %edx @@ -39,7 +39,7 @@ ENTRY(__down_failed) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down + call __compat_down popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -49,9 +49,9 @@ ENTRY(__down_failed) ENDFRAME ret CFI_ENDPROC - END(__down_failed) + END(__compat_down_failed) -ENTRY(__down_failed_interruptible) +ENTRY(__compat_down_failed_interruptible) CFI_STARTPROC FRAME pushl %edx @@ -60,7 +60,7 @@ ENTRY(__down_failed_interruptible) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down_interruptible + call __compat_down_interruptible popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -70,9 +70,9 @@ ENTRY(__down_failed_interruptible) ENDFRAME ret CFI_ENDPROC - END(__down_failed_interruptible) + END(__compat_down_failed_interruptible) -ENTRY(__down_failed_trylock) +ENTRY(__compat_down_failed_trylock) CFI_STARTPROC FRAME pushl %edx @@ -81,7 +81,7 @@ ENTRY(__down_failed_trylock) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down_trylock + call __compat_down_trylock popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -91,9 +91,9 @@ ENTRY(__down_failed_trylock) ENDFRAME ret CFI_ENDPROC - END(__down_failed_trylock) + END(__compat_down_failed_trylock) -ENTRY(__up_wakeup) +ENTRY(__compat_up_wakeup) CFI_STARTPROC FRAME pushl %edx @@ -102,7 +102,7 @@ ENTRY(__up_wakeup) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __up + call __compat_up popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -112,7 +112,7 @@ ENTRY(__up_wakeup) ENDFRAME ret CFI_ENDPROC - END(__up_wakeup) + END(__compat_up_wakeup) /* * rw spinlock fallbacks Index: linux-rt.q/include/asm-i386/rwsem.h =================================================================== --- linux-rt.q.orig/include/asm-i386/rwsem.h +++ linux-rt.q/include/asm-i386/rwsem.h @@ -44,15 +44,15 @@ struct rwsem_waiter; -extern struct rw_semaphore 
*FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem)); -extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore *sem)); -extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *)); -extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *sem)); +extern struct compat_rw_semaphore *FASTCALL(rwsem_down_read_failed(struct compat_rw_semaphore *sem)); +extern struct compat_rw_semaphore *FASTCALL(rwsem_down_write_failed(struct compat_rw_semaphore *sem)); +extern struct compat_rw_semaphore *FASTCALL(rwsem_wake(struct compat_rw_semaphore *)); +extern struct compat_rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct compat_rw_semaphore *sem)); /* * the semaphore definition */ -struct rw_semaphore { +struct compat_rw_semaphore { signed long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -78,23 +78,23 @@ struct rw_semaphore { { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define compat_init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __compat_init_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning down_read\n\t" @@ -111,7 +111,7 @@ LOCK_PREFIX " incl (%%eax)\n\t" /* /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct 
rw_semaphore *sem) +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) { __s32 result, tmp; __asm__ __volatile__( @@ -134,7 +134,8 @@ LOCK_PREFIX " cmpxchgl %2,%0\n\t" /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct compat_rw_semaphore *sem, int subclass) { int tmp; @@ -160,7 +161,7 @@ static inline void __down_write(struct r /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) { signed long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, @@ -173,7 +174,7 @@ static inline int __down_write_trylock(s /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct compat_rw_semaphore *sem) { __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; __asm__ __volatile__( @@ -191,7 +192,7 @@ LOCK_PREFIX " xadd %%edx,(%%eax)\n /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning __up_write\n\t" @@ -209,7 +210,7 @@ LOCK_PREFIX " xaddl %%edx,(%%eax)\n /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning __downgrade_write\n\t" @@ -226,7 +227,7 @@ LOCK_PREFIX " addl %2,(%%eax)\n\t" /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) { __asm__ __volatile__( LOCK_PREFIX "addl %1,%0" @@ -237,7 +238,7 @@ LOCK_PREFIX "addl %1,%0" /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, 
struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) { int tmp = delta; @@ -249,7 +250,7 @@ LOCK_PREFIX "xadd %0,%1" return tmp+delta; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); } Index: linux-rt.q/include/asm-i386/semaphore.h =================================================================== --- linux-rt.q.orig/include/asm-i386/semaphore.h +++ linux-rt.q/include/asm-i386/semaphore.h @@ -3,8 +3,6 @@ #include -#ifdef __KERNEL__ - /* * SMP- and interrupt-safe semaphores.. * @@ -41,30 +39,40 @@ #include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) -#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -static inline void sema_init (struct semaphore *sem, int val) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,0) + +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { /* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); + * *sem = (struct 
compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val); * * i'd rather use the more flexible initialization above, but sadly * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. @@ -74,27 +82,27 @@ static inline void sema_init (struct sem init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -fastcall void __down_failed(void /* special register calling convention */); -fastcall int __down_failed_interruptible(void /* params in registers */); -fastcall int __down_failed_trylock(void /* params in registers */); -fastcall void __up_wakeup(void /* special register calling convention */); +fastcall void __compat_down_failed(void /* special register calling convention */); +fastcall int __compat_down_failed_interruptible(void /* params in registers */); +fastcall int __compat_down_failed_trylock(void /* params in registers */); +fastcall void __compat_up_wakeup(void /* special register calling convention */); /* * This is ugly, but we want the default case to fall through. * "__down_failed" is a special asm handler that calls the C * routine that actually waits. See arch/i386/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); __asm__ __volatile__( @@ -102,7 +110,7 @@ static inline void down(struct semaphore LOCK_PREFIX "decl %0\n\t" /* --sem->count */ "jns 2f\n" "\tlea %0,%%eax\n\t" - "call __down_failed\n" + "call __compat_down_failed\n" "2:" :"+m" (sem->count) : @@ -113,7 +121,7 @@ static inline void down(struct semaphore * Interruptible try to acquire a semaphore. If we obtained * it, return zero. 
If we were interrupted, returns -EINTR */ -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int result; @@ -124,7 +132,7 @@ static inline int down_interruptible(str LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" "lea %1,%%eax\n\t" - "call __down_failed_interruptible\n" + "call __compat_down_failed_interruptible\n" "2:" :"=&a" (result), "+m" (sem->count) : @@ -136,7 +144,7 @@ static inline int down_interruptible(str * Non-blockingly attempt to down() a semaphore. * Returns zero if we acquired it */ -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { int result; @@ -146,7 +154,7 @@ static inline int down_trylock(struct se LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" "lea %1,%%eax\n\t" - "call __down_failed_trylock\n\t" + "call __compat_down_failed_trylock\n\t" "2:\n" :"=&a" (result), "+m" (sem->count) : @@ -158,19 +166,24 @@ static inline int down_trylock(struct se * Note! This is subtle. We jump to wake people up only if * the semaphore was negative (== somebody was waiting on it). 
*/ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __asm__ __volatile__( "# atomic up operation\n\t" LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ "jg 1f\n\t" "lea %0,%%eax\n\t" - "call __up_wakeup\n" + "call __compat_up_wakeup\n" "1:" :"+m" (sem->count) : :"memory","ax"); } -#endif +extern int FASTCALL(compat_sem_is_locked(struct compat_semaphore *sem)); + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif Index: linux-rt.q/include/asm-i386/spinlock.h =================================================================== --- linux-rt.q.orig/include/asm-i386/spinlock.h +++ linux-rt.q/include/asm-i386/spinlock.h @@ -27,12 +27,12 @@ * (the type definitions are in asm/spinlock_types.h) */ -static inline int __raw_spin_is_locked(raw_spinlock_t *x) +static inline int __raw_spin_is_locked(__raw_spinlock_t *x) { return *(volatile signed char *)(&(x)->slock) <= 0; } -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { asm volatile("\n1:\t" LOCK_PREFIX " ; decb %0\n\t" @@ -55,7 +55,7 @@ static inline void __raw_spin_lock(raw_s * irq-traced, but on CONFIG_TRACE_IRQFLAGS we never use this variant. 
*/ #ifndef CONFIG_PROVE_LOCKING -static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +static inline void __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { asm volatile( "\n1:\t" @@ -84,7 +84,7 @@ static inline void __raw_spin_lock_flags } #endif -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { char oldval; asm volatile( @@ -103,14 +103,14 @@ static inline int __raw_spin_trylock(raw #if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE) -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory"); } #else -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { char oldval = 1; @@ -121,7 +121,7 @@ static inline void __raw_spin_unlock(raw #endif -static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +static inline void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (__raw_spin_is_locked(lock)) cpu_relax(); @@ -152,7 +152,7 @@ static inline void __raw_spin_unlock_wai * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_read_can_lock(raw_rwlock_t *x) +static inline int __raw_read_can_lock(__raw_rwlock_t *x) { return (int)(x)->lock > 0; } @@ -161,12 +161,12 @@ static inline int __raw_read_can_lock(ra * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. 
*/ -static inline int __raw_write_can_lock(raw_rwlock_t *x) +static inline int __raw_write_can_lock(__raw_rwlock_t *x) { return (x)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void __raw_read_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" "jns 1f\n" @@ -175,7 +175,7 @@ static inline void __raw_read_lock(raw_r ::"a" (rw) : "memory"); } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void __raw_write_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" "jz 1f\n" @@ -184,7 +184,7 @@ static inline void __raw_write_lock(raw_ ::"a" (rw) : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *lock) +static inline int __raw_read_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; atomic_dec(count); @@ -194,7 +194,7 @@ static inline int __raw_read_trylock(raw return 0; } -static inline int __raw_write_trylock(raw_rwlock_t *lock) +static inline int __raw_write_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; if (atomic_sub_and_test(RW_LOCK_BIAS, count)) @@ -203,19 +203,19 @@ static inline int __raw_write_trylock(ra return 0; } -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void __raw_read_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void __raw_write_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ", %0" : "+m" (rw->lock) : : "memory"); } -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() #endif /* __ASM_SPINLOCK_H */ Index: linux-rt.q/include/asm-i386/spinlock_types.h 
=================================================================== --- linux-rt.q.orig/include/asm-i386/spinlock_types.h +++ linux-rt.q/include/asm-i386/spinlock_types.h @@ -7,13 +7,13 @@ typedef struct { unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 1 } typedef struct { unsigned int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } Index: linux-rt.q/include/asm-i386/thread_info.h =================================================================== --- linux-rt.q.orig/include/asm-i386/thread_info.h +++ linux-rt.q/include/asm-i386/thread_info.h @@ -133,15 +133,18 @@ static inline struct thread_info *curren #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ +#define TIF_NEED_RESCHED_DELAYED 10 /* reschedule on return to userspace */ #define TIF_MEMDIE 16 #define TIF_DEBUG 17 /* uses debug registers */ #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ #define TIF_FREEZE 19 /* is freezing for suspend */ + #define _TIF_SYSCALL_TRACE (1< Subject: [patch 2/4] radix-tree: gang_lookup_slot Introduce a gang_lookup_slot function which is used by lockless pagecache. 
Signed-off-by: Nick Piggin --- include/linux/radix-tree.h | 7 +++ lib/radix-tree.c | 86 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 82 insertions(+), 11 deletions(-) Index: linux-rt.q/include/linux/radix-tree.h =================================================================== --- linux-rt.q.orig/include/linux/radix-tree.h +++ linux-rt.q/include/linux/radix-tree.h @@ -99,12 +99,14 @@ do { \ * * The notable exceptions to this rule are the following functions: * radix_tree_lookup + * radix_tree_lookup_slot * radix_tree_tag_get * radix_tree_gang_lookup + * radix_tree_gang_lookup_slot * radix_tree_gang_lookup_tag * radix_tree_tagged * - * The first 4 functions are able to be called locklessly, using RCU. The + * The first 6 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. @@ -159,6 +161,9 @@ void *radix_tree_delete(struct radix_tre unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items); /* * On a mutex based kernel we can freely schedule within the radix code: */ Index: linux-rt.q/lib/radix-tree.c =================================================================== --- linux-rt.q.orig/lib/radix-tree.c +++ linux-rt.q/lib/radix-tree.c @@ -341,18 +341,17 @@ EXPORT_SYMBOL(radix_tree_insert); * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * - * This function cannot be called under rcu_read_lock, it must be - * excluded from writers, as must the returned slot for subsequent - * use by radix_tree_deref_slot() and radix_tree_replace slot. 
- * Caller must hold tree write locked across slot lookup and - * replace. + * This function can be called under rcu_read_lock iff the slot is not + * modified by radix_tree_replace_slot, otherwise it must be called + * exclusive from other writers. Any dereference of the slot must be done + * using radix_tree_deref_slot. */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { unsigned int height, shift; struct radix_tree_node *node, **slot; - node = root->rnode; + node = rcu_dereference(root->rnode); if (node == NULL) return NULL; @@ -372,7 +371,7 @@ void **radix_tree_lookup_slot(struct rad do { slot = (struct radix_tree_node **) (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); - node = *slot; + node = rcu_dereference(*slot); if (node == NULL) return NULL; @@ -609,7 +608,7 @@ EXPORT_SYMBOL(radix_tree_tag_get); #endif static unsigned int -__lookup(struct radix_tree_node *slot, void **results, unsigned long index, +__lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) { unsigned int nr_found = 0; @@ -647,7 +646,7 @@ __lookup(struct radix_tree_node *slot, v index++; node = slot->slots[i]; if (node) { - results[nr_found++] = rcu_dereference(node); + results[nr_found++] = &(slot->slots[i]); if (nr_found == max_items) goto out; } @@ -701,6 +700,73 @@ radix_tree_gang_lookup(struct radix_tree ret = 0; while (ret < max_items) { + unsigned int nr_found, i, j; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup(node, (void ***)results + ret, cur_index, + max_items - ret, &next_index); + for (i = j = 0; i < nr_found; i++) { + struct radix_tree_node *slot; + slot = rcu_dereference(*(((void ***)results)[ret + i])); + if (!slot) + continue; + results[ret + j] = slot; + j++; + } + ret += j; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} 
+EXPORT_SYMBOL(radix_tree_gang_lookup); + +/** + * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * their slots at *@results and returns the number of items which were + * placed at *@results. + * + * The implementation is naive. + * + * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must + * be dereferenced with radix_tree_deref_slot, and if using only RCU + * protection, radix_tree_deref_slot may fail requiring a retry. + */ +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items) +{ + unsigned long max_index; + struct radix_tree_node *node; + unsigned long cur_index = first_index; + unsigned int ret; + + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { unsigned int nr_found; unsigned long next_index; /* Index of next search */ @@ -716,7 +782,7 @@ radix_tree_gang_lookup(struct radix_tree return ret; } -EXPORT_SYMBOL(radix_tree_gang_lookup); +EXPORT_SYMBOL(radix_tree_gang_lookup_slot); /* * FIXME: the two tag_get()s here should use find_next_bit() instead of patches/lockdep-rt-mutex.patch0000664000077200007720000001124410646635217015762 0ustar mingomingoSubject: lockdep-rt: annotate PREEMPT_RT DEFINE_MUTEX Signed-off-by: Peter Zijlstra --- include/linux/mutex.h | 16 ++++++---- include/linux/rt_lock.h | 70 ++++++++++++++++++++---------------------------- 2 files changed, 39 insertions(+), 47 
deletions(-) Index: linux-rt.q/include/linux/mutex.h =================================================================== --- linux-rt.q.orig/include/linux/mutex.h +++ linux-rt.q/include/linux/mutex.h @@ -18,6 +18,13 @@ #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + #ifdef CONFIG_PREEMPT_RT #include @@ -29,9 +36,11 @@ struct mutex { #endif }; + #define __MUTEX_INITIALIZER(mutexname) \ { \ .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ } #define DEFINE_MUTEX(mutexname) \ @@ -140,13 +149,6 @@ do { \ # define mutex_destroy(mutex) do { } while (0) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { .name = #lockname } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ Index: linux-rt.q/include/linux/rt_lock.h =================================================================== --- linux-rt.q.orig/include/linux/rt_lock.h +++ linux-rt.q/include/linux/rt_lock.h @@ -27,30 +27,31 @@ typedef struct { } spinlock_t; #ifdef CONFIG_DEBUG_RT_MUTEXES -# define __SPIN_LOCK_UNLOCKED(name) \ - (spinlock_t) { { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name) \ - , .save_state = 1, .file = __FILE__, .line = __LINE__ }, SPIN_DEP_MAP_INIT(name) } +# define __RT_SPIN_INITIALIZER(name) \ + { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + .file = __FILE__, \ + .line = __LINE__, } #else -# define __SPIN_LOCK_UNLOCKED(name) \ - (spinlock_t) { { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name) }, SPIN_DEP_MAP_INIT(name) } +# define __RT_SPIN_INITIALIZER(name) \ + { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) } #endif -# define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style) + 
+#define __SPIN_LOCK_UNLOCKED(name) (spinlock_t) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + SPIN_DEP_MAP_INIT(name) } + #else /* !PREEMPT_RT */ - typedef raw_spinlock_t spinlock_t; -# ifdef CONFIG_DEBUG_SPINLOCK -# define _SPIN_LOCK_UNLOCKED \ - { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ - .magic = SPINLOCK_MAGIC, \ - .owner = SPINLOCK_OWNER_INIT, \ - .owner_cpu = -1 } -# else -# define _SPIN_LOCK_UNLOCKED \ - { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED } -# endif -# define SPIN_LOCK_UNLOCKED _SPIN_LOCK_UNLOCKED -# define __SPIN_LOCK_UNLOCKED(name) _SPIN_LOCK_UNLOCKED + +typedef raw_spinlock_t spinlock_t; + +#define __SPIN_LOCK_UNLOCKED _RAW_SPIN_LOCK_UNLOCKED + #endif +#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style) + + #define __DEFINE_SPINLOCK(name) \ spinlock_t name = __SPIN_LOCK_UNLOCKED(name) @@ -89,32 +90,20 @@ typedef struct { #endif } rwlock_t; -# ifdef CONFIG_DEBUG_RT_MUTEXES -# define __RW_LOCK_UNLOCKED(name) (rwlock_t) \ - { .lock = { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name), \ - .save_state = 1, .file = __FILE__, .line = __LINE__ } } -# else -# define __RW_LOCK_UNLOCKED(name) (rwlock_t) \ - { .lock = { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name) } } -# endif +#define __RW_LOCK_UNLOCKED(name) (rwlock_t) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + RW_DEP_MAP_INIT(name) } #else /* !PREEMPT_RT */ - typedef raw_rwlock_t rwlock_t; -# ifdef CONFIG_DEBUG_SPINLOCK -# define _RW_LOCK_UNLOCKED \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ - .magic = RWLOCK_MAGIC, \ - .owner = SPINLOCK_OWNER_INIT, \ - .owner_cpu = -1 } -# else -# define _RW_LOCK_UNLOCKED \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED } -# endif -# define __RW_LOCK_UNLOCKED(name) _RW_LOCK_UNLOCKED +typedef raw_rwlock_t rwlock_t; + +#define __RW_LOCK_UNLOCKED _RAW_RW_LOCK_UNLOCKED + #endif #define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(rw_old_style) + #define DEFINE_RWLOCK(name) \ rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) @@ -241,7 +230,8 @@ do 
{ \ */ #define __RWSEM_INITIALIZER(name) \ - { .lock = __RT_MUTEX_INITIALIZER(name.lock) } + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \ + RW_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) patches/rt-mutex-spinlock-might-sleep.patch0000664000077200007720000000445510646635214020402 0ustar mingomingoFrom rostedt@goodmis.org Sat Jun 2 00:35:54 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=ham version=3.1.7-deb Received: from ms-smtp-01.nyroc.rr.com (ms-smtp-01.nyroc.rr.com [24.24.2.55]) by mail.tglx.de (Postfix) with ESMTP id C420E65C065 for ; Sat, 2 Jun 2007 00:35:54 +0200 (CEST) Received: from [192.168.23.10] (cpe-24-94-51-176.stny.res.rr.com [24.94.51.176]) by ms-smtp-01.nyroc.rr.com (8.13.6/8.13.6) with ESMTP id l51MZLun018065; Fri, 1 Jun 2007 18:35:24 -0400 (EDT) Subject: [PATCH RT] add might_sleep in rt_spin_lock_fastlock From: Steven Rostedt To: Ingo Molnar Cc: Thomas Gleixner , Arnaldo Carvalho de Melo , LKML Content-Type: multipart/mixed; boundary="=-jgTmng/RcFNHiVaU9w/Z" Date: Fri, 01 Jun 2007 18:35:21 -0400 Message-Id: <1180737321.21781.46.camel@localhost.localdomain> Mime-Version: 1.0 X-Mailer: Evolution 2.6.3 X-Virus-Scanned: Symantec AntiVirus Scan Engine X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ --=-jgTmng/RcFNHiVaU9w/Z Content-Type: text/plain Content-Transfer-Encoding: 8bit Ingo, Every so often we get bit by a bug "scheduling in atomic", and it comes from a rtmutex spin_lock. The bug only happens when that lock has contention, so we miss it a lot. This patch adds a might_sleep() to the rt_spin_lock_fastlock to find bugs where we can schedule in atomic. The one place that exists now is from do_page_fault and sending a signal. I wrote a simple crash program that segfaults (attached) and with this patch, I get the warning. 
-- Steve Signed-off-by: Steven Rostedt --- kernel/rtmutex.c | 2 ++ 1 file changed, 2 insertions(+) Index: linux-rt.q/kernel/rtmutex.c =================================================================== --- linux-rt.q.orig/kernel/rtmutex.c +++ linux-rt.q/kernel/rtmutex.c @@ -631,6 +631,8 @@ static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, void fastcall (*slowfn)(struct rt_mutex *lock)) { + might_sleep(); + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) rt_mutex_deadlock_account_lock(lock, current); else patches/netpoll-8139too-fix.patch0000664000077200007720000000116110646635211016132 0ustar mingomingo--- drivers/net/8139too.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) Index: linux-rt.q/drivers/net/8139too.c =================================================================== --- linux-rt.q.orig/drivers/net/8139too.c +++ linux-rt.q/drivers/net/8139too.c @@ -2214,7 +2214,11 @@ static irqreturn_t rtl8139_interrupt (in */ static void rtl8139_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); + /* + * use _nosync() variant - might be used by netconsole + * from atomic contexts: + */ + disable_irq_nosync(dev->irq); rtl8139_interrupt(dev->irq, dev); enable_irq(dev->irq); } patches/preempt-realtime-printk.patch0000664000077200007720000001053010646635215017332 0ustar mingomingo--- kernel/printk.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 8 deletions(-) Index: linux-rt.q/kernel/printk.c =================================================================== --- linux-rt.q.orig/kernel/printk.c +++ linux-rt.q/kernel/printk.c @@ -82,7 +82,7 @@ static int console_locked, console_suspe * It is also used in interesting ways to provide interlocking in * release_console_sem(). 
*/ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -327,7 +327,7 @@ static void __call_console_drivers(unsig touch_critical_timing(); for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write && - (cpu_online(smp_processor_id()) || + (cpu_online(raw_smp_processor_id()) || (con->flags & CON_ANYTIME))) { /* * Disable tracing of printk details - it just @@ -459,6 +459,7 @@ static void zap_locks(void) spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ init_MUTEX(&console_sem); + zap_rt_locks(); } #if defined(CONFIG_PRINTK_TIME) @@ -549,6 +550,7 @@ asmlinkage int vprintk(const char *fmt, lockdep_off(); spin_lock(&logbuf_lock); printk_cpu = smp_processor_id(); + preempt_enable(); /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); @@ -618,6 +620,8 @@ asmlinkage int vprintk(const char *fmt, console_locked = 1; printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* * Console drivers may assume that per-cpu resources have @@ -625,7 +629,7 @@ asmlinkage int vprintk(const char *fmt, * being able to cope (CON_ANYTIME) don't call them until * this CPU is officially up. */ - if (cpu_online(smp_processor_id()) || have_callable_console()) { + if (cpu_online(raw_smp_processor_id()) || have_callable_console()) { console_may_schedule = 0; release_console_sem(); } else { @@ -633,8 +637,6 @@ asmlinkage int vprintk(const char *fmt, console_locked = 0; up(&console_sem); } - lockdep_on(); - raw_local_irq_restore(flags); } else { /* * Someone else owns the drivers. 
We drop the spinlock, which @@ -647,7 +649,6 @@ asmlinkage int vprintk(const char *fmt, raw_local_irq_restore(flags); } - preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); @@ -840,13 +841,33 @@ void release_console_sem(void) _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ + /* + * on PREEMPT_RT, call console drivers with + * interrupts enabled (if printk was called + * with interrupts disabled): + */ +#ifdef CONFIG_PREEMPT_RT + spin_unlock_irqrestore(&logbuf_lock, flags); +#else spin_unlock(&logbuf_lock); +#endif call_console_drivers(_con_start, _log_end); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } console_locked = 0; - up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); + up(&console_sem); + /* + * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd + * up only if we are in a preemptible section. We normally dont + * printk from non-preemptible sections so this is for the emergency + * case only. + */ +#ifdef CONFIG_PREEMPT_RT + if (!in_atomic() && !irqs_disabled()) +#endif if (wake_klogd) wake_up_klogd(); } @@ -1094,7 +1115,7 @@ void tty_write_message(struct tty_struct */ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { - static DEFINE_SPINLOCK(ratelimit_lock); + static DEFINE_RAW_SPINLOCK(ratelimit_lock); static unsigned long toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; @@ -1135,6 +1156,23 @@ int printk_ratelimit(void) } EXPORT_SYMBOL(printk_ratelimit); +static DEFINE_RAW_SPINLOCK(warn_lock); + +void __WARN_ON(const char *func, const char *file, const int line) +{ + unsigned long flags; + + spin_lock_irqsave(&warn_lock, flags); + printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n", + current->comm, current->pid, raw_smp_processor_id(), + func, file, line); + dump_stack(); + spin_unlock_irqrestore(&warn_lock, flags); +} + +EXPORT_SYMBOL(__WARN_ON); + + /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer 
to caller's state patches/preempt-realtime-arm-shark.patch0000664000077200007720000000101010646635214017700 0ustar mingomingo--- arch/arm/mach-shark/leds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/arm/mach-shark/leds.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-shark/leds.c +++ linux-rt.q/arch/arm/mach-shark/leds.c @@ -32,7 +32,7 @@ static char led_state; static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); patches/inet-hash-bits-ipv6-fix.patch0000664000077200007720000001657110646635213017047 0ustar mingomingoFrom linux-rt-users-owner@vger.kernel.org Fri Jun 22 23:36:42 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=none autolearn=unavailable version=3.1.7-deb Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by mail.tglx.de (Postfix) with ESMTP id E246465C292; Fri, 22 Jun 2007 23:36:42 +0200 (CEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753653AbXFVVgk (ORCPT + 1 other); Fri, 22 Jun 2007 17:36:40 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753545AbXFVVgk (ORCPT ); Fri, 22 Jun 2007 17:36:40 -0400 Received: from homer.mvista.com ([63.81.120.158]:10787 "EHLO gateway-1237.mvista.com" rhost-flags-OK-FAIL-OK-OK) by vger.kernel.org with ESMTP id S1753234AbXFVVgk (ORCPT ); Fri, 22 Jun 2007 17:36:40 -0400 Received: from [192.168.0.30] (unknown [10.0.10.125]) by hermes.mvista.com (Postfix) with ESMTP id EDA2D1DBDB; Fri, 22 Jun 2007 14:36:38 -0700 (PDT) Message-ID: <467C40D3.206@ncos.nec.co.jp> Date: Fri, 22 Jun 2007 14:36:19 -0700 From: Masayuki Nakagawa User-Agent: Thunderbird 1.5.0.12 (Windows/20070509) MIME-Version: 1.0 To: mingo@elte.hu Cc: linux-rt-users@vger.kernel.org, 
dwalker@mvista.com Subject: [PATCH 2.6.21.5-rt17] IPV6: estalished connections are not shown with "cat /proc/net/tcp6" Content-Type: text/plain; charset=UTF-8 Sender: linux-rt-users-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-rt-users@vger.kernel.org X-Filter-To: .Kernel.rt-users X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit I found an issue regarding networking in the real-time patch (patch-2.6.21.5-rt17). The issue happens only with the kernel, which the real-time patch was applied. However, the latest stable main kernel (2.6.21.5) doesn't have the same issue. Therefore, please don't transfer this report to netdev. The detail of issue is below. I ran my test program, which is a very simple IPv6 client-server program. These programs establish a TCP/IPv6 connection between two hosts, and then sleep, like following diagram. And then, the problem appears with "cat /proc/net/tcp6". serverA serverB | SYN | +--------------->+ | SYN/ACK | +<---------------+ | ACK | +--------------->+ | | sleep... sleep... | | When I "cat /proc/net/tcp6" on serverA while establishing connection between serverA and B, the established connections are not shown. If you need my test program, please let me know. I can provide it to you. However, in case of the main-line kernel, the established connections will be shown appropriately with "cat /proc/net/tcp6". It's different because the real-time patch has implemented a new socket lookup mechanism for a high-latency. So, real-time patch has a different mechanism from main-line kernel. The real-time patch, which implemented a new socket lookup mechanism is using bitmap(ebitmask). When establishing TCP connection, it sets a flag bit into the bitmap like followings. 
[ebitmask in struct inet_hashinfo] Before connecting 0000000000000000000000000000000000000000000000000000000000000000 After connecting 0000001000000000000000000000000000000000000000000000000000000000 ^ And when reading "/proc/net/tcp and tcp6", the kernel searches the currently active TCP connections with reference to the bitmap. However, the kernel can't search the active TCP/IPv6 connection in established state. It is because the kernel doesn't set a flag bit when establishing TCP/IPv6 connection. In case of TCP/IPv4, __inet_hash() sets the flag bit properly with __inet_hash_setbit(). But, in case of TCP/IPv6, the setting the flag bit is missing in __inet6_hash(). [include/net/inet_hashtables.h] static inline void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk, const int listen_possible) { struct hlist_head *list; rwlock_t *lock; unsigned long *bitmask = NULL; unsigned int index = 0; BUG_TRAP(sk_unhashed(sk)); if (listen_possible && sk->sk_state == TCP_LISTEN) { list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; lock = &hashinfo->lhash_lock; inet_listen_wlock(hashinfo); } else { struct inet_ehash_bucket *head; sk->sk_hash = inet_sk_ehashfn(sk); index = inet_ehash_index(hashinfo, sk->sk_hash); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = &head->lock; bitmask = hashinfo->ebitmask; write_lock(lock); } __sk_add_node(sk, list); __inet_hash_setbit(bitmask, index); sock_prot_inc_use(sk->sk_prot); write_unlock(lock); if (listen_possible && sk->sk_state == TCP_LISTEN) wake_up(&hashinfo->lhash_wait); } [net/ipv6/inet6_hashtables.c] void __inet6_hash(struct inet_hashinfo *hashinfo, struct sock *sk) { struct hlist_head *list; rwlock_t *lock; printk("__inet6_hash hit\n"); BUG_TRAP(sk_unhashed(sk)); if (sk->sk_state == TCP_LISTEN) { list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; lock = &hashinfo->lhash_lock; inet_listen_wlock(hashinfo); } else { unsigned int hash; sk->sk_hash = hash = inet6_sk_ehashfn(sk); 
hash &= (hashinfo->ehash_size - 1); list = &hashinfo->ehash[hash].chain; lock = &hashinfo->ehash[hash].lock; write_lock(lock); } __sk_add_node(sk, list); sock_prot_inc_use(sk->sk_prot); write_unlock(lock); } So, I suggest a following change. The change is to set the flag bit appropriately in __inet6_hash(). Signed-off-by: Masayuki Nakagawa --- net/ipv6/inet6_hashtables.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) Index: linux-rt.q/net/ipv6/inet6_hashtables.c =================================================================== --- linux-rt.q.orig/net/ipv6/inet6_hashtables.c +++ linux-rt.q/net/ipv6/inet6_hashtables.c @@ -27,6 +27,8 @@ void __inet6_hash(struct inet_hashinfo * { struct hlist_head *list; rwlock_t *lock; + unsigned long *bitmask = NULL; + unsigned int index = 0; BUG_TRAP(sk_unhashed(sk)); @@ -35,15 +37,16 @@ void __inet6_hash(struct inet_hashinfo * lock = &hashinfo->lhash_lock; inet_listen_wlock(hashinfo); } else { - unsigned int hash; - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (hashinfo->ehash_size - 1); - list = &hashinfo->ehash[hash].chain; - lock = &hashinfo->ehash[hash].lock; + sk->sk_hash = inet6_sk_ehashfn(sk); + index = inet_ehash_index(hashinfo, sk->sk_hash); + list = &hashinfo->ehash[index].chain; + lock = &hashinfo->ehash[index].lock; + bitmask = hashinfo->ebitmask; write_lock(lock); } __sk_add_node(sk, list); + __inet_hash_setbit(bitmask, index); sock_prot_inc_use(sk->sk_prot); write_unlock(lock); } patches/kvm-rt.patch0000664000077200007720000000465410646635217014005 0ustar mingomingoSubject: [patch] kvm: make vcpu_load/put preemptible From: Ingo Molnar make vcpu_load/put preemptible. 
Signed-off-by: Ingo Molnar --- drivers/kvm/svm.c | 13 ++++++++++--- drivers/kvm/vmx.c | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) Index: linux-rt.q/drivers/kvm/svm.c =================================================================== --- linux-rt.q.orig/drivers/kvm/svm.c +++ linux-rt.q/drivers/kvm/svm.c @@ -610,9 +610,17 @@ static void svm_free_vcpu(struct kvm_vcp static void svm_vcpu_load(struct kvm_vcpu *vcpu) { - int cpu, i; + int cpu = raw_smp_processor_id(), i; + cpumask_t this_mask = cpumask_of_cpu(cpu); + + /* + * Keep the context preemptible, but do not migrate + * away to another CPU. TODO: make sure this persists. + * Save/restore original mask. + */ + if (unlikely(!cpus_equal(current->cpus_allowed, this_mask))) + set_cpus_allowed(current, cpumask_of_cpu(cpu)); - cpu = get_cpu(); if (unlikely(cpu != vcpu->cpu)) { u64 tsc_this, delta; @@ -638,7 +646,6 @@ static void svm_vcpu_put(struct kvm_vcpu wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); rdtscll(vcpu->host_tsc); - put_cpu(); } static void svm_vcpu_decache(struct kvm_vcpu *vcpu) Index: linux-rt.q/drivers/kvm/vmx.c =================================================================== --- linux-rt.q.orig/drivers/kvm/vmx.c +++ linux-rt.q/drivers/kvm/vmx.c @@ -241,9 +241,16 @@ static void vmcs_set_bits(unsigned long static void vmx_vcpu_load(struct kvm_vcpu *vcpu) { u64 phys_addr = __pa(vcpu->vmcs); - int cpu; + int cpu = raw_smp_processor_id(); + cpumask_t this_mask = cpumask_of_cpu(cpu); - cpu = get_cpu(); + /* + * Keep the context preemptible, but do not migrate + * away to another CPU. TODO: make sure this persists. + * Save/restore original mask. 
+ */ + if (unlikely(!cpus_equal(current->cpus_allowed, this_mask))) + set_cpus_allowed(current, cpumask_of_cpu(cpu)); if (vcpu->cpu != cpu) vcpu_clear(vcpu); @@ -281,7 +288,6 @@ static void vmx_vcpu_load(struct kvm_vcp static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { kvm_put_guest_fpu(vcpu); - put_cpu(); } static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) @@ -1862,6 +1868,7 @@ again: } #endif + preempt_disable(); asm ( /* Store host registers */ "pushf \n\t" @@ -2002,6 +2009,8 @@ again: reload_tss(); } + preempt_enable(); + ++vcpu->stat.exits; #ifdef CONFIG_X86_64 patches/preempt-realtime-ia64.patch0000664000077200007720000014017010646635214016571 0ustar mingomingo Hi, This is a first version of my port of Ingo's -rt kernel to the IA64 arch. So far the kernel boots with PREEMPT_RT enabled (on a 4-cpu tiger), and that's about it. I've not done extensive tests (only scripts/rt-tester), nor any measurements of any kind. There's very probably many bugs I'm not aware of. But there is already one thing I know should be fixed : I've changed the declaration of (struct zone).lock (in include/linux/mmzone.h) from spinlock_t to raw_spinlock_t. I did this because on IA64, cpu_idle(), which is not allowed to call schedule(), calls check_pgt_cache(). I guess this could be fixed by moving this call to another kernel thread... ideas are welcome. Simon. 
Signed-off-by: Simon.Derr@bull.net arch/ia64/Kconfig | 82 ++++++++++++++++++++++++++++---- arch/ia64/kernel/asm-offsets.c | 2 arch/ia64/kernel/entry.S | 25 +++++----- arch/ia64/kernel/fsys.S | 21 ++++++++ arch/ia64/kernel/iosapic.c | 34 ++++++++++++- arch/ia64/kernel/mca.c | 2 arch/ia64/kernel/perfmon.c | 6 +- arch/ia64/kernel/process.c | 14 +++-- arch/ia64/kernel/sal.c | 2 arch/ia64/kernel/salinfo.c | 6 +- arch/ia64/kernel/semaphore.c | 8 +-- arch/ia64/kernel/signal.c | 8 +++ arch/ia64/kernel/smp.c | 16 ++++++ arch/ia64/kernel/smpboot.c | 3 + arch/ia64/kernel/time.c | 74 +++++++++++++++++++---------- arch/ia64/kernel/traps.c | 10 ++-- arch/ia64/kernel/unwind.c | 4 - arch/ia64/kernel/unwind_i.h | 2 arch/ia64/mm/init.c | 2 arch/ia64/mm/tlb.c | 2 drivers/char/blocker.c | 1 include/asm-ia64/irqflags.h | 95 ++++++++++++++++++++++++++++++++++++++ include/asm-ia64/mmu_context.h | 2 include/asm-ia64/percpu.h | 21 +++++++- include/asm-ia64/processor.h | 6 +- include/asm-ia64/rtc.h | 7 ++ include/asm-ia64/rwsem.h | 32 ++++++------ include/asm-ia64/sal.h | 2 include/asm-ia64/semaphore.h | 53 +++++++++++++-------- include/asm-ia64/spinlock.h | 26 ++++------ include/asm-ia64/spinlock_types.h | 4 - include/asm-ia64/system.h | 67 -------------------------- include/asm-ia64/thread_info.h | 1 include/asm-ia64/tlb.h | 10 ++-- 34 files changed, 446 insertions(+), 204 deletions(-) Index: linux-rt.q/arch/ia64/Kconfig =================================================================== --- linux-rt.q.orig/arch/ia64/Kconfig +++ linux-rt.q/arch/ia64/Kconfig @@ -44,6 +44,7 @@ config SWIOTLB config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT default y config ARCH_HAS_ILOG2_U32 @@ -271,6 +272,69 @@ config SMP If you don't know what to do here, say N. + +config GENERIC_TIME + bool + default y + +config HIGH_RES_TIMERS + bool "High-Resolution Timers" + help + + POSIX timers are available by default. This option enables + high-resolution POSIX timers. 
With this option the resolution + is at least 1 microsecond. High resolution is not free. If + enabled this option will add a small overhead each time a + timer expires that is not on a 1/HZ tick boundary. If no such + timers are used the overhead is nil. + + This option enables two additional POSIX CLOCKS, + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Note that this + option does not change the resolution of CLOCK_REALTIME or + CLOCK_MONOTONIC which remain at 1/HZ resolution. + +config HIGH_RES_RESOLUTION + int "High-Resolution-Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + + This sets the resolution of timers accessed with + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Too + fine a resolution (small a number) will usually not + be observable due to normal system latencies. For an + 800 MHZ processor about 10,000 is the recommended maximum + (smallest number). If you don't need that sort of resolution, + higher numbers may generate less overhead. + +choice + prompt "Clock source" + depends on HIGH_RES_TIMERS + default HIGH_RES_TIMER_ITC + help + This option allows you to choose the hardware source in charge + of generating high precision interruptions on your system. + On IA-64 these are: + + + ITC Interval Time Counter 1/CPU clock + HPET High Precision Event Timer ~ (XXX:have to check the spec) + + The ITC timer is available on all the ia64 computers because + it is integrated directly into the processor. However it may not + give correct results on MP machines with processors running + at different clock rates. In this case you may want to use + the HPET if available on your machine. + + +config HIGH_RES_TIMER_ITC + bool "Interval Time Counter/ITC" + +config HIGH_RES_TIMER_HPET + bool "High Precision Event Timer/HPET" + +endchoice + config NR_CPUS int "Maximum number of CPUs (2-1024)" range 2 1024 @@ -323,17 +387,15 @@ config FORCE_CPEI_RETARGET This option it useful to enable this feature on older BIOS's as well. 
You can also enable this by using boot command line option force_cpei=1. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. +source "kernel/Kconfig.preempt" - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +config RWSEM_GENERIC_SPINLOCK + bool + depends on PREEMPT_RT + default y + +config PREEMPT + def_bool y if (PREEMPT_RT || PREEMPT_SOFTIRQS || PREEMPT_HARDIRQS || PREEMPT_VOLUNTARY || PREEMPT_DESKTOP) source "mm/Kconfig" Index: linux-rt.q/arch/ia64/kernel/asm-offsets.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/asm-offsets.c +++ linux-rt.q/arch/ia64/kernel/asm-offsets.c @@ -255,6 +255,7 @@ void foo(void) offsetof (struct pal_min_state_area_s, pmsa_xip)); BLANK(); +#ifdef CONFIG_TIME_INTERPOLATION /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); @@ -269,4 +270,5 @@ void foo(void) DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); +#endif } Index: linux-rt.q/arch/ia64/kernel/entry.S =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/entry.S +++ linux-rt.q/arch/ia64/kernel/entry.S @@ -1098,23 +1098,24 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? 
+ tbit.nz p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? +(p6) br.cond.sptk.few .needresched + tbit.z p6,p0=r31,TIF_NEED_RESCHED_DELAYED // current_thread_info()->need_resched_delayed==0? (p6) br.cond.sptk.few .notify -#ifdef CONFIG_PREEMPT -(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 + +.needresched: + +(pKStk) br.cond.sptk.many .fromkernel ;; -(pKStk) st4 [r20]=r21 ssm psr.i // enable interrupts -#endif br.call.spnt.many rp=schedule -.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 - rsm psr.i // disable interrupts - ;; -#ifdef CONFIG_PREEMPT -(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 +.ret9a: rsm psr.i // disable interrupts ;; -(pKStk) st4 [r20]=r0 // preempt_count() <- 0 -#endif + br.cond.sptk.many .endpreemptdep +.fromkernel: + br.call.spnt.many rp=preempt_schedule_irq +.ret9b: rsm psr.i // disable interrupts +.endpreemptdep: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // re-check Index: linux-rt.q/arch/ia64/kernel/fsys.S =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/fsys.S +++ linux-rt.q/arch/ia64/kernel/fsys.S @@ -26,6 +26,7 @@ #include "entry.h" +#ifdef CONFIG_TIME_INTERPOLATION /* * See Documentation/ia64/fsys.txt for details on fsyscalls. * @@ -350,6 +351,26 @@ ENTRY(fsys_clock_gettime) br.many .gettime END(fsys_clock_gettime) + +#else // !CONFIG_TIME_INTERPOLATION + +# define fsys_gettimeofday 0 +# define fsys_clock_gettime 0 + +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN + +#endif + + + /* * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). 
*/ Index: linux-rt.q/arch/ia64/kernel/iosapic.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/iosapic.c +++ linux-rt.q/arch/ia64/kernel/iosapic.c @@ -111,7 +111,7 @@ (PAGE_SIZE / sizeof(struct iosapic_rte_info)) #define RTE_PREALLOCATED (1) -static DEFINE_SPINLOCK(iosapic_lock); +static DEFINE_RAW_SPINLOCK(iosapic_lock); /* * These tables map IA-64 vectors to the IOSAPIC pin that generates this @@ -429,6 +429,34 @@ iosapic_startup_level_irq (unsigned int return 0; } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. + */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void +iosapic_ack_level_irq (unsigned int irq) +{ + ia64_vector vec = irq_to_vector(irq); + struct iosapic_rte_info *rte; + + move_irq(irq); + mask_irq(irq); + list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) + iosapic_eoi(rte->addr, vec); +} + +static void +iosapic_end_level_irq (unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +#define iosapic_ack_level_irq nop + static void iosapic_end_level_irq (unsigned int irq) { @@ -440,10 +468,12 @@ iosapic_end_level_irq (unsigned int irq) iosapic_eoi(rte->addr, vec); } + +#endif + #define iosapic_shutdown_level_irq mask_irq #define iosapic_enable_level_irq unmask_irq #define iosapic_disable_level_irq mask_irq -#define iosapic_ack_level_irq nop struct irq_chip irq_type_iosapic_level = { .name = "IO-SAPIC-level", Index: linux-rt.q/arch/ia64/kernel/mca.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/mca.c +++ linux-rt.q/arch/ia64/kernel/mca.c @@ -320,7 +320,7 @@ ia64_mca_spin(const char *func) typedef struct ia64_state_log_s { - spinlock_t isl_lock; + raw_spinlock_t isl_lock; int isl_index; unsigned long isl_count; ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ Index: 
linux-rt.q/arch/ia64/kernel/perfmon.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/perfmon.c +++ linux-rt.q/arch/ia64/kernel/perfmon.c @@ -280,7 +280,7 @@ typedef struct { */ typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ + raw_spinlock_t ctx_lock; /* context protection */ pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */ unsigned int ctx_state; /* state: active/inactive (no bitfield) */ @@ -369,7 +369,7 @@ typedef struct pfm_context { * mostly used to synchronize between system wide and per-process */ typedef struct { - spinlock_t pfs_lock; /* lock the structure */ + raw_spinlock_t pfs_lock; /* lock the structure */ unsigned int pfs_task_sessions; /* number of per task sessions */ unsigned int pfs_sys_sessions; /* number of per system wide sessions */ @@ -510,7 +510,7 @@ static pfm_intr_handler_desc_t *pfm_alt static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_buffer_fmt_lock; +static raw_spinlock_t pfm_buffer_fmt_lock; static LIST_HEAD(pfm_buffer_fmt_list); static pmu_config_t *pmu_conf; Index: linux-rt.q/arch/ia64/kernel/process.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/process.c +++ linux-rt.q/arch/ia64/kernel/process.c @@ -94,6 +94,9 @@ show_stack (struct task_struct *task, un void dump_stack (void) { + if (irqs_disabled()) { + printk("Uh oh.. 
entering dump_stack() with irqs disabled.\n"); + } show_stack(NULL, NULL); } @@ -197,7 +200,7 @@ void default_idle (void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { if (can_do_pal_halt) safe_halt(); else @@ -281,7 +284,7 @@ cpu_idle (void) current_thread_info()->status |= TS_POLLING; } - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { void (*idle)(void); #ifdef CONFIG_SMP min_xtp(); @@ -303,10 +306,11 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); + preempt_disable(); - check_pgt_cache(); + if (cpu_is_offline(cpu)) play_dead(); } Index: linux-rt.q/arch/ia64/kernel/sal.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/sal.c +++ linux-rt.q/arch/ia64/kernel/sal.c @@ -18,7 +18,7 @@ #include #include - __cacheline_aligned DEFINE_SPINLOCK(sal_lock); + __cacheline_aligned DEFINE_RAW_SPINLOCK(sal_lock); unsigned long sal_platform_features; unsigned short sal_revision; Index: linux-rt.q/arch/ia64/kernel/salinfo.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/salinfo.c +++ linux-rt.q/arch/ia64/kernel/salinfo.c @@ -140,7 +140,7 @@ enum salinfo_state { struct salinfo_data { cpumask_t cpu_event; /* which cpus have outstanding events */ - struct semaphore mutex; + struct compat_semaphore mutex; u8 *log_buffer; u64 log_size; u8 *oemdata; /* decoded oem data */ @@ -156,8 +156,8 @@ struct salinfo_data { static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)]; -static DEFINE_SPINLOCK(data_lock); -static DEFINE_SPINLOCK(data_saved_lock); +static DEFINE_RAW_SPINLOCK(data_lock); +static DEFINE_RAW_SPINLOCK(data_saved_lock); /** salinfo_platform_oemdata - optional callback to decode oemdata from an error * record. 
Index: linux-rt.q/arch/ia64/kernel/semaphore.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/semaphore.c +++ linux-rt.q/arch/ia64/kernel/semaphore.c @@ -40,12 +40,12 @@ */ void -__up (struct semaphore *sem) +__up (struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down (struct semaphore *sem) +void __sched __down (struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +82,7 @@ void __sched __down (struct semaphore *s tsk->state = TASK_RUNNING; } -int __sched __down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -142,7 +142,7 @@ int __sched __down_interruptible (struct * count. */ int -__down_trylock (struct semaphore *sem) +__down_trylock (struct compat_semaphore *sem) { unsigned long flags; int sleepers; Index: linux-rt.q/arch/ia64/kernel/signal.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/signal.c +++ linux-rt.q/arch/ia64/kernel/signal.c @@ -446,6 +446,14 @@ ia64_do_signal (struct sigscratch *scr, long errno = scr->pt.r8; # define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * In the ia64_leave_kernel code path, we want the common case to go fast, which * is why we may in certain cases get here from kernel mode. Just return without Index: linux-rt.q/arch/ia64/kernel/smp.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/smp.c +++ linux-rt.q/arch/ia64/kernel/smp.c @@ -261,6 +261,22 @@ smp_send_reschedule (int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. 
+ * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, + IA64_IPI_DM_INT, 0); + } +} + +/* * Called with preemption disabled. */ static void Index: linux-rt.q/arch/ia64/kernel/smpboot.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/smpboot.c +++ linux-rt.q/arch/ia64/kernel/smpboot.c @@ -370,6 +370,8 @@ smp_setup_percpu_timer (void) { } +extern void register_itc_clockevent(void); + static void __cpuinit smp_callin (void) { @@ -444,6 +446,7 @@ smp_callin (void) #ifdef CONFIG_IA32_SUPPORT ia32_gdt_init(); #endif + register_itc_clockevent(); /* * Allow the master to continue. Index: linux-rt.q/arch/ia64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/time.c +++ linux-rt.q/arch/ia64/kernel/time.c @@ -54,6 +54,7 @@ timer_interrupt (int irq, void *dev_id) platform_timer_interrupt(irq, dev_id); +#if 0 new_itm = local_cpu_data->itm_next; if (!time_after(ia64_get_itc(), new_itm)) @@ -61,29 +62,48 @@ timer_interrupt (int irq, void *dev_id) ia64_get_itc(), new_itm); profile_tick(CPU_PROFILING); +#endif + + if (time_after(ia64_get_itc(), local_cpu_data->itm_tick_next)) { - while (1) { - update_process_times(user_mode(get_irq_regs())); + unsigned long new_tick_itm; + new_tick_itm = local_cpu_data->itm_tick_next; - new_itm += local_cpu_data->itm_delta; + profile_tick(CPU_PROFILING, get_irq_regs()); - if (smp_processor_id() == time_keeper_id) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. 
- */ - write_seqlock(&xtime_lock); - do_timer(1); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + while (1) { + update_process_times(user_mode(get_irq_regs())); + + new_tick_itm += local_cpu_data->itm_tick_delta; + + if (smp_processor_id() == time_keeper_id) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid to SMP race by acquiring the + * xtime_lock. + */ + write_seqlock(&xtime_lock); + do_timer(get_irq_regs()); + local_cpu_data->itm_tick_next = new_tick_itm; + write_sequnlock(&xtime_lock); + } else + local_cpu_data->itm_tick_next = new_tick_itm; + + if (time_after(new_tick_itm, ia64_get_itc())) + break; + } + } - if (time_after(new_itm, ia64_get_itc())) - break; + if (time_after(ia64_get_itc(), local_cpu_data->itm_timer_next)) { + if (itc_clockevent.event_handler) + itc_clockevent.event_handler(get_irq_regs()); + // FIXME, really, please + new_itm = local_cpu_data->itm_tick_next; + + if (time_after(new_itm, local_cpu_data->itm_timer_next)) + new_itm = local_cpu_data->itm_timer_next; /* * Allow IPIs to interrupt the timer loop. */ @@ -101,8 +121,8 @@ timer_interrupt (int irq, void *dev_id) * too fast (with the potentially devastating effect * of losing monotony of time). 
*/ - while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) - new_itm += local_cpu_data->itm_delta; + while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_tick_delta/2)) + new_itm += local_cpu_data->itm_tick_delta; ia64_set_itm(new_itm); /* double check, in case we got hit by a (slow) PMI: */ } while (time_after_eq(ia64_get_itc(), new_itm)); @@ -121,7 +141,7 @@ ia64_cpu_local_tick (void) /* arrange for the cycle counter to generate a timer interrupt: */ ia64_set_itv(IA64_TIMER_VECTOR); - delta = local_cpu_data->itm_delta; + delta = local_cpu_data->itm_tick_delta; /* * Stagger the timer tick for each CPU so they don't occur all at (almost) the * same time: @@ -130,8 +150,8 @@ ia64_cpu_local_tick (void) unsigned long hi = 1UL << ia64_fls(cpu); shift = (2*(cpu - hi) + 1) * delta/hi/2; } - local_cpu_data->itm_next = ia64_get_itc() + delta + shift; - ia64_set_itm(local_cpu_data->itm_next); + local_cpu_data->itm_tick_next = ia64_get_itc() + delta + shift; + ia64_set_itm(local_cpu_data->itm_tick_next); } static int nojitter; @@ -189,7 +209,7 @@ ia64_init_itm (void) itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; - local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; + local_cpu_data->itm_tick_delta = (itc_freq + HZ/2) / HZ; printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, " "ITC freq=%lu.%03luMHz", smp_processor_id(), platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, @@ -209,6 +229,7 @@ ia64_init_itm (void) local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<itc_freq; itc_interpolator.drift = itc_drift; @@ -227,6 +248,7 @@ ia64_init_itm (void) #endif register_time_interpolator(&itc_interpolator); } +#endif /* Setup the CPU local timer tick */ ia64_cpu_local_tick(); @@ -234,7 +256,7 @@ ia64_init_itm (void) static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NODELAY, .name = "timer" }; @@ -255,6 
+277,8 @@ time_init (void) * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). */ set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + register_itc_clocksource(); + register_itc_clockevent(); } /* Index: linux-rt.q/arch/ia64/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/traps.c +++ linux-rt.q/arch/ia64/kernel/traps.c @@ -39,11 +39,11 @@ void die (const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -180,7 +180,7 @@ __kprobes ia64_bad_break (unsigned long * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes * care of clearing psr.dfh. */ -static inline void +void disabled_fph_fault (struct pt_regs *regs) { struct ia64_psr *psr = ia64_psr(regs); @@ -199,7 +199,7 @@ disabled_fph_fault (struct pt_regs *regs = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER); if (ia64_is_local_fpu_owner(current)) { - preempt_enable_no_resched(); + __preempt_enable_no_resched(); return; } @@ -219,7 +219,7 @@ disabled_fph_fault (struct pt_regs *regs */ psr->mfh = 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } static inline int Index: linux-rt.q/arch/ia64/kernel/unwind.c =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/unwind.c +++ linux-rt.q/arch/ia64/kernel/unwind.c @@ -82,7 +82,7 @@ typedef unsigned long unw_word; typedef unsigned char unw_hash_index_t; static struct { - spinlock_t lock; /* spinlock for unwind data */ + raw_spinlock_t lock; /* spinlock for unwind data */ /* list of unwind tables (one per load-module) */ struct unw_table *tables; @@ -146,7 +146,7 @@ static struct { # endif } unw = { .tables = 
&unw.kernel_table, - .lock = __SPIN_LOCK_UNLOCKED(unw.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(unw.lock), .save_order = { UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR Index: linux-rt.q/arch/ia64/kernel/unwind_i.h =================================================================== --- linux-rt.q.orig/arch/ia64/kernel/unwind_i.h +++ linux-rt.q/arch/ia64/kernel/unwind_i.h @@ -154,7 +154,7 @@ struct unw_script { unsigned long ip; /* ip this script is for */ unsigned long pr_mask; /* mask of predicates script depends on */ unsigned long pr_val; /* predicate values this script is for */ - rwlock_t lock; + raw_rwlock_t lock; unsigned int flags; /* see UNW_FLAG_* in unwind.h */ unsigned short lru_chain; /* used for least-recently-used chain */ unsigned short coll_chain; /* used for hash collisions */ Index: linux-rt.q/arch/ia64/mm/init.c =================================================================== --- linux-rt.q.orig/arch/ia64/mm/init.c +++ linux-rt.q/arch/ia64/mm/init.c @@ -37,7 +37,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern void ia64_tlb_init (void); Index: linux-rt.q/arch/ia64/mm/tlb.c =================================================================== --- linux-rt.q.orig/arch/ia64/mm/tlb.c +++ linux-rt.q/arch/ia64/mm/tlb.c @@ -32,7 +32,7 @@ static struct { } purge; struct ia64_ctx ia64_ctx = { - .lock = __SPIN_LOCK_UNLOCKED(ia64_ctx.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(ia64_ctx.lock), .next = 1, .max_ctx = ~0U }; Index: linux-rt.q/drivers/char/blocker.c =================================================================== --- linux-rt.q.orig/drivers/char/blocker.c +++ linux-rt.q/drivers/char/blocker.c @@ -4,7 +4,6 @@ #include #include -#include #define BLOCKER_MINOR 221 Index: linux-rt.q/include/asm-ia64/irqflags.h =================================================================== --- /dev/null +++ 
linux-rt.q/include/asm-ia64/irqflags.h @@ -0,0 +1,95 @@ + +/* + * include/asm-i64/irqflags.h + * + * IRQ flags handling + * + * This file gets included from lowlevel asm headers too, to provide + * wrapped versions of the local_irq_*() APIs, based on the + * raw_local_irq_*() macros from the lowlevel headers. + */ +#ifndef _ASM_IRQFLAGS_H +#define _ASM_IRQFLAGS_H + +/* For spinlocks etc */ + +/* + * - clearing psr.i is implicitly serialized (visible by next insn) + * - setting psr.i requires data serialization + * - we need a stop-bit before reading PSR because we sometimes + * write a floating-point register right before reading the PSR + * and that writes to PSR.mfl + */ +#define __local_irq_save(x) \ +do { \ + ia64_stop(); \ + (x) = ia64_getreg(_IA64_REG_PSR); \ + ia64_stop(); \ + ia64_rsm(IA64_PSR_I); \ +} while (0) + +#define __local_irq_disable() \ +do { \ + ia64_stop(); \ + ia64_rsm(IA64_PSR_I); \ +} while (0) + +#define __local_irq_restore(x) ia64_intrin_local_irq_restore((x) & IA64_PSR_I) + +#ifdef CONFIG_IA64_DEBUG_IRQ + + extern unsigned long last_cli_ip; + +# define __save_ip() last_cli_ip = ia64_getreg(_IA64_REG_IP) + +# define raw_local_irq_save(x) \ +do { \ + unsigned long psr; \ + \ + __local_irq_save(psr); \ + if (psr & IA64_PSR_I) \ + __save_ip(); \ + (x) = psr; \ +} while (0) + +# define raw_local_irq_disable() do { unsigned long x; local_irq_save(x); } while (0) + +# define raw_local_irq_restore(x) \ +do { \ + unsigned long old_psr, psr = (x); \ + \ + local_save_flags(old_psr); \ + __local_irq_restore(psr); \ + if ((old_psr & IA64_PSR_I) && !(psr & IA64_PSR_I)) \ + __save_ip(); \ +} while (0) + +#else /* !CONFIG_IA64_DEBUG_IRQ */ +# define raw_local_irq_save(x) __local_irq_save(x) +# define raw_local_irq_disable() __local_irq_disable() +# define raw_local_irq_restore(x) __local_irq_restore(x) +#endif /* !CONFIG_IA64_DEBUG_IRQ */ + +#define raw_local_irq_enable() ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); }) +#define 
raw_local_save_flags(flags) ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); }) + +#define raw_irqs_disabled() \ +({ \ + unsigned long __ia64_id_flags; \ + local_save_flags(__ia64_id_flags); \ + (__ia64_id_flags & IA64_PSR_I) == 0; \ +}) + +#define raw_irqs_disabled_flags(flags) ((flags & IA64_PSR_I) == 0) + + +#define raw_safe_halt() ia64_pal_halt_light() /* PAL_HALT_LIGHT */ + +/* TBD... */ +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +# define TRACE_IRQS_ON_STR +# define TRACE_IRQS_OFF_STR + +#endif + Index: linux-rt.q/include/asm-ia64/mmu_context.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/mmu_context.h +++ linux-rt.q/include/asm-ia64/mmu_context.h @@ -32,7 +32,7 @@ #include struct ia64_ctx { - spinlock_t lock; + raw_spinlock_t lock; unsigned int next; /* next context number to use */ unsigned int limit; /* available free range */ unsigned int max_ctx; /* max. context value supported by all CPUs */ Index: linux-rt.q/include/asm-ia64/percpu.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/percpu.h +++ linux-rt.q/include/asm-ia64/percpu.h @@ -24,10 +24,17 @@ #define DECLARE_PER_CPU(type, name) \ extern __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name +#define DECLARE_PER_CPU_LOCKED(type, name) \ + extern spinlock_t per_cpu_lock__##name##_locked; \ + extern __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name##_locked + /* Separate out the type, so (int[3], foo) works. 
*/ #define DEFINE_PER_CPU(type, name) \ - __attribute__((__section__(".data.percpu"))) \ - __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name + __attribute__((__section__(".data.percpu"))) __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_LOCKED(type, name) \ + __attribute__((__section__(".data.percpu"))) __SMALL_ADDR_AREA __DEFINE_SPINLOCK(per_cpu_lock__##name##_locked); \ + __attribute__((__section__(".data.percpu"))) __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name##_locked /* * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an @@ -45,6 +52,16 @@ DECLARE_PER_CPU(unsigned long, local_per #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) #define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) +#define per_cpu_lock(var, cpu) \ + (*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset[cpu])) +#define per_cpu_var_locked(var, cpu) \ + (*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset[cpu])) +#define __get_cpu_lock(var, cpu) \ + per_cpu_lock(var, cpu) +#define __get_cpu_var_locked(var, cpu) \ + per_cpu_var_locked(var, cpu) + + extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size); extern void setup_per_cpu_areas (void); extern void *per_cpu_init(void); Index: linux-rt.q/include/asm-ia64/processor.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/processor.h +++ linux-rt.q/include/asm-ia64/processor.h @@ -124,8 +124,10 @@ struct ia64_psr { */ struct cpuinfo_ia64 { __u32 softirq_pending; - __u64 itm_delta; /* # of clock cycles between clock ticks */ - __u64 itm_next; /* interval timer mask value to use for next clock tick */ + __u64 itm_tick_delta; /* # of clock cycles between clock ticks */ + __u64 itm_tick_next; /* interval timer mask value to use for next clock tick */ + __u64 itm_timer_next; + __u64 __itm_next; 
__u64 nsec_per_cyc; /* (1000000000<count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -66,7 +66,7 @@ init_rwsem (struct rw_semaphore *sem) * lock for reading */ static inline void -__down_read (struct rw_semaphore *sem) +__down_read (struct compat_rw_semaphore *sem) { long result = ia64_fetchadd8_acq((unsigned long *)&sem->count, 1); @@ -78,7 +78,7 @@ __down_read (struct rw_semaphore *sem) * lock for writing */ static inline void -__down_write (struct rw_semaphore *sem) +__down_write (struct compat_rw_semaphore *sem) { long old, new; @@ -95,7 +95,7 @@ __down_write (struct rw_semaphore *sem) * unlock after reading */ static inline void -__up_read (struct rw_semaphore *sem) +__up_read (struct compat_rw_semaphore *sem) { long result = ia64_fetchadd8_rel((unsigned long *)&sem->count, -1); @@ -107,7 +107,7 @@ __up_read (struct rw_semaphore *sem) * unlock after writing */ static inline void -__up_write (struct rw_semaphore *sem) +__up_write (struct compat_rw_semaphore *sem) { long old, new; @@ -124,7 +124,7 @@ __up_write (struct rw_semaphore *sem) * trylock for reading -- returns 1 if successful, 0 if contention */ static inline int -__down_read_trylock (struct rw_semaphore *sem) +__down_read_trylock (struct compat_rw_semaphore *sem) { long tmp; while ((tmp = sem->count) >= 0) { @@ -139,7 +139,7 @@ __down_read_trylock (struct rw_semaphore * trylock for writing -- returns 1 if successful, 0 if contention */ static inline int -__down_write_trylock (struct rw_semaphore *sem) +__down_write_trylock (struct compat_rw_semaphore *sem) { long tmp = cmpxchg_acq(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); @@ -150,7 +150,7 @@ __down_write_trylock (struct rw_semaphor * downgrade write lock to read lock */ static inline void -__downgrade_write (struct rw_semaphore *sem) +__downgrade_write (struct compat_rw_semaphore *sem) { long old, new; @@ -170,7 +170,7 @@ __downgrade_write (struct rw_semaphore * #define rwsem_atomic_add(delta, sem) 
atomic64_add(delta, (atomic64_t *)(&(sem)->count)) #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->count != 0); } Index: linux-rt.q/include/asm-ia64/sal.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/sal.h +++ linux-rt.q/include/asm-ia64/sal.h @@ -43,7 +43,7 @@ #include #include -extern spinlock_t sal_lock; +extern raw_spinlock_t sal_lock; /* SAL spec _requires_ eight args for each call. */ #define __SAL_CALL(result,a0,a1,a2,a3,a4,a5,a6,a7) \ Index: linux-rt.q/include/asm-ia64/semaphore.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/semaphore.h +++ linux-rt.q/include/asm-ia64/semaphore.h @@ -11,54 +11,65 @@ #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, count) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name, count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) -#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name, 0) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) +#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 0) + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + 
+asmlinkage int compat_sem_is_locked(struct compat_semaphore *sem); static inline void -sema_init (struct semaphore *sem, int val) +compat_sema_init (struct compat_semaphore *sem, int val) { - *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val); + *sem = (struct compat_semaphore) __COMPAT_SEMAPHORE_INITIALIZER(*sem, val); } static inline void -init_MUTEX (struct semaphore *sem) +compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } static inline void -init_MUTEX_LOCKED (struct semaphore *sem) +compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern void __down (struct semaphore * sem); -extern int __down_interruptible (struct semaphore * sem); -extern int __down_trylock (struct semaphore * sem); -extern void __up (struct semaphore * sem); +extern void __down (struct compat_semaphore * sem); +extern int __down_interruptible (struct compat_semaphore * sem); +extern int __down_trylock (struct compat_semaphore * sem); +extern void __up (struct compat_semaphore * sem); /* * Atomically decrement the semaphore's count. If it goes negative, * block the calling thread in the TASK_UNINTERRUPTIBLE state. */ static inline void -down (struct semaphore *sem) +compat_down (struct compat_semaphore *sem) { might_sleep(); if (ia64_fetchadd(-1, &sem->count.counter, acq) < 1) @@ -70,7 +81,7 @@ down (struct semaphore *sem) * block the calling thread in the TASK_INTERRUPTIBLE state. 
*/ static inline int -down_interruptible (struct semaphore * sem) +compat_down_interruptible (struct compat_semaphore * sem) { int ret = 0; @@ -81,7 +92,7 @@ down_interruptible (struct semaphore * s } static inline int -down_trylock (struct semaphore *sem) +compat_down_trylock (struct compat_semaphore *sem) { int ret = 0; @@ -91,10 +102,12 @@ down_trylock (struct semaphore *sem) } static inline void -up (struct semaphore * sem) +compat_up (struct compat_semaphore * sem) { if (ia64_fetchadd(1, &sem->count.counter, rel) <= -1) __up(sem); } +#include + #endif /* _ASM_IA64_SEMAPHORE_H */ Index: linux-rt.q/include/asm-ia64/spinlock.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/spinlock.h +++ linux-rt.q/include/asm-ia64/spinlock.h @@ -17,8 +17,6 @@ #include #include -#define __raw_spin_lock_init(x) ((x)->lock = 0) - #ifdef ASM_SUPPORTED /* * Try to get the lock. If we fail to get the lock, make a non-standard call to @@ -30,7 +28,7 @@ #define IA64_SPINLOCK_CLOBBERS "ar.ccv", "ar.pfs", "p14", "p15", "r27", "r28", "r29", "r30", "b6", "memory" static inline void -__raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags) +__raw_spin_lock_flags (__raw_spinlock_t *lock, unsigned long flags) { register volatile unsigned int *ptr asm ("r31") = &lock->lock; @@ -89,7 +87,7 @@ __raw_spin_lock_flags (raw_spinlock_t *l #define __raw_spin_lock(lock) __raw_spin_lock_flags(lock, 0) /* Unlock by doing an ordered store and releasing the cacheline with nta */ -static inline void __raw_spin_unlock(raw_spinlock_t *x) { +static inline void __raw_spin_unlock(__raw_spinlock_t *x) { barrier(); asm volatile ("st4.rel.nta [%0] = r0\n\t" :: "r"(x)); } @@ -109,7 +107,7 @@ do { \ } while (ia64_spinlock_val); \ } \ } while (0) -#define __raw_spin_unlock(x) do { barrier(); ((raw_spinlock_t *) x)->lock = 0; } while (0) +#define __raw_spin_unlock(x) do { barrier(); ((__raw_spinlock_t *) x)->lock = 0; } while (0) #endif /* !ASM_SUPPORTED 
*/ #define __raw_spin_is_locked(x) ((x)->lock != 0) @@ -122,7 +120,7 @@ do { \ #define __raw_read_lock(rw) \ do { \ - raw_rwlock_t *__read_lock_ptr = (rw); \ + __raw_rwlock_t *__read_lock_ptr = (rw); \ \ while (unlikely(ia64_fetchadd(1, (int *) __read_lock_ptr, acq) < 0)) { \ ia64_fetchadd(-1, (int *) __read_lock_ptr, rel); \ @@ -133,7 +131,7 @@ do { \ #define __raw_read_unlock(rw) \ do { \ - raw_rwlock_t *__read_lock_ptr = (rw); \ + __raw_rwlock_t *__read_lock_ptr = (rw); \ ia64_fetchadd(-1, (int *) __read_lock_ptr, rel); \ } while (0) @@ -165,7 +163,7 @@ do { \ (result == 0); \ }) -static inline void __raw_write_unlock(raw_rwlock_t *x) +static inline void __raw_write_unlock(__raw_rwlock_t *x) { u8 *y = (u8 *)x; barrier(); @@ -193,7 +191,7 @@ static inline void __raw_write_unlock(ra (ia64_val == 0); \ }) -static inline void __raw_write_unlock(raw_rwlock_t *x) +static inline void __raw_write_unlock(__raw_rwlock_t *x) { barrier(); x->write_lock = 0; @@ -201,10 +199,10 @@ static inline void __raw_write_unlock(ra #endif /* !ASM_SUPPORTED */ -static inline int __raw_read_trylock(raw_rwlock_t *x) +static inline int __raw_read_trylock(__raw_rwlock_t *x) { union { - raw_rwlock_t lock; + __raw_rwlock_t lock; __u32 word; } old, new; old.lock = new.lock = *x; @@ -213,8 +211,8 @@ static inline int __raw_read_trylock(raw return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word; } -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() #endif /* _ASM_IA64_SPINLOCK_H */ Index: linux-rt.q/include/asm-ia64/spinlock_types.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/spinlock_types.h +++ linux-rt.q/include/asm-ia64/spinlock_types.h @@ -7,14 +7,14 @@ typedef struct { volatile unsigned int lock; -} 
raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } typedef struct { volatile unsigned int read_counter : 31; volatile unsigned int write_lock : 1; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { 0, 0 } Index: linux-rt.q/include/asm-ia64/system.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/system.h +++ linux-rt.q/include/asm-ia64/system.h @@ -104,81 +104,16 @@ extern struct ia64_boot_param { */ #define set_mb(var, value) do { (var) = (value); mb(); } while (0) -#define safe_halt() ia64_pal_halt_light() /* PAL_HALT_LIGHT */ /* * The group barrier in front of the rsm & ssm are necessary to ensure * that none of the previous instructions in the same group are * affected by the rsm/ssm. */ -/* For spinlocks etc */ -/* - * - clearing psr.i is implicitly serialized (visible by next insn) - * - setting psr.i requires data serialization - * - we need a stop-bit before reading PSR because we sometimes - * write a floating-point register right before reading the PSR - * and that writes to PSR.mfl - */ -#define __local_irq_save(x) \ -do { \ - ia64_stop(); \ - (x) = ia64_getreg(_IA64_REG_PSR); \ - ia64_stop(); \ - ia64_rsm(IA64_PSR_I); \ -} while (0) - -#define __local_irq_disable() \ -do { \ - ia64_stop(); \ - ia64_rsm(IA64_PSR_I); \ -} while (0) - -#define __local_irq_restore(x) ia64_intrin_local_irq_restore((x) & IA64_PSR_I) - -#ifdef CONFIG_IA64_DEBUG_IRQ - extern unsigned long last_cli_ip; - -# define __save_ip() last_cli_ip = ia64_getreg(_IA64_REG_IP) - -# define local_irq_save(x) \ -do { \ - unsigned long psr; \ - \ - __local_irq_save(psr); \ - if (psr & IA64_PSR_I) \ - __save_ip(); \ - (x) = psr; \ -} while (0) - -# define local_irq_disable() do { unsigned long x; local_irq_save(x); } while (0) - -# define local_irq_restore(x) \ -do { \ - unsigned long old_psr, psr = (x); \ - \ - local_save_flags(old_psr); \ - __local_irq_restore(psr); \ - if ((old_psr & 
IA64_PSR_I) && !(psr & IA64_PSR_I)) \ - __save_ip(); \ -} while (0) +#include -#else /* !CONFIG_IA64_DEBUG_IRQ */ -# define local_irq_save(x) __local_irq_save(x) -# define local_irq_disable() __local_irq_disable() -# define local_irq_restore(x) __local_irq_restore(x) -#endif /* !CONFIG_IA64_DEBUG_IRQ */ - -#define local_irq_enable() ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); }) -#define local_save_flags(flags) ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); }) - -#define irqs_disabled() \ -({ \ - unsigned long __ia64_id_flags; \ - local_save_flags(__ia64_id_flags); \ - (__ia64_id_flags & IA64_PSR_I) == 0; \ -}) #ifdef __KERNEL__ Index: linux-rt.q/include/asm-ia64/thread_info.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/thread_info.h +++ linux-rt.q/include/asm-ia64/thread_info.h @@ -91,6 +91,7 @@ struct thread_info { #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ #define TIF_FREEZE 20 /* is freezing for suspend */ +#define TIF_NEED_RESCHED_DELAYED 20 /* reschedule on return to userspace */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) Index: linux-rt.q/include/asm-ia64/tlb.h =================================================================== --- linux-rt.q.orig/include/asm-ia64/tlb.h +++ linux-rt.q/include/asm-ia64/tlb.h @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -61,11 +62,12 @@ struct mmu_gather { unsigned char need_flush; /* really unmapped some PTEs? */ unsigned long start_addr; unsigned long end_addr; + int cpu; struct page *pages[FREE_PTE_NR]; }; /* Users of the generic TLB shootdown code must declare this storage space. 
*/ -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * Flush the TLB for address range START to END and, if not in fast mode, release the @@ -127,8 +129,10 @@ ia64_tlb_flush_mmu (struct mmu_gather *t static inline struct mmu_gather * tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); + int cpu; + struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu); + tlb->cpu = cpu; tlb->mm = mm; /* * Use fast mode if only 1 CPU is online. @@ -165,7 +169,7 @@ tlb_finish_mmu (struct mmu_gather *tlb, /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + put_cpu_var_locked(mmu_gathers, tlb->cpu); } /* patches/latency-tracing-i386.patch0000664000077200007720000002045210646635212016325 0ustar mingomingo arch/i386/Kconfig | 1 + arch/i386/boot/compressed/Makefile | 1 + arch/i386/kernel/Makefile | 1 + arch/i386/kernel/apic.c | 2 ++ arch/i386/kernel/entry.S | 15 +++++++++++++++ arch/i386/kernel/hpet.c | 2 +- arch/i386/kernel/irq.c | 7 ++++++- arch/i386/kernel/mcount-wrapper.S | 27 +++++++++++++++++++++++++++ arch/i386/kernel/traps.c | 1 + arch/i386/kernel/tsc.c | 2 +- arch/i386/mm/fault.c | 1 + arch/i386/mm/init.c | 2 +- include/asm-i386/processor.h | 11 +++++++---- 13 files changed, 65 insertions(+), 8 deletions(-) Index: linux-rt.q/arch/i386/Kconfig =================================================================== --- linux-rt.q.orig/arch/i386/Kconfig +++ linux-rt.q/arch/i386/Kconfig @@ -777,6 +777,7 @@ config BOOT_IOREMAP # config REGPARM bool + depends on !MCOUNT default y config SECCOMP Index: linux-rt.q/arch/i386/boot/compressed/Makefile =================================================================== --- linux-rt.q.orig/arch/i386/boot/compressed/Makefile +++ linux-rt.q/arch/i386/boot/compressed/Makefile @@ -9,6 +9,7 @@ targets := vmlinux vmlinux.bin vmlinux. 
EXTRA_AFLAGS := -traditional LDFLAGS_vmlinux := -T +CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing CFLAGS_misc.o += -fPIC hostprogs-y := relocs Index: linux-rt.q/arch/i386/kernel/Makefile =================================================================== --- linux-rt.q.orig/arch/i386/kernel/Makefile +++ linux-rt.q/arch/i386/kernel/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o obj-$(CONFIG_SMP) += smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o Index: linux-rt.q/arch/i386/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/apic.c +++ linux-rt.q/arch/i386/kernel/apic.c @@ -583,6 +583,8 @@ void fastcall smp_apic_timer_interrupt(s { struct pt_regs *old_regs = set_irq_regs(regs); + trace_special(regs->eip, 1, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. 
Index: linux-rt.q/arch/i386/kernel/entry.S =================================================================== --- linux-rt.q.orig/arch/i386/kernel/entry.S +++ linux-rt.q/arch/i386/kernel/entry.S @@ -329,6 +329,11 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -343,6 +348,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_EVENT_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -366,6 +376,11 @@ ENTRY(system_call) pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -292,7 +292,7 @@ static int hpet_legacy_next_event(unsign /* * Clock source related code */ -static cycle_t read_hpet(void) +static cycle_t notrace read_hpet(void) { return (cycle_t)hpet_readl(HPET_COUNTER); } Index: linux-rt.q/arch/i386/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/irq.c +++ linux-rt.q/arch/i386/kernel/irq.c @@ -68,7 +68,7 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). 
*/ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ @@ -87,6 +87,11 @@ fastcall unsigned int do_IRQ(struct pt_r old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_EVENT_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { Index: linux-rt.q/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null +++ linux-rt.q/arch/i386/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux-rt.q/arch/i386/kernel/traps.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/traps.c +++ linux-rt.q/arch/i386/kernel/traps.c @@ -222,6 +222,7 @@ show_trace_log_lvl(struct task_struct *t { dump_trace(task, regs, stack, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); + print_traces(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, Index: linux-rt.q/arch/i386/kernel/tsc.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/tsc.c +++ linux-rt.q/arch/i386/kernel/tsc.c @@ -261,7 +261,7 @@ core_initcall(cpufreq_tsc); static unsigned long current_tsc_khz = 0; -static cycle_t read_tsc(void) +static notrace cycle_t read_tsc(void) { cycle_t ret; Index: linux-rt.q/arch/i386/mm/fault.c =================================================================== --- linux-rt.q.orig/arch/i386/mm/fault.c +++ 
linux-rt.q/arch/i386/mm/fault.c @@ -488,6 +488,7 @@ bad_area_nosemaphore: nr = (address - idt_descr.address) >> 3; if (nr == 6) { + stop_trace(); do_invalid_op(regs, 0); return; } Index: linux-rt.q/arch/i386/mm/init.c =================================================================== --- linux-rt.q.orig/arch/i386/mm/init.c +++ linux-rt.q/arch/i386/mm/init.c @@ -193,7 +193,7 @@ static inline int page_kills_ppro(unsign return 0; } -int page_is_ram(unsigned long pagenr) +int notrace page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; Index: linux-rt.q/include/asm-i386/processor.h =================================================================== --- linux-rt.q.orig/include/asm-i386/processor.h +++ linux-rt.q/include/asm-i386/processor.h @@ -128,7 +128,7 @@ extern void detect_ht(struct cpuinfo_x86 static inline void detect_ht(struct cpuinfo_x86 *c) {} #endif -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, +static inline void fastcall native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ @@ -602,7 +602,9 @@ static inline void load_esp0(struct tss_ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx * resulting in stale register contents being returned. 
*/ -static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +static inline void +cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = 0; @@ -610,8 +612,9 @@ static inline void cpuid(unsigned int op } /* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, - int *edx) +static inline void +cpuid_count(int op, int count, unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = count; patches/s_files-per_cpu-flush-fix.patch0000664000077200007720000000235210646635216017537 0ustar mingomingo--- fs/file_table.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) Index: linux-rt.q/fs/file_table.c =================================================================== --- linux-rt.q.orig/fs/file_table.c +++ linux-rt.q/fs/file_table.c @@ -332,6 +332,22 @@ static void __filevec_add(struct filevec filevec_reinit(fvec); } +/* + * Flush files per-CPU workqueue: + */ +static struct workqueue_struct *flush_files_workqueue; + +int __init flush_files_init(void) +{ + flush_files_workqueue = create_workqueue("flush_filesd"); + if (!flush_files_workqueue) + panic("Failed to create flush_filesd\n"); + + return 0; +} + +__initcall(flush_files_init); + static void filevec_add_drain(void) { int cpu; @@ -341,14 +357,15 @@ static void filevec_add_drain(void) put_cpu_var_locked(sb_fvec, cpu); } -static void filevec_add_drain_per_cpu(void *dummy) +static void filevec_add_drain_per_cpu(struct work_struct *none) { filevec_add_drain(); } int filevec_add_drain_all(void) { - return schedule_on_each_cpu(filevec_add_drain_per_cpu, NULL); + return schedule_on_each_cpu_wq(flush_files_workqueue, + filevec_add_drain_per_cpu); } EXPORT_SYMBOL_GPL(filevec_add_drain_all); 
patches/net-core-preempt-fix.patch0000664000077200007720000000070410646635214016524 0ustar mingomingo--- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/net/core/dev.c =================================================================== --- linux-rt.q.orig/net/core/dev.c +++ linux-rt.q/net/core/dev.c @@ -1679,8 +1679,8 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - preempt_disable(); err = netif_rx(skb); + preempt_disable(); if (local_softirq_pending()) do_softirq(); preempt_enable(); patches/preempt-realtime-supress-rtc-printk.patch0000664000077200007720000000100310646635215021615 0ustar mingomingo--- drivers/char/rtc.c | 2 ++ 1 file changed, 2 insertions(+) Index: linux-rt.q/drivers/char/rtc.c =================================================================== --- linux-rt.q.orig/drivers/char/rtc.c +++ linux-rt.q/drivers/char/rtc.c @@ -1332,7 +1332,9 @@ static void rtc_dropped_irq(unsigned lon spin_unlock_irq(&rtc_lock); +#ifndef CONFIG_PREEMPT_RT printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); +#endif /* Now we have new data */ wake_up_interruptible(&rtc_wait); patches/rcu-2.patch0000664000077200007720000000567610646635213013516 0ustar mingomingo Finally, RCU gets its own softirq. With it being used extensively, the per-cpu tasklet used earlier was just a softirq with overheads. This makes things more efficient. 
Signed-off-by: Dipankar Sarma include/linux/interrupt.h | 1 + kernel/rcuclassic.c | 12 +++++------- 2 files changed, 6 insertions(+), 7 deletions(-) Index: linux-rt.q/include/linux/interrupt.h =================================================================== --- linux-rt.q.orig/include/linux/interrupt.h +++ linux-rt.q/include/linux/interrupt.h @@ -269,6 +269,7 @@ enum #ifdef CONFIG_HIGH_RES_TIMERS HRTIMER_SOFTIRQ, #endif + RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ }; /* softirq mask and active fields moved to irq_cpustat_t in Index: linux-rt.q/kernel/rcuclassic.c =================================================================== --- linux-rt.q.orig/kernel/rcuclassic.c +++ linux-rt.q/kernel/rcuclassic.c @@ -69,7 +69,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; /* Fake initialization required by compiler */ -static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int blimit = 10; static int qhimark = 10000; static int qlowmark = 100; @@ -215,7 +214,7 @@ static void rcu_do_batch(struct rcu_data if (!rdp->donelist) rdp->donetail = &rdp->donelist; else - tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); + raise_softirq(RCU_SOFTIRQ); } /* @@ -367,7 +366,6 @@ static void rcu_offline_cpu(int cpu) &per_cpu(rcu_bh_data, cpu)); put_cpu_var(rcu_data); put_cpu_var(rcu_bh_data); - tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); } #else @@ -379,7 +377,7 @@ static void rcu_offline_cpu(int cpu) #endif /* - * This does the RCU processing work from tasklet context. + * This does the RCU processing work from softirq context. 
*/ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -424,7 +422,7 @@ static void __rcu_process_callbacks(stru rcu_do_batch(rdp); } -static void rcu_process_callbacks(unsigned long unused) +static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); @@ -488,7 +486,7 @@ void rcu_check_callbacks(int cpu, int us rcu_bh_qsctr_inc(cpu); } else if (!in_softirq()) rcu_bh_qsctr_inc(cpu); - tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); + raise_softirq(RCU_SOFTIRQ); } static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, @@ -511,7 +509,7 @@ static void __devinit rcu_online_cpu(int rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); } static int __devinit rcu_cpu_notify(struct notifier_block *self, patches/rcu-4.patch0000664000077200007720000003620410646635213013507 0ustar mingomingo This patch consolidates the RCU tracing code in the preemptible RCU implementation, moves them to a separate "trace" file and cleans up the #ifdefs. Moving to a separate file will eventually allow dynamic tracing of RCU implementation. 
Signed-off-by: Paul McKenney Signed-off-by: Dipankar Sarma include/linux/rcupreempt_trace.h | 84 ++++++++++++++++++++++++++++ kernel/Kconfig.preempt | 11 +-- kernel/Makefile | 1 kernel/rcupreempt.c | 113 ++++++++++++--------------------------- kernel/rcupreempt_trace.c | 99 ++++++++++++++++++++++++++++++++++ 5 files changed, 225 insertions(+), 83 deletions(-) Index: linux-rt.q/include/linux/rcupreempt_trace.h =================================================================== --- /dev/null +++ linux-rt.q/include/linux/rcupreempt_trace.h @@ -0,0 +1,84 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (RT implementation) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
+ * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rcupdate.html + * + */ + +#ifndef __LINUX_RCUPREEMPT_TRACE_H +#define __LINUX_RCUPREEMPT_TRACE_H + +#ifdef __KERNEL__ +#include +#include + +#include + +/* + * PREEMPT_RCU data structures. + */ + +struct rcupreempt_trace { + long next_length; + long next_add; + long wait_length; + long wait_add; + long done_length; + long done_add; + long done_remove; + atomic_t done_invoked; + long rcu_check_callbacks; + atomic_t rcu_try_flip1; + long rcu_try_flip2; + long rcu_try_flip3; + atomic_t rcu_try_flip_e1; + long rcu_try_flip_e2; + long rcu_try_flip_e3; +}; + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(fn, arg) fn(arg); +#else +#define RCU_TRACE(fn, arg) +#endif + +extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_e2(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_e3(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip2(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip3(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace); + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUPREEMPT_TRACE_H */ Index: linux-rt.q/kernel/Kconfig.preempt =================================================================== --- 
linux-rt.q.orig/kernel/Kconfig.preempt +++ linux-rt.q/kernel/Kconfig.preempt @@ -90,13 +90,12 @@ config PREEMPT_RCU endchoice -config RCU_STATS - bool "/proc stats for preemptible RCU read-side critical sections" - depends on PREEMPT_RCU +config RCU_TRACE + bool "Enable tracing for RCU - currently stats in /proc" default y help - This option provides /proc stats to provide debugging info for - the preemptible realtime RCU implementation. + This option provides tracing in RCU which presents /proc + stats for debugging RCU implementation. - Say Y here if you want to see RCU stats in /proc + Say Y here if you want to enable RCU tracing Say N if you are unsure. Index: linux-rt.q/kernel/Makefile =================================================================== --- linux-rt.q.orig/kernel/Makefile +++ linux-rt.q/kernel/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_CLASSIC_RCU) += rcupdate.o rcuclassic.o obj-$(CONFIG_PREEMPT_RCU) += rcupdate.o rcupreempt.o +obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_UTS_NS) += utsname.o Index: linux-rt.q/kernel/rcupreempt.c =================================================================== --- linux-rt.q.orig/kernel/rcupreempt.c +++ linux-rt.q/kernel/rcupreempt.c @@ -48,6 +48,7 @@ #include #include #include +#include /* * PREEMPT_RCU data structures. 
@@ -63,23 +64,9 @@ struct rcu_data { struct rcu_head **waittail; struct rcu_head *donelist; struct rcu_head **donetail; -#ifdef CONFIG_RCU_STATS - long n_next_length; - long n_next_add; - long n_wait_length; - long n_wait_add; - long n_done_length; - long n_done_add; - long n_done_remove; - atomic_t n_done_invoked; - long n_rcu_check_callbacks; - atomic_t n_rcu_try_flip1; - long n_rcu_try_flip2; - long n_rcu_try_flip3; - atomic_t n_rcu_try_flip_e1; - long n_rcu_try_flip_e2; - long n_rcu_try_flip_e3; -#endif /* #ifdef CONFIG_RCU_STATS */ +#ifdef CONFIG_RCU_TRACE + struct rcupreempt_trace trace; +#endif /* #ifdef CONFIG_RCU_TRACE */ }; struct rcu_ctrlblk { spinlock_t fliplock; @@ -180,22 +167,14 @@ static void __rcu_advance_callbacks(void if (rcu_data.waitlist != NULL) { *rcu_data.donetail = rcu_data.waitlist; rcu_data.donetail = rcu_data.waittail; -#ifdef CONFIG_RCU_STATS - rcu_data.n_done_length += rcu_data.n_wait_length; - rcu_data.n_done_add += rcu_data.n_wait_length; - rcu_data.n_wait_length = 0; -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_move2done, &rcu_data.trace); } if (rcu_data.nextlist != NULL) { rcu_data.waitlist = rcu_data.nextlist; rcu_data.waittail = rcu_data.nexttail; rcu_data.nextlist = NULL; rcu_data.nexttail = &rcu_data.nextlist; -#ifdef CONFIG_RCU_STATS - rcu_data.n_wait_length += rcu_data.n_next_length; - rcu_data.n_wait_add += rcu_data.n_next_length; - rcu_data.n_next_length = 0; -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_move2wait, &rcu_data.trace); } else { rcu_data.waitlist = NULL; rcu_data.waittail = &rcu_data.waitlist; @@ -220,22 +199,16 @@ static void rcu_try_flip(void) unsigned long oldirq; flipctr = rcu_ctrlblk.completed; -#ifdef CONFIG_RCU_STATS - atomic_inc(&rcu_data.n_rcu_try_flip1); -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_try_flip1, &rcu_data.trace); if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) { -#ifdef CONFIG_RCU_STATS - 
atomic_inc(&rcu_data.n_rcu_try_flip_e1); -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_try_flip_e1, &rcu_data.trace); return; } if (unlikely(flipctr != rcu_ctrlblk.completed)) { /* Our work is done! ;-) */ -#ifdef CONFIG_RCU_STATS - rcu_data.n_rcu_try_flip_e2++; -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_try_flip_e2, &rcu_data.trace); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); return; } @@ -246,14 +219,11 @@ static void rcu_try_flip(void) * that started prior to the previous flip. */ -#ifdef CONFIG_RCU_STATS - rcu_data.n_rcu_try_flip2++; -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_try_flip2, &rcu_data.trace); for_each_possible_cpu(cpu) { if (atomic_read(&per_cpu(rcu_flipctr, cpu)[!flipctr]) != 0) { -#ifdef CONFIG_RCU_STATS - rcu_data.n_rcu_try_flip_e3++; -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_try_flip_e3, + &rcu_data.trace); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); return; } @@ -264,9 +234,7 @@ static void rcu_try_flip(void) smp_mb(); rcu_ctrlblk.completed++; -#ifdef CONFIG_RCU_STATS - rcu_data.n_rcu_try_flip3++; -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_try_flip3, &rcu_data.trace); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); } @@ -281,9 +249,7 @@ void rcu_check_callbacks(int cpu, int us } } spin_lock_irqsave(&rcu_data.lock, oldirq); -#ifdef CONFIG_RCU_STATS - rcu_data.n_rcu_check_callbacks++; -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_check_callbacks, &rcu_data.trace); __rcu_advance_callbacks(); if (rcu_data.donelist == NULL) { spin_unlock_irqrestore(&rcu_data.lock, oldirq); @@ -306,18 +272,13 @@ static void rcu_process_callbacks(unsign } rcu_data.donelist = NULL; rcu_data.donetail = &rcu_data.donelist; -#ifdef CONFIG_RCU_STATS - rcu_data.n_done_remove += rcu_data.n_done_length; - rcu_data.n_done_length = 0; -#endif /* #ifdef CONFIG_RCU_STATS */ + 
RCU_TRACE(rcupreempt_trace_done_remove, &rcu_data.trace); spin_unlock_irqrestore(&rcu_data.lock, flags); while (list) { next = list->next; list->func(list); list = next; -#ifdef CONFIG_RCU_STATS - atomic_inc(&rcu_data.n_done_invoked); -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_invoke, &rcu_data.trace); } } @@ -332,10 +293,7 @@ void fastcall call_rcu(struct rcu_head * __rcu_advance_callbacks(); *rcu_data.nexttail = head; rcu_data.nexttail = &head->next; -#ifdef CONFIG_RCU_STATS - rcu_data.n_next_add++; - rcu_data.n_next_length++; -#endif /* #ifdef CONFIG_RCU_STATS */ + RCU_TRACE(rcupreempt_trace_next_add, &rcu_data.trace); spin_unlock_irqrestore(&rcu_data.lock, flags); } @@ -389,9 +347,10 @@ void synchronize_kernel(void) synchronize_rcu(); } -#ifdef CONFIG_RCU_STATS +#ifdef CONFIG_RCU_TRACE int rcu_read_proc_data(char *page) { + struct rcupreempt_trace *trace = &rcu_data.trace; return sprintf(page, "ggp=%ld lgp=%ld rcc=%ld\n" "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" @@ -399,23 +358,23 @@ int rcu_read_proc_data(char *page) rcu_ctrlblk.completed, rcu_data.completed, - rcu_data.n_rcu_check_callbacks, + trace->rcu_check_callbacks, - rcu_data.n_next_add, - rcu_data.n_next_length, - rcu_data.n_wait_add, - rcu_data.n_wait_length, - rcu_data.n_done_add, - rcu_data.n_done_length, - rcu_data.n_done_remove, - atomic_read(&rcu_data.n_done_invoked), - - atomic_read(&rcu_data.n_rcu_try_flip1), - rcu_data.n_rcu_try_flip2, - rcu_data.n_rcu_try_flip3, - atomic_read(&rcu_data.n_rcu_try_flip_e1), - rcu_data.n_rcu_try_flip_e2, - rcu_data.n_rcu_try_flip_e3); + trace->next_add, + trace->next_length, + trace->wait_add, + trace->wait_length, + trace->done_add, + trace->done_length, + trace->done_remove, + atomic_read(&trace->done_invoked), + + atomic_read(&trace->rcu_try_flip1), + trace->rcu_try_flip2, + trace->rcu_try_flip3, + atomic_read(&trace->rcu_try_flip_e1), + trace->rcu_try_flip_e2, + trace->rcu_try_flip_e3); } int 
rcu_read_proc_gp_data(char *page) @@ -454,7 +413,7 @@ int rcu_read_proc_ctrs_data(char *page) return (cnt); } -#endif /* #ifdef CONFIG_RCU_STATS */ +#endif /* #ifdef CONFIG_RCU_TRACE */ EXPORT_SYMBOL_GPL(call_rcu); EXPORT_SYMBOL_GPL(rcu_batches_completed); Index: linux-rt.q/kernel/rcupreempt_trace.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/rcupreempt_trace.c @@ -0,0 +1,99 @@ +/* + * Read-Copy Update tracing for realtime implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2006 + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) +{ + trace->done_length += trace->wait_length; + trace->done_add += trace->wait_length; + trace->wait_length = 0; +} +void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) +{ + trace->wait_length += trace->next_length; + trace->wait_add += trace->next_length; + trace->next_length = 0; +} +void rcupreempt_trace_try_flip1(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->rcu_try_flip1); +} +void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->rcu_try_flip_e1); +} +void rcupreempt_trace_try_flip_e2(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_e2++; +} +void rcupreempt_trace_try_flip_e3(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_e3++; +} +void rcupreempt_trace_try_flip2(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip2++; +} +void rcupreempt_trace_try_flip3(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip3++; +} +void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) +{ + trace->rcu_check_callbacks++; +} +void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) +{ + trace->done_remove += trace->done_length; + trace->done_length = 0; +} +void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->done_invoked); +} +void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) +{ + trace->next_add++; + trace->next_length++; +} patches/clockevents-fix-resume-logic.patch0000664000077200007720000002403010646635210020241 0ustar mingomingoFrom: Thomas Gleixner We need to make 
sure, that the clockevent devices are resumed, before the tick is resumed. The current resume logic does not guarantee this. Add CLOCK_EVT_MODE_RESUME and call the set mode functions of the clock event devices before resuming the tick / oneshot functionality. Fixup the existing users. Thanks to Nigel Cunningham for tracking down a long standing thinko, which affected the jinxed VAIO. Signed-off-by: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton --- arch/arm/mach-davinci/time.c | 2 + arch/arm/mach-ixp4xx/common.c | 2 + arch/arm/mach-omap1/time.c | 1 arch/arm/plat-omap/timer32k.c | 2 + arch/i386/kernel/apic.c | 3 + arch/i386/kernel/hpet.c | 71 ++------------------------------------ arch/i386/kernel/i8253.c | 26 ++++++------- arch/i386/kernel/vmiclock.c | 1 arch/sh/kernel/timers/timer-tmu.c | 1 arch/sparc64/kernel/time.c | 1 include/linux/clockchips.h | 1 kernel/time/tick-broadcast.c | 6 ++- kernel/time/tick-common.c | 16 +++++--- 13 files changed, 45 insertions(+), 88 deletions(-) Index: linux-rt.q/arch/arm/mach-davinci/time.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-davinci/time.c +++ linux-rt.q/arch/arm/mach-davinci/time.c @@ -285,6 +285,8 @@ static void davinci_set_mode(enum clock_ case CLOCK_EVT_MODE_SHUTDOWN: t->opts = TIMER_OPTS_DISABLED; break; + case CLOCK_EVT_MODE_RESUME: + break; } } Index: linux-rt.q/arch/arm/mach-ixp4xx/common.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-ixp4xx/common.c +++ linux-rt.q/arch/arm/mach-ixp4xx/common.c @@ -459,6 +459,8 @@ static void ixp4xx_set_mode(enum clock_e default: osrt = opts = 0; break; + case CLOCK_EVT_MODE_RESUME: + break; } *IXP4XX_OSRT1 = osrt | opts; Index: linux-rt.q/arch/arm/mach-omap1/time.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-omap1/time.c +++ linux-rt.q/arch/arm/mach-omap1/time.c @@ -156,6 +156,7 @@ static void 
omap_mpu_set_mode(enum clock break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_RESUME: break; } } Index: linux-rt.q/arch/arm/plat-omap/timer32k.c =================================================================== --- linux-rt.q.orig/arch/arm/plat-omap/timer32k.c +++ linux-rt.q/arch/arm/plat-omap/timer32k.c @@ -156,6 +156,8 @@ static void omap_32k_timer_set_mode(enum case CLOCK_EVT_MODE_SHUTDOWN: omap_32k_timer_stop(); break; + case CLOCK_EVT_MODE_RESUME: + break; } } Index: linux-rt.q/arch/i386/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/apic.c +++ linux-rt.q/arch/i386/kernel/apic.c @@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write_around(APIC_LVTT, v); break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; } local_irq_restore(flags); Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -187,6 +187,10 @@ static void hpet_set_mode(enum clock_eve cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T0_CFG); break; + + case CLOCK_EVT_MODE_RESUME: + hpet_enable_int(); + break; } } @@ -217,6 +221,7 @@ static struct clocksource clocksource_hp .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_start_counter, }; /* @@ -313,7 +318,6 @@ int __init hpet_enable(void) clocksource_register(&clocksource_hpet); - if (id & HPET_ID_LEGSUP) { hpet_enable_int(); hpet_reserve_platform_timers(id); @@ -546,68 +550,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, return IRQ_HANDLED; } #endif - - -/* - * Suspend/resume part - */ - -#ifdef CONFIG_PM - -static int hpet_suspend(struct sys_device *sys_device, pm_message_t state) -{ - unsigned long cfg = hpet_readl(HPET_CFG); - - cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY); - 
hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_resume(struct sys_device *sys_device) -{ - unsigned int id; - - hpet_start_counter(); - - id = hpet_readl(HPET_ID); - - if (id & HPET_ID_LEGSUP) - hpet_enable_int(); - - return 0; -} - -static struct sysdev_class hpet_class = { - set_kset_name("hpet"), - .suspend = hpet_suspend, - .resume = hpet_resume, -}; - -static struct sys_device hpet_device = { - .id = 0, - .cls = &hpet_class, -}; - - -static __init int hpet_register_sysfs(void) -{ - int err; - - if (!is_hpet_capable()) - return 0; - - err = sysdev_class_register(&hpet_class); - - if (!err) { - err = sysdev_register(&hpet_device); - if (err) - sysdev_class_unregister(&hpet_class); - } - - return err; -} - -device_initcall(hpet_register_sysfs); - -#endif Index: linux-rt.q/arch/i386/kernel/i8253.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/i8253.c +++ linux-rt.q/arch/i386/kernel/i8253.c @@ -3,11 +3,11 @@ * */ #include -#include +#include +#include #include -#include #include -#include +#include #include #include @@ -41,26 +41,24 @@ static void init_pit_timer(enum clock_ev case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - udelay(10); outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); outb(LATCH >> 8 , PIT_CH0); /* MSB */ break; - /* - * Avoid unnecessary state transitions, as it confuses - * Geode / Cyrix based boxen. 
- */ case CLOCK_EVT_MODE_SHUTDOWN: - if (evt->mode == CLOCK_EVT_MODE_UNUSED) - break; case CLOCK_EVT_MODE_UNUSED: - if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN) - break; + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); /* LSB */ + outb_p(0, PIT_CH0); /* MSB */ + break; + case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ outb_p(0x38, PIT_MODE); - udelay(10); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ break; } spin_unlock_irqrestore(&i8253_lock, flags); Index: linux-rt.q/arch/i386/kernel/vmiclock.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/vmiclock.c +++ linux-rt.q/arch/i386/kernel/vmiclock.c @@ -142,6 +142,7 @@ static void vmi_timer_set_mode(enum cloc switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_PERIODIC: cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); Index: linux-rt.q/arch/sh/kernel/timers/timer-tmu.c =================================================================== --- linux-rt.q.orig/arch/sh/kernel/timers/timer-tmu.c +++ linux-rt.q/arch/sh/kernel/timers/timer-tmu.c @@ -80,6 +80,7 @@ static void tmu_set_mode(enum clock_even break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_RESUME: break; } } Index: linux-rt.q/arch/sparc64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/sparc64/kernel/time.c +++ linux-rt.q/arch/sparc64/kernel/time.c @@ -938,6 +938,7 @@ static void sparc64_timer_setup(enum clo { switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_SHUTDOWN: Index: linux-rt.q/include/linux/clockchips.h =================================================================== --- linux-rt.q.orig/include/linux/clockchips.h +++ linux-rt.q/include/linux/clockchips.h @@ -23,6 +23,7 @@ enum clock_event_mode { CLOCK_EVT_MODE_SHUTDOWN, CLOCK_EVT_MODE_PERIODIC, CLOCK_EVT_MODE_ONESHOT, + 
CLOCK_EVT_MODE_RESUME, }; /* Clock event notification values */ Index: linux-rt.q/kernel/time/tick-broadcast.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-broadcast.c +++ linux-rt.q/kernel/time/tick-broadcast.c @@ -49,7 +49,7 @@ cpumask_t *tick_get_broadcast_mask(void) */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { - if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (bc) tick_setup_periodic(bc, 1); } @@ -299,7 +299,7 @@ void tick_suspend_broadcast(void) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + if (bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -316,6 +316,8 @@ int tick_resume_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); + switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if(!cpus_empty(tick_broadcast_mask)) Index: linux-rt.q/kernel/time/tick-common.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-common.c +++ linux-rt.q/kernel/time/tick-common.c @@ -318,12 +318,17 @@ static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; + int broadcast = tick_resume_broadcast(); spin_lock_irqsave(&tick_device_lock, flags); - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(td->evtdev, 0); - else - tick_resume_oneshot(); + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -360,8 +365,7 @@ static int tick_notify(struct notifier_b break; case CLOCK_EVT_NOTIFY_RESUME: - if (!tick_resume_broadcast()) - tick_resume(); + 
tick_resume(); break; default: patches/preempt-realtime-usb.patch0000664000077200007720000001366410646635215016627 0ustar mingomingo--- drivers/net/usb/usbnet.c | 2 ++ drivers/usb/core/devio.c | 7 ++++--- drivers/usb/core/hcd.c | 30 +++++++++++------------------- drivers/usb/core/message.c | 11 ++++++----- 4 files changed, 23 insertions(+), 27 deletions(-) Index: linux-rt.q/drivers/net/usb/usbnet.c =================================================================== --- linux-rt.q.orig/drivers/net/usb/usbnet.c +++ linux-rt.q/drivers/net/usb/usbnet.c @@ -896,6 +896,8 @@ static void tx_complete (struct urb *urb urb->dev = NULL; entry->state = tx_done; + spin_lock_rt(&dev->txq.lock); + spin_unlock_rt(&dev->txq.lock); defer_bh(dev, skb, &dev->txq); } Index: linux-rt.q/drivers/usb/core/devio.c =================================================================== --- linux-rt.q.orig/drivers/usb/core/devio.c +++ linux-rt.q/drivers/usb/core/devio.c @@ -308,10 +308,11 @@ static void async_completed(struct urb * struct async *as = urb->context; struct dev_state *ps = as->ps; struct siginfo sinfo; + unsigned long flags; - spin_lock(&ps->lock); - list_move_tail(&as->asynclist, &ps->async_completed); - spin_unlock(&ps->lock); + spin_lock_irqsave(&ps->lock, flags); + list_move_tail(&as->asynclist, &ps->async_completed); + spin_unlock_irqrestore(&ps->lock, flags); if (as->signr) { sinfo.si_signo = as->signr; sinfo.si_errno = as->urb->status; Index: linux-rt.q/drivers/usb/core/hcd.c =================================================================== --- linux-rt.q.orig/drivers/usb/core/hcd.c +++ linux-rt.q/drivers/usb/core/hcd.c @@ -518,13 +518,11 @@ error: } /* any errors get returned through the urb completion */ - local_irq_save (flags); - spin_lock (&urb->lock); + spin_lock_irqsave(&urb->lock, flags); if (urb->status == -EINPROGRESS) urb->status = status; - spin_unlock (&urb->lock); + spin_unlock_irqrestore(&urb->lock, flags); usb_hcd_giveback_urb (hcd, urb); - local_irq_restore 
(flags); return 0; } @@ -554,8 +552,7 @@ void usb_hcd_poll_rh_status(struct usb_h if (length > 0) { /* try to complete the status urb */ - local_irq_save (flags); - spin_lock(&hcd_root_hub_lock); + spin_lock_irqsave(&hcd_root_hub_lock, flags); urb = hcd->status_urb; if (urb) { spin_lock(&urb->lock); @@ -571,14 +568,13 @@ void usb_hcd_poll_rh_status(struct usb_h spin_unlock(&urb->lock); } else length = 0; - spin_unlock(&hcd_root_hub_lock); + spin_unlock_irqrestore(&hcd_root_hub_lock, flags); /* local irqs are always blocked in completions */ if (length > 0) usb_hcd_giveback_urb (hcd, urb); else hcd->poll_pending = 1; - local_irq_restore (flags); } /* The USB 2.0 spec says 256 ms. This is close enough and won't @@ -650,17 +646,15 @@ static int usb_rh_urb_dequeue (struct us } else { /* Status URB */ if (!hcd->uses_new_polling) del_timer (&hcd->rh_timer); - local_irq_save (flags); - spin_lock (&hcd_root_hub_lock); + spin_lock_irqsave(&hcd_root_hub_lock, flags); if (urb == hcd->status_urb) { hcd->status_urb = NULL; urb->hcpriv = NULL; } else urb = NULL; /* wasn't fully queued */ - spin_unlock (&hcd_root_hub_lock); + spin_unlock_irqrestore(&hcd_root_hub_lock, flags); if (urb) usb_hcd_giveback_urb (hcd, urb); - local_irq_restore (flags); } return 0; @@ -1175,11 +1169,10 @@ void usb_hcd_endpoint_disable (struct us struct urb *urb; hcd = bus_to_hcd(udev->bus); - local_irq_disable (); /* ep is already gone from udev->ep_{in,out}[]; no more submits */ rescan: - spin_lock (&hcd_data_lock); + spin_lock_irq(&hcd_data_lock); list_for_each_entry (urb, &ep->urb_list, urb_list) { int tmp; @@ -1187,13 +1180,13 @@ rescan: if (urb->status != -EINPROGRESS) continue; usb_get_urb (urb); - spin_unlock (&hcd_data_lock); + spin_unlock_irq(&hcd_data_lock); - spin_lock (&urb->lock); + spin_lock_irq(&urb->lock); tmp = urb->status; if (tmp == -EINPROGRESS) urb->status = -ESHUTDOWN; - spin_unlock (&urb->lock); + spin_unlock_irq(&urb->lock); /* kick hcd unless it's already returning this */ if 
(tmp == -EINPROGRESS) { @@ -1216,8 +1209,7 @@ rescan: /* list contents may have changed */ goto rescan; } - spin_unlock (&hcd_data_lock); - local_irq_enable (); + spin_unlock_irq(&hcd_data_lock); /* synchronize with the hardware, so old configuration state * clears out immediately (and will be freed). Index: linux-rt.q/drivers/usb/core/message.c =================================================================== --- linux-rt.q.orig/drivers/usb/core/message.c +++ linux-rt.q/drivers/usb/core/message.c @@ -250,8 +250,9 @@ static void sg_clean (struct usb_sg_requ static void sg_complete (struct urb *urb) { struct usb_sg_request *io = urb->context; + unsigned long flags; - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); /* In 2.5 we require hcds' endpoint queues not to progress after fault * reports, until the completion callback (this!) returns. That lets @@ -285,7 +286,7 @@ static void sg_complete (struct urb *urb * unlink pending urbs so they won't rx/tx bad data. * careful: unlink can sometimes be synchronous... 
*/ - spin_unlock (&io->lock); + spin_unlock_irqrestore (&io->lock, flags); for (i = 0, found = 0; i < io->entries; i++) { if (!io->urbs [i] || !io->urbs [i]->dev) continue; @@ -300,7 +301,7 @@ static void sg_complete (struct urb *urb } else if (urb == io->urbs [i]) found = 1; } - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); } urb->dev = NULL; @@ -310,7 +311,7 @@ static void sg_complete (struct urb *urb if (!io->count) complete (&io->complete); - spin_unlock (&io->lock); + spin_unlock_irqrestore (&io->lock, flags); } @@ -586,7 +587,7 @@ void usb_sg_cancel (struct usb_sg_reques dev_warn (&io->dev->dev, "%s, unlink --> %d\n", __FUNCTION__, retval); } - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); } spin_unlock_irqrestore (&io->lock, flags); } patches/preempt-realtime-core.patch0000664000077200007720000012456010646635215016764 0ustar mingomingo--- include/linux/completion.h | 1 include/linux/hardirq.h | 42 ++++++----- include/linux/kernel.h | 15 +++- include/linux/profile.h | 12 ++- include/linux/radix-tree.h | 13 +++ include/linux/smp.h | 11 +++ include/linux/smp_lock.h | 4 - include/linux/workqueue.h | 3 kernel/Kconfig.preempt | 145 +++++++++++++++++++++++++-------------- kernel/exit.c | 21 ++++- kernel/fork.c | 164 ++++++++++++++++++++++++++++++++++++++++++++- kernel/futex.c | 10 ++ kernel/power/swsusp.c | 1 kernel/signal.c | 9 ++ kernel/softirq.c | 14 +++ kernel/stop_machine.c | 4 - kernel/sys.c | 9 +- kernel/user.c | 7 + kernel/workqueue.c | 52 +++++++++++++- lib/Kconfig.debug | 4 - lib/Makefile | 3 lib/kernel_lock.c | 27 ++++--- lib/locking-selftest.c | 29 +++++-- lib/radix-tree.c | 6 + lib/smp_processor_id.c | 4 - 25 files changed, 485 insertions(+), 125 deletions(-) Index: linux-rt.q/include/linux/completion.h =================================================================== --- linux-rt.q.orig/include/linux/completion.h +++ linux-rt.q/include/linux/completion.h @@ -48,6 +48,7 @@ extern unsigned long FASTCALL(wait_for_c 
unsigned long timeout)); extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout( struct completion *x, unsigned long timeout)); +extern unsigned int FASTCALL(completion_done(struct completion *x)); extern void FASTCALL(complete(struct completion *)); extern void FASTCALL(complete_all(struct completion *)); Index: linux-rt.q/include/linux/hardirq.h =================================================================== --- linux-rt.q.orig/include/linux/hardirq.h +++ linux-rt.q/include/linux/hardirq.h @@ -41,23 +41,25 @@ # error HARDIRQ_BITS is too low! #endif #endif +#define PREEMPT_ACTIVE_BITS 1 -#define PREEMPT_SHIFT 0 -#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) -#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) - -#define __IRQ_MASK(x) ((1UL << (x))-1) - -#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) -#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) -#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) - -#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) -#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) -#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define PREEMPT_ACTIVE_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + +#define __IRQ_MASK(x) ((1UL << (x))-1) + +#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) +#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) +#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) + +#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) +#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS)) -#error PREEMPT_ACTIVE is too low! +# error PREEMPT_ACTIVE is too low! #endif #define hardirq_count() (preempt_count() & HARDIRQ_MASK) @@ -68,11 +70,13 @@ * Are we doing bottom half or hardware interrupt processing? 
* Are we in a softirq context? Interrupt context? */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) - -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) +#define in_irq() (hardirq_count() || (current->flags & PF_HARDIRQ)) +#define in_softirq() (softirq_count() || (current->flags & PF_SOFTIRQ)) +#define in_interrupt() (irq_count()) + +#if defined(CONFIG_PREEMPT) && \ + !defined(CONFIG_PREEMPT_BKL) && \ + !defined(CONFIG_PREEMPT_RT) # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) #else # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) Index: linux-rt.q/include/linux/kernel.h =================================================================== --- linux-rt.q.orig/include/linux/kernel.h +++ linux-rt.q/include/linux/kernel.h @@ -88,7 +88,7 @@ extern int cond_resched(void); # define might_resched() do { } while (0) #endif -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) @@ -167,6 +167,18 @@ static inline int printk(const char *s, static inline int printk(const char *s, ...) 
{ return 0; } #endif +#ifdef CONFIG_PREEMPT_RT +extern void zap_rt_locks(void); +#else +# define zap_rt_locks() do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_RT +extern void zap_rt_locks(void); +#else +# define zap_rt_locks() do { } while (0) +#endif + unsigned long int_sqrt(unsigned long); extern int printk_ratelimit(void); @@ -198,6 +210,7 @@ extern void add_taint(unsigned); /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, + SYSTEM_BOOTING_SCHEDULER_OK, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, Index: linux-rt.q/include/linux/profile.h =================================================================== --- linux-rt.q.orig/include/linux/profile.h +++ linux-rt.q/include/linux/profile.h @@ -6,16 +6,18 @@ #include #include #include +#include #include #include extern int prof_on __read_mostly; -#define CPU_PROFILING 1 -#define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 -#define KVM_PROFILING 4 +#define CPU_PROFILING 1 +#define SCHED_PROFILING 2 +#define SLEEP_PROFILING 3 +#define KVM_PROFILING 4 +#define PREEMPT_PROFILING 5 struct proc_dir_entry; struct pt_regs; @@ -54,6 +56,8 @@ enum profile_type { PROFILE_MUNMAP }; +extern int prof_pid; + #ifdef CONFIG_PROFILING struct task_struct; Index: linux-rt.q/include/linux/radix-tree.h =================================================================== --- linux-rt.q.orig/include/linux/radix-tree.h +++ linux-rt.q/include/linux/radix-tree.h @@ -155,7 +155,18 @@ void *radix_tree_delete(struct radix_tre unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); +/* + * On a mutex based kernel we can freely schedule within the radix code: + */ +#ifdef CONFIG_PREEMPT_RT +static inline int radix_tree_preload(gfp_t gfp_mask) +{ + return 0; +} +#else int radix_tree_preload(gfp_t gfp_mask); +#endif + void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); 
@@ -171,7 +182,9 @@ int radix_tree_tagged(struct radix_tree_ static inline void radix_tree_preload_end(void) { +#ifndef CONFIG_PREEMPT_RT preempt_enable(); +#endif } #endif /* _LINUX_RADIX_TREE_H */ Index: linux-rt.q/include/linux/smp.h =================================================================== --- linux-rt.q.orig/include/linux/smp.h +++ linux-rt.q/include/linux/smp.h @@ -33,6 +33,16 @@ extern void smp_send_stop(void); */ extern void smp_send_reschedule(int cpu); +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + /* * Prepare machine for booting other CPUs. @@ -97,6 +107,7 @@ static inline int up_smp_call_function(v 0; \ }) static inline void smp_send_reschedule(int cpu) { } +static inline void smp_send_reschedule_allbutself(void) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) static inline int smp_call_function_single(int cpuid, void (*func) (void *info), Index: linux-rt.q/include/linux/smp_lock.h =================================================================== --- linux-rt.q.orig/include/linux/smp_lock.h +++ linux-rt.q/include/linux/smp_lock.h @@ -18,6 +18,8 @@ extern void __lockfunc __release_kernel_ __release_kernel_lock(); \ } while (0) + + /* * Non-SMP kernels will never block on the kernel lock, * so we are better off returning a constant zero from @@ -45,7 +47,7 @@ extern void __lockfunc unlock_kernel(voi #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) #define release_kernel_lock(task) do { } while(0) -#define reacquire_kernel_lock(task) 0 +#define reacquire_kernel_lock(task) do { } while(0) #define kernel_locked() 1 #endif /* CONFIG_LOCK_KERNEL */ Index: linux-rt.q/include/linux/workqueue.h =================================================================== --- linux-rt.q.orig/include/linux/workqueue.h +++ 
linux-rt.q/include/linux/workqueue.h @@ -125,6 +125,9 @@ extern struct workqueue_struct *__create #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1) #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0) +extern void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice); + extern void destroy_workqueue(struct workqueue_struct *wq); extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work)); Index: linux-rt.q/kernel/Kconfig.preempt =================================================================== --- linux-rt.q.orig/kernel/Kconfig.preempt +++ linux-rt.q/kernel/Kconfig.preempt @@ -1,14 +1,13 @@ - choice - prompt "Preemption Model" - default PREEMPT_NONE + prompt "Preemption Mode" + default PREEMPT_RT config PREEMPT_NONE bool "No Forced Preemption (Server)" help - This is the traditional Linux preemption model, geared towards + This is the traditional Linux preemption model geared towards throughput. It will still provide good latencies most of the - time, but there are no guarantees and occasional longer delays + time but there are no guarantees and occasional long delays are possible. Select this option if you are building a kernel for a server or @@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new - preemption points have been selected to reduce the maximum + preemption points have been selected to minimize the maximum latency of rescheduling, providing faster application reactions, at the cost of slightly lower throughput. @@ -33,42 +32,120 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. 
-config PREEMPT +config PREEMPT_DESKTOP bool "Preemptible Kernel (Low-Latency Desktop)" help This option reduces the latency of the kernel by making - all kernel code (that is not executing in a critical section) + all kernel code that is not executing in a critical section preemptible. This allows reaction to interactive events by permitting a low priority process to be preempted involuntarily even if it is in kernel mode executing a system call and would - otherwise not be about to reach a natural preemption point. - This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slightly lower throughput - and a slight runtime overhead to kernel code. + otherwise not about to reach a preemption point. This allows + applications to run more 'smoothly' even when the system is + under load, at the cost of slighly lower throughput and a + slight runtime overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 50% of the time.) Select this if you are building a kernel for a desktop or embedded system with latency requirements in the milliseconds range. +config PREEMPT_RT + bool "Complete Preemption (Real-Time)" + select PREEMPT_SOFTIRQS + select PREEMPT_HARDIRQS + select PREEMPT_RCU + select RT_MUTEXES + help + This option further reduces the scheduling latency of the + kernel by replacing almost every spinlock used by the kernel + with preemptible mutexes and thus making all but the most + critical kernel code involuntarily preemptible. The remaining + handful of lowlevel non-preemptible codepaths are short and + have a deterministic latency of a couple of tens of + microseconds (depending on the hardware). This also allows + applications to run more 'smoothly' even when the system is + under load, at the cost of lower throughput and runtime + overhead to kernel code. 
+ + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 95% of the time.) + + Select this if you are building a kernel for a desktop, + embedded or real-time system with guaranteed latency + requirements of 100 usecs or lower. + endchoice -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on SMP || PREEMPT +config PREEMPT + bool default y + depends on PREEMPT_DESKTOP || PREEMPT_RT + +config PREEMPT_SOFTIRQS + bool "Thread Softirqs" + default n +# depends on PREEMPT + help + This option reduces the latency of the kernel by 'threading' + soft interrupts. This means that all softirqs will execute + in softirqd's context. While this helps latency, it can also + reduce performance. + + The threading of softirqs can also be controlled via + /proc/sys/kernel/softirq_preemption runtime flag and the + sofirq-preempt=0/1 boot-time option. + + Say N if you are unsure. + +config PREEMPT_HARDIRQS + bool "Thread Hardirqs" + default n + depends on !GENERIC_HARDIRQS_NO__DO_IRQ + help + This option reduces the latency of the kernel by 'threading' + hardirqs. This means that all (or selected) hardirqs will run + in their own kernel thread context. While this helps latency, + this feature can also reduce performance. + + The threading of hardirqs can also be controlled via the + /proc/sys/kernel/hardirq_preemption runtime flag and the + hardirq-preempt=0/1 boot-time option. Per-irq threading can + be enabled/disable via the /proc/irq///threaded + runtime flags. + + Say N if you are unsure. + +config SPINLOCK_BKL + bool "Old-Style Big Kernel Lock" + depends on (PREEMPT || SMP) && !PREEMPT_RT + default n help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. + This option increases the latency of the kernel by making the + big kernel lock spinlock-based (which is bad for latency). 
+ However, enable this option if you see any problems to revert + back to the traditional spinlock BKL design. Say Y here if you are building a kernel for a desktop system. Say N if you are unsure. +config PREEMPT_BKL + bool + depends on PREEMPT_RT || !SPINLOCK_BKL + default n if !PREEMPT + default y + choice prompt "RCU implementation type:" default CLASSIC_RCU config CLASSIC_RCU bool "Classic RCU" + depends on !PREEMPT_RT help This option selects the classic RCU implementation that is designed for best read-side performance on non-realtime @@ -99,37 +176,3 @@ config RCU_TRACE Say Y here if you want to enable RCU tracing Say N if you are unsure. - -config PREEMPT_SOFTIRQS - bool "Thread Softirqs" - default n -# depends on PREEMPT - help - This option reduces the latency of the kernel by 'threading' - soft interrupts. This means that all softirqs will execute - in softirqd's context. While this helps latency, it can also - reduce performance. - - The threading of softirqs can also be controlled via - /proc/sys/kernel/softirq_preemption runtime flag and the - sofirq-preempt=0/1 boot-time option. - - Say N if you are unsure. - -config PREEMPT_HARDIRQS - bool "Thread Hardirqs" - default n - depends on !GENERIC_HARDIRQS_NO__DO_IRQ - help - This option reduces the latency of the kernel by 'threading' - hardirqs. This means that all (or selected) hardirqs will run - in their own kernel thread context. While this helps latency, - this feature can also reduce performance. - - The threading of hardirqs can also be controlled via the - /proc/sys/kernel/hardirq_preemption runtime flag and the - hardirq-preempt=0/1 boot-time option. Per-irq threading can - be enabled/disable via the /proc/irq///threaded - runtime flags. - - Say N if you are unsure. 
Index: linux-rt.q/kernel/exit.c =================================================================== --- linux-rt.q.orig/kernel/exit.c +++ linux-rt.q/kernel/exit.c @@ -63,7 +63,9 @@ static void __unhash_process(struct task detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); + preempt_disable(); __get_cpu_var(process_counts)--; + preempt_enable(); } list_del_rcu(&p->thread_group); remove_parent(p); @@ -585,9 +587,11 @@ static void exit_mm(struct task_struct * BUG_ON(mm != tsk->active_mm); /* more a memory barrier than a real lock */ task_lock(tsk); + preempt_disable(); // FIXME tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); + preempt_enable(); task_unlock(tsk); mmput(mm); } @@ -867,6 +871,7 @@ fastcall NORET_TYPE void do_exit(long co WARN_ON(atomic_read(&tsk->fs_excl)); + BUG_ON(in_interrupt()); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) @@ -993,15 +998,18 @@ fastcall NORET_TYPE void do_exit(long co if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - preempt_disable(); +again: + local_irq_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ + __schedule(); + printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n", + current->comm, current->pid); + printk(KERN_ERR ".... flags: %08x, count: %d, state: %08lx\n", + current->flags, atomic_read(¤t->usage), current->state); + printk(KERN_ERR ".... 
trying again ...\n"); + goto again; } EXPORT_SYMBOL_GPL(do_exit); @@ -1508,6 +1516,7 @@ repeat: list_for_each(_p,&tsk->children) { p = list_entry(_p, struct task_struct, sibling); + BUG_ON(!atomic_read(&p->usage)); ret = eligible_child(pid, options, p); if (!ret) continue; Index: linux-rt.q/kernel/fork.c =================================================================== --- linux-rt.q.orig/kernel/fork.c +++ linux-rt.q/kernel/fork.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include #include @@ -69,6 +71,15 @@ DEFINE_PER_CPU(unsigned long, process_co __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +/* + * Delayed mmdrop. In the PREEMPT_RT case we + * dont want to do this from the scheduling + * context. + */ +static DEFINE_PER_CPU(struct task_struct *, desched_task); + +static DEFINE_PER_CPU(struct list_head, delayed_drop_list); + int nr_processes(void) { int cpu; @@ -112,10 +123,13 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -void __put_task_struct(struct task_struct *tsk) +#ifdef CONFIG_PREEMPT_RT +void __put_task_struct_cb(struct rcu_head *rhp) { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + BUG_ON(atomic_read(&tsk->usage)); WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); - WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); sched_dead(tsk); @@ -128,8 +142,27 @@ void __put_task_struct(struct task_struc free_task(tsk); } +#else + +void __put_task_struct(struct task_struct *tsk) +{ + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); + BUG_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); + + if (!profile_handoff_task(tsk)) + free_task(tsk); +} +#endif + void __init fork_init(unsigned long mempages) { + int i; + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -157,6 +190,9 @@ void 
__init fork_init(unsigned long memp init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < NR_CPUS; i++) + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -341,6 +377,7 @@ static struct mm_struct * mm_init(struct spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; + INIT_LIST_HEAD(&mm->delayed_drop); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; @@ -1200,10 +1237,12 @@ static struct task_struct *copy_process( * to ensure it is on a valid CPU (and if not, just force it back to * parent's CPU). This avoids alot of nasty races. */ + preempt_disable(); p->cpus_allowed = current->cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); + preempt_enable(); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) @@ -1263,7 +1302,9 @@ static struct task_struct *copy_process( attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); + preempt_disable(); __get_cpu_var(process_counts)++; + preempt_enable(); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; @@ -1685,3 +1726,122 @@ bad_unshare_cleanup_thread: bad_unshare_out: return err; } + +static int mmdrop_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_drop_list); + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + list_del(&mm->delayed_drop); + put_cpu_var(delayed_drop_list); + + __mmdrop(mm); + ret = 1; + + head = &get_cpu_var(delayed_drop_list); + } + put_cpu_var(delayed_drop_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + 
*/ +void fastcall __mmdrop_delayed(struct mm_struct *mm) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_drop_list); + list_add_tail(&mm->delayed_drop, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_drop_list); +} + +static int desched_thread(void * __bind_cpu) +{ + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (mmdrop_complete()) + continue; + schedule(); + + /* This must be called from time to time on ia64, and is a no-op on other archs. + * Used to be in cpu_idle(), but with the new -rt semantics it can't stay there. + */ + check_pgt_cache(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + + BUG_ON(per_cpu(desched_task, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); + if (IS_ERR(p)) { + printk("desched_thread for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(desched_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(desched_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + + /* Unbind so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + + p = per_cpu(desched_task, hotcpu); + per_cpu(desched_task, hotcpu) = NULL; + kthread_stop(p); + takeover_tasklets(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init int spawn_desched_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} + Index: linux-rt.q/kernel/futex.c =================================================================== --- linux-rt.q.orig/kernel/futex.c +++ linux-rt.q/kernel/futex.c @@ -940,7 +940,7 @@ static int futex_requeue(u32 __user *uad plist_del(&this->list, &hb1->chain); plist_add(&this->list, &hb2->chain); this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST +#if defined(CONFIG_DEBUG_PI_LIST) && !defined(CONFIG_PREEMPT_RT) this->list.plist.lock = &hb2->lock; #endif } @@ -1001,7 +1001,7 @@ static inline void __queue_me(struct fut prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST +#if defined(CONFIG_DEBUG_PI_LIST) && !defined(CONFIG_PREEMPT_RT) q->list.plist.lock = &hb->lock; #endif plist_add(&q->list, &hb->chain); @@ -1239,6 +1239,10 @@ static int futex_wait(u32 __user *uaddr, * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 
*/ if (likely(!plist_node_empty(&q.list))) { + unsigned long nosched_flag = current->flags & PF_NOSCHED; + + current->flags &= ~PF_NOSCHED; + if (!abs_time) schedule(); else { @@ -1261,6 +1265,8 @@ static int futex_wait(u32 __user *uaddr, /* Flag if a timeout occured */ rem = (t.task == NULL); } + + current->flags |= nosched_flag; } __set_current_state(TASK_RUNNING); Index: linux-rt.q/kernel/power/swsusp.c =================================================================== --- linux-rt.q.orig/kernel/power/swsusp.c +++ linux-rt.q/kernel/power/swsusp.c @@ -294,6 +294,7 @@ int swsusp_suspend(void) printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); + touch_softlockup_watchdog(); /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ Index: linux-rt.q/kernel/signal.c =================================================================== --- linux-rt.q.orig/kernel/signal.c +++ linux-rt.q/kernel/signal.c @@ -724,8 +724,10 @@ specific_send_sig_info(int sig, struct s { int ret = 0; - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); +#ifdef CONFIG_SMP assert_spin_locked(&t->sighand->siglock); +#endif /* Short-circuit ignored signals. 
*/ if (sig_ignored(t, sig)) @@ -1571,6 +1573,7 @@ static void ptrace_stop(int exit_code, i if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); + current->flags &= ~PF_NOSCHED; schedule(); } else { /* @@ -1631,6 +1634,7 @@ finish_stop(int stop_count) } do { + current->flags &= ~PF_NOSCHED; schedule(); } while (try_to_freeze()); /* @@ -1742,6 +1746,9 @@ int get_signal_to_deliver(siginfo_t *inf try_to_freeze(); +#ifdef CONFIG_PREEMPT_RT + might_sleep(); +#endif relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -119,6 +120,8 @@ static void trigger_softirqs(void) } } +#ifndef CONFIG_PREEMPT_RT + /* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: @@ -236,6 +239,8 @@ void local_bh_enable_ip(unsigned long ip } EXPORT_SYMBOL(local_bh_enable_ip); +#endif + /* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. 
@@ -647,7 +652,7 @@ void tasklet_kill(struct tasklet_struct while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do - yield(); + msleep(1); while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -899,6 +904,11 @@ int softirq_preemption = 1; EXPORT_SYMBOL(softirq_preemption); +/* + * Real-Time Preemption depends on softirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + static int __init softirq_preempt_setup (char *str) { if (!strncmp(str, "off", 3)) @@ -912,7 +922,7 @@ static int __init softirq_preempt_setup } __setup("softirq-preempt=", softirq_preempt_setup); - +#endif #endif #ifdef CONFIG_SMP Index: linux-rt.q/kernel/stop_machine.c =================================================================== --- linux-rt.q.orig/kernel/stop_machine.c +++ linux-rt.q/kernel/stop_machine.c @@ -63,7 +63,7 @@ static int stopmachine(void *cpu) /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. */ if (!prepared && !irqs_disabled) - yield(); + __yield(); else cpu_relax(); } @@ -113,7 +113,7 @@ static int stop_machine(void) /* Wait for them all to come to life. */ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - yield(); + __yield(); /* If some failed, kill them all. */ if (ret < 0) { Index: linux-rt.q/kernel/sys.c =================================================================== --- linux-rt.q.orig/kernel/sys.c +++ linux-rt.q/kernel/sys.c @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -149,9 +150,9 @@ static int notifier_chain_unregister(str * last notifier function called. 
*/ -static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +static int __kprobes notrace notifier_call_chain(struct notifier_block **nl, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; @@ -487,7 +488,7 @@ int srcu_notifier_chain_register(struct * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ - if (unlikely(system_state == SYSTEM_BOOTING)) + if (unlikely(system_state < SYSTEM_RUNNING)) return notifier_chain_register(&nh->head, n); mutex_lock(&nh->mutex); Index: linux-rt.q/kernel/user.c =================================================================== --- linux-rt.q.orig/kernel/user.c +++ linux-rt.q/kernel/user.c @@ -108,15 +108,16 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); + local_irq_save_nort(flags); if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { uid_hash_remove(up); - spin_unlock_irqrestore(&uidhash_lock, flags); + spin_unlock(&uidhash_lock); + local_irq_restore_nort(flags); key_put(up->uid_keyring); key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); } else { - local_irq_restore(flags); + local_irq_restore_nort(flags); } } Index: linux-rt.q/kernel/workqueue.c =================================================================== --- linux-rt.q.orig/kernel/workqueue.c +++ linux-rt.q/kernel/workqueue.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,8 @@ #include #include +#include + /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu). @@ -157,15 +160,16 @@ static void __queue_work(struct cpu_work * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. + * + * Especially no such guarantee on PREEMPT_RT. 
*/ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0; + int ret = 0, cpu = raw_smp_processor_id(); if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, get_cpu()), work); - put_cpu(); + __queue_work(wq_per_cpu(wq, cpu), work); ret = 1; } return ret; @@ -755,6 +759,47 @@ static void cleanup_workqueue_thread(str cwq->thread = NULL; } +void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu, + int policy, int rt_priority, int nice) +{ + struct sched_param param = { .sched_priority = rt_priority }; + struct cpu_workqueue_struct *cwq; + mm_segment_t oldfs = get_fs(); + struct task_struct *p; + unsigned long flags; + int ret; + + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + spin_lock_irqsave(&cwq->lock, flags); + p = cwq->thread; + spin_unlock_irqrestore(&cwq->lock, flags); + + set_user_nice(p, nice); + + set_fs(KERNEL_DS); + ret = sys_sched_setscheduler(p->pid, policy, &param); + set_fs(oldfs); + + WARN_ON(ret); +} + + void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice) +{ + int cpu; + + /* We don't need the distraction of CPUs appearing and vanishing. 
*/ + mutex_lock(&workqueue_mutex); + if (is_single_threaded(wq)) + set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice); + else { + for_each_online_cpu(cpu) + set_workqueue_thread_prio(wq, cpu, policy, + rt_priority, nice); + } + mutex_unlock(&workqueue_mutex); +} + /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue @@ -837,4 +882,5 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); + set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20); } Index: linux-rt.q/lib/Kconfig.debug =================================================================== --- linux-rt.q.orig/lib/Kconfig.debug +++ linux-rt.q/lib/Kconfig.debug @@ -168,6 +168,8 @@ config DEBUG_RT_MUTEXES help This allows rt mutex semantics violations and rt mutex related deadlocks (lockups) to be detected and reported automatically. + When realtime preemption is enabled this includes spinlocks, + rwlocks, mutexes and (rw)semaphores config DEBUG_PI_LIST bool @@ -191,7 +193,7 @@ config DEBUG_SPINLOCK config DEBUG_MUTEXES bool "Mutex debugging: basic checks" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT help This feature allows mutex semantics violations to be detected and reported. 
Index: linux-rt.q/lib/Makefile =================================================================== --- linux-rt.q.orig/lib/Makefile +++ linux-rt.q/lib/Makefile @@ -24,7 +24,8 @@ obj-$(CONFIG_GENERIC_IOMAP) += iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_PREEMPT_RT) += plist.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o Index: linux-rt.q/lib/kernel_lock.c =================================================================== --- linux-rt.q.orig/lib/kernel_lock.c +++ linux-rt.q/lib/kernel_lock.c @@ -35,22 +35,25 @@ DECLARE_MUTEX(kernel_sem); * about recursion, both due to the down() and due to the enabling of * preemption. schedule() will re-check the preemption flag after * reacquiring the semaphore. + * + * Called with interrupts disabled. 
*/ int __lockfunc __reacquire_kernel_lock(void) { struct task_struct *task = current; int saved_lock_depth = task->lock_depth; + local_irq_enable(); BUG_ON(saved_lock_depth < 0); task->lock_depth = -1; - __preempt_enable_no_resched(); down(&kernel_sem); - preempt_disable(); task->lock_depth = saved_lock_depth; + local_irq_disable(); + return 0; } @@ -67,11 +70,15 @@ void __lockfunc lock_kernel(void) struct task_struct *task = current; int depth = task->lock_depth + 1; - if (likely(!depth)) + if (likely(!depth)) { /* * No recursion worries - we set up lock_depth _after_ */ down(&kernel_sem); +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = __builtin_return_address(0); +#endif + } task->lock_depth = depth; } @@ -82,8 +89,12 @@ void __lockfunc unlock_kernel(void) BUG_ON(task->lock_depth < 0); - if (likely(--task->lock_depth < 0)) + if (likely(--task->lock_depth == -1)) { +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = NULL; +#endif up(&kernel_sem); + } } #else @@ -116,11 +127,9 @@ static __cacheline_aligned_in_smp DEFIN */ int __lockfunc __reacquire_kernel_lock(void) { - while (!_raw_spin_trylock(&kernel_flag)) { - if (test_thread_flag(TIF_NEED_RESCHED)) - return -EAGAIN; - cpu_relax(); - } + local_irq_enable(); + _raw_spin_lock(&kernel_flag); + local_irq_disable(); preempt_disable(); return 0; } Index: linux-rt.q/lib/locking-selftest.c =================================================================== --- linux-rt.q.orig/lib/locking-selftest.c +++ linux-rt.q/lib/locking-selftest.c @@ -158,7 +158,7 @@ static void init_shared_classes(void) local_bh_disable(); \ local_irq_disable(); \ trace_softirq_enter(); \ - WARN_ON(!in_softirq()); + /* FIXME: preemptible softirqs. 
WARN_ON(!in_softirq()); */ #define SOFTIRQ_EXIT() \ trace_softirq_exit(); \ @@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem) #undef E /* + * FIXME: turns these into raw-spinlock tests on -rt + */ +#ifndef CONFIG_PREEMPT_RT + +/* * locking an irq-safe lock with irqs enabled: */ #define E1() \ @@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_ #include "locking-selftest-softirq.h" // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) # define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) @@ -1004,7 +1011,7 @@ static inline void print_testname(const #define DO_TESTCASE_1(desc, name, nr) \ print_testname(desc"/"#nr); \ - dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); #define DO_TESTCASE_1B(desc, name, nr) \ @@ -1012,17 +1019,17 @@ static inline void print_testname(const dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3(desc, name, nr) \ - print_testname(desc"/"#nr); \ - dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ +#define DO_TESTCASE_3(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3RW(desc, name, nr) \ - print_testname(desc"/"#nr); \ +#define DO_TESTCASE_3RW(desc, name, nr) \ + print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); @@ -1053,7 +1060,7 @@ static inline void print_testname(const print_testname(desc); \ 
dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ - dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ @@ -1185,6 +1192,7 @@ void locking_selftest(void) /* * irq-context testcases: */ +#ifndef CONFIG_PREEMPT_RT DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); @@ -1194,6 +1202,7 @@ void locking_selftest(void) DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#endif if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); Index: linux-rt.q/lib/radix-tree.c =================================================================== --- linux-rt.q.orig/lib/radix-tree.c +++ linux-rt.q/lib/radix-tree.c @@ -97,12 +97,13 @@ radix_tree_node_alloc(struct radix_tree_ if (ret == NULL && !(gfp_mask & __GFP_WAIT)) { struct radix_tree_preload *rtp; - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = &get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; } + put_cpu_var(radix_tree_preloads); } BUG_ON(radix_tree_is_direct_ptr(ret)); return ret; @@ -121,6 +122,8 @@ radix_tree_node_free(struct radix_tree_n call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } +#ifndef CONFIG_PREEMPT_RT + /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. 
On @@ -151,6 +154,7 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } +#endif static inline void tag_set(struct radix_tree_node *node, unsigned int tag, int offset) Index: linux-rt.q/lib/smp_processor_id.c =================================================================== --- linux-rt.q.orig/lib/smp_processor_id.c +++ linux-rt.q/lib/smp_processor_id.c @@ -7,7 +7,7 @@ #include #include -unsigned int debug_smp_processor_id(void) +unsigned int notrace debug_smp_processor_id(void) { unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); @@ -42,7 +42,7 @@ unsigned int debug_smp_processor_id(void if (!printk_ratelimit()) goto out_enable; - printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid); + printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count()-1, current->comm, current->pid); print_symbol("caller is %s\n", (long)__builtin_return_address(0)); dump_stack(); patches/kmap-atomic-i386-fix.patch0000664000077200007720000000270210646635216016231 0ustar mingomingo--- arch/i386/mm/highmem.c | 2 +- include/asm-i386/highmem.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) Index: linux-rt.q/arch/i386/mm/highmem.c =================================================================== --- linux-rt.q.orig/arch/i386/mm/highmem.c +++ linux-rt.q/arch/i386/mm/highmem.c @@ -3,9 +3,9 @@ void *kmap(struct page *page) { - might_sleep(); if (!PageHighMem(page)) return page_address(page); + might_sleep(); return kmap_high(page); } Index: linux-rt.q/include/asm-i386/highmem.h =================================================================== --- linux-rt.q.orig/include/asm-i386/highmem.h +++ linux-rt.q/include/asm-i386/highmem.h @@ -94,10 +94,10 @@ struct page *kmap_atomic_to_page(void *p * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap(): */ #ifdef CONFIG_PREEMPT_RT -# define kmap_atomic_prot(page, type, 
prot) kmap(page) -# define kmap_atomic(page, type) kmap(page) +# define kmap_atomic_prot(page, type, prot) ({ pagefault_disable(); kmap(page); }) +# define kmap_atomic(page, type) ({ pagefault_disable(); kmap(page); }) # define kmap_atomic_pfn(pfn, type) kmap(pfn_to_page(pfn)) -# define kunmap_atomic(kvaddr, type) kunmap_virt(kvaddr) +# define kunmap_atomic(kvaddr, type) do { pagefault_enable(); kunmap_virt(kvaddr); } while(0) # define kmap_atomic_to_page(kvaddr) kmap_to_page(kvaddr) #else # define kmap_atomic_prot(page, type, prot) __kmap_atomic_prot(page, type, prot) patches/preempt-realtime-ide.patch0000664000077200007720000002332110646635215016566 0ustar mingomingo--- drivers/ide/ide-floppy.c | 4 ++-- drivers/ide/ide-io.c | 4 ++-- drivers/ide/ide-iops.c | 26 ++++++++++++-------------- drivers/ide/ide-lib.c | 14 +++++--------- drivers/ide/ide-probe.c | 8 ++++---- drivers/ide/ide-taskfile.c | 6 +++--- drivers/ide/pci/alim15x3.c | 12 ++++++------ drivers/ide/pci/hpt366.c | 4 ++-- 8 files changed, 36 insertions(+), 42 deletions(-) Index: linux-rt.q/drivers/ide/ide-floppy.c =================================================================== --- linux-rt.q.orig/drivers/ide/ide-floppy.c +++ linux-rt.q/drivers/ide/ide-floppy.c @@ -1667,9 +1667,9 @@ static int idefloppy_get_format_progress atapi_status_t status; unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); status.all = HWIF(drive)->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); progress_indication = !status.b.dsc ? 
0 : 0x10000; } Index: linux-rt.q/drivers/ide/ide-io.c =================================================================== --- linux-rt.q.orig/drivers/ide/ide-io.c +++ linux-rt.q/drivers/ide/ide-io.c @@ -1193,7 +1193,7 @@ static void ide_do_request (ide_hwgroup_ ide_get_lock(ide_intr, hwgroup); /* caller must own ide_lock */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); while (!hwgroup->busy) { hwgroup->busy = 1; @@ -1461,7 +1461,7 @@ void ide_timer_expiry (unsigned long dat #endif /* DISABLE_IRQ_NOSYNC */ /* local CPU only, * as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwgroup->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { Index: linux-rt.q/drivers/ide/ide-iops.c =================================================================== --- linux-rt.q.orig/drivers/ide/ide-iops.c +++ linux-rt.q/drivers/ide/ide-iops.c @@ -220,10 +220,10 @@ static void ata_input_data(ide_drive_t * if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->INSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { @@ -242,10 +242,10 @@ static void ata_output_data(ide_drive_t if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->OUTSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { @@ -540,12 +540,12 @@ int ide_wait_stat (ide_startstop_t *star if (!(stat & BUSY_STAT)) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *startstop = ide_error(drive, "status timeout", stat); return 1; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it 
again. @@ -716,17 +716,15 @@ int ide_driveid_update (ide_drive_t *dri printk("%s: CHECK for good STATUS\n", drive->name); return 0; } - local_irq_save(flags); - SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); - if (!id) { - local_irq_restore(flags); + if (!id) return 0; - } + local_irq_save_nort(flags); + SELECT_MASK(drive, 0); ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ - local_irq_enable(); - local_irq_restore(flags); + local_irq_enable_nort(); + local_irq_restore_nort(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; @@ -806,7 +804,7 @@ int ide_config_drive_speed (ide_drive_t if (time_after(jiffies, timeout)) break; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* Index: linux-rt.q/drivers/ide/ide-lib.c =================================================================== --- linux-rt.q.orig/drivers/ide/ide-lib.c +++ linux-rt.q/drivers/ide/ide-lib.c @@ -376,15 +376,16 @@ int ide_set_xfer_rate(ide_drive_t *drive static void ide_dump_opcode(ide_drive_t *drive) { + unsigned long flags; struct request *rq; u8 opcode = 0; int found = 0; - spin_lock(&ide_lock); + spin_lock_irqsave(&ide_lock, flags); rq = NULL; if (HWGROUP(drive)) rq = HWGROUP(drive)->rq; - spin_unlock(&ide_lock); + spin_unlock_irqrestore(&ide_lock, flags); if (!rq) return; if (rq->cmd_type == REQ_TYPE_ATA_CMD || @@ -413,10 +414,8 @@ static void ide_dump_opcode(ide_drive_t static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) { ide_hwif_t *hwif = HWIF(drive); - unsigned long flags; u8 err = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -476,7 +475,7 @@ static u8 ide_dump_ata_status(ide_drive_ printk("\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return err; } @@ -491,14 +490,11 @@ static u8 ide_dump_ata_status(ide_drive_ static u8 ide_dump_atapi_status(ide_drive_t 
*drive, const char *msg, u8 stat) { - unsigned long flags; - atapi_status_t status; atapi_error_t error; status.all = stat; error.all = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); @@ -524,7 +520,7 @@ static u8 ide_dump_atapi_status(ide_driv printk("}\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return error.all; } Index: linux-rt.q/drivers/ide/ide-probe.c =================================================================== --- linux-rt.q.orig/drivers/ide/ide-probe.c +++ linux-rt.q/drivers/ide/ide-probe.c @@ -141,7 +141,7 @@ static inline void do_identify (ide_driv hwif->ata_input_data(drive, id, SECTOR_WORDS); drive->id_read = 1; - local_irq_enable(); + local_irq_enable_nort(); ide_fix_driveid(id); #if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) @@ -323,14 +323,14 @@ static int actual_try_to_identify (ide_d unsigned long flags; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* drive returned ID */ do_identify(drive, cmd); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ (void) hwif->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { /* drive refused ID */ rc = 2; @@ -807,7 +807,7 @@ static void probe_hwif(ide_hwif_t *hwif, } while ((stat & BUSY_STAT) && time_after(timeout, jiffies)); } - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * Use cached IRQ number. It might be (and is...) 
changed by probe * code above Index: linux-rt.q/drivers/ide/ide-taskfile.c =================================================================== --- linux-rt.q.orig/drivers/ide/ide-taskfile.c +++ linux-rt.q/drivers/ide/ide-taskfile.c @@ -278,7 +278,7 @@ static void ide_pio_sector(ide_drive_t * offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save_nort(flags); #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -298,7 +298,7 @@ static void ide_pio_sector(ide_drive_t * kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore_nort(flags); #endif } @@ -464,7 +464,7 @@ ide_startstop_t pre_task_out_intr (ide_d } if (!drive->unmask) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); ide_pio_datablock(drive, rq, 1); Index: linux-rt.q/drivers/ide/pci/alim15x3.c =================================================================== --- linux-rt.q.orig/drivers/ide/pci/alim15x3.c +++ linux-rt.q/drivers/ide/pci/alim15x3.c @@ -325,7 +325,7 @@ static u8 ali15x3_tune_pio (ide_drive_t if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); + local_irq_save_nort(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -347,7 +347,7 @@ static u8 ali15x3_tune_pio (ide_drive_t pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * setup active rec @@ -518,7 +518,7 @@ static unsigned int __devinit init_chips } #endif /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_IDE_PROC_FS) */ - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -579,7 +579,7 @@ static unsigned int __devinit init_chips out: pci_dev_put(north); pci_dev_put(isa_dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -603,7 +603,7 @@ static unsigned int __devinit ata66_ali1 unsigned long 
flags; u8 tmpbyte; - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision >= 0xC2) { /* @@ -655,7 +655,7 @@ static unsigned int __devinit ata66_ali1 pci_write_config_byte(dev, 0x53, tmpbyte); - local_irq_restore(flags); + local_irq_restore_nort(flags); return(ata66); } Index: linux-rt.q/drivers/ide/pci/hpt366.c =================================================================== --- linux-rt.q.orig/drivers/ide/pci/hpt366.c +++ linux-rt.q/drivers/ide/pci/hpt366.c @@ -1371,7 +1371,7 @@ static void __devinit init_dma_hpt366(id dma_old = hwif->INB(dmabase + 2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma); @@ -1382,7 +1382,7 @@ static void __devinit init_dma_hpt366(id if (dma_new != dma_old) hwif->OUTB(dma_new, dmabase + 2); - local_irq_restore(flags); + local_irq_restore_nort(flags); ide_setup_dma(hwif, dmabase, 8); } patches/preempt-realtime-net.patch0000664000077200007720000004751510646635215016626 0ustar mingomingo--- include/linux/netdevice.h | 8 ++--- include/net/dn_dev.h | 6 ++-- include/net/sock.h | 4 +- net/core/dev.c | 49 +++++++++++++++++++++++++++----- net/core/netpoll.c | 60 ++++++++++++++++++++++++++-------------- net/core/sock.c | 2 - net/decnet/dn_dev.c | 44 ++++++++++++++--------------- net/ipv4/icmp.c | 5 ++- net/ipv4/route.c | 4 +- net/ipv6/netfilter/ip6_tables.c | 4 +- net/sched/sch_generic.c | 29 ++++++++++++++++--- net/unix/af_unix.c | 3 +- 12 files changed, 146 insertions(+), 72 deletions(-) Index: linux-rt.q/include/linux/netdevice.h =================================================================== --- linux-rt.q.orig/include/linux/netdevice.h +++ linux-rt.q/include/linux/netdevice.h @@ -903,7 +903,7 @@ static inline int netif_rx_reschedule(st local_irq_save(flags); list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); 
local_irq_restore(flags); return 1; } @@ -951,20 +951,20 @@ static inline void netif_poll_enable(str static inline void netif_tx_lock(struct net_device *dev) { spin_lock(&dev->_xmit_lock); - dev->xmit_lock_owner = smp_processor_id(); + dev->xmit_lock_owner = raw_smp_processor_id(); } static inline void netif_tx_lock_bh(struct net_device *dev) { spin_lock_bh(&dev->_xmit_lock); - dev->xmit_lock_owner = smp_processor_id(); + dev->xmit_lock_owner = raw_smp_processor_id(); } static inline int netif_tx_trylock(struct net_device *dev) { int ok = spin_trylock(&dev->_xmit_lock); if (likely(ok)) - dev->xmit_lock_owner = smp_processor_id(); + dev->xmit_lock_owner = raw_smp_processor_id(); return ok; } Index: linux-rt.q/include/net/dn_dev.h =================================================================== --- linux-rt.q.orig/include/net/dn_dev.h +++ linux-rt.q/include/net/dn_dev.h @@ -76,9 +76,9 @@ struct dn_dev_parms { int priority; /* Priority to be a router */ char *name; /* Name for sysctl */ int ctl_name; /* Index for sysctl */ - int (*up)(struct net_device *); - void (*down)(struct net_device *); - void (*timer3)(struct net_device *, struct dn_ifaddr *ifa); + int (*dn_up)(struct net_device *); + void (*dn_down)(struct net_device *); + void (*dn_timer3)(struct net_device *, struct dn_ifaddr *ifa); void *sysctl; }; Index: linux-rt.q/include/net/sock.h =================================================================== --- linux-rt.q.orig/include/net/sock.h +++ linux-rt.q/include/net/sock.h @@ -625,12 +625,12 @@ static inline void sk_refcnt_debug_relea /* Called with local bh disabled */ static __inline__ void sock_prot_inc_use(struct proto *prot) { - prot->stats[smp_processor_id()].inuse++; + prot->stats[raw_smp_processor_id()].inuse++; } static __inline__ void sock_prot_dec_use(struct proto *prot) { - prot->stats[smp_processor_id()].inuse--; + prot->stats[raw_smp_processor_id()].inuse--; } /* With per-bucket locks this operation is not-atomic, so that Index: 
linux-rt.q/net/core/dev.c =================================================================== --- linux-rt.q.orig/net/core/dev.c +++ linux-rt.q/net/core/dev.c @@ -1440,7 +1440,7 @@ out_kfree_skb: return 0; } -#define HARD_TX_LOCK(dev, cpu) { \ +#define HARD_TX_LOCK(dev) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ netif_tx_lock(dev); \ } \ @@ -1568,11 +1568,17 @@ gso: Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ + /* + * No need to check for recursion with threaded interrupts: + */ +#ifdef CONFIG_PREEMPT_RT + if (1) { +#else + int cpu = raw_smp_processor_id(); /* ok because BHs are off */ if (dev->xmit_lock_owner != cpu) { - - HARD_TX_LOCK(dev, cpu); +#endif + HARD_TX_LOCK(dev); if (!netif_queue_stopped(dev)) { rc = 0; @@ -1707,7 +1713,8 @@ static inline struct net_device *skb_bon static void net_tx_action(struct softirq_action *h) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = &per_cpu(softnet_data, + raw_smp_processor_id()); if (sd->completion_queue) { struct sk_buff *clist; @@ -1723,6 +1730,11 @@ static void net_tx_action(struct softirq BUG_TRAP(!atomic_read(&skb->users)); __kfree_skb(skb); + /* + * Safe to reschedule - the list is private + * at this point. + */ + cond_resched_softirq_context(); } } @@ -1741,12 +1753,27 @@ static void net_tx_action(struct softirq smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_SCHED, &dev->state); + /* + * We are executing in softirq context here, and + * if softirqs are preemptible, we must avoid + * infinite reactivation of the softirq by + * either the tx handler, or by netif_schedule(). + * (it would result in an infinitely looping + * softirq context) + * So we take the spinlock unconditionally. 
+ */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + spin_lock(&dev->queue_lock); + qdisc_run(dev); + spin_unlock(&dev->queue_lock); +#else if (spin_trylock(&dev->queue_lock)) { qdisc_run(dev); spin_unlock(&dev->queue_lock); } else { netif_schedule(dev); } +#endif } } } @@ -1853,7 +1880,7 @@ int netif_receive_skb(struct sk_buff *sk if (!orig_dev) return NET_RX_DROP; - __get_cpu_var(netdev_rx_stat).total++; + per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++; skb_reset_network_header(skb); skb_reset_transport_header(skb); @@ -1930,9 +1957,10 @@ static int process_backlog(struct net_de { int work = 0; int quota = min(backlog_dev->quota, *budget); - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue; unsigned long start_time = jiffies; + queue = &per_cpu(softnet_data, raw_smp_processor_id()); backlog_dev->weight = weight_p; for (;;) { struct sk_buff *skb; @@ -1975,12 +2003,13 @@ job_done: static void net_rx_action(struct softirq_action *h) { - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue; unsigned long start_time = jiffies; int budget = netdev_budget; void *have; local_irq_disable(); + queue = &__get_cpu_var(softnet_data); while (!list_empty(&queue->poll_list)) { struct net_device *dev; @@ -1989,6 +2018,10 @@ static void net_rx_action(struct softirq goto softnet_break; local_irq_enable(); + if (unlikely(cond_resched_softirq_context())) { + local_irq_disable(); + continue; + } dev = list_entry(queue->poll_list.next, struct net_device, poll_list); Index: linux-rt.q/net/core/netpoll.c =================================================================== --- linux-rt.q.orig/net/core/netpoll.c +++ linux-rt.q/net/core/netpoll.c @@ -64,19 +64,19 @@ static void queue_process(struct work_st continue; } - local_irq_save(flags); + local_irq_save_nort(flags); netif_tx_lock(dev); if (netif_queue_stopped(dev) || dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); 
netif_tx_unlock(dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } netif_tx_unlock(dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } } @@ -121,7 +121,7 @@ static void poll_napi(struct netpoll *np int budget = 16; if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) && - npinfo->poll_owner != smp_processor_id() && + npinfo->poll_owner != raw_smp_processor_id() && spin_trylock(&npinfo->poll_lock)) { npinfo->rx_flags |= NETPOLL_RX_DROP; atomic_inc(&trapped); @@ -155,7 +155,9 @@ void netpoll_poll(struct netpoll *np) return; /* Process pending work on NIC */ +// WARN_ON_RT(irqs_disabled()); np->dev->poll_controller(np->dev); +// WARN_ON_RT(irqs_disabled()); if (np->dev->poll) poll_napi(np); @@ -182,28 +184,31 @@ static void refill_skbs(void) static void zap_completion_queue(void) { - unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); + struct sk_buff *clist = NULL; + unsigned long flags; if (sd->completion_queue) { - struct sk_buff *clist; - local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); - - while (clist != NULL) { - struct sk_buff *skb = clist; - clist = clist->next; - if (skb->destructor) - dev_kfree_skb_any(skb); /* put this one back */ - else - __kfree_skb(skb); - } } + /* + * Took the list private, can drop our softnet + * reference: + */ put_cpu_var(softnet_data); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if (skb->destructor) + dev_kfree_skb_any(skb); /* put this one back */ + else + __kfree_skb(skb); + } } static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) @@ -211,13 +216,26 @@ static struct sk_buff *find_skb(struct n int count = 0; struct sk_buff *skb; +#ifdef CONFIG_PREEMPT_RT + /* + * On -rt skb_pool.lock is schedulable, so if we are + * in an atomic context we just try to dequeue from the + * pool and fail if we cannot 
get one. + */ + if (in_atomic() || irqs_disabled()) + goto pick_atomic; +#endif zap_completion_queue(); refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) + if (!skb) { +#ifdef CONFIG_PREEMPT_RT +pick_atomic: +#endif skb = skb_dequeue(&skb_pool); + } if (!skb) { if (++count < 10) { @@ -246,10 +264,10 @@ static void netpoll_send_skb(struct netp /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && - npinfo->poll_owner != smp_processor_id()) { + npinfo->poll_owner != raw_smp_processor_id()) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { @@ -268,7 +286,7 @@ static void netpoll_send_skb(struct netp udelay(USEC_PER_POLL); } - local_irq_restore(flags); + local_irq_restore_nort(flags); } if (status != NETDEV_TX_OK) { @@ -696,7 +714,7 @@ int netpoll_setup(struct netpoll *np) np->name); break; } - cond_resched(); + schedule_timeout_uninterruptible(1); } /* If carrier appears to come up instantly, we don't Index: linux-rt.q/net/core/sock.c =================================================================== --- linux-rt.q.orig/net/core/sock.c +++ linux-rt.q/net/core/sock.c @@ -1446,7 +1446,7 @@ static void sock_def_readable(struct soc { read_lock(&sk->sk_callback_lock); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk,1,POLL_IN); read_unlock(&sk->sk_callback_lock); } Index: linux-rt.q/net/decnet/dn_dev.c =================================================================== --- linux-rt.q.orig/net/decnet/dn_dev.c +++ linux-rt.q/net/decnet/dn_dev.c @@ -89,9 +89,9 @@ static struct dn_dev_parms dn_dev_list[] .t3 = 10, .name = "ethernet", .ctl_name = NET_DECNET_CONF_ETHER, - .up = dn_eth_up, - .down = dn_eth_down, - .timer3 = dn_send_brd_hello, + .dn_up = dn_eth_up, + .dn_down = dn_eth_down, 
+ .dn_timer3 = dn_send_brd_hello, }, { .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */ @@ -101,7 +101,7 @@ static struct dn_dev_parms dn_dev_list[] .t3 = 10, .name = "ipgre", .ctl_name = NET_DECNET_CONF_GRE, - .timer3 = dn_send_brd_hello, + .dn_timer3 = dn_send_brd_hello, }, #if 0 { @@ -112,7 +112,7 @@ static struct dn_dev_parms dn_dev_list[] .t3 = 120, .name = "x25", .ctl_name = NET_DECNET_CONF_X25, - .timer3 = dn_send_ptp_hello, + .dn_timer3 = dn_send_ptp_hello, }, #endif #if 0 @@ -124,7 +124,7 @@ static struct dn_dev_parms dn_dev_list[] .t3 = 10, .name = "ppp", .ctl_name = NET_DECNET_CONF_PPP, - .timer3 = dn_send_brd_hello, + .dn_timer3 = dn_send_brd_hello, }, #endif { @@ -135,7 +135,7 @@ static struct dn_dev_parms dn_dev_list[] .t3 = 120, .name = "ddcmp", .ctl_name = NET_DECNET_CONF_DDCMP, - .timer3 = dn_send_ptp_hello, + .dn_timer3 = dn_send_ptp_hello, }, { .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */ @@ -145,7 +145,7 @@ static struct dn_dev_parms dn_dev_list[] .t3 = 10, .name = "loopback", .ctl_name = NET_DECNET_CONF_LOOPBACK, - .timer3 = dn_send_brd_hello, + .dn_timer3 = dn_send_brd_hello, } }; @@ -326,11 +326,11 @@ static int dn_forwarding_proc(ctl_table */ tmp = dn_db->parms.forwarding; dn_db->parms.forwarding = old; - if (dn_db->parms.down) - dn_db->parms.down(dev); + if (dn_db->parms.dn_down) + dn_db->parms.dn_down(dev); dn_db->parms.forwarding = tmp; - if (dn_db->parms.up) - dn_db->parms.up(dev); + if (dn_db->parms.dn_up) + dn_db->parms.dn_up(dev); } return err; @@ -364,11 +364,11 @@ static int dn_forwarding_sysctl(ctl_tabl if (value > 2) return -EINVAL; - if (dn_db->parms.down) - dn_db->parms.down(dev); + if (dn_db->parms.dn_down) + dn_db->parms.dn_down(dev); dn_db->parms.forwarding = value; - if (dn_db->parms.up) - dn_db->parms.up(dev); + if (dn_db->parms.dn_up) + dn_db->parms.dn_up(dev); } return 0; @@ -1090,10 +1090,10 @@ static void dn_dev_timer_func(unsigned l struct dn_ifaddr *ifa; if (dn_db->t3 <= dn_db->parms.t2) { 
- if (dn_db->parms.timer3) { + if (dn_db->parms.dn_timer3) { for(ifa = dn_db->ifa_list; ifa; ifa = ifa->ifa_next) { if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - dn_db->parms.timer3(dev, ifa); + dn_db->parms.dn_timer3(dev, ifa); } } dn_db->t3 = dn_db->parms.t3; @@ -1152,8 +1152,8 @@ struct dn_dev *dn_dev_create(struct net_ return NULL; } - if (dn_db->parms.up) { - if (dn_db->parms.up(dev) < 0) { + if (dn_db->parms.dn_up) { + if (dn_db->parms.dn_up(dev) < 0) { neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); dev->dn_ptr = NULL; kfree(dn_db); @@ -1247,8 +1247,8 @@ static void dn_dev_delete(struct net_dev dn_dev_check_default(dev); neigh_ifdown(&dn_neigh_table, dev); - if (dn_db->parms.down) - dn_db->parms.down(dev); + if (dn_db->parms.dn_down) + dn_db->parms.dn_down(dev); dev->dn_ptr = NULL; Index: linux-rt.q/net/ipv4/icmp.c =================================================================== --- linux-rt.q.orig/net/ipv4/icmp.c +++ linux-rt.q/net/ipv4/icmp.c @@ -230,7 +230,10 @@ static const struct icmp_control icmp_po * On SMP we have one ICMP socket per-cpu. */ static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL; -#define icmp_socket __get_cpu_var(__icmp_socket) +/* + * Should be safe on PREEMPT_SOFTIRQS/HARDIRQS to use raw-smp-processor-id: + */ +#define icmp_socket per_cpu(__icmp_socket, raw_smp_processor_id()) static __inline__ int icmp_xmit_lock(void) { Index: linux-rt.q/net/ipv4/route.c =================================================================== --- linux-rt.q.orig/net/ipv4/route.c +++ linux-rt.q/net/ipv4/route.c @@ -206,13 +206,13 @@ struct rt_hash_bucket { struct rtable *chain; }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) + defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT) /* * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks * The size of this table is a power of two and depends on the number of CPUS. 
* (on lockdep we have a quite big spinlock_t, so keep the size down there) */ -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) # define RT_HASH_LOCK_SZ 256 #else # if NR_CPUS >= 32 Index: linux-rt.q/net/ipv6/netfilter/ip6_tables.c =================================================================== --- linux-rt.q.orig/net/ipv6/netfilter/ip6_tables.c +++ linux-rt.q/net/ipv6/netfilter/ip6_tables.c @@ -273,7 +273,7 @@ ip6t_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); private = table->private; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)private->entries[smp_processor_id()]; + table_base = (void *)private->entries[raw_smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -1088,7 +1088,7 @@ do_add_counters(void __user *user, unsig i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + loc_cpu_entry = private->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, Index: linux-rt.q/net/sched/sch_generic.c =================================================================== --- linux-rt.q.orig/net/sched/sch_generic.c +++ linux-rt.q/net/sched/sch_generic.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include #include @@ -99,8 +101,10 @@ static inline int qdisc_restart(struct n * will be requeued. */ if (!nolock) { +#ifdef CONFIG_PREEMPT_RT + netif_tx_lock(dev); +#else if (!netif_tx_trylock(dev)) { - collision: /* So, someone grabbed the driver. */ /* It may be transient configuration error, @@ -108,7 +112,7 @@ static inline int qdisc_restart(struct n it by checking xmit owner and drop the packet when deadloop is detected. 
*/ - if (dev->xmit_lock_owner == smp_processor_id()) { + if (dev->xmit_lock_owner == raw_smp_processor_id()) { kfree_skb(skb); if (net_ratelimit()) printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); @@ -117,6 +121,7 @@ static inline int qdisc_restart(struct n __get_cpu_var(netdev_rx_stat).cpu_collision++; goto requeue; } +#endif } { @@ -126,7 +131,15 @@ static inline int qdisc_restart(struct n if (!netif_queue_stopped(dev)) { int ret; + WARN_ON_RT(irqs_disabled()); ret = dev_hard_start_xmit(skb, dev); +#ifdef CONFIG_PREEMPT_RT + if (irqs_disabled()) { + if (printk_ratelimit()) + print_symbol("network driver disabled raw interrupts: %s\n", (unsigned long)dev->hard_start_xmit); + local_irq_enable(); + } +#endif if (ret == NETDEV_TX_OK) { if (!nolock) { netif_tx_unlock(dev); @@ -138,7 +151,10 @@ static inline int qdisc_restart(struct n if (ret == NETDEV_TX_LOCKED && nolock) { spin_lock(&dev->queue_lock); q = dev->qdisc; - goto collision; + preempt_disable(); + __get_cpu_var(netdev_rx_stat).cpu_collision++; + preempt_enable(); + goto requeue; } } @@ -565,9 +581,12 @@ void dev_deactivate(struct net_device *d /* Wait for outstanding dev_queue_xmit calls. */ synchronize_rcu(); - /* Wait for outstanding qdisc_run calls. */ + /* + * Wait for outstanding qdisc_run calls. + * TODO: shouldnt this be wakeup-based, instead of polling it? 
+ */ while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state)) - yield(); + msleep(1); } void dev_init_scheduler(struct net_device *dev) Index: linux-rt.q/net/unix/af_unix.c =================================================================== --- linux-rt.q.orig/net/unix/af_unix.c +++ linux-rt.q/net/unix/af_unix.c @@ -307,10 +307,11 @@ static void unix_write_space(struct sock read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, 2, POLL_OUT); } read_unlock(&sk->sk_callback_lock); + preempt_check_resched_delayed(); } /* When dgram socket disconnects (or changes its peer), we clear its receive patches/latency-measurement-drivers.patch0000664000077200007720000004530310646635213020213 0ustar mingomingo this patch adds: - histogram support to /dev/rtc - the /dev/blocker lock-latency test-device - the /dev/lpptest parallel-port irq latency test-device drivers/char/Kconfig | 40 ++++++++++ drivers/char/Makefile | 3 drivers/char/blocker.c | 108 +++++++++++++++++++++++++++++ drivers/char/lpptest.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/char/rtc.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++++- scripts/Makefile | 3 scripts/testlpp.c | 159 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 668 insertions(+), 2 deletions(-) Index: linux-rt.q/drivers/char/Kconfig =================================================================== --- linux-rt.q.orig/drivers/char/Kconfig +++ linux-rt.q/drivers/char/Kconfig @@ -791,6 +791,46 @@ config RTC To compile this driver as a module, choose M here: the module will be called rtc. 
+config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. + +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + depends on X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT && X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. + + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + to use the device, both the target and the source system needs to + run a kernel with CONFIG_LPPTEST enabled. To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + then generate various workloads on the target system to see how + (worst-case-) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 Index: linux-rt.q/drivers/char/Makefile =================================================================== --- linux-rt.q.orig/drivers/char/Makefile +++ linux-rt.q/drivers/char/Makefile @@ -94,6 +94,9 @@ obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu. 
obj-$(CONFIG_GPIO_TB0219) += tb0219.o obj-$(CONFIG_TELCLOCK) += tlclk.o +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o + obj-$(CONFIG_WATCHDOG) += watchdog/ obj-$(CONFIG_MWAVE) += mwave/ obj-$(CONFIG_AGP) += agp/ Index: linux-rt.q/drivers/char/blocker.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/char/blocker.c @@ -0,0 +1,108 @@ +/* + * priority inheritance testing device + */ + +#include +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define BLOCKER_MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= BLOCKER_MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + 
for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux-rt.q/drivers/char/lpptest.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/char/lpptest.c @@ -0,0 +1,178 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define IRQF_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. 
+ */ +static int lpptest_irq (int irq, void *dev_id) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. 
Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux-rt.q/drivers/char/rtc.c =================================================================== --- linux-rt.q.orig/drivers/char/rtc.c +++ linux-rt.q/drivers/char/rtc.c @@ -93,6 +93,32 @@ #include #endif +#ifdef CONFIG_MIPS +# include +#endif + +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. 
waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + static unsigned long rtc_port; static int rtc_irq = PCI_IRQ_NONE; #endif @@ -225,7 +251,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. 
+ * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. 
It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -269,9 +434,9 @@ irqreturn_t rtc_interrupt(int irq, void if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -381,6 +546,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count == sizeof(unsigned int)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -613,6 +780,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -622,6 +794,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -720,6 +893,7 @@ static int rtc_open(struct inode *inode, if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -775,6 +949,7 @@ no_irq: rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } Index: linux-rt.q/scripts/Makefile =================================================================== --- linux-rt.q.orig/scripts/Makefile +++ linux-rt.q/scripts/Makefile @@ -13,6 +13,9 @@ hostprogs-$(CONFIG_LOGO) += pnmt hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(CONFIG_PROM_CONSOLE) += conmakehash hostprogs-$(CONFIG_IKCONFIG) += bin2c +ifdef CONFIG_LPPTEST +hostprogs-y += testlpp +endif always := $(hostprogs-y) 
$(hostprogs-m) Index: linux-rt.q/scripts/testlpp.c =================================================================== --- /dev/null +++ linux-rt.q/scripts/testlpp.c @@ -0,0 +1,159 @@ +/* + * testlpp.c: use the /dev/lpptest device to test IRQ handling + * latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner + * + * licensed under the GPL + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +#define HIST_SIZE 10000 + +static int hist_total; +static unsigned long hist[HIST_SIZE]; + +static void hist_hit(unsigned long usecs) +{ + hist_total++; + if (usecs >= HIST_SIZE-1) + hist[HIST_SIZE-1]++; + else + hist[usecs]++; +} + +static void print_hist(void) +{ + int i; + + printf("LPP latency histogram:\n"); + + for (i = 0; i < HIST_SIZE; i++) { + if (hist[i]) + printf("%3d usecs: %9ld\n", i, hist[i]); + } +} + +static inline unsigned long long int rdtsc(void) +{ + unsigned long long int x, y; + for (;;) { + __asm__ volatile ("rdtsc" : "=A" (x)); + __asm__ volatile ("rdtsc" : "=A" (y)); + if (y - x < 1000) + return y; + } +} + +static unsigned long long calibrate_loop(void) +{ + unsigned long long mytime1, mytime2; + + mytime1 = rdtsc(); + usleep(500000); + mytime2 = rdtsc(); + + return (mytime2 - mytime1) * 2; +} + +#define time_to_usecs(time) ((double)time*1000000.0/(double)cycles_per_sec) + +#define time_to_usecs_l(time) (long)(time*1000000/cycles_per_sec) + +int fd, total; +unsigned long long tim, sum_tim, min_tim = -1ULL, max_tim, cycles_per_sec; + +void cleanup(int sig) +{ + ioctl (fd, LPPTEST_ENABLE, &tim); + if (sig) + printf("[ interrupted - exiting ]\n"); + printf("\ntotal number of responses: 
%d\n", total); + printf("average reponse latency: %.2lf usecs\n", + time_to_usecs(sum_tim/total)); + printf("minimum latency: %.2lf usecs\n", + time_to_usecs(min_tim)); + printf("maximum latency: %.2lf usecs\n", + time_to_usecs(max_tim)); + print_hist(); + exit(0); +} + +#define HZ 3000 + +int main (int argc, char **argv) +{ + unsigned int nr_requests = 0; + + if (argc > 2) { + fprintf(stderr, "usage: testlpp []\n"); + exit(-1); + } + if (argc == 2) + nr_requests = atol(argv[1]); + + if (getuid() != 0) { + fprintf(stderr, "need to run as root!\n"); + exit(-1); + } + mknod("/dev/lpptest", S_IFCHR|0666, makedev(245, 1)); + + fd = open("/dev/lpptest", O_RDWR); + if (fd == -1) { + fprintf(stderr, "could not open /dev/lpptest, your kernel doesnt have CONFIG_LPPTEST enabled?\n"); + exit(-1); + } + + signal(SIGINT,&cleanup); + + ioctl (fd, LPPTEST_DISABLE, &tim); + + fprintf(stderr, "calibrating cycles to usecs: "); + cycles_per_sec = calibrate_loop(); + fprintf(stderr, "%lld cycles per usec\n", cycles_per_sec/1000000); + if (nr_requests) + fprintf(stderr, "[max # of requests: %u]\n", nr_requests); + fprintf(stderr, "starting %dHz test, hit Ctrl-C to stop:\n\n", HZ); + + while(1) { + ioctl (fd, LPPTEST_TEST, &tim); + if (tim == 0) + printf ("No response from target.\n"); + else { + hist_hit(time_to_usecs_l(tim)); + if (tim > max_tim) { + printf ("new max latency: %.2lf usecs (%Ld cycles)\n", time_to_usecs(tim), tim); + max_tim = tim; + } + if (tim < min_tim) + min_tim = tim; + sum_tim += tim; /* add the sample before the exit check so the last one is averaged too */ + total++; + if (total == nr_requests) + break; + } + usleep(1000000/HZ); + } + cleanup(0); + + return 0; +} + + patches/x86_64-use-i386-i8253-h.patch0000664000077200007720000000314610646635211015766 0ustar mingomingoSubject: i386: prepare sharing the PIT code PIT clock events work already and the PIT handling is the same for i386 and x86_64. x86_64 does not support PIT as a clock source, so disable the PIT clocksource for x86_64. 
Use the i386 i8253.h include file for x86_64 as well to share the exports and the PIT constants. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/time.c | 4 ---- include/asm-x86_64/i8253.h | 8 ++------ 2 files changed, 2 insertions(+), 10 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/time.c +++ linux-rt.q/arch/x86_64/kernel/time.c @@ -32,7 +32,6 @@ #include /* for PM timer frequency */ #include #endif -#include #include #include #include @@ -291,9 +290,6 @@ static unsigned int __init tsc_calibrate return pmc_now * tsc_khz / (tsc_now - tsc_start); } -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 - static void __pit_init(int val, u8 mode) { unsigned long flags; Index: linux-rt.q/include/asm-x86_64/i8253.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/i8253.h +++ linux-rt.q/include/asm-x86_64/i8253.h @@ -1,6 +1,2 @@ -#ifndef __ASM_I8253_H__ -#define __ASM_I8253_H__ - -extern spinlock_t i8253_lock; - -#endif /* __ASM_I8253_H__ */ +#include +#include patches/x86_64-hpet-tsc-calibration-fix-broken-smi-detection-logic.patch0000664000077200007720000000250110646635210025412 0ustar mingomingoSubject: X86_64: hpet tsc calibration fix broken smi detection logic The current SMI detection logic in read_hpet_tsc() makes sure, that when a SMI happens between the read of the HPET counter and the read of the TSC, this wrong value is used for TSC calibration. This is not the intention of the function. The comparison must ensure, that we do _NOT_ use such a value. Fix the check to use calibration values where delta of the two TSC reads is smaller than a reasonable threshold. 
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/hpet.c +++ linux-rt.q/arch/x86_64/kernel/hpet.c @@ -190,7 +190,7 @@ int hpet_reenable(void) */ #define TICK_COUNT 100000000 -#define TICK_MIN 5000 +#define SMI_THRESHOLD 50000 #define MAX_TRIES 5 /* @@ -205,7 +205,7 @@ static void __init read_hpet_tsc(int *hp tsc1 = get_cycles_sync(); hpet1 = hpet_readl(HPET_COUNTER); tsc2 = get_cycles_sync(); - if (tsc2 - tsc1 > TICK_MIN) + if ((tsc2 - tsc1) < SMI_THRESHOLD) break; } *hpet = hpet1; patches/preempt-realtime-net-drivers.patch0000664000077200007720000000103310646635215020263 0ustar mingomingo--- drivers/net/tulip/tulip_core.c | 1 + 1 file changed, 1 insertion(+) Index: linux-rt.q/drivers/net/tulip/tulip_core.c =================================================================== --- linux-rt.q.orig/drivers/net/tulip/tulip_core.c +++ linux-rt.q/drivers/net/tulip/tulip_core.c @@ -1803,6 +1803,7 @@ static void __devexit tulip_remove_one ( pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ patches/preempt-realtime-arm-ixp4xx.patch0000664000077200007720000000114010646635214020040 0ustar mingomingo--- arch/arm/mach-ixp4xx/common-pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/arm/mach-ixp4xx/common-pci.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-ixp4xx/common-pci.c +++ linux-rt.q/arch/arm/mach-ixp4xx/common-pci.c @@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0; * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. 
*/ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space patches/rcu-3.patch0000664000077200007720000004772710646635213013522 0ustar mingomingoFrom: Paul McKenney This patch implements a new version of RCU which allows its read-side critical sections to be preempted. It uses a set of counter pairs to keep track of the read-side critical sections and flips them when all tasks exit their read-side critical sections. The details of this implementation can be found in this paper - http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf This patch was developed as a part of the -rt kernel development and is meant to provide better latencies when read-side critical sections of RCU don't disable preemption. As a consequence of keeping track of RCU readers, the readers have a slight overhead (optimizations in the paper). This implementation co-exists with the "classic" RCU implementations and can be selected at compile time. Signed-off-by: Paul McKenney Signed-off-by: Dipankar Sarma include/linux/rcupdate.h | 5 include/linux/rcupreempt.h | 66 ++++++ include/linux/sched.h | 6 kernel/Kconfig.preempt | 37 +++ kernel/Makefile | 4 kernel/rcupreempt.c | 464 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 581 insertions(+), 1 deletion(-) Index: linux-rt.q/include/linux/rcupdate.h =================================================================== --- linux-rt.q.orig/include/linux/rcupdate.h +++ linux-rt.q/include/linux/rcupdate.h @@ -41,7 +41,12 @@ #include #include #include + +#ifdef CONFIG_CLASSIC_RCU #include +#else +#include +#endif /** * struct rcu_head - callback structure for use with RCU Index: linux-rt.q/include/linux/rcupreempt.h =================================================================== --- /dev/null +++ linux-rt.q/include/linux/rcupreempt.h @@ -0,0 +1,66 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (RT implementation) + * + * This program is free software; you can
redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rcupdate.html + * + */ + +#ifndef __LINUX_RCUPREEMPT_H +#define __LINUX_RCUPREEMPT_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#define rcu_qsctr_inc(cpu) +#define rcu_bh_qsctr_inc(cpu) +#define call_rcu_bh(head, rcu) call_rcu(head, rcu) + +extern void __rcu_read_lock(void); +extern void __rcu_read_unlock(void); +extern int rcu_pending(int cpu); + +#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); } +#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); } + +#define __rcu_read_lock_nesting() (current->rcu_read_lock_nesting) + +extern void __synchronize_sched(void); + +extern void __rcu_init(void); +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_restart_cpu(int cpu); +extern long rcu_batches_completed(void); + +#endif /* __KERNEL__ */ +#endif /* 
__LINUX_RCUPREEMPT_H */ Index: linux-rt.q/include/linux/sched.h =================================================================== --- linux-rt.q.orig/include/linux/sched.h +++ linux-rt.q/include/linux/sched.h @@ -1018,6 +1018,12 @@ struct task_struct { unsigned int time_slice; struct sched_class *sched_class; +#ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; + atomic_t *rcu_flipctr1; + atomic_t *rcu_flipctr2; +#endif + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif Index: linux-rt.q/kernel/Kconfig.preempt =================================================================== --- linux-rt.q.orig/kernel/Kconfig.preempt +++ linux-rt.q/kernel/Kconfig.preempt @@ -63,3 +63,40 @@ config PREEMPT_BKL Say Y here if you are building a kernel for a desktop system. Say N if you are unsure. +choice + prompt "RCU implementation type:" + default CLASSIC_RCU + +config CLASSIC_RCU + bool "Classic RCU" + help + This option selects the classic RCU implementation that is + designed for best read-side performance on non-realtime + systems. + + Say Y if you are unsure. + +config PREEMPT_RCU + bool "Preemptible RCU" + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + + Say N if you are unsure. + +endchoice + +config RCU_STATS + bool "/proc stats for preemptible RCU read-side critical sections" + depends on PREEMPT_RCU + default y + help + This option provides /proc stats to provide debugging info for + the preemptible realtime RCU implementation. + + Say Y here if you want to see RCU stats in /proc + Say N if you are unsure. 
Index: linux-rt.q/kernel/Makefile =================================================================== --- linux-rt.q.orig/kernel/Makefile +++ linux-rt.q/kernel/Makefile @@ -6,7 +6,7 @@ obj-y = sched.o fork.o exec_domain.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ - rcupdate.o rcuclassic.o extable.o params.o posix-timers.o \ + extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o @@ -51,6 +51,8 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_CLASSIC_RCU) += rcupdate.o rcuclassic.o +obj-$(CONFIG_PREEMPT_RCU) += rcupdate.o rcupreempt.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_UTS_NS) += utsname.o Index: linux-rt.q/kernel/rcupreempt.c =================================================================== --- /dev/null +++ linux-rt.q/kernel/rcupreempt.c @@ -0,0 +1,464 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, realtime implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2001 + * + * Authors: Paul E. McKenney + * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar + * for pushing me away from locks and towards counters. + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * PREEMPT_RCU data structures. + */ + +struct rcu_data { + spinlock_t lock; + long completed; /* Number of last completed batch. */ + struct tasklet_struct rcu_tasklet; + struct rcu_head *nextlist; + struct rcu_head **nexttail; + struct rcu_head *waitlist; + struct rcu_head **waittail; + struct rcu_head *donelist; + struct rcu_head **donetail; +#ifdef CONFIG_RCU_STATS + long n_next_length; + long n_next_add; + long n_wait_length; + long n_wait_add; + long n_done_length; + long n_done_add; + long n_done_remove; + atomic_t n_done_invoked; + long n_rcu_check_callbacks; + atomic_t n_rcu_try_flip1; + long n_rcu_try_flip2; + long n_rcu_try_flip3; + atomic_t n_rcu_try_flip_e1; + long n_rcu_try_flip_e2; + long n_rcu_try_flip_e3; +#endif /* #ifdef CONFIG_RCU_STATS */ +}; +struct rcu_ctrlblk { + spinlock_t fliplock; + long completed; /* Number of last completed batch. */ +}; +static struct rcu_data rcu_data; +static struct rcu_ctrlblk rcu_ctrlblk = { + .fliplock = SPIN_LOCK_UNLOCKED, + .completed = 0, +}; +static DEFINE_PER_CPU(atomic_t [2], rcu_flipctr) = + { ATOMIC_INIT(0), ATOMIC_INIT(0) }; + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. 
+ */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} + +void __rcu_read_lock(void) +{ + int flipctr; + unsigned long oldirq; + + local_irq_save(oldirq); + + if (current->rcu_read_lock_nesting++ == 0) { + + /* + * Outermost nesting of rcu_read_lock(), so atomically + * increment the current counter for the current CPU. + */ + + flipctr = rcu_ctrlblk.completed & 0x1; + smp_read_barrier_depends(); + current->rcu_flipctr1 = &(__get_cpu_var(rcu_flipctr)[flipctr]); + /* Can optimize to non-atomic on fastpath, but start simple. */ + atomic_inc(current->rcu_flipctr1); + smp_mb__after_atomic_inc(); /* might optimize out... */ + if (unlikely(flipctr != (rcu_ctrlblk.completed & 0x1))) { + + /* + * We raced with grace-period processing (flip). + * Although we cannot be preempted here, there + * could be interrupts, ECC errors and the like, + * so just nail down both sides of the rcu_flipctr + * array for the duration of our RCU read-side + * critical section, preventing a second flip + * from racing with us. At some point, it would + * be safe to decrement one of the counters, but + * we have no way of knowing when that would be. + * So just decrement them both in rcu_read_unlock(). + */ + + current->rcu_flipctr2 = + &(__get_cpu_var(rcu_flipctr)[!flipctr]); + /* Can again optimize to non-atomic on fastpath. */ + atomic_inc(current->rcu_flipctr2); + smp_mb__after_atomic_inc(); /* might optimize out... */ + } + } + local_irq_restore(oldirq); +} + +void __rcu_read_unlock(void) +{ + unsigned long oldirq; + + local_irq_save(oldirq); + if (--current->rcu_read_lock_nesting == 0) { + + /* + * Just atomically decrement whatever we incremented. + * Might later want to awaken some task waiting for the + * grace period to complete, but keep it simple for the + * moment. 
+ */ + + smp_mb__before_atomic_dec(); + atomic_dec(current->rcu_flipctr1); + current->rcu_flipctr1 = NULL; + if (unlikely(current->rcu_flipctr2 != NULL)) { + atomic_dec(current->rcu_flipctr2); + current->rcu_flipctr2 = NULL; + } + } + + local_irq_restore(oldirq); +} + +static void __rcu_advance_callbacks(void) +{ + + if (rcu_data.completed != rcu_ctrlblk.completed) { + if (rcu_data.waitlist != NULL) { + *rcu_data.donetail = rcu_data.waitlist; + rcu_data.donetail = rcu_data.waittail; +#ifdef CONFIG_RCU_STATS + rcu_data.n_done_length += rcu_data.n_wait_length; + rcu_data.n_done_add += rcu_data.n_wait_length; + rcu_data.n_wait_length = 0; +#endif /* #ifdef CONFIG_RCU_STATS */ + } + if (rcu_data.nextlist != NULL) { + rcu_data.waitlist = rcu_data.nextlist; + rcu_data.waittail = rcu_data.nexttail; + rcu_data.nextlist = NULL; + rcu_data.nexttail = &rcu_data.nextlist; +#ifdef CONFIG_RCU_STATS + rcu_data.n_wait_length += rcu_data.n_next_length; + rcu_data.n_wait_add += rcu_data.n_next_length; + rcu_data.n_next_length = 0; +#endif /* #ifdef CONFIG_RCU_STATS */ + } else { + rcu_data.waitlist = NULL; + rcu_data.waittail = &rcu_data.waitlist; + } + rcu_data.completed = rcu_ctrlblk.completed; + } +} + +/* + * Attempt a single flip of the counters. Remember, a single flip does + * -not- constitute a grace period. Instead, the interval between + * a pair of consecutive flips is a grace period. + * + * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation + * on a large SMP, they might want to use a hierarchical organization of + * the per-CPU-counter pairs. 
+ */ +static void rcu_try_flip(void) +{ + int cpu; + long flipctr; + unsigned long oldirq; + + flipctr = rcu_ctrlblk.completed; +#ifdef CONFIG_RCU_STATS + atomic_inc(&rcu_data.n_rcu_try_flip1); +#endif /* #ifdef CONFIG_RCU_STATS */ + if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) { +#ifdef CONFIG_RCU_STATS + atomic_inc(&rcu_data.n_rcu_try_flip_e1); +#endif /* #ifdef CONFIG_RCU_STATS */ + return; + } + if (unlikely(flipctr != rcu_ctrlblk.completed)) { + + /* Our work is done! ;-) */ + +#ifdef CONFIG_RCU_STATS + rcu_data.n_rcu_try_flip_e2++; +#endif /* #ifdef CONFIG_RCU_STATS */ + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); + return; + } + flipctr &= 0x1; + + /* + * Check for completion of all RCU read-side critical sections + * that started prior to the previous flip. + */ + +#ifdef CONFIG_RCU_STATS + rcu_data.n_rcu_try_flip2++; +#endif /* #ifdef CONFIG_RCU_STATS */ + for_each_possible_cpu(cpu) { + if (atomic_read(&per_cpu(rcu_flipctr, cpu)[!flipctr]) != 0) { +#ifdef CONFIG_RCU_STATS + rcu_data.n_rcu_try_flip_e3++; +#endif /* #ifdef CONFIG_RCU_STATS */ + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); + return; + } + } + + /* Do the flip. 
*/ + + smp_mb(); + rcu_ctrlblk.completed++; + +#ifdef CONFIG_RCU_STATS + rcu_data.n_rcu_try_flip3++; +#endif /* #ifdef CONFIG_RCU_STATS */ + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); +} + +void rcu_check_callbacks(int cpu, int user) +{ + unsigned long oldirq; + + if (rcu_ctrlblk.completed == rcu_data.completed) { + rcu_try_flip(); + if (rcu_ctrlblk.completed == rcu_data.completed) { + return; + } + } + spin_lock_irqsave(&rcu_data.lock, oldirq); +#ifdef CONFIG_RCU_STATS + rcu_data.n_rcu_check_callbacks++; +#endif /* #ifdef CONFIG_RCU_STATS */ + __rcu_advance_callbacks(); + if (rcu_data.donelist == NULL) { + spin_unlock_irqrestore(&rcu_data.lock, oldirq); + } else { + spin_unlock_irqrestore(&rcu_data.lock, oldirq); + tasklet_schedule(&rcu_data.rcu_tasklet); + } +} + +static void rcu_process_callbacks(unsigned long data) +{ + unsigned long flags; + struct rcu_head *next, *list; + + spin_lock_irqsave(&rcu_data.lock, flags); + list = rcu_data.donelist; + if (list == NULL) { + spin_unlock_irqrestore(&rcu_data.lock, flags); + return; + } + rcu_data.donelist = NULL; + rcu_data.donetail = &rcu_data.donelist; +#ifdef CONFIG_RCU_STATS + rcu_data.n_done_remove += rcu_data.n_done_length; + rcu_data.n_done_length = 0; +#endif /* #ifdef CONFIG_RCU_STATS */ + spin_unlock_irqrestore(&rcu_data.lock, flags); + while (list) { + next = list->next; + list->func(list); + list = next; +#ifdef CONFIG_RCU_STATS + atomic_inc(&rcu_data.n_done_invoked); +#endif /* #ifdef CONFIG_RCU_STATS */ + } +} + +void fastcall call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + + head->func = func; + head->next = NULL; + spin_lock_irqsave(&rcu_data.lock, flags); + __rcu_advance_callbacks(); + *rcu_data.nexttail = head; + rcu_data.nexttail = &head->next; +#ifdef CONFIG_RCU_STATS + rcu_data.n_next_add++; + rcu_data.n_next_length++; +#endif /* #ifdef CONFIG_RCU_STATS */ + spin_unlock_irqrestore(&rcu_data.lock, flags); +} + +/* + * Crude hack, reduces 
but does not eliminate possibility of failure. + * Needs to wait for all CPUs to pass through a -voluntary- context + * switch to eliminate possibility of failure. (Maybe just crank + * priority down...) + */ +void __synchronize_sched(void) +{ + cpumask_t oldmask; + int cpu; + + if (sched_getaffinity(0, &oldmask) < 0) { + oldmask = cpu_possible_map; + } + for_each_online_cpu(cpu) { + sched_setaffinity(0, cpumask_of_cpu(cpu)); + schedule(); + } + sched_setaffinity(0, oldmask); +} + +int rcu_pending(int cpu) +{ + return (rcu_data.donelist != NULL || + rcu_data.waitlist != NULL || + rcu_data.nextlist != NULL); +} + +void __init __rcu_init(void) +{ +/*&&&&*/printk("WARNING: experimental RCU implementation.\n"); + spin_lock_init(&rcu_data.lock); + rcu_data.completed = 0; + rcu_data.nextlist = NULL; + rcu_data.nexttail = &rcu_data.nextlist; + rcu_data.waitlist = NULL; + rcu_data.waittail = &rcu_data.waitlist; + rcu_data.donelist = NULL; + rcu_data.donetail = &rcu_data.donelist; + tasklet_init(&rcu_data.rcu_tasklet, rcu_process_callbacks, 0UL); +} + +/* + * Deprecated, use synchronize_rcu() or synchronize_sched() instead. 
+ */ +void synchronize_kernel(void) +{ + synchronize_rcu(); +} + +#ifdef CONFIG_RCU_STATS +int rcu_read_proc_data(char *page) +{ + return sprintf(page, + "ggp=%ld lgp=%ld rcc=%ld\n" + "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" + "rtf1=%d rtf2=%ld rtf3=%ld rtfe1=%d rtfe2=%ld rtfe3=%ld\n", + + rcu_ctrlblk.completed, + rcu_data.completed, + rcu_data.n_rcu_check_callbacks, + + rcu_data.n_next_add, + rcu_data.n_next_length, + rcu_data.n_wait_add, + rcu_data.n_wait_length, + rcu_data.n_done_add, + rcu_data.n_done_length, + rcu_data.n_done_remove, + atomic_read(&rcu_data.n_done_invoked), + + atomic_read(&rcu_data.n_rcu_try_flip1), + rcu_data.n_rcu_try_flip2, + rcu_data.n_rcu_try_flip3, + atomic_read(&rcu_data.n_rcu_try_flip_e1), + rcu_data.n_rcu_try_flip_e2, + rcu_data.n_rcu_try_flip_e3); +} + +int rcu_read_proc_gp_data(char *page) +{ + long oldgp = rcu_ctrlblk.completed; + + synchronize_rcu(); + return sprintf(page, "oldggp=%ld newggp=%ld\n", + oldgp, rcu_ctrlblk.completed); +} + +int rcu_read_proc_ptrs_data(char *page) +{ + return sprintf(page, + "nl=%p/%p nt=%p\n wl=%p/%p wt=%p dl=%p/%p dt=%p\n", + &rcu_data.nextlist, rcu_data.nextlist, rcu_data.nexttail, + &rcu_data.waitlist, rcu_data.waitlist, rcu_data.waittail, + &rcu_data.donelist, rcu_data.donelist, rcu_data.donetail + ); +} + +int rcu_read_proc_ctrs_data(char *page) +{ + int cnt = 0; + int cpu; + int f = rcu_ctrlblk.completed & 0x1; + + cnt += sprintf(&page[cnt], "CPU last cur\n"); + for_each_online_cpu(cpu) { + cnt += sprintf(&page[cnt], "%3d %4d %3d\n", + cpu, + atomic_read(&per_cpu(rcu_flipctr, cpu)[!f]), + atomic_read(&per_cpu(rcu_flipctr, cpu)[f])); + } + cnt += sprintf(&page[cnt], "ggp = %ld\n", rcu_ctrlblk.completed); + return (cnt); +} + +#endif /* #ifdef CONFIG_RCU_STATS */ + +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(rcu_batches_completed); +EXPORT_SYMBOL_GPL(__synchronize_sched); +EXPORT_SYMBOL_GPL(__rcu_read_lock); +EXPORT_SYMBOL_GPL(__rcu_read_unlock); +
patches/fix-emergency-reboot.patch0000664000077200007720000000277710646635216016622 0ustar mingomingoSubject: [patch] call reboot notifier list when doing an emergency reboot From: Ingo Molnar my laptop does not reboot unless the shutdown notifiers are called first. So the following command, which i use as a fast way to reboot into a new kernel: echo b > /proc/sysrq-trigger just hangs indefinitely after the kernel prints "System rebooting". the thing is, that the kernel is actually reschedulable in this stage, so we could as well process the reboot_notifier_list. (furthermore, on -rt kernels this place is preemptable even during SysRq-b) So just process the reboot notifier list if we are preemptable. This will shut disk caches and chipsets off. Signed-off-by: Ingo Molnar --- kernel/sys.c | 10 ++++++++++ 1 file changed, 10 insertions(+) Index: linux-rt.q/kernel/sys.c =================================================================== --- linux-rt.q.orig/kernel/sys.c +++ linux-rt.q/kernel/sys.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -787,6 +788,15 @@ out_unlock: */ void emergency_restart(void) { + /* + * Call the notifier chain if we are not in an + * atomic context: + */ +#ifdef CONFIG_PREEMPT + if (!in_atomic() && !irqs_disabled()) + blocking_notifier_call_chain(&reboot_notifier_list, + SYS_RESTART, NULL); +#endif machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); patches/mm-lockless-preempt-fixup.patch0000664000077200007720000001205310646635216017605 0ustar mingomingo Livelock scenario pointed out by Nick. 
SetPageNoNewRefs(page); *** preempted here *** page_cache_get_speculative() { while (PageNoNewRefs(page)) /* livelock */ } Signed-off-by: Peter Zijlstra --- include/linux/pagemap.h | 25 +++++++++++++++++++++++-- mm/filemap.c | 6 ++---- mm/migrate.c | 10 ++++------ mm/swap_state.c | 6 ++---- mm/vmscan.c | 8 +++----- 5 files changed, 34 insertions(+), 21 deletions(-) Index: linux-rt.q/include/linux/pagemap.h =================================================================== --- linux-rt.q.orig/include/linux/pagemap.h +++ linux-rt.q/include/linux/pagemap.h @@ -64,6 +64,28 @@ static inline void mapping_set_gfp_mask( #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); +static inline void set_page_no_new_refs(struct page *page) +{ + VM_BUG_ON(PageNoNewRefs(page)); + preempt_disable(); + SetPageNoNewRefs(page); + smp_wmb(); +} + +static inline void end_page_no_new_refs(struct page *page) +{ + VM_BUG_ON(!PageNoNewRefs(page)); + smp_wmb(); + ClearPageNoNewRefs(page); + preempt_enable(); +} + +static inline void wait_on_new_refs(struct page *page) +{ + while (unlikely(PageNoNewRefs(page))) + cpu_relax(); +} + /* * speculatively take a reference to a page. * If the page is free (_count == 0), then _count is untouched, and 0 @@ -139,8 +161,7 @@ static inline int page_cache_get_specula * page refcount has been raised. See below comment. 
*/ - while (unlikely(PageNoNewRefs(page))) - cpu_relax(); + wait_on_new_refs(page); /* * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs()) Index: linux-rt.q/mm/filemap.c =================================================================== --- linux-rt.q.orig/mm/filemap.c +++ linux-rt.q/mm/filemap.c @@ -440,8 +440,7 @@ int add_to_page_cache(struct page *page, int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { - SetPageNoNewRefs(page); - smp_wmb(); + set_page_no_new_refs(page); write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { @@ -453,8 +452,7 @@ int add_to_page_cache(struct page *page, __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&mapping->tree_lock); - smp_wmb(); - ClearPageNoNewRefs(page); + end_page_no_new_refs(page); radix_tree_preload_end(); } return error; Index: linux-rt.q/mm/migrate.c =================================================================== --- linux-rt.q.orig/mm/migrate.c +++ linux-rt.q/mm/migrate.c @@ -303,8 +303,7 @@ static int migrate_page_move_mapping(str return 0; } - SetPageNoNewRefs(page); - smp_wmb(); + set_page_no_new_refs(page); write_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -313,7 +312,7 @@ static int migrate_page_move_mapping(str if (page_count(page) != 2 + !!PagePrivate(page) || (struct page *)radix_tree_deref_slot(pslot) != page) { write_unlock_irq(&mapping->tree_lock); - ClearPageNoNewRefs(page); + end_page_no_new_refs(page); return -EAGAIN; } @@ -330,9 +329,8 @@ static int migrate_page_move_mapping(str radix_tree_replace_slot(pslot, newpage); page->mapping = NULL; - write_unlock_irq(&mapping->tree_lock); - smp_wmb(); - ClearPageNoNewRefs(page); + write_unlock_irq(&mapping->tree_lock); + end_page_no_new_refs(page); /* * Drop cache reference from old page. 
Index: linux-rt.q/mm/swap_state.c =================================================================== --- linux-rt.q.orig/mm/swap_state.c +++ linux-rt.q/mm/swap_state.c @@ -78,8 +78,7 @@ static int __add_to_swap_cache(struct pa BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); if (!error) { - SetPageNoNewRefs(page); - smp_wmb(); + set_page_no_new_refs(page); write_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); @@ -92,8 +91,7 @@ static int __add_to_swap_cache(struct pa __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&swapper_space.tree_lock); - smp_wmb(); - ClearPageNoNewRefs(page); + end_page_no_new_refs(page); radix_tree_preload_end(); } return error; Index: linux-rt.q/mm/vmscan.c =================================================================== --- linux-rt.q.orig/mm/vmscan.c +++ linux-rt.q/mm/vmscan.c @@ -387,8 +387,7 @@ int remove_mapping(struct address_space BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - SetPageNoNewRefs(page); - smp_wmb(); + set_page_no_new_refs(page); write_lock_irq(&mapping->tree_lock); /* * The non racy check for a busy page. @@ -433,14 +432,13 @@ int remove_mapping(struct address_space write_unlock_irq(&mapping->tree_lock); free_it: - smp_wmb(); - __ClearPageNoNewRefs(page); + end_page_no_new_refs(page); __put_page(page); /* The pagecache ref */ return 1; cannot_free: write_unlock_irq(&mapping->tree_lock); - ClearPageNoNewRefs(page); + end_page_no_new_refs(page); return 0; } patches/lockdep-prettify.patch0000664000077200007720000000363510646635213016044 0ustar mingomingoSubject: [patch] lockdep: prettify output From: Ingo Molnar recent changes to the lockdep code made some of the printouts uglier - mend them. 
Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -548,7 +548,7 @@ print_circular_bug_entry(struct lock_lis static void print_kernel_version(void) { - printk("%s %.*s\n", init_utsname()->release, + printk("[ %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); } @@ -2707,13 +2707,13 @@ void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + printk("... MAX_LOCKDEP_SUBCLASSES: %6lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCK_DEPTH: %6lu\n", MAX_LOCK_DEPTH); + printk("... MAX_LOCKDEP_KEYS: %6lu\n", MAX_LOCKDEP_KEYS); + printk("... CLASSHASH_SIZE: %6lu\n", CLASSHASH_SIZE); + printk("... MAX_LOCKDEP_ENTRIES: %6lu\n", MAX_LOCKDEP_ENTRIES); + printk("... MAX_LOCKDEP_CHAINS: %6lu\n", MAX_LOCKDEP_CHAINS); + printk("... 
CHAINHASH_SIZE: %6lu\n", CHAINHASH_SIZE); printk(" memory used by lock dependency info: %lu kB\n", (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + patches/preempt-realtime-supress-nohz-softirq-warning.patch0000664000077200007720000000111510646635215023632 0ustar mingomingo--- kernel/time/tick-sched.c | 3 +++ 1 file changed, 3 insertions(+) Index: linux-rt.q/kernel/time/tick-sched.c =================================================================== --- linux-rt.q.orig/kernel/time/tick-sched.c +++ linux-rt.q/kernel/time/tick-sched.c @@ -168,6 +168,8 @@ void tick_nohz_stop_sched_tick(void) goto end; cpu = smp_processor_id(); + +#ifndef CONFIG_PREEMPT_RT if (unlikely(local_softirq_pending())) { static int ratelimit; @@ -177,6 +179,7 @@ void tick_nohz_stop_sched_tick(void) ratelimit++; } } +#endif now = ktime_get(); /* patches/rt-mutex-mips.patch0000664000077200007720000001453410646635214015313 0ustar mingomingo--- arch/mips/Kconfig | 15 +++++++++++---- arch/mips/kernel/Makefile | 4 +++- include/asm-mips/atomic.h | 26 +++++++++++++++++++++----- include/asm-mips/semaphore.h | 30 +++++++++++++++++++++--------- 4 files changed, 56 insertions(+), 19 deletions(-) Index: linux-rt.q/arch/mips/Kconfig =================================================================== --- linux-rt.q.orig/arch/mips/Kconfig +++ linux-rt.q/arch/mips/Kconfig @@ -259,6 +259,7 @@ config MIPS_SIM config MOMENCO_OCELOT bool "Momentum Ocelot board" select DMA_NONCOHERENT + select NO_SPINLOCK select HW_HAS_PCI select IRQ_CPU select IRQ_CPU_RM7K @@ -675,10 +676,17 @@ endmenu config RWSEM_GENERIC_SPINLOCK bool + depends on !PREEMPT_RT default y config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT + +config ASM_SEMAPHORES + bool +# depends on !PREEMPT_RT + default y config ARCH_HAS_ILOG2_U32 bool @@ -738,6 +746,9 @@ config DMA_NONCOHERENT config DMA_NEED_PCI_MAP_STATE bool +config NO_SPINLOCK + bool + config EARLY_PRINTK bool "Early printk" if EMBEDDED && DEBUG_KERNEL depends on 
SYS_HAS_EARLY_PRINTK @@ -1777,10 +1788,6 @@ config SECCOMP endmenu -config RWSEM_GENERIC_SPINLOCK - bool - default y - config LOCKDEP_SUPPORT bool default y Index: linux-rt.q/arch/mips/kernel/Makefile =================================================================== --- linux-rt.q.orig/arch/mips/kernel/Makefile +++ linux-rt.q/arch/mips/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o topology.o traps.o unaligned.o binfmt_irix-objs := irixelf.o irixinv.o irixioctl.o irixsig.o \ @@ -14,6 +14,8 @@ binfmt_irix-objs := irixelf.o irixinv.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_MODULES) += mips_ksyms.o module.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o + obj-$(CONFIG_CPU_R3000) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o Index: linux-rt.q/include/asm-mips/atomic.h =================================================================== --- linux-rt.q.orig/include/asm-mips/atomic.h +++ linux-rt.q/include/asm-mips/atomic.h @@ -171,7 +171,9 @@ static __inline__ int atomic_add_return( : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -180,6 +182,7 @@ static __inline__ int atomic_add_return( v->counter = result; raw_local_irq_restore(flags); } +#endif smp_mb(); @@ -223,7 +226,9 @@ static __inline__ int atomic_sub_return( : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -232,6 +237,7 @@ static __inline__ int 
atomic_sub_return( v->counter = result; raw_local_irq_restore(flags); } +#endif smp_mb(); @@ -291,7 +297,9 @@ static __inline__ int atomic_sub_if_posi : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -301,6 +309,7 @@ static __inline__ int atomic_sub_if_posi v->counter = result; raw_local_irq_restore(flags); } +#endif smp_mb(); @@ -552,7 +561,9 @@ static __inline__ long atomic64_add_retu : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -561,6 +572,8 @@ static __inline__ long atomic64_add_retu v->counter = result; raw_local_irq_restore(flags); } +#endif +#endif smp_mb(); @@ -604,7 +617,9 @@ static __inline__ long atomic64_sub_retu : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -682,6 +697,7 @@ static __inline__ long atomic64_sub_if_p v->counter = result; raw_local_irq_restore(flags); } +#endif smp_mb(); Index: linux-rt.q/include/asm-mips/semaphore.h =================================================================== --- linux-rt.q.orig/include/asm-mips/semaphore.h +++ linux-rt.q/include/asm-mips/semaphore.h @@ -24,12 +24,20 @@ #ifdef __KERNEL__ -#include -#include #include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +#include +#include + +struct compat_semaphore { /* * Note that any negative value of count is equivalent to 0, * but additionally indicates that some process(es) might be @@ -79,31 +87,35 @@ static 
inline void down(struct semaphore * Try to get the semaphore, take the slow path if we fail. */ if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); + __compat_down(sem); } -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; might_sleep(); if (unlikely(atomic_dec_return(&sem->count) < 0)) - ret = __down_interruptible(sem); + ret = __compat_down_interruptible(sem); return ret; } -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { return atomic_dec_if_positive(&sem->count) < 0; } -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); + __compat_up(sem); } +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif /* __KERNEL__ */ #endif /* __ASM_SEMAPHORE_H */ patches/highmem_rewrite.patch0000664000077200007720000003762610646635216015750 0ustar mingomingoSubject: mm: remove kmap_lock Eradicate global locks. - kmap_lock is removed by extensive use of atomic_t and a new flush scheme. - pool_lock is removed by using the pkmap index for the page_address_maps and modifying set_page_address to only allow NULL<->virt transitions. 
Signed-off-by: Peter Zijlstra --- include/linux/mm.h | 32 ++- mm/highmem.c | 433 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 276 insertions(+), 189 deletions(-) Index: linux-rt.q/include/linux/mm.h =================================================================== --- linux-rt.q.orig/include/linux/mm.h +++ linux-rt.q/include/linux/mm.h @@ -565,23 +565,39 @@ static __always_inline void *lowmem_page #endif #if defined(WANT_PAGE_VIRTUAL) -#define page_address(page) ((page)->virtual) -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) -#define page_address_init() do { } while(0) +/* + * wrap page->virtual so it is safe to set/read locklessly + */ +#define page_address(page) \ + ({ typeof((page)->virtual) v = (page)->virtual; \ + smp_read_barrier_depends(); \ + v; }) + +static inline int set_page_address(struct page *page, void *address) +{ + if (address) + return cmpxchg(&page->virtual, NULL, address) == NULL; + else { + /* + * cmpxchg is a bit abused because it is not guaranteed + * safe wrt direct assignment on all platforms. + */ + void *virt = page->virtual; + return cmpxchg(&page->virtual, virt, NULL) == virt; + } +} +void page_address_init(void); #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(struct page *page); -void set_page_address(struct page *page, void *virtual); +int set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) -#define set_page_address(page, address) do { } while(0) +#define set_page_address(page, address) (0) #define page_address_init() do { } while(0) #endif Index: linux-rt.q/mm/highmem.c =================================================================== --- linux-rt.q.orig/mm/highmem.c +++ linux-rt.q/mm/highmem.c @@ -14,6 +14,11 @@ * based on Linus' idea.
* * Copyright (C) 1999 Ingo Molnar + * + * Largely rewritten to get rid of all global locks + * + * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra + * */ #include @@ -27,18 +32,14 @@ #include #include #include + #include +#include -/* - * Virtual_count is not a pure "count". - * 0 means that it is not mapped, and has not been mapped - * since a TLB flush - it is usable. - * 1 means that there are no users, but it has been mapped - * since the last TLB flush - so we can't use it. - * n means that there are (n-1) current users of it. - */ #ifdef CONFIG_HIGHMEM +static int __set_page_address(struct page *page, void *virtual, int pos); + unsigned long totalhigh_pages __read_mostly; unsigned int nr_free_highpages (void) @@ -53,164 +54,208 @@ unsigned int nr_free_highpages (void) return pages; } -static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); +/* + * count is not a pure "count". + * 0 means it is owned exclusively by someone + * 1 means it is free for use - either mapped or not. + * n means that there are (n-1) current users of it. + */ +static atomic_t pkmap_count[LAST_PKMAP]; +static atomic_t pkmap_hand; pte_t * pkmap_page_table; static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); -static void flush_all_zero_pkmaps(void) +/* + * Try to free a given kmap slot.
+ * + * Returns: + * -1 - in use + * 0 - free, no TLB flush needed + * 1 - free, needs TLB flush + */ +static int pkmap_try_free(int pos) { - int i; - - flush_cache_kmaps(); + if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1) + return -1; - for (i = 0; i < LAST_PKMAP; i++) { - struct page *page; + /* + * TODO: add a young bit to make it CLOCK + */ + if (!pte_none(pkmap_page_table[pos])) { + struct page *page = pte_page(pkmap_page_table[pos]); + unsigned long addr = PKMAP_ADDR(pos); + pte_t *ptep = &pkmap_page_table[pos]; + + VM_BUG_ON(addr != (unsigned long)page_address(page)); + + if (!__set_page_address(page, NULL, pos)) + BUG(); + flush_kernel_dcache_page(page); + pte_clear(&init_mm, addr, ptep); - /* - * zero means we don't have anything to do, - * >1 means that it is still in use. Only - * a count of 1 means that it is free but - * needs to be unmapped - */ - if (pkmap_count[i] != 1) - continue; - pkmap_count[i] = 0; + return 1; + } - /* sanity check */ - BUG_ON(pte_none(pkmap_page_table[i])); + return 0; +} - /* - * Don't need an atomic fetch-and-clear op here; - * no-one has the page mapped, and cannot get at - * its virtual address (and hence PTE) without first - * getting the kmap_lock (which is held here). - * So no dangers, even with speculative execution. 
- */ - page = pte_page(pkmap_page_table[i]); - pte_clear(&init_mm, (unsigned long)page_address(page), - &pkmap_page_table[i]); +static inline void pkmap_put(atomic_t *counter) +{ + switch (atomic_dec_return(counter)) { + case 0: + BUG(); - set_page_address(page, NULL); + case 1: + wake_up(&pkmap_map_wait); } - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } -static inline unsigned long map_new_virtual(struct page *page) +#define TLB_BATCH 32 + +static int pkmap_get_free(void) { - unsigned long vaddr; - int count; + int i, pos, flush; + DECLARE_WAITQUEUE(wait, current); -start: - count = LAST_PKMAP; - /* Find an empty entry */ - for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { - flush_all_zero_pkmaps(); - count = LAST_PKMAP; - } - if (!pkmap_count[last_pkmap_nr]) - break; /* Found a usable entry */ - if (--count) - continue; +restart: + for (i = 0; i < LAST_PKMAP; i++) { + pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP; + flush = pkmap_try_free(pos); + if (flush >= 0) + goto got_one; + } + + /* + * wait for somebody else to unmap their entries + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&pkmap_map_wait, &wait); + schedule(); + remove_wait_queue(&pkmap_map_wait, &wait); + + goto restart; + +got_one: + if (flush) { +#if 0 + flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1)); +#else + int pos2 = (pos + 1) % LAST_PKMAP; + int nr; + int entries[TLB_BATCH]; /* - * Sleep for somebody else to unmap their entries + * For those architectures that cannot help but flush the + * whole TLB, flush some more entries to make it worthwhile. + * Scan ahead of the hand to minimise search distances. 
*/ - { - DECLARE_WAITQUEUE(wait, current); + for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH; + i++, pos2 = (pos2 + 1) % LAST_PKMAP) { - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); - spin_unlock(&kmap_lock); - schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); - spin_lock(&kmap_lock); - - /* Somebody else might have mapped it while we slept */ - if (page_address(page)) - return (unsigned long)page_address(page); + flush = pkmap_try_free(pos2); + if (flush < 0) + continue; + + if (!flush) { + atomic_t *counter = &pkmap_count[pos2]; + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); + } else + entries[nr++] = pos2; + } + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); - /* Re-start */ - goto start; + for (i = 0; i < nr; i++) { + atomic_t *counter = &pkmap_count[entries[i]]; + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); } +#endif } - vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte_at(&init_mm, vaddr, - &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + return pos; +} + +static unsigned long pkmap_insert(struct page *page) +{ + int pos = pkmap_get_free(); + unsigned long vaddr = PKMAP_ADDR(pos); + pte_t *ptep = &pkmap_page_table[pos]; + pte_t entry = mk_pte(page, kmap_prot); + atomic_t *counter = &pkmap_count[pos]; + + VM_BUG_ON(atomic_read(counter) != 0); - pkmap_count[last_pkmap_nr] = 1; - set_page_address(page, (void *)vaddr); + set_pte_at(&init_mm, vaddr, ptep, entry); + if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) { + /* + * concurrent pkmap_inserts for this page - + * the other won the race, release this entry. + * + * we can still clear the pte without a tlb flush since + * it couldn't have been used yet. 
+ */ + pte_clear(&init_mm, vaddr, ptep); + VM_BUG_ON(atomic_read(counter) != 0); + atomic_set(counter, 2); + pkmap_put(counter); + vaddr = 0; + } else + atomic_set(counter, 2); return vaddr; } -void fastcall *kmap_high(struct page *page) +fastcall void *kmap_high(struct page *page) { unsigned long vaddr; - - /* - * For highmem pages, we can't trust "virtual" until - * after we have the lock. - * - * We cannot call this from interrupts, as it may block - */ - spin_lock(&kmap_lock); +again: vaddr = (unsigned long)page_address(page); + if (vaddr) { + atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)]; + if (atomic_inc_not_zero(counter)) { + /* + * atomic_inc_not_zero implies a (memory) barrier on success + * so page address will be reloaded. + */ + unsigned long vaddr2 = (unsigned long)page_address(page); + if (likely(vaddr == vaddr2)) + return (void *)vaddr; + + /* + * Oops, we got someone else. + * + * This can happen if we get preempted after + * page_address() and before atomic_inc_not_zero() + * and during that preemption this slot is freed and + * reused. + */ + pkmap_put(counter); + goto again; + } + } + + vaddr = pkmap_insert(page); if (!vaddr) - vaddr = map_new_virtual(page); - pkmap_count[PKMAP_NR(vaddr)]++; - BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); - spin_unlock(&kmap_lock); - return (void*) vaddr; + goto again; + + return (void *)vaddr; } EXPORT_SYMBOL(kmap_high); -void fastcall kunmap_high(struct page *page) +fastcall void kunmap_high(struct page *page) { - unsigned long vaddr; - unsigned long nr; - int need_wakeup; - - spin_lock(&kmap_lock); - vaddr = (unsigned long)page_address(page); + unsigned long vaddr = (unsigned long)page_address(page); BUG_ON(!vaddr); - nr = PKMAP_NR(vaddr); - - /* - * A count must never go down to zero - * without a TLB flush! - */ - need_wakeup = 0; - switch (--pkmap_count[nr]) { - case 0: - BUG(); - case 1: - /* - * Avoid an unnecessary wake_up() function call. - * The common case is pkmap_count[] == 1, but - * no waiters. 
- * The tasks queued in the wait-queue are guarded - * by both the lock in the wait-queue-head and by - * the kmap_lock. As the kmap_lock is held here, - * no need for the wait-queue-head's lock. Simply - * test if the queue is empty. - */ - need_wakeup = waitqueue_active(&pkmap_map_wait); - } - spin_unlock(&kmap_lock); - - /* do wake-up, if needed, race-free outside of the spin lock */ - if (need_wakeup) - wake_up(&pkmap_map_wait); + pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]); } EXPORT_SYMBOL(kunmap_high); + #endif #if defined(HASHED_PAGE_VIRTUAL) @@ -218,19 +263,13 @@ EXPORT_SYMBOL(kunmap_high); #define PA_HASH_ORDER 7 /* - * Describes one page->virtual association + * Describes one page->virtual address association. */ -struct page_address_map { +static struct page_address_map { struct page *page; void *virtual; struct list_head list; -}; - -/* - * page_address_map freelist, allocated from page_address_maps. - */ -static struct list_head page_address_pool; /* freelist */ -static spinlock_t pool_lock; /* protects page_address_pool */ +} page_address_maps[LAST_PKMAP]; /* * Hash table bucket @@ -245,91 +284,123 @@ static struct page_address_slot *page_sl return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; } -void *page_address(struct page *page) +static void *__page_address(struct page_address_slot *pas, struct page *page) { - unsigned long flags; - void *ret; - struct page_address_slot *pas; - - if (!PageHighMem(page)) - return lowmem_page_address(page); + void *ret = NULL; - pas = page_slot(page); - ret = NULL; - spin_lock_irqsave(&pas->lock, flags); if (!list_empty(&pas->lh)) { struct page_address_map *pam; list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { ret = pam->virtual; - goto done; + break; } } } -done: + + return ret; +} + +void *page_address(struct page *page) +{ + unsigned long flags; + void *ret; + struct page_address_slot *pas; + + if (!PageHighMem(page)) + return lowmem_page_address(page); + + pas = page_slot(page); + 
spin_lock_irqsave(&pas->lock, flags); + ret = __page_address(pas, page); spin_unlock_irqrestore(&pas->lock, flags); return ret; } EXPORT_SYMBOL(page_address); -void set_page_address(struct page *page, void *virtual) +static int __set_page_address(struct page *page, void *virtual, int pos) { + int ret = 0; unsigned long flags; struct page_address_slot *pas; struct page_address_map *pam; - BUG_ON(!PageHighMem(page)); + VM_BUG_ON(!PageHighMem(page)); + VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0); + VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP); pas = page_slot(page); - if (virtual) { /* Add */ - BUG_ON(list_empty(&page_address_pool)); + pam = &page_address_maps[pos]; - spin_lock_irqsave(&pool_lock, flags); - pam = list_entry(page_address_pool.next, - struct page_address_map, list); - list_del(&pam->list); - spin_unlock_irqrestore(&pool_lock, flags); - - pam->page = page; - pam->virtual = virtual; - - spin_lock_irqsave(&pas->lock, flags); - list_add_tail(&pam->list, &pas->lh); - spin_unlock_irqrestore(&pas->lock, flags); - } else { /* Remove */ - spin_lock_irqsave(&pas->lock, flags); - list_for_each_entry(pam, &pas->lh, list) { - if (pam->page == page) { - list_del(&pam->list); - spin_unlock_irqrestore(&pas->lock, flags); - spin_lock_irqsave(&pool_lock, flags); - list_add_tail(&pam->list, &page_address_pool); - spin_unlock_irqrestore(&pool_lock, flags); - goto done; - } + spin_lock_irqsave(&pas->lock, flags); + if (virtual) { /* add */ + VM_BUG_ON(!list_empty(&pam->list)); + + if (!__page_address(pas, page)) { + pam->page = page; + pam->virtual = virtual; + list_add_tail(&pam->list, &pas->lh); + ret = 1; + } + } else { /* remove */ + if (!list_empty(&pam->list)) { + list_del_init(&pam->list); + ret = 1; } - spin_unlock_irqrestore(&pas->lock, flags); } -done: - return; + spin_unlock_irqrestore(&pas->lock, flags); + + return ret; } -static struct page_address_map page_address_maps[LAST_PKMAP]; +int set_page_address(struct page *page, void *virtual) +{ + /* + * 
set_page_address is not supposed to be called when using + * hashed virtual addresses. + */ + BUG(); + return 0; +} -void __init page_address_init(void) +void __init __page_address_init(void) { int i; - INIT_LIST_HEAD(&page_address_pool); for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) - list_add(&page_address_maps[i].list, &page_address_pool); + INIT_LIST_HEAD(&page_address_maps[i].list); + for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); } - spin_lock_init(&pool_lock); +} + +#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */ + +static int __set_page_address(struct page *page, void *virtual, int pos) +{ + return set_page_address(page, virtual); } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL) + +void __init page_address_init(void) +{ +#ifdef CONFIG_HIGHMEM + int i; + + for (i = 0; i < ARRAY_SIZE(pkmap_count); i++) + atomic_set(&pkmap_count[i], 1); +#endif + +#ifdef HASHED_PAGE_VIRTUAL + __page_address_init(); +#endif +} + +#endif patches/ep93xx-timer-accuracy.patch0000664000077200007720000000322410646635211016615 0ustar mingomingo The ep93xx has a weird timer tick base (983.04 kHz.) This experimental patch tries to increase time of day accuracy by keeping the number of ticks until the next jiffy in a fractional value representation. Signed-off-by: Lennert Buytenhek --- arch/arm/mach-ep93xx/core.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) Index: linux-rt.q/arch/arm/mach-ep93xx/core.c =================================================================== --- linux-rt.q.orig/arch/arm/mach-ep93xx/core.c +++ linux-rt.q/arch/arm/mach-ep93xx/core.c @@ -94,19 +94,32 @@ void __init ep93xx_map_io(void) * track of lost jiffies. 
*/ static unsigned int last_jiffy_time; +static unsigned int next_jiffy_time; +static unsigned int accumulator; -#define TIMER4_TICKS_PER_JIFFY ((CLOCK_TICK_RATE + (HZ/2)) / HZ) +#define TIMER4_TICKS_PER_JIFFY (983040 / HZ) +#define TIMER4_TICKS_MOD_JIFFY (983040 % HZ) + +static int after_eq(unsigned long a, unsigned long b) +{ + return ((signed long)(a - b)) >= 0; +} static int ep93xx_timer_interrupt(int irq, void *dev_id) { write_seqlock(&xtime_lock); __raw_writel(1, EP93XX_TIMER1_CLEAR); - while ((signed long) - (__raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time) - >= TIMER4_TICKS_PER_JIFFY) { - last_jiffy_time += TIMER4_TICKS_PER_JIFFY; + while (after_eq(__raw_readl(EP93XX_TIMER4_VALUE_LOW), next_jiffy_time)) { timer_tick(); + + last_jiffy_time = next_jiffy_time; + next_jiffy_time += TIMER4_TICKS_PER_JIFFY; + accumulator += TIMER4_TICKS_MOD_JIFFY; + if (accumulator >= HZ) { + next_jiffy_time++; + accumulator -= HZ; + } } write_sequnlock(&xtime_lock); patches/latency-tracer-optimize-a-bit.patch0000664000077200007720000000126510646635212020320 0ustar mingomingo--- kernel/latency_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -91,7 +91,7 @@ static inline int DEBUG_WARN_ON(int cond #ifdef CONFIG_CRITICAL_IRQSOFF_TIMING # ifdef CONFIG_CRITICAL_PREEMPT_TIMING static DEFINE_PER_CPU(int, trace_cpu_idle); -# define irqs_off_preempt_count() (!__get_cpu_var(trace_cpu_idle) && preempt_count()) +# define irqs_off_preempt_count() (preempt_count() && !__get_cpu_var(trace_cpu_idle)) # else # define irqs_off_preempt_count() 0 # endif patches/replace-bugon-by-warn-on.patch0000664000077200007720000000104210646635211017255 0ustar mingomingo--- arch/i386/mm/highmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/i386/mm/highmem.c 
=================================================================== --- linux-rt.q.orig/arch/i386/mm/highmem.c +++ linux-rt.q/arch/i386/mm/highmem.c @@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); - BUG_ON(!pte_none(*(kmap_pte-idx))); + WARN_ON_ONCE(!pte_none(*(kmap_pte-idx))); if (!PageHighMem(page)) return page_address(page); patches/trace-cpuidle.patch0000664000077200007720000001352510646635212015276 0ustar mingomingoFrom linux-rt-users-owner@vger.kernel.org Sat Jul 14 04:08:19 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.2 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by mail.tglx.de (Postfix) with ESMTP id BB2AB65C292; Sat, 14 Jul 2007 04:08:19 +0200 (CEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1758741AbXGNCIT (ORCPT + 1 other); Fri, 13 Jul 2007 22:08:19 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1760025AbXGNCIT (ORCPT ); Fri, 13 Jul 2007 22:08:19 -0400 Received: from rwcrmhc11.comcast.net ([216.148.227.151]:38099 "EHLO rwcrmhc11.comcast.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1758741AbXGNCIR (ORCPT ); Fri, 13 Jul 2007 22:08:17 -0400 Received: from sx.thebigcorporation.com ([69.181.45.228]) by comcast.net (rwcrmhc11) with ESMTP id <20070714020816m1100999cee>; Sat, 14 Jul 2007 02:08:17 +0000 Received: from sx.thebigcorporation.com (localhost.localdomain [127.0.0.1]) by sx.thebigcorporation.com (8.14.1/8.13.8) with ESMTP id l6E28G5M018639; Fri, 13 Jul 2007 19:08:16 -0700 Received: (from sven@localhost) by sx.thebigcorporation.com (8.14.1/8.14.1/Submit) id l6E28FCT018638; Fri, 13 Jul 2007 19:08:15 -0700 X-Authentication-Warning: sx.thebigcorporation.com: sven set sender to sven@thebigcorporation.com using -f Subject: Re: [PATCH -rt 6/6] 
Compile fix for PREEMPT_TIMING on and IRQSOFF_TIMING off From: Sven-Thorsten Dietrich To: Kevin Hilman Cc: tglx@linutronix.de, mingo@elte.hu, linux-rt-users@vger.kernel.org, linux-kernel@vger.kernel.org In-Reply-To: <46980935.3060509@mvista.com> References: <20070713175214.336577416@mvista.com> <20070713175229.239602308@mvista.com> <46980935.3060509@mvista.com> Content-Type: text/plain Organization: The Big Corporation Date: Fri, 13 Jul 2007 19:08:14 -0700 Message-Id: <1184378894.16207.14.camel@sx.thebigcorporation.com> Mime-Version: 1.0 X-Mailer: Evolution 2.10.2 (2.10.2-3.fc7) Sender: linux-rt-users-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-rt-users@vger.kernel.org X-Filter-To: .Kernel.rt-users X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit On Fri, 2007-07-13 at 16:22 -0700, Kevin Hilman wrote: > [Minor update to avoid a compiler warning in the case of DEBUG_KERNEL=n] > The resent patch (v2) had white space damage - Here is a reconstituted version that applies for me on 2.6.22-rt3 Acked-by: Sven-Thorsten Dietrich >From linux-rt-users-owner@vger.kernel.org Fri Jul 13 16:22:34 2007 Return-Path: Received: from sx.thebigcorporation.com ([unix socket]) by sx.thebigcorporation.com (Cyrus v2.3.8-Fedora-RPM-2.3.8-3.fc7) with LMTPA; Fri, 13 Jul 2007 16:22:34 -0700 X-Sieve: CMU Sieve 2.3 Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by sx.thebigcorporation.com (8.14.1/8.13.8) with ESMTP id l6DNMXP6017382 for ; Fri, 13 Jul 2007 16:22:33 -0700 Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1759195AbXGMXW1 (ORCPT ); Fri, 13 Jul 2007 19:22:27 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1760623AbXGMXW1 (ORCPT ); Fri, 13 Jul 2007 19:22:27 -0400 Received: from h155.mvista.com ([63.81.120.158]:7301 "EHLO gateway-1237.mvista.com" rhost-flags-OK-FAIL-OK-OK) by vger.kernel.org with ESMTP id S1759161AbXGMXW1 (ORCPT ); Fri, 13 Jul 2007 19:22:27 
-0400 Received: from [127.0.0.1] (asshur.mvista.com [10.0.0.11]) by hermes.mvista.com (Postfix) with ESMTP id 69F871DE39; Fri, 13 Jul 2007 16:22:25 -0700 (PDT) Message-ID: <46980935.3060509@mvista.com> Date: Fri, 13 Jul 2007 16:22:29 -0700 From: Kevin Hilman User-Agent: Thunderbird 1.5.0.12 (X11/20070604) MIME-Version: 1.0 To: Kevin Hilman Cc: tglx@linutronix.de, mingo@elte.hu, linux-rt-users@vger.kernel.org, linux-kernel@vger.kernel.org Subject: Re: [PATCH -rt 6/6] Compile fix for PREEMPT_TIMING on and IRQSOFF_TIMING off References: <20070713175214.336577416@mvista.com> <20070713175229.239602308@mvista.com> In-Reply-To: <20070713175229.239602308@mvista.com> Content-Type: text/plain; charset=ISO-8859-1 Sender: linux-rt-users-owner@vger.kernel.org Precedence: bulk X-Mailing-List: linux-rt-users@vger.kernel.org X-Evolution-Source: imap://sven@sx.thebigcorporation.com/ Content-Transfer-Encoding: 8bit [Minor update to avoid a compiler warning in the case of DEBUG_KERNEL=n] Compile fix for PREEMPT_TIMING on and IRQSOFF_TIMING off The per-cpu trace_cpu_idle variable is used when timing *either* IRQs-off or preempt sections. 
Signed-off-by: Kevin Hilman --- kernel/latency_trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -88,9 +88,13 @@ static inline int DEBUG_WARN_ON(int cond } #endif +#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || \ + (defined(CONFIG_CRITICAL_PREEMPT_TIMING) && defined(CONFIG_TRACE_IRQFLAGS)) + static DEFINE_PER_CPU(int, trace_cpu_idle); +#endif + #ifdef CONFIG_CRITICAL_IRQSOFF_TIMING # ifdef CONFIG_CRITICAL_PREEMPT_TIMING - static DEFINE_PER_CPU(int, trace_cpu_idle); # define irqs_off_preempt_count() (preempt_count() && !__get_cpu_var(trace_cpu_idle)) # else # define irqs_off_preempt_count() 0 patches/hpet-build-fix.patch0000664000077200007720000000064610646635217015403 0ustar mingomingo--- arch/i386/kernel/hpet.c | 1 + 1 file changed, 1 insertion(+) Index: linux-rt.q/arch/i386/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/hpet.c +++ linux-rt.q/arch/i386/kernel/hpet.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include patches/clockevents-remove-unused-code.patch0000664000077200007720000000317110646635210020573 0ustar mingomingoSubject: Clockevents remove clockevents_{release,request}_device From: Andi Kleen Not called by anything in tree. 
Removed the prototypes as well [tglx] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 41 ----------------------------------------- 1 file changed, 41 deletions(-) Index: linux-rt.q/kernel/time/clockevents.c =================================================================== --- linux-rt.q.orig/kernel/time/clockevents.c +++ linux-rt.q/kernel/time/clockevents.c @@ -205,47 +205,6 @@ void clockevents_exchange_device(struct } /** - * clockevents_request_device - */ -struct clock_event_device *clockevents_request_device(unsigned int features, - cpumask_t cpumask) -{ - struct clock_event_device *cur, *dev = NULL; - struct list_head *tmp; - - spin_lock(&clockevents_lock); - - list_for_each(tmp, &clockevent_devices) { - cur = list_entry(tmp, struct clock_event_device, list); - - if ((cur->features & features) == features && - cpus_equal(cpumask, cur->cpumask)) { - if (!dev || dev->rating < cur->rating) - dev = cur; - } - } - - clockevents_exchange_device(NULL, dev); - - spin_unlock(&clockevents_lock); - - return dev; -} - -/** - * clockevents_release_device - */ -void clockevents_release_device(struct clock_event_device *dev) -{ - spin_lock(&clockevents_lock); - - clockevents_exchange_device(dev, NULL); - clockevents_notify_released(); - - spin_unlock(&clockevents_lock); -} - -/** * clockevents_notify - notification about relevant events */ void clockevents_notify(unsigned long reason, void *arg) patches/ppc-read-persistent-clock.patch0000664000077200007720000001010510646635213017527 0ustar mingomingoFrom sshtylyov@ru.mvista.com Thu May 17 20:11:33 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from imap.sh.mvista.com (unknown [63.81.120.155]) by mail.tglx.de (Postfix) with ESMTP id 069DD65C065 for ; Thu, 17 May 2007 20:11:33 +0200 (CEST) Received: from wasted.dev.rtsoft.ru 
(unknown [10.150.0.9]) by imap.sh.mvista.com (Postfix) with ESMTP id 928713EC9; Thu, 17 May 2007 11:11:28 -0700 (PDT) From: Sergei Shtylyov (by way of Sergei Shtylyov ) Organization: MontaVista Software Inc. Subject: [PATCH 2.6.21-rt2] PowerPC: implement read_persistent_clock() Date: Thu, 17 May 2007 22:13:01 +0400 User-Agent: KMail/1.5 To: tglx@linutronix.de, mingo@elte.hu MIME-Version: 1.0 Content-Disposition: inline Cc: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org, johnstul@us.ibm.com Content-Type: text/plain; charset="iso-8859-1" Message-Id: <200705172213.01877.sshtylyov@ru.mvista.com> X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Here's the read_persistent_clock() implementation for PowerPC. I'm deliberately renaming get_boot_time() despite it's not static as it doesn't get called from anywhere else. Signed-off-by: Sergei Shtylyov --- Have almost forgotten about this one... :-) This patch hasn't received a good testing though -- at least it doesn't break without RTC... 
;-) arch/powerpc/kernel/time.c | 62 ++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 34 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/time.c +++ linux-rt.q/arch/powerpc/kernel/time.c @@ -633,31 +633,46 @@ void __init generic_calibrate_decr(void) #endif } -unsigned long get_boot_time(void) +unsigned long read_persistent_clock(void) { - struct rtc_time tm; + unsigned long time = 0; + static int first = 1; + + if (first && ppc_md.time_init) { + timezone_offset = ppc_md.time_init(); + + /* If platform provided a timezone (pmac), we correct the time */ + if (timezone_offset) { + sys_tz.tz_minuteswest = -timezone_offset / 60; + sys_tz.tz_dsttime = 0; + } + } if (ppc_md.get_boot_time) - return ppc_md.get_boot_time(); - if (!ppc_md.get_rtc_time) - return 0; - ppc_md.get_rtc_time(&tm); - return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); + time = ppc_md.get_boot_time(); + else if (ppc_md.get_rtc_time) { + struct rtc_time tm; + + ppc_md.get_rtc_time(&tm); + time = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + } + time -= timezone_offset; + + if (first) { + last_rtc_update = time; + first = 0; + } + return time; } /* This function is only called on the boot processor */ void __init time_init(void) { - unsigned long flags; - unsigned long tm = 0; struct div_result res; u64 scale, x; unsigned shift; - if (ppc_md.time_init != NULL) - timezone_offset = ppc_md.time_init(); - if (__USE_RTC()) { /* 601 processor: dec counts down by 128 every 128ns */ ppc_tb_freq = 1000000000; @@ -730,27 +745,6 @@ void __init time_init(void) tb_to_ns_scale = scale; tb_to_ns_shift = shift; - tm = get_boot_time(); - - write_seqlock_irqsave(&xtime_lock, flags); - - /* If platform provided a timezone (pmac), we correct the time */ - if (timezone_offset) { - 
sys_tz.tz_minuteswest = -timezone_offset / 60; - sys_tz.tz_dsttime = 0; - tm -= timezone_offset; - } - - xtime.tv_sec = tm; - xtime.tv_nsec = 0; - - time_freq = 0; - - last_rtc_update = xtime.tv_sec; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); - /* Not exact, but the timer interrupt takes care of this */ set_dec(tb_ticks_per_jiffy); } patches/slob-scale-no-bigblock-list.patch0000664000077200007720000000735610646635211017742 0ustar mingomingoThis patch uses the mem_map pages to find the bigblock descriptor for large allocations. -- Steve Signed-off-by: Steven Rostedt mm/slob.c | 74 ++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 33 deletions(-) Index: linux-rt.q/mm/slob.c =================================================================== --- linux-rt.q.orig/mm/slob.c +++ linux-rt.q/mm/slob.c @@ -50,7 +50,6 @@ typedef struct slob_block slob_t; struct bigblock { int order; void *pages; - struct bigblock *next; }; typedef struct bigblock bigblock_t; @@ -66,14 +65,42 @@ struct slob_rcu { static slob_t arena = { .next = &arena, .units = 1 }; static slob_t *slobfree = &arena; -static bigblock_t *bigblocks; static DEFINE_SPINLOCK(slob_lock); -static DEFINE_SPINLOCK(block_lock); static void slob_free(void *b, int size); static void slob_timer_cbk(void); +#define __get_slob_block(b) ((unsigned long)(b) & ~(PAGE_SIZE-1)) + +static inline struct page *get_slob_page(const void *mem) +{ + void *virt = (void*)__get_slob_block(mem); + + return virt_to_page(virt); +} + +static inline void zero_slob_block(const void *b) +{ + struct page *page; + page = get_slob_page(b); + memset(&page->lru, 0, sizeof(page->lru)); +} + +static inline void *get_slob_block(const void *b) +{ + struct page *page; + page = get_slob_page(b); + return page->lru.next; +} + +static inline void set_slob_block(const void *b, void *data) +{ + struct page *page; + page = 
get_slob_page(b); + page->lru.next = data; +} + static void *slob_alloc(size_t size, gfp_t gfp, int align) { slob_t *prev, *cur, *aligned = 0; @@ -120,6 +147,7 @@ static void *slob_alloc(size_t size, gfp if (!cur) return 0; + zero_slob_block(cur); slob_free(cur, PAGE_SIZE); spin_lock_irqsave(&slob_lock, flags); cur = slobfree; @@ -165,7 +193,6 @@ void *__kmalloc(size_t size, gfp_t gfp) { slob_t *m; bigblock_t *bb; - unsigned long flags; if (size < PAGE_SIZE - SLOB_UNIT) { m = slob_alloc(size + SLOB_UNIT, gfp, 0); @@ -180,10 +207,7 @@ void *__kmalloc(size_t size, gfp_t gfp) bb->pages = (void *)__get_free_pages(gfp, bb->order); if (bb->pages) { - spin_lock_irqsave(&block_lock, flags); - bb->next = bigblocks; - bigblocks = bb; - spin_unlock_irqrestore(&block_lock, flags); + set_slob_block(bb->pages, bb); return bb->pages; } @@ -227,25 +251,16 @@ EXPORT_SYMBOL(krealloc); void kfree(const void *block) { - bigblock_t *bb, **last = &bigblocks; - unsigned long flags; + bigblock_t *bb; if (!block) return; - if (!((unsigned long)block & (PAGE_SIZE-1))) { - /* might be on the big block list */ - spin_lock_irqsave(&block_lock, flags); - for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { - if (bb->pages == block) { - *last = bb->next; - spin_unlock_irqrestore(&block_lock, flags); - free_pages((unsigned long)block, bb->order); - slob_free(bb, sizeof(bigblock_t)); - return; - } - } - spin_unlock_irqrestore(&block_lock, flags); + bb = get_slob_block(block); + if (bb) { + free_pages((unsigned long)block, bb->order); + slob_free(bb, sizeof(bigblock_t)); + return; } slob_free((slob_t *)block - 1, 0); @@ -257,20 +272,13 @@ EXPORT_SYMBOL(kfree); size_t ksize(const void *block) { bigblock_t *bb; - unsigned long flags; if (!block) return 0; - if (!((unsigned long)block & (PAGE_SIZE-1))) { - spin_lock_irqsave(&block_lock, flags); - for (bb = bigblocks; bb; bb = bb->next) - if (bb->pages == block) { - spin_unlock_irqrestore(&slob_lock, flags); - return PAGE_SIZE << bb->order; - } 
- spin_unlock_irqrestore(&block_lock, flags); - } + bb = get_slob_block(block); + if (bb) + return PAGE_SIZE << bb->order; return ((slob_t *)block - 1)->units * SLOB_UNIT; } patches/tasklet-busy-loop-hack.patch0000664000077200007720000000322010646635214017050 0ustar mingomingo--- include/linux/interrupt.h | 6 ++---- kernel/softirq.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) Index: linux-rt.q/include/linux/interrupt.h =================================================================== --- linux-rt.q.orig/include/linux/interrupt.h +++ linux-rt.q/include/linux/interrupt.h @@ -374,10 +374,8 @@ static inline void tasklet_unlock(struct clear_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} +extern void tasklet_unlock_wait(struct tasklet_struct *t); + #else # define tasklet_trylock(t) 1 # define tasklet_tryunlock(t) 1 Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -640,6 +641,25 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) + +void tasklet_unlock_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + /* + * Hack for now to avoid this busy-loop: + */ +#ifdef CONFIG_PREEMPT_RT + msleep(1); +#else + barrier(); +#endif + } +} +EXPORT_SYMBOL(tasklet_unlock_wait); + +#endif + static int ksoftirqd(void * __data) { struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 }; patches/ppc-mark-notrace-mainline.patch0000664000077200007720000000106110646635212017502 0ustar mingomingo--- arch/powerpc/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: 
linux-rt.q/arch/powerpc/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/irq.c +++ linux-rt.q/arch/powerpc/kernel/irq.c @@ -114,7 +114,7 @@ static inline void set_soft_enabled(unsi : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -void local_irq_restore(unsigned long en) +void notrace local_irq_restore(unsigned long en) { /* * get_paca()->soft_enabled = en; patches/percpu-locked-netfilter.patch0000664000077200007720000001024610646635214017303 0ustar mingomingo net/core/flow.c | 22 ++++++++++++++-------- net/ipv4/netfilter/arp_tables.c | 4 ++-- net/ipv4/netfilter/ip_tables.c | 2 +- 3 files changed, 17 insertions(+), 11 deletions(-) --- Index: linux-rt.q/net/core/flow.c =================================================================== --- linux-rt.q.orig/net/core/flow.c +++ linux-rt.q/net/core/flow.c @@ -40,9 +40,10 @@ atomic_t flow_cache_genid = ATOMIC_INIT( static u32 flow_hash_shift; #define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) +static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables); + +#define flow_table(cpu) (per_cpu_var_locked(flow_tables, cpu)) static struct kmem_cache *flow_cachep __read_mostly; @@ -172,24 +173,24 @@ static int flow_key_compare(struct flowi void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { - struct flow_cache_entry *fle, **head = NULL /* shut up GCC */; + struct flow_cache_entry **table, *fle, **head = NULL /* shut up GCC */; unsigned int hash; int cpu; local_bh_disable(); - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. 
--RR */ - if (!flow_table(cpu)) + if (!table) goto nocache; if (flow_hash_rnd_recalc(cpu)) flow_new_hash_rnd(cpu); hash = flow_hash_code(key, cpu); - head = &flow_table(cpu)[hash]; + head = &table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && @@ -199,6 +200,7 @@ void *flow_cache_lookup(struct flowi *ke if (ret) atomic_inc(fle->object_ref); + put_cpu_var_locked(flow_tables, cpu); local_bh_enable(); return ret; @@ -224,6 +226,8 @@ void *flow_cache_lookup(struct flowi *ke } nocache: + put_cpu_var_locked(flow_tables, cpu); + { int err; void *obj; @@ -253,14 +257,15 @@ nocache: static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; + struct flow_cache_entry **table; int i; int cpu; - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); for (i = 0; i < flow_hash_size; i++) { struct flow_cache_entry *fle; - fle = flow_table(cpu)[i]; + fle = table[i]; for (; fle; fle = fle->next) { unsigned genid = atomic_read(&flow_cache_genid); @@ -271,6 +276,7 @@ static void flow_cache_flush_tasklet(uns atomic_dec(fle->object_ref); } } + put_cpu_var_locked(flow_tables, cpu); if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); Index: linux-rt.q/net/ipv4/netfilter/arp_tables.c =================================================================== --- linux-rt.q.orig/net/ipv4/netfilter/arp_tables.c +++ linux-rt.q/net/ipv4/netfilter/arp_tables.c @@ -241,7 +241,7 @@ unsigned int arpt_do_table(struct sk_buf read_lock_bh(&table->lock); private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + table_base = (void *)private->entries[raw_smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -951,7 +951,7 @@ static int do_add_counters(void __user * i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + 
loc_cpu_entry = private->entries[raw_smp_processor_id()]; ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, Index: linux-rt.q/net/ipv4/netfilter/ip_tables.c =================================================================== --- linux-rt.q.orig/net/ipv4/netfilter/ip_tables.c +++ linux-rt.q/net/ipv4/netfilter/ip_tables.c @@ -240,7 +240,7 @@ ipt_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + table_base = (void *)private->entries[raw_smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ patches/preempt-realtime-warn-and-bug-on-fix.patch0000664000077200007720000000163110646635215021505 0ustar mingomingo To fix the following compile error by enclosing it in ifndef __ASSEMBLY__/endif. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - include/asm-generic/bug.h include/asm-generic/bug.h: Assembler messages: include/asm-generic/bug.h:7: Error: Unrecognized opcode: `extern' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Signed-off-by: Tsutomu Owa -- owa --- include/asm-generic/bug.h | 2 ++ 1 file changed, 2 insertions(+) Index: linux-rt.q/include/asm-generic/bug.h =================================================================== --- linux-rt.q.orig/include/asm-generic/bug.h +++ linux-rt.q/include/asm-generic/bug.h @@ -3,7 +3,9 @@ #include +#ifndef __ASSEMBLY__ extern void __WARN_ON(const char *func, const char *file, const int line); +#endif /* __ASSEMBLY__ */ #ifdef CONFIG_BUG patches/preempt-irqs-i386-idle-poll-loop-fix.patch0000664000077200007720000000077410646635215021313 0ustar mingomingo--- arch/i386/kernel/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/process.c 
=================================================================== --- linux-rt.q.orig/arch/i386/kernel/process.c +++ linux-rt.q/arch/i386/kernel/process.c @@ -136,7 +136,9 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { - cpu_relax(); + do { + cpu_relax(); + } while (!need_resched() && !need_resched_delayed()); } #ifdef CONFIG_HOTPLUG_CPU patches/latency-tracing-exclude-printk.patch0000664000077200007720000000175210646635212020574 0ustar mingomingo kernel/printk.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) Index: linux-rt.q/kernel/printk.c =================================================================== --- linux-rt.q.orig/kernel/printk.c +++ linux-rt.q/kernel/printk.c @@ -328,8 +328,23 @@ static void __call_console_drivers(unsig for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) + (con->flags & CON_ANYTIME))) { + /* + * Disable tracing of printk details - it just + * clobbers the trace output with lots of + * repetitive lines (especially if console is + * on a serial line): + */ +#ifdef CONFIG_EVENT_TRACE + int trace_save = trace_enabled; + + trace_enabled = 0; + con->write(con, &LOG_BUF(start), end - start); + trace_enabled = trace_save; +#else con->write(con, &LOG_BUF(start), end - start); +#endif + } } touch_critical_timing(); } patches/Add-dev-rmem-device-driver-for-real-time-JVM-testing.patch0000664000077200007720000001317110646635216024254 0ustar mingomingoAdd /dev/rmem device driver for real-time JVM testing From: Theodore Ts'o This kernel modules is needed for use by the TCK conformance test which tests the JVM's RTSJ implementation. Unfortunately, RTSJ requires that Java programs have direct access to physical memory, and /dev/mem does not allow mmap to work to anything beyond I/O mapped memory regions on the x86 platform. 
Since this is a spectacularly bad idea (so much for write once, debug everywehere) and could potentially destablize the kernel, set the TAINT_USER flag if available. Signed-off-by: "Theodore Ts'o" --- drivers/char/Kconfig | 11 ++++ drivers/char/Makefile | 1 drivers/char/rmem.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) Index: linux-rt.q/drivers/char/Kconfig =================================================================== --- linux-rt.q.orig/drivers/char/Kconfig +++ linux-rt.q/drivers/char/Kconfig @@ -1115,6 +1115,17 @@ config TELCLOCK /sys/devices/platform/telco_clock, with a number of files for controlling the behavior of this hardware. +config RMEM + tristate "Access to physical memory via /dev/rmem" + default m + help + The /dev/mem device only allows mmap() memory available to + I/O mapped memory; it does not allow access to "real" + physical memory. The /dev/rmem device is a hack which does + allow access to physical memory. We use this instead of + patching /dev/mem because we don't expect this functionality + to ever be accepted into mainline. + config DEVPORT bool depends on !M68K Index: linux-rt.q/drivers/char/Makefile =================================================================== --- linux-rt.q.orig/drivers/char/Makefile +++ linux-rt.q/drivers/char/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_TELCLOCK) += tlclk.o obj-$(CONFIG_BLOCKER) += blocker.o obj-$(CONFIG_LPPTEST) += lpptest.o +obj-$(CONFIG_RMEM) += rmem.o obj-$(CONFIG_WATCHDOG) += watchdog/ obj-$(CONFIG_MWAVE) += mwave/ Index: linux-rt.q/drivers/char/rmem.c =================================================================== --- /dev/null +++ linux-rt.q/drivers/char/rmem.c @@ -0,0 +1,134 @@ +/* + * Rmem - REALLY simple memory mapping demonstration. 
+ * + * Copyright (C) 2005 by Theodore Ts'o + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int rmem_major = 0; +module_param(rmem_major, int, 0444); + +static struct class *rmem_class; + +MODULE_AUTHOR("Theodore Ts'o"); +MODULE_LICENSE("GPL"); + +struct page *rmem_vma_nopage(struct vm_area_struct *vma, + unsigned long address, int *type) +{ + struct page *pageptr; + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long physaddr = address - vma->vm_start + offset; + unsigned long pageframe = physaddr >> PAGE_SHIFT; + + if (!pfn_valid(pageframe)) + return NOPAGE_SIGBUS; + pageptr = pfn_to_page(pageframe); + get_page(pageptr); + if (type) + *type = VM_FAULT_MINOR; + return pageptr; +} + +static struct vm_operations_struct rmem_nopage_vm_ops = { + .nopage = rmem_vma_nopage, +}; + +static int rmem_nopage_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + + if (offset >= __pa(high_memory) || (filp->f_flags & O_SYNC)) + vma->vm_flags |= VM_IO; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &rmem_nopage_vm_ops; +#ifdef TAINT_USER + add_taint(TAINT_USER); +#endif + return 0; +} + 
+static struct file_operations rmem_nopage_ops = { + .owner = THIS_MODULE, + .mmap = rmem_nopage_mmap, +}; + +static struct cdev rmem_cdev = { + .kobj = {.name = "rmem", }, + .owner = THIS_MODULE, +}; + +static int __init rmem_init(void) +{ + int result; + dev_t dev = MKDEV(rmem_major, 0); + + /* Figure out our device number. */ + if (rmem_major) + result = register_chrdev_region(dev, 1, "rmem"); + else { + result = alloc_chrdev_region(&dev, 0, 1, "rmem"); + rmem_major = MAJOR(dev); + } + if (result < 0) { + printk(KERN_WARNING "rmem: unable to get major %d\n", rmem_major); + return result; + } + if (rmem_major == 0) + rmem_major = result; + + cdev_init(&rmem_cdev, &rmem_nopage_ops); + result = cdev_add(&rmem_cdev, dev, 1); + if (result) { + printk (KERN_NOTICE "Error %d adding /dev/rmem", result); + kobject_put(&rmem_cdev.kobj); + unregister_chrdev_region(dev, 1); + return 1; + } + + rmem_class = class_create(THIS_MODULE, "rmem"); + class_device_create(rmem_class, NULL, dev, NULL, "rmem"); + + return 0; +} + + +static void __exit rmem_cleanup(void) +{ + cdev_del(&rmem_cdev); + unregister_chrdev_region(MKDEV(rmem_major, 0), 1); + class_destroy(rmem_class); +} + + +module_init(rmem_init); +module_exit(rmem_cleanup); patches/ppc-highres-dyntick.patch0000664000077200007720000000544710646635213016436 0ustar mingomingoFrom sshtylyov@ru.mvista.com Thu May 17 19:45:16 2007 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.1.7-deb (2006-10-05) on debian X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=AWL autolearn=unavailable version=3.1.7-deb Received: from imap.sh.mvista.com (unknown [63.81.120.155]) by mail.tglx.de (Postfix) with ESMTP id E0E7965C065 for ; Thu, 17 May 2007 19:45:16 +0200 (CEST) Received: from wasted.dev.rtsoft.ru (unknown [10.150.0.9]) by imap.sh.mvista.com (Postfix) with ESMTP id 323023EC9; Thu, 17 May 2007 10:45:13 -0700 (PDT) From: Sergei Shtylyov Organization: MontaVista Software Inc. 
To: tglx@linutronix.de, mingo@elte.hu Subject: [PATCH 2.6.21-rt2] PowerPC: enable HRT and dynticks support Date: Thu, 17 May 2007 21:46:46 +0400 User-Agent: KMail/1.5 Cc: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org MIME-Version: 1.0 Content-Disposition: inline Content-Type: text/plain; charset="iso-8859-1" Message-Id: <200705172146.46769.sshtylyov@ru.mvista.com> X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit Enable HRT and dynamic ticks support for PowerPC. Signed-off-by: Sergei Shtylyov --- This patch has been reworked against the 2.6.21 clockevents framework. It has only been tested on the Book E 32-bit CPU this time, so re-testing on "classic" PowerPC CPUs is needed (there have been issues as of 2.6.18-rt7 but those should now be fixed)... arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/idle.c | 6 ++++++ 2 files changed, 7 insertions(+) Index: linux-rt.q/arch/powerpc/Kconfig =================================================================== --- linux-rt.q.orig/arch/powerpc/Kconfig +++ linux-rt.q/arch/powerpc/Kconfig @@ -420,6 +420,7 @@ config GENERIC_CLOCKEVENTS NOTE: This is not compatible with the deterministic time accounting option on PPC64. 
+source kernel/time/Kconfig source kernel/Kconfig.preempt source "fs/Kconfig.binfmt" Index: linux-rt.q/arch/powerpc/kernel/idle.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/idle.c +++ linux-rt.q/arch/powerpc/kernel/idle.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,8 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { + tick_nohz_stop_sched_tick(); + while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -92,6 +95,9 @@ void cpu_idle(void) ppc64_runlatch_on(); if (cpu_should_die()) cpu_die(); + + tick_nohz_restart_sched_tick(); + preempt_enable_no_resched(); schedule(); preempt_disable(); patches/rtmutex-debug.h-cleanup.patch0000664000077200007720000000277510646635211017230 0ustar mingomingoSubject: [patch] lock debugging: clean up rtmutex-debug.h From: Ingo Molnar style cleanups. Signed-off-by: Ingo Molnar --- kernel/rtmutex-debug.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) Index: linux-rt.q/kernel/rtmutex-debug.h =================================================================== --- linux-rt.q.orig/kernel/rtmutex-debug.h +++ linux-rt.q/kernel/rtmutex-debug.h @@ -17,17 +17,17 @@ extern void debug_rt_mutex_free_waiter(s extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); +extern void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ +# define 
debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline int +debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, int detect) { - return (waiter != NULL); + return waiter != NULL; } patches/s_files-per_cpu-rt.patch0000664000077200007720000000356710646635216016270 0ustar mingomingoSubject: rt: convert the filevec primites for -RT Convert the filevec to -RT primitives. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- fs/file_table.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) Index: linux-rt.q/fs/file_table.c =================================================================== --- linux-rt.q.orig/fs/file_table.c +++ linux-rt.q/fs/file_table.c @@ -253,7 +253,7 @@ struct filevec { struct file *files[FILEVEC_SIZE]; }; -static DEFINE_PER_CPU(struct filevec, sb_fvec); +static DEFINE_PER_CPU_LOCKED(struct filevec, sb_fvec); static inline unsigned int filevec_size(struct filevec *fvec) { @@ -334,20 +334,21 @@ static void __filevec_add(struct filevec static void filevec_add_drain(void) { - struct filevec *fvec = &get_cpu_var(sb_fvec, &cpu); + int cpu; + struct filevec *fvec = &get_cpu_var_locked(sb_fvec, &cpu); if (filevec_count(fvec)) __filevec_add(fvec); - put_cpu_var(sb_fvec, cpu); + put_cpu_var_locked(sb_fvec, cpu); } -static void filevec_add_drain_per_cpu(struct work_struct *dummy) +static void filevec_add_drain_per_cpu(void *dummy) { filevec_add_drain(); } int filevec_add_drain_all(void) { - return schedule_on_each_cpu(filevec_add_drain_per_cpu); + return schedule_on_each_cpu(filevec_add_drain_per_cpu, NULL); } EXPORT_SYMBOL_GPL(filevec_add_drain_all); @@ -400,11 +401,12 @@ void file_move(struct file *file, struct sb = file->f_mapping->host->i_sb; if (list == &sb->s_files.head) { - struct filevec *fvec = &get_cpu_var(sb_fvec, &cpu); + int cpu; + struct filevec *fvec = 
&get_cpu_var_locked(sb_fvec, &cpu); file_flag_set(file, F_SUPERBLOCK); if (!filevec_add(fvec, file)) __filevec_add(fvec); - put_cpu_var(sb_fvec, cpu); + put_cpu_var_locked(sb_fvec, cpu); } else { file_list_lock(); list_add(&file->f_u.fu_list, list); patches/lockdep-lock_set_subclass.patch0000664000077200007720000000716310646635212017677 0ustar mingomingoSubject: [patch] lockdep: lock_set_subclass - reset a held lock's subclass From: Peter Zijlstra this can be used to reset a held lock's subclass, for arbitrary-depth iterated data structures such as trees or lists which have per-node locks. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 4 ++ kernel/lockdep.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) Index: linux-rt.q/include/linux/lockdep.h =================================================================== --- linux-rt.q.orig/include/linux/lockdep.h +++ linux-rt.q/include/linux/lockdep.h @@ -243,6 +243,9 @@ extern void lock_acquire(struct lockdep_ extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); +extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, + unsigned long ip); + # define INIT_LOCKDEP .lockdep_recursion = 0, #define lockdep_depth(tsk) (debug_locks ? 
(tsk)->lockdep_depth : 0) @@ -259,6 +262,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0) Index: linux-rt.q/kernel/lockdep.c =================================================================== --- linux-rt.q.orig/kernel/lockdep.c +++ linux-rt.q/kernel/lockdep.c @@ -2297,6 +2297,55 @@ static int check_unlock(struct task_stru return 1; } +static int +__lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class *class; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_inbalance_bug(curr, lock, ip); + +found_it: + class = register_lock_class(lock, subclass, 0); + hlock->class = class; + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock->class->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->acquire_ip)) + return 0; + } + + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + /* * Remove the lock to the list of currently held locks in a * potentially non-nested (out of order) manner. 
This is a @@ -2494,6 +2543,26 @@ lock_release(struct lockdep_map *lock, i EXPORT_SYMBOL_GPL(lock_release); +void +lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_set_subclass(lock, subclass, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_set_subclass); + /* * Used by the testsuite, sanitize the validator state * after a simulated failure: patches/preempt-irqs-timer.patch0000664000077200007720000001636610646635213016332 0ustar mingomingo--- include/linux/timer.h | 4 + kernel/timer.c | 127 +++++++++++++++++++++++++++++++++++++------------- 2 files changed, 98 insertions(+), 33 deletions(-) Index: linux-rt.q/include/linux/timer.h =================================================================== --- linux-rt.q.orig/include/linux/timer.h +++ linux-rt.q/include/linux/timer.h @@ -154,10 +154,12 @@ static inline void add_timer(struct time __mod_timer(timer, timer->expires); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + extern int timer_pending_sync(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else +# define timer_pending_sync(t) timer_pending(t) # define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif Index: linux-rt.q/kernel/timer.c =================================================================== --- linux-rt.q.orig/kernel/timer.c +++ linux-rt.q/kernel/timer.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ typedef struct tvec_root_s { struct tvec_t_base_s { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; 
unsigned long timer_jiffies; tvec_root_t tv1; tvec_t tv2; @@ -248,9 +250,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_relative static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { -#ifdef CONFIG_SMP base->running_timer = timer; -#endif } static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) @@ -380,7 +380,7 @@ int __mod_timer(struct timer_list *timer { tvec_base_t *base, *new_base; unsigned long flags; - int ret = 0; + int ret = 0, cpu; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -392,7 +392,8 @@ int __mod_timer(struct timer_list *timer ret = 1; } - new_base = __get_cpu_var(tvec_bases); + cpu = raw_smp_processor_id(); + new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { /* @@ -441,6 +442,17 @@ void add_timer_on(struct timer_list *tim spin_unlock_irqrestore(&base->lock, flags); } +/* + * Wait for a running timer + */ +void wait_for_running_timer(struct timer_list *timer) +{ + tvec_base_t *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} /** * mod_timer - modify a timer's timeout @@ -512,7 +524,35 @@ int del_timer(struct timer_list *timer) EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) +/* + * This function checks whether a timer is active and not running on any + * CPU. Upon successful (ret >= 0) exit the timer is not queued and the + * handler is not running on any CPU. + * + * It must not be called from interrupt contexts. 
+ */ +int timer_pending_sync(struct timer_list *timer) +{ + tvec_base_t *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) + ret = 1; +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + + /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del @@ -569,7 +609,7 @@ int del_timer_sync(struct timer_list *ti int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } @@ -615,6 +655,20 @@ static inline void __run_timers(tvec_bas struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; + if (softirq_need_resched()) { + spin_unlock_irq(&base->lock); + wake_up(&base->wait_for_running_timer); + cond_resched_softirq_context(); + cpu_relax(); + spin_lock_irq(&base->lock); + /* + * We can simply continue after preemption, nobody + * else can touch timer_jiffies so 'index' is still + * valid. 
Any new jiffy will be taken care of in + * subsequent loops: + */ + } + /* * Cascade timers: */ @@ -642,18 +696,17 @@ static inline void __run_timers(tvec_bas int preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk(KERN_WARNING "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); + print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; } } + set_running_timer(base, NULL); + cond_resched_softirq_context(); spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -816,10 +869,10 @@ void update_process_times(int user_tick) account_user_time(p, jiffies_to_cputime(1)); else account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); + scheduler_tick(); run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); - scheduler_tick(); run_posix_cpu_timers(p); } @@ -865,35 +918,45 @@ static inline void calc_load(unsigned lo } /* - * This function runs timers and the timer-tq in bottom half context. + * Called by the local, per-CPU timer interrupt on SMP. */ -static void run_timer_softirq(struct softirq_action *h) +void run_local_timers(void) { - tvec_base_t *base = __get_cpu_var(tvec_bases); - - hrtimer_run_queues(); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); + raise_softirq(TIMER_SOFTIRQ); + softlockup_tick(); } /* - * Called by the local, per-CPU timer interrupt on SMP. 
+ * Time of day handling: */ -void run_local_timers(void) +static inline void update_times(void) { - raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); + static unsigned long last_tick; + unsigned long ticks, flags; + + write_seqlock_irqsave(&xtime_lock, flags); + ticks = jiffies - last_tick; + if (ticks) { + last_tick += ticks; + update_wall_time(); + calc_load(ticks); + } + write_sequnlock_irqrestore(&xtime_lock, flags); } + /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! + * This function runs timers and the timer-tq in bottom half context. */ -static inline void update_times(unsigned long ticks) +static void run_timer_softirq(struct softirq_action *h) { - update_wall_time(); - calc_load(ticks); + tvec_base_t *base = __get_cpu_var(tvec_bases); + + update_times(); + hrtimer_run_queues(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); } /* @@ -905,7 +968,6 @@ static inline void update_times(unsigned void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1241,6 +1303,7 @@ static int __devinit init_timers_cpu(int spin_lock_init(&base->lock); lockdep_set_class(&base->lock, base_lock_keys + cpu); + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); patches/preempt-irqs-ppc.patch0000664000077200007720000001175510646635214015772 0ustar mingomingo--- arch/powerpc/kernel/entry_32.S | 6 +++--- arch/powerpc/kernel/irq.c | 2 -- arch/powerpc/kernel/ppc_ksyms.c | 1 - arch/powerpc/platforms/iseries/setup.c | 6 ++++-- arch/powerpc/platforms/pseries/setup.c | 6 ++++-- include/asm-powerpc/thread_info.h | 3 +++ 6 files changed, 14 insertions(+), 10 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/entry_32.S =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/entry_32.S +++ linux-rt.q/arch/powerpc/kernel/entry_32.S @@ -641,7 
+641,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -863,7 +863,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -877,7 +877,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user Index: linux-rt.q/arch/powerpc/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/irq.c +++ linux-rt.q/arch/powerpc/kernel/irq.c @@ -94,8 +94,6 @@ extern atomic_t ipi_sent; #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; static inline unsigned long get_hard_enabled(void) Index: linux-rt.q/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/ppc_ksyms.c +++ linux-rt.q/arch/powerpc/kernel/ppc_ksyms.c @@ -175,7 +175,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); Index: linux-rt.q/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/iseries/setup.c +++ linux-rt.q/arch/powerpc/platforms/iseries/setup.c @@ -564,12 +564,14 @@ static 
void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - while (!need_resched() && !hvlpevent_is_pending()) { + while (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); Index: linux-rt.q/arch/powerpc/platforms/pseries/setup.c =================================================================== --- linux-rt.q.orig/arch/powerpc/platforms/pseries/setup.c +++ linux-rt.q/arch/powerpc/platforms/pseries/setup.c @@ -412,7 +412,8 @@ static void pseries_dedicated_idle_sleep set_thread_flag(TIF_POLLING_NRFLAG); while (get_tb() < start_snooze) { - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; ppc64_runlatch_off(); HMT_low(); @@ -423,7 +424,8 @@ static void pseries_dedicated_idle_sleep clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); local_irq_disable(); - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; } Index: linux-rt.q/include/asm-powerpc/thread_info.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/thread_info.h +++ linux-rt.q/include/asm-powerpc/thread_info.h @@ -120,6 +120,7 @@ static inline struct thread_info *curren #define TIF_MEMDIE 10 #define TIF_SECCOMP 11 /* secure computing */ #define TIF_RESTOREALL 12 /* Restore all regs (implies NOERROR) */ +#define TIF_NEED_RESCHED_DELAYED 13 /* reschedule on return to userspace */ #define TIF_NOERROR 14 /* Force successful syscall return */ #define TIF_RESTORE_SIGMASK 15 /* Restore signal mask in do_signal */ #define TIF_FREEZE 16 /* Freezing for suspend */ @@ -140,6 +141,8 @@ static inline struct thread_info *curren 
#define _TIF_NOERROR (1< A bugfix in ich5 hpet force detect which caused resumes to fail. Thanks to Udo A Steinberg for reporting the problem. Signed-off-by: Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andi Kleen Cc: john stultz Cc: Greg KH Signed-off-by: Andrew Morton --- arch/i386/kernel/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/quirks.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/quirks.c +++ linux-rt.q/arch/i386/kernel/quirks.c @@ -201,7 +201,6 @@ static void old_ich_force_enable_hpet(st force_hpet_address = 0xFED00000 | (val << 12); printk(KERN_DEBUG "HPET at base address 0x%lx\n", force_hpet_address); - cached_dev = dev; return; } @@ -223,6 +222,7 @@ static void old_ich_force_enable_hpet(st force_hpet_address = 0xFED00000 | (val << 12); printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", force_hpet_address); + cached_dev = dev; force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; return; } patches/latency-tracing-i386-paravirt-fastcall.patch0000664000077200007720000002353610646635212021750 0ustar mingomingo--- include/asm-i386/paravirt.h | 189 ++++++++++++++++++++++---------------------- 1 file changed, 98 insertions(+), 91 deletions(-) Index: linux-rt.q/include/asm-i386/paravirt.h =================================================================== --- linux-rt.q.orig/include/asm-i386/paravirt.h +++ linux-rt.q/include/asm-i386/paravirt.h @@ -37,7 +37,7 @@ struct paravirt_ops { unsigned int kernel_rpl; int shared_kernel_pmd; - int paravirt_enabled; + int paravirt_enabled; const char *name; /* @@ -72,27 +72,27 @@ struct paravirt_ops int (*set_wallclock)(unsigned long); /* cpuid emulation, mostly so that caps bits can be disabled */ - void (*cpuid)(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx); + void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int 
*edx); /* hooks for various privileged instructions */ - unsigned long (*get_debugreg)(int regno); - void (*set_debugreg)(int regno, unsigned long value); + unsigned long (fastcall *get_debugreg)(int regno); + void (fastcall *set_debugreg)(int regno, unsigned long value); - void (*clts)(void); + void (fastcall *clts)(void); - unsigned long (*read_cr0)(void); - void (*write_cr0)(unsigned long); + unsigned long (fastcall *read_cr0)(void); + void (fastcall *write_cr0)(unsigned long); - unsigned long (*read_cr2)(void); - void (*write_cr2)(unsigned long); + unsigned long (fastcall *read_cr2)(void); + void (fastcall *write_cr2)(unsigned long); - unsigned long (*read_cr3)(void); - void (*write_cr3)(unsigned long); + unsigned long (fastcall *read_cr3)(void); + void (fastcall *write_cr3)(unsigned long); - unsigned long (*read_cr4_safe)(void); - unsigned long (*read_cr4)(void); - void (*write_cr4)(unsigned long); + unsigned long (fastcall *read_cr4_safe)(void); + unsigned long (fastcall *read_cr4)(void); + void (fastcall *write_cr4)(unsigned long); /* * Get/set interrupt state. save_fl and restore_fl are only @@ -100,122 +100,129 @@ struct paravirt_ops * returned from save_fl are undefined, and may be ignored by * restore_fl. */ - unsigned long (*save_fl)(void); - void (*restore_fl)(unsigned long); - void (*irq_disable)(void); - void (*irq_enable)(void); - void (*safe_halt)(void); - void (*halt)(void); + unsigned long (fastcall *save_fl)(void); + void (*fastcall restore_fl)(unsigned long); + void (fastcall *irq_disable)(void); + void (fastcall *irq_enable)(void); + void (fastcall *safe_halt)(void); + void (fastcall *halt)(void); - void (*wbinvd)(void); + void (fastcall *wbinvd)(void); /* MSR, PMC and TSR operations. err = 0/-EFAULT. wrmsr returns 0/-EFAULT. 
*/ - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, u64 val); + u64 (fastcall *read_msr)(unsigned int msr, int *err); + int (fastcall *write_msr)(unsigned int msr, u64 val); - u64 (*read_tsc)(void); - u64 (*read_pmc)(void); - u64 (*get_scheduled_cycles)(void); + u64 (fastcall *read_tsc)(void); + u64 (fastcall *read_pmc)(void); + u64 (fastcall *get_scheduled_cycles)(void); unsigned long (*get_cpu_khz)(void); /* Segment descriptor handling */ - void (*load_tr_desc)(void); - void (*load_gdt)(const struct Xgt_desc_struct *); - void (*load_idt)(const struct Xgt_desc_struct *); - void (*store_gdt)(struct Xgt_desc_struct *); - void (*store_idt)(struct Xgt_desc_struct *); - void (*set_ldt)(const void *desc, unsigned entries); - unsigned long (*store_tr)(void); - void (*load_tls)(struct thread_struct *t, unsigned int cpu); - void (*write_ldt_entry)(struct desc_struct *, - int entrynum, u32 low, u32 high); - void (*write_gdt_entry)(struct desc_struct *, - int entrynum, u32 low, u32 high); - void (*write_idt_entry)(struct desc_struct *, - int entrynum, u32 low, u32 high); - void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t); + void (fastcall *load_tr_desc)(void); + void (fastcall *load_gdt)(const struct Xgt_desc_struct *); + void (fastcall *load_idt)(const struct Xgt_desc_struct *); + void (fastcall *store_gdt)(struct Xgt_desc_struct *); + void (fastcall *store_idt)(struct Xgt_desc_struct *); + void (fastcall *set_ldt)(const void *desc, unsigned entries); + unsigned long (fastcall *store_tr)(void); + void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu); + void (fastcall *write_ldt_entry)(struct desc_struct *, + int entrynum, u32 low, u32 high); + void (fastcall *write_gdt_entry)(struct desc_struct *, + int entrynum, u32 low, u32 high); + void (fastcall *write_idt_entry)(struct desc_struct *, + int entrynum, u32 low, u32 high); + void (fastcall *load_esp0)(struct tss_struct *tss, + struct thread_struct *t); - 
void (*set_iopl_mask)(unsigned mask); - void (*io_delay)(void); + void (fastcall *set_iopl_mask)(unsigned mask); + void (fastcall *io_delay)(void); /* * Hooks for intercepting the creation/use/destruction of an * mm_struct. */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); - void (*exit_mmap)(struct mm_struct *mm); + void (fastcall *activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (fastcall *dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (fastcall *exit_mmap)(struct mm_struct *mm); #ifdef CONFIG_X86_LOCAL_APIC /* * Direct APIC operations, principally for VMI. Ideally * these shouldn't be in this interface. */ - void (*apic_write)(unsigned long reg, unsigned long v); - void (*apic_write_atomic)(unsigned long reg, unsigned long v); - unsigned long (*apic_read)(unsigned long reg); + void (fastcall *apic_write)(unsigned long reg, unsigned long v); + void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v); + unsigned long (fastcall *apic_read)(unsigned long reg); void (*setup_boot_clock)(void); void (*setup_secondary_clock)(void); - void (*startup_ipi_hook)(int phys_apicid, - unsigned long start_eip, - unsigned long start_esp); + void (fastcall *startup_ipi_hook)(int phys_apicid, + unsigned long start_eip, + unsigned long start_esp); #endif /* TLB operations */ - void (*flush_tlb_user)(void); - void (*flush_tlb_kernel)(void); - void (*flush_tlb_single)(unsigned long addr); - void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, - unsigned long va); + void (fastcall *flush_tlb_user)(void); + void (fastcall *flush_tlb_kernel)(void); + void (fastcall *flush_tlb_single)(unsigned long addr); + void (fastcall *flush_tlb_others)(const cpumask_t *cpus, + struct mm_struct *mm, + unsigned long va); /* Hooks for allocating/releasing pagetable pages */ - void (*alloc_pt)(u32 pfn); - void (*alloc_pd)(u32 pfn); - void 
(*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); - void (*release_pt)(u32 pfn); - void (*release_pd)(u32 pfn); + void (fastcall *alloc_pt)(u32 pfn); + void (fastcall *alloc_pd)(u32 pfn); + void (fastcall *alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, + u32 count); + void (fastcall *release_pt)(u32 pfn); + void (fastcall *release_pd)(u32 pfn); /* Pagetable manipulation functions */ - void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); + void (fastcall *set_pte)(pte_t *ptep, pte_t pteval); + void (fastcall *set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); + void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (fastcall *pte_update)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (fastcall *pte_update_defer)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); #ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); + void *(fastcall *kmap_atomic_pte)(struct page *page, enum km_type type); #endif #ifdef CONFIG_X86_PAE - void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); - void (*set_pud)(pud_t *pudp, pud_t pudval); - void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - void (*pmd_clear)(pmd_t *pmdp); - - unsigned long long (*pte_val)(pte_t); - unsigned long long (*pmd_val)(pmd_t); - unsigned long long (*pgd_val)(pgd_t); - - pte_t (*make_pte)(unsigned long long pte); - pmd_t (*make_pmd)(unsigned long long pmd); - pgd_t (*make_pgd)(unsigned long long pgd); + void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (fastcall 
*set_pte_present)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte); + void (fastcall *set_pud)(pud_t *pudp, pud_t pudval); + void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (fastcall *pmd_clear)(pmd_t *pmdp); + + unsigned long long (fastcall *pte_val)(pte_t); + unsigned long long (fastcall *pmd_val)(pmd_t); + unsigned long long (fastcall *pgd_val)(pgd_t); + + pte_t (fastcall *make_pte)(unsigned long long pte); + pmd_t (fastcall *make_pmd)(unsigned long long pmd); + pgd_t (fastcall *make_pgd)(unsigned long long pgd); #else - unsigned long (*pte_val)(pte_t); - unsigned long (*pgd_val)(pgd_t); + unsigned long (fastcall *pte_val)(pte_t); + unsigned long (fastcall *pgd_val)(pgd_t); - pte_t (*make_pte)(unsigned long pte); - pgd_t (*make_pgd)(unsigned long pgd); + pte_t (fastcall *make_pte)(unsigned long pte); + pgd_t (fastcall *make_pgd)(unsigned long pgd); #endif /* Set deferred update mode, used for batching operations. */ - void (*set_lazy_mode)(enum paravirt_lazy_mode mode); + void (fastcall *set_lazy_mode)(enum paravirt_lazy_mode mode); /* These two are jmp to, not actually called. */ void (*irq_enable_sysexit)(void); patches/rt-mutex-arm-fix.patch0000664000077200007720000000170510646635214015702 0ustar mingomingo--- arch/arm/kernel/semaphore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/arch/arm/kernel/semaphore.c =================================================================== --- linux-rt.q.orig/arch/arm/kernel/semaphore.c +++ linux-rt.q/arch/arm/kernel/semaphore.c @@ -154,7 +154,7 @@ EXPORT_SYMBOL(__compat_down_interruptibl * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. 
*/ -fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) +fastcall int __attribute_used__ __sched __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -176,7 +176,7 @@ fastcall int __attribute_used__ __compat EXPORT_SYMBOL(__compat_down_trylock); -fastcall int compat_sem_is_locked(struct compat_semaphore *sem) +fastcall int __sched compat_sem_is_locked(struct compat_semaphore *sem) { return (int) atomic_read(&sem->count) < 0; } patches/preempt-realtime-powerpc-b2.patch0000664000077200007720000000475410646635215020016 0ustar mingomingo To convert the spinlocks into the raw onces to fix the following warnings/errors. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Badness at arch/powerpc/kernel/entry_64.S:651 Call Trace: [C0000000006133E0] [C00000000000FAAC] show_stack+0x68/0x1b0 (unreliable) [C000000000613480] [C0000000001EF004] .repor000001EF004] .report_bug+0x94/0xe8 [C000000000613510] [C0000000003EAD58] .program_check_exception+0x170/0x5a8 [C00000000000487C] program_check_common+0xfc/0x100 --- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/mm/hash_native_64.c | 2 +- include/asm-powerpc/rtas.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) Index: linux-rt.q/arch/powerpc/kernel/irq.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/irq.c +++ linux-rt.q/arch/powerpc/kernel/irq.c @@ -403,7 +403,7 @@ EXPORT_SYMBOL(do_softirq); #ifdef CONFIG_PPC_MERGE static LIST_HEAD(irq_hosts); -static DEFINE_SPINLOCK(irq_big_lock); +static DEFINE_RAW_SPINLOCK(irq_big_lock); static DEFINE_PER_CPU(unsigned int, irq_radix_reader); static unsigned int irq_radix_writer; struct irq_map_entry irq_map[NR_IRQS]; Index: linux-rt.q/arch/powerpc/kernel/rtas.c =================================================================== --- linux-rt.q.orig/arch/powerpc/kernel/rtas.c +++ 
linux-rt.q/arch/powerpc/kernel/rtas.c @@ -36,7 +36,7 @@ #include struct rtas_t rtas = { - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(lock) }; EXPORT_SYMBOL(rtas); Index: linux-rt.q/arch/powerpc/mm/hash_native_64.c =================================================================== --- linux-rt.q.orig/arch/powerpc/mm/hash_native_64.c +++ linux-rt.q/arch/powerpc/mm/hash_native_64.c @@ -36,7 +36,7 @@ #define HPTE_LOCK_BIT 3 -static DEFINE_SPINLOCK(native_tlbie_lock); +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); static inline void __tlbie(unsigned long va, unsigned int psize) { Index: linux-rt.q/include/asm-powerpc/rtas.h =================================================================== --- linux-rt.q.orig/include/asm-powerpc/rtas.h +++ linux-rt.q/include/asm-powerpc/rtas.h @@ -58,7 +58,7 @@ struct rtas_t { unsigned long entry; /* physical address pointer */ unsigned long base; /* physical address pointer */ unsigned long size; - spinlock_t lock; + raw_spinlock_t lock; struct rtas_args args; struct device_node *dev; /* virtual address pointer */ }; patches/highmem-revert-mainline.patch0000664000077200007720000000120110646635216017262 0ustar mingomingo--- mm/highmem.c | 9 --------- 1 file changed, 9 deletions(-) Index: linux-rt.q/mm/highmem.c =================================================================== --- linux-rt.q.orig/mm/highmem.c +++ linux-rt.q/mm/highmem.c @@ -99,15 +99,6 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } -/* Flush all unused kmap mappings in order to remove stray - mappings. 
*/ -void kmap_flush_unused(void) -{ - spin_lock(&kmap_lock); - flush_all_zero_pkmaps(); - spin_unlock(&kmap_lock); -} - static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; patches/select-error-leak-fix.patch0000664000077200007720000000262210646635216016661 0ustar mingomingoAs it is currently written, sys_select checks its return code to convert ERESTARTNOHAND to EINTR. However, the check is within an if (tvp) clause, and so if select is called from userspace with a NULL timeval, then it is possible for the ERESTARTNOHAND errno to leak into userspace, which is incorrect. This patch moves that check outside of the conditional, and prevents the errno leak. Thanks & Regards Neil Signed-Off-By: Neil Horman fs/select.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) Index: linux-rt.q/fs/select.c =================================================================== --- linux-rt.q.orig/fs/select.c +++ linux-rt.q/fs/select.c @@ -414,20 +414,12 @@ asmlinkage long sys_select(int n, fd_set rtv.tv_sec = timeout; if (timeval_compare(&rtv, &tv) >= 0) rtv = tv; - if (copy_to_user(tvp, &rtv, sizeof(rtv))) { -sticky: - /* - * If an application puts its timeval in read-only - * memory, we don't want the Linux-specific update to - * the timeval to cause a fault after the select has - * completed successfully. However, because we're not - * updating the timeval, we can't restart the system - * call. 
- */ - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - } + if (copy_to_user(tvp, &rtv, sizeof(rtv))) + return -EFAULT; } +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; return ret; } patches/ppc-rename-xmon-mcount.patch0000664000077200007720000000462210646635212017064 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Mon May 14 17:19:36 2007 Date: Mon, 14 May 2007 17:19:36 +0900 From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [patch 4/5] powerpc 2.6.21-rt1: rename mcount variable in xmon to xmon_mcount Rename variable name "mcount" in xmon to xmon_mcount, since it conflicts with mcount() function used by latency trace function. Signed-off-by: Tsutomu OWA -- owa --- From tsutomu.owa@toshiba.co.jp Mon May 14 17:19:36 2007 Date: Mon, 14 May 2007 17:19:36 +0900 From: Tsutomu OWA To: linuxppc-dev@ozlabs.org, linux-kernel@vger.kernel.org Cc: mingo@elte.hu, tglx@linutronix.de Subject: Re: [patch 4/5] powerpc 2.6.21-rt1: rename mcount variable in xmon to xmon_mcount Rename variable name "mcount" in xmon to xmon_mcount, since it conflicts with mcount() function used by latency trace function. Signed-off-by: Tsutomu OWA -- owa --- arch/powerpc/xmon/xmon.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) Index: linux-rt.q/arch/powerpc/xmon/xmon.c =================================================================== --- linux-rt.q.orig/arch/powerpc/xmon/xmon.c +++ linux-rt.q/arch/powerpc/xmon/xmon.c @@ -2129,7 +2129,7 @@ print_address(unsigned long addr) static unsigned long mdest; /* destination address */ static unsigned long msrc; /* source address */ static unsigned long mval; /* byte value to set memory to */ -static unsigned long mcount; /* # bytes to affect */ +static unsigned long xmon_mcount; /* # bytes to affect */ static unsigned long mdiffs; /* max # differences to print */ void @@ -2141,19 +2141,20 @@ memops(int cmd) scanhex((void *)(cmd == 's'? 
&mval: &msrc)); if( termch != '\n' ) termch = 0; - scanhex((void *)&mcount); + scanhex((void *)&xmon_mcount); switch( cmd ){ case 'm': - memmove((void *)mdest, (void *)msrc, mcount); + memmove((void *)mdest, (void *)msrc, xmon_mcount); break; case 's': - memset((void *)mdest, mval, mcount); + memset((void *)mdest, mval, xmon_mcount); break; case 'd': if( termch != '\n' ) termch = 0; scanhex((void *)&mdiffs); - memdiffs((unsigned char *)mdest, (unsigned char *)msrc, mcount, mdiffs); + memdiffs((unsigned char *)mdest, (unsigned char *)msrc, + xmon_mcount, mdiffs); break; } } patches/preempt-irqs-i386.patch0000664000077200007720000001326610646635213015677 0ustar mingomingo--- arch/i386/kernel/i8259.c | 10 ++++++---- arch/i386/kernel/io_apic.c | 23 +++++++---------------- arch/i386/mach-default/setup.c | 4 ++-- arch/i386/mach-visws/visws_apic.c | 2 ++ arch/i386/mach-voyager/setup.c | 4 ++-- 5 files changed, 19 insertions(+), 24 deletions(-) Index: linux-rt.q/arch/i386/kernel/i8259.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/i8259.c +++ linux-rt.q/arch/i386/kernel/i8259.c @@ -170,6 +170,8 @@ static void mask_and_ack_8259A(unsigned */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; + if (irq & 8) + outb(0x60+(irq&7),PIC_SLAVE_CMD); /* 'Specific EOI' to slave */ cached_irq_mask |= irqmask; handle_real_irq: @@ -297,10 +299,10 @@ void init_8259A(int auto_eoi) outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ + if (!auto_eoi) /* master expects normal EOI */ outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + else /* master does Auto EOI */ + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, 
PIC_MASTER_IMR); outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ @@ -350,7 +352,7 @@ static irqreturn_t math_error_irq(int cp * New motherboards sometimes make IRQ 13 be a PCI interrupt, * so allow interrupt sharing. */ -static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; +static struct irqaction fpu_irq = { math_error_irq, IRQF_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL }; void __init init_ISA_irqs (void) { Index: linux-rt.q/arch/i386/kernel/io_apic.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/io_apic.c +++ linux-rt.q/arch/i386/kernel/io_apic.c @@ -261,18 +261,6 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -1278,9 +1266,10 @@ static void ioapic_register_intr(int irq trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else + else { set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); + } set_intr_gate(vector, interrupt[irq]); } @@ -1515,7 +1504,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -2008,8 +1997,10 @@ static void ack_ioapic_quirk_irq(unsigne if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + 
__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } Index: linux-rt.q/arch/i386/mach-default/setup.c =================================================================== --- linux-rt.q.orig/arch/i386/mach-default/setup.c +++ linux-rt.q/arch/i386/mach-default/setup.c @@ -35,7 +35,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -81,7 +81,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-rt.q/arch/i386/mach-visws/visws_apic.c =================================================================== --- linux-rt.q.orig/arch/i386/mach-visws/visws_apic.c +++ linux-rt.q/arch/i386/mach-visws/visws_apic.c @@ -257,11 +257,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; Index: linux-rt.q/arch/i386/mach-voyager/setup.c =================================================================== --- linux-rt.q.orig/arch/i386/mach-voyager/setup.c +++ linux-rt.q/arch/i386/mach-voyager/setup.c @@ -18,7 +18,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { 
@@ -42,7 +42,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; patches/ntp-move-the-cmos-update-code-into-ntpc.patch0000664000077200007720000002061710646635210022136 0ustar mingomingoFrom: Thomas Gleixner i386 and sparc64 have the identical code to update the cmos clock. Move it into kernel/time/ntp.c as there are other architectures coming along with the same requirements. Signed-off-by: Thomas Gleixner Cc: Chris Wright Cc: Ingo Molnar Cc: john stultz Cc: David Miller Cc: Roman Zippel Signed-off-by: Andrew Morton --- arch/i386/Kconfig | 4 +++ arch/i386/kernel/time.c | 50 +------------------------------------- arch/sparc64/Kconfig | 4 +++ arch/sparc64/kernel/time.c | 53 +--------------------------------------- include/asm-i386/timer.h | 1 include/linux/time.h | 2 + kernel/time/ntp.c | 59 ++++++++++++++++++++++++++++++++++++++++++--- 7 files changed, 70 insertions(+), 103 deletions(-) Index: linux-rt.q/arch/i386/Kconfig =================================================================== --- linux-rt.q.orig/arch/i386/Kconfig +++ linux-rt.q/arch/i386/Kconfig @@ -18,6 +18,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config CLOCKSOURCE_WATCHDOG bool default y Index: linux-rt.q/arch/i386/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/time.c +++ linux-rt.q/arch/i386/kernel/time.c @@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void return retval; } -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); -int no_sync_cmos_clock; - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally 
synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } extern void (*late_time_init)(void); Index: linux-rt.q/arch/sparc64/Kconfig =================================================================== --- linux-rt.q.orig/arch/sparc64/Kconfig +++ linux-rt.q/arch/sparc64/Kconfig @@ -23,6 +23,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config GENERIC_CLOCKEVENTS bool default y Index: linux-rt.q/arch/sparc64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/sparc64/kernel/time.c +++ linux-rt.q/arch/sparc64/kernel/time.c @@ -403,58 +403,9 @@ static struct sparc64_tick_ops hbtick_op static unsigned long timer_ticks_per_nsec_quotient __read_mostly; -#define TICK_SIZE (tick_nsec / 1000) - -#define USEC_AFTER 500000 -#define USEC_BEFORE 500000 - -static void sync_cmos_clock(unsigned long dummy); - 
-static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } /* Kick start a stopped clock (procedure from the Sun NVRAM/hostid FAQ). 
*/ Index: linux-rt.q/include/asm-i386/timer.h =================================================================== --- linux-rt.q.orig/include/asm-i386/timer.h +++ linux-rt.q/include/asm-i386/timer.h @@ -11,7 +11,6 @@ unsigned long native_calculate_cpu_khz(v extern int timer_ack; extern int no_timer_check; -extern int no_sync_cmos_clock; extern int recalibrate_cpu_khz(void); #ifndef CONFIG_PARAVIRT Index: linux-rt.q/include/linux/time.h =================================================================== --- linux-rt.q.orig/include/linux/time.h +++ linux-rt.q/include/linux/time.h @@ -93,6 +93,8 @@ extern struct timespec wall_to_monotonic extern seqlock_t xtime_lock __attribute__((weak)); extern unsigned long read_persistent_clock(void); +extern int update_persistent_clock(struct timespec now); +extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); static inline unsigned long get_seconds(void) Index: linux-rt.q/kernel/time/ntp.c =================================================================== --- linux-rt.q.orig/kernel/time/ntp.c +++ linux-rt.q/kernel/time/ntp.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -185,12 +186,64 @@ u64 current_tick_length(void) return tick_length; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE -void __attribute__ ((weak)) notify_arch_cmos_timer(void) +/* Disable the cmos update - used by virtualization and embedded */ +int no_sync_cmos_clock __read_mostly; + +static void sync_cmos_clock(unsigned long dummy); + +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) { - return; + struct timespec now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. 
Thus, we adjust... + */ + if (!ntp_synced()) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). + */ + return; + + getnstimeofday(&now); + if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) + fail = update_persistent_clock(now); + + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); } +static void notify_cmos_timer(void) +{ + if (!no_sync_cmos_clock) + mod_timer(&sync_cmos_timer, jiffies + 1); +} + +#else +static inline void notify_cmos_timer(void) { } +#endif + /* adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ @@ -355,6 +408,6 @@ leave: if ((time_status & (STA_UNSYNC|ST txc->stbcnt = 0; write_sequnlock_irq(&xtime_lock); do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); + notify_cmos_timer(); return(result); } patches/preempt-realtime-powerpc-tlb-batching.patch0000664000077200007720000000374510646635215022050 0ustar mingomingoFrom tsutomu.owa@toshiba.co.jp Tue May 15 15:27:26 2007 Date: Tue, 15 May 2007 15:27:26 +0900 From: Tsutomu OWA To: Arnd Bergmann Cc: linuxppc-dev@ozlabs.org, Thomas Gleixner , mingo@elte.hu, linux-kernel@vger.kernel.org Subject: Re: [patch 4/4] powerpc 2.6.21-rt1: reduce scheduling latency by changing tlb flush size At Mon, 14 May 2007 16:40:02 +0200, Arnd Bergmann wrote: > > +#if defined(CONFIG_PPC_CELLEB) && defined(CONFIG_PREEMPT_RT) > > +/* Since tlb flush takes long time on Celleb, reduce it to 1 when Celleb && RT */ > > +#define PPC64_TLB_BATCH_NR 1 > With this code, you get silent side-effects of enabling PPC_CELLEB > along with another platform. 
> Maybe instead you should change the hpte_need_flush() to always flush > when running on the celleb platform and PREEMPT_RT is enabled. OK, how about this one? thanks a lot! Since flushing tlb needs expensive hypervisor call(s) on celleb, always flush it on RT to reduce scheduling latency. Signed-off-by: Tsutomu OWA -- owa --- arch/powerpc/mm/tlb_64.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) Index: linux-rt.q/arch/powerpc/mm/tlb_64.c =================================================================== --- linux-rt.q.orig/arch/powerpc/mm/tlb_64.c +++ linux-rt.q/arch/powerpc/mm/tlb_64.c @@ -31,6 +31,7 @@ #include #include #include +#include DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); @@ -204,6 +205,18 @@ void hpte_need_flush(struct mm_struct *m batch->pte[i] = rpte; batch->vaddr[i] = vaddr; batch->index = ++i; + +#ifdef CONFIG_PREEMPT_RT + /* + * Since flushing tlb needs expensive hypervisor call(s) on celleb, + * always flush it on RT to reduce scheduling latency. 
+ */ + if (machine_is(celleb)) { + flush_tlb_pending(); + return; + } +#endif /* CONFIG_PREEMPT_RT */ + if (i >= PPC64_TLB_BATCH_NR) __flush_tlb_pending(batch); } patches/fix-softirq-checks-for-non-rt-preempt-hardirq.patch0000664000077200007720000000166310646635216023373 0ustar mingomingo--- include/linux/bottom_half.h | 2 +- kernel/softirq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) Index: linux-rt.q/include/linux/bottom_half.h =================================================================== --- linux-rt.q.orig/include/linux/bottom_half.h +++ linux-rt.q/include/linux/bottom_half.h @@ -1,7 +1,7 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H -#ifdef CONFIG_PREEMPT_RT +#ifdef CONFIG_PREEMPT_HARDIRQS # define local_bh_disable() do { } while (0) # define __local_bh_disable(ip) do { } while (0) # define _local_bh_enable() do { } while (0) Index: linux-rt.q/kernel/softirq.c =================================================================== --- linux-rt.q.orig/kernel/softirq.c +++ linux-rt.q/kernel/softirq.c @@ -140,7 +140,7 @@ static void trigger_softirqs(void) } } -#ifndef CONFIG_PREEMPT_RT +#ifndef CONFIG_PREEMPT_HARDIRQS /* * This one is for softirq.c-internal use, patches/serial-locking-rt-cleanup.patch0000664000077200007720000000214710646635214017530 0ustar mingomingo drivers/serial/8250.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) Index: linux-rt.q/drivers/serial/8250.c =================================================================== --- linux-rt.q.orig/drivers/serial/8250.c +++ linux-rt.q/drivers/serial/8250.c @@ -2456,14 +2456,10 @@ serial8250_console_write(struct console touch_nmi_watchdog(); - local_irq_save(flags); - if (up->port.sysrq) { - /* serial8250_handle_port() already took the lock */ - locked = 0; - } else if (oops_in_progress) { - locked = spin_trylock(&up->port.lock); - } else - spin_lock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); + 
else + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -2485,8 +2481,7 @@ serial8250_console_write(struct console serial_out(up, UART_IER, ier); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init serial8250_console_setup(struct console *co, char *options) patches/mips-gtod_clocksource.patch0000664000077200007720000000205610646635214017053 0ustar mingomingo arch/mips/kernel/time.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) Index: linux-rt.q/arch/mips/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/mips/kernel/time.c +++ linux-rt.q/arch/mips/kernel/time.c @@ -287,6 +287,29 @@ void (*mips_timer_ack)(void); /* last time when xtime and rtc are sync'ed up */ static long last_rtc_update; +unsigned long read_persistent_clock(void) +{ + unsigned long sec; + sec = rtc_mips_get_time(); + return sec; +} + +void sync_persistent_clock(struct timespec ts) +{ + if (ntp_synced() && + xtime.tv_sec > last_rtc_update + 660 && + (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && + (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { + if (rtc_mips_set_mmss(xtime.tv_sec) == 0) { + last_rtc_update = xtime.tv_sec; + } + else { + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } + } +} + /* * local_timer_interrupt() does profiling and process accounting * on a per-CPU basis. patches/ich-force-hpet-ich7-or-later-quirk-to-force-detect-enable.patch0000664000077200007720000001110410646635211025252 0ustar mingomingoFrom: Venki Pallipadi Force detect and/or enable HPET on ICH chipsets. This patch just handles the detection part and following patches use this information. Adds a function to repeat the force enabling during resume time. 
Using HPET this way, instead of PIT increases the time CPUs can reside in C-state when system is totally idle. On my test system with Core 2 Duo, average C-state residency goes up from ~20mS to ~80mS. Signed-off-by: Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andi Kleen Cc: john stultz Cc: Greg KH Signed-off-by: Andrew Morton --- arch/i386/kernel/quirks.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-i386/hpet.h | 1 2 files changed, 102 insertions(+) Index: linux-rt.q/arch/i386/kernel/quirks.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/quirks.c +++ linux-rt.q/arch/i386/kernel/quirks.c @@ -4,6 +4,8 @@ #include #include +#include + #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) @@ -47,3 +49,102 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_IN DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); #endif + +#if defined(CONFIG_HPET_TIMER) +unsigned long force_hpet_address; + +static void __iomem *rcba_base; + +void ich_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address) + return; + + if (rcba_base == NULL) + BUG(); + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + /* HPET disabled in HPTC. 
Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + } + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) + BUG(); + else + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + + return; +} + +static void ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val, rcba; + int err = 0; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xF0, &rcba); + rcba &= 0xFFFFC000; + if (rcba == 0) { + printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n"); + return; + } + + /* use bits 31:14, 16 kB aligned */ + rcba_base = ioremap_nocache(rcba, 0x4000); + if (rcba_base == NULL) { + printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n"); + return; + } + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + + if (val & 0x80) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + iounmap(rcba_base); + return; + } + + /* HPET disabled in HPTC. 
Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + err = 1; + } else { + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + } + + if (err) { + force_hpet_address = 0; + iounmap(rcba_base); + printk(KERN_DEBUG "Failed to force enable HPET\n"); + } else { + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + } +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, + ich_force_enable_hpet); +#endif Index: linux-rt.q/include/asm-i386/hpet.h =================================================================== --- linux-rt.q.orig/include/asm-i386/hpet.h +++ linux-rt.q/include/asm-i386/hpet.h @@ -67,6 +67,7 @@ extern unsigned long hpet_address; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern unsigned long hpet_readl(unsigned long a); +extern void ich_force_hpet_resume(void); #ifdef CONFIG_HPET_EMULATE_RTC patches/jbd_assertions_smp_only.patch0000664000077200007720000000370210646635214017504 0ustar mingomingo fs/jbd/transaction.c | 6 +++--- include/linux/jbd.h | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) Index: linux-rt.q/fs/jbd/transaction.c =================================================================== --- linux-rt.q.orig/fs/jbd/transaction.c +++ linux-rt.q/fs/jbd/transaction.c @@ -1506,7 +1506,7 @@ static void __journal_temp_unlink_buffer transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, 
jbd_is_locked_bh_state(bh)); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -1949,7 +1949,7 @@ void __journal_file_buffer(struct journa int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -2038,7 +2038,7 @@ void __journal_refile_buffer(struct jour int was_dirty; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); Index: linux-rt.q/include/linux/jbd.h =================================================================== --- linux-rt.q.orig/include/linux/jbd.h +++ linux-rt.q/include/linux/jbd.h @@ -276,6 +276,15 @@ void buffer_assertion_failure(struct buf #define J_ASSERT(assert) do { } while (0) #endif /* JBD_ASSERTIONS */ +/* + * For assertions that are only valid on SMP (e.g. spin_is_locked()): + */ +#ifdef CONFIG_SMP +# define J_ASSERT_JH_SMP(jh, expr) J_ASSERT_JH(jh, expr) +#else +# define J_ASSERT_JH_SMP(jh, assert) do { } while (0) +#endif + #if defined(JBD_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) patches/x86_64-remove-unused-code.patch0000664000077200007720000004455610646635211017226 0ustar mingomingoSubject: x86_64: remove now unused code Remove the unused code after the switch to clock events. 
Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 108 ---------- arch/x86_64/kernel/hpet.c | 444 --------------------------------------------- arch/x86_64/kernel/time.c | 42 ---- include/asm-x86_64/apic.h | 6 include/asm-x86_64/proto.h | 7 5 files changed, 1 insertion(+), 606 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -42,9 +42,7 @@ int apic_mapped; int apic_verbosity; -int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; - int disable_apic_timer __initdata; /* Local APIC timer works in C2? */ @@ -130,15 +128,6 @@ static void lapic_timer_broadcast(cpumas #endif } -/* - * cpu_mask that denotes the CPUs that needs timer interrupt coming in as - * IPIs in place of local APIC timers - */ -static cpumask_t timer_interrupt_broadcast_ipi_mask; - -/* Using APIC to generate smp_local_timer_interrupt? */ -int using_apic_timer __read_mostly = 0; - static void apic_pm_activate(void); void apic_wait_icr_idle(void) @@ -974,84 +963,6 @@ void __cpuinit setup_secondary_APIC_cloc setup_APIC_timer(); } -void disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - /* - * When an illegal vector value (0-15) is written to an LVT - * entry and delivery mode is Fixed, the APIC may signal an - * illegal vector error, with out regard to whether the mask - * bit is set or whether an interrupt is actually seen on input. - * - * Boot sequence might call this function when the LVTT has - * '0' vector value. So make sure vector field is set to - * valid value. 
- */ - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - } -} - -void enable_APIC_timer(void) -{ - int cpu = smp_processor_id(); - - if (using_apic_timer && - !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED); - } -} - -void switch_APIC_timer_to_ipi(void *cpumask) -{ - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - disable_APIC_timer(); - cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); - } -} -EXPORT_SYMBOL(switch_APIC_timer_to_ipi); - -void smp_send_timer_broadcast_ipi(void) -{ - int cpu = smp_processor_id(); - cpumask_t mask; - - cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask); - - if (cpu_isset(cpu, mask)) { - cpu_clear(cpu, mask); - add_pda(apic_timer_irqs, 1); - smp_local_timer_interrupt(); - } - - if (!cpus_empty(mask)) { - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); - } -} - -void switch_ipi_to_APIC_timer(void *cpumask) -{ - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask); - enable_APIC_timer(); - } -} -EXPORT_SYMBOL(switch_ipi_to_APIC_timer); - int setup_profiling_timer(unsigned int multiplier) { return -EINVAL; @@ -1199,7 +1110,6 @@ asmlinkage void smp_spurious_interrupt(v v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); - irq_exit(); } @@ -1298,21 +1208,7 @@ static __init int setup_noapictimer(char disable_apic_timer = 1; return 1; } - -static __init int setup_apicmaintimer(char *str) -{ - apic_runs_main_timer = 1; - - return 1; -} -__setup("apicmaintimer", setup_apicmaintimer); - -static __init int setup_noapicmaintimer(char *str) -{ - apic_runs_main_timer = -1; 
- return 1; -} -__setup("noapicmaintimer", setup_noapicmaintimer); +__setup("noapictimer", setup_noapictimer); static __init int setup_apicpmtimer(char *s) { @@ -1322,5 +1218,3 @@ static __init int setup_apicpmtimer(char } __setup("apicpmtimer", setup_apicpmtimer); -__setup("noapictimer", setup_noapictimer); - Index: linux-rt.q/arch/x86_64/kernel/hpet.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/hpet.c +++ /dev/null @@ -1,444 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define HPET_MASK 0xFFFFFFFF -#define HPET_SHIFT 22 - -/* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000 - -int nohpet __initdata; - -unsigned long hpet_address; -unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ - -int hpet_use_timer; /* Use counter of hpet for time keeping, - * otherwise PIT - */ - -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!hpet_address) - return 0; - - memset(&hd, 0, sizeof(hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. 
- */ - hd.hd_phys_address = hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - return 0; -} -fs_initcall(late_hpet_init); -#endif - -int hpet_timer_stop_set_go(unsigned long tick) -{ - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ - - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); - -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; - } -/* - * Go! 
- */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static cycle_t read_hpet(void) -{ - return (cycle_t)hpet_readl(HPET_COUNTER); -} - -static cycle_t __vsyscall_fn vread_hpet(void) -{ - return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); -} - -struct clocksource clocksource_hpet = { - .name = "hpet", - .rating = 250, - .read = read_hpet, - .mask = (cycle_t)HPET_MASK, - .mult = 0, /* set below */ - .shift = HPET_SHIFT, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .vread = vread_hpet, -}; - -int hpet_arch_init(void) -{ - unsigned int id; - u64 tmp; - - if (!hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, hpet_address); - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. - */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); - - /* - * hpet period is in femto seconds per cycle - * so we need to convert this to ns/cyc units - * aproximated by mult/2^shift - * - * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift - * fsec/cyc * 1ns/1000000fsec * 2^shift = mult - * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult - * (fsec/cyc << shift)/1000000 = mult - * (hpet_period << shift)/FSEC_PER_NSEC = mult - */ - tmp = (u64)hpet_period << HPET_SHIFT; - do_div(tmp, FSEC_PER_NSEC); - clocksource_hpet.mult = (u32)tmp; - clocksource_register(&clocksource_hpet); - - return hpet_timer_stop_set_go(hpet_tick); -} - -int hpet_reenable(void) -{ - return hpet_timer_stop_set_go(hpet_tick); -} - -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. 
- * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. - */ -#include - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. 
- */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - local_irq_restore(flags); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt, ticks_per_int, lost_ints; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; - hpet_t1_cmp += ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - /* - * If the interrupt handler was delayed too long, the write above tries - * to schedule the next interrupt in the past and the hardware would - * not interrupt until the counter had wrapped around. - * So we have to check that the comparator wasn't set to a past time. - */ - cnt = hpet_readl(HPET_COUNTER); - if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { - lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; - /* Make sure that, even with the time needed to execute - * this code, the next scheduled interrupt has been moved - * back to the future: */ - lost_ints++; - - hpet_t1_cmp += lost_ints * ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - if (PIE_on) - PIE_count += lost_ints; - - if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); - } -} - -/* - * The functions below are called from rtc driver. 
- * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. - */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - 
(curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); Index: linux-rt.q/arch/x86_64/kernel/time.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/time.c +++ linux-rt.q/arch/x86_64/kernel/time.c @@ -149,48 +149,6 @@ int update_persistent_clock(struct times return set_rtc_mmss(now.tv_sec); } -void main_timer_handler(void) -{ -/* - * Here we are in the timer irq handler. We have irqs locally disabled (so we - * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running - * on the other CPU, so we need a lock. We also need to lock the vsyscall - * variables, because both do_timer() and us change them -arca+vojtech - */ - - write_seqlock(&xtime_lock); - -/* - * Do the timer stuff. - */ - - do_timer(1); -#ifndef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. 
- */ - - if (!using_apic_timer) - smp_local_timer_interrupt(); - - write_sequnlock(&xtime_lock); -} - -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - if (apic_runs_main_timer > 1) - return IRQ_HANDLED; - main_timer_handler(); - if (using_apic_timer) - smp_send_timer_broadcast_ipi(); - return IRQ_HANDLED; -} - static irqreturn_t timer_event_interrupt(int irq, void *dev_id) { global_clock_event->event_handler(global_clock_event); Index: linux-rt.q/include/asm-x86_64/apic.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/apic.h +++ linux-rt.q/include/asm-x86_64/apic.h @@ -79,8 +79,6 @@ extern void smp_local_timer_interrupt (v extern void setup_boot_APIC_clock (void); extern void setup_secondary_APIC_clock (void); extern int APIC_init_uniprocessor (void); -extern void disable_APIC_timer(void); -extern void enable_APIC_timer(void); extern void setup_apic_routing(void); extern void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, @@ -95,10 +93,6 @@ extern int apic_is_clustered_box(void); #define K8_APIC_EXT_INT_MSG_EXT 0x7 #define K8_APIC_EXT_LVT_ENTRY_THRESHOLD 0 -void smp_send_timer_broadcast_ipi(void); -void switch_APIC_timer_to_ipi(void *cpumask); -void switch_ipi_to_APIC_timer(void *cpumask); - #define ARCH_APICTIMER_STOPS_ON_C3 1 extern unsigned boot_cpu_id; Index: linux-rt.q/include/asm-x86_64/proto.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/proto.h +++ linux-rt.q/include/asm-x86_64/proto.h @@ -51,9 +51,6 @@ extern void reserve_bootmem_generic(unsi extern void load_gs_index(unsigned gs); -extern void stop_timer_interrupt(void); -extern void main_timer_handler(void); - extern unsigned long end_pfn_map; extern void show_trace(struct task_struct *, struct pt_regs *, unsigned long * rsp); @@ -110,14 +107,10 @@ extern int timer_over_8254; extern int gsi_irq_sharing(int gsi); -extern void 
smp_local_timer_interrupt(void); - extern int force_mwait; long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); -void i8254_timer_resume(void); - #define round_up(x,y) (((x) + (y) - 1) & ~((y)-1)) #define round_down(x,y) ((x) & ~((y)-1)) patches/futex-performance-hack-sysctl-fix.patch0000664000077200007720000000551210646635216021217 0ustar mingomingoFrom lethal@linux-sh.org Fri May 18 06:46:43 2007 Return-Path: Received: from smtp.ocgnet.org (smtp.ocgnet.org [64.20.243.3]) by mail.tglx.de (Postfix) with ESMTP id 0FCC865C065 for ; Fri, 18 May 2007 06:46:43 +0200 (CEST) Received: from smtp.ocgnet.org (localhost [127.0.0.1]) by smtp.ocgnet.org (Postfix) with ESMTP id 616355203FB; Thu, 17 May 2007 23:46:39 -0500 (CDT) X-Spam-Checker-Version: SpamAssassin 3.1.3-gr0 (2006-06-01) on smtp.ocgnet.org X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=none autolearn=no version=3.1.3-gr0 Received: from master.linux-sh.org (124x34x33x190.ap124.ftth.ucom.ne.jp [124.34.33.190]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (No client certificate requested) by smtp.ocgnet.org (Postfix) with ESMTP id E1F585203E0; Thu, 17 May 2007 23:46:38 -0500 (CDT) Received: from localhost (unknown [127.0.0.1]) by master.linux-sh.org (Postfix) with ESMTP id 4984664C7C; Fri, 18 May 2007 04:46:00 +0000 (UTC) X-Virus-Scanned: amavisd-new at linux-sh.org Received: from master.linux-sh.org ([127.0.0.1]) by localhost (master.linux-sh.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id BE+H5LV2TYuQ; Fri, 18 May 2007 13:46:00 +0900 (JST) Received: by master.linux-sh.org (Postfix, from userid 500) id 08A5664C7D; Fri, 18 May 2007 13:46:00 +0900 (JST) Date: Fri, 18 May 2007 13:45:59 +0900 From: Paul Mundt To: Ingo Molnar , Thomas Gleixner Cc: linux-kernel@vger.kernel.org Subject: [PATCH -rt] futex_performance_hack sysctl build fix Message-ID: <20070518044559.GB22660@linux-sh.org> Mail-Followup-To: Paul Mundt , Ingo Molnar , Thomas Gleixner , 
linux-kernel@vger.kernel.org MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.5.13 (2006-08-11) X-Virus-Scanned: ClamAV using ClamSMTP X-Evolution-Source: imap://tglx%40linutronix.de@localhost:8993/ Content-Transfer-Encoding: 8bit -rt adds a futex_performance_hack sysctl, which is only defined if kernel/futex.c is built in. This fixes the build in the CONFIG_FUTEX=n case. Signed-off-by: Paul Mundt -- kernel/sysctl.c | 2 ++ 1 file changed, 2 insertions(+) Index: linux-rt.q/kernel/sysctl.c =================================================================== --- linux-rt.q.orig/kernel/sysctl.c +++ linux-rt.q/kernel/sysctl.c @@ -294,6 +294,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FUTEX { .ctl_name = CTL_UNNUMBERED, .procname = "futex_performance_hack", @@ -302,6 +303,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif { .ctl_name = KERN_PANIC, .procname = "prof_pid", patches/paravirt-function-pointer-fix.patch0000664000077200007720000000162110646635217020471 0ustar mingomingo--- arch/i386/kernel/paravirt.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) Index: linux-rt.q/arch/i386/kernel/paravirt.c =================================================================== --- linux-rt.q.orig/arch/i386/kernel/paravirt.c +++ linux-rt.q/arch/i386/kernel/paravirt.c @@ -228,6 +228,16 @@ static int __init print_banner(void) } core_initcall(print_banner); +#ifdef CONFIG_HIGHPTE +/* + * kmap_atomic() might be an inline or a macro: + */ +static void *kmap_atomic_func(struct page *page, enum km_type idx) +{ + return kmap_atomic(page, idx); +} +#endif + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, @@ -316,7 +326,7 @@ struct paravirt_ops paravirt_ops = { .pte_update_defer = paravirt_nop, #ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, + .kmap_atomic_pte = 
kmap_atomic_func, #endif #ifdef CONFIG_X86_PAE patches/mm-speculative-get-page.patch0000664000077200007720000002536010646635216017203 0ustar mingomingoFrom: Nick Piggin Subject: [patch 3/4] mm: speculative get page If we can be sure that elevating the page_count on a pagecache page will pin it, we can speculatively run this operation, and subsequently check to see if we hit the right page rather than relying on holding a lock or otherwise pinning a reference to the page. This can be done if get_page/put_page behaves consistently throughout the whole tree (ie. if we "get" the page after it has been used for something else, we must be able to free it with a put_page). Actually, there is a period where the count behaves differently: when the page is free or if it is a constituent page of a compound page. We need an atomic_inc_not_zero operation to ensure we don't try to grab the page in either case. This patch introduces the core locking protocol to the pagecache (ie. adds page_cache_get_speculative, and tweaks some update-side code to make it work). [Hugh notices that PG_nonewrefs might be dispensed with entirely if current SetPageNoNewRefs instead atomically save the page count and temporarily set it to zero. This is a nice idea, and simplifies find_get_page very much, but cannot be applied to all current SetPageNoNewRefs sites. Need to verify that add_to_page_cache and add_to_swap_cache can cope without it or make do some other way. In the meantime, this version is a slightly more mechanical replacement.] 
Signed-off-by: Nick Piggin --- include/linux/page-flags.h | 8 +++ include/linux/pagemap.h | 105 +++++++++++++++++++++++++++++++++++++++++++++ mm/filemap.c | 4 + mm/migrate.c | 9 +++ mm/swap_state.c | 4 + mm/vmscan.c | 12 +++-- 6 files changed, 136 insertions(+), 6 deletions(-) Index: linux-rt.q/include/linux/page-flags.h =================================================================== --- linux-rt.q.orig/include/linux/page-flags.h +++ linux-rt.q/include/linux/page-flags.h @@ -92,7 +92,8 @@ /* PG_owner_priv_1 users should have descriptive aliases */ #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ - +#define PG_nonewrefs 20 /* Block concurrent pagecache lookups + * while testing refcount */ #if (BITS_PER_LONG > 32) /* * 64-bit-only flags build down from bit 31 @@ -270,6 +271,11 @@ static inline void __ClearPageTail(struc #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define PageNoNewRefs(page) test_bit(PG_nonewrefs, &(page)->flags) +#define SetPageNoNewRefs(page) set_bit(PG_nonewrefs, &(page)->flags) +#define ClearPageNoNewRefs(page) clear_bit(PG_nonewrefs, &(page)->flags) +#define __ClearPageNoNewRefs(page) __clear_bit(PG_nonewrefs, &(page)->flags) + struct page; /* forward declaration */ extern void cancel_dirty_page(struct page *page, unsigned int account_size); Index: linux-rt.q/include/linux/pagemap.h =================================================================== --- linux-rt.q.orig/include/linux/pagemap.h +++ linux-rt.q/include/linux/pagemap.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include /* for in_interrupt() */ /* * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page @@ -62,6 +64,109 @@ static inline void mapping_set_gfp_mask( #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); +/* + * speculatively take a reference to a page. 
+ * If the page is free (_count == 0), then _count is untouched, and 0 + * is returned. Otherwise, _count is incremented by 1 and 1 is returned. + * + * This function must be run in the same rcu_read_lock() section as has + * been used to lookup the page in the pagecache radix-tree: this allows + * allocators to use a synchronize_rcu() to stabilize _count. + * + * Unless an RCU grace period has passed, the count of all pages coming out + * of the allocator must be considered unstable. page_count may return higher + * than expected, and put_page must be able to do the right thing when the + * page has been finished with (because put_page is what is used to drop an + * invalid speculative reference). + * + * After incrementing the refcount, this function spins until PageNoNewRefs + * is clear, then a read memory barrier is issued. + * + * This forms the core of the lockless pagecache locking protocol, where + * the lookup-side (eg. find_get_page) has the following pattern: + * 1. find page in radix tree + * 2. conditionally increment refcount + * 3. wait for PageNoNewRefs + * 4. check the page is still in pagecache + * + * Remove-side (that cares about _count, eg. reclaim) has the following: + * A. SetPageNoNewRefs + * B. check refcount is correct + * C. remove page + * D. ClearPageNoNewRefs + * + * There are 2 critical interleavings that matter: + * - 2 runs before B: in this case, B sees elevated refcount and bails out + * - B runs before 2: in this case, 3 ensures 4 will not run until *after* C + * (after D, even). In which case, 4 will notice C and lookup side can retry + * + * It is possible that between 1 and 2, the page is removed then the exact same + * page is inserted into the same position in pagecache. That's OK: the + * old find_get_page using tree_lock could equally have run before or after + * the write-side, depending on timing. + * + * Pagecache insertion isn't a big problem: either 1 will find the page or + * it will not. 
Likewise, the old find_get_page could run either before the + * insertion or afterwards, depending on timing. + */ +static inline int page_cache_get_speculative(struct page *page) +{ + VM_BUG_ON(in_interrupt()); + +#ifndef CONFIG_SMP +# ifdef CONFIG_PREEMPT + VM_BUG_ON(!in_atomic()); +# endif + /* + * Preempt must be disabled here - we rely on rcu_read_lock doing + * this for us. + * + * Pagecache won't be truncated from interrupt context, so if we have + * found a page in the radix tree here, we have pinned its refcount by + * disabling preempt, and hence no need for the "speculative get" that + * SMP requires. + */ + VM_BUG_ON(page_count(page) == 0); + atomic_inc(&page->_count); + +#else + if (unlikely(!get_page_unless_zero(page))) + return 0; /* page has been freed */ + + /* + * Note that get_page_unless_zero provides a memory barrier. + * This is needed to ensure PageNoNewRefs is evaluated after the + * page refcount has been raised. See below comment. + */ + + while (unlikely(PageNoNewRefs(page))) + cpu_relax(); + + /* + * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs()) + * is performed before a future load used to ensure the page is + * the correct on (usually: page->mapping and page->index). + * + * Those places that set PageNoNewRefs have the following pattern: + * SetPageNoNewRefs(page) + * wmb(); + * if (page_count(page) == X) + * remove page from pagecache + * wmb(); + * ClearPageNoNewRefs(page) + * + * If the load was out of order, page->mapping might be loaded before + * the page is removed from pagecache but PageNoNewRefs evaluated + * after the ClearPageNoNewRefs(). 
+ */ + smp_rmb(); + +#endif + VM_BUG_ON(PageCompound(page) && (struct page *)page_private(page) != page); + + return 1; +} + #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else Index: linux-rt.q/mm/filemap.c =================================================================== --- linux-rt.q.orig/mm/filemap.c +++ linux-rt.q/mm/filemap.c @@ -440,6 +440,8 @@ int add_to_page_cache(struct page *page, int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { + SetPageNoNewRefs(page); + smp_wmb(); write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { @@ -451,6 +453,8 @@ int add_to_page_cache(struct page *page, __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&mapping->tree_lock); + smp_wmb(); + ClearPageNoNewRefs(page); radix_tree_preload_end(); } return error; Index: linux-rt.q/mm/migrate.c =================================================================== --- linux-rt.q.orig/mm/migrate.c +++ linux-rt.q/mm/migrate.c @@ -303,6 +303,8 @@ static int migrate_page_move_mapping(str return 0; } + SetPageNoNewRefs(page); + smp_wmb(); write_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -311,6 +313,7 @@ static int migrate_page_move_mapping(str if (page_count(page) != 2 + !!PagePrivate(page) || (struct page *)radix_tree_deref_slot(pslot) != page) { write_unlock_irq(&mapping->tree_lock); + ClearPageNoNewRefs(page); return -EAGAIN; } @@ -326,6 +329,10 @@ static int migrate_page_move_mapping(str #endif radix_tree_replace_slot(pslot, newpage); + page->mapping = NULL; + write_unlock_irq(&mapping->tree_lock); + smp_wmb(); + ClearPageNoNewRefs(page); /* * Drop cache reference from old page. 
@@ -346,8 +353,6 @@ static int migrate_page_move_mapping(str __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - write_unlock_irq(&mapping->tree_lock); - return 0; } Index: linux-rt.q/mm/swap_state.c =================================================================== --- linux-rt.q.orig/mm/swap_state.c +++ linux-rt.q/mm/swap_state.c @@ -78,6 +78,8 @@ static int __add_to_swap_cache(struct pa BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); if (!error) { + SetPageNoNewRefs(page); + smp_wmb(); write_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); @@ -90,6 +92,8 @@ static int __add_to_swap_cache(struct pa __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&swapper_space.tree_lock); + smp_wmb(); + ClearPageNoNewRefs(page); radix_tree_preload_end(); } return error; Index: linux-rt.q/mm/vmscan.c =================================================================== --- linux-rt.q.orig/mm/vmscan.c +++ linux-rt.q/mm/vmscan.c @@ -387,6 +387,8 @@ int remove_mapping(struct address_space BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); + SetPageNoNewRefs(page); + smp_wmb(); write_lock_irq(&mapping->tree_lock); /* * The non racy check for a busy page. 
@@ -424,17 +426,21 @@ int remove_mapping(struct address_space __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + goto free_it; } __remove_from_page_cache(page); write_unlock_irq(&mapping->tree_lock); - __put_page(page); + +free_it: + smp_wmb(); + __ClearPageNoNewRefs(page); + __put_page(page); /* The pagecache ref */ return 1; cannot_free: write_unlock_irq(&mapping->tree_lock); + ClearPageNoNewRefs(page); return 0; } patches/nmi-watchdog-disable.patch0000664000077200007720000000674410646635216016550 0ustar mingomingoSubject: [patch] x86_64: do not enable the NMI watchdog by default From: Ingo Molnar do not enable the NMI watchdog by default. Now that we have lockdep i cannot remember the last time it caught a real bug, but the NMI watchdog can /cause/ problems. Furthermore, to the typical user, an NMI watchdog assert results in a total lockup anyway (if under X). In that sense, all that the NMI watchdog does is that it makes the system /less/ stable and /less/ debuggable. people can still enable it either after bootup via: echo 1 > /proc/sys/kernel/nmi or via the nmi_watchdog=1 or nmi_watchdog=2 boot options. build and boot tested on an Athlon64 box. 
Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 1 - arch/x86_64/kernel/io_apic.c | 2 -- arch/x86_64/kernel/nmi.c | 2 +- arch/x86_64/kernel/smpboot.c | 1 - include/asm-x86_64/nmi.h | 1 - 5 files changed, 1 insertion(+), 6 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -517,7 +517,6 @@ void __cpuinit setup_local_APIC (void) oldvalue, value); } - nmi_watchdog_default(); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } Index: linux-rt.q/arch/x86_64/kernel/io_apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/io_apic.c +++ linux-rt.q/arch/x86_64/kernel/io_apic.c @@ -1666,7 +1666,6 @@ static inline void check_timer(void) */ unmask_IO_APIC_irq(0); if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); setup_nmi(); @@ -1692,7 +1691,6 @@ static inline void check_timer(void) setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } Index: linux-rt.q/arch/x86_64/kernel/nmi.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/nmi.c +++ linux-rt.q/arch/x86_64/kernel/nmi.c @@ -54,7 +54,7 @@ static DEFINE_PER_CPU(short, wd_enabled) static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) +static inline void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; Index: linux-rt.q/arch/x86_64/kernel/smpboot.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/smpboot.c +++ 
linux-rt.q/arch/x86_64/kernel/smpboot.c @@ -850,7 +850,6 @@ static int __init smp_sanity_check(unsig */ void __init smp_prepare_cpus(unsigned int max_cpus) { - nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ set_cpu_sibling_map(0); Index: linux-rt.q/include/asm-x86_64/nmi.h =================================================================== --- linux-rt.q.orig/include/asm-x86_64/nmi.h +++ linux-rt.q/include/asm-x86_64/nmi.h @@ -59,7 +59,6 @@ extern void disable_timer_nmi_watchdog(v extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); -extern void nmi_watchdog_default(void); extern int setup_nmi_watchdog(char *); extern atomic_t nmi_active; patches/x86_64-apic-whitespace-comment-and-remove-unused-code.patch0000664000077200007720000001201610646635210024453 0ustar mingomingoSubject: x86_64: apic.c coding style janitor work Fix coding style, white space wreckage and remove unused code. Signed-off-by: Thomas Gleixner Signed-off-by: Chris Wright Signed-off-by: Ingo Molnar --- arch/x86_64/kernel/apic.c | 73 ++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 43 deletions(-) Index: linux-rt.q/arch/x86_64/kernel/apic.c =================================================================== --- linux-rt.q.orig/arch/x86_64/kernel/apic.c +++ linux-rt.q/arch/x86_64/kernel/apic.c @@ -92,8 +92,9 @@ unsigned int safe_apic_wait_icr_idle(voi void enable_NMI_through_LVT0 (void * dummy) { unsigned int v; - - v = APIC_DM_NMI; /* unmask and set to NMI */ + + /* unmask and set to NMI */ + v = APIC_DM_NMI; apic_write(APIC_LVT0, v); } @@ -120,7 +121,7 @@ void ack_bad_irq(unsigned int irq) * holds up an irq slot - in excessive cases (when multiple * unexpected vectors occur) that might lock up the APIC * completely. - * But don't ack when the APIC is disabled. -AK + * But don't ack when the APIC is disabled. 
-AK */ if (!disable_apic) ack_APIC_irq(); @@ -616,7 +617,7 @@ early_param("apic", apic_set_verbosity); * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) + * not correctly set up (usually the APIC timer won't work etc.) */ static int __init detect_init_APIC (void) @@ -789,13 +790,13 @@ static void setup_APIC_timer(unsigned in local_irq_save(flags); /* wait for irq slice */ - if (hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { + if (hpet_address && hpet_use_timer) { + int trigger = hpet_readl(HPET_T0_CMP); + while (hpet_readl(HPET_COUNTER) >= trigger) + /* do nothing */ ; + while (hpet_readl(HPET_COUNTER) < trigger) + /* do nothing */ ; + } else { int c1, c2; outb_p(0x00, 0x43); c2 = inb_p(0x40); @@ -881,10 +882,10 @@ static unsigned int calibration_result; void __init setup_boot_APIC_clock (void) { - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - return; - } + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + return; + } printk(KERN_INFO "Using local APIC timer interrupts.\n"); using_apic_timer = 1; @@ -1128,20 +1129,6 @@ asmlinkage void smp_spurious_interrupt(v if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#if 0 - static unsigned long last_warning; - static unsigned long skipped; - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - if (time_before(last_warning+30*HZ,jiffies)) { - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", - smp_processor_id(), skipped); - last_warning = jiffies; - skipped = 0; - } else { - skipped++; - } -#endif irq_exit(); } @@ -1173,11 +1160,11 @@ asmlinkage void smp_error_interrupt(void 7: Illegal register address */ 
printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } -int disable_apic; +int disable_apic; /* * This initializes the IO-APIC and APIC hardware if this is @@ -1185,11 +1172,11 @@ int disable_apic; */ int __init APIC_init_uniprocessor (void) { - if (disable_apic) { + if (disable_apic) { printk(KERN_INFO "Apic disabled\n"); - return -1; + return -1; } - if (!cpu_has_apic) { + if (!cpu_has_apic) { disable_apic = 1; printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; @@ -1211,8 +1198,8 @@ int __init APIC_init_uniprocessor (void) return 0; } -static __init int setup_disableapic(char *str) -{ +static __init int setup_disableapic(char *str) +{ disable_apic = 1; clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); return 0; @@ -1220,10 +1207,10 @@ static __init int setup_disableapic(char early_param("disableapic", setup_disableapic); /* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) -{ +static __init int setup_nolapic(char *str) +{ return setup_disableapic(str); -} +} early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) @@ -1233,13 +1220,13 @@ static int __init parse_lapic_timer_c2_o } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static __init int setup_noapictimer(char *str) -{ +static __init int setup_noapictimer(char *str) +{ if (str[0] != ' ' && str[0] != 0) return 0; disable_apic_timer = 1; return 1; -} +} static __init int setup_apicmaintimer(char *str) { @@ -1264,5 +1251,5 @@ static __init int setup_apicpmtimer(char } __setup("apicpmtimer", setup_apicpmtimer); -__setup("noapictimer", setup_noapictimer); +__setup("noapictimer", setup_noapictimer); patches/s_files-proc-generic-fix.patch0000664000077200007720000000140510646635216017336 0ustar mingomingo--- fs/proc/generic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Index: linux-rt.q/fs/proc/generic.c 
=================================================================== --- linux-rt.q.orig/fs/proc/generic.c +++ linux-rt.q/fs/proc/generic.c @@ -564,7 +564,6 @@ static void proc_kill_inodes(struct proc /* * Actually it's a partial revoke(). */ - filevec_add_drain_all(); lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) { struct dentry * dentry = filp->f_path.dentry; struct inode * inode; @@ -725,6 +724,8 @@ void remove_proc_entry(const char *name, goto out; len = strlen(fn); + filevec_add_drain_all(); + spin_lock(&proc_subdir_lock); for (p = &parent->subdir; *p; p=&(*p)->next ) { if (!proc_match(len, fn, *p)) patches/rcu-hrt-fixups.patch0000664000077200007720000001076310646635213015457 0ustar mingomingo include/linux/rcuclassic.h | 3 +++ include/linux/rcupdate.h | 1 + include/linux/rcupreempt.h | 3 +++ kernel/rcuclassic.c | 19 ++++++++++++++++--- kernel/rcupreempt.c | 22 +++++++++++++++++++++- 5 files changed, 44 insertions(+), 4 deletions(-) Index: linux-rt.q/include/linux/rcuclassic.h =================================================================== --- linux-rt.q.orig/include/linux/rcuclassic.h +++ linux-rt.q/include/linux/rcuclassic.h @@ -144,5 +144,8 @@ extern void rcu_check_callbacks(int cpu, extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); +struct softirq_action; +extern void rcu_process_callbacks(struct softirq_action *unused); + #endif /* __KERNEL__ */ #endif /* __LINUX_RCUCLASSIC_H */ Index: linux-rt.q/include/linux/rcupdate.h =================================================================== --- linux-rt.q.orig/include/linux/rcupdate.h +++ linux-rt.q/include/linux/rcupdate.h @@ -225,6 +225,7 @@ extern void rcu_barrier(void); /* Internal to kernel */ extern void rcu_init(void); +extern void rcu_advance_callbacks(int cpu, int user); extern void rcu_check_callbacks(int cpu, int user); #endif /* __KERNEL__ */ Index: linux-rt.q/include/linux/rcupreempt.h 
=================================================================== --- linux-rt.q.orig/include/linux/rcupreempt.h +++ linux-rt.q/include/linux/rcupreempt.h @@ -49,6 +49,7 @@ extern void __rcu_read_lock(void); extern void __rcu_read_unlock(void); extern int rcu_pending(int cpu); +extern int rcu_needs_cpu(int cpu); #define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); } #define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); } @@ -62,5 +63,7 @@ extern void rcu_check_callbacks(int cpu, extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); +extern void rcu_process_callbacks(unsigned long unused); + #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPREEMPT_H */ Index: linux-rt.q/kernel/rcuclassic.c =================================================================== --- linux-rt.q.orig/kernel/rcuclassic.c +++ linux-rt.q/kernel/rcuclassic.c @@ -382,6 +382,8 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { *rdp->donetail = rdp->curlist; rdp->donetail = rdp->curtail; @@ -390,12 +392,12 @@ static void __rcu_process_callbacks(stru } if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); + local_irq_save(flags); rdp->curlist = rdp->nxtlist; rdp->curtail = rdp->nxttail; rdp->nxtlist = NULL; rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); + local_irq_restore(flags); /* * start the next batch of callbacks @@ -422,7 +424,7 @@ static void __rcu_process_callbacks(stru rcu_do_batch(rdp); } -static void rcu_process_callbacks(struct softirq_action *unused) +void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); @@ -477,6 +479,17 @@ int rcu_needs_cpu(int cpu) return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); 
} +void rcu_advance_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); +} + void rcu_check_callbacks(int cpu, int user) { if (user || Index: linux-rt.q/kernel/rcupreempt.c =================================================================== --- linux-rt.q.orig/kernel/rcupreempt.c +++ linux-rt.q/kernel/rcupreempt.c @@ -259,7 +259,27 @@ void rcu_check_callbacks(int cpu, int us } } -static void rcu_process_callbacks(unsigned long data) +/* + * Needed by dynticks, to make sure all RCU processing has finished + * when we go idle: + */ +void rcu_advance_callbacks(int cpu, int user) +{ + unsigned long oldirq; + + if (rcu_ctrlblk.completed == rcu_data.completed) { + rcu_try_flip(); + if (rcu_ctrlblk.completed == rcu_data.completed) { + return; + } + } + spin_lock_irqsave(&rcu_data.lock, oldirq); + RCU_TRACE(rcupreempt_trace_check_callbacks, &rcu_data.trace); + __rcu_advance_callbacks(); + spin_unlock_irqrestore(&rcu_data.lock, oldirq); +} + +void rcu_process_callbacks(unsigned long unused) { unsigned long flags; struct rcu_head *next, *list; patches/trace-name-plus.patch0000664000077200007720000000643110646635212015550 0ustar mingomingo--- kernel/latency_trace.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) Index: linux-rt.q/kernel/latency_trace.c =================================================================== --- linux-rt.q.orig/kernel/latency_trace.c +++ linux-rt.q/kernel/latency_trace.c @@ -868,29 +868,33 @@ static void notrace print_name(struct se * Special trace values: */ if (((long)eip < 10000L) && ((long)eip > -10000L)) { - seq_printf(m, "(%5ld)", eip); + seq_printf(m, "<%ld>", eip); return; } sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); if (sym_name) - seq_puts(m, sym_name); + seq_printf(m, "%s+%#lx/%#lx", + sym_name, 
offset, size); else seq_printf(m, "<%08lx>", eip); } -static void notrace print_name_offset(struct seq_file *m, unsigned long eip) +static void notrace print_name_eip(struct seq_file *m, unsigned long eip) { char namebuf[KSYM_NAME_LEN+1]; unsigned long size, offset; const char *sym_name; char *modname; - sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); - if (sym_name) - seq_printf(m, "%s+%#lx/%#lx <%08lx>", - sym_name, offset, size, eip); - else - seq_printf(m, "<%08lx>", eip); + if (eip) { + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s+%#lx/%#lx <%08lx>", + sym_name, offset, size, eip); + else + seq_printf(m, "<%08lx>", eip); + } else + seq_printf(m, "0"); } static unsigned long out_sequence = -1; @@ -1255,9 +1259,9 @@ static void * notrace l_start(struct seq seq_puts(m, " -----------------\n"); if (trace_user_triggered) { seq_puts(m, " => started at: "); - print_name_offset(m, tr->critical_start); + print_name_eip(m, tr->critical_start); seq_puts(m, "\n => ended at: "); - print_name_offset(m, tr->critical_end); + print_name_eip(m, tr->critical_end); seq_puts(m, "\n"); } seq_puts(m, "\n"); @@ -1367,9 +1371,9 @@ static int notrace l_show_fn(struct seq_ entry->preempt_count, trace_idx, entry->timestamp, abs_usecs/1000, abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); - print_name_offset(m, entry->u.fn.eip); + print_name_eip(m, entry->u.fn.eip); seq_puts(m, " ("); - print_name_offset(m, entry->u.fn.parent_eip); + print_name_eip(m, entry->u.fn.parent_eip); seq_puts(m, ")\n"); } else { print_generic(m, entry); @@ -1394,7 +1398,7 @@ static int notrace l_show_special(struct print_generic(m, entry); print_timestamp(m, abs_usecs, rel_usecs); if (trace_verbose) - print_name_offset(m, entry->u.special.eip); + print_name_eip(m, entry->u.special.eip); else print_name(m, entry->u.special.eip); @@ -1438,7 +1442,7 @@ l_show_special_pid(struct seq_file *m, u print_generic(m, entry); 
print_timestamp(m, abs_usecs, rel_usecs); if (trace_verbose) - print_name_offset(m, entry->u.special.eip); + print_name_eip(m, entry->u.special.eip); else print_name(m, entry->u.special.eip); seq_printf(m, " <%.8s-%d> (%ld %ld)\n", @@ -1461,7 +1465,7 @@ l_show_special_sym(struct seq_file *m, u print_generic(m, entry); print_timestamp(m, abs_usecs, rel_usecs); if (trace_verbose) - print_name_offset(m, entry->u.special.eip); + print_name_eip(m, entry->u.special.eip); else print_name(m, entry->u.special.eip); patches/mm-lockless-pagecache-lookups.patch0000664000077200007720000001650510646635216020400 0ustar mingomingoFrom: Nick Piggin Subject: [patch 4/4] mm: lockless pagecache lookups Combine page_cache_get_speculative with lockless radix tree lookups to introduce lockless page cache lookups (ie. no mapping->tree_lock on the read-side). The only atomicity changes this introduces is that the gang pagecache lookup functions now behave as if they are implemented with multiple find_get_page calls, rather than operating on a snapshot of the pages. In practice, this atomicity guarantee is not used anyway, and it is difficult to see how it could be. Gang pagecache lookups are designed to replace individual lookups, so these semantics are natural. Signed-off-by: Nick Piggin --- mm/filemap.c | 133 +++++++++++++++++++++++++++++++++++++--------------- mm/page-writeback.c | 8 +-- mm/readahead.c | 6 -- 3 files changed, 102 insertions(+), 45 deletions(-) Index: linux-rt.q/mm/filemap.c =================================================================== --- linux-rt.q.orig/mm/filemap.c +++ linux-rt.q/mm/filemap.c @@ -596,15 +596,31 @@ void fastcall __lock_page_nosync(struct * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. 
*/ -struct page * find_get_page(struct address_space *mapping, unsigned long offset) +struct page *find_get_page(struct address_space *mapping, unsigned long offset) { + void **pagep; struct page *page; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page || page == RADIX_TREE_RETRY)) + goto repeat; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_get_page); @@ -624,26 +640,19 @@ struct page *find_lock_page(struct addre { struct page *page; - read_lock_irq(&mapping->tree_lock); repeat: - page = radix_tree_lookup(&mapping->page_tree, offset); + page = find_get_page(mapping, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - read_lock_irq(&mapping->tree_lock); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping || - page->index != offset)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } + lock_page(page); + /* Has the page been truncated? 
*/ + if (unlikely(page->mapping != mapping + || page->index != offset)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } } - read_unlock_irq(&mapping->tree_lock); + return page; } EXPORT_SYMBOL(find_lock_page); @@ -714,13 +723,39 @@ unsigned find_get_pages(struct address_s { unsigned int i; unsigned int ret; + unsigned int nr_found; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); return ret; } @@ -741,19 +776,44 @@ unsigned find_get_pages_contig(struct ad { unsigned int i; unsigned int ret; + unsigned int nr_found; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. 
+ */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (page->mapping == NULL || page->index != index) break; - page_cache_get(pages[i]); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; index++; } - read_unlock_irq(&mapping->tree_lock); - return i; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(find_get_pages_contig); @@ -775,6 +835,7 @@ unsigned find_get_pages_tag(struct addre unsigned int ret; read_lock_irq(&mapping->tree_lock); + /* TODO: implement lookup_tag_slot and make this lockless */ ret = radix_tree_gang_lookup_tag(&mapping->page_tree, (void **)pages, *index, nr_pages, tag); for (i = 0; i < ret; i++) Index: linux-rt.q/mm/page-writeback.c =================================================================== --- linux-rt.q.orig/mm/page-writeback.c +++ linux-rt.q/mm/page-writeback.c @@ -1011,17 +1011,15 @@ int test_set_page_writeback(struct page EXPORT_SYMBOL(test_set_page_writeback); /* - * Return true if any of the pages in the mapping are marged with the + * Return true if any of the pages in the mapping are marked with the * passed tag. */ int mapping_tagged(struct address_space *mapping, int tag) { - unsigned long flags; int ret; - - read_lock_irqsave(&mapping->tree_lock, flags); + rcu_read_lock(); ret = radix_tree_tagged(&mapping->page_tree, tag); - read_unlock_irqrestore(&mapping->tree_lock, flags); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL(mapping_tagged); Index: linux-rt.q/mm/readahead.c =================================================================== --- linux-rt.q.orig/mm/readahead.c +++ linux-rt.q/mm/readahead.c @@ -283,27 +283,25 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. 
*/ - read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { pgoff_t page_offset = offset + page_idx; if (page_offset > end_index) break; + rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); + rcu_read_unlock(); if (page) continue; - read_unlock_irq(&mapping->tree_lock); page = page_cache_alloc_cold(mapping); - read_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; list_add(&page->lru, &page_pool); ret++; } - read_unlock_irq(&mapping->tree_lock); /* * Now start the IO. We ignore I/O errors - if the page is not patches/preempt-realtime-rawlocks.patch0000664000077200007720000001066010646635215017654 0ustar mingomingo--- drivers/oprofile/oprofilefs.c | 2 +- drivers/pci/access.c | 2 +- drivers/video/console/vgacon.c | 2 +- include/linux/kprobes.h | 2 +- include/linux/oprofile.h | 2 +- include/linux/percpu_counter.h | 2 +- kernel/kprobes.c | 2 +- kernel/module.c | 2 +- kernel/softlockup.c | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) Index: linux-rt.q/drivers/oprofile/oprofilefs.c =================================================================== --- linux-rt.q.orig/drivers/oprofile/oprofilefs.c +++ linux-rt.q/drivers/oprofile/oprofilefs.c @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) { Index: linux-rt.q/drivers/pci/access.c =================================================================== --- linux-rt.q.orig/drivers/pci/access.c +++ linux-rt.q/drivers/pci/access.c @@ -11,7 +11,7 @@ * configuration space. */ -static DEFINE_SPINLOCK(pci_lock); +static DEFINE_RAW_SPINLOCK(pci_lock); /* * Wrappers for all PCI configuration access functions. 
They just check Index: linux-rt.q/drivers/video/console/vgacon.c =================================================================== --- linux-rt.q.orig/drivers/video/console/vgacon.c +++ linux-rt.q/drivers/video/console/vgacon.c @@ -51,7 +51,7 @@ #include