Index: linux-2.6.25.4-rt4/arch/x86/kernel/tsc_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/tsc_64.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/tsc_64.c	2008-05-29 09:46:07.000000000 -0400
@@ -66,6 +66,7 @@ static void set_cyc2ns_scale(unsigned lo
 unsigned long long native_sched_clock(void)
 {
 	unsigned long a = 0;
+	unsigned long long ret;
 
 	/* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
 	 * which means it is not completely exact and may not be monotonous
@@ -73,8 +74,11 @@ unsigned long long native_sched_clock(vo
 	 * scheduling purposes.
 	 */
 
+	preempt_disable_notrace();
 	rdtscll(a);
-	return cycles_2_ns(a);
+	ret = cycles_2_ns(a);
+	preempt_enable_notrace();
+	return ret;
 }
 
 /* We need to define a real function for sched_clock, to override the
@@ -227,14 +231,14 @@ void __init tsc_calibrate(void)
 	/* hpet or pmtimer available ? */
 	if (!hpet && !pm1 && !pm2) {
 		printk(KERN_INFO "TSC calibrated against PIT\n");
-		return;
+		goto out;
 	}
 
 	/* Check, whether the sampling was disturbed by an SMI */
 	if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) {
 		printk(KERN_WARNING "TSC calibration disturbed by SMI, "
 		       "using PIT calibration result\n");
-		return;
+		goto out;
 	}
 
 	tsc2 = (tsc2 - tsc1) * 1000000L;
@@ -255,6 +259,7 @@ void __init tsc_calibrate(void)
 
 	tsc_khz = tsc2 / tsc1;
 
+out:
 	for_each_possible_cpu(cpu)
 		set_cyc2ns_scale(tsc_khz, cpu);
 }
Index: linux-2.6.25.4-rt4/arch/x86/kernel/tsc_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/tsc_32.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/tsc_32.c	2008-05-29 09:46:07.000000000 -0400
@@ -14,7 +14,7 @@
 
 #include "mach_timer.h"
 
-static int tsc_enabled;
+static int tsc_disabled;
 
 /*
  * On some systems the TSC frequency does not
@@ -28,8 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz);
 static int __init tsc_setup(char *str)
 {
 	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
-				"cannot disable TSC completely.\n");
-	mark_tsc_unstable("user disabled TSC");
+	       "cannot disable TSC completely.\n");
+	tsc_disabled = 1;
 	return 1;
 }
 #else
@@ -109,9 +109,10 @@ static void set_cyc2ns_scale(unsigned lo
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
-unsigned long long native_sched_clock(void)
+unsigned long long notrace  native_sched_clock(void)
 {
 	unsigned long long this_offset;
+	unsigned long long ret;
 
 	/*
 	 * Fall back to jiffies if there's no TSC available:
@@ -121,15 +122,21 @@ unsigned long long native_sched_clock(vo
 	 *   very important for it to be as fast as the platform
 	 *   can achive it. )
 	 */
-	if (unlikely(!tsc_enabled && !tsc_unstable))
+	if (unlikely(tsc_disabled))
 		/* No locking but a rare wrong value is not a big deal: */
 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
+	preempt_disable_notrace();
+
 	/* read the Time Stamp Counter: */
 	rdtscll(this_offset);
 
 	/* return the value in ns */
-	return cycles_2_ns(this_offset);
+	ret = cycles_2_ns(this_offset);
+
+	preempt_enable_notrace();
+
+	return ret;
 }
 
 /* We need to define a real function for sched_clock, to override the
@@ -310,7 +317,6 @@ void mark_tsc_unstable(char *reason)
 {
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
-		tsc_enabled = 0;
 		printk("Marking TSC unstable due to: %s.\n", reason);
 		/* Can be called before registration */
 		if (clocksource_tsc.mult)
@@ -324,7 +330,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
 {
 	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
-		       d->ident);
+	       d->ident);
 	tsc_unstable = 1;
 	return 0;
 }
@@ -391,14 +397,23 @@ void __init tsc_init(void)
 {
 	int cpu;
 
-	if (!cpu_has_tsc)
+	if (!cpu_has_tsc || tsc_disabled) {
+		/* Disable the TSC in case of !cpu_has_tsc */
+		tsc_disabled = 1;
 		goto out_no_tsc;
+	}
 
 	cpu_khz = calculate_cpu_khz();
 	tsc_khz = cpu_khz;
 
-	if (!cpu_khz)
+	if (!cpu_khz) {
+		/*
+		 * We need to disable the TSC completely in this case
+		 * to prevent sched_clock() from using it.
+		 */
+		tsc_disabled = 1;
 		goto out_no_tsc;
+	}
 
 	printk("Detected %lu.%03lu MHz processor.\n",
 				(unsigned long)cpu_khz / 1000,
@@ -427,8 +442,7 @@ void __init tsc_init(void)
 	if (check_tsc_unstable()) {
 		clocksource_tsc.rating = 0;
 		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
-	} else
-		tsc_enabled = 1;
+	}
 
 	clocksource_register(&clocksource_tsc);
 
Index: linux-2.6.25.4-rt4/security/selinux/avc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/security/selinux/avc.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/security/selinux/avc.c	2008-05-29 09:46:01.000000000 -0400
@@ -312,6 +312,7 @@ static inline int avc_reclaim_node(void)
 		if (!spin_trylock_irqsave(&avc_cache.slots_lock[hvalue], flags))
 			continue;
 
+		rcu_read_lock();
 		list_for_each_entry(node, &avc_cache.slots[hvalue], list) {
 			if (atomic_dec_and_test(&node->ae.used)) {
 				/* Recently Unused */
@@ -319,11 +320,13 @@ static inline int avc_reclaim_node(void)
 				avc_cache_stats_incr(reclaims);
 				ecx++;
 				if (ecx >= AVC_CACHE_RECLAIM) {
+					rcu_read_unlock();
 					spin_unlock_irqrestore(&avc_cache.slots_lock[hvalue], flags);
 					goto out;
 				}
 			}
 		}
+		rcu_read_unlock();
 		spin_unlock_irqrestore(&avc_cache.slots_lock[hvalue], flags);
 	}
 out:
@@ -821,8 +824,14 @@ int avc_ss_reset(u32 seqno)
 
 	for (i = 0; i < AVC_CACHE_SLOTS; i++) {
 		spin_lock_irqsave(&avc_cache.slots_lock[i], flag);
+		/*
+		 * On -rt the outer spinlock does not prevent RCU
+		 * from being performed:
+		 */
+		rcu_read_lock();
 		list_for_each_entry(node, &avc_cache.slots[i], list)
 			avc_node_delete(node);
+		rcu_read_unlock();
 		spin_unlock_irqrestore(&avc_cache.slots_lock[i], flag);
 	}
 
Index: linux-2.6.25.4-rt4/security/selinux/netif.c
===================================================================
--- linux-2.6.25.4-rt4.orig/security/selinux/netif.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/security/selinux/netif.c	2008-05-29 09:46:01.000000000 -0400
@@ -259,11 +259,13 @@ static void sel_netif_flush(void)
 	int idx;
 	struct sel_netif *netif;
 
+	rcu_read_lock();
 	spin_lock_bh(&sel_netif_lock);
 	for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++)
 		list_for_each_entry(netif, &sel_netif_hash[idx], list)
 			sel_netif_destroy(netif);
 	spin_unlock_bh(&sel_netif_lock);
+	rcu_read_unlock();
 }
 
 static int sel_netif_avc_callback(u32 event, u32 ssid, u32 tsid,
Index: linux-2.6.25.4-rt4/kernel/rcupreempt_trace.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rcupreempt_trace.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rcupreempt_trace.c	2008-05-29 09:46:02.000000000 -0400
@@ -296,8 +296,14 @@ static int rcupreempt_debugfs_init(void)
 						NULL, &rcuctrs_fops);
 	if (!ctrsdir)
 		goto free_out;
+
+	if (!rcu_trace_boost_create(rcudir))
+		goto free_out;
+
 	return 0;
 free_out:
+	if (ctrsdir)
+		debugfs_remove(ctrsdir);
 	if (statdir)
 		debugfs_remove(statdir);
 	if (gpdir)
@@ -309,15 +315,21 @@ out:
 
 static int __init rcupreempt_trace_init(void)
 {
+	int ret;
+
 	mutex_init(&rcupreempt_trace_mutex);
 	rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
 	if (!rcupreempt_trace_buf)
 		return 1;
-	return rcupreempt_debugfs_init();
+	ret = rcupreempt_debugfs_init();
+	if (ret)
+		kfree(rcupreempt_trace_buf);
+	return ret;
 }
 
 static void __exit rcupreempt_trace_cleanup(void)
 {
+	rcu_trace_boost_destroy();
 	debugfs_remove(statdir);
 	debugfs_remove(gpdir);
 	debugfs_remove(ctrsdir);
Index: linux-2.6.25.4-rt4/include/linux/rcupreempt.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/rcupreempt.h	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/rcupreempt.h	2008-05-29 09:46:04.000000000 -0400
@@ -44,7 +44,39 @@
 
 #define rcu_qsctr_inc(cpu)
 #define rcu_bh_qsctr_inc(cpu)
-#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
+
+/*
+ * Someone might want to pass call_rcu_bh as a function pointer.
+ * So this needs to just be a rename and not a macro function.
+ *  (no parentheses)
+ */
+#define call_rcu_bh	 	call_rcu
+
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+/*
+ * Task state with respect to being RCU-boosted.  This state is changed
+ * by the task itself in response to the following three events:
+ * 1. Preemption (or block on lock) while in RCU read-side critical section.
+ * 2. Outermost rcu_read_unlock() for blocked RCU read-side critical section.
+ *
+ * The RCU-boost task also updates the state when boosting priority.
+ */
+enum rcu_boost_state {
+	RCU_BOOST_IDLE = 0,	   /* Not yet blocked if in RCU read-side. */
+	RCU_BOOST_BLOCKED = 1,	   /* Blocked from RCU read-side. */
+	RCU_BOOSTED = 2,	   /* Boosting complete. */
+	RCU_BOOST_INVALID = 3,	   /* For bogus state sightings. */
+};
+
+#define N_RCU_BOOST_STATE (RCU_BOOST_INVALID + 1)
+
+int __init rcu_preempt_boost_init(void);
+
+#else /* CONFIG_PREEPMT_RCU_BOOST */
+
+#define rcu_preempt_boost_init() do { } while (0)
+
+#endif /* CONFIG_PREEMPT_RCU_BOOST */
 
 extern void __rcu_read_lock(void)	__acquires(RCU);
 extern void __rcu_read_unlock(void)	__releases(RCU);
@@ -89,14 +121,26 @@ static inline void rcu_enter_nohz(void)
 {
 	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 	__get_cpu_var(dynticks_progress_counter)++;
-	WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
+	if (unlikely(__get_cpu_var(dynticks_progress_counter) & 0x1)) {
+		printk("BUG: bad accounting of dynamic ticks\n");
+		printk("   will try to fix, but it is best to reboot\n");
+		WARN_ON(1);
+		/* try to fix it */
+		__get_cpu_var(dynticks_progress_counter)++;
+	}
 }
 
 static inline void rcu_exit_nohz(void)
 {
 	__get_cpu_var(dynticks_progress_counter)++;
 	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
-	WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
+	if (unlikely(!(__get_cpu_var(dynticks_progress_counter) & 0x1))) {
+		printk("BUG: bad accounting of dynamic ticks\n");
+		printk("   will try to fix, but it is best to reboot\n");
+		WARN_ON(1);
+		/* try to fix it */
+		__get_cpu_var(dynticks_progress_counter)++;
+	}
 }
 
 #else /* CONFIG_NO_HZ */
@@ -104,5 +148,7 @@ static inline void rcu_exit_nohz(void)
 #define rcu_exit_nohz()		do { } while (0)
 #endif /* CONFIG_NO_HZ */
 
+extern void rcu_process_callbacks(struct softirq_action *unused);
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUPREEMPT_H */
Index: linux-2.6.25.4-rt4/include/linux/init_task.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/init_task.h	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/init_task.h	2008-05-29 09:47:26.000000000 -0400
@@ -10,6 +10,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <net/net_namespace.h>
+#include <linux/spinlock.h>
 
 #define INIT_FDTABLE \
 {							\
@@ -87,6 +88,24 @@ extern struct nsproxy init_nsproxy;
 	.signalfd_wqh	= __WAIT_QUEUE_HEAD_INITIALIZER(sighand.signalfd_wqh),	\
 }
 
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+#define INIT_RCU_BOOST_PRIO .rcu_prio	= MAX_PRIO,
+#define INIT_PREEMPT_RCU_BOOST(tsk)					\
+	.rcub_rbdp	= NULL,						\
+	.rcub_state	= RCU_BOOST_IDLE,				\
+	.rcub_entry	= LIST_HEAD_INIT(tsk.rcub_entry),
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+#define INIT_RCU_BOOST_PRIO
+#define INIT_PREEMPT_RCU_BOOST(tsk)
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
+#ifdef CONFIG_PREEMPT_RT
+# define INIT_RW_OWNERS(tsk) .owned_read_locks = {			\
+		[0 ... (MAX_RWLOCK_DEPTH - 1) ] = { .task = &tsk } },
+#else
+# define INIT_RW_OWNERS(tsk)
+#endif
+
 extern struct group_info init_groups;
 
 #define INIT_STRUCT_PID {						\
@@ -148,6 +167,7 @@ extern struct group_info init_groups;
 	.static_prio	= MAX_PRIO-20,					\
 	.normal_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
+	INIT_RCU_BOOST_PRIO						\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
@@ -186,7 +206,8 @@ extern struct group_info init_groups;
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
-	.pi_lock	= __SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
+	.posix_timer_list = NULL,					\
+	.pi_lock	= RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
 	.pids = {							\
 		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
 		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
@@ -196,6 +217,8 @@ extern struct group_info init_groups;
 	INIT_IDS							\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
+	INIT_PREEMPT_RCU_BOOST(tsk)					\
+	INIT_RW_OWNERS(tsk)						\
 }
 
 
Index: linux-2.6.25.4-rt4/include/linux/rcupdate.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/rcupdate.h	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/rcupdate.h	2008-05-29 09:46:01.000000000 -0400
@@ -245,5 +245,49 @@ extern long rcu_batches_completed_bh(voi
 extern void rcu_init(void);
 extern int rcu_needs_cpu(int cpu);
 
+struct dentry;
+
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+extern void init_rcu_boost_late(void);
+extern void rcu_boost_readers(void);
+extern void rcu_unboost_readers(void);
+extern void __rcu_preempt_boost(void);
+#ifdef CONFIG_RCU_TRACE
+extern int rcu_trace_boost_create(struct dentry *rcudir);
+extern void rcu_trace_boost_destroy(void);
+#endif /* CONFIG_RCU_TRACE */
+#define rcu_preempt_boost() /* cpp to avoid #include hell. */ \
+	do { \
+		if (unlikely(current->rcu_read_lock_nesting > 0)) \
+			__rcu_preempt_boost(); \
+	} while (0)
+extern void __rcu_preempt_unboost(void);
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+static inline void init_rcu_boost_late(void)
+{
+}
+static inline void rcu_preempt_boost(void)
+{
+}
+static inline void __rcu_preempt_unboost(void)
+{
+}
+static inline void rcu_boost_readers(void)
+{
+}
+static inline void rcu_unboost_readers(void)
+{
+}
+#ifdef CONFIG_RCU_TRACE
+static inline int rcu_trace_boost_create(struct dentry *rcudir)
+{
+	return 0;
+}
+static inline void rcu_trace_boost_destroy(void)
+{
+}
+#endif /* CONFIG_RCU_TRACE */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUPDATE_H */
Index: linux-2.6.25.4-rt4/include/linux/sched.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/sched.h	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/sched.h	2008-05-29 09:47:32.000000000 -0400
@@ -92,6 +92,28 @@ struct sched_param {
 
 #include <asm/processor.h>
 
+#ifdef CONFIG_PREEMPT
+extern int kernel_preemption;
+#else
+# define kernel_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+extern int voluntary_preemption;
+#else
+# define voluntary_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+extern int softirq_preemption;
+#else
+# define softirq_preemption 0
+#endif
+
+#ifdef CONFIG_PREEMPT_HARDIRQS
+extern int hardirq_preemption;
+#else
+# define hardirq_preemption 0
+#endif
+
 struct mem_cgroup;
 struct exec_domain;
 struct futex_pi_state;
@@ -160,6 +182,8 @@ print_cfs_rq(struct seq_file *m, int cpu
 }
 #endif
 
+extern struct semaphore kernel_sem;
+
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
@@ -171,16 +195,18 @@ print_cfs_rq(struct seq_file *m, int cpu
  * mistake.
  */
 #define TASK_RUNNING		0
-#define TASK_INTERRUPTIBLE	1
-#define TASK_UNINTERRUPTIBLE	2
-#define __TASK_STOPPED		4
-#define __TASK_TRACED		8
+#define TASK_RUNNING_MUTEX	1
+#define TASK_INTERRUPTIBLE	2
+#define TASK_UNINTERRUPTIBLE	4
+#define __TASK_STOPPED		8
+#define __TASK_TRACED		16
 /* in tsk->exit_state */
-#define EXIT_ZOMBIE		16
-#define EXIT_DEAD		32
+#define EXIT_ZOMBIE		32
+#define EXIT_DEAD		64
 /* in tsk->state again */
-#define TASK_DEAD		64
-#define TASK_WAKEKILL		128
+#define TASK_NONINTERACTIVE	128
+#define TASK_DEAD		256
+#define TASK_WAKEKILL		512
 
 /* Convenience macros for the sake of set_task_state */
 #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -189,10 +215,12 @@ print_cfs_rq(struct seq_file *m, int cpu
 
 /* Convenience macros for the sake of wake_up */
 #define TASK_NORMAL		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
-#define TASK_ALL		(TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
+#define TASK_ALL		(TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED | \
+				 TASK_RUNNING_MUTEX)
 
 /* get_task_state() */
-#define TASK_REPORT		(TASK_RUNNING | TASK_INTERRUPTIBLE | \
+#define TASK_REPORT		(TASK_RUNNING | TASK_RUNNING_MUTEX | \
+				 TASK_INTERRUPTIBLE | \
 				 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
 				 __TASK_TRACED)
 
@@ -208,6 +236,28 @@ print_cfs_rq(struct seq_file *m, int cpu
 #define set_task_state(tsk, state_value)		\
 	set_mb((tsk)->state, (state_value))
 
+// #define PREEMPT_DIRECT
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void nmi_show_all_regs(void);
+#else
+# define nmi_show_all_regs() do { } while (0)
+#endif
+
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/signal.h>
+#include <linux/securebits.h>
+#include <linux/fs_struct.h>
+#include <linux/compiler.h>
+#include <linux/completion.h>
+#include <linux/pid.h>
+#include <linux/percpu.h>
+#include <linux/topology.h>
+#include <linux/seccomp.h>
+
+struct exec_domain;
+
 /*
  * set_current_state() includes a barrier so that the write of current->state
  * is correctly serialised wrt the caller's subsequent test of whether to
@@ -246,6 +296,8 @@ extern asmlinkage void schedule_tail(str
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
+extern int runqueue_is_locked(void);
+
 extern cpumask_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
@@ -269,6 +321,7 @@ static inline void show_state(void)
 }
 
 extern void show_regs(struct pt_regs *);
+extern int irq_show_regs_callback(int cpu, struct pt_regs *regs);
 
 /*
  * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
@@ -289,6 +342,12 @@ extern void hrtick_resched(void);
 
 extern void sched_show_task(struct task_struct *p);
 
+#ifdef CONFIG_GENERIC_HARDIRQS
+extern int debug_direct_keyboard;
+#else
+# define debug_direct_keyboard 0
+#endif
+
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
 extern void spawn_softlockup_task(void);
@@ -329,6 +388,11 @@ extern signed long schedule_timeout_inte
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
 asmlinkage void schedule(void);
+/*
+ * This one can be called with interrupts disabled, only
+ * to be used by lowlevel arch code!
+ */
+asmlinkage void __sched __schedule(void);
 
 struct nsproxy;
 struct user_namespace;
@@ -557,6 +621,19 @@ struct signal_struct {
 #define SIGNAL_STOP_CONTINUED	0x00000004 /* SIGCONT since WCONTINUED reap */
 #define SIGNAL_GROUP_EXIT	0x00000008 /* group exit in progress */
 
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+#define set_rcu_prio(p, prio) /* cpp to avoid #include hell */ \
+	do { \
+		(p)->rcu_prio = (prio); \
+	} while (0)
+#define get_rcu_prio(p) (p)->rcu_prio  /* cpp to avoid #include hell */
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+static inline void set_rcu_prio(struct task_struct *p, int prio)
+{
+}
+#define get_rcu_prio(p) (MAX_PRIO)  /* cpp to use MAX_PRIO before it's defined */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
 /* If true, all threads except ->group_exit_task have pending SIGKILL */
 static inline int signal_group_exit(const struct signal_struct *sig)
 {
@@ -991,6 +1068,16 @@ struct sched_rt_entity {
 #endif
 };
 
+#ifdef CONFIG_PREEMPT_RT
+struct rw_mutex;
+struct reader_lock_struct {
+	struct rw_mutex *lock;
+	struct list_head list;
+	struct task_struct *task;
+	int count;
+};
+
+#endif
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1001,12 +1088,13 @@ struct task_struct {
 	int lock_depth;		/* BKL lock depth */
 
 #ifdef CONFIG_SMP
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	int oncpu;
 #endif
-#endif
 
 	int prio, static_prio, normal_prio;
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+	int rcu_prio;
+#endif
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
@@ -1041,6 +1129,12 @@ struct task_struct {
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
 #endif
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+	struct rcu_boost_dat *rcub_rbdp;
+	enum rcu_boost_state rcub_state;
+	struct list_head rcub_entry;
+	unsigned long rcu_preempt_counter;
+#endif
 
 	struct list_head tasks;
 	/*
@@ -1104,6 +1198,8 @@ struct task_struct {
 	unsigned long long it_sched_expires;
 	struct list_head cpu_timers[3];
 
+	struct task_struct* posix_timer_list;
+
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
@@ -1169,7 +1265,7 @@ struct task_struct {
 	spinlock_t alloc_lock;
 
 	/* Protection of the PI data structures: */
-	spinlock_t pi_lock;
+	raw_spinlock_t pi_lock;
 
 #ifdef CONFIG_RT_MUTEXES
 	/* PI waiters blocked on a rt_mutex held by this task */
@@ -1182,6 +1278,7 @@ struct task_struct {
 	/* mutex deadlock detection */
 	struct mutex_waiter *blocked_on;
 #endif
+	int pagefault_disabled;
 #ifdef CONFIG_TRACE_IRQFLAGS
 	unsigned int irq_events;
 	int hardirqs_enabled;
@@ -1205,6 +1302,38 @@ struct task_struct {
 	unsigned int lockdep_recursion;
 #endif
 
+#define MAX_PREEMPT_TRACE 25
+#define MAX_RWLOCK_DEPTH 5
+
+#ifdef CONFIG_PREEMPT_RT
+	int reader_lock_count;
+	struct reader_lock_struct owned_read_locks[MAX_RWLOCK_DEPTH];
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACE
+	unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE];
+	unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE];
+#endif
+
+#define MAX_LOCK_STACK	MAX_PREEMPT_TRACE
+#ifdef CONFIG_DEBUG_PREEMPT
+	int lock_count;
+# ifdef CONFIG_PREEMPT_RT
+	struct rt_mutex *owned_lock[MAX_LOCK_STACK];
+# endif
+#endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+	unsigned long	softlockup_count; /* Count to keep track how long the
+					   *  thread is in the kernel without
+					   *  sleeping.
+					   */
+#endif
+	/* realtime bits */
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	void *last_kernel_lock;
+#endif
+
 /* journalling filesystem info */
 	void *journal_info;
 
@@ -1271,26 +1400,22 @@ struct task_struct {
 	int latency_record_count;
 	struct latency_record latency_record[LT_SAVECOUNT];
 #endif
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Temporary hack, until we find a solution to
+	 * handle printk in atomic operations.
+	 */
+	int in_printk;
+#endif
 };
 
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space.  This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO	100
-#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#ifdef CONFIG_PREEMPT_RT
+# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0)
+#else
+# define set_printk_might_sleep(x) do { } while(0)
+#endif
 
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+#include <linux/sched_prio.h>
 
 static inline int rt_prio(int prio)
 {
@@ -1437,6 +1562,15 @@ extern struct pid *cad_pid;
 extern void free_task(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
+#ifdef CONFIG_PREEMPT_RT
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+	if (atomic_dec_and_test(&t->usage))
+		call_rcu(&t->rcu, __put_task_struct_cb);
+}
+#else
 extern void __put_task_struct(struct task_struct *t);
 
 static inline void put_task_struct(struct task_struct *t)
@@ -1444,6 +1578,7 @@ static inline void put_task_struct(struc
 	if (atomic_dec_and_test(&t->usage))
 		__put_task_struct(t);
 }
+#endif
 
 /*
  * Per process flags
@@ -1454,6 +1589,7 @@ static inline void put_task_struct(struc
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_NOSCHED	0x00000020	/* Userspace does not expect scheduling */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
@@ -1461,6 +1597,7 @@ static inline void put_task_struct(struc
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
 #define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
 #define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
+#define PF_KMAP		0x00004000	/* this context has a kmap */
 #define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
 #define PF_FROZEN	0x00010000	/* frozen for system suspend */
 #define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
@@ -1472,6 +1609,8 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_SOFTIRQ	0x04000000	/* softirq context */
+#define PF_HARDIRQ	0x08000000	/* hardirq context */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
@@ -1514,6 +1653,35 @@ static inline int set_cpus_allowed(struc
 
 extern unsigned long long sched_clock(void);
 
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline void sched_clock_init(void)
+{
+}
+
+static inline u64 sched_clock_cpu(int cpu)
+{
+	return sched_clock();
+}
+
+static inline void sched_clock_tick(void)
+{
+}
+
+static inline void sched_clock_idle_sleep_event(void)
+{
+}
+
+static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+}
+#else
+extern void sched_clock_init(void);
+extern u64 sched_clock_cpu(int cpu);
+extern void sched_clock_tick(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+#endif
+
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
@@ -1566,9 +1734,14 @@ extern int sysctl_sched_rt_runtime;
 
 extern unsigned int sysctl_sched_compat_yield;
 
+extern void task_setprio(struct task_struct *p, int prio);
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
+static inline void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	task_setprio(p, prio);
+}
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 #else
 static inline int rt_mutex_getprio(struct task_struct *p)
@@ -1590,6 +1763,7 @@ extern struct task_struct *curr_task(int
 extern void set_curr_task(int cpu, struct task_struct *p);
 
 void yield(void);
+void __yield(void);
 
 /*
  * The default (Linux) execution domain.
@@ -1661,6 +1835,9 @@ extern void do_timer(unsigned long ticks
 
 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
 extern int wake_up_process(struct task_struct *tsk);
+extern int wake_up_process_mutex(struct task_struct * tsk);
+extern int wake_up_process_sync(struct task_struct * tsk);
+extern int wake_up_process_mutex_sync(struct task_struct * tsk);
 extern void wake_up_new_task(struct task_struct *tsk,
 				unsigned long clone_flags);
 #ifdef CONFIG_SMP
@@ -1753,12 +1930,20 @@ extern struct mm_struct * mm_alloc(void)
 
 /* mmdrop drops the mm and the page tables */
 extern void __mmdrop(struct mm_struct *);
+extern void __mmdrop_delayed(struct mm_struct *);
+
 static inline void mmdrop(struct mm_struct * mm)
 {
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }
 
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		__mmdrop_delayed(mm);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 /* Grab a reference to a task's mm, if it is not already going away */
@@ -1931,6 +2116,11 @@ static inline void clear_tsk_need_resche
 	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
 }
 
+static inline int test_tsk_need_resched(struct task_struct *tsk)
+{
+	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
+}
+
 static inline int signal_pending(struct task_struct *p)
 {
 	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
@@ -1943,9 +2133,29 @@ static inline int fatal_signal_pending(s
 	return signal_pending(p) && __fatal_signal_pending(p);
 }
 
+static inline int _need_resched(void)
+{
+	return unlikely(test_tsk_need_resched(current));
+}
+
 static inline int need_resched(void)
 {
-	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+	return _need_resched();
+}
+
+static inline void set_tsk_need_resched_delayed(struct task_struct *tsk)
+{
+	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED);
+}
+
+static inline void clear_tsk_need_resched_delayed(struct task_struct *tsk)
+{
+	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED);
+}
+
+static inline int need_resched_delayed(void)
+{
+	return unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED));
 }
 
 /*
@@ -1967,15 +2177,23 @@ static inline int cond_resched(void)
 	return _cond_resched();
 }
 #endif
-extern int cond_resched_lock(spinlock_t * lock);
+extern int __cond_resched_raw_spinlock(raw_spinlock_t *lock);
+extern int __cond_resched_spinlock(spinlock_t *spinlock);
+
+#define cond_resched_lock(lock) \
+	PICK_SPIN_OP_RET(__cond_resched_raw_spinlock, __cond_resched_spinlock,\
+		 lock)
+
 extern int cond_resched_softirq(void);
+extern int cond_resched_softirq_context(void);
+extern int cond_resched_hardirq_context(void);
 
 /*
  * Does a critical section need to be broken due to another
  * task waiting?: (technically does not depend on CONFIG_PREEMPT,
  * but a general need for low latency)
  */
-static inline int spin_needbreak(spinlock_t *lock)
+static inline int __raw_spin_needbreak(raw_spinlock_t *lock)
 {
 #ifdef CONFIG_PREEMPT
 	return spin_is_contended(lock);
@@ -1984,6 +2202,37 @@ static inline int spin_needbreak(spinloc
 #endif
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static inline int __spin_needbreak(spinlock_t *lock)
+{
+	return lock->break_lock;
+}
+#else
+static inline int __spin_needbreak(spinlock_t *lock)
+{
+	/* should never be call outside of RT */
+	BUG();
+	return 0;
+}
+#endif
+
+#define spin_needbreak(lock) \
+	PICK_SPIN_OP_RET(__raw_spin_needbreak, __spin_needbreak, lock)
+
+static inline int softirq_need_resched(void)
+{
+	if (softirq_preemption && (current->flags & PF_SOFTIRQ))
+		return need_resched();
+	return 0;
+}
+
+static inline int hardirq_need_resched(void)
+{
+	if (hardirq_preemption && (current->flags & PF_HARDIRQ))
+		return need_resched();
+	return 0;
+}
+
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
@@ -2031,6 +2280,18 @@ static inline void arch_pick_mmap_layout
 }
 #endif
 
+#ifdef CONFIG_TRACING
+extern void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3);
+#else
+static inline void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+}
+#endif
+
 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
@@ -2094,6 +2355,12 @@ static inline void inc_syscw(struct task
 }
 #endif
 
+#ifdef CONFIG_PREEMPT_TRACE
+void print_preempt_trace(struct task_struct *tsk);
+#else
+# define print_preempt_trace(tsk) do { } while (0)
+#endif
+
 #ifdef CONFIG_SMP
 void migration_init(void);
 #else
@@ -2106,6 +2373,15 @@ static inline void migration_init(void)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+#ifdef CONFIG_SMP
+static inline int task_is_current(struct task_struct *task)
+{
+	return task->oncpu;
+}
+#endif
+
+#define TASK_STATE_TO_CHAR_STR "RMSDTtZX"
+
 #endif /* __KERNEL__ */
 
 #endif
Index: linux-2.6.25.4-rt4/kernel/Kconfig.preempt
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/Kconfig.preempt	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/Kconfig.preempt	2008-05-29 09:46:43.000000000 -0400
@@ -1,14 +1,13 @@
-
 choice
-	prompt "Preemption Model"
-	default PREEMPT_NONE
+	prompt "Preemption Mode"
+	default PREEMPT_RT
 
 config PREEMPT_NONE
 	bool "No Forced Preemption (Server)"
 	help
-	  This is the traditional Linux preemption model, geared towards
+	  This is the traditional Linux preemption model geared towards
 	  throughput. It will still provide good latencies most of the
-	  time, but there are no guarantees and occasional longer delays
+	  time but there are no guarantees and occasional long delays
 	  are possible.
 
 	  Select this option if you are building a kernel for a server or
@@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY
 	help
 	  This option reduces the latency of the kernel by adding more
 	  "explicit preemption points" to the kernel code. These new
-	  preemption points have been selected to reduce the maximum
+	  preemption points have been selected to minimize the maximum
 	  latency of rescheduling, providing faster application reactions,
 	  at the cost of slightly lower throughput.
 
@@ -33,25 +32,95 @@ config PREEMPT_VOLUNTARY
 
 	  Select this if you are building a kernel for a desktop system.
 
-config PREEMPT
+config PREEMPT_DESKTOP
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	help
 	  This option reduces the latency of the kernel by making
-	  all kernel code (that is not executing in a critical section)
+	  all kernel code that is not executing in a critical section
 	  preemptible.  This allows reaction to interactive events by
 	  permitting a low priority process to be preempted involuntarily
 	  even if it is in kernel mode executing a system call and would
-	  otherwise not be about to reach a natural preemption point.
-	  This allows applications to run more 'smoothly' even when the
-	  system is under load, at the cost of slightly lower throughput
-	  and a slight runtime overhead to kernel code.
+	  otherwise not about to reach a preemption point.  This allows
+	  applications to run more 'smoothly' even when the system is
+	  under load, at the cost of slighly lower throughput and a
+	  slight runtime overhead to kernel code.
+
+	  (According to profiles, when this mode is selected then even
+	  during kernel-intense workloads the system is in an immediately
+	  preemptible state more than 50% of the time.)
 
 	  Select this if you are building a kernel for a desktop or
 	  embedded system with latency requirements in the milliseconds
 	  range.
 
+config PREEMPT_RT
+	bool "Complete Preemption (Real-Time)"
+	select PREEMPT_SOFTIRQS
+	select PREEMPT_HARDIRQS
+	select PREEMPT_RCU
+	select RT_MUTEXES
+	help
+	  This option further reduces the scheduling latency of the
+	  kernel by replacing almost every spinlock used by the kernel
+	  with preemptible mutexes and thus making all but the most
+	  critical kernel code involuntarily preemptible. The remaining
+	  handful of lowlevel non-preemptible codepaths are short and
+	  have a deterministic latency of a couple of tens of
+	  microseconds (depending on the hardware).  This also allows
+	  applications to run more 'smoothly' even when the system is
+	  under load, at the cost of lower throughput and runtime
+	  overhead to kernel code.
+
+	  (According to profiles, when this mode is selected then even
+	  during kernel-intense workloads the system is in an immediately
+	  preemptible state more than 95% of the time.)
+
+	  Select this if you are building a kernel for a desktop,
+	  embedded or real-time system with guaranteed latency
+	  requirements of 100 usecs or lower.
+
 endchoice
 
+config PREEMPT
+	bool
+	default y
+	depends on PREEMPT_DESKTOP || PREEMPT_RT
+
+config PREEMPT_SOFTIRQS
+	bool "Thread Softirqs"
+	default n
+#	depends on PREEMPT
+	help
+	  This option reduces the latency of the kernel by 'threading'
+          soft interrupts. This means that all softirqs will execute
+          in softirqd's context. While this helps latency, it can also
+          reduce performance.
+
+          The threading of softirqs can also be controlled via
+          /proc/sys/kernel/softirq_preemption runtime flag and the
+          sofirq-preempt=0/1 boot-time option.
+
+	  Say N if you are unsure.
+
+config PREEMPT_HARDIRQS
+	bool "Thread Hardirqs"
+	default n
+	depends on !GENERIC_HARDIRQS_NO__DO_IRQ
+	select PREEMPT_SOFTIRQS
+	help
+	  This option reduces the latency of the kernel by 'threading'
+          hardirqs. This means that all (or selected) hardirqs will run
+          in their own kernel thread context. While this helps latency,
+          this feature can also reduce performance.
+
+          The threading of hardirqs can also be controlled via the
+          /proc/sys/kernel/hardirq_preemption runtime flag and the
+          hardirq-preempt=0/1 boot-time option. Per-irq threading can
+          be enabled/disable via the /proc/irq/<IRQ>/<handler>/threaded
+          runtime flags.
+
+	  Say N if you are unsure.
+
 config PREEMPT_RCU
 	bool "Preemptible RCU"
 	depends on PREEMPT
@@ -66,6 +135,20 @@ config PREEMPT_RCU
 
 	  Say N if you are unsure.
 
+config PREEMPT_RCU_BOOST
+	bool "Enable priority boosting of RCU read-side critical sections"
+	depends on PREEMPT_RCU
+	default y if PREEMPT_RT
+	help
+	  This option permits priority boosting of RCU read-side critical
+	  sections tat have been preempted and a RT process is waiting
+	  on a synchronize_rcu.
+
+	  An RCU thread is also created that periodically wakes up and
+	  performs a synchronize_rcu to make sure that all readers eventually
+	  do complete to prevent an indefinite delay of grace periods and
+	  possible OOM problems.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
 	depends on PREEMPT_RCU
Index: linux-2.6.25.4-rt4/kernel/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/Makefile	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/Makefile	2008-05-29 09:47:28.000000000 -0400
@@ -7,14 +7,30 @@ obj-y     = sched.o fork.o exec_domain.o
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
-	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o \
-	    notifier.o ksysfs.o pm_qos_params.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+
+CFLAGS_REMOVE_sched.o = -mno-spe
+
+ifdef CONFIG_FTRACE
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_marker.o = -pg
+endif
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
+ifneq ($(CONFIG_PREEMPT_RT),y)
+obj-y += mutex.o
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+endif
 obj-$(CONFIG_LOCKDEP) += lockdep.o
 ifeq ($(CONFIG_PROC_FS),y)
 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
@@ -26,6 +42,7 @@ endif
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
+obj-$(CONFIG_PREEMPT_RT) += rt.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
@@ -59,6 +76,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
+obj-$(CONFIG_PREEMPT_RCU_BOOST) += rcupreempt-boost.o
 ifeq ($(CONFIG_PREEMPT_RCU),y)
 obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
 endif
@@ -68,6 +86,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayac
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_SMP) += sched_cpupri.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6.25.4-rt4/kernel/fork.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/fork.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/fork.c	2008-05-29 09:47:28.000000000 -0400
@@ -34,6 +34,7 @@
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
+#include <linux/interrupt.h>
 #include <linux/futex.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
@@ -42,6 +43,8 @@
 #include <linux/audit.h>
 #include <linux/memcontrol.h>
 #include <linux/profile.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
@@ -73,6 +76,15 @@ DEFINE_PER_CPU(unsigned long, process_co
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
+/*
+ * Delayed mmdrop. In the PREEMPT_RT case we
+ * dont want to do this from the scheduling
+ * context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
+
 int nr_processes(void)
 {
 	int cpu;
@@ -117,10 +129,13 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
-void __put_task_struct(struct task_struct *tsk)
+#ifdef CONFIG_PREEMPT_RT
+void __put_task_struct_cb(struct rcu_head *rhp)
 {
+	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+	BUG_ON(atomic_read(&tsk->usage));
 	WARN_ON(!tsk->exit_state);
-	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
 	security_task_free(tsk);
@@ -132,8 +147,27 @@ void __put_task_struct(struct task_struc
 		free_task(tsk);
 }
 
+#else
+
+void __put_task_struct(struct task_struct *tsk)
+{
+	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
+	BUG_ON(atomic_read(&tsk->usage));
+	WARN_ON(tsk == current);
+
+	security_task_free(tsk);
+	free_uid(tsk->user);
+	put_group_info(tsk->group_info);
+
+	if (!profile_handoff_task(tsk))
+		free_task(tsk);
+}
+#endif
+
 void __init fork_init(unsigned long mempages)
 {
+	int i;
+
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
@@ -161,6 +195,9 @@ void __init fork_init(unsigned long memp
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 	init_task.signal->rlim[RLIMIT_SIGPENDING] =
 		init_task.signal->rlim[RLIMIT_NPROC];
+
+	for (i = 0; i < NR_CPUS; i++)
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
 }
 
 static struct task_struct *dup_task_struct(struct task_struct *orig)
@@ -226,6 +263,7 @@ static int dup_mmap(struct mm_struct *mm
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
+	INIT_LIST_HEAD(&mm->delayed_drop);
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
@@ -990,6 +1028,9 @@ static void rt_mutex_init_task(struct ta
 #ifdef CONFIG_RT_MUTEXES
 	plist_head_init(&p->pi_waiters, &p->pi_lock);
 	p->pi_blocked_on = NULL;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+	p->last_kernel_lock = NULL;
+# endif
 #endif
 }
 
@@ -1041,7 +1082,7 @@ static struct task_struct *copy_process(
 
 	rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
@@ -1079,7 +1120,13 @@ static struct task_struct *copy_process(
 #ifdef CONFIG_PREEMPT_RCU
 	p->rcu_read_lock_nesting = 0;
 	p->rcu_flipctr_idx = 0;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+	p->rcu_prio = MAX_PRIO;
+	p->rcub_rbdp = NULL;
+	p->rcub_state = RCU_BOOST_IDLE;
+	INIT_LIST_HEAD(&p->rcub_entry);
+#endif
+#endif /* CONFIG_PREEMPT_RCU */
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 
@@ -1114,7 +1161,7 @@ static struct task_struct *copy_process(
 	INIT_LIST_HEAD(&p->cpu_timers[0]);
 	INIT_LIST_HEAD(&p->cpu_timers[1]);
 	INIT_LIST_HEAD(&p->cpu_timers[2]);
-
+	p->posix_timer_list = NULL;
 	p->lock_depth = -1;		/* -1 = no lock */
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->real_start_time = p->start_time;
@@ -1154,6 +1201,7 @@ static struct task_struct *copy_process(
 	p->hardirq_context = 0;
 	p->softirq_context = 0;
 #endif
+	p->pagefault_disabled = 0;
 #ifdef CONFIG_LOCKDEP
 	p->lockdep_depth = 0; /* no locks held yet */
 	p->curr_chain_key = 0;
@@ -1193,6 +1241,22 @@ static struct task_struct *copy_process(
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
 		goto bad_fork_cleanup_io;
+#ifdef CONFIG_DEBUG_PREEMPT
+	p->lock_count = 0;
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+	p->reader_lock_count = 0;
+	{
+		int i;
+		for (i = 0; i < MAX_RWLOCK_DEPTH; i++) {
+			INIT_LIST_HEAD(&p->owned_read_locks[i].list);
+			p->owned_read_locks[i].count = 0;
+			p->owned_read_locks[i].lock = NULL;
+			p->owned_read_locks[i].task = p;
+		}
+	}
+#endif
 
 	if (pid != &init_struct_pid) {
 		retval = -ENOMEM;
@@ -1277,11 +1341,13 @@ static struct task_struct *copy_process(
 	 * to ensure it is on a valid CPU (and if not, just force it back to
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
+	preempt_disable();
 	p->cpus_allowed = current->cpus_allowed;
 	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
+	preempt_enable();
 
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
@@ -1344,7 +1410,9 @@ static struct task_struct *copy_process(
 			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
 			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
+			preempt_disable();
 			__get_cpu_var(process_counts)++;
+			preempt_enable();
 		}
 		attach_pid(p, PIDTYPE_PID, pid);
 		nr_threads++;
@@ -1788,3 +1856,124 @@ bad_unshare_cleanup_thread:
 bad_unshare_out:
 	return err;
 }
+
+static int mmdrop_complete(void)
+{
+	struct list_head *head;
+	int ret = 0;
+
+	head = &get_cpu_var(delayed_drop_list);
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+					struct mm_struct, delayed_drop);
+		list_del(&mm->delayed_drop);
+		put_cpu_var(delayed_drop_list);
+
+		__mmdrop(mm);
+		ret = 1;
+
+		head = &get_cpu_var(delayed_drop_list);
+	}
+	put_cpu_var(delayed_drop_list);
+
+	return ret;
+}
+
+/*
+ * We dont want to do complex work from the scheduler, thus
+ * we delay the work to a per-CPU worker thread:
+ */
+void  __mmdrop_delayed(struct mm_struct *mm)
+{
+	struct task_struct *desched_task;
+	struct list_head *head;
+
+	head = &get_cpu_var(delayed_drop_list);
+	list_add_tail(&mm->delayed_drop, head);
+	desched_task = __get_cpu_var(desched_task);
+	if (desched_task)
+		wake_up_process(desched_task);
+	put_cpu_var(delayed_drop_list);
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+	set_user_nice(current, -10);
+	current->flags |= PF_NOFREEZE | PF_SOFTIRQ;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+
+		if (mmdrop_complete())
+			continue;
+		schedule();
+
+		/*
+ 		 * This must be called from time to time on ia64, and is a
+ 		 * no-op on other archs. Used to be in cpu_idle(), but with
+ 		 * the new -rt semantics it can't stay there.
+		 */
+		check_pgt_cache();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+
+		BUG_ON(per_cpu(desched_task, hotcpu));
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+		p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("desched_thread for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+		per_cpu(desched_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+		break;
+	case CPU_ONLINE:
+
+		wake_up_process(per_cpu(desched_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+
+		p = per_cpu(desched_task, hotcpu);
+		per_cpu(desched_task, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_tasklets(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
+
Index: linux-2.6.25.4-rt4/kernel/rcupreempt-boost.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/rcupreempt-boost.c	2008-05-29 09:46:48.000000000 -0400
@@ -0,0 +1,588 @@
+/*
+ * Read-Copy Update preempt priority boosting
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright Red Hat Inc, 2007
+ *
+ * Authors: Steven Rostedt <srostedt@redhat.com>
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>.
+ *
+ */
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/kthread.h>
+
+DEFINE_RAW_SPINLOCK(rcu_boost_wake_lock);
+static int rcu_boost_prio = MAX_PRIO;	/* Prio to set preempted RCU readers */
+static long rcu_boost_counter;		/* used to keep track of who boosted */
+static int rcu_preempt_thread_secs = 3;	/* Seconds between waking rcupreemptd thread */
+
+struct rcu_boost_dat {
+	raw_spinlock_t rbs_lock;	/* Sync changes to this struct */
+	int rbs_prio;			/* CPU copy of rcu_boost_prio  */
+	struct list_head rbs_toboost;	/* Preempted RCU readers       */
+	struct list_head rbs_boosted;	/* RCU readers that have been boosted */
+#ifdef CONFIG_RCU_TRACE
+	/* The rest are for statistics */
+	unsigned long rbs_stat_task_boost_called;
+	unsigned long rbs_stat_task_boosted;
+	unsigned long rbs_stat_boost_called;
+	unsigned long rbs_stat_try_boost;
+	unsigned long rbs_stat_boosted;
+	unsigned long rbs_stat_unboost_called;
+	unsigned long rbs_stat_unboosted;
+	unsigned long rbs_stat_try_boost_readers;
+	unsigned long rbs_stat_boost_readers;
+	unsigned long rbs_stat_try_unboost_readers;
+	unsigned long rbs_stat_unboost_readers;
+	unsigned long rbs_stat_over_taken;
+#endif /* CONFIG_RCU_TRACE */
+};
+
+static DEFINE_PER_CPU(struct rcu_boost_dat, rcu_boost_data);
+#define RCU_BOOST_ME &__get_cpu_var(rcu_boost_data)
+
+#ifdef CONFIG_RCU_TRACE
+
+#define RCUPREEMPT_BOOST_TRACE_BUF_SIZE 4096
+static char rcupreempt_boost_trace_buf[RCUPREEMPT_BOOST_TRACE_BUF_SIZE];
+
+static ssize_t rcuboost_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	static DEFINE_MUTEX(mutex);
+	int cnt = 0;
+	int cpu;
+	struct rcu_boost_dat *rbd;
+	ssize_t bcount;
+	unsigned long task_boost_called = 0;
+	unsigned long task_boosted = 0;
+	unsigned long boost_called = 0;
+	unsigned long try_boost = 0;
+	unsigned long boosted = 0;
+	unsigned long unboost_called = 0;
+	unsigned long unboosted = 0;
+	unsigned long try_boost_readers = 0;
+	unsigned long boost_readers = 0;
+	unsigned long try_unboost_readers = 0;
+	unsigned long unboost_readers = 0;
+	unsigned long over_taken = 0;
+
+	mutex_lock(&mutex);
+
+	for_each_online_cpu(cpu) {
+		rbd = &per_cpu(rcu_boost_data, cpu);
+
+		task_boost_called += rbd->rbs_stat_task_boost_called;
+		task_boosted += rbd->rbs_stat_task_boosted;
+		boost_called += rbd->rbs_stat_boost_called;
+		try_boost += rbd->rbs_stat_try_boost;
+		boosted += rbd->rbs_stat_boosted;
+		unboost_called += rbd->rbs_stat_unboost_called;
+		unboosted += rbd->rbs_stat_unboosted;
+		try_boost_readers += rbd->rbs_stat_try_boost_readers;
+		boost_readers += rbd->rbs_stat_boost_readers;
+		try_unboost_readers += rbd->rbs_stat_try_boost_readers;
+		unboost_readers += rbd->rbs_stat_boost_readers;
+		over_taken += rbd->rbs_stat_over_taken;
+	}
+
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"task_boost_called = %ld\n",
+			task_boost_called);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"task_boosted = %ld\n",
+			task_boosted);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"boost_called = %ld\n",
+			boost_called);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"try_boost = %ld\n",
+			try_boost);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"boosted = %ld\n",
+			boosted);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"unboost_called = %ld\n",
+			unboost_called);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"unboosted = %ld\n",
+			unboosted);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"try_boost_readers = %ld\n",
+			try_boost_readers);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"boost_readers = %ld\n",
+			boost_readers);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"try_unboost_readers = %ld\n",
+			try_unboost_readers);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"unboost_readers = %ld\n",
+			unboost_readers);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"over_taken = %ld\n",
+			over_taken);
+	cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
+			RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
+			"rcu_boost_prio = %d\n",
+			rcu_boost_prio);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcupreempt_boost_trace_buf, strlen(rcupreempt_boost_trace_buf));
+	mutex_unlock(&mutex);
+
+	return bcount;
+}
+
+static struct file_operations rcuboost_fops = {
+	.read = rcuboost_read,
+};
+
+static struct dentry  *rcuboostdir;
+int rcu_trace_boost_create(struct dentry *rcudir)
+{
+	rcuboostdir = debugfs_create_file("rcuboost", 0444, rcudir,
+					  NULL, &rcuboost_fops);
+	if (!rcuboostdir)
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(rcu_trace_boost_create);
+
+void rcu_trace_boost_destroy(void)
+{
+	if (rcuboostdir)
+		debugfs_remove(rcuboostdir);
+	rcuboostdir = NULL;
+}
+EXPORT_SYMBOL_GPL(rcu_trace_boost_destroy);
+
+#define RCU_BOOST_TRACE_FUNC_DECL(type)			      \
+	static void rcu_trace_boost_##type(struct rcu_boost_dat *rbd)	\
+	{								\
+		rbd->rbs_stat_##type++;					\
+	}
+RCU_BOOST_TRACE_FUNC_DECL(task_boost_called)
+RCU_BOOST_TRACE_FUNC_DECL(task_boosted)
+RCU_BOOST_TRACE_FUNC_DECL(boost_called)
+RCU_BOOST_TRACE_FUNC_DECL(try_boost)
+RCU_BOOST_TRACE_FUNC_DECL(boosted)
+RCU_BOOST_TRACE_FUNC_DECL(unboost_called)
+RCU_BOOST_TRACE_FUNC_DECL(unboosted)
+RCU_BOOST_TRACE_FUNC_DECL(try_boost_readers)
+RCU_BOOST_TRACE_FUNC_DECL(boost_readers)
+RCU_BOOST_TRACE_FUNC_DECL(try_unboost_readers)
+RCU_BOOST_TRACE_FUNC_DECL(unboost_readers)
+RCU_BOOST_TRACE_FUNC_DECL(over_taken)
+#else /* CONFIG_RCU_TRACE */
+/* These were created by the above macro "RCU_BOOST_TRACE_FUNC_DECL" */
+# define rcu_trace_boost_task_boost_called(rbd) do { } while (0)
+# define rcu_trace_boost_task_boosted(rbd) do { } while (0)
+# define rcu_trace_boost_boost_called(rbd) do { } while (0)
+# define rcu_trace_boost_try_boost(rbd) do { } while (0)
+# define rcu_trace_boost_boosted(rbd) do { } while (0)
+# define rcu_trace_boost_unboost_called(rbd) do { } while (0)
+# define rcu_trace_boost_unboosted(rbd) do { } while (0)
+# define rcu_trace_boost_try_boost_readers(rbd) do { } while (0)
+# define rcu_trace_boost_boost_readers(rbd) do { } while (0)
+# define rcu_trace_boost_try_unboost_readers(rbd) do { } while (0)
+# define rcu_trace_boost_unboost_readers(rbd) do { } while (0)
+# define rcu_trace_boost_over_taken(rbd) do { } while (0)
+#endif /* CONFIG_RCU_TRACE */
+
+static inline int rcu_is_boosted(struct task_struct *task)
+{
+	return !list_empty(&task->rcub_entry);
+}
+
+/*
+ * Helper function to boost a task's prio.
+ */
+static void rcu_boost_task(struct task_struct *task)
+{
+	WARN_ON(!irqs_disabled());
+	WARN_ON_SMP(!spin_is_locked(&task->pi_lock));
+
+	rcu_trace_boost_task_boost_called(RCU_BOOST_ME);
+
+	if (task->rcu_prio < task->prio) {
+		rcu_trace_boost_task_boosted(RCU_BOOST_ME);
+		task_setprio(task, task->rcu_prio);
+	}
+}
+
+/**
+ * __rcu_preepmt_boost - Called by sleeping RCU readers.
+ *
+ * When the RCU read-side critical section is preempted
+ * (or schedules out due to RT mutex)
+ * it places itself onto a list to notify that it is sleeping
+ * while holding a RCU read lock. If there is already a
+ * synchronize_rcu happening, then it will increase its
+ * priority (if necessary).
+ */
+void __rcu_preempt_boost(void)
+{
+	struct task_struct *curr = current;
+	struct rcu_boost_dat *rbd;
+	int prio;
+	unsigned long flags;
+
+	WARN_ON(!current->rcu_read_lock_nesting);
+
+	rcu_trace_boost_boost_called(RCU_BOOST_ME);
+
+	/* check to see if we are already boosted */
+	if (unlikely(rcu_is_boosted(curr)))
+		return;
+
+	/*
+	 * To keep us from preempting between grabing
+	 * the rbd and locking it, we use local_irq_save
+	 */
+	local_irq_save(flags);
+	rbd = &__get_cpu_var(rcu_boost_data);
+	spin_lock(&rbd->rbs_lock);
+
+	spin_lock(&curr->pi_lock);
+
+	curr->rcub_rbdp = rbd;
+
+	rcu_trace_boost_try_boost(rbd);
+
+	prio = rt_mutex_getprio(curr);
+
+	if (list_empty(&curr->rcub_entry))
+		list_add_tail(&curr->rcub_entry, &rbd->rbs_toboost);
+	if (prio <= rbd->rbs_prio)
+		goto out;
+
+	rcu_trace_boost_boosted(curr->rcub_rbdp);
+
+	set_rcu_prio(curr, rbd->rbs_prio);
+	rcu_boost_task(curr);
+
+ out:
+	spin_unlock(&curr->pi_lock);
+	spin_unlock_irqrestore(&rbd->rbs_lock, flags);
+}
+
+/**
+ * __rcu_preempt_unboost - called when releasing the RCU read lock
+ *
+ * When releasing the RCU read lock, a check is made to see if
+ * the task was preempted. If it was, it removes itself from the
+ * RCU data lists and if necessary, sets its priority back to
+ * normal.
+ */
+void __rcu_preempt_unboost(void)
+{
+	struct task_struct *curr = current;
+	struct rcu_boost_dat *rbd;
+	int prio;
+	unsigned long flags;
+
+	rcu_trace_boost_unboost_called(RCU_BOOST_ME);
+
+	/* if not boosted, then ignore */
+	if (likely(!rcu_is_boosted(curr)))
+		return;
+
+	/*
+	 * Need to be very careful with NMIs.
+	 * If we take the lock and an NMI comes in
+	 * and it may try to unboost us if curr->rcub_rbdp
+	 * is still set. So we zero it before grabbing the lock.
+	 * But this also means that we might be boosted again
+	 * so the boosting code needs to be aware of this.
+	 */
+	rbd = curr->rcub_rbdp;
+	curr->rcub_rbdp = NULL;
+
+	/*
+	 * Now an NMI might have came in after we grab
+	 * the below lock. This check makes sure that
+	 * the NMI doesn't try grabbing the lock
+	 * while we already have it.
+	 */
+	if (unlikely(!rbd))
+		return;
+
+	spin_lock_irqsave(&rbd->rbs_lock, flags);
+	/*
+	 * It is still possible that an NMI came in
+	 * between the "is_boosted" check and setting
+	 * the rcu_rbdp to NULL. This would mean that
+	 * the NMI already dequeued us.
+	 */
+	if (unlikely(!rcu_is_boosted(curr)))
+		goto out;
+
+	list_del_init(&curr->rcub_entry);
+
+	rcu_trace_boost_unboosted(rbd);
+
+	set_rcu_prio(curr, MAX_PRIO);
+
+	spin_lock(&curr->pi_lock);
+	prio = rt_mutex_getprio(curr);
+	task_setprio(curr, prio);
+
+	curr->rcub_rbdp = NULL;
+
+	spin_unlock(&curr->pi_lock);
+ out:
+	spin_unlock_irqrestore(&rbd->rbs_lock, flags);
+}
+
+/*
+ * For each rcu_boost_dat structure, update all the tasks that
+ * are on the lists to the priority of the caller of
+ * synchronize_rcu.
+ */
+static int __rcu_boost_readers(struct rcu_boost_dat *rbd, int prio, unsigned long flags)
+{
+	struct task_struct *curr = current;
+	struct task_struct *p;
+
+	spin_lock(&rbd->rbs_lock);
+
+	rbd->rbs_prio = prio;
+
+	/*
+	 * Move the already boosted readers onto the list and reboost
+	 * them.
+	 */
+	list_splice_init(&rbd->rbs_boosted,
+			 &rbd->rbs_toboost);
+
+	while (!list_empty(&rbd->rbs_toboost)) {
+		p = list_entry(rbd->rbs_toboost.next,
+			       struct task_struct, rcub_entry);
+		list_move_tail(&p->rcub_entry,
+			       &rbd->rbs_boosted);
+		set_rcu_prio(p, prio);
+		spin_lock(&p->pi_lock);
+		rcu_boost_task(p);
+		spin_unlock(&p->pi_lock);
+
+		/*
+		 * Now we release the lock to allow for a higher
+		 * priority task to come in and boost the readers
+		 * even higher. Or simply to let a higher priority
+		 * task to run now.
+		 */
+		spin_unlock(&rbd->rbs_lock);
+		spin_unlock_irqrestore(&rcu_boost_wake_lock, flags);
+
+		cpu_relax();
+		spin_lock_irqsave(&rcu_boost_wake_lock, flags);
+		/*
+		 * Another task may have taken over.
+		 */
+		if (curr->rcu_preempt_counter != rcu_boost_counter) {
+			rcu_trace_boost_over_taken(rbd);
+			return 1;
+		}
+
+		spin_lock(&rbd->rbs_lock);
+	}
+
+	spin_unlock(&rbd->rbs_lock);
+
+	return 0;
+}
+
+/**
+ * rcu_boost_readers - called by synchronize_rcu to boost sleeping RCU readers.
+ *
+ * This function iterates over all the per_cpu rcu_boost_data descriptors
+ * and boosts any sleeping (or slept) RCU readers.
+ */
+void rcu_boost_readers(void)
+{
+	struct task_struct *curr = current;
+	struct rcu_boost_dat *rbd;
+	unsigned long flags;
+	int prio;
+	int cpu;
+	int ret;
+
+	spin_lock_irqsave(&rcu_boost_wake_lock, flags);
+
+	prio = rt_mutex_getprio(curr);
+
+	rcu_trace_boost_try_boost_readers(RCU_BOOST_ME);
+
+	if (prio >= rcu_boost_prio) {
+		/* already boosted */
+		spin_unlock_irqrestore(&rcu_boost_wake_lock, flags);
+		return;
+	}
+
+	rcu_boost_prio = prio;
+
+	rcu_trace_boost_boost_readers(RCU_BOOST_ME);
+
+	/* Flag that we are the one to unboost */
+	curr->rcu_preempt_counter = ++rcu_boost_counter;
+
+	for_each_online_cpu(cpu) {
+		rbd = &per_cpu(rcu_boost_data, cpu);
+		ret = __rcu_boost_readers(rbd, prio, flags);
+		if (ret)
+			break;
+	}
+
+	spin_unlock_irqrestore(&rcu_boost_wake_lock, flags);
+
+}
+
+/**
+ * rcu_unboost_readers - set the boost level back to normal.
+ *
+ * This function DOES NOT change the priority of any RCU reader
+ * that was boosted. The RCU readers do that when they release
+ * the RCU lock. This function only sets the global
+ * rcu_boost_prio to MAX_PRIO so that new RCU readers that sleep
+ * do not increase their priority.
+ */
+void rcu_unboost_readers(void)
+{
+	struct rcu_boost_dat *rbd;
+	unsigned long flags;
+	int cpu;
+
+	spin_lock_irqsave(&rcu_boost_wake_lock, flags);
+
+	rcu_trace_boost_try_unboost_readers(RCU_BOOST_ME);
+
+	if (current->rcu_preempt_counter != rcu_boost_counter)
+		goto out;
+
+	rcu_trace_boost_unboost_readers(RCU_BOOST_ME);
+
+	/*
+	 * We could also put in something that
+	 * would allow other synchronize_rcu callers
+	 * of lower priority that are still waiting
+	 * to boost the prio.
+	 */
+	rcu_boost_prio = MAX_PRIO;
+
+	for_each_online_cpu(cpu) {
+		rbd = &per_cpu(rcu_boost_data, cpu);
+
+		spin_lock(&rbd->rbs_lock);
+		rbd->rbs_prio = rcu_boost_prio;
+		spin_unlock(&rbd->rbs_lock);
+	}
+
+ out:
+	spin_unlock_irqrestore(&rcu_boost_wake_lock, flags);
+}
+
+/*
+ * The krcupreemptd wakes up every "rcu_preempt_thread_secs"
+ * seconds at the minimum priority of 1 to do a
+ * synchronize_rcu. This ensures that grace periods finish
+ * and that we do not starve the system. If there are RT
+ * tasks above priority 1 that are hogging the system and
+ * preventing release of memory, then its the fault of the
+ * system designer running RT tasks too aggressively and the
+ * system is flawed regardless.
+ */
+static int krcupreemptd(void *data)
+{
+	struct sched_param param = { .sched_priority = 1 };
+	int ret;
+	int prio;
+
+	ret = sched_setscheduler(current, SCHED_FIFO, &param);
+	printk("krcupreemptd setsched %d\n", ret);
+	prio = current->prio;
+	printk("  prio = %d\n", prio);
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+		schedule_timeout(rcu_preempt_thread_secs * HZ);
+
+		__set_current_state(TASK_RUNNING);
+		if (prio != current->prio) {
+			prio = current->prio;
+			printk("krcupreemptd new prio is %d??\n",prio);
+		}
+
+		synchronize_rcu();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+int __init rcu_preempt_boost_init(void)
+{
+	struct rcu_boost_dat *rbd;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		rbd = &per_cpu(rcu_boost_data, cpu);
+
+		spin_lock_init(&rbd->rbs_lock);
+		rbd->rbs_prio = MAX_PRIO;
+		INIT_LIST_HEAD(&rbd->rbs_toboost);
+		INIT_LIST_HEAD(&rbd->rbs_boosted);
+	}
+
+	return 0;
+}
+
+static int __init rcu_preempt_start_krcupreemptd(void)
+{
+	struct task_struct *p;
+
+	p = kthread_create(krcupreemptd, NULL,
+			   "krcupreemptd");
+
+	if (IS_ERR(p)) {
+		printk("krcupreemptd failed\n");
+		return NOTIFY_BAD;
+	}
+	wake_up_process(p);
+
+	return 0;
+}
+
+__initcall(rcu_preempt_start_krcupreemptd);
Index: linux-2.6.25.4-rt4/kernel/rcupreempt.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rcupreempt.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rcupreempt.c	2008-05-29 09:46:48.000000000 -0400
@@ -79,7 +79,7 @@
  */
 #define GP_STAGES    2
 struct rcu_data {
-	spinlock_t	lock;		/* Protect rcu_data fields. */
+	raw_spinlock_t	lock;		/* Protect rcu_data fields. */
 	long		completed;	/* Number of last completed batch. */
 	int		waitlistcount;
 	struct tasklet_struct rcu_tasklet;
@@ -132,7 +132,7 @@ enum rcu_try_flip_states {
 };
 
 struct rcu_ctrlblk {
-	spinlock_t	fliplock;	/* Protect state-machine transitions. */
+	raw_spinlock_t	fliplock;	/* Protect state-machine transitions. */
 	long		completed;	/* Number of last completed batch. */
 	enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
 							the rcu state machine */
@@ -140,7 +140,7 @@ struct rcu_ctrlblk {
 
 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
 static struct rcu_ctrlblk rcu_ctrlblk = {
-	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
+	.fliplock = RAW_SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
 	.completed = 0,
 	.rcu_try_flip_state = rcu_try_flip_idle_state,
 };
@@ -351,6 +351,8 @@ void __rcu_read_unlock(void)
 
 		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
 		local_irq_restore(flags);
+
+		__rcu_preempt_unboost();
 	}
 }
 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
@@ -948,7 +950,7 @@ void __devinit rcu_online_cpu(int cpu)
 
 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 
-static void rcu_process_callbacks(struct softirq_action *unused)
+void rcu_process_callbacks(struct softirq_action *unused)
 {
 	unsigned long flags;
 	struct rcu_head *next, *list;
@@ -1126,6 +1128,8 @@ void __init __rcu_init(void)
 		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,	(void *)(long) cpu);
 
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+
+	rcu_preempt_boost_init();
 }
 
 /*
Index: linux-2.6.25.4-rt4/kernel/rtmutex.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rtmutex.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rtmutex.c	2008-05-29 09:47:31.000000000 -0400
@@ -8,12 +8,19 @@
  *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
  *  Copyright (C) 2006 Esben Nielsen
  *
+ * Adaptive Spinlocks:
+ *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ *                                   and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
+ *
  *  See Documentation/rt-mutex-design.txt for details.
  */
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/timer.h>
+#include <linux/hardirq.h>
 
 #include "rtmutex_common.h"
 
@@ -80,6 +87,7 @@ static void fixup_rt_mutex_waiters(struc
  */
 #if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
 # define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
+# define rt_rwlock_cmpxchg(rwm,c,n)	(cmpxchg(&(rwm)->owner, c, n) == c)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
 	unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -88,14 +96,56 @@ static inline void mark_rt_mutex_waiters
 		owner = *p;
 	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
 }
+#ifdef CONFIG_PREEMPT_RT
+static inline void mark_rt_rwlock_check(struct rw_mutex *rwm)
+{
+	unsigned long owner, *p = (unsigned long *) &rwm->owner;
+
+	do {
+		owner = *p;
+	} while (cmpxchg(p, owner, owner | RT_RWLOCK_CHECK) != owner);
+}
+#endif /* CONFIG_PREEMPT_RT */
 #else
 # define rt_mutex_cmpxchg(l,c,n)	(0)
+# define rt_rwlock_cmpxchg(l,c,n)	({ (void)c; (void)n; 0; })
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
 	lock->owner = (struct task_struct *)
 			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
 }
+#ifdef CONFIG_PREEMPT_RT
+static inline void mark_rt_rwlock_check(struct rw_mutex *rwm)
+{
+	rwm->owner = (struct task_struct *)
+			((unsigned long)rwm->owner | RT_RWLOCK_CHECK);
+}
+#endif /* CONFIG_PREEMPT_RT */
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+#define task_is_reader(task) ((task) == RT_RW_READER)
+#else
+#define task_is_reader(task) (0)
+#endif
+
+int pi_initialized;
+
+/*
+ * we initialize the wait_list runtime. (Could be done build-time and/or
+ * boot-time.)
+ */
+static inline void init_lists(struct rt_mutex *lock)
+{
+	if (unlikely(!lock->wait_list.prio_list.prev)) {
+		plist_head_init(&lock->wait_list, &lock->wait_lock);
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+		pi_initialized++;
 #endif
+	}
+}
+
+static int rt_mutex_get_readers_prio(struct task_struct *task, int prio);
 
 /*
  * Calculate task priority from the waiter list priority
@@ -105,11 +155,14 @@ static inline void mark_rt_mutex_waiters
  */
 int rt_mutex_getprio(struct task_struct *task)
 {
+	int prio = min(task->normal_prio, get_rcu_prio(task));
+
+	prio = rt_mutex_get_readers_prio(task, prio);
+
 	if (likely(!task_has_pi_waiters(task)))
-		return task->normal_prio;
+		return prio;
 
-	return min(task_top_pi_waiter(task)->pi_list_entry.prio,
-		   task->normal_prio);
+	return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio);
 }
 
 /*
@@ -148,6 +201,11 @@ static void rt_mutex_adjust_prio(struct 
  */
 int max_lock_depth = 1024;
 
+static int rt_mutex_adjust_readers(struct rt_mutex *orig_lock,
+				   struct rt_mutex_waiter *orig_waiter,
+				   struct task_struct *top_task,
+				   struct rt_mutex *lock,
+				   int recursion_depth);
 /*
  * Adjust the priority chain. Also used for deadlock detection.
  * Decreases task's usage by one - may thus free the task.
@@ -157,7 +215,8 @@ static int rt_mutex_adjust_prio_chain(st
 				      int deadlock_detect,
 				      struct rt_mutex *orig_lock,
 				      struct rt_mutex_waiter *orig_waiter,
-				      struct task_struct *top_task)
+				      struct task_struct *top_task,
+				      int recursion_depth)
 {
 	struct rt_mutex *lock;
 	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -191,7 +250,7 @@ static int rt_mutex_adjust_prio_chain(st
 
 		return deadlock_detect ? -EDEADLK : 0;
 	}
- retry:
+  retry:
 	/*
 	 * Task can not go away as we did a get_task() before !
 	 */
@@ -253,13 +312,25 @@ static int rt_mutex_adjust_prio_chain(st
 	plist_add(&waiter->list_entry, &lock->wait_list);
 
 	/* Release the task */
-	spin_unlock_irqrestore(&task->pi_lock, flags);
+	spin_unlock(&task->pi_lock);
 	put_task_struct(task);
 
 	/* Grab the next task */
 	task = rt_mutex_owner(lock);
+
+	/*
+	 * Readers are special. We may need to boost more than one owner.
+	 */
+	if (task_is_reader(task)) {
+		ret = rt_mutex_adjust_readers(orig_lock, orig_waiter,
+					      top_task, lock,
+					      recursion_depth);
+		spin_unlock_irqrestore(&lock->wait_lock, flags);
+		goto out;
+	}
+
 	get_task_struct(task);
-	spin_lock_irqsave(&task->pi_lock, flags);
+	spin_lock(&task->pi_lock);
 
 	if (waiter == rt_mutex_top_waiter(lock)) {
 		/* Boost the owner */
@@ -277,10 +348,10 @@ static int rt_mutex_adjust_prio_chain(st
 		__rt_mutex_adjust_prio(task);
 	}
 
-	spin_unlock_irqrestore(&task->pi_lock, flags);
+	spin_unlock(&task->pi_lock);
 
 	top_waiter = rt_mutex_top_waiter(lock);
-	spin_unlock(&lock->wait_lock);
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	if (!detect_deadlock && waiter != top_waiter)
 		goto out_put_task;
@@ -291,7 +362,7 @@ static int rt_mutex_adjust_prio_chain(st
 	spin_unlock_irqrestore(&task->pi_lock, flags);
  out_put_task:
 	put_task_struct(task);
-
+ out:
 	return ret;
 }
 
@@ -300,11 +371,10 @@ static int rt_mutex_adjust_prio_chain(st
  * assigned pending owner [which might not have taken the
  * lock yet]:
  */
-static inline int try_to_steal_lock(struct rt_mutex *lock)
+static inline int try_to_steal_lock(struct rt_mutex *lock, int mode)
 {
 	struct task_struct *pendowner = rt_mutex_owner(lock);
 	struct rt_mutex_waiter *next;
-	unsigned long flags;
 
 	if (!rt_mutex_owner_pending(lock))
 		return 0;
@@ -312,9 +382,11 @@ static inline int try_to_steal_lock(stru
 	if (pendowner == current)
 		return 1;
 
-	spin_lock_irqsave(&pendowner->pi_lock, flags);
-	if (current->prio >= pendowner->prio) {
-		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+	WARN_ON(task_is_reader(rt_mutex_owner(lock)));
+
+	spin_lock(&pendowner->pi_lock);
+	if (!lock_is_stealable(pendowner, mode)) {
+		spin_unlock(&pendowner->pi_lock);
 		return 0;
 	}
 
@@ -324,7 +396,7 @@ static inline int try_to_steal_lock(stru
 	 * priority.
 	 */
 	if (likely(!rt_mutex_has_waiters(lock))) {
-		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+		spin_unlock(&pendowner->pi_lock);
 		return 1;
 	}
 
@@ -332,7 +404,7 @@ static inline int try_to_steal_lock(stru
 	next = rt_mutex_top_waiter(lock);
 	plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
 	__rt_mutex_adjust_prio(pendowner);
-	spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+	spin_unlock(&pendowner->pi_lock);
 
 	/*
 	 * We are going to steal the lock and a waiter was
@@ -349,10 +421,10 @@ static inline int try_to_steal_lock(stru
 	 * might be current:
 	 */
 	if (likely(next->task != current)) {
-		spin_lock_irqsave(&current->pi_lock, flags);
+		spin_lock(&current->pi_lock);
 		plist_add(&next->pi_list_entry, &current->pi_waiters);
 		__rt_mutex_adjust_prio(current);
-		spin_unlock_irqrestore(&current->pi_lock, flags);
+		spin_unlock(&current->pi_lock);
 	}
 	return 1;
 }
@@ -366,7 +438,7 @@ static inline int try_to_steal_lock(stru
  *
  * Must be called with lock->wait_lock held.
  */
-static int try_to_take_rt_mutex(struct rt_mutex *lock)
+static int do_try_to_take_rt_mutex(struct rt_mutex *lock, int mode)
 {
 	/*
 	 * We have to be careful here if the atomic speedups are
@@ -389,7 +461,7 @@ static int try_to_take_rt_mutex(struct r
 	 */
 	mark_rt_mutex_waiters(lock);
 
-	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, mode))
 		return 0;
 
 	/* We got the lock. */
@@ -402,6 +474,11 @@ static int try_to_take_rt_mutex(struct r
 	return 1;
 }
 
+static inline int try_to_take_rt_mutex(struct rt_mutex *lock)
+{
+	return do_try_to_take_rt_mutex(lock, STEAL_NORMAL);
+}
+
 /*
  * Task blocks on lock.
  *
@@ -411,14 +488,13 @@ static int try_to_take_rt_mutex(struct r
  */
 static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 				   struct rt_mutex_waiter *waiter,
-				   int detect_deadlock)
+				   int detect_deadlock, unsigned long flags)
 {
 	struct task_struct *owner = rt_mutex_owner(lock);
 	struct rt_mutex_waiter *top_waiter = waiter;
-	unsigned long flags;
 	int chain_walk = 0, res;
 
-	spin_lock_irqsave(&current->pi_lock, flags);
+	spin_lock(&current->pi_lock);
 	__rt_mutex_adjust_prio(current);
 	waiter->task = current;
 	waiter->lock = lock;
@@ -432,22 +508,29 @@ static int task_blocks_on_rt_mutex(struc
 
 	current->pi_blocked_on = waiter;
 
-	spin_unlock_irqrestore(&current->pi_lock, flags);
+	spin_unlock(&current->pi_lock);
 
 	if (waiter == rt_mutex_top_waiter(lock)) {
-		spin_lock_irqsave(&owner->pi_lock, flags);
+		/* readers are handled differently */
+		if (task_is_reader(owner)) {
+			res = rt_mutex_adjust_readers(lock, waiter,
+						      current, lock, 0);
+			return res;
+		}
+
+		spin_lock(&owner->pi_lock);
 		plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
 		plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
 
 		__rt_mutex_adjust_prio(owner);
 		if (owner->pi_blocked_on)
 			chain_walk = 1;
-		spin_unlock_irqrestore(&owner->pi_lock, flags);
+		spin_unlock(&owner->pi_lock);
 	}
 	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
 		chain_walk = 1;
 
-	if (!chain_walk)
+	if (!chain_walk || task_is_reader(owner))
 		return 0;
 
 	/*
@@ -457,12 +540,12 @@ static int task_blocks_on_rt_mutex(struc
 	 */
 	get_task_struct(owner);
 
-	spin_unlock(&lock->wait_lock);
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
-					 current);
+					 current, 0);
 
-	spin_lock(&lock->wait_lock);
+	spin_lock_irq(&lock->wait_lock);
 
 	return res;
 }
@@ -475,13 +558,13 @@ static int task_blocks_on_rt_mutex(struc
  *
  * Called with lock->wait_lock held.
  */
-static void wakeup_next_waiter(struct rt_mutex *lock)
+static void wakeup_next_waiter(struct rt_mutex *lock, int savestate)
 {
 	struct rt_mutex_waiter *waiter;
 	struct task_struct *pendowner;
-	unsigned long flags;
+	struct rt_mutex_waiter *next;
 
-	spin_lock_irqsave(&current->pi_lock, flags);
+	spin_lock(&current->pi_lock);
 
 	waiter = rt_mutex_top_waiter(lock);
 	plist_del(&waiter->list_entry, &lock->wait_list);
@@ -496,9 +579,44 @@ static void wakeup_next_waiter(struct rt
 	pendowner = waiter->task;
 	waiter->task = NULL;
 
+	/*
+	 * Do the wakeup before the ownership change to give any spinning
+	 * waiter grantees a headstart over the other threads that will
+	 * trigger once owner changes.
+	 */
+	if (!savestate)
+		wake_up_process(pendowner);
+	else {
+		/*
+		 * We can skip the actual (expensive) wakeup if the
+		 * waiter is already running, but we have to be careful
+		 * of race conditions because they may be about to sleep.
+		 *
+		 * The waiter-side protocol has the following pattern:
+		 * 1: Set state != RUNNING
+		 * 2: Conditionally sleep if waiter->task != NULL;
+		 *
+		 * And the owner-side has the following:
+		 * A: Set waiter->task = NULL
+		 * B: Conditionally wake if the state != RUNNING
+		 *
+		 * As long as we ensure 1->2 order, and A->B order, we
+		 * will never miss a wakeup.
+		 *
+		 * Therefore, this barrier ensures that waiter->task = NULL
+		 * is visible before we test the pendowner->state.  The
+		 * corresponding barrier is in the sleep logic.
+		 */
+		smp_mb();
+
+		/* If !RUNNING && !RUNNING_MUTEX */
+		if (pendowner->state & ~TASK_RUNNING_MUTEX)
+			wake_up_process_mutex(pendowner);
+	}
+
 	rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
 
-	spin_unlock_irqrestore(&current->pi_lock, flags);
+	spin_unlock(&current->pi_lock);
 
 	/*
 	 * Clear the pi_blocked_on variable and enqueue a possible
@@ -507,7 +625,13 @@ static void wakeup_next_waiter(struct rt
 	 * waiter with higher priority than pending-owner->normal_prio
 	 * is blocked on the unboosted (pending) owner.
 	 */
-	spin_lock_irqsave(&pendowner->pi_lock, flags);
+
+	if (rt_mutex_has_waiters(lock))
+		next = rt_mutex_top_waiter(lock);
+	else
+		next = NULL;
+
+	spin_lock(&pendowner->pi_lock);
 
 	WARN_ON(!pendowner->pi_blocked_on);
 	WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -515,15 +639,10 @@ static void wakeup_next_waiter(struct rt
 
 	pendowner->pi_blocked_on = NULL;
 
-	if (rt_mutex_has_waiters(lock)) {
-		struct rt_mutex_waiter *next;
-
-		next = rt_mutex_top_waiter(lock);
+	if (next)
 		plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
-	}
-	spin_unlock_irqrestore(&pendowner->pi_lock, flags);
 
-	wake_up_process(pendowner);
+	spin_unlock(&pendowner->pi_lock);
 }
 
 /*
@@ -532,22 +651,22 @@ static void wakeup_next_waiter(struct rt
  * Must be called with lock->wait_lock held
  */
 static void remove_waiter(struct rt_mutex *lock,
-			  struct rt_mutex_waiter *waiter)
+			  struct rt_mutex_waiter *waiter,
+			  unsigned long flags)
 {
 	int first = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
-	unsigned long flags;
 	int chain_walk = 0;
 
-	spin_lock_irqsave(&current->pi_lock, flags);
+	spin_lock(&current->pi_lock);
 	plist_del(&waiter->list_entry, &lock->wait_list);
 	waiter->task = NULL;
 	current->pi_blocked_on = NULL;
-	spin_unlock_irqrestore(&current->pi_lock, flags);
+	spin_unlock(&current->pi_lock);
 
-	if (first && owner != current) {
+	if (first && owner != current && !task_is_reader(owner)) {
 
-		spin_lock_irqsave(&owner->pi_lock, flags);
+		spin_lock(&owner->pi_lock);
 
 		plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
 
@@ -562,7 +681,7 @@ static void remove_waiter(struct rt_mute
 		if (owner->pi_blocked_on)
 			chain_walk = 1;
 
-		spin_unlock_irqrestore(&owner->pi_lock, flags);
+		spin_unlock(&owner->pi_lock);
 	}
 
 	WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -573,11 +692,11 @@ static void remove_waiter(struct rt_mute
 	/* gets dropped in rt_mutex_adjust_prio_chain()! */
 	get_task_struct(owner);
 
-	spin_unlock(&lock->wait_lock);
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
 
-	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
+	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current, 0);
 
-	spin_lock(&lock->wait_lock);
+	spin_lock_irq(&lock->wait_lock);
 }
 
 /*
@@ -598,164 +717,231 @@ void rt_mutex_adjust_pi(struct task_stru
 		return;
 	}
 
-	spin_unlock_irqrestore(&task->pi_lock, flags);
-
 	/* gets dropped in rt_mutex_adjust_prio_chain()! */
 	get_task_struct(task);
-	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+
+	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task, 0);
 }
 
 /*
- * Slow path lock function:
+ * preemptible spin_lock functions:
  */
-static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
-		  struct hrtimer_sleeper *timeout,
-		  int detect_deadlock)
+
+#ifdef CONFIG_PREEMPT_RT
+
+static inline void
+rt_spin_lock_fastlock(struct rt_mutex *lock,
+		void  (*slowfn)(struct rt_mutex *lock))
 {
-	struct rt_mutex_waiter waiter;
-	int ret = 0;
+	/* Temporary HACK! */
+	if (likely(!current->in_printk))
+		might_sleep();
+	else if (in_atomic() || irqs_disabled())
+		/* don't grab locks for printk in atomic */
+		return;
 
-	debug_rt_mutex_init_waiter(&waiter);
-	waiter.task = NULL;
+	if (likely(rt_mutex_cmpxchg(lock, NULL, current)))
+		rt_mutex_deadlock_account_lock(lock, current);
+	else
+		slowfn(lock);
+}
 
-	spin_lock(&lock->wait_lock);
+static inline void
+rt_spin_lock_fastunlock(struct rt_mutex *lock,
+			void  (*slowfn)(struct rt_mutex *lock))
+{
+	/* Temporary HACK! */
+	if (unlikely(rt_mutex_owner(lock) != current) && current->in_printk)
+		/* don't grab locks for printk in atomic */
+		return;
 
-	/* Try to acquire the lock again: */
-	if (try_to_take_rt_mutex(lock)) {
-		spin_unlock(&lock->wait_lock);
-		return 0;
-	}
+	if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+		rt_mutex_deadlock_account_unlock(current);
+	else
+		slowfn(lock);
+}
 
-	set_current_state(state);
+static inline void
+update_current(unsigned long new_state, unsigned long *saved_state)
+{
+	unsigned long state = xchg(&current->state, new_state);
+	if (unlikely(state == TASK_RUNNING))
+		*saved_state = TASK_RUNNING;
+}
 
-	/* Setup the timer, when timeout != NULL */
-	if (unlikely(timeout)) {
-		hrtimer_start(&timeout->timer, timeout->timer.expires,
-			      HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
+#ifdef CONFIG_SMP
+static int adaptive_wait(struct rt_mutex_waiter *waiter,
+			 struct task_struct *orig_owner)
+{
+	int sleep = 0;
 
 	for (;;) {
-		/* Try to acquire the lock: */
-		if (try_to_take_rt_mutex(lock))
+
+		/* we are the owner? */
+		if (!waiter->task)
 			break;
 
 		/*
-		 * TASK_INTERRUPTIBLE checks for signals and
-		 * timeout. Ignored otherwise.
+		 * We need to read the owner of the lock and then check
+		 * its state. But we can't let the owner task be freed
+		 * while we read the state. We grab the rcu_lock and
+		 * this makes sure that the owner task wont disappear
+		 * between testing that it still has the lock, and checking
+		 * its state.
 		 */
-		if (unlikely(state == TASK_INTERRUPTIBLE)) {
-			/* Signal pending? */
-			if (signal_pending(current))
-				ret = -EINTR;
-			if (timeout && !timeout->task)
-				ret = -ETIMEDOUT;
-			if (ret)
-				break;
+		rcu_read_lock();
+		/* Owner changed? Then lets update the original */
+		if (orig_owner != rt_mutex_owner(waiter->lock)) {
+			rcu_read_unlock();
+			break;
+		}
+
+		/* Owner went to bed, so should we */
+		if (!task_is_current(orig_owner)) {
+			sleep = 1;
+			rcu_read_unlock();
+			break;
+		}
+		rcu_read_unlock();
+
+		cpu_relax();
+	}
+
+	return sleep;
+}
+#else
+static int adaptive_wait(struct rt_mutex_waiter *waiter,
+			 struct task_struct *orig_owner)
+{
+	return 1;
+}
+#endif
+
+/*
+ * Slow path lock function spin_lock style: this variant is very
+ * careful not to miss any non-lock wakeups.
+ *
+ * The wakeup side uses wake_up_process_mutex, which, combined with
+ * the xchg code of this function is a transparent sleep/wakeup
+ * mechanism nested within any existing sleep/wakeup mechanism. This
+ * enables the seemless use of arbitrary (blocking) spinlocks within
+ * sleep/wakeup event loops.
+ */
+static void  noinline __sched
+rt_spin_lock_slowlock(struct rt_mutex *lock)
+{
+	struct rt_mutex_waiter waiter;
+	unsigned long saved_state, state, flags;
+	struct task_struct *orig_owner;
+	int missed = 0;
+
+	debug_rt_mutex_init_waiter(&waiter);
+	waiter.task = NULL;
+	waiter.write_lock = 0;
+
+	spin_lock_irqsave(&lock->wait_lock, flags);
+	init_lists(lock);
+
+	BUG_ON(rt_mutex_owner(lock) == current);
+
+	/*
+	 * Here we save whatever state the task was in originally,
+	 * we'll restore it at the end of the function and we'll take
+	 * any intermediate wakeup into account as well, independently
+	 * of the lock sleep/wakeup mechanism. When we get a real
+	 * wakeup the task->state is TASK_RUNNING and we change
+	 * saved_state accordingly. If we did not get a real wakeup
+	 * then we return with the saved state.
+	 */
+	saved_state = current->state;
+
+	for (;;) {
+		unsigned long saved_flags;
+		int saved_lock_depth = current->lock_depth;
+
+		/* Try to acquire the lock */
+		if (do_try_to_take_rt_mutex(lock, STEAL_LATERAL)) {
+			/* If we never blocked break out now */
+			if (!missed)
+				goto unlock;
+			break;
 		}
+		missed = 1;
 
 		/*
 		 * waiter.task is NULL the first time we come here and
 		 * when we have been woken up by the previous owner
-		 * but the lock got stolen by a higher prio task.
+		 * but the lock got stolen by an higher prio task.
 		 */
 		if (!waiter.task) {
-			ret = task_blocks_on_rt_mutex(lock, &waiter,
-						      detect_deadlock);
-			/*
-			 * If we got woken up by the owner then start loop
-			 * all over without going into schedule to try
-			 * to get the lock now:
-			 */
-			if (unlikely(!waiter.task)) {
-				/*
-				 * Reset the return value. We might
-				 * have returned with -EDEADLK and the
-				 * owner released the lock while we
-				 * were walking the pi chain.
-				 */
-				ret = 0;
+			task_blocks_on_rt_mutex(lock, &waiter, 0, flags);
+			/* Wakeup during boost ? */
+			if (unlikely(!waiter.task))
 				continue;
-			}
-			if (unlikely(ret))
-				break;
 		}
 
-		spin_unlock(&lock->wait_lock);
+		/*
+		 * Prevent schedule() to drop BKL, while waiting for
+		 * the lock ! We restore lock_depth when we come back.
+		 */
+		saved_flags = current->flags & PF_NOSCHED;
+		current->lock_depth = -1;
+		current->flags &= ~PF_NOSCHED;
+		orig_owner = rt_mutex_owner(lock);
+		spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 		debug_rt_mutex_print_deadlock(&waiter);
 
-		if (waiter.task)
-			schedule_rt_mutex(lock);
+		if (adaptive_wait(&waiter, orig_owner)) {
+			update_current(TASK_UNINTERRUPTIBLE, &saved_state);
+			/*
+			 * The xchg() in update_current() is an implicit
+			 * barrier which we rely upon to ensure current->state
+			 * is visible before we test waiter.task.
+			 */
+			if (waiter.task)
+				schedule_rt_mutex(lock);
+		}
 
-		spin_lock(&lock->wait_lock);
-		set_current_state(state);
+		spin_lock_irqsave(&lock->wait_lock, flags);
+		current->flags |= saved_flags;
+		current->lock_depth = saved_lock_depth;
 	}
 
-	set_current_state(TASK_RUNNING);
+	state = xchg(&current->state, saved_state);
+	if (unlikely(state == TASK_RUNNING))
+		current->state = TASK_RUNNING;
 
+	/*
+	 * Extremely rare case, if we got woken up by a non-mutex wakeup,
+	 * and we managed to steal the lock despite us not being the
+	 * highest-prio waiter (due to SCHED_OTHER changing prio), then we
+	 * can end up with a non-NULL waiter.task:
+	 */
 	if (unlikely(waiter.task))
-		remove_waiter(lock, &waiter);
-
+		remove_waiter(lock, &waiter, flags);
 	/*
 	 * try_to_take_rt_mutex() sets the waiter bit
-	 * unconditionally. We might have to fix that up.
+	 * unconditionally. We might have to fix that up:
 	 */
 	fixup_rt_mutex_waiters(lock);
 
-	spin_unlock(&lock->wait_lock);
-
-	/* Remove pending timer: */
-	if (unlikely(timeout))
-		hrtimer_cancel(&timeout->timer);
-
-	/*
-	 * Readjust priority, when we did not get the lock. We might
-	 * have been the pending owner and boosted. Since we did not
-	 * take the lock, the PI boost has to go.
-	 */
-	if (unlikely(ret))
-		rt_mutex_adjust_prio(current);
+ unlock:
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	debug_rt_mutex_free_waiter(&waiter);
-
-	return ret;
 }
 
 /*
- * Slow path try-lock function:
+ * Slow path to release a rt_mutex spin_lock style
  */
-static inline int
-rt_mutex_slowtrylock(struct rt_mutex *lock)
+static void  noinline __sched
+rt_spin_lock_slowunlock(struct rt_mutex *lock)
 {
-	int ret = 0;
-
-	spin_lock(&lock->wait_lock);
-
-	if (likely(rt_mutex_owner(lock) != current)) {
-
-		ret = try_to_take_rt_mutex(lock);
-		/*
-		 * try_to_take_rt_mutex() sets the lock waiters
-		 * bit unconditionally. Clean this up.
-		 */
-		fixup_rt_mutex_waiters(lock);
-	}
-
-	spin_unlock(&lock->wait_lock);
-
-	return ret;
-}
+	unsigned long flags;
 
-/*
- * Slow path to release a rt-mutex:
- */
-static void __sched
-rt_mutex_slowunlock(struct rt_mutex *lock)
-{
-	spin_lock(&lock->wait_lock);
+	spin_lock_irqsave(&lock->wait_lock, flags);
 
 	debug_rt_mutex_unlock(lock);
 
@@ -763,49 +949,1461 @@ rt_mutex_slowunlock(struct rt_mutex *loc
 
 	if (!rt_mutex_has_waiters(lock)) {
 		lock->owner = NULL;
-		spin_unlock(&lock->wait_lock);
+		spin_unlock_irqrestore(&lock->wait_lock, flags);
 		return;
 	}
 
-	wakeup_next_waiter(lock);
+	wakeup_next_waiter(lock, 1);
 
-	spin_unlock(&lock->wait_lock);
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
 
-	/* Undo pi boosting if necessary: */
+	/* Undo pi boosting.when necessary */
 	rt_mutex_adjust_prio(current);
 }
 
-/*
- * debug aware fast / slowpath lock,trylock,unlock
- *
- * The atomic acquire/release ops are compiled away, when either the
- * architecture does not support cmpxchg or when debugging is enabled.
- */
-static inline int
-rt_mutex_fastlock(struct rt_mutex *lock, int state,
-		  int detect_deadlock,
-		  int (*slowfn)(struct rt_mutex *lock, int state,
-				struct hrtimer_sleeper *timeout,
-				int detect_deadlock))
+void __lockfunc rt_spin_lock(spinlock_t *lock)
 {
-	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
-		rt_mutex_deadlock_account_lock(lock, current);
-		return 0;
-	} else
-		return slowfn(lock, state, NULL, detect_deadlock);
+	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+	LOCK_CONTENDED_RT(lock, rt_mutex_trylock, __rt_spin_lock);
 }
+EXPORT_SYMBOL(rt_spin_lock);
 
-static inline int
-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
-			struct hrtimer_sleeper *timeout, int detect_deadlock,
-			int (*slowfn)(struct rt_mutex *lock, int state,
-				      struct hrtimer_sleeper *timeout,
-				      int detect_deadlock))
+void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
 {
-	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
-		rt_mutex_deadlock_account_lock(lock, current);
-		return 0;
-	} else
+	rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
+}
+EXPORT_SYMBOL(__rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED_RT(lock, rt_mutex_trylock, __rt_spin_lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+#endif
+
+void __lockfunc rt_spin_unlock(spinlock_t *lock)
+{
+	/* NOTE: we always pass in '1' for nested, for simplicity */
+	spin_release(&lock->dep_map, 1, _RET_IP_);
+	rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
+{
+	rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(__rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), we lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
+{
+	spin_lock(lock);
+	spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock_wait);
+
+int __lockfunc rt_spin_trylock(spinlock_t *lock)
+{
+	int ret = rt_mutex_trylock(&lock->lock);
+
+	if (ret)
+		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+	return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+{
+	int ret;
+
+	*flags = 0;
+	ret = rt_mutex_trylock(&lock->lock);
+	if (ret)
+		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+	return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_irqsave);
+
+int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic)
+{
+	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
+	if (atomic_add_unless(atomic, -1, 1))
+		return 0;
+	rt_spin_lock(lock);
+	if (atomic_dec_and_test(atomic))
+		return 1;
+	rt_spin_unlock(lock);
+	return 0;
+}
+EXPORT_SYMBOL(_atomic_dec_and_spin_lock);
+
+void
+__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/*
+	 * Make sure we are not reinitializing a held lock:
+	 */
+	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+	lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+	__rt_mutex_init(&lock->lock, name);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+
+int rt_rwlock_limit = NR_CPUS;
+
+static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags);
+static inline void rt_reacquire_bkl(int saved_lock_depth);
+
+static inline void
+rt_rwlock_set_owner(struct rw_mutex *rwm, struct task_struct *owner,
+		    unsigned long mask)
+{
+	unsigned long val = (unsigned long)owner | mask;
+
+	rwm->owner = (struct task_struct *)val;
+}
+
+static inline void init_rw_lists(struct rw_mutex *rwm)
+{
+	if (unlikely(!rwm->readers.prev)) {
+		init_lists(&rwm->mutex);
+		INIT_LIST_HEAD(&rwm->readers);
+	}
+}
+
+/*
+ * The fast paths of the rw locks do not set up owners to
+ * the mutex. When blocking on an rwlock we must make sure
+ * there exists an owner.
+ */
+static void
+update_rw_mutex_owner(struct rw_mutex *rwm)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	struct task_struct *mtxowner;
+
+	mtxowner = rt_mutex_owner(mutex);
+	if (mtxowner)
+		return;
+
+	mtxowner = rt_rwlock_owner(rwm);
+	WARN_ON(!mtxowner);
+	if (rt_rwlock_writer(rwm))
+		WARN_ON(mtxowner == RT_RW_READER);
+	else
+		mtxowner = RT_RW_READER;
+	rt_mutex_set_owner(mutex, mtxowner, 0);
+}
+
+/*
+ * The fast path does not add itself to the reader list to keep
+ * from needing to grab the spinlock. We need to add the owner
+ * itself. This may seem racy, but in practice, it is fine.
+ * The link list is protected by mutex->wait_lock. But to find
+ * the lock on the owner we need to read the owners reader counter.
+ * That counter is modified only by the owner. We are OK with that
+ * because to remove the lock that we are looking for, the owner
+ * must first grab the mutex->wait_lock. The lock will not disappear
+ * from the owner now, and we don't care if we see other locks
+ * held or not held.
+ */
+
+static inline void
+rt_rwlock_update_owner(struct rw_mutex *rwm, unsigned owners)
+{
+	struct reader_lock_struct *rls;
+	struct task_struct *own;
+	int i;
+
+	if (!owners || rt_rwlock_pending(rwm))
+		return;
+
+	own = rt_rwlock_owner(rwm);
+	if (own == RT_RW_READER)
+		return;
+
+	for (i = own->reader_lock_count - 1; i >= 0; i--) {
+		if (own->owned_read_locks[i].lock == rwm)
+			break;
+	}
+	/* It is possible the owner didn't add it yet */
+	if (i < 0)
+		return;
+
+	rls = &own->owned_read_locks[i];
+	/* It is also possible that the owner added it already */
+	if (rls->list.prev && !list_empty(&rls->list))
+		return;
+
+	list_add(&rls->list, &rwm->readers);
+
+	/* change to reader, so no one else updates too */
+	rt_rwlock_set_owner(rwm, RT_RW_READER, RT_RWLOCK_CHECK);
+}
+
+static int try_to_take_rw_read(struct rw_mutex *rwm, int mtx)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	struct rt_mutex_waiter *waiter;
+	struct reader_lock_struct *rls;
+	struct task_struct *mtxowner;
+	int owners;
+	int reader_count, i;
+	int incr = 1;
+
+	assert_spin_locked(&mutex->wait_lock);
+
+	/* mark the lock to force the owner to check on release */
+	mark_rt_rwlock_check(rwm);
+
+	/* is the owner a writer? */
+	if (unlikely(rt_rwlock_writer(rwm)))
+		return 0;
+
+	/* check to see if we don't already own this lock */
+	for (i = current->reader_lock_count - 1; i >= 0; i--) {
+		if (current->owned_read_locks[i].lock == rwm) {
+			rls = &current->owned_read_locks[i];
+			/*
+			 * If this was taken via the fast path, then
+			 * it hasn't been added to the link list yet.
+			 */
+			if (!rls->list.prev || list_empty(&rls->list))
+				list_add(&rls->list, &rwm->readers);
+			rt_rwlock_set_owner(rwm, RT_RW_READER, 0);
+			rls->count++;
+			incr = 0;
+			goto taken;
+		}
+	}
+
+	/* A writer is not the owner, but is a writer waiting */
+	mtxowner = rt_mutex_owner(mutex);
+
+	/* if the owner released it before we marked it then take it */
+	if (!mtxowner && !rt_rwlock_owner(rwm)) {
+		/* Still unlock with the slow path (for PI handling) */
+		rt_rwlock_set_owner(rwm, RT_RW_READER, 0);
+		goto taken;
+	}
+
+	owners = atomic_read(&rwm->owners);
+	rt_rwlock_update_owner(rwm, owners);
+
+	/* Check for rwlock limits */
+	if (rt_rwlock_limit && owners >= rt_rwlock_limit)
+		return 0;
+
+	if (mtxowner && mtxowner != RT_RW_READER) {
+		int mode = mtx ? STEAL_NORMAL : STEAL_LATERAL;
+
+		if (!try_to_steal_lock(mutex, mode)) {
+			/*
+			 * readers don't own the mutex, and rwm shows that a
+			 * writer doesn't have it either. If we enter this
+			 * condition, then we must be pending.
+			 */
+			WARN_ON(!rt_mutex_owner_pending(mutex));
+			/*
+			 * Even though we didn't steal the lock, if the owner
+			 * is a reader, and we are of higher priority than
+			 * any waiting writer, we might still be able to continue.
+			 */
+			if (rt_rwlock_pending_writer(rwm))
+				return 0;
+			if (rt_mutex_has_waiters(mutex)) {
+				waiter = rt_mutex_top_waiter(mutex);
+				if (!lock_is_stealable(waiter->task, mode))
+					return 0;
+				/*
+				 * The pending reader has PI waiters,
+				 * but we are taking the lock.
+				 * Remove the waiters from the pending owner.
+				 */
+				spin_lock(&mtxowner->pi_lock);
+				plist_del(&waiter->pi_list_entry, &mtxowner->pi_waiters);
+				spin_unlock(&mtxowner->pi_lock);
+			}
+		} else if (rt_mutex_has_waiters(mutex)) {
+			/* Readers do things differently with respect to PI */
+			waiter = rt_mutex_top_waiter(mutex);
+			spin_lock(&current->pi_lock);
+			plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+			spin_unlock(&current->pi_lock);
+		}
+		/* Readers never own the mutex */
+		rt_mutex_set_owner(mutex, RT_RW_READER, 0);
+	}
+
+	/* RT_RW_READER forces slow paths */
+	rt_rwlock_set_owner(rwm, RT_RW_READER, 0);
+ taken:
+	if (incr) {
+		atomic_inc(&rwm->owners);
+		reader_count = current->reader_lock_count++;
+		if (likely(reader_count < MAX_RWLOCK_DEPTH)) {
+			rls = &current->owned_read_locks[reader_count];
+			rls->lock = rwm;
+			rls->count = 1;
+			WARN_ON(rls->list.prev && !list_empty(&rls->list));
+			list_add(&rls->list, &rwm->readers);
+		} else
+			WARN_ON_ONCE(1);
+	}
+	rt_mutex_deadlock_account_lock(mutex, current);
+	atomic_inc(&rwm->count);
+	return 1;
+}
+
+static int
+try_to_take_rw_write(struct rw_mutex *rwm, int mtx)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	struct task_struct *own;
+
+	/* mark the lock to force the owner to check on release */
+	mark_rt_rwlock_check(rwm);
+
+	own = rt_rwlock_owner(rwm);
+
+	/* owners must be zero for writer */
+	rt_rwlock_update_owner(rwm, atomic_read(&rwm->owners));
+
+	/* readers or writers? */
+	if ((own && !rt_rwlock_pending(rwm)))
+		return 0;
+
+	/*
+	 * RT_RW_PENDING means that the lock is free, but there are
+	 * pending owners on the mutex
+	 */
+	WARN_ON(own && !rt_mutex_owner_pending(mutex));
+
+	if (!do_try_to_take_rt_mutex(mutex, mtx ? STEAL_NORMAL : STEAL_LATERAL))
+		return 0;
+
+	/*
+	 * We stole the lock. Add both WRITER and CHECK flags
+	 * since we must release the mutex.
+	 */
+	rt_rwlock_set_owner(rwm, current, RT_RWLOCK_WRITER | RT_RWLOCK_CHECK);
+
+	return 1;
+}
+
+static void
+rt_read_slowlock(struct rw_mutex *rwm, int mtx)
+{
+	struct rt_mutex_waiter waiter;
+	struct rt_mutex *mutex = &rwm->mutex;
+	int saved_lock_depth = -1;
+	unsigned long saved_state = -1, state, flags;
+
+	spin_lock_irqsave(&mutex->wait_lock, flags);
+	init_rw_lists(rwm);
+
+	if (try_to_take_rw_read(rwm, mtx)) {
+		spin_unlock_irqrestore(&mutex->wait_lock, flags);
+		return;
+	}
+	update_rw_mutex_owner(rwm);
+
+	/* Owner is a writer (or a blocked writer). Block on the lock */
+
+	debug_rt_mutex_init_waiter(&waiter);
+	waiter.task = NULL;
+	waiter.write_lock = 0;
+
+	if (mtx) {
+		/*
+		 * We drop the BKL here before we go into the wait loop to avoid a
+		 * possible deadlock in the scheduler.
+		 */
+		if (unlikely(current->lock_depth >= 0))
+			saved_lock_depth = rt_release_bkl(mutex, flags);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+	} else {
+		/* Spin lock must preserve BKL */
+		saved_state = xchg(&current->state, TASK_UNINTERRUPTIBLE);
+		saved_lock_depth = current->lock_depth;
+	}
+
+	for (;;) {
+		unsigned long saved_flags;
+
+		/* Try to acquire the lock: */
+		if (try_to_take_rw_read(rwm, mtx))
+			break;
+		update_rw_mutex_owner(rwm);
+
+		/*
+		 * waiter.task is NULL the first time we come here and
+		 * when we have been woken up by the previous owner
+		 * but the lock got stolen by a higher prio task.
+		 */
+		if (!waiter.task) {
+			task_blocks_on_rt_mutex(mutex, &waiter, 0, flags);
+			/* Wakeup during boost ? */
+			if (unlikely(!waiter.task))
+				continue;
+		}
+		saved_flags = current->flags & PF_NOSCHED;
+		current->flags &= ~PF_NOSCHED;
+		if (!mtx)
+			current->lock_depth = -1;
+
+		spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+		debug_rt_mutex_print_deadlock(&waiter);
+
+		if (!mtx || waiter.task)
+			schedule_rt_mutex(mutex);
+
+		spin_lock_irqsave(&mutex->wait_lock, flags);
+
+		current->flags |= saved_flags;
+		if (mtx)
+			set_current_state(TASK_UNINTERRUPTIBLE);
+		else {
+			current->lock_depth = saved_lock_depth;
+			state = xchg(&current->state, TASK_UNINTERRUPTIBLE);
+			if (unlikely(state == TASK_RUNNING))
+				saved_state = TASK_RUNNING;
+		}
+	}
+
+	if (mtx)
+		set_current_state(TASK_RUNNING);
+	else {
+		state = xchg(&current->state, saved_state);
+		if (unlikely(state == TASK_RUNNING))
+			current->state = TASK_RUNNING;
+	}
+
+	if (unlikely(waiter.task))
+		remove_waiter(mutex, &waiter, flags);
+
+	WARN_ON(rt_mutex_owner(mutex) &&
+		rt_mutex_owner(mutex) != current &&
+		rt_mutex_owner(mutex) != RT_RW_READER &&
+		!rt_mutex_owner_pending(mutex));
+
+	spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+	/* Must we reaquire the BKL? */
+	if (mtx && unlikely(saved_lock_depth >= 0))
+		rt_reacquire_bkl(saved_lock_depth);
+
+	debug_rt_mutex_free_waiter(&waiter);
+}
+
+static inline int
+__rt_read_fasttrylock(struct rw_mutex *rwm)
+{
+ retry:
+	if (likely(rt_rwlock_cmpxchg(rwm, NULL, current))) {
+		int reader_count;
+
+		rt_mutex_deadlock_account_lock(&rwm->mutex, current);
+		atomic_inc(&rwm->count);
+		smp_mb();
+		/*
+		 * It is possible that the owner was zeroed
+		 * before we incremented count. If owner is not
+		 * current, then retry again
+		 */
+		if (unlikely(rwm->owner != current)) {
+			atomic_dec(&rwm->count);
+			goto retry;
+		}
+
+		atomic_inc(&rwm->owners);
+		reader_count = current->reader_lock_count;
+		if (likely(reader_count < MAX_RWLOCK_DEPTH)) {
+			current->owned_read_locks[reader_count].lock = rwm;
+			current->owned_read_locks[reader_count].count = 1;
+		} else
+			WARN_ON_ONCE(1);
+		/*
+		 * If this task is no longer the sole owner of the lock
+		 * or someone is blocking, then we need to add the task
+		 * to the lock.
+		 */
+		smp_mb();
+		current->reader_lock_count++;
+		if (unlikely(rwm->owner != current)) {
+			struct rt_mutex *mutex = &rwm->mutex;
+			struct reader_lock_struct *rls;
+			unsigned long flags;
+
+			spin_lock_irqsave(&mutex->wait_lock, flags);
+			rls = &current->owned_read_locks[reader_count];
+			if (!rls->list.prev || list_empty(&rls->list))
+				list_add(&rls->list, &rwm->readers);
+			spin_unlock_irqrestore(&mutex->wait_lock, flags);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static inline void
+rt_read_fastlock(struct rw_mutex *rwm,
+		 void (*slowfn)(struct rw_mutex *rwm, int mtx),
+		 int mtx)
+{
+	if (unlikely(!__rt_read_fasttrylock(rwm)))
+		slowfn(rwm, mtx);
+}
+
+void rt_mutex_down_read(struct rw_mutex *rwm)
+{
+	rt_read_fastlock(rwm, rt_read_slowlock, 1);
+}
+
+void rt_rwlock_read_lock(struct rw_mutex *rwm)
+{
+	rt_read_fastlock(rwm, rt_read_slowlock, 0);
+}
+
+static inline int
+rt_read_slowtrylock(struct rw_mutex *rwm, int mtx)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&mutex->wait_lock, flags);
+	init_rw_lists(rwm);
+
+	if (try_to_take_rw_read(rwm, mtx))
+		ret = 1;
+
+	spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+	return ret;
+}
+
+static inline int
+rt_read_fasttrylock(struct rw_mutex *rwm,
+		    int (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx)
+{
+	if (likely(__rt_read_fasttrylock(rwm)))
+		return 1;
+	else
+		return slowfn(rwm, mtx);
+}
+
+int __sched rt_mutex_down_read_trylock(struct rw_mutex *rwm)
+{
+	return rt_read_fasttrylock(rwm, rt_read_slowtrylock, 1);
+}
+
+static void
+rt_write_slowlock(struct rw_mutex *rwm, int mtx)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	struct rt_mutex_waiter waiter;
+	int saved_lock_depth = -1;
+	unsigned long flags, saved_state = -1, state;
+
+	debug_rt_mutex_init_waiter(&waiter);
+	waiter.task = NULL;
+
+	/* we do PI different for writers that are blocked */
+	waiter.write_lock = 1;
+
+	spin_lock_irqsave(&mutex->wait_lock, flags);
+	init_rw_lists(rwm);
+
+	if (try_to_take_rw_write(rwm, mtx)) {
+		spin_unlock_irqrestore(&mutex->wait_lock, flags);
+		return;
+	}
+	update_rw_mutex_owner(rwm);
+
+	if (mtx) {
+		/*
+		 * We drop the BKL here before we go into the wait loop to avoid a
+		 * possible deadlock in the scheduler.
+		 */
+		if (unlikely(current->lock_depth >= 0))
+			saved_lock_depth = rt_release_bkl(mutex, flags);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+	} else {
+		/* Spin locks must preserve the BKL */
+		saved_lock_depth = current->lock_depth;
+		saved_state = xchg(&current->state, TASK_UNINTERRUPTIBLE);
+	}
+
+	for (;;) {
+		unsigned long saved_flags;
+
+		/* Try to acquire the lock: */
+		if (try_to_take_rw_write(rwm, mtx))
+			break;
+		update_rw_mutex_owner(rwm);
+
+		/*
+		 * waiter.task is NULL the first time we come here and
+		 * when we have been woken up by the previous owner
+		 * but the lock got stolen by a higher prio task.
+		 */
+		if (!waiter.task) {
+			task_blocks_on_rt_mutex(mutex, &waiter, 0, flags);
+			/* Wakeup during boost ? */
+			if (unlikely(!waiter.task))
+				continue;
+		}
+		saved_flags = current->flags & PF_NOSCHED;
+		current->flags &= ~PF_NOSCHED;
+		if (!mtx)
+			current->lock_depth = -1;
+
+		spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+		debug_rt_mutex_print_deadlock(&waiter);
+
+		if (!mtx || waiter.task)
+			schedule_rt_mutex(mutex);
+
+		spin_lock_irqsave(&mutex->wait_lock, flags);
+
+		current->flags |= saved_flags;
+		if (mtx)
+			set_current_state(TASK_UNINTERRUPTIBLE);
+		else {
+			current->lock_depth = saved_lock_depth;
+			state = xchg(&current->state, TASK_UNINTERRUPTIBLE);
+			if (unlikely(state == TASK_RUNNING))
+				saved_state = TASK_RUNNING;
+		}
+	}
+
+	if (mtx)
+		set_current_state(TASK_RUNNING);
+	else {
+		state = xchg(&current->state, saved_state);
+		if (unlikely(state == TASK_RUNNING))
+			current->state = TASK_RUNNING;
+	}
+
+	if (unlikely(waiter.task))
+		remove_waiter(mutex, &waiter, flags);
+
+	/* check on unlock if we have any waiters. */
+	if (rt_mutex_has_waiters(mutex))
+		mark_rt_rwlock_check(rwm);
+
+	spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+	/* Must we reaquire the BKL? */
+	if (mtx && unlikely(saved_lock_depth >= 0))
+		rt_reacquire_bkl(saved_lock_depth);
+
+	debug_rt_mutex_free_waiter(&waiter);
+
+}
+
+static inline void
+rt_write_fastlock(struct rw_mutex *rwm,
+		  void (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx)
+{
+	struct task_struct *val = (void *)((unsigned long)current | RT_RWLOCK_WRITER);
+
+	if (likely(rt_rwlock_cmpxchg(rwm, NULL, val)))
+		rt_mutex_deadlock_account_lock(&rwm->mutex, current);
+	else
+		slowfn(rwm, mtx);
+}
+
+void rt_mutex_down_write(struct rw_mutex *rwm)
+{
+	rt_write_fastlock(rwm, rt_write_slowlock, 1);
+}
+
+void rt_rwlock_write_lock(struct rw_mutex *rwm)
+{
+	rt_write_fastlock(rwm, rt_write_slowlock, 0);
+}
+
+static int
+rt_write_slowtrylock(struct rw_mutex *rwm, int mtx)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&mutex->wait_lock, flags);
+	init_rw_lists(rwm);
+
+	if (try_to_take_rw_write(rwm, mtx))
+		ret = 1;
+
+	spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+	return ret;
+}
+
+static inline int
+rt_write_fasttrylock(struct rw_mutex *rwm,
+		     int (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx)
+{
+	struct task_struct *val = (void *)((unsigned long)current | RT_RWLOCK_WRITER);
+
+	if (likely(rt_rwlock_cmpxchg(rwm, NULL, val))) {
+		rt_mutex_deadlock_account_lock(&rwm->mutex, current);
+		return 1;
+	} else
+		return slowfn(rwm, mtx);
+}
+
+int rt_mutex_down_write_trylock(struct rw_mutex *rwm)
+{
+	return rt_write_fasttrylock(rwm, rt_write_slowtrylock, 1);
+}
+
+static void noinline __sched
+rt_read_slowunlock(struct rw_mutex *rwm, int mtx)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	struct rt_mutex_waiter *waiter;
+	struct reader_lock_struct *rls;
+	unsigned long flags;
+	unsigned int reader_count;
+	int savestate = !mtx;
+	int i;
+
+	spin_lock_irqsave(&mutex->wait_lock, flags);
+
+	rt_mutex_deadlock_account_unlock(current);
+
+	/*
+	 * To prevent multiple readers from zeroing out the owner
+	 * when the count goes to zero and then have another task
+	 * grab the task. We mark the lock. This makes all tasks
+	 * go to the slow path. Then we can check the owner without
+	 * worry that it changed.
+	 */
+	mark_rt_rwlock_check(rwm);
+
+	for (i = current->reader_lock_count - 1; i >= 0; i--) {
+		if (current->owned_read_locks[i].lock == rwm) {
+			current->owned_read_locks[i].count--;
+			if (!current->owned_read_locks[i].count) {
+				current->reader_lock_count--;
+				WARN_ON_ONCE(i != current->reader_lock_count);
+				atomic_dec(&rwm->owners);
+				rls = &current->owned_read_locks[i];
+				WARN_ON(!rls->list.prev || list_empty(&rls->list));
+				list_del_init(&rls->list);
+				rls->lock = NULL;
+			}
+			break;
+		}
+	}
+	WARN_ON_ONCE(i < 0);
+
+	/*
+	 * If the last two (or more) readers unlocked at the same
+	 * time, the owner could be cleared since the count went to
+	 * zero. If this has happened, the rwm owner will not
+	 * be set to current or readers. This means that another reader
+	 * already reset the lock, so there is nothing left to do.
+	 */
+	if (unlikely(rt_rwlock_owner(rwm) != current &&
+		     rt_rwlock_owner(rwm) != RT_RW_READER)) {
+		/* Update the owner if necessary */
+		rt_rwlock_update_owner(rwm, atomic_read(&rwm->owners));
+		goto out;
+	}
+
+	/*
+	 * If there are more readers and we are under the limit
+	 * let the last reader do the wakeups.
+	 */
+	reader_count = atomic_read(&rwm->count);
+	if (reader_count &&
+	    (!rt_rwlock_limit || atomic_read(&rwm->owners) >= rt_rwlock_limit))
+		goto out;
+
+	/* If no one is blocked, then clear all ownership */
+	if (!rt_mutex_has_waiters(mutex)) {
+		rwm->prio = MAX_PRIO;
+		/*
+		 * If count is not zero, we are under the limit with
+		 * no other readers.
+		 */
+		if (reader_count)
+			goto out;
+
+		/* We could still have a pending reader waiting */
+		if (rt_mutex_owner_pending(mutex)) {
+			/* set the rwm back to pending */
+			rwm->owner = RT_RW_PENDING_READ;
+		} else {
+			rwm->owner = NULL;
+			mutex->owner = NULL;
+		}
+		goto out;
+	}
+
+	/*
+	 * If the next waiter is a reader, this can be because of
+	 * two things. One is that we hit the reader limit, or
+	 * Two, there is a pending writer.
+	 * We still only wake up one reader at a time (even if
+	 * we could wake up more). This is because we dont
+	 * have any idea if a writer is pending.
+	 */
+	waiter = rt_mutex_top_waiter(mutex);
+	if (waiter->write_lock) {
+		/* only wake up if there are no readers */
+		if (reader_count)
+			goto out;
+		rwm->owner = RT_RW_PENDING_WRITE;
+	} else {
+		/*
+		 * It is also possible that the reader limit decreased.
+		 * If the limit did decrease, we may not be able to
+		 * wake up the reader if we are currently above the limit.
+		 */
+		if (rt_rwlock_limit &&
+		    unlikely(atomic_read(&rwm->owners) >= rt_rwlock_limit))
+			goto out;
+		rwm->owner = RT_RW_PENDING_READ;
+	}
+
+	wakeup_next_waiter(mutex, savestate);
+
+ out:
+	spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+	/* Undo pi boosting.when necessary */
+	rt_mutex_adjust_prio(current);
+}
+
+static inline void
+rt_read_fastunlock(struct rw_mutex *rwm,
+		   void (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx)
+{
+	WARN_ON(!atomic_read(&rwm->count));
+	WARN_ON(!atomic_read(&rwm->owners));
+	WARN_ON(!rwm->owner);
+	atomic_dec(&rwm->count);
+	if (likely(rt_rwlock_cmpxchg(rwm, current, NULL))) {
+		struct reader_lock_struct *rls;
+		int reader_count = --current->reader_lock_count;
+		int owners;
+		rt_mutex_deadlock_account_unlock(current);
+		if (unlikely(reader_count < 0)) {
+			    reader_count = 0;
+			    WARN_ON_ONCE(1);
+		}
+		owners = atomic_dec_return(&rwm->owners);
+		if (unlikely(owners < 0)) {
+			atomic_set(&rwm->owners, 0);
+			WARN_ON_ONCE(1);
+		}
+		rls = &current->owned_read_locks[reader_count];
+		WARN_ON_ONCE(rls->lock != rwm);
+		WARN_ON(rls->list.prev && !list_empty(&rls->list));
+		rls->lock = NULL;
+	} else
+		slowfn(rwm, mtx);
+}
+
+void rt_mutex_up_read(struct rw_mutex *rwm)
+{
+	rt_read_fastunlock(rwm, rt_read_slowunlock, 1);
+}
+
+void rt_rwlock_read_unlock(struct rw_mutex *rwm)
+{
+	rt_read_fastunlock(rwm, rt_read_slowunlock, 0);
+}
+
+static void noinline __sched
+rt_write_slowunlock(struct rw_mutex *rwm, int mtx)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	struct rt_mutex_waiter *waiter;
+	struct task_struct *pendowner;
+	int savestate = !mtx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&mutex->wait_lock, flags);
+
+	rt_mutex_deadlock_account_unlock(current);
+
+	if (!rt_mutex_has_waiters(mutex)) {
+		rwm->owner = NULL;
+		mutex->owner = NULL;
+		spin_unlock_irqrestore(&mutex->wait_lock, flags);
+		return;
+	}
+
+	debug_rt_mutex_unlock(mutex);
+
+	/*
+	 * This is where it gets a bit tricky.
+	 * We can have both readers and writers waiting below us.
+	 * They are ordered by priority. For each reader we wake
+	 * up, we check to see if there's another reader waiting.
+	 * If that is the case, we continue to wake up the readers
+	 * until we hit a writer. Once we hit a writer, then we
+	 * stop (and don't wake it up).
+	 *
+	 * If the next waiter is a writer, than we just wake up
+	 * the writer and we are done.
+	 */
+
+	waiter = rt_mutex_top_waiter(mutex);
+	pendowner = waiter->task;
+	wakeup_next_waiter(mutex, savestate);
+
+	/* another writer is next? */
+	if (waiter->write_lock) {
+		rwm->owner = RT_RW_PENDING_WRITE;
+		goto out;
+	}
+
+	rwm->owner = RT_RW_PENDING_READ;
+
+	if (!rt_mutex_has_waiters(mutex))
+		goto out;
+
+	spin_lock(&pendowner->pi_lock);
+	/*
+	 * Wake up all readers.
+	 * This gets a bit more complex. More than one reader can't
+	 * own the mutex. We give it to the first (highest prio)
+	 * reader, and then wake up the rest of the readers until
+	 * we wake up all readers or come to a writer. The woken
+	 * up readers that don't own the lock will try to take it
+	 * when they schedule. Doing this lets a high prio writer
+	 * come along and steal the lock.
+	 */
+	waiter = rt_mutex_top_waiter(mutex);
+	while (waiter && !waiter->write_lock) {
+		struct task_struct *reader = waiter->task;
+
+		plist_del(&waiter->list_entry, &mutex->wait_list);
+
+		/* nop if not on a list */
+		plist_del(&waiter->pi_list_entry, &pendowner->pi_waiters);
+
+		waiter->task = NULL;
+		reader->pi_blocked_on = NULL;
+
+		if (savestate)
+			wake_up_process_mutex(reader);
+		else
+			wake_up_process(reader);
+
+		if (rt_mutex_has_waiters(mutex))
+			waiter = rt_mutex_top_waiter(mutex);
+		else
+			waiter = NULL;
+	}
+
+	/* If a writer is still pending, then update its plist. */
+	if (rt_mutex_has_waiters(mutex)) {
+		struct rt_mutex_waiter *next;
+
+		next = rt_mutex_top_waiter(mutex);
+		/* delete incase we didn't go through the loop */
+		plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
+		/* add back in as top waiter */
+		plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
+	}
+	spin_unlock(&pendowner->pi_lock);
+
+ out:
+
+	spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+	/* Undo pi boosting.when necessary */
+	rt_mutex_adjust_prio(current);
+}
+
+static inline void
+rt_write_fastunlock(struct rw_mutex *rwm,
+		   void (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx)
+{
+	struct task_struct *val = (void *)((unsigned long)current | RT_RWLOCK_WRITER);
+
+	WARN_ON(rt_rwlock_owner(rwm) != current);
+	if (likely(rt_rwlock_cmpxchg(rwm, (struct task_struct *)val, NULL)))
+		rt_mutex_deadlock_account_unlock(current);
+	else
+		slowfn(rwm, mtx);
+}
+
+void rt_mutex_up_write(struct rw_mutex *rwm)
+{
+	rt_write_fastunlock(rwm, rt_write_slowunlock, 1);
+}
+
+void rt_rwlock_write_unlock(struct rw_mutex *rwm)
+{
+	rt_write_fastunlock(rwm, rt_write_slowunlock, 0);
+}
+
+/*
+ * We own the lock for write, and we want to convert it to a read,
+ * so we simply take the lock as read, and wake up all other readers.
+ */
+void __sched
+rt_mutex_downgrade_write(struct rw_mutex *rwm)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+	struct reader_lock_struct *rls;
+	struct rt_mutex_waiter *waiter;
+	unsigned long flags;
+	int reader_count;
+
+	spin_lock_irqsave(&mutex->wait_lock, flags);
+	init_rw_lists(rwm);
+
+	/* we have the lock and are sole owner, then update the accounting */
+	atomic_inc(&rwm->count);
+	atomic_inc(&rwm->owners);
+	reader_count = current->reader_lock_count++;
+	rls = &current->owned_read_locks[reader_count];
+	if (likely(reader_count < MAX_RWLOCK_DEPTH)) {
+		rls->lock = rwm;
+		rls->count = 1;
+	} else
+		WARN_ON_ONCE(1);
+
+	if (!rt_mutex_has_waiters(mutex)) {
+		/* We are sole owner, we are done */
+		rwm->owner = current;
+		rwm->prio = MAX_PRIO;
+		mutex->owner = NULL;
+		spin_unlock_irqrestore(&mutex->wait_lock, flags);
+		return;
+	}
+
+	/* Set us up for multiple readers or conflicts */
+
+	list_add(&rls->list, &rwm->readers);
+	rwm->owner = RT_RW_READER;
+
+	/*
+	 * This is like the write unlock, but we already own the
+	 * reader. We still want to wake up other readers that are
+	 * waiting, until we hit the reader limit, or a writer.
+	 */
+
+	spin_lock(&current->pi_lock);
+	waiter = rt_mutex_top_waiter(mutex);
+	while (waiter && !waiter->write_lock) {
+		struct task_struct *reader = waiter->task;
+
+		plist_del(&waiter->list_entry, &mutex->wait_list);
+
+		/* nop if not on a list */
+		plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+
+		waiter->task = NULL;
+		reader->pi_blocked_on = NULL;
+
+		/* downgrade is only for mutexes */
+		wake_up_process(reader);
+
+		if (rt_mutex_has_waiters(mutex))
+			waiter = rt_mutex_top_waiter(mutex);
+		else
+			waiter = NULL;
+	}
+
+	/* If a writer is still pending, then update its plist. */
+	if (rt_mutex_has_waiters(mutex)) {
+		struct rt_mutex_waiter *next;
+
+		next = rt_mutex_top_waiter(mutex);
+
+		/* setup this mutex prio for read */
+		rwm->prio = next->task->prio;
+
+		/* delete incase we didn't go through the loop */
+		plist_del(&next->pi_list_entry, &current->pi_waiters);
+		/* add back in as top waiter */
+		plist_add(&next->pi_list_entry, &current->pi_waiters);
+	} else
+		rwm->prio = MAX_PRIO;
+
+	spin_unlock(&current->pi_lock);
+
+	rt_mutex_set_owner(mutex, RT_RW_READER, 0);
+
+	spin_unlock_irqrestore(&mutex->wait_lock, flags);
+
+	/*
+	 * Undo pi boosting when necessary.
+	 * If one of the awoken readers boosted us, we don't want to keep
+	 * that priority.
+	 */
+	rt_mutex_adjust_prio(current);
+}
+
+void rt_mutex_rwsem_init(struct rw_mutex *rwm, const char *name)
+{
+	struct rt_mutex *mutex = &rwm->mutex;
+
+	rwm->owner = NULL;
+	atomic_set(&rwm->count, 0);
+	atomic_set(&rwm->owners, 0);
+	rwm->prio = MAX_PRIO;
+	INIT_LIST_HEAD(&rwm->readers);
+
+	__rt_mutex_init(mutex, name);
+}
+
+static int rt_mutex_get_readers_prio(struct task_struct *task, int prio)
+{
+	struct reader_lock_struct *rls;
+	struct rw_mutex *rwm;
+	int lock_prio;
+	int i;
+
+	for (i = 0; i < task->reader_lock_count; i++) {
+		rls = &task->owned_read_locks[i];
+		rwm = rls->lock;
+		if (rwm) {
+			lock_prio = rwm->prio;
+			if (prio > lock_prio)
+				prio = lock_prio;
+		}
+	}
+
+	return prio;
+}
+
+static int rt_mutex_adjust_readers(struct rt_mutex *orig_lock,
+				   struct rt_mutex_waiter *orig_waiter,
+				   struct task_struct *top_task,
+				   struct rt_mutex *lock,
+				   int recursion_depth)
+{
+	struct reader_lock_struct *rls;
+	struct rt_mutex_waiter *waiter;
+	struct task_struct *task;
+	struct rw_mutex *rwm = container_of(lock, struct rw_mutex, mutex);
+
+	if (rt_mutex_has_waiters(lock)) {
+		waiter = rt_mutex_top_waiter(lock);
+		/*
+		 * Do we need to grab the task->pi_lock?
+		 * Really, we are only reading it. If it
+		 * changes, then that should follow this chain
+		 * too.
+		 */
+		rwm->prio = waiter->task->prio;
+	} else
+		rwm->prio = MAX_PRIO;
+
+	if (recursion_depth >= MAX_RWLOCK_DEPTH) {
+		WARN_ON(1);
+		return 1;
+	}
+
+	list_for_each_entry(rls, &rwm->readers, list) {
+		task = rls->task;
+		get_task_struct(task);
+		/*
+		 * rt_mutex_adjust_prio_chain will do
+		 * the put_task_struct
+		 */
+		rt_mutex_adjust_prio_chain(task, 0, orig_lock,
+					   orig_waiter, top_task,
+					   recursion_depth+1);
+	}
+
+	return 0;
+}
+#else
+static int rt_mutex_adjust_readers(struct rt_mutex *orig_lock,
+				   struct rt_mutex_waiter *orig_waiter,
+				   struct task_struct *top_task,
+				   struct rt_mutex *lock,
+				   int recursion_depth)
+{
+	return 0;
+}
+
+static int rt_mutex_get_readers_prio(struct task_struct *task, int prio)
+{
+	return prio;
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags)
+{
+	int saved_lock_depth = current->lock_depth;
+
+	current->lock_depth = -1;
+	/*
+	 * try_to_take_lock set the waiters, make sure it's
+	 * still correct.
+	 */
+	fixup_rt_mutex_waiters(lock);
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+	up(&kernel_sem);
+
+	spin_lock_irq(&lock->wait_lock);
+
+	return saved_lock_depth;
+}
+
+static inline void rt_reacquire_bkl(int saved_lock_depth)
+{
+	down(&kernel_sem);
+	current->lock_depth = saved_lock_depth;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+		  struct hrtimer_sleeper *timeout,
+		  int detect_deadlock)
+{
+	int ret = 0, saved_lock_depth = -1;
+	struct rt_mutex_waiter waiter;
+	unsigned long flags;
+
+	debug_rt_mutex_init_waiter(&waiter);
+	waiter.task = NULL;
+	waiter.write_lock = 0;
+
+	spin_lock_irqsave(&lock->wait_lock, flags);
+	init_lists(lock);
+
+	/* Try to acquire the lock again: */
+	if (try_to_take_rt_mutex(lock)) {
+		spin_unlock_irqrestore(&lock->wait_lock, flags);
+		return 0;
+	}
+
+	/*
+	 * We drop the BKL here before we go into the wait loop to avoid a
+	 * possible deadlock in the scheduler.
+	 */
+	if (unlikely(current->lock_depth >= 0))
+		saved_lock_depth = rt_release_bkl(lock, flags);
+
+	set_current_state(state);
+
+	/* Setup the timer, when timeout != NULL */
+	if (unlikely(timeout)) {
+		hrtimer_start(&timeout->timer, timeout->timer.expires,
+			      HRTIMER_MODE_ABS);
+		if (!hrtimer_active(&timeout->timer))
+			timeout->task = NULL;
+	}
+
+	for (;;) {
+		unsigned long saved_flags;
+
+		/* Try to acquire the lock: */
+		if (try_to_take_rt_mutex(lock))
+			break;
+
+		/*
+		 * TASK_INTERRUPTIBLE checks for signals and
+		 * timeout. Ignored otherwise.
+		 */
+		if (unlikely(state == TASK_INTERRUPTIBLE)) {
+			/* Signal pending? */
+			if (signal_pending(current))
+				ret = -EINTR;
+			if (timeout && !timeout->task)
+				ret = -ETIMEDOUT;
+			if (ret)
+				break;
+		}
+
+		/*
+		 * waiter.task is NULL the first time we come here and
+		 * when we have been woken up by the previous owner
+		 * but the lock got stolen by a higher prio task.
+		 */
+		if (!waiter.task) {
+			ret = task_blocks_on_rt_mutex(lock, &waiter,
+						      detect_deadlock, flags);
+			/*
+			 * If we got woken up by the owner then start loop
+			 * all over without going into schedule to try
+			 * to get the lock now:
+			 */
+			if (unlikely(!waiter.task)) {
+				/*
+				 * Reset the return value. We might
+				 * have returned with -EDEADLK and the
+				 * owner released the lock while we
+				 * were walking the pi chain.
+				 */
+				ret = 0;
+				continue;
+			}
+			if (unlikely(ret))
+				break;
+		}
+		saved_flags = current->flags & PF_NOSCHED;
+		current->flags &= ~PF_NOSCHED;
+
+		spin_unlock_irq(&lock->wait_lock);
+
+		debug_rt_mutex_print_deadlock(&waiter);
+
+		if (waiter.task)
+			schedule_rt_mutex(lock);
+
+		spin_lock_irq(&lock->wait_lock);
+
+		current->flags |= saved_flags;
+		set_current_state(state);
+	}
+
+	set_current_state(TASK_RUNNING);
+
+	if (unlikely(waiter.task))
+		remove_waiter(lock, &waiter, flags);
+
+	/*
+	 * try_to_take_rt_mutex() sets the waiter bit
+	 * unconditionally. We might have to fix that up.
+	 */
+	fixup_rt_mutex_waiters(lock);
+
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+	/* Remove pending timer: */
+	if (unlikely(timeout))
+		hrtimer_cancel(&timeout->timer);
+
+	/*
+	 * Readjust priority, when we did not get the lock. We might
+	 * have been the pending owner and boosted. Since we did not
+	 * take the lock, the PI boost has to go.
+	 */
+	if (unlikely(ret))
+		rt_mutex_adjust_prio(current);
+
+	/* Must we reaquire the BKL? */
+	if (unlikely(saved_lock_depth >= 0))
+		rt_reacquire_bkl(saved_lock_depth);
+
+	debug_rt_mutex_free_waiter(&waiter);
+
+	return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int
+rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&lock->wait_lock, flags);
+
+	if (likely(rt_mutex_owner(lock) != current)) {
+
+		init_lists(lock);
+
+		ret = try_to_take_rt_mutex(lock);
+		/*
+		 * try_to_take_rt_mutex() sets the lock waiters
+		 * bit unconditionally. Clean this up.
+		 */
+		fixup_rt_mutex_waiters(lock);
+	}
+
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lock->wait_lock, flags);
+
+	debug_rt_mutex_unlock(lock);
+
+	rt_mutex_deadlock_account_unlock(current);
+
+	if (!rt_mutex_has_waiters(lock)) {
+		lock->owner = NULL;
+		spin_unlock_irqrestore(&lock->wait_lock, flags);
+		return;
+	}
+
+	wakeup_next_waiter(lock, 0);
+
+	spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+	/* Undo pi boosting if necessary: */
+	rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+		  int detect_deadlock,
+		  int (*slowfn)(struct rt_mutex *lock, int state,
+				struct hrtimer_sleeper *timeout,
+				int detect_deadlock))
+{
+	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 0;
+	} else
+		return slowfn(lock, state, NULL, detect_deadlock);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+			struct hrtimer_sleeper *timeout, int detect_deadlock,
+			int (*slowfn)(struct rt_mutex *lock, int state,
+				      struct hrtimer_sleeper *timeout,
+				      int detect_deadlock))
+{
+	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 0;
+	} else
 		return slowfn(lock, state, timeout, detect_deadlock);
 }
 
@@ -831,6 +2429,27 @@ rt_mutex_fastunlock(struct rt_mutex *loc
 }
 
 /**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: 		the rt_mutex to be locked
+ * @detect_deadlock:	deadlock detection on/off
+ *
+ * Returns:
+ *  0 		on success
+ * -EINTR 	when interrupted by a signal
+ * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock,
+				   int detect_deadlock)
+{
+	might_sleep();
+
+	return rt_mutex_fastlock(lock, TASK_KILLABLE,
+				 detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
  * rt_mutex_lock - lock a rt_mutex
  *
  * @lock: the rt_mutex to be locked
Index: linux-2.6.25.4-rt4/kernel/sched.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/sched.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/sched.c	2008-05-29 09:47:32.000000000 -0400
@@ -4,6 +4,7 @@
  *  Kernel scheduler and related syscalls
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
+ *  Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  *		make semaphores SMP safe
@@ -16,6 +17,7 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2004-10-13  Real-Time Preemption support by Ingo Molnar
  *  2007-04-15  Work begun on replacing all interactivity tuning with a
  *              fair scheduling design by Con Kolivas.
  *  2007-05-05  Load balancing (smp-nice) and other improvements
@@ -59,6 +61,7 @@
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
+#include <linux/kallsyms.h>
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
@@ -66,19 +69,12 @@
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
+#include <linux/ftrace.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
-/*
- * Scheduler clock - returns current time in nanosec units.
- * This is default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
-{
-	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
-}
+#include "sched_cpupri.h"
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -89,6 +85,11 @@ unsigned long long __attribute__((weak))
 #define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
 #define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
 
+#define __PRIO(prio) \
+	((prio) <= 99 ? 199 - (prio) : (prio) - 120)
+
+#define PRIO(p) __PRIO((p)->prio)
+
 /*
  * 'User priority' is the nice value converted to something we
  * can work with better when scaling various scheduler parameters,
@@ -106,6 +107,20 @@ unsigned long long __attribute__((weak))
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
 
+#if (BITS_PER_LONG < 64)
+#define JIFFIES_TO_NS64(TIME) \
+	((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ)))
+
+#define NS64_TO_JIFFIES(TIME) \
+	((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \
+	(1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME)))
+#else /* BITS_PER_LONG < 64 */
+
+#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME)
+#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME)
+
+#endif /* BITS_PER_LONG < 64 */
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
@@ -135,6 +150,32 @@ static inline void sg_inc_cpu_power(stru
 }
 #endif
 
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((p)->prio < (rq)->curr->prio)
+
+/*
+ * Tweaks for current
+ */
+
+#ifdef CURRENT_PTR
+struct task_struct * const ___current = &init_task;
+struct task_struct ** const current_ptr = (struct task_struct ** const)&___current;
+struct thread_info * const current_ti = &init_thread_union.thread_info;
+struct thread_info ** const current_ti_ptr = (struct thread_info ** const)&current_ti;
+
+EXPORT_SYMBOL(___current);
+EXPORT_SYMBOL(current_ti);
+
+/*
+ * The scheduler itself doesnt want 'current' to be cached
+ * during context-switches:
+ */
+# undef current
+# define current __current()
+# undef current_thread_info
+# define current_thread_info() __current_thread_info()
+#endif
+
 static inline int rt_policy(int policy)
 {
 	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
@@ -152,7 +193,8 @@ static inline int task_has_rt_policy(str
  */
 struct rt_prio_array {
 	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
-	struct list_head queue[MAX_RT_PRIO];
+	struct list_head xqueue[MAX_RT_PRIO];  /* exclusive queue */
+	struct list_head squeue[MAX_RT_PRIO];  /* shared queue */
 };
 
 #ifdef CONFIG_GROUP_SCHED
@@ -332,6 +374,7 @@ struct rt_rq {
 	unsigned long rt_nr_migratory;
 	int overloaded;
 #endif
+	unsigned long rt_nr_uninterruptible;
 	int rt_throttled;
 	u64 rt_time;
 
@@ -366,6 +409,9 @@ struct root_domain {
 	 */
 	cpumask_t rto_mask;
 	atomic_t rto_count;
+#ifdef CONFIG_SMP
+	struct cpupri cpupri;
+#endif
 };
 
 /*
@@ -385,7 +431,7 @@ static struct root_domain def_root_domai
  */
 struct rq {
 	/* runqueue lock: */
-	spinlock_t lock;
+	raw_spinlock_t lock;
 
 	/*
 	 * nr_running and cpu_load should be in the same cacheline because
@@ -424,17 +470,13 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
+	unsigned long switch_timestamp;
+	unsigned long slice_avg;
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	u64 clock, prev_clock_raw;
-	s64 clock_max_delta;
-
-	unsigned int clock_warps, clock_overflows, clock_underflows;
-	u64 idle_clock;
-	unsigned int clock_deep_idle_events;
-	u64 tick_timestamp;
+	u64 clock;
 
 	atomic_t nr_iowait;
 
@@ -479,6 +521,13 @@ struct rq {
 
 	/* BKL stats */
 	unsigned int bkl_count;
+
+	/* RT-overload stats: */
+	unsigned long rto_schedule;
+	unsigned long rto_schedule_tail;
+	unsigned long rto_wakeup;
+	unsigned long rto_pulled;
+	unsigned long rto_pushed;
 #endif
 	struct lock_class_key rq_lock_key;
 };
@@ -499,52 +548,7 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-/*
- * Update the per-runqueue clock, as finegrained as the platform can give
- * us, but without assuming monotonicity, etc.:
- */
-static void __update_rq_clock(struct rq *rq)
-{
-	u64 prev_raw = rq->prev_clock_raw;
-	u64 now = sched_clock();
-	s64 delta = now - prev_raw;
-	u64 clock = rq->clock;
-
-#ifdef CONFIG_SCHED_DEBUG
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-#endif
-	/*
-	 * Protect against sched_clock() occasionally going backwards:
-	 */
-	if (unlikely(delta < 0)) {
-		clock++;
-		rq->clock_warps++;
-	} else {
-		/*
-		 * Catch too large forward jumps too:
-		 */
-		if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
-			if (clock < rq->tick_timestamp + TICK_NSEC)
-				clock = rq->tick_timestamp + TICK_NSEC;
-			else
-				clock++;
-			rq->clock_overflows++;
-		} else {
-			if (unlikely(delta > rq->clock_max_delta))
-				rq->clock_max_delta = delta;
-			clock += delta;
-		}
-	}
-
-	rq->prev_clock_raw = now;
-	rq->clock = clock;
-}
-
-static void update_rq_clock(struct rq *rq)
-{
-	if (likely(smp_processor_id() == cpu_of(rq)))
-		__update_rq_clock(rq);
-}
+#include "sched_trace.h"
 
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -561,6 +565,11 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+static inline void update_rq_clock(struct rq *rq)
+{
+	rq->clock = sched_clock_cpu(cpu_of(rq));
+}
+
 unsigned long rt_needs_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -587,6 +596,31 @@ unsigned long rt_needs_cpu(int cpu)
 # define const_debug static const
 #endif
 
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+int runqueue_is_locked(void)
+{
+	int cpu = get_cpu();
+	struct rq *rq = cpu_rq(cpu);
+	int ret;
+
+	ret = spin_is_locked(&rq->lock);
+	put_cpu();
+	return ret;
+}
+
+#ifndef CONFIG_SMP
+int task_is_current(struct task_struct *task)
+{
+	return task_rq(task)->curr == task;
+}
+#endif
+
 /*
  * Debugging: various feature bits
  */
@@ -611,7 +645,11 @@ const_debug unsigned int sysctl_sched_fe
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
+#ifdef CONFIG_PREEMPT_RT
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#endif
 
 /*
  * period over which we measure -rt task cpu usage in us.
@@ -625,7 +663,8 @@ static __read_mostly int scheduler_runni
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
  */
-int sysctl_sched_rt_runtime = 950000;
+/* int sysctl_sched_rt_runtime = 950000; */
+int sysctl_sched_rt_runtime = -1;
 
 /*
  * single value that denotes runtime == period, ie unlimited time.
@@ -633,37 +672,22 @@ int sysctl_sched_rt_runtime = 950000;
 #define RUNTIME_INF	((u64)~0ULL)
 
 /*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
+ * We really dont want to do anything complex within switch_to()
+ * on PREEMPT_RT - this check enforces this.
  */
-unsigned long long cpu_clock(int cpu)
-{
-	unsigned long long now;
-	unsigned long flags;
-	struct rq *rq;
-
-	/*
-	 * Only call sched_clock() if the scheduler has already been
-	 * initialized (some code might call cpu_clock() very early):
-	 */
-	if (unlikely(!scheduler_running))
-		return 0;
-
-	local_irq_save(flags);
-	rq = cpu_rq(cpu);
-	update_rq_clock(rq);
-	now = rq->clock;
-	local_irq_restore(flags);
-
-	return now;
-}
-EXPORT_SYMBOL_GPL(cpu_clock);
+#ifdef prepare_arch_switch
+# ifdef CONFIG_PREEMPT_RT
+#   error FIXME
+# else
+#  define _finish_arch_switch finish_arch_switch
+# endif
+#endif
 
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
 #ifndef finish_arch_switch
-# define finish_arch_switch(prev)	do { } while (0)
+# define _finish_arch_switch(prev)	do { } while (0)
 #endif
 
 static inline int task_current(struct rq *rq, struct task_struct *p)
@@ -671,18 +695,39 @@ static inline int task_current(struct rq
 	return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+	return p->oncpu;
+#else
 	return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->oncpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->oncpu = 0;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
 	rq->lock.owner = current;
@@ -694,18 +739,10 @@ static inline void finish_lock_switch(st
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 
-	spin_unlock_irq(&rq->lock);
+	spin_unlock(&rq->lock);
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-	return p->oncpu;
-#else
-	return task_current(rq, p);
-#endif
-}
 
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
@@ -735,8 +772,8 @@ static inline void finish_lock_switch(st
 	smp_wmb();
 	prev->oncpu = 0;
 #endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-	local_irq_enable();
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_disable();
 #endif
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -804,43 +841,6 @@ static struct rq *this_rq_lock(void)
 	return rq;
 }
 
-/*
- * We are going deep-idle (irqs are disabled):
- */
-void sched_clock_idle_sleep_event(void)
-{
-	struct rq *rq = cpu_rq(smp_processor_id());
-
-	spin_lock(&rq->lock);
-	__update_rq_clock(rq);
-	spin_unlock(&rq->lock);
-	rq->clock_deep_idle_events++;
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
-
-/*
- * We just idled delta nanoseconds (called with irqs disabled):
- */
-void sched_clock_idle_wakeup_event(u64 delta_ns)
-{
-	struct rq *rq = cpu_rq(smp_processor_id());
-	u64 now = sched_clock();
-
-	rq->idle_clock += delta_ns;
-	/*
-	 * Override the previous timestamp and ignore all
-	 * sched_clock() deltas that occured while we idled,
-	 * and use the PM-provided delta_ns to advance the
-	 * rq clock:
-	 */
-	spin_lock(&rq->lock);
-	rq->prev_clock_raw = now;
-	rq->clock += delta_ns;
-	spin_unlock(&rq->lock);
-	touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-
 static void __resched_task(struct task_struct *p, int tif_bit);
 
 static inline void resched_task(struct task_struct *p)
@@ -965,7 +965,7 @@ static enum hrtimer_restart hrtick(struc
 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 
 	spin_lock(&rq->lock);
-	__update_rq_clock(rq);
+	update_rq_clock(rq);
 	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
 	spin_unlock(&rq->lock);
 
@@ -1131,7 +1131,7 @@ void wake_up_idle_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 
-	if (cpu == smp_processor_id())
+	if (cpu == raw_smp_processor_id())
 		return;
 
 	/*
@@ -1400,6 +1400,8 @@ static inline int normal_prio(struct tas
 		prio = MAX_RT_PRIO-1 - p->rt_priority;
 	else
 		prio = __normal_prio(p);
+
+//	trace_special_pid(p->pid, PRIO(p), __PRIO(prio));
 	return prio;
 }
 
@@ -1431,6 +1433,7 @@ static void activate_task(struct rq *rq,
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible--;
 
+	ftrace_event_task_activate(p, cpu_of(rq));
 	enqueue_task(rq, p, wakeup);
 	inc_nr_running(p, rq);
 }
@@ -1443,6 +1446,7 @@ static void deactivate_task(struct rq *r
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible++;
 
+	ftrace_event_task_deactivate(p, cpu_of(rq));
 	dequeue_task(rq, p, sleep);
 	dec_nr_running(p, rq);
 }
@@ -1626,6 +1630,7 @@ void wait_task_inactive(struct task_stru
 		 * just go back and repeat.
 		 */
 		rq = task_rq_lock(p, &flags);
+		trace_kernel_sched_wait(p);
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
 		task_rq_unlock(rq, &flags);
@@ -1895,13 +1900,21 @@ static int sched_balance_self(int cpu, i
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex)
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
 	long old_state;
 	struct rq *rq;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * sync wakeups can increase wakeup latencies:
+	 */
+	if (rt_task(p))
+		sync = 0;
+#endif
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
@@ -1966,9 +1979,13 @@ out_activate:
 	success = 1;
 
 out_running:
+	trace_kernel_sched_wakeup(rq, p);
 	check_preempt_curr(rq, p);
 
-	p->state = TASK_RUNNING;
+	if (mutex)
+		p->state = TASK_RUNNING_MUTEX;
+	else
+		p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -1981,13 +1998,31 @@ out:
 
 int wake_up_process(struct task_struct *p)
 {
-	return try_to_wake_up(p, TASK_ALL, 0);
+	return try_to_wake_up(p, TASK_ALL, 0, 0);
 }
 EXPORT_SYMBOL(wake_up_process);
 
+int  wake_up_process_sync(struct task_struct * p)
+{
+	return try_to_wake_up(p, TASK_ALL, 1, 0);
+}
+EXPORT_SYMBOL(wake_up_process_sync);
+
+int  wake_up_process_mutex(struct task_struct * p)
+{
+	return try_to_wake_up(p, TASK_ALL, 0, 1);
+}
+EXPORT_SYMBOL(wake_up_process_mutex);
+
+int  wake_up_process_mutex_sync(struct task_struct * p)
+{
+	return try_to_wake_up(p, TASK_ALL, 1, 1);
+}
+EXPORT_SYMBOL(wake_up_process_mutex_sync);
+
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
-	return try_to_wake_up(p, state, 0);
+	return try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0);
 }
 
 /*
@@ -2057,7 +2092,7 @@ void sched_fork(struct task_struct *p, i
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
 	p->oncpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
@@ -2095,6 +2130,7 @@ void wake_up_new_task(struct task_struct
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(p, rq);
 	}
+	trace_kernel_sched_wakeup_new(rq, p);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2132,8 +2168,17 @@ static void fire_sched_in_preempt_notifi
 	struct preempt_notifier *notifier;
 	struct hlist_node *node;
 
+	if (hlist_empty(&curr->preempt_notifiers))
+		return;
+
+	/*
+	 * The KVM sched in notifier expects to be called with
+	 * interrupts enabled.
+	 */
+	local_irq_enable();
 	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
+	local_irq_disable();
 }
 
 static void
@@ -2161,6 +2206,26 @@ fire_sched_out_preempt_notifiers(struct 
 
 #endif
 
+#ifdef CONFIG_DEBUG_PREEMPT
+void notrace preempt_enable_no_resched(void)
+{
+	static int once = 1;
+
+	barrier();
+	dec_preempt_count();
+
+	if (once && !preempt_count()) {
+		once = 0;
+		printk(KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n",
+			current->comm, current->pid);
+		dump_stack();
+	}
+}
+
+EXPORT_SYMBOL(preempt_enable_no_resched);
+#endif
+
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -2218,7 +2283,7 @@ static void finish_task_switch(struct rq
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_state = prev->state;
-	finish_arch_switch(prev);
+	_finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (current->sched_class->post_schedule)
@@ -2226,8 +2291,12 @@ static void finish_task_switch(struct rq
 #endif
 
 	fire_sched_in_preempt_notifiers(current);
+	/*
+	 * Delay the final freeing of the mm or task, so that we dont have
+	 * to do complex work from within the scheduler:
+	 */
 	if (mm)
-		mmdrop(mm);
+ 		mmdrop_delayed(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
@@ -2245,12 +2314,15 @@ static void finish_task_switch(struct rq
 asmlinkage void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
-	struct rq *rq = this_rq();
-
-	finish_task_switch(rq, prev);
+	preempt_disable(); // TODO: move this to fork setup
+	finish_task_switch(this_rq(), prev);
+	__preempt_enable_no_resched();
+	local_irq_enable();
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
+#else
+	preempt_check_resched();
 #endif
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
@@ -2267,6 +2339,8 @@ context_switch(struct rq *rq, struct tas
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
+
+	trace_kernel_sched_switch(rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -2297,6 +2371,11 @@ context_switch(struct rq *rq, struct tas
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 
+#ifdef CURRENT_PTR
+	barrier();
+	*current_ptr = next;
+	*current_ti_ptr = next->thread_info;
+#endif
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 
@@ -2343,6 +2422,11 @@ unsigned long nr_uninterruptible(void)
 	return sum;
 }
 
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+	return cpu_rq(cpu)->nr_uninterruptible;
+}
+
 unsigned long long nr_context_switches(void)
 {
 	int i;
@@ -2361,6 +2445,13 @@ unsigned long nr_iowait(void)
 	for_each_possible_cpu(i)
 		sum += atomic_read(&cpu_rq(i)->nr_iowait);
 
+	/*
+	 * Since we read the counters lockless, it might be slightly
+	 * inaccurate. Do not allow it to go below zero though:
+	 */
+	if (unlikely((long)sum < 0))
+		sum = 0;
+
 	return sum;
 }
 
@@ -2499,6 +2590,7 @@ static void sched_migrate_task(struct ta
 	    || unlikely(cpu_is_offline(dest_cpu)))
 		goto out;
 
+	trace_kernel_sched_migrate_task(p, cpu_of(rq), dest_cpu);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
@@ -3551,7 +3643,7 @@ out:
  */
 static void run_rebalance_domains(struct softirq_action *h)
 {
-	int this_cpu = smp_processor_id();
+	int this_cpu = raw_smp_processor_id();
 	struct rq *this_rq = cpu_rq(this_cpu);
 	enum cpu_idle_type idle = this_rq->idle_at_tick ?
 						CPU_IDLE : CPU_NOT_IDLE;
@@ -3703,7 +3795,9 @@ void account_user_time(struct task_struc
 
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
-	if (TASK_NICE(p) > 0)
+	if (rt_task(p))
+		cpustat->user_rt = cputime64_add(cpustat->user_rt, tmp);
+	else if (TASK_NICE(p) > 0)
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -3758,10 +3852,12 @@ void account_system_time(struct task_str
 
 	/* Add system time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
-	if (hardirq_count() - hardirq_offset)
+	if (hardirq_count() - hardirq_offset || (p->flags & PF_HARDIRQ))
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
-	else if (softirq_count())
+	else if (softirq_count() || (p->flags & PF_SOFTIRQ))
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	else if (rt_task(p))
+		cpustat->system_rt = cputime64_add(cpustat->system_rt, tmp);
 	else if (p != rq->idle)
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 	else if (atomic_read(&rq->nr_iowait) > 0)
@@ -3816,18 +3912,13 @@ void scheduler_tick(void)
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
 	struct task_struct *curr = rq->curr;
-	u64 next_tick = rq->tick_timestamp + TICK_NSEC;
+
+	sched_clock_tick();
+
+	BUG_ON(!irqs_disabled());
 
 	spin_lock(&rq->lock);
-	__update_rq_clock(rq);
-	/*
-	 * Let rq->clock advance by at least TICK_NSEC:
-	 */
-	if (unlikely(rq->clock < next_tick)) {
-		rq->clock = next_tick;
-		rq->clock_underflows++;
-	}
-	rq->tick_timestamp = rq->clock;
+	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_sched_rt_period(rq);
@@ -3839,26 +3930,57 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER) || \
+				defined(CONFIG_PREEMPT_TRACE))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
 
 void __kprobes add_preempt_count(int val)
 {
+	unsigned long eip = CALLER_ADDR0;
+	unsigned long parent_eip = get_parent_ip(CALLER_ADDR1);
+
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
+#endif
 	preempt_count() += val;
+#ifdef CONFIG_PREEMPT_TRACE
+	if (val <= 10) {
+		unsigned int idx = preempt_count() & PREEMPT_MASK;
+		if (idx < MAX_PREEMPT_TRACE) {
+			current->preempt_trace_eip[idx] = eip;
+			current->preempt_trace_parent_eip[idx] = parent_eip;
+		}
+	}
+#endif
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(eip, parent_eip);
 }
 EXPORT_SYMBOL(add_preempt_count);
 
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
@@ -3870,7 +3992,10 @@ void __kprobes sub_preempt_count(int val
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
+#endif
 
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
@@ -3884,8 +4009,8 @@ static noinline void __schedule_bug(stru
 {
 	struct pt_regs *regs = get_irq_regs();
 
-	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
-		prev->comm, prev->pid, preempt_count());
+	printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n",
+	       prev->comm, preempt_count(), prev->pid, smp_processor_id());
 
 	debug_show_held_locks(prev);
 	if (irqs_disabled())
@@ -3902,6 +4027,8 @@ static noinline void __schedule_bug(stru
  */
 static inline void schedule_debug(struct task_struct *prev)
 {
+	WARN_ON(system_state == SYSTEM_BOOTING);
+
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path for now.
@@ -3956,14 +4083,15 @@ pick_next_task(struct rq *rq, struct tas
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
-need_resched:
+	rcu_preempt_boost();
+
 	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
@@ -3972,7 +4100,6 @@ need_resched:
 	switch_count = &prev->nivcsw;
 
 	release_kernel_lock(prev);
-need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
@@ -3982,20 +4109,27 @@ need_resched_nonpreemptible:
 	 * Do the rq-clock update outside the rq lock:
 	 */
 	local_irq_disable();
-	__update_rq_clock(rq);
+	update_rq_clock(rq);
 	spin_lock(&rq->lock);
+	cpu = smp_processor_id();
 	clear_tsk_need_resched(prev);
+	clear_tsk_need_resched_delayed(prev);
 
-	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+	if ((prev->state & ~TASK_RUNNING_MUTEX) &&
+			!(preempt_count() & PREEMPT_ACTIVE)) {
+  		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
 				signal_pending(prev))) {
 			prev->state = TASK_RUNNING;
 		} else {
+			touch_softlockup_watchdog();
 			deactivate_task(rq, prev, 1);
 		}
 		switch_count = &prev->nvcsw;
 	}
 
+	if (preempt_count() & PREEMPT_ACTIVE)
+		sub_preempt_count(PREEMPT_ACTIVE);
+
 #ifdef CONFIG_SMP
 	if (prev->sched_class->pre_schedule)
 		prev->sched_class->pre_schedule(rq, prev);
@@ -4021,21 +4155,93 @@ need_resched_nonpreemptible:
 		 */
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
-	} else
-		spin_unlock_irq(&rq->lock);
+		__preempt_enable_no_resched();
+	} else {
+		__preempt_enable_no_resched();
+		spin_unlock(&rq->lock);
+	}
 
 	hrtick_set(rq);
 
-	if (unlikely(reacquire_kernel_lock(current) < 0))
-		goto need_resched_nonpreemptible;
+	reacquire_kernel_lock(current);
+	if (!irqs_disabled()) {
+		static int once = 1;
+		if (once) {
+			once = 0;
+			print_irqtrace_events(current);
+			WARN_ON(1);
+		}
+	}
 
-	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
-		goto need_resched;
+}
+
+/*
+ * schedule() is the main scheduler function.
+ */
+asmlinkage void __sched schedule(void)
+{
+	WARN_ON(system_state == SYSTEM_BOOTING);
+	/*
+	 * Test if we have interrupts disabled.
+	 */
+	if (unlikely(irqs_disabled())) {
+		printk(KERN_ERR "BUG: scheduling with irqs disabled: "
+		       "%s/0x%08x/%d\n", current->comm, preempt_count(),
+		       current->pid);
+		print_symbol("caller is %s\n",
+			     (long)__builtin_return_address(0));
+		dump_stack();
+	}
+
+	if (unlikely(current->flags & PF_NOSCHED)) {
+		current->flags &= ~PF_NOSCHED;
+		printk(KERN_ERR "%s:%d userspace BUG: scheduling in "
+		       "user-atomic context!\n", current->comm, current->pid);
+		dump_stack();
+		send_sig(SIGUSR2, current, 1);
+	}
+
+	local_irq_disable();
+
+	do {
+		__schedule();
+	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED) ||
+			  test_thread_flag(TIF_NEED_RESCHED_DELAYED)));
+
+	local_irq_enable();
 }
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_PREEMPT
+
+/*
+ * Global flag to turn preemption off on a CONFIG_PREEMPT kernel:
+ */
+int kernel_preemption = 1;
+
+static int __init preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3)) {
+		if (kernel_preemption) {
+			printk(KERN_INFO "turning off kernel preemption!\n");
+			kernel_preemption = 0;
+		}
+		return 1;
+	}
+	if (!strncmp(str, "on", 2)) {
+		if (!kernel_preemption) {
+			printk(KERN_INFO "turning on kernel preemption!\n");
+			kernel_preemption = 1;
+		}
+		return 1;
+	}
+	get_option(&str, &kernel_preemption);
+
+	return 1;
+}
+
+__setup("preempt=", preempt_setup);
+
 /*
  * this is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4047,6 +4253,8 @@ asmlinkage void __sched preempt_schedule
 	struct task_struct *task = current;
 	int saved_lock_depth;
 
+	if (!kernel_preemption)
+		return;
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
@@ -4055,6 +4263,7 @@ asmlinkage void __sched preempt_schedule
 		return;
 
 	do {
+		local_irq_disable();
 		add_preempt_count(PREEMPT_ACTIVE);
 
 		/*
@@ -4064,9 +4273,9 @@ asmlinkage void __sched preempt_schedule
 		 */
 		saved_lock_depth = task->lock_depth;
 		task->lock_depth = -1;
-		schedule();
+		__schedule();
 		task->lock_depth = saved_lock_depth;
-		sub_preempt_count(PREEMPT_ACTIVE);
+		local_irq_enable();
 
 		/*
 		 * Check again in case we missed a preemption opportunity
@@ -4078,10 +4287,10 @@ asmlinkage void __sched preempt_schedule
 EXPORT_SYMBOL(preempt_schedule);
 
 /*
- * this is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+ * this is is the entry point for the IRQ return path. Called with
+ * interrupts disabled.  To avoid infinite irq-entry recursion problems
+ * with fast-paced IRQ sources we do all of this carefully to never
+ * enable interrupts again.
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
@@ -4089,10 +4298,17 @@ asmlinkage void __sched preempt_schedule
 	struct task_struct *task = current;
 	int saved_lock_depth;
 
-	/* Catch callers which need to be fixed */
-	BUG_ON(ti->preempt_count || !irqs_disabled());
+	if (!kernel_preemption)
+		return;
+	/*
+	 * If there is a non-zero preempt_count then just return.
+	 * (interrupts are disabled)
+	 */
+	if (unlikely(ti->preempt_count))
+		return;
 
 	do {
+		local_irq_disable();
 		add_preempt_count(PREEMPT_ACTIVE);
 
 		/*
@@ -4102,11 +4318,9 @@ asmlinkage void __sched preempt_schedule
 		 */
 		saved_lock_depth = task->lock_depth;
 		task->lock_depth = -1;
-		local_irq_enable();
-		schedule();
+		__schedule();
 		local_irq_disable();
 		task->lock_depth = saved_lock_depth;
-		sub_preempt_count(PREEMPT_ACTIVE);
 
 		/*
 		 * Check again in case we missed a preemption opportunity
@@ -4121,7 +4335,8 @@ asmlinkage void __sched preempt_schedule
 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
 			  void *key)
 {
-	return try_to_wake_up(curr->private, mode, sync);
+	return try_to_wake_up(curr->private, mode | TASK_RUNNING_MUTEX,
+			      sync, 0);
 }
 EXPORT_SYMBOL(default_wake_function);
 
@@ -4161,8 +4376,9 @@ void __wake_up(wait_queue_head_t *q, uns
 	unsigned long flags;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, 0, key);
+	__wake_up_common(q, mode, nr_exclusive, 1, key);
 	spin_unlock_irqrestore(&q->lock, flags);
+	preempt_check_resched_delayed();
 }
 EXPORT_SYMBOL(__wake_up);
 
@@ -4211,8 +4427,9 @@ void complete(struct completion *x)
 
 	spin_lock_irqsave(&x->wait.lock, flags);
 	x->done++;
-	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+	__wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
+	preempt_check_resched_delayed();
 }
 EXPORT_SYMBOL(complete);
 
@@ -4222,11 +4439,17 @@ void complete_all(struct completion *x)
 
 	spin_lock_irqsave(&x->wait.lock, flags);
 	x->done += UINT_MAX/2;
-	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+	__wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
 EXPORT_SYMBOL(complete_all);
 
+unsigned int  completion_done(struct completion *x)
+{
+	return x->done;
+}
+EXPORT_SYMBOL(completion_done);
+
 static inline long __sched
 do_wait_for_common(struct completion *x, long timeout, int state)
 {
@@ -4354,28 +4577,47 @@ long __sched sleep_on_timeout(wait_queue
 }
 EXPORT_SYMBOL(sleep_on_timeout);
 
-#ifdef CONFIG_RT_MUTEXES
-
 /*
- * rt_mutex_setprio - set the current priority of a task
+ * task_setprio - set the current priority of a task
  * @p: task
  * @prio: prio value (kernel-internal form)
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
  *
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance logic
+ * and by rcupreempt-boost to boost priorities of tasks sleeping
+ * with rcu locks.
  */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void task_setprio(struct task_struct *p, int prio)
 {
 	unsigned long flags;
-	int oldprio, on_rq, running;
+	int oldprio, prev_resched, on_rq, running;
 	struct rq *rq;
 	const struct sched_class *prev_class = p->sched_class;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
 	rq = task_rq_lock(p, &flags);
+
+	/*
+	 * Idle task boosting is a nono in general. There is one
+	 * exception, when NOHZ is active:
+	 *
+	 * The idle task calls get_next_timer_interrupt() and holds
+	 * the timer wheel base->lock on the CPU and another CPU wants
+	 * to access the timer (probably to cancel it). We can safely
+	 * ignore the boosting request, as the idle CPU runs this code
+	 * with interrupts disabled and will complete the lock
+	 * protected section without being interrupted. So there is no
+	 * real need to boost.
+	 */
+	if (unlikely(p == rq->idle)) {
+		WARN_ON(p != rq->curr);
+		WARN_ON(p->pi_blocked_on);
+		goto out_unlock;
+	}
+
 	update_rq_clock(rq);
 
 	oldprio = p->prio;
@@ -4393,6 +4635,9 @@ void rt_mutex_setprio(struct task_struct
 
 	p->prio = prio;
 
+//	trace_special_pid(p->pid, __PRIO(oldprio), PRIO(p));
+	prev_resched = _need_resched();
+
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
@@ -4400,11 +4645,12 @@ void rt_mutex_setprio(struct task_struct
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
+//	trace_special(prev_resched, _need_resched(), 0);
+
+out_unlock:
 	task_rq_unlock(rq, &flags);
 }
 
-#endif
-
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
@@ -4999,19 +5245,19 @@ asmlinkage long sys_sched_yield(void)
 	 * Since we are going to call schedule() anyway, there's
 	 * no need to preempt or enable interrupts:
 	 */
-	__release(rq->lock);
-	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-	_raw_spin_unlock(&rq->lock);
-	preempt_enable_no_resched();
+	spin_unlock_no_resched(&rq->lock);
 
-	schedule();
+	__schedule();
+
+	local_irq_enable();
+	preempt_check_resched();
 
 	return 0;
 }
 
 static void __cond_resched(void)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
 	__might_sleep(__FILE__, __LINE__);
 #endif
 	/*
@@ -5020,10 +5266,11 @@ static void __cond_resched(void)
 	 * cond_resched() call.
 	 */
 	do {
+		local_irq_disable();
 		add_preempt_count(PREEMPT_ACTIVE);
-		schedule();
-		sub_preempt_count(PREEMPT_ACTIVE);
+		__schedule();
 	} while (need_resched());
+	local_irq_enable();
 }
 
 #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -5047,13 +5294,13 @@ EXPORT_SYMBOL(_cond_resched);
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_raw_spinlock(raw_spinlock_t *lock)
 {
 	int resched = need_resched() && system_state == SYSTEM_RUNNING;
 	int ret = 0;
 
 	if (spin_needbreak(lock) || resched) {
-		spin_unlock(lock);
+		spin_unlock_no_resched(lock);
 		if (resched && need_resched())
 			__cond_resched();
 		else
@@ -5063,12 +5310,35 @@ int cond_resched_lock(spinlock_t *lock)
 	}
 	return ret;
 }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_raw_spinlock);
 
-int __sched cond_resched_softirq(void)
+#ifdef CONFIG_PREEMPT_RT
+
+int __cond_resched_spinlock(spinlock_t *lock)
 {
-	BUG_ON(!in_softirq());
+#if (defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)) || defined(CONFIG_PREEMPT_RT)
+	if (lock->break_lock) {
+		lock->break_lock = 0;
+		spin_unlock_no_resched(lock);
+		__cond_resched();
+		spin_lock(lock);
+		return 1;
+	}
+#endif
+	return 0;
+}
+EXPORT_SYMBOL(__cond_resched_spinlock);
 
+#endif
+
+/*
+ * Voluntarily preempt a process context that has softirqs disabled:
+ */
+int __sched cond_resched_softirq(void)
+{
+#ifndef CONFIG_PREEMPT_SOFTIRQS
+	WARN_ON_ONCE(!in_softirq());
+#endif
 	if (need_resched() && system_state == SYSTEM_RUNNING) {
 		local_bh_enable();
 		__cond_resched();
@@ -5079,17 +5349,102 @@ int __sched cond_resched_softirq(void)
 }
 EXPORT_SYMBOL(cond_resched_softirq);
 
+/*
+ * Voluntarily preempt a softirq context (possible with softirq threading):
+ */
+int __sched cond_resched_softirq_context(void)
+{
+	WARN_ON_ONCE(!in_softirq());
+
+	if (softirq_need_resched() && system_state == SYSTEM_RUNNING) {
+		raw_local_irq_disable();
+		_local_bh_enable();
+		raw_local_irq_enable();
+		__cond_resched();
+		local_bh_disable();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cond_resched_softirq_context);
+
+/*
+ * Preempt a hardirq context if necessary (possible with hardirq threading):
+ */
+int cond_resched_hardirq_context(void)
+{
+	WARN_ON_ONCE(!in_irq());
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (hardirq_need_resched()) {
+#ifndef CONFIG_PREEMPT_RT
+		irq_exit();
+#endif
+		local_irq_enable();
+		__cond_resched();
+#ifndef CONFIG_PREEMPT_RT
+		local_irq_disable();
+		__irq_enter();
+#endif
+
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cond_resched_hardirq_context);
+
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+
+int voluntary_preemption = 1;
+
+EXPORT_SYMBOL(voluntary_preemption);
+
+static int __init voluntary_preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3))
+		voluntary_preemption = 0;
+	else
+		get_option(&str, &voluntary_preemption);
+	if (!voluntary_preemption)
+		printk("turning off voluntary preemption!\n");
+
+	return 1;
+}
+
+__setup("voluntary-preempt=", voluntary_preempt_setup);
+
+#endif
+
 /**
  * yield - yield the current processor to other threads.
  *
  * This is a shortcut for kernel-space yielding - it marks the
  * thread runnable and calls sys_sched_yield().
  */
-void __sched yield(void)
+void __sched __yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
 }
+
+void __sched yield(void)
+{
+	static int once = 1;
+
+	/*
+	 * it's a bug to rely on yield() with RT priorities. We print
+	 * the first occurance after bootup ... this will still give
+	 * us an idea about the scope of the problem, without spamming
+	 * the syslog:
+	 */
+	if (once && rt_task(current)) {
+		once = 0;
+		printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n",
+			current->comm, current->pid);
+		dump_stack();
+	}
+	__yield();
+}
 EXPORT_SYMBOL(yield);
 
 /*
@@ -5229,7 +5584,7 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 
 void sched_show_task(struct task_struct *p)
 {
@@ -5237,19 +5592,23 @@ void sched_show_task(struct task_struct 
 	unsigned state;
 
 	state = p->state ? __ffs(p->state) + 1 : 0;
-	printk(KERN_INFO "%-13.13s %c", p->comm,
-		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+	printk("%-13.13s %c [%p]", p->comm,
+		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?', p);
 #if BITS_PER_LONG == 32
-	if (state == TASK_RUNNING)
+	if (0 && (state == TASK_RUNNING))
 		printk(KERN_CONT " running  ");
 	else
 		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
 #else
-	if (state == TASK_RUNNING)
+	if (0 && (state == TASK_RUNNING))
 		printk(KERN_CONT "  running task    ");
 	else
 		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
+	if (task_curr(p))
+		printk("[curr] ");
+	else if (p->se.on_rq)
+		printk("[on rq #%d] ", task_cpu(p));
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	{
 		unsigned long *n = end_of_stack(p);
@@ -5267,6 +5626,7 @@ void sched_show_task(struct task_struct 
 void show_state_filter(unsigned long state_filter)
 {
 	struct task_struct *g, *p;
+	int do_unlock = 1;
 
 #if BITS_PER_LONG == 32
 	printk(KERN_INFO
@@ -5275,7 +5635,16 @@ void show_state_filter(unsigned long sta
 	printk(KERN_INFO
 		"  task                        PC stack   pid father\n");
 #endif
+#ifdef CONFIG_PREEMPT_RT
+	if (!read_trylock(&tasklist_lock)) {
+		printk("hm, tasklist_lock write-locked.\n");
+		printk("ignoring ...\n");
+		do_unlock = 0;
+	}
+#else
 	read_lock(&tasklist_lock);
+#endif
+
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
@@ -5291,7 +5660,8 @@ void show_state_filter(unsigned long sta
 #ifdef CONFIG_SCHED_DEBUG
 	sysrq_sched_debug_show();
 #endif
-	read_unlock(&tasklist_lock);
+	if (do_unlock)
+		read_unlock(&tasklist_lock);
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
@@ -5326,7 +5696,7 @@ void __cpuinit init_idle(struct task_str
 
 	spin_lock_irqsave(&rq->lock, flags);
 	rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
 	idle->oncpu = 1;
 #endif
 	spin_unlock_irqrestore(&rq->lock, flags);
@@ -5454,11 +5824,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
 	struct rq *rq_dest, *rq_src;
+	unsigned long flags;
 	int ret = 0, on_rq;
 
 	if (unlikely(cpu_is_offline(dest_cpu)))
 		return ret;
 
+	/*
+	 * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock)
+	 * disabling interrupts - which on PREEMPT_RT does not do:
+	 */
+	local_irq_save(flags);
+
 	rq_src = cpu_rq(src_cpu);
 	rq_dest = cpu_rq(dest_cpu);
 
@@ -5482,6 +5859,8 @@ static int __migrate_task(struct task_st
 	ret = 1;
 out:
 	double_rq_unlock(rq_src, rq_dest);
+	local_irq_restore(flags);
+
 	return ret;
 }
 
@@ -6193,6 +6572,7 @@ static void rq_attach_root(struct rq *rq
 {
 	unsigned long flags;
 	const struct sched_class *class;
+	struct root_domain *reap = NULL;
 
 	spin_lock_irqsave(&rq->lock, flags);
 
@@ -6208,7 +6588,7 @@ static void rq_attach_root(struct rq *rq
 		cpu_clear(rq->cpu, old_rd->online);
 
 		if (atomic_dec_and_test(&old_rd->refcount))
-			kfree(old_rd);
+			reap = old_rd;
 	}
 
 	atomic_inc(&rd->refcount);
@@ -6224,6 +6604,10 @@ static void rq_attach_root(struct rq *rq
 	}
 
 	spin_unlock_irqrestore(&rq->lock, flags);
+
+	/* Don't try to free the memory while in-atomic() */
+	if (unlikely(reap))
+		kfree(reap);
 }
 
 static void init_rootdomain(struct root_domain *rd)
@@ -6232,6 +6616,8 @@ static void init_rootdomain(struct root_
 
 	cpus_clear(rd->span);
 	cpus_clear(rd->online);
+
+	cpupri_init(&rd->cpupri);
 }
 
 static void init_defrootdomain(void)
@@ -6955,7 +7341,6 @@ static void detach_destroy_domains(const
 
 	for_each_cpu_mask(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
-	synchronize_sched();
 	arch_destroy_sched_domains(cpu_map);
 }
 
@@ -7194,7 +7579,8 @@ static void init_rt_rq(struct rt_rq *rt_
 
 	array = &rt_rq->active;
 	for (i = 0; i < MAX_RT_PRIO; i++) {
-		INIT_LIST_HEAD(array->queue + i);
+		INIT_LIST_HEAD(array->xqueue + i);
+		INIT_LIST_HEAD(array->squeue + i);
 		__clear_bit(i, array->bitmap);
 	}
 	/* delimiter for bitsearch: */
@@ -7277,7 +7663,6 @@ void __init sched_init(void)
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
-		rq->clock = 1;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7289,8 +7674,7 @@ void __init sched_init(void)
 
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
-		init_task_group.rt_runtime =
-			sysctl_sched_rt_runtime * NSEC_PER_USEC;
+		init_task_group.rt_runtime = global_rt_runtime();
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
@@ -7338,6 +7722,9 @@ void __init sched_init(void)
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current);
 
+#ifdef CONFIG_PREEMPT_RT
+	printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n");
+#endif
 	/*
 	 * Make us the idle thread. Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
@@ -7353,7 +7740,7 @@ void __init sched_init(void)
 	scheduler_running = 1;
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
 void __might_sleep(char *file, int line)
 {
 #ifdef in_atomic
@@ -7361,13 +7748,16 @@ void __might_sleep(char *file, int line)
 
 	if ((in_atomic() || irqs_disabled()) &&
 	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
+		if (debug_direct_keyboard && hardirq_count())
+			return;
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
 		printk(KERN_ERR "BUG: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
+				" context %s(%d) at %s:%d\n",
+				current->comm, current->pid, file, line);
+		printk("in_atomic():%d [%08x], irqs_disabled():%d\n",
+			in_atomic(), preempt_count(), irqs_disabled());
 		debug_show_held_locks(current);
 		if (irqs_disabled())
 			print_irqtrace_events(current);
@@ -7382,6 +7772,7 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	int on_rq;
+
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	if (on_rq)
@@ -7413,7 +7804,6 @@ void normalize_rt_tasks(void)
 		p->se.sleep_start		= 0;
 		p->se.block_start		= 0;
 #endif
-		task_rq(p)->clock		= 0;
 
 		if (!rt_task(p)) {
 			/*
Index: linux-2.6.25.4-rt4/kernel/rcupdate.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rcupdate.c	2008-05-29 09:46:00.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rcupdate.c	2008-05-29 09:46:02.000000000 -0400
@@ -85,8 +85,11 @@ void synchronize_rcu(void)
 	/* Will wake me after RCU finished */
 	call_rcu(&rcu.head, wakeme_after_rcu);
 
+	rcu_boost_readers();
+
 	/* Wait for it */
 	wait_for_completion(&rcu.completion);
+	rcu_unboost_readers();
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
Index: linux-2.6.25.4-rt4/include/linux/rcuclassic.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/rcuclassic.h	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/rcuclassic.h	2008-05-29 09:46:48.000000000 -0400
@@ -51,7 +51,7 @@ struct rcu_ctrlblk {
 
 	int	signaled;
 
-	spinlock_t	lock	____cacheline_internodealigned_in_smp;
+	raw_spinlock_t	lock	____cacheline_internodealigned_in_smp;
 	cpumask_t	cpumask; /* CPUs that need to switch in order    */
 				 /* for current batch to proceed.        */
 } ____cacheline_internodealigned_in_smp;
@@ -162,6 +162,10 @@ extern long rcu_batches_completed_bh(voi
 
 #define rcu_enter_nohz()	do { } while (0)
 #define rcu_exit_nohz()		do { } while (0)
+#define rcu_preempt_boost_init() do { } while (0)
+
+struct softirq_action;
+extern void rcu_process_callbacks(struct softirq_action *unused);
 
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUCLASSIC_H */
Index: linux-2.6.25.4-rt4/kernel/rcuclassic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rcuclassic.c	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rcuclassic.c	2008-05-29 09:46:48.000000000 -0400
@@ -60,13 +60,13 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
 static struct rcu_ctrlblk rcu_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
+	.lock = RAW_SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
 	.cpumask = CPU_MASK_NONE,
 };
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
+	.lock = RAW_SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
 	.cpumask = CPU_MASK_NONE,
 };
 
@@ -402,6 +402,8 @@ static void rcu_offline_cpu(int cpu)
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
+	unsigned long flags;
+
 	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
 		*rdp->donetail = rdp->curlist;
 		rdp->donetail = rdp->curtail;
@@ -410,12 +412,12 @@ static void __rcu_process_callbacks(stru
 	}
 
 	if (rdp->nxtlist && !rdp->curlist) {
-		local_irq_disable();
+		local_irq_save(flags);
 		rdp->curlist = rdp->nxtlist;
 		rdp->curtail = rdp->nxttail;
 		rdp->nxtlist = NULL;
 		rdp->nxttail = &rdp->nxtlist;
-		local_irq_enable();
+		local_irq_restore(flags);
 
 		/*
 		 * start the next batch of callbacks
@@ -442,7 +444,7 @@ static void __rcu_process_callbacks(stru
 		rcu_do_batch(rdp);
 }
 
-static void rcu_process_callbacks(struct softirq_action *unused)
+void rcu_process_callbacks(struct softirq_action *unused)
 {
 	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
 	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
@@ -497,6 +499,17 @@ int rcu_needs_cpu(int cpu)
 	return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
 }
 
+void rcu_advance_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+	} else if (!in_softirq())
+		rcu_bh_qsctr_inc(cpu);
+}
+
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
Index: linux-2.6.25.4-rt4/kernel/rcutorture.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rcutorture.c	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rcutorture.c	2008-05-29 09:46:58.000000000 -0400
@@ -52,11 +52,13 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck
 
 static int nreaders = -1;	/* # reader threads, defaults to 2*ncpus */
 static int nfakewriters = 4;	/* # fake writer threads */
+static int npreempthogs = -1; 	/* # preempt hogs to run (defaults to ncpus-1) or 1 */
 static int stat_interval;	/* Interval between stats, in seconds. */
 				/*  Defaults to "only at end of test". */
 static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
+static int preempt_torture;	/* Realtime task preempts torture readers. */
 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
 
 module_param(nreaders, int, 0444);
@@ -71,6 +73,8 @@ module_param(test_no_idle_hz, bool, 0444
 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
 module_param(shuffle_interval, int, 0444);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
+module_param(preempt_torture, bool, 0444);
+MODULE_PARM_DESC(preempt_torture, "Enable realtime preemption torture");
 module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
 
@@ -85,9 +89,11 @@ MODULE_PARM_DESC(torture_type, "Type of 
 static char printk_buf[4096];
 
 static int nrealreaders;
+static int nrealpreempthogs;
 static struct task_struct *writer_task;
 static struct task_struct **fakewriter_tasks;
 static struct task_struct **reader_tasks;
+static struct task_struct **rcu_preempt_tasks;
 static struct task_struct *stats_task;
 static struct task_struct *shuffler_task;
 
@@ -191,6 +197,8 @@ struct rcu_torture_ops {
 	int (*completed)(void);
 	void (*deferredfree)(struct rcu_torture *p);
 	void (*sync)(void);
+	long (*preemptstart)(void);
+	void (*preemptend)(void);
 	int (*stats)(char *page);
 	char *name;
 };
@@ -255,16 +263,93 @@ static void rcu_torture_deferred_free(st
 	call_rcu(&p->rtort_rcu, rcu_torture_cb);
 }
 
+static unsigned long rcu_torture_preempt_errors;
+
+static int rcu_torture_preempt(void *arg)
+{
+	int completedstart;
+	int err;
+	time_t gcstart;
+	struct sched_param sp;
+
+	sp.sched_priority = 1;
+	err = sched_setscheduler(current, SCHED_RR, &sp);
+	if (err != 0)
+		printk(KERN_ALERT "rcu_torture_preempt() priority err: %d\n",
+		       err);
+	current->flags |= PF_NOFREEZE;
+
+	do {
+		completedstart = rcu_torture_completed();
+		gcstart = xtime.tv_sec;
+		while ((xtime.tv_sec - gcstart < 10) &&
+		       (rcu_torture_completed() == completedstart))
+			cond_resched();
+		if (rcu_torture_completed() == completedstart)
+			rcu_torture_preempt_errors++;
+		schedule_timeout_interruptible(1);
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+static long rcu_preempt_start(void)
+{
+	long retval = 0;
+	int i;
+
+	rcu_preempt_tasks = kzalloc(nrealpreempthogs * sizeof(rcu_preempt_tasks[0]),
+				GFP_KERNEL);
+	if (rcu_preempt_tasks == NULL) {
+		VERBOSE_PRINTK_ERRSTRING("out of memory");
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	for (i=0; i < nrealpreempthogs; i++) {
+		rcu_preempt_tasks[i] = kthread_run(rcu_torture_preempt, NULL,
+						"rcu_torture_preempt");
+		if (IS_ERR(rcu_preempt_tasks[i])) {
+			VERBOSE_PRINTK_ERRSTRING("Failed to create preempter");
+			retval = PTR_ERR(rcu_preempt_tasks[i]);
+			rcu_preempt_tasks[i] = NULL;
+			break;
+		}
+	}
+ out:
+	return retval;
+}
+
+static void rcu_preempt_end(void)
+{
+	int i;
+	if (rcu_preempt_tasks) {
+		for (i=0; i < nrealpreempthogs; i++) {
+			if (rcu_preempt_tasks[i] != NULL) {
+				VERBOSE_PRINTK_STRING("Stopping rcu_preempt task");
+				kthread_stop(rcu_preempt_tasks[i]);
+			}
+			rcu_preempt_tasks[i] = NULL;
+		}
+		kfree(rcu_preempt_tasks);
+	}
+}
+
+static int rcu_preempt_stats(char *page)
+{
+	return sprintf(page,
+		       "Preemption stalls: %lu\n", rcu_torture_preempt_errors);
+}
+
 static struct rcu_torture_ops rcu_ops = {
-	.init = NULL,
-	.cleanup = NULL,
 	.readlock = rcu_torture_read_lock,
 	.readdelay = rcu_read_delay,
 	.readunlock = rcu_torture_read_unlock,
 	.completed = rcu_torture_completed,
 	.deferredfree = rcu_torture_deferred_free,
 	.sync = synchronize_rcu,
-	.stats = NULL,
+	.preemptstart = rcu_preempt_start,
+	.preemptend = rcu_preempt_end,
+	.stats = rcu_preempt_stats,
 	.name = "rcu"
 };
 
@@ -296,14 +381,12 @@ static void rcu_sync_torture_init(void)
 
 static struct rcu_torture_ops rcu_sync_ops = {
 	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
 	.readlock = rcu_torture_read_lock,
 	.readdelay = rcu_read_delay,
 	.readunlock = rcu_torture_read_unlock,
 	.completed = rcu_torture_completed,
 	.deferredfree = rcu_sync_torture_deferred_free,
 	.sync = synchronize_rcu,
-	.stats = NULL,
 	.name = "rcu_sync"
 };
 
@@ -355,28 +438,23 @@ static void rcu_bh_torture_synchronize(v
 }
 
 static struct rcu_torture_ops rcu_bh_ops = {
-	.init = NULL,
-	.cleanup = NULL,
 	.readlock = rcu_bh_torture_read_lock,
 	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock = rcu_bh_torture_read_unlock,
 	.completed = rcu_bh_torture_completed,
 	.deferredfree = rcu_bh_torture_deferred_free,
 	.sync = rcu_bh_torture_synchronize,
-	.stats = NULL,
 	.name = "rcu_bh"
 };
 
 static struct rcu_torture_ops rcu_bh_sync_ops = {
 	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
 	.readlock = rcu_bh_torture_read_lock,
 	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock = rcu_bh_torture_read_unlock,
 	.completed = rcu_bh_torture_completed,
 	.deferredfree = rcu_sync_torture_deferred_free,
 	.sync = rcu_bh_torture_synchronize,
-	.stats = NULL,
 	.name = "rcu_bh_sync"
 };
 
@@ -488,14 +566,12 @@ static void sched_torture_synchronize(vo
 
 static struct rcu_torture_ops sched_ops = {
 	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
 	.readlock = sched_torture_read_lock,
 	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock = sched_torture_read_unlock,
 	.completed = sched_torture_completed,
 	.deferredfree = rcu_sync_torture_deferred_free,
 	.sync = sched_torture_synchronize,
-	.stats = NULL,
 	.name = "sched"
 };
 
@@ -550,10 +626,20 @@ rcu_torture_writer(void *arg)
 static int
 rcu_torture_fakewriter(void *arg)
 {
+	struct sched_param sp;
+	long id = (long) arg;
+	int err;
 	DEFINE_RCU_RANDOM(rand);
 
 	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
-	set_user_nice(current, 19);
+	/*
+	 * Set up at a higher prio than the readers.
+	 */
+	sp.sched_priority = 1 + id;
+	err = sched_setscheduler(current, SCHED_RR, &sp);
+	if (err != 0)
+		printk(KERN_ALERT "rcu_torture_writer() priority err: %d\n",
+		       err);
 
 	do {
 		schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
@@ -592,7 +678,7 @@ rcu_torture_reader(void *arg)
 		if (p == NULL) {
 			/* Wait for rcu_torture_writer to get underway */
 			cur_ops->readunlock(idx);
-			schedule_timeout_interruptible(HZ);
+			schedule_timeout_interruptible(round_jiffies_relative(HZ));
 			continue;
 		}
 		if (p->rtort_mbtest == 0)
@@ -786,10 +872,13 @@ rcu_torture_print_module_parms(char *tag
 {
 	printk(KERN_ALERT "%s" TORTURE_FLAG
 		"--- %s: nreaders=%d nfakewriters=%d "
+	        "npreempthogs=%d "
 		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
-		"shuffle_interval = %d\n",
+		"shuffle_interval=%d preempt_torture=%d\n",
 		torture_type, tag, nrealreaders, nfakewriters,
-		stat_interval, verbose, test_no_idle_hz, shuffle_interval);
+		nrealpreempthogs,
+		stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+		preempt_torture);
 }
 
 static void
@@ -842,6 +931,8 @@ rcu_torture_cleanup(void)
 		kthread_stop(stats_task);
 	}
 	stats_task = NULL;
+	if (preempt_torture && (cur_ops->preemptend != NULL))
+		cur_ops->preemptend();
 
 	/* Wait for all RCU callbacks to fire.  */
 	rcu_barrier();
@@ -859,7 +950,7 @@ rcu_torture_cleanup(void)
 static int __init
 rcu_torture_init(void)
 {
-	int i;
+	long i;
 	int cpu;
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] =
@@ -887,6 +978,12 @@ rcu_torture_init(void)
 	rcu_torture_print_module_parms("Start of test");
 	fullstop = 0;
 
+	if (npreempthogs >= 0)
+		nrealpreempthogs = npreempthogs;
+	else
+		nrealpreempthogs = num_online_cpus() == 1 ? 1 :
+			num_online_cpus() - 1;
+
 	/* Set up the freelist. */
 
 	INIT_LIST_HEAD(&rcu_torture_freelist);
@@ -934,7 +1031,7 @@ rcu_torture_init(void)
 	}
 	for (i = 0; i < nfakewriters; i++) {
 		VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
-		fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
+		fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, (void*)i,
 		                                  "rcu_torture_fakewriter");
 		if (IS_ERR(fakewriter_tasks[i])) {
 			firsterr = PTR_ERR(fakewriter_tasks[i]);
@@ -984,6 +1081,11 @@ rcu_torture_init(void)
 			goto unwind;
 		}
 	}
+	if (preempt_torture && (cur_ops->preemptstart != NULL)) {
+		firsterr = cur_ops->preemptstart();
+		if (firsterr != 0)
+			goto unwind;
+	}
 	return 0;
 
 unwind:
Index: linux-2.6.25.4-rt4/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/time/timekeeping.c	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/time/timekeeping.c	2008-05-29 09:46:48.000000000 -0400
@@ -24,8 +24,9 @@
  * This read-write spinlock protects us from races in SMP while
  * playing with xtime and avenrun.
  */
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+__cacheline_aligned_in_smp DEFINE_RAW_SEQLOCK(xtime_lock);
 
+EXPORT_SYMBOL_GPL(xtime_lock);
 
 /*
  * The current time
@@ -45,6 +46,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOC
 struct timespec xtime __attribute__ ((aligned (16)));
 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
 static unsigned long total_sleep_time;		/* seconds */
+EXPORT_SYMBOL_GPL(xtime);
 
 static struct timespec xtime_cache __attribute__ ((aligned (16)));
 void update_xtime_cache(u64 nsec)
Index: linux-2.6.25.4-rt4/include/linux/srcu.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/srcu.h	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/srcu.h	2008-05-29 09:46:03.000000000 -0400
@@ -27,6 +27,8 @@
 #ifndef _LINUX_SRCU_H
 #define _LINUX_SRCU_H
 
+#include <linux/wait.h>
+
 struct srcu_struct_array {
 	int c[2];
 };
@@ -50,4 +52,24 @@ void srcu_read_unlock(struct srcu_struct
 void synchronize_srcu(struct srcu_struct *sp);
 long srcu_batches_completed(struct srcu_struct *sp);
 
+/*
+ * fully compatible with srcu, but optimized for writers.
+ */
+
+struct qrcu_struct {
+	int completed;
+	atomic_t ctr[2];
+	wait_queue_head_t wq;
+	struct mutex mutex;
+};
+
+int init_qrcu_struct(struct qrcu_struct *qp);
+int qrcu_read_lock(struct qrcu_struct *qp);
+void qrcu_read_unlock(struct qrcu_struct *qp, int idx);
+void synchronize_qrcu(struct qrcu_struct *qp);
+
+static inline void cleanup_qrcu_struct(struct qrcu_struct *qp)
+{
+}
+
 #endif
Index: linux-2.6.25.4-rt4/kernel/srcu.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/srcu.c	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/srcu.c	2008-05-29 09:46:03.000000000 -0400
@@ -255,3 +255,89 @@ EXPORT_SYMBOL_GPL(srcu_read_lock);
 EXPORT_SYMBOL_GPL(srcu_read_unlock);
 EXPORT_SYMBOL_GPL(synchronize_srcu);
 EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+int init_qrcu_struct(struct qrcu_struct *qp)
+{
+	qp->completed = 0;
+	atomic_set(qp->ctr + 0, 1);
+	atomic_set(qp->ctr + 1, 0);
+	init_waitqueue_head(&qp->wq);
+	mutex_init(&qp->mutex);
+
+	return 0;
+}
+
+int qrcu_read_lock(struct qrcu_struct *qp)
+{
+	for (;;) {
+		int idx = qp->completed & 0x1;
+		if (likely(atomic_inc_not_zero(qp->ctr + idx)))
+			return idx;
+	}
+}
+
+void qrcu_read_unlock(struct qrcu_struct *qp, int idx)
+{
+	if (atomic_dec_and_test(qp->ctr + idx))
+		wake_up(&qp->wq);
+}
+
+void synchronize_qrcu(struct qrcu_struct *qp)
+{
+	int idx;
+
+	smp_mb();  /* Force preceding change to happen before fastpath check. */
+
+	/*
+	 * Fastpath: If the two counters sum to "1" at a given point in
+	 * time, there are no readers.  However, it takes two separate
+	 * loads to sample both counters, which won't occur simultaneously.
+	 * So we might race with a counter switch, so that we might see
+	 * ctr[0]==0, then the counter might switch, then we might see
+	 * ctr[1]==1 (unbeknownst to us because there is a reader still
+	 * there).  So we do a read memory barrier and recheck.  If the
+	 * same race happens again, there must have been a second counter
+	 * switch.  This second counter switch could not have happened
+	 * until all preceding readers finished, so if the condition
+	 * is true both times, we may safely proceed.
+	 *
+	 * This relies critically on the atomic increment and atomic
+	 * decrement being seen as executing in order.
+	 */
+
+	if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) {
+		smp_rmb();  /* Keep two checks independent. */
+		if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1)
+			goto out;
+	}
+
+	mutex_lock(&qp->mutex);
+
+	idx = qp->completed & 0x1;
+	if (atomic_read(qp->ctr + idx) == 1)
+		goto out_unlock;
+
+	atomic_inc(qp->ctr + (idx ^ 0x1));
+
+	/*
+	 * Prevent subsequent decrement from being seen before previous
+	 * increment -- such an inversion could cause the fastpath
+	 * above to falsely conclude that there were no readers.  Also,
+	 * reduce the likelihood that qrcu_read_lock() will loop.
+	 */
+
+	smp_mb__after_atomic_inc();
+	qp->completed++;
+
+	atomic_dec(qp->ctr + idx);
+	__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
+out_unlock:
+	mutex_unlock(&qp->mutex);
+out:
+	smp_mb(); /* force subsequent free after qrcu_read_unlock(). */
+}
+
+EXPORT_SYMBOL_GPL(init_qrcu_struct);
+EXPORT_SYMBOL_GPL(qrcu_read_lock);
+EXPORT_SYMBOL_GPL(qrcu_read_unlock);
+EXPORT_SYMBOL_GPL(synchronize_qrcu);
Index: linux-2.6.25.4-rt4/kernel/sched_cpupri.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/sched_cpupri.c	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,174 @@
+/*
+ *  kernel/sched_cpupri.c
+ *
+ *  CPU priority management
+ *
+ *  Copyright (C) 2007-2008 Novell
+ *
+ *  Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ *  This code tracks the priority of each CPU so that global migration
+ *  decisions are easy to calculate.  Each CPU can be in a state as follows:
+ *
+ *                 (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ *  going from the lowest priority to the highest.  CPUs in the INVALID state
+ *  are not eligible for routing.  The system maintains this state with
+ *  a 2 dimensional bitmap (the first for priority class, the second for cpus
+ *  in that class).  Therefore a typical application without affinity
+ *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ *  searches).  For tasks with affinity restrictions, the algorithm has a
+ *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ *  yields the worst case search is fairly contrived.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; version 2
+ *  of the License.
+ */
+
+#include "sched_cpupri.h"
+
+/* Convert between a 140 based task->prio, and our 102 based cpupri */
+static int convert_prio(int prio)
+{
+	int cpupri;
+
+	if (prio == CPUPRI_INVALID)
+		cpupri = CPUPRI_INVALID;
+	else if (prio == MAX_PRIO)
+		cpupri = CPUPRI_IDLE;
+	else if (prio >= MAX_RT_PRIO)
+		cpupri = CPUPRI_NORMAL;
+	else
+		cpupri = MAX_RT_PRIO - prio + 1;
+
+	return cpupri;
+}
+
+#define for_each_cpupri_active(array, idx)                    \
+  for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES);     \
+       idx < CPUPRI_NR_PRIORITIES;                            \
+       idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invokation.  By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times.  While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+		cpumask_t *lowest_mask)
+{
+	int                  idx      = 0;
+	int                  task_pri = convert_prio(p->prio);
+
+	for_each_cpupri_active(cp->pri_active, idx) {
+		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+		cpumask_t mask;
+
+		if (idx >= task_pri)
+			break;
+
+		cpus_and(mask, p->cpus_allowed, vec->mask);
+
+		if (cpus_empty(mask))
+			continue;
+
+		*lowest_mask = mask;
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @pri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+	int                 *currpri = &cp->cpu_to_pri[cpu];
+	int                  oldpri  = *currpri;
+	unsigned long        flags;
+
+	newpri = convert_prio(newpri);
+
+	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+	if (newpri == oldpri)
+		return;
+
+	/*
+	 * If the cpu was currently mapped to a different value, we
+	 * first need to unmap the old value
+	 */
+	if (likely(oldpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		vec->count--;
+		if (!vec->count)
+			clear_bit(oldpri, cp->pri_active);
+		cpu_clear(cpu, vec->mask);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
+
+	if (likely(newpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		cpu_set(cpu, vec->mask);
+		vec->count++;
+		if (vec->count == 1)
+			set_bit(newpri, cp->pri_active);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
+
+	*currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Returns: (void)
+ */
+void cpupri_init(struct cpupri *cp)
+{
+	int i;
+
+	memset(cp, 0, sizeof(*cp));
+
+	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+		spin_lock_init(&vec->lock);
+		vec->count = 0;
+		cpus_clear(vec->mask);
+	}
+
+	for_each_possible_cpu(i)
+		cp->cpu_to_pri[i] = CPUPRI_INVALID;
+}
+
+
Index: linux-2.6.25.4-rt4/kernel/sched_cpupri.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/sched_cpupri.h	2008-05-29 09:46:43.000000000 -0400
@@ -0,0 +1,36 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO
+#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE     0
+#define CPUPRI_NORMAL   1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+	raw_spinlock_t lock;
+	int        count;
+	cpumask_t  mask;
+};
+
+struct cpupri {
+	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+	long              pri_active[CPUPRI_NR_PRI_WORDS];
+	int               cpu_to_pri[NR_CPUS];
+};
+
+#ifdef CONFIG_SMP
+int  cpupri_find(struct cpupri *cp,
+		 struct task_struct *p, cpumask_t *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+void cpupri_init(struct cpupri *cp);
+#else
+#define cpupri_set(cp, cpu, pri) do { } while (0)
+#define cpupri_init() do { } while (0)
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
Index: linux-2.6.25.4-rt4/kernel/sched_rt.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/sched_rt.c	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/sched_rt.c	2008-05-29 09:47:32.000000000 -0400
@@ -33,6 +33,9 @@ static inline void rt_clear_overload(str
 
 static void update_rt_migration(struct rq *rq)
 {
+	if (unlikely(num_online_cpus() == 1))
+		return;
+
 	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
 		if (!rq->rt.overloaded) {
 			rt_set_overload(rq);
@@ -112,6 +115,7 @@ static void sched_rt_rq_dequeue(struct r
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 {
+	return 0;
 	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 }
 
@@ -171,6 +175,7 @@ static inline void sched_rt_rq_dequeue(s
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 {
+	return 0;
 	return rt_rq->rt_throttled;
 }
 #endif
@@ -191,6 +196,7 @@ static int sched_rt_runtime_exceeded(str
 {
 	u64 runtime = sched_rt_runtime(rt_rq);
 
+	return 0;
 	if (runtime == RUNTIME_INF)
 		return 0;
 
@@ -217,6 +223,7 @@ static void update_sched_rt_period(struc
 	struct rt_rq *rt_rq;
 	u64 period;
 
+	return;
 	while (rq->clock > rq->rt_period_expire) {
 		period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 		rq->rt_period_expire += period;
@@ -259,7 +266,7 @@ static void update_curr_rt(struct rq *rq
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
-	rt_rq->rt_time += delta_exec;
+//	rt_rq->rt_time += delta_exec;
 	if (sched_rt_runtime_exceeded(rt_rq))
 		resched_task(curr);
 }
@@ -270,8 +277,11 @@ void inc_rt_tasks(struct sched_rt_entity
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
 		rt_rq->highest_prio = rt_se_prio(rt_se);
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se));
+	}
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1) {
@@ -290,6 +300,10 @@ void inc_rt_tasks(struct sched_rt_entity
 static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+#ifdef CONFIG_SMP
+	int highest_prio = rt_rq->highest_prio;
+#endif
+
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running--;
@@ -310,9 +324,15 @@ void dec_rt_tasks(struct sched_rt_entity
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1) {
 		struct rq *rq = rq_of_rt_rq(rt_rq);
+		BUG_ON(!rq->rt.rt_nr_migratory);
 		rq->rt.rt_nr_migratory--;
 	}
 
+	if (rt_rq->highest_prio != highest_prio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+	}
+
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -332,7 +352,13 @@ static void enqueue_rt_entity(struct sch
 	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
-	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	if (rt_se->nr_cpus_allowed == 1)
+		list_add_tail(&rt_se->run_list,
+			      array->xqueue + rt_se_prio(rt_se));
+	else
+		list_add_tail(&rt_se->run_list,
+			      array->squeue + rt_se_prio(rt_se));
+
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
 	inc_rt_tasks(rt_se, rt_rq);
@@ -344,7 +370,8 @@ static void dequeue_rt_entity(struct sch
 	struct rt_prio_array *array = &rt_rq->active;
 
 	list_del_init(&rt_se->run_list);
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
+	if (list_empty(array->squeue + rt_se_prio(rt_se))
+	    && list_empty(array->xqueue + rt_se_prio(rt_se)))
 		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 
 	dec_rt_tasks(rt_se, rt_rq);
@@ -376,6 +403,55 @@ static void dequeue_rt_stack(struct task
 	} while (top_se);
 }
 
+static inline void incr_rt_nr_uninterruptible(struct task_struct *p,
+					      struct rq *rq)
+{
+	rq->rt.rt_nr_uninterruptible++;
+}
+
+static inline void decr_rt_nr_uninterruptible(struct task_struct *p,
+					      struct rq *rq)
+{
+	rq->rt.rt_nr_uninterruptible--;
+}
+
+unsigned long rt_nr_running(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_online_cpu(i)
+		sum += cpu_rq(i)->rt.rt_nr_running;
+
+	return sum;
+}
+
+unsigned long rt_nr_running_cpu(int cpu)
+{
+	return cpu_rq(cpu)->rt.rt_nr_running;
+}
+
+unsigned long rt_nr_uninterruptible(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_online_cpu(i)
+		sum += cpu_rq(i)->rt.rt_nr_uninterruptible;
+
+	/*
+	 * Since we read the counters lockless, it might be slightly
+	 * inaccurate. Do not allow it to go below zero though:
+	 */
+	if (unlikely((long)sum < 0))
+		sum = 0;
+
+	return sum;
+}
+
+unsigned long rt_nr_uninterruptible_cpu(int cpu)
+{
+	return cpu_rq(cpu)->rt.rt_nr_uninterruptible;
+}
+
 /*
  * Adding/removing a task to/from a priority array:
  */
@@ -393,6 +469,9 @@ static void enqueue_task_rt(struct rq *r
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
+
+	if (p->state == TASK_UNINTERRUPTIBLE)
+		decr_rt_nr_uninterruptible(p, rq);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -402,6 +481,9 @@ static void dequeue_task_rt(struct rq *r
 
 	update_curr_rt(rq);
 
+	if (p->state == TASK_UNINTERRUPTIBLE)
+		incr_rt_nr_uninterruptible(p, rq);
+
 	dequeue_rt_stack(p);
 
 	/*
@@ -417,13 +499,19 @@ static void dequeue_task_rt(struct rq *r
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
+ *
+ * Note: We always enqueue the task to the shared-queue, regardless of its
+ * previous position w.r.t. exclusive vs shared.  This is so that exclusive RR
+ * tasks fairly round-robin with all tasks on the runqueue, not just other
+ * exclusive tasks.
  */
 static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
 	struct rt_prio_array *array = &rt_rq->active;
 
-	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	list_del_init(&rt_se->run_list);
+	list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se));
 }
 
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -481,13 +569,46 @@ static int select_task_rq_rt(struct task
 }
 #endif /* CONFIG_SMP */
 
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+						   struct rt_rq *rt_rq);
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 {
-	if (p->prio < rq->curr->prio)
+	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
+		return;
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * If:
+	 *
+	 * - the newly woken task is of equal priority to the current task
+	 * - the newly woken task is non-migratable while current is migratable
+	 * - current will be preempted on the next reschedule
+	 *
+	 * we should check to see if current can readily move to a different
+	 * cpu.  If so, we will reschedule to allow the push logic to try
+	 * to move current somewhere else, making room for our non-migratable
+	 * task.
+	 */
+	if((p->prio == rq->curr->prio)
+	   && p->rt.nr_cpus_allowed == 1
+	   && rq->curr->rt.nr_cpus_allowed != 1
+	   && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) {
+		cpumask_t mask;
+
+		if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+			/*
+			 * There appears to be other cpus that can accept
+			 * current, so lets reschedule to try and push it away
+			 */
+			resched_task(rq->curr);
+	}
+#endif
 }
 
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -501,8 +622,15 @@ static struct sched_rt_entity *pick_next
 	idx = sched_find_first_bit(array->bitmap);
 	BUG_ON(idx >= MAX_RT_PRIO);
 
-	queue = array->queue + idx;
-	next = list_entry(queue->next, struct sched_rt_entity, run_list);
+	queue = array->xqueue + idx;
+	if (!list_empty(queue))
+		next = list_entry(queue->next, struct sched_rt_entity,
+				  run_list);
+	else {
+		queue = array->squeue + idx;
+		next = list_entry(queue->next, struct sched_rt_entity,
+				  run_list);
+	}
 
 	return next;
 }
@@ -572,7 +700,7 @@ static struct task_struct *pick_next_hig
 			continue;
 		if (next && next->prio < idx)
 			continue;
-		list_for_each_entry(rt_se, array->queue + idx, run_list) {
+		list_for_each_entry(rt_se, array->squeue + idx, run_list) {
 			struct task_struct *p = rt_task_of(rt_se);
 			if (pick_rt_task(rq, p, cpu)) {
 				next = p;
@@ -590,73 +718,6 @@ static struct task_struct *pick_next_hig
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
 
-static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
-{
-	int       lowest_prio = -1;
-	int       lowest_cpu  = -1;
-	int       count       = 0;
-	int       cpu;
-
-	cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-
-	/*
-	 * Scan each rq for the lowest prio.
-	 */
-	for_each_cpu_mask(cpu, *lowest_mask) {
-		struct rq *rq = cpu_rq(cpu);
-
-		/* We look for lowest RT prio or non-rt CPU */
-		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-			/*
-			 * if we already found a low RT queue
-			 * and now we found this non-rt queue
-			 * clear the mask and set our bit.
-			 * Otherwise just return the queue as is
-			 * and the count==1 will cause the algorithm
-			 * to use the first bit found.
-			 */
-			if (lowest_cpu != -1) {
-				cpus_clear(*lowest_mask);
-				cpu_set(rq->cpu, *lowest_mask);
-			}
-			return 1;
-		}
-
-		/* no locking for now */
-		if ((rq->rt.highest_prio > task->prio)
-		    && (rq->rt.highest_prio >= lowest_prio)) {
-			if (rq->rt.highest_prio > lowest_prio) {
-				/* new low - clear old data */
-				lowest_prio = rq->rt.highest_prio;
-				lowest_cpu = cpu;
-				count = 0;
-			}
-			count++;
-		} else
-			cpu_clear(cpu, *lowest_mask);
-	}
-
-	/*
-	 * Clear out all the set bits that represent
-	 * runqueues that were of higher prio than
-	 * the lowest_prio.
-	 */
-	if (lowest_cpu > 0) {
-		/*
-		 * Perhaps we could add another cpumask op to
-		 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
-		 * Then that could be optimized to use memset and such.
-		 */
-		for_each_cpu_mask(cpu, *lowest_mask) {
-			if (cpu >= lowest_cpu)
-				break;
-			cpu_clear(cpu, *lowest_mask);
-		}
-	}
-
-	return count;
-}
-
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
 	int first;
@@ -678,17 +739,12 @@ static int find_lowest_rq(struct task_st
 	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
-	int count    = find_lowest_cpus(task, lowest_mask);
 
-	if (!count)
-		return -1; /* No targets found */
+	if (task->rt.nr_cpus_allowed == 1)
+		return -1; /* No other targets possible */
 
-	/*
-	 * There is no sense in performing an optimal search if only one
-	 * target is found.
-	 */
-	if (count == 1)
-		return first_cpu(*lowest_mask);
+	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+		return -1; /* No targets found */
 
 	/*
 	 * At this point we have built a mask of cpus representing the
@@ -839,6 +895,8 @@ static int push_rt_task(struct rq *rq)
 
 	resched_task(lowest_rq->curr);
 
+	schedstat_inc(rq, rto_pushed);
+
 	spin_unlock(&lowest_rq->lock);
 
 	ret = 1;
@@ -943,6 +1001,7 @@ static int pull_rt_task(struct rq *this_
 			 */
 			next = p;
 
+			schedstat_inc(src_rq, rto_pulled);
 		}
  skip:
 		spin_unlock(&src_rq->lock);
@@ -954,8 +1013,10 @@ static int pull_rt_task(struct rq *this_
 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) {
 		pull_rt_task(rq);
+		schedstat_inc(rq, rto_schedule);
+	}
 }
 
 static void post_schedule_rt(struct rq *rq)
@@ -968,9 +1029,10 @@ static void post_schedule_rt(struct rq *
 	 * first via finish_lock_switch and then reaquire it here.
 	 */
 	if (unlikely(rq->rt.overloaded)) {
-		spin_lock_irq(&rq->lock);
+		spin_lock(&rq->lock);
 		push_rt_tasks(rq);
-		spin_unlock_irq(&rq->lock);
+		schedstat_inc(rq, rto_schedule_tail);
+		spin_unlock(&rq->lock);
 	}
 }
 
@@ -978,9 +1040,11 @@ static void post_schedule_rt(struct rq *
 static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
-	    (p->prio >= rq->rt.highest_prio) &&
-	    rq->rt.overloaded)
+	    !test_tsk_need_resched(rq->curr) &&
+	    rq->rt.overloaded) {
 		push_rt_tasks(rq);
+ 		schedstat_inc(rq, rto_wakeup);
+	}
 }
 
 static unsigned long
@@ -1022,6 +1086,14 @@ static void set_cpus_allowed_rt(struct t
 		}
 
 		update_rt_migration(rq);
+
+		if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1))
+			/*
+			 * If either the new or old weight is a "1", we need
+			 * to requeue to properly move between shared and
+			 * exclusive queues.
+			 */
+			requeue_task_rt(rq, p);
 	}
 
 	p->cpus_allowed    = *new_mask;
@@ -1033,6 +1105,8 @@ static void join_domain_rt(struct rq *rq
 {
 	if (rq->rt.overloaded)
 		rt_set_overload(rq);
+
+	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
 }
 
 /* Assumes rq->lock is held */
@@ -1040,6 +1114,8 @@ static void leave_domain_rt(struct rq *r
 {
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
+
+	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
 }
 
 /*
Index: linux-2.6.25.4-rt4/arch/x86/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/Kconfig	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/Kconfig	2008-05-29 09:47:21.000000000 -0400
@@ -18,7 +18,10 @@ config X86_64
 ### Arch settings
 config X86
 	def_bool y
+	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
+	select HAVE_DYNAMIC_FTRACE
+	select HAVE_FTRACE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
@@ -104,10 +107,18 @@ config DMI
 	def_bool y
 
 config RWSEM_GENERIC_SPINLOCK
-	def_bool !X86_XADD
+	bool
+	depends on !X86_XADD || PREEMPT_RT
+	default y
+
+config ASM_SEMAPHORES
+	bool
+	default y
 
 config RWSEM_XCHGADD_ALGORITHM
-	def_bool X86_XADD
+	bool
+	depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT
+	default y
 
 config ARCH_HAS_ILOG2_U32
 	def_bool n
@@ -1237,6 +1248,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool X86_64
 	depends on NUMA
 
+config HARDIRQS_SW_RESEND
+	bool
+	default y
+
 menu "Power management options"
 	depends on !X86_VOYAGER
 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/alternative.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/alternative.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/alternative.c	2008-05-29 09:46:04.000000000 -0400
@@ -141,7 +141,7 @@ static const unsigned char *const p6_nop
 #ifdef CONFIG_X86_64
 
 extern char __vsyscall_0;
-static inline const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
 	return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
 	       boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -160,7 +160,7 @@ static const struct nop {
 	{ -1, NULL }
 };
 
-static const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
 	const unsigned char *const *noptable = intel_nops;
 	int i;
@@ -177,7 +177,7 @@ static const unsigned char*const * find_
 #endif /* CONFIG_X86_64 */
 
 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void add_nops(void *insns, unsigned int len)
+void add_nops(void *insns, unsigned int len)
 {
 	const unsigned char *const *noptable = find_nop_table();
 
@@ -190,6 +190,7 @@ static void add_nops(void *insns, unsign
 		len -= noplen;
 	}
 }
+EXPORT_SYMBOL_GPL(add_nops);
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern u8 *__smp_locks[], *__smp_locks_end[];
Index: linux-2.6.25.4-rt4/arch/x86/kernel/entry_32.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/entry_32.S	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/entry_32.S	2008-05-29 09:46:43.000000000 -0400
@@ -265,14 +265,18 @@ END(ret_from_exception)
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
 	DISABLE_INTERRUPTS(CLBR_ANY)
+	cmpl $0, kernel_preemption
+	jz restore_nocheck
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
 	movl TI_flags(%ebp), %ecx	# need_resched set ?
 	testb $_TIF_NEED_RESCHED, %cl
-	jz restore_all
+	jz restore_nocheck
 	testl $IF_MASK,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
-	jz restore_all
+	jz restore_nocheck
+	DISABLE_INTERRUPTS(CLBR_ANY)
+
 	call preempt_schedule_irq
 	jmp need_resched
 END(resume_kernel)
@@ -330,6 +334,11 @@ sysenter_past_esp:
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
+#ifdef CONFIG_EVENT_TRACE
+	pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+	call sys_call
+	popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
 	GET_THREAD_INFO(%ebp)
 
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -345,6 +354,11 @@ sysenter_past_esp:
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
 	jne syscall_exit_work
+#ifdef CONFIG_EVENT_TRACE
+	pushl %eax
+	call sys_ret
+	popl %eax
+#endif
 /* if something modifies registers it must also disable sysexit */
 	movl PT_EIP(%esp), %edx
 	movl PT_OLDESP(%esp), %ecx
@@ -368,6 +382,11 @@ ENTRY(system_call)
 	pushl %eax			# save orig_eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
+#ifdef CONFIG_EVENT_TRACE
+	pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+	call sys_call
+	popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -467,20 +486,19 @@ ENDPROC(system_call)
 	ALIGN
 	RING0_PTREGS_FRAME		# can't unwind into user space anyway
 work_pending:
-	testb $_TIF_NEED_RESCHED, %cl
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx
 	jz work_notifysig
 work_resched:
-	call schedule
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
+	call __schedule
 					# setting need_resched or sigpending
 					# between sampling and the iret
-	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done other
 					# than syscall tracing?
 	jz restore_all
-	testb $_TIF_NEED_RESCHED, %cl
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx
 	jnz work_resched
 
 work_notifysig:				# deal with pending signals and
@@ -1107,6 +1125,74 @@ ENDPROC(xen_failsafe_callback)
 
 #endif	/* CONFIG_XEN */
 
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+
+.globl mcount_call
+mcount_call:
+	call ftrace_stub
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+	ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	movl 0x4(%ebp), %edx
+
+.globl ftrace_call
+ftrace_call:
+	call ftrace_stub
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+.globl ftrace_stub
+ftrace_stub:
+	ret
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+	cmpl $ftrace_stub, ftrace_trace_function
+	jnz trace
+.globl ftrace_stub
+ftrace_stub:
+	ret
+
+	/* taken from glibc */
+trace:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	movl 0x4(%ebp), %edx
+
+	call *ftrace_trace_function
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+	jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/entry_64.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/entry_64.S	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/entry_64.S	2008-05-29 09:46:27.000000000 -0400
@@ -54,6 +54,110 @@
 
 	.code64
 
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+
+.globl mcount_call
+mcount_call:
+	call ftrace_stub
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+	retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+
+.globl ftrace_call
+ftrace_call:
+	call ftrace_stub
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+.globl ftrace_stub
+ftrace_stub:
+	retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+ENTRY(mcount)
+	cmpq $ftrace_stub, ftrace_trace_function
+	jnz trace
+.globl ftrace_stub
+ftrace_stub:
+	retq
+
+trace:
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+
+	call   *ftrace_trace_function
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+	jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
+#define HARDNMI_MASK 0x40000000
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif	
@@ -249,7 +353,9 @@ ENTRY(system_call_after_swapgs)
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
 	movq %r10,%rcx
+	TRACE_SYS_CALL
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
+	TRACE_SYS_RET
 	movq %rax,RAX-ARGOFFSET(%rsp)
 /*
  * Syscall return path ending with SYSRET (fast path)
@@ -281,8 +387,8 @@ sysret_check:		
 	/* Handle reschedules */
 	/* edx:	work, edi: workmask */	
 sysret_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc sysret_signal
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx
+	jz sysret_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
@@ -305,7 +411,7 @@ sysret_signal:
 	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
 	xorl %esi,%esi # oldset -> arg2
 	call ptregscall_common
-1:	movl $_TIF_NEED_RESCHED,%edi
+1:	movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi
 	/* Use IRET because user could have changed frame. This
 	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -330,7 +436,9 @@ tracesys:			 
 	cmova %rcx,%rax
 	ja  1f
 	movq %r10,%rcx	/* fixup for C */
+	TRACE_SYS_CALL
 	call *sys_call_table(,%rax,8)
+ 	TRACE_SYS_RET
 1:	movq %rax,RAX-ARGOFFSET(%rsp)
 	/* Use IRET because user could have changed frame */
 		
@@ -359,8 +467,8 @@ int_with_check:
 	/* First do a reschedule test. */
 	/* edx:	work, edi: workmask */
 int_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc  int_very_careful
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx
+	jz int_very_careful
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
@@ -395,7 +503,7 @@ int_signal:
 	movq %rsp,%rdi		# &ptregs -> arg1
 	xorl %esi,%esi		# oldset -> arg2
 	call do_notify_resume
-1:	movl $_TIF_NEED_RESCHED,%edi	
+1:	movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi
 int_restore_rest:
 	RESTORE_REST
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -622,8 +730,8 @@ bad_iret:
 	/* edi: workmask, edx: work */
 retint_careful:
 	CFI_RESTORE_STATE
-	bt    $TIF_NEED_RESCHED,%edx
-	jnc   retint_signal
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx
+	jz    retint_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
@@ -649,7 +757,7 @@ retint_signal:
 	RESTORE_REST
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	movl $_TIF_NEED_RESCHED,%edi
+	movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi
 	GET_THREAD_INFO(%rcx)
 	jmp retint_check
 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/ftrace.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/arch/x86/kernel/ftrace.c	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,160 @@
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes to Ingo Molnar, for suggesting the idea.
+ * Mathieu Desnoyers, for suggesting postponing the modifications.
+ * Arjan van de Ven, for keeping me straight, and explaining to me
+ * the dangers of modifying code on the run.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/alternative.h>
+#include <asm/asm.h>
+
+#define CALL_BACK		5
+
+/* Long is fine, even if it is only 4 bytes ;-) */
+static long *ftrace_nop;
+
+union ftrace_code_union {
+	char code[5];
+	struct {
+		char e8;
+		int offset;
+	} __attribute__((packed));
+};
+
+notrace int ftrace_ip_converted(unsigned long ip)
+{
+	unsigned long save;
+
+	ip -= CALL_BACK;
+	save = *(long *)ip;
+
+	return save == *ftrace_nop;
+}
+
+static int notrace ftrace_calc_offset(long ip, long addr)
+{
+	return (int)(addr - ip);
+}
+
+notrace unsigned char *ftrace_nop_replace(void)
+{
+	return (char *)ftrace_nop;
+}
+
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+	static union ftrace_code_union calc;
+
+	calc.e8		= 0xe8;
+	calc.offset	= ftrace_calc_offset(ip, addr);
+
+	/*
+	 * No locking needed, this must be called via kstop_machine
+	 * which in essence is like running on a uniprocessor machine.
+	 */
+	return calc.code;
+}
+
+notrace int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+		   unsigned char *new_code)
+{
+	unsigned replaced;
+	unsigned old = *(unsigned *)old_code; /* 4 bytes */
+	unsigned new = *(unsigned *)new_code; /* 4 bytes */
+	unsigned char newch = new_code[4];
+	int faulted = 0;
+
+	/* move the IP back to the start of the call */
+	ip -= CALL_BACK;
+
+	/*
+	 * Note: Due to modules and __init, code can
+	 *  disappear and change, we need to protect against faulting
+	 *  as well as code changing.
+	 *
+	 * No real locking needed, this code is run through
+	 * kstop_machine.
+	 */
+	asm volatile (
+		"1: lock\n"
+		"   cmpxchg %3, (%2)\n"
+		"   jnz 2f\n"
+		"   movb %b4, 4(%2)\n"
+		"2:\n"
+		".section .fixup, \"ax\"\n"
+		"3:	movl $1, %0\n"
+		"	jmp 2b\n"
+		".previous\n"
+		_ASM_EXTABLE(1b, 3b)
+		: "=r"(faulted), "=a"(replaced)
+		: "r"(ip), "r"(new), "r"(newch),
+		  "0"(faulted), "a"(old)
+		: "memory");
+	sync_core();
+
+	if (replaced != old && replaced != new)
+		faulted = 2;
+
+	return faulted;
+}
+
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip = (unsigned long)(&ftrace_call);
+	unsigned char old[5], *new;
+	int ret;
+
+	ip += CALL_BACK;
+
+	memcpy(old, &ftrace_call, 5);
+	new = ftrace_call_replace(ip, (unsigned long)func);
+	ret = ftrace_modify_code(ip, old, new);
+
+	return ret;
+}
+
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+	unsigned long ip = (long)(&mcount_call);
+	unsigned long *addr = data;
+	unsigned char old[5], *new;
+
+	/* ip is at the location, but modify code will subtact this */
+	ip += CALL_BACK;
+
+	/*
+	 * Replace the mcount stub with a pointer to the
+	 * ip recorder function.
+	 */
+	memcpy(old, &mcount_call, 5);
+	new = ftrace_call_replace(ip, *addr);
+	*addr = ftrace_modify_code(ip, old, new);
+
+	return 0;
+}
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+	const unsigned char *const *noptable = find_nop_table();
+
+	/* This is running in kstop_machine */
+
+	ftrace_mcount_set(data);
+
+	ftrace_nop = (unsigned long *)noptable[CALL_BACK];
+
+	return 0;
+}
+
Index: linux-2.6.25.4-rt4/arch/x86/kernel/process_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/process_32.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/process_32.c	2008-05-29 09:46:43.000000000 -0400
@@ -112,7 +112,7 @@ void default_idle(void)
 		smp_mb();
 
 		local_irq_disable();
-		if (!need_resched()) {
+		if (!need_resched() && !need_resched_delayed()) {
 			ktime_t t0, t1;
 			u64 t0n, t1n;
 
@@ -142,7 +142,9 @@ EXPORT_SYMBOL(default_idle);
  */
 static void poll_idle(void)
 {
-	cpu_relax();
+	do {
+		cpu_relax();
+	} while (!need_resched() && !need_resched_delayed());
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -186,10 +188,9 @@ void cpu_idle(void)
 	/* endless idle loop with no priority at all */
 	while (1) {
 		tick_nohz_stop_sched_tick();
-		while (!need_resched()) {
+		while (!need_resched() && !need_resched_delayed()) {
 			void (*idle)(void);
 
-			check_pgt_cache();
 			rmb();
 			idle = pm_idle;
 
@@ -203,12 +204,17 @@ void cpu_idle(void)
 				play_dead();
 
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
 			idle();
+			start_critical_timings();
 		}
+		local_irq_disable();
 		tick_nohz_restart_sched_tick();
-		preempt_enable_no_resched();
-		schedule();
+		__preempt_enable_no_resched();
+		__schedule();
 		preempt_disable();
+		local_irq_enable();
 	}
 }
 
@@ -244,10 +250,10 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
-	if (!need_resched()) {
+	if (!need_resched() && !need_resched_delayed()) {
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
-		if (!need_resched())
+		if (!need_resched() && !need_resched_delayed())
 			__mwait(ax, cx);
 	}
 }
@@ -340,9 +346,10 @@ void __show_registers(struct pt_regs *re
 		regs->ax, regs->bx, regs->cx, regs->dx);
 	printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
 		regs->si, regs->di, regs->bp, sp);
-	printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
+	printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x"
+	       " preempt:%08x\n",
 	       regs->ds & 0xffff, regs->es & 0xffff,
-	       regs->fs & 0xffff, gs, ss);
+	       regs->fs & 0xffff, gs, ss, preempt_count());
 
 	if (!all)
 		return;
@@ -414,15 +421,23 @@ void exit_thread(void)
 	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
 		struct task_struct *tsk = current;
 		struct thread_struct *t = &tsk->thread;
-		int cpu = get_cpu();
-		struct tss_struct *tss = &per_cpu(init_tss, cpu);
+		void *io_bitmap_ptr = t->io_bitmap_ptr;
+		int cpu;
+		struct tss_struct *tss;
 
-		kfree(t->io_bitmap_ptr);
+		/*
+		 * On PREEMPT_RT we must not call kfree() with
+		 * preemption disabled, so we first zap the pointer:
+		 */
 		t->io_bitmap_ptr = NULL;
+		kfree(io_bitmap_ptr);
+
 		clear_thread_flag(TIF_IO_BITMAP);
 		/*
 		 * Careful, clear this in the TSS too:
 		 */
+		cpu = get_cpu();
+		tss = &per_cpu(init_tss, cpu);
 		memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
 		t->io_bitmap_max = 0;
 		tss->io_bitmap_owner = NULL;
Index: linux-2.6.25.4-rt4/arch/x86/kernel/process_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/process_64.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/process_64.c	2008-05-29 09:46:37.000000000 -0400
@@ -106,7 +106,7 @@ void default_idle(void)
 	 */
 	smp_mb();
 	local_irq_disable();
-	if (!need_resched()) {
+	if (!need_resched() && !need_resched_delayed()) {
 		ktime_t t0, t1;
 		u64 t0n, t1n;
 
@@ -169,7 +169,7 @@ void cpu_idle(void)
 	/* endless idle loop with no priority at all */
 	while (1) {
 		tick_nohz_stop_sched_tick();
-		while (!need_resched()) {
+		while (!need_resched() && !need_resched_delayed()) {
 			void (*idle)(void);
 
 			rmb();
@@ -185,7 +185,10 @@ void cpu_idle(void)
 			 */
 			local_irq_disable();
 			enter_idle();
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
 			idle();
+			start_critical_timings();
 			/* In many cases the interrupt that ended idle
 			   has already called exit_idle. But some idle
 			   loops can be woken up without interrupt. */
@@ -193,9 +196,11 @@ void cpu_idle(void)
 		}
 
 		tick_nohz_restart_sched_tick();
-		preempt_enable_no_resched();
-		schedule();
+		local_irq_disable();
+		__preempt_enable_no_resched();
+		__schedule();
 		preempt_disable();
+		local_irq_enable();
 	}
 }
 
@@ -231,10 +236,10 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
-	if (!need_resched()) {
+	if (!need_resched() && !need_resched_delayed()) {
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
-		if (!need_resched())
+		if (!need_resched() && !need_resched_delayed())
 			__mwait(ax, cx);
 	}
 }
@@ -242,10 +247,10 @@ void mwait_idle_with_hints(unsigned long
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
-	if (!need_resched()) {
+	if (!need_resched() && !need_resched_delayed()) {
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
-		if (!need_resched())
+		if (!need_resched() && !need_resched_delayed())
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
@@ -379,7 +384,7 @@ void exit_thread(void)
 	struct thread_struct *t = &me->thread;
 
 	if (me->thread.io_bitmap_ptr) {
-		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+		struct tss_struct *tss;
 
 		kfree(t->io_bitmap_ptr);
 		t->io_bitmap_ptr = NULL;
@@ -387,6 +392,7 @@ void exit_thread(void)
 		/*
 		 * Careful, clear this in the TSS too:
 		 */
+		tss = &per_cpu(init_tss, get_cpu());
 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
 		t->io_bitmap_max = 0;
 		put_cpu();
Index: linux-2.6.25.4-rt4/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/vsyscall_64.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/vsyscall_64.c	2008-05-29 09:47:29.000000000 -0400
@@ -42,7 +42,7 @@
 #include <asm/topology.h>
 #include <asm/vgtod.h>
 
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace
 #define __syscall_clobber "r11","cx","memory"
 
 /*
@@ -55,7 +55,7 @@ int __vgetcpu_mode __section_vgetcpu_mod
 
 struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
 {
-	.lock = SEQLOCK_UNLOCKED,
+	.lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
 	.sysctl_enabled = 1,
 };
 
@@ -74,14 +74,40 @@ void update_vsyscall(struct timespec *wa
 	unsigned long flags;
 
 	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+
+	if (likely(vsyscall_gtod_data.sysctl_enabled == 2)) {
+		struct timespec tmp = *(wall_time);
+		cycle_t (*vread)(void);
+		cycle_t now;
+
+		vread = vsyscall_gtod_data.clock.vread;
+		if (likely(vread))
+			now = vread();
+		else
+			now = clock->read();
+
+		/* calculate interval: */
+		now = (now - clock->cycle_last) & clock->mask;
+		/* convert to nsecs: */
+		tmp.tv_nsec += ( now * clock->mult) >> clock->shift;
+
+		while (tmp.tv_nsec >= NSEC_PER_SEC) {
+			tmp.tv_sec += 1;
+			tmp.tv_nsec -= NSEC_PER_SEC;
+		}
+
+		vsyscall_gtod_data.wall_time_sec = tmp.tv_sec;
+		vsyscall_gtod_data.wall_time_nsec = tmp.tv_nsec;
+	} else {
+		vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
+		vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+	}
 	/* copy vsyscall data */
 	vsyscall_gtod_data.clock.vread = clock->vread;
 	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
 	vsyscall_gtod_data.clock.mask = clock->mask;
 	vsyscall_gtod_data.clock.mult = clock->mult;
 	vsyscall_gtod_data.clock.shift = clock->shift;
-	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
-	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
 	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
@@ -119,6 +145,26 @@ static __always_inline void do_vgettimeo
 	unsigned seq;
 	unsigned long mult, shift, nsec;
 	cycle_t (*vread)(void);
+
+	if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) {
+		struct timeval tmp;
+
+		do {
+			barrier();
+			tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
+			tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec;
+			barrier();
+			tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec;
+			tmp.tv_usec = __vsyscall_gtod_data.wall_time_nsec;
+
+		} while (tmp.tv_usec != tv->tv_usec ||
+					tmp.tv_sec != tv->tv_sec);
+
+		tv->tv_usec /= NSEC_PER_MSEC;
+		tv->tv_usec *= USEC_PER_MSEC;
+		return;
+	}
+
 	do {
 		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
 
@@ -127,7 +173,6 @@ static __always_inline void do_vgettimeo
 			gettimeofday(tv,NULL);
 			return;
 		}
-		now = vread();
 		base = __vsyscall_gtod_data.clock.cycle_last;
 		mask = __vsyscall_gtod_data.clock.mask;
 		mult = __vsyscall_gtod_data.clock.mult;
@@ -137,6 +182,7 @@ static __always_inline void do_vgettimeo
 		nsec = __vsyscall_gtod_data.wall_time_nsec;
 	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
 
+	now = vread();
 	/* calculate interval: */
 	cycle_delta = (now - base) & mask;
 	/* convert to nsecs: */
Index: linux-2.6.25.4-rt4/arch/x86/lib/thunk_32.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/arch/x86/lib/thunk_32.S	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,47 @@
+/*
+ * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
+ * Copyright 2008 by Steven Rostedt, Red Hat, Inc
+ *  (inspired by Andi Kleen's thunk_64.S)
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+
+	#include <linux/linkage.h>
+
+#define ARCH_TRACE_IRQS_ON			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_on;			\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#define ARCH_TRACE_IRQS_OFF			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_off;		\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* put return address in eax (arg1) */
+	.macro thunk_ra name,func
+	.globl \name
+\name:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	/* Place EIP in the arg1 */
+	movl 3*4(%esp), %eax
+	call \func
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+	.endm
+
+	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#endif
Index: linux-2.6.25.4-rt4/arch/x86/lib/thunk_64.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/lib/thunk_64.S	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/lib/thunk_64.S	2008-05-29 09:46:27.000000000 -0400
@@ -40,15 +40,31 @@
 	thunk rwsem_wake_thunk,rwsem_wake
 	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
 #endif	
-	
-	thunk __down_failed,__down
-	thunk_retrax __down_failed_interruptible,__down_interruptible
-	thunk_retrax __down_failed_trylock,__down_trylock
-	thunk __up_wakeup,__up
+
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
+	thunk __compat_down_failed,__compat_down
+	thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible
+	thunk_retrax __compat_down_failed_trylock,__compat_down_trylock
+	thunk __compat_up_wakeup,__compat_up
+#endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-	thunk trace_hardirqs_on_thunk,trace_hardirqs_on
-	thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+	/* put return address in rdi (arg1) */
+	.macro thunk_ra name,func
+	.globl \name
+\name:
+	CFI_STARTPROC
+	SAVE_ARGS
+	/* SAVE_ARGS pushs 9 elements */
+	/* the next element would be the rip */
+	movq 9*8(%rsp), %rdi
+	call \func
+	jmp  restore
+	CFI_ENDPROC
+	.endm
+
+	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
Index: linux-2.6.25.4-rt4/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/mm/init_32.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/mm/init_32.c	2008-05-29 09:46:29.000000000 -0400
@@ -51,7 +51,7 @@
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
@@ -723,7 +723,7 @@ void mark_rodata_ro(void)
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long size = PFN_ALIGN(_etext) - start;
 
-#ifndef CONFIG_KPROBES
+#if !defined(CONFIG_KPROBES) && !defined(CONFIG_DYNAMIC_FTRACE)
 #ifdef CONFIG_HOTPLUG_CPU
 	/* It must still be possible to apply SMP alternatives. */
 	if (num_possible_cpus() <= 1)
Index: linux-2.6.25.4-rt4/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/mm/init_64.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/mm/init_64.c	2008-05-29 09:46:29.000000000 -0400
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(dma_ops);
 
 static unsigned long dma_reserve __initdata;
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
@@ -604,7 +604,7 @@ void mark_rodata_ro(void)
 		start = (unsigned long)_etext;
 #endif
 
-#ifdef CONFIG_KPROBES
+#if defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
 	start = (unsigned long)__start_rodata;
 #endif
 
Index: linux-2.6.25.4-rt4/arch/x86/vdso/vclock_gettime.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/vdso/vclock_gettime.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/vdso/vclock_gettime.c	2008-05-29 09:46:04.000000000 -0400
@@ -23,7 +23,7 @@
 
 #define gtod vdso_vsyscall_gtod_data
 
-static long vdso_fallback_gettime(long clock, struct timespec *ts)
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
 	asm("syscall" : "=a" (ret) :
@@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long c
 	return ret;
 }
 
-static inline long vgetns(void)
+notrace static inline long vgetns(void)
 {
 	long v;
 	cycles_t (*vread)(void);
@@ -40,7 +40,7 @@ static inline long vgetns(void)
 	return (v * gtod->clock.mult) >> gtod->clock.shift;
 }
 
-static noinline int do_realtime(struct timespec *ts)
+notrace static noinline int do_realtime(struct timespec *ts)
 {
 	unsigned long seq, ns;
 	do {
@@ -54,7 +54,8 @@ static noinline int do_realtime(struct t
 }
 
 /* Copy of the version in kernel/time.c which we cannot directly access */
-static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
+notrace static void
+vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
 {
 	while (nsec >= NSEC_PER_SEC) {
 		nsec -= NSEC_PER_SEC;
@@ -68,7 +69,7 @@ static void vset_normalized_timespec(str
 	ts->tv_nsec = nsec;
 }
 
-static noinline int do_monotonic(struct timespec *ts)
+notrace static noinline int do_monotonic(struct timespec *ts)
 {
 	unsigned long seq, ns, secs;
 	do {
@@ -82,7 +83,7 @@ static noinline int do_monotonic(struct 
 	return 0;
 }
 
-int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled && gtod->clock.vread))
 		switch (clock) {
@@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock
 int clock_gettime(clockid_t, struct timespec *)
 	__attribute__((weak, alias("__vdso_clock_gettime")));
 
-int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	long ret;
 	if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
Index: linux-2.6.25.4-rt4/arch/x86/vdso/vgetcpu.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/vdso/vgetcpu.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/vdso/vgetcpu.c	2008-05-29 09:46:04.000000000 -0400
@@ -13,7 +13,8 @@
 #include <asm/vgtod.h>
 #include "vextern.h"
 
-long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
 	unsigned int p;
 
Index: linux-2.6.25.4-rt4/include/asm-x86/vsyscall.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/vsyscall.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/vsyscall.h	2008-05-29 09:46:04.000000000 -0400
@@ -24,7 +24,7 @@ enum vsyscall_num {
 	((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
 #define __section_vsyscall_clock __attribute__ \
 	((unused, __section__ (".vsyscall_clock"),aligned(16)))
-#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
+#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) notrace
 
 #define VGETCPU_RDTSCP	1
 #define VGETCPU_LSL	2
Index: linux-2.6.25.4-rt4/include/linux/ftrace.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/linux/ftrace.h	2008-05-29 09:47:32.000000000 -0400
@@ -0,0 +1,189 @@
+#ifndef _LINUX_FTRACE_H
+#define _LINUX_FTRACE_H
+
+#include <linux/ktime.h>
+
+#ifdef CONFIG_FTRACE
+
+#include <linux/linkage.h>
+#include <linux/fs.h>
+
+extern int ftrace_enabled;
+extern int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+		     struct file *filp, void __user *buffer, size_t *lenp,
+		     loff_t *ppos);
+
+typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
+
+struct ftrace_ops {
+	ftrace_func_t	  func;
+	struct ftrace_ops *next;
+};
+
+/*
+ * The ftrace_ops must be a static and should also
+ * be read_mostly.  These functions do modify read_mostly variables
+ * so use them sparely. Never free an ftrace_op or modify the
+ * next pointer after it has been registered. Even after unregistering
+ * it, the next pointer may still be used internally.
+ */
+int register_ftrace_function(struct ftrace_ops *ops);
+int unregister_ftrace_function(struct ftrace_ops *ops);
+void clear_ftrace_function(void);
+
+extern void ftrace_stub(unsigned long a0, unsigned long a1);
+extern void mcount(void);
+
+#else /* !CONFIG_FTRACE */
+# define register_ftrace_function(ops) do { } while (0)
+# define unregister_ftrace_function(ops) do { } while (0)
+# define clear_ftrace_function(ops) do { } while (0)
+#endif /* CONFIG_FTRACE */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+# define FTRACE_HASHBITS	10
+# define FTRACE_HASHSIZE	(1<<FTRACE_HASHBITS)
+
+enum {
+	FTRACE_FL_FREE		= (1 << 0),
+	FTRACE_FL_FAILED	= (1 << 1),
+	FTRACE_FL_FILTER	= (1 << 2),
+	FTRACE_FL_ENABLED	= (1 << 3),
+	FTRACE_FL_NOTRACE	= (1 << 4),
+};
+
+struct dyn_ftrace {
+	struct hlist_node node;
+	unsigned long	  ip;
+	unsigned long	  flags;
+};
+
+int ftrace_force_update(void);
+void ftrace_set_filter(unsigned char *buf, int len, int reset);
+
+/* defined in arch */
+extern int ftrace_ip_converted(unsigned long ip);
+extern unsigned char *ftrace_nop_replace(void);
+extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
+extern int ftrace_dyn_arch_init(void *data);
+extern int ftrace_mcount_set(unsigned long *data);
+extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+			      unsigned char *new_code);
+extern int ftrace_update_ftrace_func(ftrace_func_t func);
+extern void ftrace_caller(void);
+extern void ftrace_call(void);
+extern void mcount_call(void);
+
+void ftrace_disable_daemon(void);
+void ftrace_enable_daemon(void);
+
+#else
+# define ftrace_force_update()			({ 0; })
+# define ftrace_set_filter(buf, len, reset)	do { } while (0)
+# define ftrace_disable_daemon()		do { } while (0)
+# define ftrace_enable_daemon()			do { } while (0)
+#endif
+
+/* totally disable ftrace - can not re-enable after this */
+void ftrace_kill(void);
+
+static inline void tracer_disable(void)
+{
+#ifdef CONFIG_FTRACE
+	ftrace_enabled = 0;
+#endif
+}
+
+#ifdef CONFIG_FRAME_POINTER
+/* TODO: need to fix this for ARM */
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
+# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
+# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
+# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6))
+#else
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 0UL
+# define CALLER_ADDR2 0UL
+# define CALLER_ADDR3 0UL
+# define CALLER_ADDR4 0UL
+# define CALLER_ADDR5 0UL
+# define CALLER_ADDR6 0UL
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+  extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
+  extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
+#else
+# define time_hardirqs_on(a0, a1)		do { } while (0)
+# define time_hardirqs_off(a0, a1)		do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+  extern void trace_preempt_on(unsigned long a0, unsigned long a1);
+  extern void trace_preempt_off(unsigned long a0, unsigned long a1);
+#else
+# define trace_preempt_on(a0, a1)		do { } while (0)
+# define trace_preempt_off(a0, a1)		do { } while (0)
+#endif
+
+#ifdef CONFIG_TRACING
+extern void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
+#else
+static inline void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
+#endif
+
+#ifdef CONFIG_EVENT_TRACER
+#include <linux/marker.h>
+
+static inline void ftrace_event_irq(int irq, int user, unsigned long ip)
+{
+	trace_mark(ftrace_event_irq, "%d %d %ld", irq, user, ip);
+}
+
+static inline void ftrace_event_fault(unsigned long ip, unsigned long error,
+				      unsigned long addr)
+{
+	trace_mark(ftrace_event_fault, "%ld %ld %ld", ip, error, addr);
+}
+
+static inline void ftrace_event_timer_set(void *p1, void *p2)
+{
+	trace_mark(ftrace_event_timer_set, "%p %p", p1, p2);
+}
+
+static inline void ftrace_event_timer_triggered(void *p1, void *p2)
+{
+	trace_mark(ftrace_event_timer_triggered, "%p %p", p1, p2);
+}
+
+static inline void ftrace_event_timestamp(ktime_t *time)
+{
+	trace_mark(ftrace_event_hrtimer, "%p", time);
+}
+
+static inline void ftrace_event_task_activate(struct task_struct *p, int cpu)
+{
+	trace_mark(ftrace_event_task_activate, "%p %d", p, cpu);
+}
+
+static inline void ftrace_event_task_deactivate(struct task_struct *p, int cpu)
+{
+	trace_mark(ftrace_event_task_deactivate, "%p %d", p, cpu);
+}
+#else
+# define ftrace_event_irq(irq, user, ip)	do { } while (0)
+# define ftrace_event_fault(ip, error, addr)	do { } while (0)
+# define ftrace_event_timer_set(p1, p2)		do { } while (0)
+# define ftrace_event_timer_triggered(p1, p2)	do { } while (0)
+# define ftrace_event_timestamp(now)		do { } while (0)
+# define ftrace_event_task_activate(p, cpu)	do { } while (0)
+# define ftrace_event_task_deactivate(p, cpu)	do { } while (0)
+#endif /* CONFIG_TRACE_EVENTS */
+
+#endif /* _LINUX_FTRACE_H */
Index: linux-2.6.25.4-rt4/include/linux/irqflags.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/irqflags.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/irqflags.h	2008-05-29 09:47:33.000000000 -0400
@@ -11,6 +11,16 @@
 #ifndef _LINUX_TRACE_IRQFLAGS_H
 #define _LINUX_TRACE_IRQFLAGS_H
 
+#if !defined(BUILD_CHECK_IRQ_FLAGS) && defined(typecheck)
+#define BUILD_CHECK_IRQ_FLAGS(flags)					\
+	do {								\
+		BUILD_BUG_ON(sizeof(flags) != sizeof(unsigned long));	\
+		typecheck(unsigned long, flags);			\
+	} while (0)
+#else
+#define BUILD_CHECK_IRQ_FLAGS(flags)
+#endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS
   extern void trace_hardirqs_on(void);
   extern void trace_hardirqs_off(void);
@@ -41,6 +51,15 @@
 # define INIT_TRACE_IRQFLAGS
 #endif
 
+#if defined(CONFIG_IRQSOFF_TRACER) || \
+	defined(CONFIG_PREEMPT_TRACER)
+ extern void stop_critical_timings(void);
+ extern void start_critical_timings(void);
+#else
+# define stop_critical_timings() do { } while (0)
+# define start_critical_timings() do { } while (0)
+#endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 
 #include <asm/irqflags.h>
@@ -50,10 +69,15 @@
 #define local_irq_disable() \
 	do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)
 #define local_irq_save(flags) \
-	do { raw_local_irq_save(flags); trace_hardirqs_off(); } while (0)
+	do {					\
+		BUILD_CHECK_IRQ_FLAGS(flags);	\
+		raw_local_irq_save(flags);	\
+		trace_hardirqs_off();		\
+	} while (0)
 
 #define local_irq_restore(flags)				\
 	do {							\
+		BUILD_CHECK_IRQ_FLAGS(flags);			\
 		if (raw_irqs_disabled_flags(flags)) {		\
 			raw_local_irq_restore(flags);		\
 			trace_hardirqs_off();			\
@@ -69,8 +93,16 @@
  */
 # define raw_local_irq_disable()	local_irq_disable()
 # define raw_local_irq_enable()		local_irq_enable()
-# define raw_local_irq_save(flags)	local_irq_save(flags)
-# define raw_local_irq_restore(flags)	local_irq_restore(flags)
+# define raw_local_irq_save(flags)		\
+	do {					\
+		BUILD_CHECK_IRQ_FLAGS(flags);	\
+		local_irq_save(flags);		\
+	} while (0)
+# define raw_local_irq_restore(flags)		\
+	do {					\
+		BUILD_CHECK_IRQ_FLAGS(flags);	\
+		local_irq_restore(flags);	\
+	} while (0)
 #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */
 
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
@@ -80,7 +112,11 @@
 		raw_safe_halt();				\
 	} while (0)
 
-#define local_save_flags(flags)		raw_local_save_flags(flags)
+#define local_save_flags(flags)			\
+	do {					\
+		BUILD_CHECK_IRQ_FLAGS(flags);	\
+		raw_local_save_flags(flags);	\
+	} while (0)
 
 #define irqs_disabled()						\
 ({								\
@@ -90,7 +126,11 @@
 	raw_irqs_disabled_flags(flags);				\
 })
 
-#define irqs_disabled_flags(flags)	raw_irqs_disabled_flags(flags)
+#define irqs_disabled_flags(flags)	\
+({					\
+	BUILD_CHECK_IRQ_FLAGS(flags);	\
+	raw_irqs_disabled_flags(flags);	\
+})
 #endif		/* CONFIG_X86 */
 
 #endif
Index: linux-2.6.25.4-rt4/include/linux/ktime.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/ktime.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/ktime.h	2008-05-29 09:46:04.000000000 -0400
@@ -327,4 +327,10 @@ extern void ktime_get_ts(struct timespec
 /* Get the real (wall-) time in timespec format: */
 #define ktime_get_real_ts(ts)	getnstimeofday(ts)
 
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+	static const ktime_t ktime_zero = { .tv64 = 0 };
+	return ktime_add_ns(ktime_zero, ns);
+}
+
 #endif
Index: linux-2.6.25.4-rt4/include/linux/linkage.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/linkage.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/linkage.h	2008-05-29 09:46:04.000000000 -0400
@@ -3,6 +3,8 @@
 
 #include <asm/linkage.h>
 
+#define notrace __attribute__((no_instrument_function))
+
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else
Index: linux-2.6.25.4-rt4/include/linux/mmiotrace.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/linux/mmiotrace.h	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,85 @@
+#ifndef MMIOTRACE_H
+#define MMIOTRACE_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+struct kmmio_probe;
+struct pt_regs;
+
+typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *,
+				struct pt_regs *, unsigned long addr);
+typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
+				unsigned long condition, struct pt_regs *);
+
+struct kmmio_probe {
+	struct list_head list; /* kmmio internal list */
+	unsigned long addr; /* start location of the probe point */
+	unsigned long len; /* length of the probe region */
+	kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */
+	kmmio_post_handler_t post_handler; /* Called after addr is executed */
+	void *private;
+};
+
+/* kmmio is active by some kmmio_probes? */
+static inline int is_kmmio_active(void)
+{
+	extern unsigned int kmmio_count;
+	return kmmio_count;
+}
+
+extern int register_kmmio_probe(struct kmmio_probe *p);
+extern void unregister_kmmio_probe(struct kmmio_probe *p);
+
+/* Called from page fault handler. */
+extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
+
+/* Called from ioremap.c */
+#ifdef CONFIG_MMIOTRACE
+extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+							void __iomem *addr);
+extern void mmiotrace_iounmap(volatile void __iomem *addr);
+#else
+static inline void mmiotrace_ioremap(resource_size_t offset,
+					unsigned long size, void __iomem *addr)
+{
+}
+
+static inline void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+}
+#endif /* CONFIG_MMIOTRACE_HOOKS */
+
+enum mm_io_opcode {
+	MMIO_READ = 0x1,     /* struct mmiotrace_rw */
+	MMIO_WRITE = 0x2,    /* struct mmiotrace_rw */
+	MMIO_PROBE = 0x3,    /* struct mmiotrace_map */
+	MMIO_UNPROBE = 0x4,  /* struct mmiotrace_map */
+	MMIO_MARKER = 0x5,   /* raw char data */
+	MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */
+};
+
+struct mmiotrace_rw {
+	resource_size_t phys;	/* PCI address of register */
+	unsigned long value;
+	unsigned long pc;	/* optional program counter */
+	int map_id;
+	unsigned char opcode;	/* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
+	unsigned char width;	/* size of register access in bytes */
+};
+
+struct mmiotrace_map {
+	resource_size_t phys;	/* base address in PCI space */
+	unsigned long virt;	/* base virtual address */
+	unsigned long len;	/* mapping size */
+	int map_id;
+	unsigned char opcode;	/* MMIO_PROBE or MMIO_UNPROBE */
+};
+
+/* in kernel/trace/trace_mmiotrace.c */
+extern void enable_mmiotrace(void);
+extern void disable_mmiotrace(void);
+extern void mmio_trace_rw(struct mmiotrace_rw *rw);
+extern void mmio_trace_mapping(struct mmiotrace_map *map);
+
+#endif /* MMIOTRACE_H */
Index: linux-2.6.25.4-rt4/include/linux/preempt.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/preempt.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/preempt.h	2008-05-29 09:46:29.000000000 -0400
@@ -9,8 +9,10 @@
 #include <linux/thread_info.h>
 #include <linux/linkage.h>
 #include <linux/list.h>
+#include <linux/thread_info.h>
 
-#ifdef CONFIG_DEBUG_PREEMPT
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) || \
+	defined(CONFIG_PREEMPT_TRACE)
   extern void add_preempt_count(int val);
   extern void sub_preempt_count(int val);
 #else
@@ -21,11 +23,12 @@
 #define inc_preempt_count() add_preempt_count(1)
 #define dec_preempt_count() sub_preempt_count(1)
 
-#define preempt_count()	(current_thread_info()->preempt_count)
+#define preempt_count()		(current_thread_info()->preempt_count)
 
 #ifdef CONFIG_PREEMPT
 
 asmlinkage void preempt_schedule(void);
+asmlinkage void preempt_schedule_irq(void);
 
 #define preempt_disable() \
 do { \
@@ -33,21 +36,62 @@ do { \
 	barrier(); \
 } while (0)
 
-#define preempt_enable_no_resched() \
+#define __preempt_enable_no_resched() \
 do { \
 	barrier(); \
 	dec_preempt_count(); \
 } while (0)
 
+
+#ifdef CONFIG_DEBUG_PREEMPT
+extern void notrace preempt_enable_no_resched(void);
+#else
+# define preempt_enable_no_resched() __preempt_enable_no_resched()
+#endif
+
 #define preempt_check_resched() \
 do { \
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
 		preempt_schedule(); \
 } while (0)
 
+#define preempt_check_resched_delayed() \
+do { \
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED))) \
+		preempt_schedule(); \
+} while (0)
+
 #define preempt_enable() \
 do { \
-	preempt_enable_no_resched(); \
+	__preempt_enable_no_resched(); \
+	barrier(); \
+	preempt_check_resched(); \
+} while (0)
+
+/* For debugging and tracer internals only! */
+#define add_preempt_count_notrace(val)			\
+	do { preempt_count() += (val); } while (0)
+#define sub_preempt_count_notrace(val)			\
+	do { preempt_count() -= (val); } while (0)
+#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
+#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
+
+#define preempt_disable_notrace() \
+do { \
+	inc_preempt_count_notrace(); \
+	barrier(); \
+} while (0)
+
+#define preempt_enable_no_resched_notrace() \
+do { \
+	barrier(); \
+	dec_preempt_count_notrace(); \
+} while (0)
+
+/* preempt_check_resched is OK to trace */
+#define preempt_enable_notrace() \
+do { \
+	preempt_enable_no_resched_notrace(); \
 	barrier(); \
 	preempt_check_resched(); \
 } while (0)
@@ -56,8 +100,16 @@ do { \
 
 #define preempt_disable()		do { } while (0)
 #define preempt_enable_no_resched()	do { } while (0)
+#define __preempt_enable_no_resched()	do { } while (0)
 #define preempt_enable()		do { } while (0)
 #define preempt_check_resched()		do { } while (0)
+#define preempt_check_resched_delayed()	do { } while (0)
+
+#define preempt_disable_notrace()		do { } while (0)
+#define preempt_enable_no_resched_notrace()	do { } while (0)
+#define preempt_enable_notrace()		do { } while (0)
+
+#define preempt_schedule_irq()		do { } while (0)
 
 #endif
 
Index: linux-2.6.25.4-rt4/include/linux/writeback.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/writeback.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/writeback.h	2008-05-29 09:46:04.000000000 -0400
@@ -105,6 +105,8 @@ extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
 
+extern unsigned long determine_dirtyable_memory(void);
+
 extern int dirty_ratio_handler(struct ctl_table *table, int write,
 		struct file *filp, void __user *buffer, size_t *lenp,
 		loff_t *ppos);
Index: linux-2.6.25.4-rt4/kernel/lockdep.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/lockdep.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/lockdep.c	2008-05-29 09:47:29.000000000 -0400
@@ -39,6 +39,7 @@
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
 #include <linux/hash.h>
+#include <linux/ftrace.h>
 
 #include <asm/sections.h>
 
@@ -66,7 +67,7 @@ module_param(lock_stat, int, 0644);
  * to use a raw spinlock - we really dont want the spinlock
  * code to recurse back into the lockdep code...
  */
-static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static __raw_spinlock_t lockdep_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 
 static int graph_lock(void)
 {
@@ -81,6 +82,8 @@ static int graph_lock(void)
 		__raw_spin_unlock(&lockdep_lock);
 		return 0;
 	}
+	/* prevent any recursions within lockdep from causing deadlocks */
+	current->lockdep_recursion++;
 	return 1;
 }
 
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
 	if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
 		return DEBUG_LOCKS_WARN_ON(1);
 
+	current->lockdep_recursion--;
 	__raw_spin_unlock(&lockdep_lock);
 	return 0;
 }
@@ -508,7 +512,11 @@ static void print_lock(struct held_lock 
 
 static void lockdep_print_held_locks(struct task_struct *curr)
 {
-	int i, depth = curr->lockdep_depth;
+	int i, depth;
+
+	if (!curr)
+		curr = current;
+	depth = curr->lockdep_depth;
 
 	if (!depth) {
 		printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
@@ -573,7 +581,7 @@ static void print_lock_dependencies(stru
 
 static void print_kernel_version(void)
 {
-	printk("%s %.*s\n", init_utsname()->release,
+	printk("[ %s %.*s\n", init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
 }
@@ -813,6 +821,21 @@ out_unlock_set:
 	return class;
 }
 
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_TRACE_IRQFLAGS)
+
+#define RECURSION_LIMIT 40
+
+static int noinline print_infinite_recursion_bug(void)
+{
+	if (!debug_locks_off_graph_unlock())
+		return 0;
+
+	WARN_ON(1);
+
+	return 0;
+}
+#endif /* CONFIG_PROVE_LOCKING || CONFIG_TRACE_IRQFLAGS */
+
 #ifdef CONFIG_PROVE_LOCKING
 /*
  * Allocate a lockdep entry. (assumes the graph_lock held, returns
@@ -943,18 +966,6 @@ static noinline int print_circular_bug_t
 	return 0;
 }
 
-#define RECURSION_LIMIT 40
-
-static int noinline print_infinite_recursion_bug(void)
-{
-	if (!debug_locks_off_graph_unlock())
-		return 0;
-
-	WARN_ON(1);
-
-	return 0;
-}
-
 /*
  * Prove that the dependency graph starting at <entry> can not
  * lead to <target>. Print an error and return 0 if it does.
@@ -982,7 +993,7 @@ check_noncircular(struct lock_class *sou
 	return 1;
 }
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
  * Forwards and backwards subgraph searching, for the purposes of
  * proving that two subgraphs can be connected by a new dependency
@@ -1072,6 +1083,7 @@ find_usage_backwards(struct lock_class *
 	return 1;
 }
 
+#ifdef CONFIG_PROVE_LOCKING
 static int
 print_bad_irq_dependency(struct task_struct *curr,
 			 struct held_lock *prev,
@@ -1132,6 +1144,7 @@ print_bad_irq_dependency(struct task_str
 
 	return 0;
 }
+#endif /* CONFIG_PROVE_LOCKING */
 
 static int
 check_usage(struct task_struct *curr, struct held_lock *prev,
@@ -1680,7 +1693,7 @@ valid_state(struct task_struct *curr, st
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		     enum lock_usage_bit new_bit);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 
 /*
  * print irq inversion bug:
@@ -2013,11 +2026,12 @@ void early_boot_irqs_on(void)
 /*
  * Hardirqs will be enabled:
  */
-void trace_hardirqs_on(void)
+void trace_hardirqs_on_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 	unsigned long ip;
 
+	time_hardirqs_on(CALLER_ADDR0, a0);
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2055,16 +2069,23 @@ void trace_hardirqs_on(void)
 	curr->hardirq_enable_event = ++curr->irq_events;
 	debug_atomic_inc(&hardirqs_on_events);
 }
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
+void trace_hardirqs_on(void)
+{
+	trace_hardirqs_on_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 /*
  * Hardirqs were disabled:
  */
-void trace_hardirqs_off(void)
+void trace_hardirqs_off_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 
+	time_hardirqs_off(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2082,7 +2103,12 @@ void trace_hardirqs_off(void)
 	} else
 		debug_atomic_inc(&redundant_hardirqs_off);
 }
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
+void trace_hardirqs_off(void)
+{
+	trace_hardirqs_off_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 /*
@@ -2518,6 +2544,55 @@ static int check_unlock(struct task_stru
 	return 1;
 }
 
+static int
+__lock_set_subclass(struct lockdep_map *lock,
+		    unsigned int subclass, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class *class;
+	unsigned int depth;
+	int i;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (hlock->instance == lock)
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+	class = register_lock_class(lock, subclass, 0);
+	hlock->class = class;
+
+	curr->lockdep_depth = i;
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	for (; i < depth; i++) {
+		hlock = curr->held_locks + i;
+		if (!__lock_acquire(hlock->instance,
+			hlock->class->subclass, hlock->trylock,
+				hlock->read, hlock->check, hlock->hardirqs_off,
+				hlock->acquire_ip))
+			return 0;
+	}
+
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+		return 0;
+	return 1;
+}
+
 /*
  * Remove the lock to the list of currently held locks in a
  * potentially non-nested (out of order) manner. This is a
@@ -2681,6 +2756,29 @@ static void check_flags(unsigned long fl
 #endif
 }
 
+void
+lock_set_subclass(struct lockdep_map *lock,
+		  unsigned int subclass, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(!lock_stat && !prove_locking))
+		return;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	current->lockdep_recursion = 1;
+	check_flags(flags);
+	if (__lock_set_subclass(lock, subclass, ip))
+		check_chain_key(current);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_set_subclass);
+
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
@@ -2791,7 +2889,7 @@ found_it:
 
 	stats = get_lock_stats(hlock->class);
 	if (point < ARRAY_SIZE(stats->contention_point))
-		stats->contention_point[i]++;
+		stats->contention_point[point]++;
 	if (lock->cpu != smp_processor_id())
 		stats->bounces[bounce_contended + !!hlock->read]++;
 	put_lock_stats(stats);
@@ -3039,13 +3137,13 @@ void __init lockdep_info(void)
 {
 	printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
 
-	printk("... MAX_LOCKDEP_SUBCLASSES:    %lu\n", MAX_LOCKDEP_SUBCLASSES);
-	printk("... MAX_LOCK_DEPTH:          %lu\n", MAX_LOCK_DEPTH);
-	printk("... MAX_LOCKDEP_KEYS:        %lu\n", MAX_LOCKDEP_KEYS);
-	printk("... CLASSHASH_SIZE:           %lu\n", CLASSHASH_SIZE);
-	printk("... MAX_LOCKDEP_ENTRIES:     %lu\n", MAX_LOCKDEP_ENTRIES);
-	printk("... MAX_LOCKDEP_CHAINS:      %lu\n", MAX_LOCKDEP_CHAINS);
-	printk("... CHAINHASH_SIZE:          %lu\n", CHAINHASH_SIZE);
+	printk("... MAX_LOCKDEP_SUBCLASSES: %6lu\n", MAX_LOCKDEP_SUBCLASSES);
+	printk("... MAX_LOCK_DEPTH:         %6lu\n", MAX_LOCK_DEPTH);
+	printk("... MAX_LOCKDEP_KEYS:       %6lu\n", MAX_LOCKDEP_KEYS);
+	printk("... CLASSHASH_SIZE:         %6lu\n", CLASSHASH_SIZE);
+	printk("... MAX_LOCKDEP_ENTRIES:    %6lu\n", MAX_LOCKDEP_ENTRIES);
+	printk("... MAX_LOCKDEP_CHAINS:     %6lu\n", MAX_LOCKDEP_CHAINS);
+	printk("... CHAINHASH_SIZE:         %6lu\n", CHAINHASH_SIZE);
 
 	printk(" memory used by lock dependency info: %lu kB\n",
 		(sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
@@ -3216,6 +3314,8 @@ void __debug_show_held_locks(struct task
 		printk("INFO: lockdep is turned off.\n");
 		return;
 	}
+	if (task == current)
+		lockdep_print_held_locks(task);
 	lockdep_print_held_locks(task);
 }
 EXPORT_SYMBOL_GPL(__debug_show_held_locks);
Index: linux-2.6.25.4-rt4/kernel/sched_trace.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/sched_trace.h	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,41 @@
+#include <linux/marker.h>
+
+static inline void trace_kernel_sched_wait(struct task_struct *p)
+{
+	trace_mark(kernel_sched_wait_task, "pid %d state %ld",
+			p->pid, p->state);
+}
+
+static inline
+void trace_kernel_sched_wakeup(struct rq *rq, struct task_struct *p)
+{
+	trace_mark(kernel_sched_wakeup,
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			p->pid, p->state, rq, p, rq->curr);
+}
+
+static inline
+void trace_kernel_sched_wakeup_new(struct rq *rq, struct task_struct *p)
+{
+	trace_mark(kernel_sched_wakeup_new,
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			p->pid, p->state, rq, p, rq->curr);
+}
+
+static inline void trace_kernel_sched_switch(struct rq *rq,
+		struct task_struct *prev, struct task_struct *next)
+{
+	trace_mark(kernel_sched_schedule,
+			"prev_pid %d next_pid %d prev_state %ld "
+			"## rq %p prev %p next %p",
+			prev->pid, next->pid, prev->state,
+			rq, prev, next);
+}
+
+static inline void
+trace_kernel_sched_migrate_task(struct task_struct *p, int src, int dst)
+{
+	trace_mark(kernel_sched_migrate_task,
+			"pid %d state %ld dest_cpu %d",
+			p->pid, p->state, dst);
+}
Index: linux-2.6.25.4-rt4/kernel/sysctl.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/sysctl.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/sysctl.c	2008-05-29 09:47:25.000000000 -0400
@@ -45,6 +45,8 @@
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
+#include <linux/ftrace.h>
+#include <linux/profile.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -67,6 +69,7 @@ extern int sysctl_overcommit_ratio;
 extern int sysctl_panic_on_oom;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_oom_dump_tasks;
+extern int futex_performance_hack;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
@@ -149,6 +152,10 @@ static int parse_table(int __user *, int
 		void __user *, size_t, struct ctl_table *);
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+extern int rt_rwlock_limit;
+#endif
+
 
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
@@ -356,6 +363,74 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_FUTEX
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "futex_performance_hack",
+		.data		= &futex_performance_hack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "prof_pid",
+		.data		= &prof_pid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_PREEMPT
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "kernel_preemption",
+		.data		= &kernel_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "voluntary_preemption",
+		.data		= &voluntary_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT)
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "softirq_preemption",
+		.data		= &softirq_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_PREEMPT_RT)
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hardirq_preemption",
+		.data		= &hardirq_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_PREEMPT_RT
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "rwlock_reader_limit",
+		.data		= &rt_rwlock_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.ctl_name	= KERN_PANIC,
 		.procname	= "panic",
@@ -364,6 +439,16 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_GENERIC_HARDIRQS
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "debug_direct_keyboard",
+		.data		= &debug_direct_keyboard,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.ctl_name	= KERN_CORE_USES_PID,
 		.procname	= "core_uses_pid",
@@ -470,6 +555,16 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_FTRACE
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ftrace_enabled",
+		.data		= &ftrace_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &ftrace_enable_sysctl,
+	},
+#endif
 #ifdef CONFIG_KMOD
 	{
 		.ctl_name	= KERN_MODPROBE,
Index: linux-2.6.25.4-rt4/kernel/trace/Kconfig
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/Kconfig	2008-05-29 09:46:09.000000000 -0400
@@ -0,0 +1,165 @@
+#
+# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+#
+config HAVE_FTRACE
+	bool
+
+config HAVE_DYNAMIC_FTRACE
+	bool
+
+config TRACER_MAX_TRACE
+	bool
+
+config TRACING
+	bool
+	select DEBUG_FS
+	select STACKTRACE
+
+config FTRACE
+	bool "Kernel Function Tracer"
+	depends on HAVE_FTRACE
+	select FRAME_POINTER
+	select TRACING
+	select CONTEXT_SWITCH_TRACER
+	help
+	  Enable the kernel to trace every kernel function. This is done
+	  by using a compiler feature to insert a small, 5-byte No-Operation
+	  instruction to the beginning of every kernel function, which NOP
+	  sequence is then dynamically patched into a tracer call when
+	  tracing is enabled by the administrator. If it's runtime disabled
+	  (the bootup default), then the overhead of the instructions is very
+	  small and not measurable even in micro-benchmarks.
+
+config IRQSOFF_TRACER
+	bool "Interrupts-off Latency Tracer"
+	default n
+	depends on TRACE_IRQFLAGS_SUPPORT
+	depends on GENERIC_TIME
+	depends on HAVE_FTRACE
+	select TRACE_IRQFLAGS
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in irqs-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config PREEMPT_TRACER
+	bool "Preemption-off Latency Tracer"
+	default n
+	depends on GENERIC_TIME
+	depends on PREEMPT
+	depends on HAVE_FTRACE
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in preemption off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
+config SCHED_TRACER
+	bool "Scheduling Latency Tracer"
+	depends on HAVE_FTRACE
+	select TRACING
+	select CONTEXT_SWITCH_TRACER
+	select TRACER_MAX_TRACE
+	help
+	  This tracer tracks the latency of the highest priority task
+	  to be scheduled in, starting from the point it has woken up.
+
+config EVENT_TRACER
+	bool "trace kernel events"
+	select CONTEXT_SWITCH_TRACER
+	help
+	  This option activates the event tracer of the latency_tracer.
+	  It activates markers through out the kernel for tracing.
+	  This option has a fairly low overhead when enabled.
+
+config CONTEXT_SWITCH_TRACER
+	bool "Trace process context switches"
+	depends on HAVE_FTRACE
+	select TRACING
+	select MARKERS
+	help
+	  This tracer gets called from the context switch and records
+	  all switching of tasks.
+
+config DYNAMIC_FTRACE
+	bool "enable/disable ftrace tracepoints dynamically"
+	depends on FTRACE
+	depends on HAVE_DYNAMIC_FTRACE
+	default y
+	help
+         This option will modify all the calls to ftrace dynamically
+	 (will patch them out of the binary image and replaces them
+	 with a No-Op instruction) as they are called. A table is
+	 created to dynamically enable them again.
+
+	 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+	 has native performance as long as no tracing is active.
+
+	 The changes to the code are done by a kernel thread that
+	 wakes up once a second and checks to see if any ftrace calls
+	 were made. If so, it runs stop_machine (stops all CPUS)
+	 and modifies the code to jump over the call to ftrace.
+
+config FTRACE_SELFTEST
+	bool
+
+config FTRACE_STARTUP_TEST
+	bool "Perform a startup test on ftrace"
+	depends on TRACING
+	select FTRACE_SELFTEST
+	help
+	  This option performs a series of startup tests on ftrace. On bootup
+	  a series of tests are made to verify that the tracer is
+	  functioning properly. It will do tests on all the configured
+	  tracers of ftrace.
+
+config INTERRUPT_OFF_HIST
+	bool "Interrupts off critical timings histogram"
+	depends on IRQSOFF_TRACER
+	help
+	  This option uses the infrastructure of the critical
+	  irqs off timings to create a histogram of latencies.
+
+config PREEMPT_OFF_HIST
+	bool "Preempt off critical timings histogram"
+	depends on PREEMPT_TRACER
+	help
+	  This option uses the infrastructure of the critical
+	  preemption off timings to create a histogram of latencies.
+
+config WAKEUP_LATENCY_HIST
+	bool "Interrupts off critical timings histogram"
+	select TRACING
+	select MARKERS
+	help
+	  This option uses the infrastructure of the wakeup tracer
+	  to create a histogram of latencies.
+
+config PREEMPT_TRACE
+	bool "Keep a record of preempt disabled spots"
+	depends on DEBUG_KERNEL
+	depends on PREEMPT
+	select TRACING
+	help
+	  Keeps a record of the last 25 preempt disabled locations.
Index: linux-2.6.25.4-rt4/kernel/trace/Makefile
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/Makefile	2008-05-29 09:46:09.000000000 -0400
@@ -0,0 +1,30 @@
+
+# Do not instrument the tracer itself:
+
+ifdef CONFIG_FTRACE
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+
+# selftest needs instrumentation
+CFLAGS_trace_selftest_dynamic.o = -pg
+obj-y += trace_selftest_dynamic.o
+endif
+
+obj-$(CONFIG_FTRACE) += libftrace.o
+
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
+obj-$(CONFIG_EVENT_TRACER) += trace_events.o
+
+obj-$(CONFIG_INTERRUPT_OFF_HIST) += trace_hist.o
+obj-$(CONFIG_PREEMPT_OFF_HIST) += trace_hist.o
+obj-$(CONFIG_WAKEUP_LATENCY_HIST) += trace_hist.o
+
+obj-$(CONFIG_PREEMPT_TRACE) += preempt-trace.o
+
+libftrace-y := ftrace.o
Index: linux-2.6.25.4-rt4/kernel/trace/ftrace.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/ftrace.c	2008-05-29 09:46:34.000000000 -0400
@@ -0,0 +1,1553 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ *   Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/kthread.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+
+#include "trace.h"
+
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
+static int last_ftrace_enabled;
+
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */
+static int ftrace_disabled __read_mostly;
+
+static DEFINE_RAW_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_sysctl_lock);
+
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+	.func = ftrace_stub,
+};
+
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+
+void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_ops *op = ftrace_list;
+
+	/* in case someone actually ports this to alpha! */
+	read_barrier_depends();
+
+	while (op != &ftrace_list_end) {
+		/* silly alpha */
+		read_barrier_depends();
+		op->func(ip, parent_ip);
+		op = op->next;
+	};
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing.  There may be lag
+ */
+void clear_ftrace_function(void)
+{
+	ftrace_trace_function = ftrace_stub;
+}
+
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+	/* Should never be called by interrupts */
+	spin_lock(&ftrace_lock);
+
+	ops->next = ftrace_list;
+	/*
+	 * We are entering ops into the ftrace_list but another
+	 * CPU might be walking that list. We need to make sure
+	 * the ops->next pointer is valid before another CPU sees
+	 * the ops pointer included into the ftrace_list.
+	 */
+	smp_wmb();
+	ftrace_list = ops;
+
+	if (ftrace_enabled) {
+		/*
+		 * For one func, simply call it directly.
+		 * For more than one func, call the chain.
+		 */
+		if (ops->next == &ftrace_list_end)
+			ftrace_trace_function = ops->func;
+		else
+			ftrace_trace_function = ftrace_list_func;
+	}
+
+	spin_unlock(&ftrace_lock);
+
+	return 0;
+}
+
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	struct ftrace_ops **p;
+	int ret = 0;
+
+	spin_lock(&ftrace_lock);
+
+	/*
+	 * If we are removing the last function, then simply point
+	 * to the ftrace_stub.
+	 */
+	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+		ftrace_trace_function = ftrace_stub;
+		ftrace_list = &ftrace_list_end;
+		goto out;
+	}
+
+	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+		if (*p == ops)
+			break;
+
+	if (*p != ops) {
+		ret = -1;
+		goto out;
+	}
+
+	*p = (*p)->next;
+
+	if (ftrace_enabled) {
+		/* If we only have one func left, then call that directly */
+		if (ftrace_list == &ftrace_list_end ||
+		    ftrace_list->next == &ftrace_list_end)
+			ftrace_trace_function = ftrace_list->func;
+	}
+
+ out:
+	spin_unlock(&ftrace_lock);
+
+	return ret;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static struct task_struct *ftraced_task;
+
+enum {
+	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_ENABLE_MCOUNT		= (1 << 3),
+	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+};
+
+static int ftrace_filtered;
+
+static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
+
+static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
+
+static DEFINE_RAW_SPINLOCK(ftrace_shutdown_lock);
+static DEFINE_MUTEX(ftraced_lock);
+static DEFINE_MUTEX(ftrace_regex_lock);
+
+struct ftrace_page {
+	struct ftrace_page	*next;
+	unsigned long		index;
+	struct dyn_ftrace	records[];
+};
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
+static int ftraced_trigger;
+static int ftraced_suspend;
+static int ftraced_stop;
+
+static int ftrace_record_suspend;
+
+static struct dyn_ftrace *ftrace_free_records;
+
+static inline int
+ftrace_ip_in_hash(unsigned long ip, unsigned long key)
+{
+	struct dyn_ftrace *p;
+	struct hlist_node *t;
+	int found = 0;
+
+	hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
+		if (p->ip == ip) {
+			found = 1;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static inline void
+ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
+{
+	hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
+}
+
+static void ftrace_free_rec(struct dyn_ftrace *rec)
+{
+	/* no locking, only called from kstop_machine */
+
+	rec->ip = (unsigned long)ftrace_free_records;
+	ftrace_free_records = rec;
+	rec->flags |= FTRACE_FL_FREE;
+}
+
+static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
+{
+	struct dyn_ftrace *rec;
+
+	/* First check for freed records */
+	if (ftrace_free_records) {
+		rec = ftrace_free_records;
+
+		if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
+			WARN_ON_ONCE(1);
+			ftrace_free_records = NULL;
+			ftrace_disabled = 1;
+			ftrace_enabled = 0;
+			return NULL;
+		}
+
+		ftrace_free_records = (void *)rec->ip;
+		memset(rec, 0, sizeof(*rec));
+		return rec;
+	}
+
+	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+		if (!ftrace_pages->next)
+			return NULL;
+		ftrace_pages = ftrace_pages->next;
+	}
+
+	return &ftrace_pages->records[ftrace_pages->index++];
+}
+
+static void
+ftrace_record_ip(unsigned long ip)
+{
+	struct dyn_ftrace *node;
+	unsigned long flags;
+	unsigned long key;
+	int resched;
+	int atomic;
+	int cpu;
+
+	if (!ftrace_enabled || ftrace_disabled)
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	/*
+	 * We simply need to protect against recursion.
+	 * Use the the raw version of smp_processor_id and not
+	 * __get_cpu_var which can call debug hooks that can
+	 * cause a recursive crash here.
+	 */
+	cpu = raw_smp_processor_id();
+	per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
+	if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
+		goto out;
+
+	if (unlikely(ftrace_record_suspend))
+		goto out;
+
+	key = hash_long(ip, FTRACE_HASHBITS);
+
+	WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
+
+	if (ftrace_ip_in_hash(ip, key))
+		goto out;
+
+	atomic = irqs_disabled();
+
+	spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+
+	/* This ip may have hit the hash before the lock */
+	if (ftrace_ip_in_hash(ip, key))
+		goto out_unlock;
+
+	/*
+	 * There's a slight race that the ftraced will update the
+	 * hash and reset here. If it is already converted, skip it.
+	 */
+	if (ftrace_ip_converted(ip))
+		goto out_unlock;
+
+	node = ftrace_alloc_dyn_node(ip);
+	if (!node)
+		goto out_unlock;
+
+	node->ip = ip;
+
+	ftrace_add_hash(node, key);
+
+	ftraced_trigger = 1;
+
+ out_unlock:
+	spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ out:
+	per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
+
+	/* prevent recursion with scheduler */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+#define FTRACE_ADDR ((long)(ftrace_caller))
+#define MCOUNT_ADDR ((long)(mcount))
+
+static void
+__ftrace_replace_code(struct dyn_ftrace *rec,
+		      unsigned char *old, unsigned char *new, int enable)
+{
+	unsigned long ip, fl;
+	int failed;
+
+	ip = rec->ip;
+
+	if (ftrace_filtered && enable) {
+		/*
+		 * If filtering is on:
+		 *
+		 * If this record is set to be filtered and
+		 * is enabled then do nothing.
+		 *
+		 * If this record is set to be filtered and
+		 * it is not enabled, enable it.
+		 *
+		 * If this record is not set to be filtered
+		 * and it is not enabled do nothing.
+		 *
+		 * If this record is set not to trace then
+		 * do nothing.
+		 *
+		 * If this record is not set to be filtered and
+		 * it is enabled, disable it.
+		 */
+		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
+
+		if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
+		    (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE))
+			return;
+
+		/*
+		 * If it is enabled disable it,
+		 * otherwise enable it!
+		 */
+		if (fl == FTRACE_FL_ENABLED) {
+			/* swap new and old */
+			new = old;
+			old = ftrace_call_replace(ip, FTRACE_ADDR);
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		} else {
+			new = ftrace_call_replace(ip, FTRACE_ADDR);
+			rec->flags |= FTRACE_FL_ENABLED;
+		}
+	} else {
+
+		if (enable) {
+			/*
+			 * If this record is set not to trace and is
+			 * not enabled, do nothing.
+			 */
+			fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
+			if (fl == FTRACE_FL_NOTRACE)
+				return;
+
+			new = ftrace_call_replace(ip, FTRACE_ADDR);
+		} else
+			old = ftrace_call_replace(ip, FTRACE_ADDR);
+
+		if (enable) {
+			if (rec->flags & FTRACE_FL_ENABLED)
+				return;
+			rec->flags |= FTRACE_FL_ENABLED;
+		} else {
+			if (!(rec->flags & FTRACE_FL_ENABLED))
+				return;
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		}
+	}
+
+	failed = ftrace_modify_code(ip, old, new);
+	if (failed) {
+		unsigned long key;
+		/* It is possible that the function hasn't been converted yet */
+		key = hash_long(ip, FTRACE_HASHBITS);
+		if (!ftrace_ip_in_hash(ip, key)) {
+			rec->flags |= FTRACE_FL_FAILED;
+			ftrace_free_rec(rec);
+		}
+
+	}
+}
+
+static void ftrace_replace_code(int enable)
+{
+	unsigned char *new = NULL, *old = NULL;
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	int i;
+
+	if (enable)
+		old = ftrace_nop_replace();
+	else
+		new = ftrace_nop_replace();
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+
+			/* don't modify code that has already faulted */
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+
+			__ftrace_replace_code(rec, old, new, enable);
+		}
+	}
+}
+
+static void ftrace_shutdown_replenish(void)
+{
+	if (ftrace_pages->next)
+		return;
+
+	/* allocate another page */
+	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+static int
+ftrace_code_disable(struct dyn_ftrace *rec)
+{
+	unsigned long ip;
+	unsigned char *nop, *call;
+	int failed;
+
+	ip = rec->ip;
+
+	nop = ftrace_nop_replace();
+	call = ftrace_call_replace(ip, MCOUNT_ADDR);
+
+	failed = ftrace_modify_code(ip, call, nop);
+	if (failed) {
+		rec->flags |= FTRACE_FL_FAILED;
+		ftrace_free_rec(rec);
+		return 0;
+	}
+	return 1;
+}
+
+static int __ftrace_update_code(void *ignore);
+
+static int __ftrace_modify_code(void *data)
+{
+	unsigned long addr;
+	int *command = data;
+
+	if (*command & FTRACE_ENABLE_CALLS) {
+		/*
+		 * Update any recorded ips now that we have the
+		 * machine stopped
+		 */
+		__ftrace_update_code(NULL);
+		ftrace_replace_code(1);
+	} else if (*command & FTRACE_DISABLE_CALLS)
+		ftrace_replace_code(0);
+
+	if (*command & FTRACE_UPDATE_TRACE_FUNC)
+		ftrace_update_ftrace_func(ftrace_trace_function);
+
+	if (*command & FTRACE_ENABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_record_ip;
+		ftrace_mcount_set(&addr);
+	} else if (*command & FTRACE_DISABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_stub;
+		ftrace_mcount_set(&addr);
+	}
+
+	return 0;
+}
+
+static void ftrace_run_update_code(int command)
+{
+	stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
+}
+
+void ftrace_disable_daemon(void)
+{
+	/* Stop the daemon from calling kstop_machine */
+	mutex_lock(&ftraced_lock);
+	ftraced_stop = 1;
+	mutex_unlock(&ftraced_lock);
+
+	ftrace_force_update();
+}
+
+void ftrace_enable_daemon(void)
+{
+	mutex_lock(&ftraced_lock);
+	ftraced_stop = 0;
+	mutex_unlock(&ftraced_lock);
+
+	ftrace_force_update();
+}
+
+static ftrace_func_t saved_ftrace_func;
+
+static void ftrace_startup(void)
+{
+	int command = 0;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend++;
+	if (ftraced_suspend == 1)
+		command |= FTRACE_ENABLE_CALLS;
+
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
+		goto out;
+
+	ftrace_run_update_code(command);
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_shutdown(void)
+{
+	int command = 0;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend--;
+	if (!ftraced_suspend)
+		command |= FTRACE_DISABLE_CALLS;
+
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
+		goto out;
+
+	ftrace_run_update_code(command);
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_startup_sysctl(void)
+{
+	int command = FTRACE_ENABLE_MCOUNT;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	/* Force update next time */
+	saved_ftrace_func = NULL;
+	/* ftraced_suspend is true if we want ftrace running */
+	if (ftraced_suspend)
+		command |= FTRACE_ENABLE_CALLS;
+
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+	int command = FTRACE_DISABLE_MCOUNT;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftraced_lock);
+	/* ftraced_suspend is true if ftrace is running */
+	if (ftraced_suspend)
+		command |= FTRACE_DISABLE_CALLS;
+
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+}
+
+static cycle_t		ftrace_update_time;
+static unsigned long	ftrace_update_cnt;
+unsigned long		ftrace_update_tot_cnt;
+
+static int __ftrace_update_code(void *ignore)
+{
+	struct dyn_ftrace *p;
+	struct hlist_head head;
+	struct hlist_node *t;
+	int save_ftrace_enabled;
+	cycle_t start, stop;
+	int i;
+
+	/* Don't be recording funcs now */
+	ftrace_record_suspend++;
+	save_ftrace_enabled = ftrace_enabled;
+	ftrace_enabled = 0;
+
+	start = ftrace_now(raw_smp_processor_id());
+	ftrace_update_cnt = 0;
+
+	/* No locks needed, the machine is stopped! */
+	for (i = 0; i < FTRACE_HASHSIZE; i++) {
+		if (hlist_empty(&ftrace_hash[i]))
+			continue;
+
+		head = ftrace_hash[i];
+		INIT_HLIST_HEAD(&ftrace_hash[i]);
+
+		/* all CPUS are stopped, we are safe to modify code */
+		hlist_for_each_entry(p, t, &head, node) {
+			if (ftrace_code_disable(p))
+				ftrace_update_cnt++;
+		}
+
+	}
+
+	stop = ftrace_now(raw_smp_processor_id());
+	ftrace_update_time = stop - start;
+	ftrace_update_tot_cnt += ftrace_update_cnt;
+	ftraced_trigger = 0;
+
+	ftrace_enabled = save_ftrace_enabled;
+	ftrace_record_suspend--;
+
+	return 0;
+}
+
+static int ftrace_update_code(void)
+{
+	if (unlikely(ftrace_disabled) ||
+	    !ftrace_enabled || !ftraced_trigger)
+		return 0;
+
+	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+
+	return 1;
+}
+
+static int ftraced(void *ignore)
+{
+	unsigned long usecs;
+
+	while (!kthread_should_stop()) {
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* check once a second */
+		schedule_timeout(HZ);
+
+		if (unlikely(ftrace_disabled))
+			continue;
+
+		mutex_lock(&ftrace_sysctl_lock);
+		mutex_lock(&ftraced_lock);
+		if (!ftraced_suspend && !ftraced_stop &&
+		    ftrace_update_code()) {
+			usecs = nsecs_to_usecs(ftrace_update_time);
+			if (ftrace_update_tot_cnt > 100000) {
+				ftrace_update_tot_cnt = 0;
+				pr_info("hm, dftrace overflow: %lu change%s"
+					" (%lu total) in %lu usec%s\n",
+					ftrace_update_cnt,
+					ftrace_update_cnt != 1 ? "s" : "",
+					ftrace_update_tot_cnt,
+					usecs, usecs != 1 ? "s" : "");
+				ftrace_disabled = 1;
+				WARN_ON_ONCE(1);
+			}
+		}
+		mutex_unlock(&ftraced_lock);
+		mutex_unlock(&ftrace_sysctl_lock);
+
+		ftrace_shutdown_replenish();
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __init ftrace_dyn_table_alloc(void)
+{
+	struct ftrace_page *pg;
+	int cnt;
+	int i;
+
+	/* allocate a few pages */
+	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ftrace_pages_start)
+		return -1;
+
+	/*
+	 * Allocate a few more pages.
+	 *
+	 * TODO: have some parser search vmlinux before
+	 *   final linking to find all calls to ftrace.
+	 *   Then we can:
+	 *    a) know how many pages to allocate.
+	 *     and/or
+	 *    b) set up the table then.
+	 *
+	 *  The dynamic code is still necessary for
+	 *  modules.
+	 */
+
+	pg = ftrace_pages = ftrace_pages_start;
+
+	cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+
+	for (i = 0; i < cnt; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+
+		/* If we fail, we'll try later anyway */
+		if (!pg->next)
+			break;
+
+		pg = pg->next;
+	}
+
+	return 0;
+}
+
+enum {
+	FTRACE_ITER_FILTER	= (1 << 0),
+	FTRACE_ITER_CONT	= (1 << 1),
+	FTRACE_ITER_NOTRACE	= (1 << 2),
+};
+
+#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
+
+struct ftrace_iterator {
+	loff_t			pos;
+	struct ftrace_page	*pg;
+	unsigned		idx;
+	unsigned		flags;
+	unsigned char		buffer[FTRACE_BUFF_MAX+1];
+	unsigned		buffer_idx;
+	unsigned		filtered;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	struct dyn_ftrace *rec = NULL;
+
+	(*pos)++;
+
+ retry:
+	if (iter->idx >= iter->pg->index) {
+		if (iter->pg->next) {
+			iter->pg = iter->pg->next;
+			iter->idx = 0;
+			goto retry;
+		}
+	} else {
+		rec = &iter->pg->records[iter->idx++];
+		if ((rec->flags & FTRACE_FL_FAILED) ||
+		    ((iter->flags & FTRACE_ITER_FILTER) &&
+		     !(rec->flags & FTRACE_FL_FILTER)) ||
+		    ((iter->flags & FTRACE_ITER_NOTRACE) &&
+		     !(rec->flags & FTRACE_FL_NOTRACE))) {
+			rec = NULL;
+			goto retry;
+		}
+	}
+
+	iter->pos = *pos;
+
+	return rec;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l = -1;
+
+	if (*pos != iter->pos) {
+		for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
+			;
+	} else {
+		l = *pos;
+		p = t_next(m, p, &l);
+	}
+
+	return p;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct dyn_ftrace *rec = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	if (!rec)
+		return 0;
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+	seq_printf(m, "%s\n", str);
+
+	return 0;
+}
+
+static struct seq_operations show_ftrace_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.stop = t_stop,
+	.show = t_show,
+};
+
+static int
+ftrace_avail_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_iterator *iter;
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	iter->pg = ftrace_pages_start;
+	iter->pos = -1;
+
+	ret = seq_open(file, &show_ftrace_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+
+		m->private = iter;
+	} else {
+		kfree(iter);
+	}
+
+	return ret;
+}
+
+int ftrace_avail_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ftrace_iterator *iter = m->private;
+
+	seq_release(inode, file);
+	kfree(iter);
+
+	return 0;
+}
+
+static void ftrace_filter_reset(int enable)
+{
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned i;
+
+	/* keep kstop machine from running */
+	preempt_disable();
+	if (enable)
+		ftrace_filtered = 0;
+	pg = ftrace_pages_start;
+	while (pg) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+			rec->flags &= ~type;
+		}
+		pg = pg->next;
+	}
+	preempt_enable();
+}
+
+static int
+ftrace_regex_open(struct inode *inode, struct file *file, int enable)
+{
+	struct ftrace_iterator *iter;
+	int ret = 0;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	mutex_lock(&ftrace_regex_lock);
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND))
+		ftrace_filter_reset(enable);
+
+	if (file->f_mode & FMODE_READ) {
+		iter->pg = ftrace_pages_start;
+		iter->pos = -1;
+		iter->flags = enable ? FTRACE_ITER_FILTER :
+			FTRACE_ITER_NOTRACE;
+
+		ret = seq_open(file, &show_ftrace_seq_ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = iter;
+		} else
+			kfree(iter);
+	} else
+		file->private_data = iter;
+	mutex_unlock(&ftrace_regex_lock);
+
+	return ret;
+}
+
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(inode, file, 1);
+}
+
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(inode, file, 0);
+}
+
+static ssize_t
+ftrace_regex_read(struct file *file, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	if (file->f_mode & FMODE_READ)
+		return seq_read(file, ubuf, cnt, ppos);
+	else
+		return -EPERM;
+}
+
+static loff_t
+ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t ret;
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_lseek(file, offset, origin);
+	else
+		file->f_pos = ret = 1;
+
+	return ret;
+}
+
+enum {
+	MATCH_FULL,
+	MATCH_FRONT_ONLY,
+	MATCH_MIDDLE_ONLY,
+	MATCH_END_ONLY,
+};
+
+static void
+ftrace_match(unsigned char *buff, int len, int enable)
+{
+	char str[KSYM_SYMBOL_LEN];
+	char *search = NULL;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	int type = MATCH_FULL;
+	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned i, match = 0, search_len = 0;
+
+	for (i = 0; i < len; i++) {
+		if (buff[i] == '*') {
+			if (!i) {
+				search = buff + i + 1;
+				type = MATCH_END_ONLY;
+				search_len = len - (i + 1);
+			} else {
+				if (type == MATCH_END_ONLY) {
+					type = MATCH_MIDDLE_ONLY;
+				} else {
+					match = i;
+					type = MATCH_FRONT_ONLY;
+				}
+				buff[i] = 0;
+				break;
+			}
+		}
+	}
+
+	/* keep kstop machine from running */
+	preempt_disable();
+	if (enable)
+		ftrace_filtered = 1;
+	pg = ftrace_pages_start;
+	while (pg) {
+		for (i = 0; i < pg->index; i++) {
+			int matched = 0;
+			char *ptr;
+
+			rec = &pg->records[i];
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+			switch (type) {
+			case MATCH_FULL:
+				if (strcmp(str, buff) == 0)
+					matched = 1;
+				break;
+			case MATCH_FRONT_ONLY:
+				if (memcmp(str, buff, match) == 0)
+					matched = 1;
+				break;
+			case MATCH_MIDDLE_ONLY:
+				if (strstr(str, search))
+					matched = 1;
+				break;
+			case MATCH_END_ONLY:
+				ptr = strstr(str, search);
+				if (ptr && (ptr[search_len] == 0))
+					matched = 1;
+				break;
+			}
+			if (matched)
+				rec->flags |= flag;
+		}
+		pg = pg->next;
+	}
+	preempt_enable();
+}
+
+static ssize_t
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos, int enable)
+{
+	struct ftrace_iterator *iter;
+	char ch;
+	size_t read = 0;
+	ssize_t ret;
+
+	if (!cnt || cnt < 0)
+		return 0;
+
+	mutex_lock(&ftrace_regex_lock);
+
+	if (file->f_mode & FMODE_READ) {
+		struct seq_file *m = file->private_data;
+		iter = m->private;
+	} else
+		iter = file->private_data;
+
+	if (!*ppos) {
+		iter->flags &= ~FTRACE_ITER_CONT;
+		iter->buffer_idx = 0;
+	}
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		goto out;
+	read++;
+	cnt--;
+
+	if (!(iter->flags & ~FTRACE_ITER_CONT)) {
+		/* skip white space */
+		while (cnt && isspace(ch)) {
+			ret = get_user(ch, ubuf++);
+			if (ret)
+				goto out;
+			read++;
+			cnt--;
+		}
+
+		if (isspace(ch)) {
+			file->f_pos += read;
+			ret = read;
+			goto out;
+		}
+
+		iter->buffer_idx = 0;
+	}
+
+	while (cnt && !isspace(ch)) {
+		if (iter->buffer_idx < FTRACE_BUFF_MAX)
+			iter->buffer[iter->buffer_idx++] = ch;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+
+	if (isspace(ch)) {
+		iter->filtered++;
+		iter->buffer[iter->buffer_idx] = 0;
+		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+		iter->buffer_idx = 0;
+	} else
+		iter->flags |= FTRACE_ITER_CONT;
+
+
+	file->f_pos += read;
+
+	ret = read;
+ out:
+	mutex_unlock(&ftrace_regex_lock);
+
+	return ret;
+}
+
+static ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
+}
+
+static ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
+}
+
+static void
+ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+{
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftrace_regex_lock);
+	if (reset)
+		ftrace_filter_reset(enable);
+	if (buf)
+		ftrace_match(buf, len, enable);
+	mutex_unlock(&ftrace_regex_lock);
+}
+
+/**
+ * ftrace_set_filter - set a function to filter on in ftrace
+ * @buf - the string that holds the function filter text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ */
+void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+	ftrace_set_regex(buf, len, reset, 1);
+}
+
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace Filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and reset is set, all functions will be enabled
+ * for tracing.
+ */
+void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+{
+	ftrace_set_regex(buf, len, reset, 0);
+}
+
+static int
+ftrace_regex_release(struct inode *inode, struct file *file, int enable)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ftrace_iterator *iter;
+
+	mutex_lock(&ftrace_regex_lock);
+	if (file->f_mode & FMODE_READ) {
+		iter = m->private;
+
+		seq_release(inode, file);
+	} else
+		iter = file->private_data;
+
+	if (iter->buffer_idx) {
+		iter->filtered++;
+		iter->buffer[iter->buffer_idx] = 0;
+		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+	}
+
+	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftraced_lock);
+	if (iter->filtered && ftraced_suspend && ftrace_enabled)
+		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+	mutex_unlock(&ftraced_lock);
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	kfree(iter);
+	mutex_unlock(&ftrace_regex_lock);
+	return 0;
+}
+
+static int
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_release(inode, file, 1);
+}
+
+static int
+ftrace_notrace_release(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_release(inode, file, 0);
+}
+
+static ssize_t
+ftraced_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	/* don't worry about races */
+	char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
+	int r = strlen(buf);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+ftraced_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	if (strncmp(buf, "enable", 6) == 0)
+		val = 1;
+	else if (strncmp(buf, "disable", 7) == 0)
+		val = 0;
+	else {
+		buf[cnt] = 0;
+
+		ret = strict_strtoul(buf, 10, &val);
+		if (ret < 0)
+			return ret;
+
+		val = !!val;
+	}
+
+	if (val)
+		ftrace_enable_daemon();
+	else
+		ftrace_disable_daemon();
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static struct file_operations ftrace_avail_fops = {
+	.open = ftrace_avail_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = ftrace_avail_release,
+};
+
+static struct file_operations ftrace_filter_fops = {
+	.open = ftrace_filter_open,
+	.read = ftrace_regex_read,
+	.write = ftrace_filter_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_filter_release,
+};
+
+static struct file_operations ftrace_notrace_fops = {
+	.open = ftrace_notrace_open,
+	.read = ftrace_regex_read,
+	.write = ftrace_notrace_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_notrace_release,
+};
+
+static struct file_operations ftraced_fops = {
+	.open = tracing_open_generic,
+	.read = ftraced_read,
+	.write = ftraced_write,
+};
+
+/**
+ * ftrace_force_update - force an update to all recording ftrace functions
+ */
+int ftrace_force_update(void)
+{
+	int ret = 0;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftraced_lock);
+
+	/*
+	 * If ftraced_trigger is not set, then there is nothing
+	 * to update.
+	 */
+	if (ftraced_trigger && !ftrace_update_code())
+		ret = -EBUSY;
+
+	mutex_unlock(&ftraced_lock);
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+static void ftrace_force_shutdown(void)
+{
+	struct task_struct *task;
+	int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
+
+	mutex_lock(&ftraced_lock);
+	task = ftraced_task;
+	ftraced_task = NULL;
+	ftraced_suspend = -1;
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+
+	if (task)
+		kthread_stop(task);
+}
+
+static __init int ftrace_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("available_filter_functions", 0444,
+				    d_tracer, NULL, &ftrace_avail_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'available_filter_functions' entry\n");
+
+	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
+				    NULL, &ftrace_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_filter' entry\n");
+
+	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+				    NULL, &ftrace_notrace_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_notrace' entry\n");
+
+	entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
+				    NULL, &ftraced_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ftraced_enabled' entry\n");
+	return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
+
+static int __init ftrace_dynamic_init(void)
+{
+	struct task_struct *p;
+	unsigned long addr;
+	int ret;
+
+	addr = (unsigned long)ftrace_record_ip;
+
+	stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+
+	/* ftrace_dyn_arch_init places the return code in addr */
+	if (addr) {
+		ret = (int)addr;
+		goto failed;
+	}
+
+	ret = ftrace_dyn_table_alloc();
+	if (ret)
+		goto failed;
+
+	p = kthread_run(ftraced, NULL, "ftraced");
+	if (IS_ERR(p)) {
+		ret = -1;
+		goto failed;
+	}
+
+	last_ftrace_enabled = ftrace_enabled = 1;
+	ftraced_task = p;
+
+	return 0;
+
+ failed:
+	ftrace_disabled = 1;
+	return ret;
+}
+
+core_initcall(ftrace_dynamic_init);
+#else
+# define ftrace_startup()		do { } while (0)
+# define ftrace_shutdown()		do { } while (0)
+# define ftrace_startup_sysctl()	do { } while (0)
+# define ftrace_shutdown_sysctl()	do { } while (0)
+# define ftrace_force_shutdown()	do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/**
+ * ftrace_kill - totally shutdown ftrace
+ *
+ * This is a safety measure. If something was detected that seems
+ * wrong, calling this function will keep ftrace from doing
+ * any more modifications, and updates.
+ * used when something went wrong.
+ */
+void ftrace_kill(void)
+{
+	mutex_lock(&ftrace_sysctl_lock);
+	ftrace_disabled = 1;
+	ftrace_enabled = 0;
+
+	clear_ftrace_function();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	/* Try to totally disable ftrace */
+	ftrace_force_shutdown();
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -1;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	ret = __register_ftrace_function(ops);
+	ftrace_startup();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+/**
+ * unregister_ftrace_function - unresgister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	int ret;
+
+	mutex_lock(&ftrace_sysctl_lock);
+	ret = __unregister_ftrace_function(ops);
+	ftrace_shutdown();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+		     struct file *file, void __user *buffer, size_t *lenp,
+		     loff_t *ppos)
+{
+	int ret;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	mutex_lock(&ftrace_sysctl_lock);
+
+	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+
+	if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+		goto out;
+
+	last_ftrace_enabled = ftrace_enabled;
+
+	if (ftrace_enabled) {
+
+		ftrace_startup_sysctl();
+
+		/* we are starting ftrace again */
+		if (ftrace_list != &ftrace_list_end) {
+			if (ftrace_list->next == &ftrace_list_end)
+				ftrace_trace_function = ftrace_list->func;
+			else
+				ftrace_trace_function = ftrace_list_func;
+		}
+
+	} else {
+		/* stopping ftrace calls (just send to ftrace_stub) */
+		ftrace_trace_function = ftrace_stub;
+
+		ftrace_shutdown_sysctl();
+	}
+
+ out:
+	mutex_unlock(&ftrace_sysctl_lock);
+	return ret;
+}
Index: linux-2.6.25.4-rt4/kernel/trace/trace.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace.c	2008-05-29 09:47:28.000000000 -0400
@@ -0,0 +1,3460 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally taken from the RT patch by:
+ *    Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/utsrelease.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pagemap.h>
+#include <linux/hardirq.h>
+#include <linux/linkage.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+
+#include <linux/stacktrace.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/unistd.h>
+
+#include "trace.h"
+
+unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly	tracing_thresh;
+
+static unsigned long __read_mostly	tracing_nr_buffers;
+static cpumask_t __read_mostly		tracing_buffer_mask;
+
+#define for_each_cpu_mask_nr(cpu, mask) for_each_cpu_mask(cpu, mask)
+#define for_each_tracing_cpu(cpu)	\
+	for_each_cpu_mask_nr(cpu, tracing_buffer_mask)
+
+/* dummy trace to disable tracing */
+static struct tracer no_tracer __read_mostly = {
+	.name		= "none",
+};
+
+static int trace_alloc_page(void);
+static int trace_free_page(void);
+
+static int tracing_disabled = 1;
+
+static unsigned long tracing_pages_allocated;
+
+long
+ns2usecs(cycle_t nsec)
+{
+	nsec += 500;
+	do_div(nsec, 1000);
+	return nsec;
+}
+
+cycle_t ftrace_now(int cpu)
+{
+//	return cpu_clock(cpu);
+	return sched_clock();
+}
+
+/*
+ * The global_trace is the descriptor that holds the tracing
+ * buffers for the live tracing. For each CPU, it contains
+ * a link list of pages that will store trace entries. The
+ * page descriptor of the pages in the memory is used to hold
+ * the link list by linking the lru item in the page descriptor
+ * to each of the pages in the buffer per CPU.
+ *
+ * For each active CPU there is a data field that holds the
+ * pages for the buffer for that CPU. Each CPU has the same number
+ * of pages allocated for its buffer.
+ */
+static struct trace_array	global_trace;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+
+/*
+ * The max_tr is used to snapshot the global_trace when a maximum
+ * latency is reached. Some tracers will use this to store a maximum
+ * trace while it continues examining live traces.
+ *
+ * The buffers for the max_tr are set up the same as the global_trace.
+ * When a snapshot is taken, the link list of the max_tr is swapped
+ * with the link list of the global_trace and the buffers are reset for
+ * the global_trace so the tracing can continue.
+ */
+static struct trace_array	max_tr;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+
+/* tracer_enabled is used to toggle activation of a tracer */
+static int			tracer_enabled = 1;
+
+/*
+ * trace_nr_entries is the number of entries that is allocated
+ * for a buffer. Note, the number of entries is always rounded
+ * to ENTRIES_PER_PAGE.
+ */
+static unsigned long		trace_nr_entries = 65536UL;
+
+/* trace_types holds a link list of available tracers. */
+static struct tracer		*trace_types __read_mostly;
+
+/* current_trace points to the tracer that is currently active */
+static struct tracer		*current_trace __read_mostly;
+
+/*
+ * max_tracer_type_len is used to simplify the allocating of
+ * buffers to read userspace tracer names. We keep track of
+ * the longest tracer name registered.
+ */
+static int			max_tracer_type_len;
+
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ * This lock is also used to keep user access serialized.
+ * Accesses from userspace will grab this lock while userspace
+ * activities happen inside the kernel.
+ */
+static DEFINE_MUTEX(trace_types_lock);
+
+/* trace_wait is a waitqueue for tasks blocked on trace_poll */
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+
+/* trace_flags holds iter_ctrl options */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+
+/**
+ * trace_wake_up - wake up tasks waiting for trace input
+ *
+ * Simply wakes up any task that is blocked on the trace_wait
+ * queue. These is used with trace_poll for tasks polling the trace.
+ */
+void trace_wake_up(void)
+{
+	/*
+	 * The runqueue_is_locked() can fail, but this is the best we
+	 * have for now:
+	 */
+	if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
+		wake_up(&trace_wait);
+}
+
+#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
+
+static int __init set_nr_entries(char *str)
+{
+	unsigned long nr_entries;
+	int ret;
+
+	if (!str)
+		return 0;
+	ret = strict_strtoul(str, 0, &nr_entries);
+	/* nr_entries can not be zero */
+	if (ret < 0 || nr_entries == 0)
+		return 0;
+	trace_nr_entries = nr_entries;
+	return 1;
+}
+__setup("trace_entries=", set_nr_entries);
+
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+	return nsecs / 1000;
+}
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ *  IRQS_OFF	- interrupts were disabled
+ *  NEED_RESCED - reschedule is requested
+ *  HARDIRQ	- inside an interrupt handler
+ *  SOFTIRQ	- inside a softirq handler
+ */
+enum trace_flag_type {
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_NEED_RESCHED		= 0x02,
+	TRACE_FLAG_HARDIRQ		= 0x04,
+	TRACE_FLAG_SOFTIRQ		= 0x08,
+};
+
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
+/* These must match the bit postions in trace_iterator_flags */
+static const char *trace_options[] = {
+	"print-parent",
+	"sym-offset",
+	"sym-addr",
+	"verbose",
+	"raw",
+	"hex",
+	"bin",
+	"block",
+	"stacktrace",
+	"sched-tree",
+	NULL
+};
+
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ */
+static __raw_spinlock_t ftrace_max_lock =
+	(__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /debugfs/tracing/latency_trace)
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+
+	max_tr.cpu = cpu;
+	max_tr.time_start = data->preempt_timestamp;
+
+	data = max_tr.data[cpu];
+	data->saved_latency = tracing_max_latency;
+
+	memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+	data->pid = tsk->pid;
+	data->uid = tsk->uid;
+	data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+	data->policy = tsk->policy;
+	data->rt_priority = tsk->rt_priority;
+
+	/* record this tasks comm */
+	tracing_record_cmdline(current);
+}
+
+#define CHECK_COND(cond)			\
+	if (unlikely(cond)) {			\
+		tracing_disabled = 1;		\
+		WARN_ON(1);			\
+		return -1;			\
+	}
+
+/**
+ * check_pages - integrity check of trace buffers
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+int check_pages(struct trace_array_cpu *data)
+{
+	struct page *page, *tmp;
+
+	CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
+	CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
+
+	list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
+		CHECK_COND(page->lru.next->prev != &page->lru);
+		CHECK_COND(page->lru.prev->next != &page->lru);
+	}
+
+	return 0;
+}
+
+/**
+ * head_page - page address of the first page in per_cpu buffer.
+ *
+ * head_page returns the page address of the first page in
+ * a per_cpu buffer. This also preforms various consistency
+ * checks to make sure the buffer has not been corrupted.
+ */
+void *head_page(struct trace_array_cpu *data)
+{
+	struct page *page;
+
+	if (list_empty(&data->trace_pages))
+		return NULL;
+
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	BUG_ON(&page->lru == &data->trace_pages);
+
+	return page_address(page);
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	va_list ap;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	va_start(ap, fmt);
+	ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+	va_end(ap);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+static int
+trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	int len = strlen(str);
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, str, len);
+	s->len += len;
+
+	return len;
+}
+
+static int
+trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+
+	s->buffer[s->len++] = c;
+
+	return 1;
+}
+
+static int
+trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, mem, len);
+	s->len += len;
+
+	return len;
+}
+
+#define HEX_CHARS 17
+static const char hex2asc[] = "0123456789abcdef";
+
+static int
+trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+	unsigned char hex[HEX_CHARS];
+	unsigned char *data = mem;
+	unsigned char byte;
+	int i, j;
+
+	BUG_ON(len >= HEX_CHARS);
+
+#ifdef __BIG_ENDIAN
+	for (i = 0, j = 0; i < len; i++) {
+#else
+	for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+		byte = data[i];
+
+		hex[j++] = hex2asc[byte & 0x0f];
+		hex[j++] = hex2asc[byte >> 4];
+	}
+	hex[j++] = ' ';
+
+	return trace_seq_putmem(s, hex, j);
+}
+
+static void
+trace_seq_reset(struct trace_seq *s)
+{
+	s->len = 0;
+	s->readpos = 0;
+}
+
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+{
+	int len;
+	int ret;
+
+	if (s->len <= s->readpos)
+		return -EBUSY;
+
+	len = s->len - s->readpos;
+	if (cnt > len)
+		cnt = len;
+	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+	if (ret)
+		return -EFAULT;
+
+	s->readpos += len;
+	return cnt;
+}
+
+static void
+trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+	s->buffer[len] = 0;
+	seq_puts(m, s->buffer);
+
+	trace_seq_reset(s);
+}
+
+/*
+ * flip the trace buffers between two trace descriptors.
+ * This usually is the buffers between the global_trace and
+ * the max_tr to record a snapshot of a current trace.
+ *
+ * The ftrace_max_lock must be held.
+ */
+static void
+flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
+{
+	struct list_head flip_pages;
+
+	INIT_LIST_HEAD(&flip_pages);
+
+	memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
+		sizeof(struct trace_array_cpu) -
+		offsetof(struct trace_array_cpu, trace_head_idx));
+
+	check_pages(tr1);
+	check_pages(tr2);
+	list_splice_init(&tr1->trace_pages, &flip_pages);
+	list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
+	list_splice_init(&flip_pages, &tr2->trace_pages);
+	BUG_ON(!list_empty(&flip_pages));
+	check_pages(tr1);
+	check_pages(tr2);
+}
+
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data;
+	int i;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	__raw_spin_lock(&ftrace_max_lock);
+	/* clear out all the previous traces */
+	for_each_tracing_cpu(i) {
+		data = tr->data[i];
+		flip_trace(max_tr.data[i], data);
+		tracing_reset(data);
+	}
+
+	__update_max_tr(tr, tsk, cpu);
+	__raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr - tracer
+ * @tsk - task with the latency
+ * @cpu - the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+	int i;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_tracing_cpu(i)
+		tracing_reset(max_tr.data[i]);
+
+	flip_trace(max_tr.data[cpu], data);
+	tracing_reset(data);
+
+	__update_max_tr(tr, tsk, cpu);
+	__raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * register_tracer - register a tracer with the ftrace system.
+ * @type - the plugin for the tracer
+ *
+ * Register a new plugin tracer.
+ */
+int register_tracer(struct tracer *type)
+{
+	struct tracer *t;
+	int len;
+	int ret = 0;
+
+	if (!type->name) {
+		pr_info("Tracer must have a name\n");
+		return -1;
+	}
+
+	mutex_lock(&trace_types_lock);
+	for (t = trace_types; t; t = t->next) {
+		if (strcmp(type->name, t->name) == 0) {
+			/* already found */
+			pr_info("Trace %s already registered\n",
+				type->name);
+			ret = -1;
+			goto out;
+		}
+	}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	if (type->selftest) {
+		struct tracer *saved_tracer = current_trace;
+		struct trace_array_cpu *data;
+		struct trace_array *tr = &global_trace;
+		int saved_ctrl = tr->ctrl;
+		int i;
+		/*
+		 * Run a selftest on this tracer.
+		 * Here we reset the trace buffer, and set the current
+		 * tracer to be this tracer. The tracer can then run some
+		 * internal tracing to verify that everything is in order.
+		 * If we fail, we do not register this tracer.
+		 */
+		for_each_tracing_cpu(i) {
+			data = tr->data[i];
+			if (!head_page(data))
+				continue;
+			tracing_reset(data);
+		}
+		current_trace = type;
+		tr->ctrl = 0;
+		/* the test is responsible for initializing and enabling */
+		pr_info("Testing tracer %s: ", type->name);
+		ret = type->selftest(type, tr);
+		/* the test is responsible for resetting too */
+		current_trace = saved_tracer;
+		tr->ctrl = saved_ctrl;
+		if (ret) {
+			printk(KERN_CONT "FAILED!\n");
+			goto out;
+		}
+		/* Only reset on passing, to avoid touching corrupted buffers */
+		for_each_tracing_cpu(i) {
+			data = tr->data[i];
+			if (!head_page(data))
+				continue;
+			tracing_reset(data);
+		}
+		printk(KERN_CONT "PASSED\n");
+	}
+#endif
+
+	type->next = trace_types;
+	trace_types = type;
+	len = strlen(type->name);
+	if (len > max_tracer_type_len)
+		max_tracer_type_len = len;
+
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
+void unregister_tracer(struct tracer *type)
+{
+	struct tracer **t;
+	int len;
+
+	mutex_lock(&trace_types_lock);
+	for (t = &trace_types; *t; t = &(*t)->next) {
+		if (*t == type)
+			goto found;
+	}
+	pr_info("Trace %s not registered\n", type->name);
+	goto out;
+
+ found:
+	*t = (*t)->next;
+	if (strlen(type->name) != max_tracer_type_len)
+		goto out;
+
+	max_tracer_type_len = 0;
+	for (t = &trace_types; *t; t = &(*t)->next) {
+		len = strlen((*t)->name);
+		if (len > max_tracer_type_len)
+			max_tracer_type_len = len;
+	}
+ out:
+	mutex_unlock(&trace_types_lock);
+}
+
+void tracing_reset(struct trace_array_cpu *data)
+{
+	data->trace_idx = 0;
+	data->overrun = 0;
+	data->trace_head = data->trace_tail = head_page(data);
+	data->trace_head_idx = 0;
+	data->trace_tail_idx = 0;
+}
+
+#define SAVED_CMDLINES 128
+static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
+static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
+static int cmdline_idx;
+static DEFINE_RAW_SPINLOCK(trace_cmdline_lock);
+
+/* temporary disable recording */
+atomic_t trace_record_cmdline_disabled __read_mostly;
+
+static void trace_init_cmdlines(void)
+{
+	memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
+	memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+	cmdline_idx = 0;
+}
+
+void trace_stop_cmdline_recording(void);
+
+static void trace_save_cmdline(struct task_struct *tsk)
+{
+	unsigned map;
+	unsigned idx;
+
+	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+		return;
+
+	/*
+	 * It's not the end of the world if we don't get
+	 * the lock, but we also don't want to spin
+	 * nor do we want to disable interrupts,
+	 * so if we miss here, then better luck next time.
+	 */
+	if (!spin_trylock(&trace_cmdline_lock))
+		return;
+
+	/* from the pid, find the index of the cmdline array */
+	idx = map_pid_to_cmdline[tsk->pid];
+
+	if (idx >= SAVED_CMDLINES) {
+		/* this is new */
+		idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+
+		/* check the reverse map and reset it if needed */
+		map = map_cmdline_to_pid[idx];
+		if (map <= PID_MAX_DEFAULT)
+			map_pid_to_cmdline[map] = (unsigned)-1;
+
+		map_cmdline_to_pid[idx] = tsk->pid;
+		map_pid_to_cmdline[tsk->pid] = idx;
+
+		cmdline_idx = idx;
+	}
+
+	memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+
+	spin_unlock(&trace_cmdline_lock);
+}
+
+static char *trace_find_cmdline(int pid)
+{
+	char *cmdline = "<...>";
+	unsigned map;
+
+	if (!pid)
+		return "<idle>";
+
+	if (pid > PID_MAX_DEFAULT)
+		goto out;
+
+	map = map_pid_to_cmdline[pid];
+	if (map >= SAVED_CMDLINES)
+		goto out;
+
+	if (map_cmdline_to_pid[map] != pid)
+		goto out;
+
+	cmdline = saved_cmdlines[map];
+
+ out:
+	return cmdline;
+}
+
+void tracing_record_cmdline(struct task_struct *tsk)
+{
+	if (atomic_read(&trace_record_cmdline_disabled))
+		return;
+
+	trace_save_cmdline(tsk);
+}
+
+static inline struct list_head *
+trace_next_list(struct trace_array_cpu *data, struct list_head *next)
+{
+	/*
+	 * Roundrobin - but skip the head (which is not a real page):
+	 */
+	next = next->next;
+	if (unlikely(next == &data->trace_pages))
+		next = next->next;
+	BUG_ON(next == &data->trace_pages);
+
+	return next;
+}
+
+static inline void *
+trace_next_page(struct trace_array_cpu *data, void *addr)
+{
+	struct list_head *next;
+	struct page *page;
+
+	page = virt_to_page(addr);
+
+	next = trace_next_list(data, &page->lru);
+	page = list_entry(next, struct page, lru);
+
+	return page_address(page);
+}
+
+static inline struct trace_entry *
+tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
+{
+	unsigned long idx, idx_next;
+	struct trace_entry *entry;
+
+	data->trace_idx++;
+	idx = data->trace_head_idx;
+	idx_next = idx + 1;
+
+	BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
+
+	entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
+
+	if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
+		data->trace_head = trace_next_page(data, data->trace_head);
+		idx_next = 0;
+	}
+
+	if (data->trace_head == data->trace_tail &&
+	    idx_next == data->trace_tail_idx) {
+		/* overrun */
+		data->overrun++;
+		data->trace_tail_idx++;
+		if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+			data->trace_tail =
+				trace_next_page(data, data->trace_tail);
+			data->trace_tail_idx = 0;
+		}
+	}
+
+	data->trace_head_idx = idx_next;
+
+	return entry;
+}
+
+static inline void
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
+{
+	struct task_struct *tsk = current;
+	unsigned long pc;
+
+	pc = preempt_count();
+
+	entry->preempt_count	= pc & 0xff;
+	entry->pid		= (tsk) ? tsk->pid : 0;
+	entry->t		= ftrace_now(raw_smp_processor_id());
+	entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+}
+
+void
+trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+	       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_FN;
+	entry->fn.ip		= ip;
+	entry->fn.parent_ip	= parent_ip;
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+}
+
+void
+ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+	if (likely(!atomic_read(&data->disabled)))
+		trace_function(tr, data, ip, parent_ip, flags);
+}
+
+#ifdef CONFIG_MMIOTRACE
+void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
+						struct mmiotrace_rw *rw)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_MMIO_RW;
+	entry->mmiorw		= *rw;
+
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
+void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
+						struct mmiotrace_map *map)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_MMIO_MAP;
+	entry->mmiomap		= *map;
+
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+#endif
+
+void __trace_stack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags,
+		   int skip)
+{
+	struct trace_entry *entry;
+	struct stack_trace trace;
+
+	if (!(trace_flags & TRACE_ITER_STACKTRACE))
+		return;
+
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_STACK;
+
+	memset(&entry->stack, 0, sizeof(entry->stack));
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= FTRACE_STACK_ENTRIES;
+	trace.skip		= skip;
+	trace.entries		= entry->stack.caller;
+
+	save_stack_trace(&trace);
+}
+
+void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array_cpu *data = __data;
+	struct trace_array *tr = __tr;
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_SPECIAL;
+	entry->special.arg1	= arg1;
+	entry->special.arg2	= arg2;
+	entry->special.arg3	= arg3;
+	__trace_stack(tr, data, irq_flags, 4);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   struct task_struct *prev,
+			   struct task_struct *next,
+			   unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_CTX;
+	entry->ctx.prev_pid	= prev->pid;
+	entry->ctx.prev_prio	= prev->prio;
+	entry->ctx.prev_state	= prev->state;
+	entry->ctx.next_pid	= next->pid;
+	entry->ctx.next_prio	= next->prio;
+	entry->ctx.next_state	= next->state;
+	__trace_stack(tr, data, flags, 5);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+}
+
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   struct task_struct *wakee,
+			   struct task_struct *curr,
+			   unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_WAKE;
+	entry->ctx.prev_pid	= curr->pid;
+	entry->ctx.prev_prio	= curr->prio;
+	entry->ctx.prev_state	= curr->state;
+	entry->ctx.next_pid	= wakee->pid;
+	entry->ctx.next_prio	= wakee->prio;
+	entry->ctx.next_state	= wakee->state;
+	__trace_stack(tr, data, flags, 6);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		__trace_special(tr, data, arg1, arg2, arg3);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+void tracing_event_irq(struct trace_array *tr,
+		       struct trace_array_cpu *data,
+		       unsigned long flags,
+		       unsigned long ip,
+		       int irq, int usermode,
+		       unsigned long retip)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_IRQ;
+	entry->irq.ip		= ip;
+	entry->irq.irq		= irq;
+	entry->irq.ret_ip	= retip;
+	entry->irq.usermode	= usermode;
+}
+
+void tracing_event_fault(struct trace_array *tr,
+			 struct trace_array_cpu *data,
+			 unsigned long flags,
+			 unsigned long ip,
+			 unsigned long retip,
+			 unsigned long error_code,
+			 unsigned long address)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_FAULT;
+	entry->fault.ip		= ip;
+	entry->fault.ret_ip	= retip;
+	entry->fault.errorcode	= error_code;
+	entry->fault.address	= address;
+}
+
+void tracing_event_timer_set(struct trace_array *tr,
+			     struct trace_array_cpu *data,
+			     unsigned long flags,
+			     unsigned long ip,
+			     ktime_t *expires, void *timer)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_TIMER_SET;
+	entry->timer.ip		= ip;
+	entry->timer.expire	= *expires;
+	entry->timer.timer	= timer;
+}
+
+void tracing_event_timer_triggered(struct trace_array *tr,
+				   struct trace_array_cpu *data,
+				   unsigned long flags,
+				   unsigned long ip,
+				   ktime_t *expired, void *timer)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_TIMER_TRIG;
+	entry->timer.ip		= ip;
+	entry->timer.expire	= *expired;
+	entry->timer.timer	= timer;
+}
+
+void tracing_event_timestamp(struct trace_array *tr,
+			     struct trace_array_cpu *data,
+			     unsigned long flags,
+			     unsigned long ip,
+			     ktime_t *now)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_TIMESTAMP;
+	entry->timestamp.ip		= ip;
+	entry->timestamp.now		= *now;
+}
+
+void tracing_event_task_activate(struct trace_array *tr,
+				 struct trace_array_cpu *data,
+				 unsigned long flags,
+				 unsigned long ip,
+				 struct task_struct *p,
+				 int task_cpu)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_TASK_ACT;
+	entry->task.ip		= ip;
+	entry->task.pid		= p->pid;
+	entry->task.prio	= p->prio;
+	entry->task.cpu		= task_cpu;
+}
+
+void tracing_event_task_deactivate(struct trace_array *tr,
+				   struct trace_array_cpu *data,
+				   unsigned long flags,
+				   unsigned long ip,
+				   struct task_struct *p,
+				   int task_cpu)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_TASK_DEACT;
+	entry->task.ip		= ip;
+	entry->task.pid		= p->pid;
+	entry->task.prio	= p->prio;
+	entry->task.cpu		= task_cpu;
+}
+
+void tracing_event_syscall(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   unsigned long flags,
+			   unsigned long ip,
+			   unsigned long nr,
+			   unsigned long p1,
+			   unsigned long p2,
+			   unsigned long p3)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type			= TRACE_SYSCALL;
+	entry->syscall.ip		= ip;
+	entry->syscall.nr		= nr;
+	entry->syscall.p1		= p1;
+	entry->syscall.p2		= p2;
+	entry->syscall.p3		= p3;
+}
+
+void tracing_event_sysret(struct trace_array *tr,
+			  struct trace_array_cpu *data,
+			  unsigned long flags,
+			  unsigned long ip,
+			  unsigned long ret)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type			= TRACE_SYSRET;
+	entry->sysret.ip		= ip;
+	entry->sysret.ret		= ret;
+}
+
+#ifdef CONFIG_FTRACE
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu, resched;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1)) {
+		local_save_flags(flags);
+		trace_function(tr, data, ip, parent_ip, flags);
+	}
+
+	atomic_dec(&data->disabled);
+
+	/*
+	 * To prevent recursion with schedule(), we look at the
+	 * resched flag before disabling preemption. If it is already
+	 * set, then we may be just calling schedule, and we don't
+	 * want to call schedule again. But if it was not set, then
+	 * it is fine to call schedule() if we need to.
+	 */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = function_trace_call,
+};
+
+void tracing_start_function_trace(void)
+{
+	register_ftrace_function(&trace_ops);
+	tracing_record_cmdline(current);
+}
+
+void tracing_stop_function_trace(void)
+{
+	tracing_record_cmdline(current);
+	unregister_ftrace_function(&trace_ops);
+}
+#endif
+
+enum trace_file_type {
+	TRACE_FILE_LAT_FMT	= 1,
+};
+
+static struct trace_entry *
+trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
+		struct trace_iterator *iter, int cpu)
+{
+	struct page *page;
+	struct trace_entry *array;
+
+	if (iter->next_idx[cpu] >= tr->entries ||
+	    iter->next_idx[cpu] >= data->trace_idx ||
+	    (data->trace_head == data->trace_tail &&
+	     data->trace_head_idx == data->trace_tail_idx))
+		return NULL;
+
+	if (!iter->next_page[cpu]) {
+		/* Initialize the iterator for this cpu trace buffer */
+		WARN_ON(!data->trace_tail);
+		page = virt_to_page(data->trace_tail);
+		iter->next_page[cpu] = &page->lru;
+		iter->next_page_idx[cpu] = data->trace_tail_idx;
+	}
+
+	page = list_entry(iter->next_page[cpu], struct page, lru);
+	BUG_ON(&data->trace_pages == &page->lru);
+
+	array = page_address(page);
+
+	WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
+	return &array[iter->next_page_idx[cpu]];
+}
+
+static struct trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+{
+	struct trace_array *tr = iter->tr;
+	struct trace_entry *ent, *next = NULL;
+	int next_cpu = -1;
+	int cpu;
+
+	for_each_tracing_cpu(cpu) {
+		if (!head_page(tr->data[cpu]))
+			continue;
+		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+		/*
+		 * Pick the entry with the smallest timestamp:
+		 */
+		if (ent && (!next || ent->t < next->t)) {
+			next = ent;
+			next_cpu = cpu;
+		}
+	}
+
+	if (ent_cpu)
+		*ent_cpu = next_cpu;
+
+	return next;
+}
+
+static void trace_iterator_increment(struct trace_iterator *iter)
+{
+	iter->idx++;
+	iter->next_idx[iter->cpu]++;
+	iter->next_page_idx[iter->cpu]++;
+
+	if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
+		struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+
+		iter->next_page_idx[iter->cpu] = 0;
+		iter->next_page[iter->cpu] =
+			trace_next_list(data, iter->next_page[iter->cpu]);
+	}
+}
+
+static void trace_consume(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+
+	data->trace_tail_idx++;
+	if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+		data->trace_tail = trace_next_page(data, data->trace_tail);
+		data->trace_tail_idx = 0;
+	}
+
+	/* Check if we empty it, then reset the index */
+	if (data->trace_head == data->trace_tail &&
+	    data->trace_head_idx == data->trace_tail_idx)
+		data->trace_idx = 0;
+}
+
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+	struct trace_entry *next;
+	int next_cpu = -1;
+
+	next = find_next_entry(iter, &next_cpu);
+
+	iter->prev_ent = iter->ent;
+	iter->prev_cpu = iter->cpu;
+
+	iter->ent = next;
+	iter->cpu = next_cpu;
+
+	if (next)
+		trace_iterator_increment(iter);
+
+	return next ? iter : NULL;
+}
+
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_iterator *iter = m->private;
+	void *last_ent = iter->ent;
+	int i = (int)*pos;
+	void *ent;
+
+	(*pos)++;
+
+	/* can't go backwards */
+	if (iter->idx > i)
+		return NULL;
+
+	if (iter->idx < 0)
+		ent = find_next_entry_inc(iter);
+	else
+		ent = iter;
+
+	while (ent && iter->idx < i)
+		ent = find_next_entry_inc(iter);
+
+	iter->pos = *pos;
+
+	if (last_ent && !ent)
+		seq_puts(m, "\n\nvim:ft=help\n");
+
+	return ent;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l = 0;
+	int i;
+
+	mutex_lock(&trace_types_lock);
+
+	if (!current_trace || current_trace != iter->trace) {
+		mutex_unlock(&trace_types_lock);
+		return NULL;
+	}
+
+	atomic_inc(&trace_record_cmdline_disabled);
+
+	/* let the tracer grab locks here if needed */
+	if (current_trace->start)
+		current_trace->start(iter);
+
+	if (*pos != iter->pos) {
+		iter->ent = NULL;
+		iter->cpu = 0;
+		iter->idx = -1;
+		iter->prev_ent = NULL;
+		iter->prev_cpu = -1;
+
+		for_each_tracing_cpu(i) {
+			iter->next_idx[i] = 0;
+			iter->next_page[i] = NULL;
+		}
+
+		for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+			;
+
+	} else {
+		l = *pos - 1;
+		p = s_next(m, p, &l);
+	}
+
+	return p;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+	struct trace_iterator *iter = m->private;
+
+	atomic_dec(&trace_record_cmdline_disabled);
+
+	/* let the tracer release locks here if needed */
+	if (current_trace && current_trace == iter->trace && iter->trace->stop)
+		iter->trace->stop(iter);
+
+	mutex_unlock(&trace_types_lock);
+}
+
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+	return trace_seq_printf(s, fmt, str);
+#endif
+	return 1;
+}
+
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+		     unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(str, address);
+	return trace_seq_printf(s, fmt, str);
+#endif
+	return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+static int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+	int ret;
+
+	if (!ip)
+		return trace_seq_printf(s, "0");
+
+	if (sym_flags & TRACE_ITER_SYM_OFFSET)
+		ret = seq_print_sym_offset(s, "%s", ip);
+	else
+		ret = seq_print_sym_short(s, "%s", ip);
+
+	if (!ret)
+		return 0;
+
+	if (sym_flags & TRACE_ITER_SYM_ADDR)
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
+}
+
+static void print_lat_help_header(struct seq_file *m)
+{
+	seq_puts(m, "#                _------=> CPU#            \n");
+	seq_puts(m, "#               / _-----=> irqs-off        \n");
+	seq_puts(m, "#              | / _----=> need-resched    \n");
+	seq_puts(m, "#              || / _---=> hardirq/softirq \n");
+	seq_puts(m, "#              ||| / _--=> preempt-depth   \n");
+	seq_puts(m, "#              |||| /                      \n");
+	seq_puts(m, "#              |||||     delay             \n");
+	seq_puts(m, "#  cmd     pid ||||| time  |   caller      \n");
+	seq_puts(m, "#     \\   /    |||||   \\   |   /           \n");
+}
+
+static void print_func_help_header(struct seq_file *m)
+{
+	seq_puts(m, "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n");
+	seq_puts(m, "#              | |      |          |         |\n");
+}
+
+
+static void
+print_trace_header(struct seq_file *m, struct trace_iterator *iter)
+{
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_array *tr = iter->tr;
+	struct trace_array_cpu *data = tr->data[tr->cpu];
+	struct tracer *type = current_trace;
+	unsigned long total   = 0;
+	unsigned long entries = 0;
+	int cpu;
+	const char *name = "preemption";
+
+	if (type)
+		name = type->name;
+
+	for_each_tracing_cpu(cpu) {
+		if (head_page(tr->data[cpu])) {
+			total += tr->data[cpu]->trace_idx;
+			if (tr->data[cpu]->trace_idx > tr->entries)
+				entries += tr->entries;
+			else
+				entries += tr->data[cpu]->trace_idx;
+		}
+	}
+
+	seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+		   name, UTS_RELEASE);
+	seq_puts(m, "-----------------------------------"
+		 "---------------------------------\n");
+	seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+		   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+		   nsecs_to_usecs(data->saved_latency),
+		   entries,
+		   total,
+		   tr->cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+		   "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+		   "desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+		   "preempt",
+#else
+		   "unknown",
+#endif
+		   /* These are reserved for later use */
+		   0, 0, 0, 0);
+#ifdef CONFIG_SMP
+	seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+	seq_puts(m, ")\n");
+#endif
+	seq_puts(m, "    -----------------\n");
+	seq_printf(m, "    | task: %.16s-%d "
+		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+		   data->comm, data->pid, data->uid, data->nice,
+		   data->policy, data->rt_priority);
+	seq_puts(m, "    -----------------\n");
+
+	if (data->critical_start) {
+		seq_puts(m, " => started at: ");
+		seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
+		trace_print_seq(m, &iter->seq);
+		seq_puts(m, "\n => ended at:   ");
+		seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
+		trace_print_seq(m, &iter->seq);
+		seq_puts(m, "\n");
+	}
+
+	seq_puts(m, "\n");
+}
+
+static void
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+	int hardirq, softirq;
+	char *comm;
+
+	comm = trace_find_cmdline(entry->pid);
+
+	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+	trace_seq_printf(s, "%d", cpu);
+	trace_seq_printf(s, "%c%c",
+			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq) {
+		trace_seq_putc(s, 'H');
+	} else {
+		if (hardirq) {
+			trace_seq_putc(s, 'h');
+		} else {
+			if (softirq)
+				trace_seq_putc(s, 's');
+			else
+				trace_seq_putc(s, '.');
+		}
+	}
+
+	if (entry->preempt_count)
+		trace_seq_printf(s, "%x", entry->preempt_count);
+	else
+		trace_seq_puts(s, ".");
+}
+
+unsigned long preempt_mark_thresh = 100;
+
+static void
+lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
+		    unsigned long rel_usecs)
+{
+	trace_seq_printf(s, " %4lldus", abs_usecs);
+	if (rel_usecs > preempt_mark_thresh)
+		trace_seq_puts(s, "!: ");
+	else if (rel_usecs > 1)
+		trace_seq_puts(s, "+: ");
+	else
+		trace_seq_puts(s, " : ");
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+extern unsigned long sys_call_table[NR_syscalls];
+
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+extern unsigned long ia32_sys_call_table[], ia32_syscall_end[];
+# define IA32_NR_syscalls (ia32_syscall_end - ia32_sys_call_table)
+#endif
+
+static int
+print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_entry *next_entry = find_next_entry(iter, NULL);
+	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+	struct trace_entry *entry = iter->ent;
+	unsigned long abs_usecs;
+	unsigned long rel_usecs;
+	unsigned long nr;
+	char *comm;
+	int S, T;
+	int i;
+	unsigned state;
+
+	if (!next_entry)
+		next_entry = entry;
+	rel_usecs = ns2usecs(next_entry->t - entry->t);
+	abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
+
+	if (verbose) {
+		comm = trace_find_cmdline(entry->pid);
+		trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
+				 " %ld.%03ldms (+%ld.%03ldms): ",
+				 comm,
+				 entry->pid, cpu, entry->flags,
+				 entry->preempt_count, trace_idx,
+				 ns2usecs(entry->t),
+				 abs_usecs/1000,
+				 abs_usecs % 1000, rel_usecs/1000,
+				 rel_usecs % 1000);
+	} else {
+		lat_print_generic(s, entry, cpu);
+		lat_print_timestamp(s, abs_usecs, rel_usecs);
+	}
+	switch (entry->type) {
+	case TRACE_FN:
+		seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		trace_seq_puts(s, " (");
+		seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+		trace_seq_puts(s, ")\n");
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+
+		state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+		S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+		comm = trace_find_cmdline(entry->ctx.next_pid);
+		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
+				 entry->ctx.prev_pid,
+				 entry->ctx.prev_prio,
+				 S, entry->type == TRACE_CTX ? "==>" : "  +",
+				 entry->ctx.next_pid,
+				 entry->ctx.next_prio,
+				 T, comm);
+		break;
+	case TRACE_SPECIAL:
+		trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		break;
+	case TRACE_STACK:
+		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+			if (i)
+				trace_seq_puts(s, " <= ");
+			seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
+		}
+		trace_seq_puts(s, "\n");
+		break;
+	case TRACE_IRQ:
+		seq_print_ip_sym(s, entry->irq.ip, sym_flags);
+		if (entry->irq.irq >= 0)
+			trace_seq_printf(s, " %d ", entry->irq.irq);
+		if (entry->irq.usermode)
+			trace_seq_puts(s, " (usermode)\n ");
+		else {
+			trace_seq_puts(s, " (");
+			seq_print_ip_sym(s, entry->irq.ret_ip, sym_flags);
+			trace_seq_puts(s, ")\n");
+		}
+		break;
+	case TRACE_FAULT:
+		seq_print_ip_sym(s, entry->fault.ip, sym_flags);
+		trace_seq_printf(s, " %lx ", entry->fault.errorcode);
+		trace_seq_puts(s, " (");
+		seq_print_ip_sym(s, entry->fault.ret_ip, sym_flags);
+		trace_seq_puts(s, ")");
+		trace_seq_printf(s, " [%lx]\n", entry->fault.address);
+		break;
+	case TRACE_TIMER_SET:
+		seq_print_ip_sym(s, entry->timer.ip, sym_flags);
+		trace_seq_printf(s, " (%Ld) (%p)\n",
+			   entry->timer.expire, entry->timer.timer);
+		break;
+	case TRACE_TIMER_TRIG:
+		seq_print_ip_sym(s, entry->timer.ip, sym_flags);
+		trace_seq_printf(s, " (%Ld) (%p)\n",
+			   entry->timer.expire, entry->timer.timer);
+		break;
+	case TRACE_TIMESTAMP:
+		seq_print_ip_sym(s, entry->timestamp.ip, sym_flags);
+		trace_seq_printf(s, " (%Ld)\n",
+			   entry->timestamp.now.tv64);
+		break;
+	case TRACE_TASK_ACT:
+		seq_print_ip_sym(s, entry->task.ip, sym_flags);
+		comm = trace_find_cmdline(entry->task.pid);
+		trace_seq_printf(s, " %s %d %d [%d]\n",
+			   comm, entry->task.pid,
+			   entry->task.prio, entry->task.cpu);
+		break;
+	case TRACE_TASK_DEACT:
+		seq_print_ip_sym(s, entry->task.ip, sym_flags);
+		comm = trace_find_cmdline(entry->task.pid);
+		trace_seq_printf(s, " %s %d %d [%d]\n",
+			   comm, entry->task.pid,
+			   entry->task.prio, entry->task.cpu);
+		break;
+	case TRACE_SYSCALL:
+		seq_print_ip_sym(s, entry->syscall.ip, sym_flags);
+		nr = entry->syscall.nr;
+		trace_seq_putc(s, ' ');
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+		if (nr & 0x80000000) {
+			nr &= ~0x80000000;
+			if (nr < IA32_NR_syscalls)
+				seq_print_ip_sym(s, ia32_sys_call_table[nr], 0);
+			else
+				trace_seq_printf(s, "<badsys(%lu)>", nr);
+		} else
+#endif
+			if (nr < NR_syscalls)
+				seq_print_ip_sym(s, sys_call_table[nr], 0);
+			else
+				trace_seq_printf(s, "<badsys(%lu)>", nr);
+
+		trace_seq_printf(s, " (%lx %lx %lx)\n",
+			   entry->syscall.p1,
+			   entry->syscall.p2,
+			   entry->syscall.p3);
+		break;
+	case TRACE_SYSRET:
+		seq_print_ip_sym(s, entry->sysret.ip, sym_flags);
+		trace_seq_printf(s, " < (%ld)\n",
+			   entry->sysret.ret);
+		break;
+	default:
+		trace_seq_printf(s, "Unknown type %d\n", entry->type);
+	}
+	return 1;
+}
+
+static int print_trace_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_entry *entry;
+	unsigned long usec_rem;
+	unsigned long long t;
+	unsigned long secs;
+	long nr;
+	char *comm;
+	int ret;
+	int S, T;
+	int i;
+
+	entry = iter->ent;
+
+	comm = trace_find_cmdline(iter->ent->pid);
+
+	t = ns2usecs(entry->t);
+	usec_rem = do_div(t, 1000000ULL);
+	secs = (unsigned long)t;
+
+	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+	if (!ret)
+		return 0;
+
+	switch (entry->type) {
+	case TRACE_FN:
+		ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		if (!ret)
+			return 0;
+		if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
+						entry->fn.parent_ip) {
+			ret = trace_seq_printf(s, " <-");
+			if (!ret)
+				return 0;
+			ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+					       sym_flags);
+			if (!ret)
+				return 0;
+		}
+		ret = trace_seq_printf(s, "\n");
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
+				       entry->ctx.prev_pid,
+				       entry->ctx.prev_prio,
+				       S,
+				       entry->type == TRACE_CTX ? "==>" : "  +",
+				       entry->ctx.next_pid,
+				       entry->ctx.next_prio,
+				       T);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_SPECIAL:
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_STACK:
+		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+			if (i) {
+				ret = trace_seq_puts(s, " <= ");
+				if (!ret)
+					return 0;
+			}
+			ret = seq_print_ip_sym(s, entry->stack.caller[i],
+					       sym_flags);
+			if (!ret)
+				return 0;
+		}
+		ret = trace_seq_puts(s, "\n");
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_IRQ:
+		seq_print_ip_sym(s, entry->irq.ip, sym_flags);
+		if (entry->irq.irq >= 0)
+			trace_seq_printf(s, " %d ", entry->irq.irq);
+		if (entry->irq.usermode)
+			trace_seq_puts(s, " (usermode)\n ");
+		else {
+			trace_seq_puts(s, " (");
+			seq_print_ip_sym(s, entry->irq.ret_ip, sym_flags);
+			trace_seq_puts(s, ")\n");
+		}
+		break;
+	case TRACE_FAULT:
+		seq_print_ip_sym(s, entry->fault.ip, sym_flags);
+		trace_seq_printf(s, " %lx ", entry->fault.errorcode);
+		trace_seq_puts(s, " (");
+		seq_print_ip_sym(s, entry->fault.ret_ip, sym_flags);
+		trace_seq_puts(s, ")");
+		trace_seq_printf(s, " [%lx]\n", entry->fault.address);
+		break;
+	case TRACE_TIMER_SET:
+		seq_print_ip_sym(s, entry->timer.ip, sym_flags);
+		trace_seq_printf(s, " (%Ld) (%p)\n",
+			   entry->timer.expire, entry->timer.timer);
+		break;
+	case TRACE_TIMER_TRIG:
+		seq_print_ip_sym(s, entry->timer.ip, sym_flags);
+		trace_seq_printf(s, " (%Ld) (%p)\n",
+			   entry->timer.expire, entry->timer.timer);
+		break;
+	case TRACE_TIMESTAMP:
+		seq_print_ip_sym(s, entry->timestamp.ip, sym_flags);
+		trace_seq_printf(s, " (%Ld)\n",
+			   entry->timestamp.now.tv64);
+		break;
+	case TRACE_TASK_ACT:
+		seq_print_ip_sym(s, entry->task.ip, sym_flags);
+		comm = trace_find_cmdline(entry->task.pid);
+		trace_seq_printf(s, " %s %d %d [%d]\n",
+			   comm, entry->task.pid,
+			   entry->task.prio, entry->task.cpu);
+		break;
+	case TRACE_TASK_DEACT:
+		seq_print_ip_sym(s, entry->task.ip, sym_flags);
+		comm = trace_find_cmdline(entry->task.pid);
+		trace_seq_printf(s, " %s %d %d [%d]\n",
+			   comm, entry->task.pid,
+			   entry->task.prio, entry->task.cpu);
+		break;
+	case TRACE_SYSCALL:
+		seq_print_ip_sym(s, entry->syscall.ip, sym_flags);
+		nr = entry->syscall.nr;
+		trace_seq_putc(s, ' ');
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+		if (nr & 0x80000000) {
+			nr &= ~0x80000000;
+			if (nr < IA32_NR_syscalls)
+				seq_print_ip_sym(s, ia32_sys_call_table[nr], 0);
+			else
+				trace_seq_printf(s, "<badsys(%lu)>", nr);
+		} else
+#endif
+			if (nr < NR_syscalls)
+				seq_print_ip_sym(s, sys_call_table[nr], 0);
+			else
+				trace_seq_printf(s, "<badsys(%lu)>", nr);
+
+		trace_seq_printf(s, " (%lx %lx %lx)\n",
+			   entry->syscall.p1,
+			   entry->syscall.p2,
+			   entry->syscall.p3);
+		break;
+	case TRACE_SYSRET:
+		seq_print_ip_sym(s, entry->sysret.ip, sym_flags);
+		trace_seq_printf(s, "< (%ld)\n",
+			   entry->sysret.ret);
+		break;
+	default:
+		trace_seq_printf(s, "Unknown type %d\n", entry->type);
+	}
+	return 1;
+}
+
+static int print_raw_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+	int ret;
+	int S, T;
+
+	entry = iter->ent;
+
+	ret = trace_seq_printf(s, "%d %d %llu ",
+		entry->pid, iter->cpu, entry->t);
+	if (!ret)
+		return 0;
+
+	switch (entry->type) {
+	case TRACE_FN:
+		ret = trace_seq_printf(s, "%x %x\n",
+					entry->fn.ip, entry->fn.parent_ip);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		if (entry->type == TRACE_WAKE)
+			S = '+';
+		ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+				       entry->ctx.prev_pid,
+				       entry->ctx.prev_prio,
+				       S,
+				       entry->ctx.next_pid,
+				       entry->ctx.next_prio,
+				       T);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
+	}
+	return 1;
+}
+
+#define SEQ_PUT_FIELD_RET(s, x)				\
+do {							\
+	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
+do {							\
+	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+static int print_hex_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned char newline = '\n';
+	struct trace_entry *entry;
+	int S, T;
+
+	entry = iter->ent;
+
+	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+	SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		if (entry->type == TRACE_WAKE)
+			S = '+';
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, S);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		SEQ_PUT_HEX_FIELD_RET(s, T);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	SEQ_PUT_FIELD_RET(s, newline);
+
+	return 1;
+}
+
+static int print_bin_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+
+	entry = iter->ent;
+
+	SEQ_PUT_FIELD_RET(s, entry->pid);
+	SEQ_PUT_FIELD_RET(s, entry->cpu);
+	SEQ_PUT_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		SEQ_PUT_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	return 1;
+}
+
+static int trace_empty(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	for_each_tracing_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (head_page(data) && data->trace_idx &&
+		    (data->trace_tail != data->trace_head ||
+		     data->trace_tail_idx != data->trace_head_idx))
+			return 0;
+	}
+	return 1;
+}
+
+static int print_trace_line(struct trace_iterator *iter)
+{
+	if (iter->trace && iter->trace->print_line)
+		return iter->trace->print_line(iter);
+
+	if (trace_flags & TRACE_ITER_BIN)
+		return print_bin_fmt(iter);
+
+	if (trace_flags & TRACE_ITER_HEX)
+		return print_hex_fmt(iter);
+
+	if (trace_flags & TRACE_ITER_RAW)
+		return print_raw_fmt(iter);
+
+	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+		return print_lat_fmt(iter, iter->idx, iter->cpu);
+
+	return print_trace_fmt(iter);
+}
+
+static int s_show(struct seq_file *m, void *v)
+{
+	struct trace_iterator *iter = v;
+
+	if (iter->ent == NULL) {
+		if (iter->tr) {
+			seq_printf(m, "# tracer: %s\n", iter->trace->name);
+			seq_puts(m, "#\n");
+		}
+		if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+			/* print nothing if the buffers are empty */
+			if (trace_empty(iter))
+				return 0;
+			print_trace_header(m, iter);
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_lat_help_header(m);
+		} else {
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_func_help_header(m);
+		}
+	} else {
+		print_trace_line(iter);
+		trace_print_seq(m, &iter->seq);
+	}
+
+	return 0;
+}
+
+static struct seq_operations tracer_seq_ops = {
+	.start		= s_start,
+	.next		= s_next,
+	.stop		= s_stop,
+	.show		= s_show,
+};
+
+static struct trace_iterator *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled) {
+		*ret = -ENODEV;
+		return NULL;
+	}
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter) {
+		*ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace && current_trace->print_max)
+		iter->tr = &max_tr;
+	else
+		iter->tr = inode->i_private;
+	iter->trace = current_trace;
+	iter->pos = -1;
+
+	/* TODO stop tracer */
+	*ret = seq_open(file, &tracer_seq_ops);
+	if (!*ret) {
+		struct seq_file *m = file->private_data;
+		m->private = iter;
+
+		/* stop the trace while dumping */
+		if (iter->tr->ctrl)
+			tracer_enabled = 0;
+
+		if (iter->trace && iter->trace->open)
+			iter->trace->open(iter);
+	} else {
+		kfree(iter);
+		iter = NULL;
+	}
+	mutex_unlock(&trace_types_lock);
+
+ out:
+	return iter;
+}
+
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+	if (tracing_disabled)
+		return -ENODEV;
+
+	filp->private_data = inode->i_private;
+	return 0;
+}
+
+int tracing_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct trace_iterator *iter = m->private;
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace && iter->trace->close)
+		iter->trace->close(iter);
+
+	/* reenable tracing if it was previously enabled */
+	if (iter->tr->ctrl)
+		tracer_enabled = 1;
+	mutex_unlock(&trace_types_lock);
+
+	seq_release(inode, file);
+	kfree(iter);
+	return 0;
+}
+
+static int tracing_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	__tracing_open(inode, file, &ret);
+
+	return ret;
+}
+
+static int tracing_lt_open(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter;
+	int ret;
+
+	iter = __tracing_open(inode, file, &ret);
+
+	if (!ret)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
+	return ret;
+}
+
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct tracer *t = m->private;
+
+	(*pos)++;
+
+	if (t)
+		t = t->next;
+
+	m->private = t;
+
+	return t;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	struct tracer *t = m->private;
+	loff_t l = 0;
+
+	mutex_lock(&trace_types_lock);
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&trace_types_lock);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct tracer *t = v;
+
+	if (!t)
+		return 0;
+
+	seq_printf(m, "%s", t->name);
+	if (t->next)
+		seq_putc(m, ' ');
+	else
+		seq_putc(m, '\n');
+
+	return 0;
+}
+
+static struct seq_operations show_traces_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int show_traces_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	ret = seq_open(file, &show_traces_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = trace_types;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_fops = {
+	.open		= tracing_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_release,
+};
+
+static struct file_operations tracing_lt_fops = {
+	.open		= tracing_lt_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_release,
+};
+
+static struct file_operations show_traces_fops = {
+	.open		= show_traces_open,
+	.read		= seq_read,
+	.release	= seq_release,
+};
+
+/*
+ * Only trace on a CPU if the bitmask is set:
+ */
+static cpumask_t tracing_cpumask = CPU_MASK_ALL;
+
+/*
+ * When tracing/tracing_cpu_mask is modified then this holds
+ * the new bitmask we are about to install:
+ */
+static cpumask_t tracing_cpumask_new;
+
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space:
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask (and one more byte for the newline):
+ */
+static char mask_str[NR_CPUS + 1];
+
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+		     size_t count, loff_t *ppos)
+{
+	int len;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+
+	len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+	if (count - len < 2) {
+		count = -EINVAL;
+		goto out_err;
+	}
+	len += sprintf(mask_str + len, "\n");
+	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+
+out_err:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+		      size_t count, loff_t *ppos)
+{
+	int err, cpu;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+	if (err)
+		goto err_unlock;
+
+	raw_local_irq_disable();
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_tracing_cpu(cpu) {
+		/*
+		 * Increase/decrease the disabled counter if we are
+		 * about to flip a bit in the cpumask:
+		 */
+		if (cpu_isset(cpu, tracing_cpumask) &&
+				!cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_inc(&global_trace.data[cpu]->disabled);
+		}
+		if (!cpu_isset(cpu, tracing_cpumask) &&
+				cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_dec(&global_trace.data[cpu]->disabled);
+		}
+	}
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_enable();
+
+	tracing_cpumask = tracing_cpumask_new;
+
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+
+err_unlock:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return err;
+}
+
+static struct file_operations tracing_cpumask_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_cpumask_read,
+	.write		= tracing_cpumask_write,
+};
+
+static ssize_t
+tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char *buf;
+	int r = 0;
+	int len = 0;
+	int i;
+
+	/* calulate max size */
+	for (i = 0; trace_options[i]; i++) {
+		len += strlen(trace_options[i]);
+		len += 3; /* "no" and space */
+	}
+
+	/* +2 for \n and \0 */
+	buf = kmalloc(len + 2, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	for (i = 0; trace_options[i]; i++) {
+		if (trace_flags & (1 << i))
+			r += sprintf(buf + r, "%s ", trace_options[i]);
+		else
+			r += sprintf(buf + r, "no%s ", trace_options[i]);
+	}
+
+	r += sprintf(buf + r, "\n");
+	WARN_ON(r >= len + 2);
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+	kfree(buf);
+
+	return r;
+}
+
+static ssize_t
+tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	char *cmp = buf;
+	int neg = 0;
+	int i;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	if (strncmp(buf, "no", 2) == 0) {
+		neg = 1;
+		cmp += 2;
+	}
+
+	for (i = 0; trace_options[i]; i++) {
+		int len = strlen(trace_options[i]);
+
+		if (strncmp(cmp, trace_options[i], len) == 0) {
+			if (neg)
+				trace_flags &= ~(1 << i);
+			else
+				trace_flags |= (1 << i);
+			break;
+		}
+	}
+	/*
+	 * If no option could be set, return an error:
+	 */
+	if (!trace_options[i])
+		return -EINVAL;
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static struct file_operations tracing_iter_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_iter_ctrl_read,
+	.write		= tracing_iter_ctrl_write,
+};
+
+static const char readme_msg[] =
+	"tracing mini-HOWTO:\n\n"
+	"# mkdir /debug\n"
+	"# mount -t debugfs nodev /debug\n\n"
+	"# cat /debug/tracing/available_tracers\n"
+	"wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+	"# cat /debug/tracing/current_tracer\n"
+	"none\n"
+	"# echo sched_switch > /debug/tracing/current_tracer\n"
+	"# cat /debug/tracing/current_tracer\n"
+	"sched_switch\n"
+	"# cat /debug/tracing/iter_ctrl\n"
+	"noprint-parent nosym-offset nosym-addr noverbose\n"
+	"# echo print-parent > /debug/tracing/iter_ctrl\n"
+	"# echo 1 > /debug/tracing/tracing_enabled\n"
+	"# cat /debug/tracing/trace > /tmp/trace.txt\n"
+	"echo 0 > /debug/tracing/tracing_enabled\n"
+;
+
+static ssize_t
+tracing_readme_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+					readme_msg, strlen(readme_msg));
+}
+
+static struct file_operations tracing_readme_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_readme_read,
+};
+
+static ssize_t
+tracing_ctrl_read(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", tr->ctrl);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_ctrl_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	val = !!val;
+
+	mutex_lock(&trace_types_lock);
+	if (tr->ctrl ^ val) {
+		if (val)
+			tracer_enabled = 1;
+		else
+			tracer_enabled = 0;
+
+		tr->ctrl = val;
+
+		if (current_trace && current_trace->ctrl_update)
+			current_trace->ctrl_update(tr);
+	}
+	mutex_unlock(&trace_types_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+tracing_set_trace_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char buf[max_tracer_type_len+2];
+	int r;
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace)
+		r = sprintf(buf, "%s\n", current_trace->name);
+	else
+		r = sprintf(buf, "\n");
+	mutex_unlock(&trace_types_lock);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = &global_trace;
+	struct tracer *t;
+	char buf[max_tracer_type_len+1];
+	int i;
+
+	if (cnt > max_tracer_type_len)
+		cnt = max_tracer_type_len;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	/* strip ending whitespace. */
+	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+		buf[i] = 0;
+
+	mutex_lock(&trace_types_lock);
+	for (t = trace_types; t; t = t->next) {
+		if (strcmp(t->name, buf) == 0)
+			break;
+	}
+	if (!t || t == current_trace)
+		goto out;
+
+	if (current_trace && current_trace->reset)
+		current_trace->reset(tr);
+
+	current_trace = t;
+	if (t->init)
+		t->init(tr);
+
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	unsigned long *ptr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = snprintf(buf, sizeof(buf), "%ld\n",
+		     *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
+	if (r > sizeof(buf))
+		r = sizeof(buf);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	long *ptr = filp->private_data;
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	*ptr = val * 1000;
+
+	return cnt;
+}
+
+static atomic_t tracing_reader;
+
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	/* We only allow for reader of the pipe */
+	if (atomic_inc_return(&tracing_reader) != 1) {
+		atomic_dec(&tracing_reader);
+		return -EBUSY;
+	}
+
+	/* create a buffer to store the information to pass to userspace */
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	mutex_lock(&trace_types_lock);
+	iter->tr = &global_trace;
+	iter->trace = current_trace;
+	filp->private_data = iter;
+
+	if (iter->trace->pipe_open)
+		iter->trace->pipe_open(iter);
+	mutex_unlock(&trace_types_lock);
+
+	return 0;
+}
+
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter = file->private_data;
+
+	kfree(iter);
+	atomic_dec(&tracing_reader);
+
+	return 0;
+}
+
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+	struct trace_iterator *iter = filp->private_data;
+
+	if (trace_flags & TRACE_ITER_BLOCK) {
+		/*
+		 * Always select as readable when in blocking mode
+		 */
+		return POLLIN | POLLRDNORM;
+	} else {
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+		poll_wait(filp, &trace_wait, poll_table);
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+
+		return 0;
+	}
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_iterator *iter = filp->private_data;
+	struct trace_array_cpu *data;
+	static cpumask_t mask;
+	unsigned long flags;
+#ifdef CONFIG_FTRACE
+	int ftrace_save;
+#endif
+	int cpu;
+	ssize_t sret;
+
+	/* return any leftover data */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (sret != -EBUSY)
+		return sret;
+	sret = 0;
+
+	trace_seq_reset(&iter->seq);
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace->read) {
+		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+		if (sret)
+			goto out;
+	}
+
+	while (trace_empty(iter)) {
+
+		if ((filp->f_flags & O_NONBLOCK)) {
+			sret = -EAGAIN;
+			goto out;
+		}
+
+		/*
+		 * This is a make-shift waitqueue. The reason we don't use
+		 * an actual wait queue is because:
+		 *  1) we only ever have one waiter
+		 *  2) the tracing, traces all functions, we don't want
+		 *     the overhead of calling wake_up and friends
+		 *     (and tracing them too)
+		 *     Anyway, this is really very primitive wakeup.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		iter->tr->waiter = current;
+
+		mutex_unlock(&trace_types_lock);
+
+		/* sleep for 100 msecs, and try again. */
+		schedule_timeout(HZ/10);
+
+		mutex_lock(&trace_types_lock);
+
+		iter->tr->waiter = NULL;
+
+		if (signal_pending(current)) {
+			sret = -EINTR;
+			goto out;
+		}
+
+		if (iter->trace != current_trace)
+			goto out;
+
+		/*
+		 * We block until we read something and tracing is disabled.
+		 * We still block if tracing is disabled, but we have never
+		 * read anything. This allows a user to cat this file, and
+		 * then enable tracing. But after we have read something,
+		 * we give an EOF when tracing is again disabled.
+		 *
+		 * iter->pos will be 0 if we haven't read anything.
+		 */
+		if (!tracer_enabled && iter->pos)
+			break;
+
+		continue;
+	}
+
+	/* stop when tracing is finished */
+	if (trace_empty(iter))
+		goto out;
+
+	if (cnt >= PAGE_SIZE)
+		cnt = PAGE_SIZE - 1;
+
+	/* reset all but tr, trace, and overruns */
+	memset(&iter->seq, 0,
+	       sizeof(struct trace_iterator) -
+	       offsetof(struct trace_iterator, seq));
+	iter->pos = -1;
+
+	/*
+	 * We need to stop all tracing on all CPUS to read the
+	 * the next buffer. This is a bit expensive, but is
+	 * not done often. We fill all what we can read,
+	 * and then release the locks again.
+	 */
+
+	cpus_clear(mask);
+	local_irq_save(flags);
+#ifdef CONFIG_FTRACE
+	ftrace_save = ftrace_enabled;
+	ftrace_enabled = 0;
+#endif
+	smp_wmb();
+	for_each_tracing_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (!head_page(data) || !data->trace_idx)
+			continue;
+
+		atomic_inc(&data->disabled);
+		cpu_set(cpu, mask);
+	}
+
+	for_each_cpu_mask_nr(cpu, mask) {
+		data = iter->tr->data[cpu];
+		__raw_spin_lock(&data->lock);
+
+		if (data->overrun > iter->last_overrun[cpu])
+			iter->overrun[cpu] +=
+				data->overrun - iter->last_overrun[cpu];
+		iter->last_overrun[cpu] = data->overrun;
+	}
+
+	while (find_next_entry_inc(iter) != NULL) {
+		int ret;
+		int len = iter->seq.len;
+
+		ret = print_trace_line(iter);
+		if (!ret) {
+			/* don't print partial lines */
+			iter->seq.len = len;
+			break;
+		}
+
+		trace_consume(iter);
+
+		if (iter->seq.len >= cnt)
+			break;
+	}
+
+	for_each_cpu_mask_nr(cpu, mask) {
+		data = iter->tr->data[cpu];
+		__raw_spin_unlock(&data->lock);
+	}
+
+	for_each_cpu_mask_nr(cpu, mask) {
+		data = iter->tr->data[cpu];
+		atomic_dec(&data->disabled);
+	}
+#ifdef CONFIG_FTRACE
+	ftrace_enabled = ftrace_save;
+#endif
+	local_irq_restore(flags);
+
+	/* Now copy what we have to the user */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (iter->seq.readpos >= iter->seq.len)
+		trace_seq_reset(&iter->seq);
+	if (sret == -EBUSY)
+		sret = 0;
+
+out:
+	mutex_unlock(&trace_types_lock);
+
+	return sret;
+}
+
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%lu\n", tr->entries);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];
+	int i, ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	/* must have at least 1 entry */
+	if (!val)
+		return -EINVAL;
+
+	mutex_lock(&trace_types_lock);
+
+	if (current_trace != &no_tracer) {
+		cnt = -EBUSY;
+		pr_info("ftrace: set current_tracer to none"
+			" before modifying buffer size\n");
+		goto out;
+	}
+
+	if (val > global_trace.entries) {
+		long pages_requested;
+		unsigned long freeable_pages;
+
+		/* make sure we have enough memory before mapping */
+		pages_requested =
+			(val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
+
+		/* account for each buffer (and max_tr) */
+		pages_requested *= tracing_nr_buffers * 2;
+
+		/* Check for overflow */
+		if (pages_requested < 0) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		freeable_pages = determine_dirtyable_memory();
+
+		/* we only allow to request 1/4 of useable memory */
+		if (pages_requested >
+		    ((freeable_pages + tracing_pages_allocated) / 4)) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		while (global_trace.entries < val) {
+			if (trace_alloc_page()) {
+				cnt = -ENOMEM;
+				goto out;
+			}
+			/* double check that we don't go over the known pages */
+			if (tracing_pages_allocated > pages_requested)
+				break;
+		}
+
+	} else {
+		/* include the number of entries in val (inc of page entries) */
+		while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
+			trace_free_page();
+	}
+
+	/* check integrity */
+	for_each_tracing_cpu(i)
+		check_pages(global_trace.data[i]);
+
+	filp->f_pos += cnt;
+
+	/* If check pages failed, return ENOMEM */
+	if (tracing_disabled)
+		cnt = -ENOMEM;
+ out:
+	max_tr.entries = global_trace.entries;
+	mutex_unlock(&trace_types_lock);
+
+	return cnt;
+}
+
+static struct file_operations tracing_max_lat_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_max_lat_read,
+	.write		= tracing_max_lat_write,
+};
+
+static struct file_operations tracing_ctrl_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_ctrl_read,
+	.write		= tracing_ctrl_write,
+};
+
+static struct file_operations set_tracer_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_set_trace_read,
+	.write		= tracing_set_trace_write,
+};
+
+static struct file_operations tracing_pipe_fops = {
+	.open		= tracing_open_pipe,
+	.poll		= tracing_poll_pipe,
+	.read		= tracing_read_pipe,
+	.release	= tracing_release_pipe,
+};
+
+static struct file_operations tracing_entries_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_entries_read,
+	.write		= tracing_entries_write,
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static ssize_t
+tracing_read_long(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	unsigned long *p = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", *p);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static struct file_operations tracing_read_long_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_read_long,
+};
+#endif
+
+static struct dentry *d_tracer;
+
+struct dentry *tracing_init_dentry(void)
+{
+	static int once;
+
+	if (d_tracer)
+		return d_tracer;
+
+	d_tracer = debugfs_create_dir("tracing", NULL);
+
+	if (!d_tracer && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'tracing'\n");
+		return NULL;
+	}
+
+	return d_tracer;
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
+static __init void tracer_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
+				    &global_trace, &tracing_ctrl_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+
+	entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+				    NULL, &tracing_iter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+
+	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
+				    NULL, &tracing_cpumask_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
+
+	entry = debugfs_create_file("latency_trace", 0444, d_tracer,
+				    &global_trace, &tracing_lt_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'latency_trace' entry\n");
+
+	entry = debugfs_create_file("trace", 0444, d_tracer,
+				    &global_trace, &tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
+				    &global_trace, &show_traces_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
+				    &global_trace, &set_tracer_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+				    &tracing_max_latency,
+				    &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_max_latency' entry\n");
+
+	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+				    &tracing_thresh, &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+	entry = debugfs_create_file("README", 0644, d_tracer,
+				    NULL, &tracing_readme_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'README' entry\n");
+
+	entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
+				    NULL, &tracing_pipe_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+
+	entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+				    &global_trace, &tracing_entries_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+				    &ftrace_update_tot_cnt,
+				    &tracing_read_long_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'dyn_ftrace_total_info' entry\n");
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+	init_tracer_sysprof_debugfs(d_tracer);
+#endif
+}
+
+static int trace_alloc_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page, *tmp;
+	LIST_HEAD(pages);
+	void *array;
+	unsigned pages_allocated = 0;
+	int i;
+
+	/* first allocate a page for each CPU */
+	for_each_tracing_cpu(i) {
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+#endif
+	}
+
+	/* Now that we successfully allocate a page per CPU, add them */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		ClearPageLRU(page);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+	tracing_pages_allocated += pages_allocated;
+	global_trace.entries += ENTRIES_PER_PAGE;
+
+	return 0;
+
+ free_pages:
+	list_for_each_entry_safe(page, tmp, &pages, lru) {
+		list_del_init(&page->lru);
+		__free_page(page);
+	}
+	return -ENOMEM;
+}
+
+static int trace_free_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page;
+	struct list_head *p;
+	int i;
+	int ret = 0;
+
+	/* free one page from each buffer */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		tracing_pages_allocated--;
+		tracing_pages_allocated--;
+		__free_page(page);
+
+		tracing_reset(data);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		__free_page(page);
+
+		tracing_reset(data);
+#endif
+	}
+	global_trace.entries -= ENTRIES_PER_PAGE;
+
+	return ret;
+}
+
+__init static int tracer_alloc_buffers(void)
+{
+	struct trace_array_cpu *data;
+	void *array;
+	struct page *page;
+	int pages = 0;
+	int ret = -ENOMEM;
+	int i;
+
+	/* TODO: make the number of buffers hot pluggable with CPUS */
+	tracing_nr_buffers = num_possible_cpus();
+	tracing_buffer_mask = cpu_possible_map;
+
+	/* Allocate the first page for all buffers */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+		max_tr.data[i] = &per_cpu(max_data, i);
+
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		/* set the array to the list */
+		INIT_LIST_HEAD(&data->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &data->trace_pages);
+		/* use the LRU flag to differentiate the two buffers */
+		ClearPageLRU(page);
+
+		data->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		max_tr.data[i]->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &max_tr.data[i]->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+
+	/*
+	 * Since we allocate by orders of pages, we may be able to
+	 * round up a bit.
+	 */
+	global_trace.entries = ENTRIES_PER_PAGE;
+	pages++;
+
+	while (global_trace.entries < trace_nr_entries) {
+		if (trace_alloc_page())
+			break;
+		pages++;
+	}
+	max_tr.entries = global_trace.entries;
+
+	pr_info("tracer: %d pages allocated for %ld",
+		pages, trace_nr_entries);
+	pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE);
+	pr_info("   actual entries %ld\n", global_trace.entries);
+
+	tracer_init_debugfs();
+
+	trace_init_cmdlines();
+
+	register_tracer(&no_tracer);
+	current_trace = &no_tracer;
+
+	/* All seems OK, enable tracing */
+	global_trace.ctrl = tracer_enabled;
+	tracing_disabled = 0;
+
+	return 0;
+
+ free_buffers:
+	for (i-- ; i >= 0; i--) {
+		struct page *page, *tmp;
+		struct trace_array_cpu *data = global_trace.data[i];
+
+		if (data) {
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del_init(&page->lru);
+				__free_page(page);
+			}
+		}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		if (data) {
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del_init(&page->lru);
+				__free_page(page);
+			}
+		}
+#endif
+	}
+	return ret;
+}
+fs_initcall(tracer_alloc_buffers);
Index: linux-2.6.25.4-rt4/kernel/trace/trace.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace.h	2008-05-29 09:46:26.000000000 -0400
@@ -0,0 +1,467 @@
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+
+#include <linux/fs.h>
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/mmiotrace.h>
+
+enum trace_type {
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,
+	TRACE_CTX,
+	TRACE_WAKE,
+	TRACE_STACK,
+	TRACE_SPECIAL,
+	TRACE_MMIO_RW,
+	TRACE_MMIO_MAP,
+	TRACE_IRQ,
+	TRACE_FAULT,
+	TRACE_TIMER_SET,
+	TRACE_TIMER_TRIG,
+	TRACE_TIMESTAMP,
+	TRACE_TASK_ACT,
+	TRACE_TASK_DEACT,
+	TRACE_SYSCALL,
+	TRACE_SYSRET,
+
+	__TRACE_LAST_TYPE
+};
+
+/*
+ * Function trace entry - function address and parent function addres:
+ */
+struct ftrace_entry {
+	unsigned long		ip;
+	unsigned long		parent_ip;
+};
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ */
+struct ctx_switch_entry {
+	unsigned int		prev_pid;
+	unsigned char		prev_prio;
+	unsigned char		prev_state;
+	unsigned int		next_pid;
+	unsigned char		next_prio;
+	unsigned char		next_state;
+};
+
+/*
+ * Special (free-form) trace entry:
+ */
+struct special_entry {
+	unsigned long		arg1;
+	unsigned long		arg2;
+	unsigned long		arg3;
+};
+
+struct irq_entry {
+	unsigned long		ip;
+	unsigned long		ret_ip;
+	unsigned		irq;
+	unsigned		usermode;
+};
+
+struct fault_entry {
+	unsigned long		ip;
+	unsigned long		ret_ip;
+	unsigned long		errorcode;
+	unsigned long		address;
+};
+
+struct timer_entry {
+	unsigned long		ip;
+	ktime_t			expire;
+	void			*timer;
+};
+
+struct timestamp_entry {
+	unsigned long		ip;
+	ktime_t			now;
+};
+
+struct task_entry {
+	unsigned long		ip;
+	pid_t			pid;
+	unsigned		prio;
+	int			cpu;
+};
+
+struct wakeup_entry {
+	unsigned long		ip;
+	pid_t			pid;
+	unsigned		prio;
+	unsigned		curr_prio;
+};
+
+struct syscall_entry {
+	unsigned long		ip;
+	unsigned long		nr;
+	unsigned long		p1;
+	unsigned long		p2;
+	unsigned long		p3;
+};
+
+struct sysret_entry {
+	unsigned long		ip;
+	unsigned long		ret;
+};
+
+/*
+ * Stack-trace entry:
+ */
+
+#define FTRACE_STACK_ENTRIES	8
+
+struct stack_entry {
+	unsigned long		caller[FTRACE_STACK_ENTRIES];
+};
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+	char			type;
+	char			cpu;
+	char			flags;
+	char			preempt_count;
+	int			pid;
+	cycle_t			t;
+	union {
+		struct ftrace_entry		fn;
+		struct ctx_switch_entry		ctx;
+		struct special_entry		special;
+		struct stack_entry		stack;
+		struct mmiotrace_rw		mmiorw;
+		struct mmiotrace_map		mmiomap;
+		struct irq_entry		irq;
+		struct fault_entry		fault;
+		struct timer_entry		timer;
+		struct timestamp_entry		timestamp;
+		struct task_entry		task;
+		struct wakeup_entry		wakeup;
+		struct syscall_entry		syscall;
+		struct sysret_entry		sysret;
+	};
+};
+
+#define TRACE_ENTRY_SIZE	sizeof(struct trace_entry)
+
+/*
+ * The CPU trace array - it consists of thousands of trace entries
+ * plus some other descriptor data: (for example which task started
+ * the trace, etc.)
+ */
+struct trace_array_cpu {
+	struct list_head	trace_pages;
+	atomic_t		disabled;
+	__raw_spinlock_t		lock;
+	struct lock_class_key	lock_key;
+
+	/* these fields get copied into max-trace: */
+	unsigned		trace_head_idx;
+	unsigned		trace_tail_idx;
+	void			*trace_head; /* producer */
+	void			*trace_tail; /* consumer */
+	unsigned long		trace_idx;
+	unsigned long		overrun;
+	unsigned long		saved_latency;
+	unsigned long		critical_start;
+	unsigned long		critical_end;
+	unsigned long		critical_sequence;
+	unsigned long		nice;
+	unsigned long		policy;
+	unsigned long		rt_priority;
+	cycle_t			preempt_timestamp;
+	pid_t			pid;
+	uid_t			uid;
+	char			comm[TASK_COMM_LEN];
+};
+
+struct trace_iterator;
+
+/*
+ * The trace array - an array of per-CPU trace arrays. This is the
+ * highest level data structure that individual tracers deal with.
+ * They have on/off state as well:
+ */
+struct trace_array {
+	unsigned long		entries;
+	long			ctrl;
+	int			cpu;
+	cycle_t			time_start;
+	struct task_struct	*waiter;
+	struct trace_array_cpu	*data[NR_CPUS];
+};
+
+/*
+ * A specific tracer, represented by methods that operate on a trace array:
+ */
+struct tracer {
+	const char		*name;
+	void			(*init)(struct trace_array *tr);
+	void			(*reset)(struct trace_array *tr);
+	void			(*open)(struct trace_iterator *iter);
+	void			(*pipe_open)(struct trace_iterator *iter);
+	void			(*close)(struct trace_iterator *iter);
+	void			(*start)(struct trace_iterator *iter);
+	void			(*stop)(struct trace_iterator *iter);
+	ssize_t			(*read)(struct trace_iterator *iter,
+					struct file *filp, char __user *ubuf,
+					size_t cnt, loff_t *ppos);
+	void			(*ctrl_update)(struct trace_array *tr);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	int			(*selftest)(struct tracer *trace,
+					    struct trace_array *tr);
+#endif
+	int			(*print_line)(struct trace_iterator *iter);
+	struct tracer		*next;
+	int			print_max;
+};
+
+struct trace_seq {
+	unsigned char		buffer[PAGE_SIZE];
+	unsigned int		len;
+	unsigned int		readpos;
+};
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+	struct trace_array	*tr;
+	struct tracer		*trace;
+	void			*private;
+	long			last_overrun[NR_CPUS];
+	long			overrun[NR_CPUS];
+
+	/* The below is zeroed out in pipe_read */
+	struct trace_seq	seq;
+	struct trace_entry	*ent;
+	int			cpu;
+
+	struct trace_entry	*prev_ent;
+	int			prev_cpu;
+
+	unsigned long		iter_flags;
+	loff_t			pos;
+	unsigned long		next_idx[NR_CPUS];
+	struct list_head	*next_page[NR_CPUS];
+	unsigned		next_page_idx[NR_CPUS];
+	long			idx;
+};
+
+void tracing_reset(struct trace_array_cpu *data);
+int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *tracing_init_dentry(void);
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
+
+void ftrace(struct trace_array *tr,
+			    struct trace_array_cpu *data,
+			    unsigned long ip,
+			    unsigned long parent_ip,
+			    unsigned long flags);
+void tracing_sched_switch_trace(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned long flags);
+void tracing_record_cmdline(struct task_struct *tsk);
+
+void tracing_sched_wakeup_trace(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct task_struct *wakee,
+				struct task_struct *cur,
+				unsigned long flags);
+void trace_special(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long arg1,
+		   unsigned long arg2,
+		   unsigned long arg3);
+void trace_function(struct trace_array *tr,
+		    struct trace_array_cpu *data,
+		    unsigned long ip,
+		    unsigned long parent_ip,
+		    unsigned long flags);
+void tracing_event_irq(struct trace_array *tr,
+		       struct trace_array_cpu *data,
+		       unsigned long flags,
+		       unsigned long ip,
+		       int irq, int usermode,
+		       unsigned long retip);
+void tracing_event_fault(struct trace_array *tr,
+			 struct trace_array_cpu *data,
+			 unsigned long flags,
+			 unsigned long ip,
+			 unsigned long retip,
+			 unsigned long error_code,
+			 unsigned long address);
+void tracing_event_timer_set(struct trace_array *tr,
+			     struct trace_array_cpu *data,
+			     unsigned long flags,
+			     unsigned long ip,
+			     ktime_t *expires, void *timer);
+void tracing_event_timer_triggered(struct trace_array *tr,
+				   struct trace_array_cpu *data,
+				   unsigned long flags,
+				   unsigned long ip,
+				   ktime_t *expired, void *timer);
+void tracing_event_timestamp(struct trace_array *tr,
+			     struct trace_array_cpu *data,
+			     unsigned long flags,
+			     unsigned long ip,
+			     ktime_t *now);
+void tracing_event_task_activate(struct trace_array *tr,
+				 struct trace_array_cpu *data,
+				 unsigned long flags,
+				 unsigned long ip,
+				 struct task_struct *p,
+				 int cpu);
+void tracing_event_task_deactivate(struct trace_array *tr,
+				   struct trace_array_cpu *data,
+				   unsigned long flags,
+				   unsigned long ip,
+				   struct task_struct *p,
+				   int cpu);
+void tracing_event_wakeup(struct trace_array *tr,
+			  struct trace_array_cpu *data,
+			  unsigned long flags,
+			  unsigned long ip,
+			  pid_t pid, int prio,
+			  int curr_prio);
+void tracing_event_syscall(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   unsigned long flags,
+			   unsigned long ip,
+			   unsigned long nr,
+			   unsigned long p1,
+			   unsigned long p2,
+			   unsigned long p3);
+void tracing_event_sysret(struct trace_array *tr,
+			  struct trace_array_cpu *data,
+			  unsigned long flags,
+			  unsigned long ip,
+			  unsigned long ret);
+
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
+int register_tracer(struct tracer *type);
+void unregister_tracer(struct tracer *type);
+
+extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+
+extern unsigned long tracing_max_latency;
+extern unsigned long tracing_thresh;
+
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr_single(struct trace_array *tr,
+			  struct task_struct *tsk, int cpu);
+
+extern cycle_t ftrace_now(int cpu);
+
+#ifdef CONFIG_FTRACE
+void tracing_start_function_trace(void);
+void tracing_stop_function_trace(void);
+#else
+# define tracing_start_function_trace()		do { } while (0)
+# define tracing_stop_function_trace()		do { } while (0)
+#endif
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+typedef void
+(*tracer_switch_func_t)(void *private,
+			void *__rq,
+			struct task_struct *prev,
+			struct task_struct *next);
+
+struct tracer_switch_ops {
+	tracer_switch_func_t		func;
+	void				*private;
+	struct tracer_switch_ops	*next;
+};
+
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_update_tot_cnt;
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+extern int DYN_FTRACE_TEST_NAME(void);
+#endif
+
+#ifdef CONFIG_MMIOTRACE
+extern void __trace_mmiotrace_rw(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct mmiotrace_rw *rw);
+extern void __trace_mmiotrace_map(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct mmiotrace_map *map);
+#endif
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+#ifdef CONFIG_FTRACE
+extern int trace_selftest_startup_function(struct tracer *trace,
+					   struct trace_array *tr);
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+extern int trace_selftest_startup_irqsoff(struct tracer *trace,
+					  struct trace_array *tr);
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+extern int trace_selftest_startup_preemptoff(struct tracer *trace,
+					     struct trace_array *tr);
+#endif
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
+						 struct trace_array *tr);
+#endif
+#ifdef CONFIG_SCHED_TRACER
+extern int trace_selftest_startup_wakeup(struct tracer *trace,
+					 struct trace_array *tr);
+#endif
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern int trace_selftest_startup_sched_switch(struct tracer *trace,
+					       struct trace_array *tr);
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+extern int trace_selftest_startup_sysprof(struct tracer *trace,
+					       struct trace_array *tr);
+#endif
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
+extern void *head_page(struct trace_array_cpu *data);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
+extern long ns2usecs(cycle_t nsec);
+
+extern unsigned long trace_flags;
+
+/*
+ * trace_iterator_flags is an enumeration that defines bit
+ * positions into trace_flags that controls the output.
+ *
+ * NOTE: These bits must match the trace_options array in
+ *       trace.c.
+ */
+enum trace_iterator_flags {
+	TRACE_ITER_PRINT_PARENT		= 0x01,
+	TRACE_ITER_SYM_OFFSET		= 0x02,
+	TRACE_ITER_SYM_ADDR		= 0x04,
+	TRACE_ITER_VERBOSE		= 0x08,
+	TRACE_ITER_RAW			= 0x10,
+	TRACE_ITER_HEX			= 0x20,
+	TRACE_ITER_BIN			= 0x40,
+	TRACE_ITER_BLOCK		= 0x80,
+	TRACE_ITER_STACKTRACE		= 0x100,
+	TRACE_ITER_SCHED_TREE		= 0x200,
+};
+
+#endif /* _LINUX_KERNEL_TRACE_H */
Index: linux-2.6.25.4-rt4/kernel/trace/trace_functions.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_functions.c	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,78 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static void function_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static void start_function_trace(struct trace_array *tr)
+{
+	function_reset(tr);
+	tracing_start_cmdline_record();
+	tracing_start_function_trace();
+}
+
+static void stop_function_trace(struct trace_array *tr)
+{
+	tracing_stop_function_trace();
+	tracing_stop_cmdline_record();
+}
+
+static void function_trace_init(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_function_trace(tr);
+}
+
+static void function_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_function_trace(tr);
+}
+
+static void function_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_function_trace(tr);
+	else
+		stop_function_trace(tr);
+}
+
+static struct tracer function_trace __read_mostly =
+{
+	.name	     = "ftrace",
+	.init	     = function_trace_init,
+	.reset	     = function_trace_reset,
+	.ctrl_update = function_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_function,
+#endif
+};
+
+static __init int init_function_trace(void)
+{
+	return register_tracer(&function_trace);
+}
+
+device_initcall(init_function_trace);
Index: linux-2.6.25.4-rt4/kernel/trace/trace_irqsoff.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_irqsoff.c	2008-05-29 09:46:34.000000000 -0400
@@ -0,0 +1,505 @@
+/*
+ * trace irqs off criticall timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+#include "trace_hist.h"
+
+static struct trace_array		*irqsoff_trace __read_mostly;
+static int				tracer_enabled __read_mostly;
+
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+static DEFINE_RAW_SPINLOCK(max_trace_lock);
+
+enum {
+	TRACER_IRQS_OFF		= (1 << 1),
+	TRACER_PREEMPT_OFF	= (1 << 2),
+};
+
+static int trace_type __read_mostly;
+
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int
+preempt_trace(void)
+{
+	return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int
+irq_trace(void)
+{
+	return ((trace_type & TRACER_IRQS_OFF) &&
+		irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesnt
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp	unsigned long max_sequence;
+
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	/*
+	 * Does not matter if we preempt. We test the flags
+	 * afterward, to see if irqs are disabled or not.
+	 * If we preempt and get a false positive, the flags
+	 * test will fail.
+	 */
+	cpu = raw_smp_processor_id();
+	if (likely(!per_cpu(tracing_cpu, cpu)))
+		return;
+
+	local_save_flags(flags);
+	/* slight chance to get a false positive on tracing_cpu */
+	if (!irqs_disabled_flags(flags))
+		return;
+
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		trace_function(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = irqsoff_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void
+check_critical_timing(struct trace_array *tr,
+		      struct trace_array_cpu *data,
+		      unsigned long parent_ip,
+		      int cpu)
+{
+	unsigned long latency, t0, t1;
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = ftrace_now(cpu);
+	delta = T1-T0;
+
+	local_save_flags(flags);
+
+	if (!report_latency(delta))
+		goto out;
+
+	spin_lock_irqsave(&max_trace_lock, flags);
+
+	/* check if we are still the max latency */
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+
+	latency = nsecs_to_usecs(delta);
+
+	if (data->critical_sequence != max_sequence)
+		goto out_unlock;
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	data->critical_end = parent_ip;
+
+	update_max_tr_single(tr, current, cpu);
+
+	max_sequence++;
+
+out_unlock:
+	spin_unlock_irqrestore(&max_trace_lock, flags);
+
+out:
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = ftrace_now(cpu);
+	tracing_reset(data);
+	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+}
+
+static inline void
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	cpu = raw_smp_processor_id();
+
+	if (per_cpu(tracing_cpu, cpu))
+		return;
+
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = ftrace_now(cpu);
+	data->critical_start = parent_ip ? : ip;
+	tracing_reset(data);
+
+	local_save_flags(flags);
+
+	trace_function(tr, data, ip, parent_ip, flags);
+
+	per_cpu(tracing_cpu, cpu) = 1;
+
+	atomic_dec(&data->disabled);
+}
+
+static inline void
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	cpu = raw_smp_processor_id();
+	/* Always clear the tracing cpu on stopping the trace */
+	if (unlikely(per_cpu(tracing_cpu, cpu)))
+		per_cpu(tracing_cpu, cpu) = 0;
+	else
+		return;
+
+	if (!tracer_enabled)
+		return;
+
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || unlikely(!head_page(data)) ||
+	    !data->critical_start || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	local_save_flags(flags);
+	trace_function(tr, data, ip, parent_ip, flags);
+	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+	data->critical_start = 0;
+	atomic_dec(&data->disabled);
+}
+
+/* start and stop critical timings used to for stoppage (in idle) */
+void start_critical_timings(void)
+{
+	if (preempt_trace() || irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+
+	tracing_hist_preempt_start();
+}
+
+void stop_critical_timings(void)
+{
+	tracing_hist_preempt_stop(TRACE_STOP);
+
+	if (preempt_trace() || irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+#ifdef CONFIG_PROVE_LOCKING
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+	tracing_hist_preempt_stop(1);
+
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(a0, a1);
+}
+
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(a0, a1);
+
+	tracing_hist_preempt_start();
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void trace_hardirqs_on(void)
+{
+	tracing_hist_preempt_stop(1);
+
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+
+	tracing_hist_preempt_start();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	tracing_hist_preempt_stop(1);
+
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(CALLER_ADDR0, caller_addr);
+
+	tracing_hist_preempt_start();
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	tracing_hist_preempt_stop(0);
+	stop_critical_timing(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	start_critical_timing(a0, a1);
+	tracing_hist_preempt_start();
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+static void start_irqsoff_tracer(struct trace_array *tr)
+{
+	register_ftrace_function(&trace_ops);
+	tracer_enabled = 1;
+}
+
+static void stop_irqsoff_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
+}
+
+static void __irqsoff_tracer_init(struct trace_array *tr)
+{
+	irqsoff_trace = tr;
+	/* make sure that the tracer is visible */
+	smp_wmb();
+
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+	else
+		stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_irqsoff_tracer(iter->tr);
+}
+
+static void irqsoff_tracer_close(struct trace_iterator *iter)
+{
+	if (iter->tr->ctrl)
+		start_irqsoff_tracer(iter->tr);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+static struct tracer irqsoff_tracer __read_mostly =
+{
+	.name		= "irqsoff",
+	.init		= irqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_irqsoff,
+#endif
+};
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+	.name		= "preemptoff",
+	.init		= preemptoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptoff,
+#endif
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+	defined(CONFIG_PREEMPT_TRACER)
+
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+	.name		= "preemptirqsoff",
+	.init		= preemptirqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptirqsoff,
+#endif
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
+
+__init static int init_irqsoff_tracer(void)
+{
+	register_irqsoff(irqsoff_tracer);
+	register_preemptoff(preemptoff_tracer);
+	register_preemptirqsoff(preemptirqsoff_tracer);
+
+	return 0;
+}
+device_initcall(init_irqsoff_tracer);
Index: linux-2.6.25.4-rt4/kernel/trace/trace_mmiotrace.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_mmiotrace.c	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,295 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define DEBUG 1
+
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+#include <linux/pci.h>
+
+#include "trace.h"
+
+struct header_iter {
+	struct pci_dev *dev;
+};
+
+static struct trace_array *mmio_trace_array;
+static bool overrun_detected;
+
+static void mmio_reset_data(struct trace_array *tr)
+{
+	int cpu;
+
+	overrun_detected = false;
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static void mmio_trace_init(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	mmio_trace_array = tr;
+	if (tr->ctrl) {
+		mmio_reset_data(tr);
+		enable_mmiotrace();
+	}
+}
+
+static void mmio_trace_reset(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	if (tr->ctrl)
+		disable_mmiotrace();
+	mmio_reset_data(tr);
+	mmio_trace_array = NULL;
+}
+
+static void mmio_trace_ctrl_update(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	if (tr->ctrl) {
+		mmio_reset_data(tr);
+		enable_mmiotrace();
+	} else {
+		disable_mmiotrace();
+	}
+}
+
+static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+	int ret = 0;
+	int i;
+	resource_size_t start, end;
+	const struct pci_driver *drv = pci_dev_driver(dev);
+
+	/* XXX: incomplete checks for trace_seq_printf() return value */
+	ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+				dev->bus->number, dev->devfn,
+				dev->vendor, dev->device, dev->irq);
+	/*
+	 * XXX: is pci_resource_to_user() appropriate, since we are
+	 * supposed to interpret the __ioremap() phys_addr argument based on
+	 * these printed values?
+	 */
+	for (i = 0; i < 7; i++) {
+		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		ret += trace_seq_printf(s, " %llx",
+			(unsigned long long)(start |
+			(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+	}
+	for (i = 0; i < 7; i++) {
+		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		ret += trace_seq_printf(s, " %llx",
+			dev->resource[i].start < dev->resource[i].end ?
+			(unsigned long long)(end - start) + 1 : 0);
+	}
+	if (drv)
+		ret += trace_seq_printf(s, " %s\n", drv->name);
+	else
+		ret += trace_seq_printf(s, " \n");
+	return ret;
+}
+
+static void destroy_header_iter(struct header_iter *hiter)
+{
+	if (!hiter)
+		return;
+	pci_dev_put(hiter->dev);
+	kfree(hiter);
+}
+
+static void mmio_pipe_open(struct trace_iterator *iter)
+{
+	struct header_iter *hiter;
+	struct trace_seq *s = &iter->seq;
+
+	trace_seq_printf(s, "VERSION 20070824\n");
+
+	hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+	if (!hiter)
+		return;
+
+	hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
+	iter->private = hiter;
+}
+
+/* XXX: This is not called when the pipe is closed! */
+static void mmio_close(struct trace_iterator *iter)
+{
+	struct header_iter *hiter = iter->private;
+	destroy_header_iter(hiter);
+	iter->private = NULL;
+}
+
+static unsigned long count_overruns(struct trace_iterator *iter)
+{
+	int cpu;
+	unsigned long cnt = 0;
+	for_each_online_cpu(cpu) {
+		cnt += iter->overrun[cpu];
+		iter->overrun[cpu] = 0;
+	}
+	return cnt;
+}
+
+static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
+				char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	ssize_t ret;
+	struct header_iter *hiter = iter->private;
+	struct trace_seq *s = &iter->seq;
+	unsigned long n;
+
+	n = count_overruns(iter);
+	if (n) {
+		/* XXX: This is later than where events were lost. */
+		trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
+		if (!overrun_detected)
+			pr_warning("mmiotrace has lost events.\n");
+		overrun_detected = true;
+		goto print_out;
+	}
+
+	if (!hiter)
+		return 0;
+
+	mmio_print_pcidev(s, hiter->dev);
+	hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
+
+	if (!hiter->dev) {
+		destroy_header_iter(hiter);
+		iter->private = NULL;
+	}
+
+print_out:
+	ret = trace_seq_to_user(s, ubuf, cnt);
+	return (ret == -EBUSY) ? 0 : ret;
+}
+
+static int mmio_print_rw(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct mmiotrace_rw *rw	= &entry->mmiorw;
+	struct trace_seq *s	= &iter->seq;
+	unsigned long long t	= ns2usecs(entry->t);
+	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned secs		= (unsigned long)t;
+	int ret = 1;
+
+	switch (entry->mmiorw.opcode) {
+	case MMIO_READ:
+		ret = trace_seq_printf(s,
+			"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			rw->width, secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
+			rw->value, rw->pc, 0);
+		break;
+	case MMIO_WRITE:
+		ret = trace_seq_printf(s,
+			"W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			rw->width, secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
+			rw->value, rw->pc, 0);
+		break;
+	case MMIO_UNKNOWN_OP:
+		ret = trace_seq_printf(s,
+			"UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+			secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
+			(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
+			(rw->value >> 0) & 0xff, rw->pc, 0);
+		break;
+	default:
+		ret = trace_seq_printf(s, "rw what?\n");
+		break;
+	}
+	if (ret)
+		return 1;
+	return 0;
+}
+
+static int mmio_print_map(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct mmiotrace_map *m	= &entry->mmiomap;
+	struct trace_seq *s	= &iter->seq;
+	unsigned long long t	= ns2usecs(entry->t);
+	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned secs		= (unsigned long)t;
+	int ret = 1;
+
+	switch (entry->mmiorw.opcode) {
+	case MMIO_PROBE:
+		ret = trace_seq_printf(s,
+			"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+			secs, usec_rem, m->map_id,
+			(unsigned long long)m->phys, m->virt, m->len,
+			0UL, 0);
+		break;
+	case MMIO_UNPROBE:
+		ret = trace_seq_printf(s,
+			"UNMAP %lu.%06lu %d 0x%lx %d\n",
+			secs, usec_rem, m->map_id, 0UL, 0);
+		break;
+	default:
+		ret = trace_seq_printf(s, "map what?\n");
+		break;
+	}
+	if (ret)
+		return 1;
+	return 0;
+}
+
+/* return 0 to abort printing without consuming current entry in pipe mode */
+static int mmio_print_line(struct trace_iterator *iter)
+{
+	switch (iter->ent->type) {
+	case TRACE_MMIO_RW:
+		return mmio_print_rw(iter);
+	case TRACE_MMIO_MAP:
+		return mmio_print_map(iter);
+	default:
+		return 1; /* ignore unknown entries */
+	}
+}
+
+static struct tracer mmio_tracer __read_mostly =
+{
+	.name		= "mmiotrace",
+	.init		= mmio_trace_init,
+	.reset		= mmio_trace_reset,
+	.pipe_open	= mmio_pipe_open,
+	.close		= mmio_close,
+	.read		= mmio_read,
+	.ctrl_update	= mmio_trace_ctrl_update,
+	.print_line	= mmio_print_line,
+};
+
+__init static int init_mmio_trace(void)
+{
+	return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+
+void mmio_trace_rw(struct mmiotrace_rw *rw)
+{
+	struct trace_array *tr = mmio_trace_array;
+	struct trace_array_cpu *data = tr->data[smp_processor_id()];
+	__trace_mmiotrace_rw(tr, data, rw);
+}
+
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+	struct trace_array *tr = mmio_trace_array;
+	struct trace_array_cpu *data;
+
+	preempt_disable();
+	data = tr->data[smp_processor_id()];
+	__trace_mmiotrace_map(tr, data, map);
+	preempt_enable();
+}
Index: linux-2.6.25.4-rt4/kernel/trace/trace_sched_switch.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_sched_switch.c	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,196 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array	*ctx_trace;
+static int __read_mostly	tracer_enabled;
+static atomic_t			sched_ref;
+
+static void
+sched_switch_func(void *private, void *__rq, struct task_struct *prev,
+			struct task_struct *next)
+{
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	tracing_record_cmdline(prev);
+	tracing_record_cmdline(next);
+
+	if (!tracer_enabled)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		tracing_sched_switch_trace(tr, data, prev, next, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	if (!atomic_read(&sched_ref))
+		return;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	sched_switch_func(probe_data, __rq, prev, next);
+}
+
+static void sched_switch_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static int tracing_sched_register(void)
+{
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&ctx_trace);
+	if (ret)
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+
+	return ret;
+}
+
+static void tracing_sched_unregister(void)
+{
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&ctx_trace);
+}
+
+void tracing_start_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_inc_return(&sched_ref);
+	if (ref == 1)
+		tracing_sched_register();
+}
+
+void tracing_stop_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_dec_and_test(&sched_ref);
+	if (ref)
+		tracing_sched_unregister();
+}
+
+void tracing_start_cmdline_record(void)
+{
+	tracing_start_sched_switch();
+}
+
+void tracing_stop_cmdline_record(void)
+{
+	tracing_stop_sched_switch();
+}
+
+static void start_sched_trace(struct trace_array *tr)
+{
+	sched_switch_reset(tr);
+	tracer_enabled = 1;
+	tracing_start_cmdline_record();
+}
+
+static void stop_sched_trace(struct trace_array *tr)
+{
+	tracing_stop_cmdline_record();
+	tracer_enabled = 0;
+}
+
+static void sched_switch_trace_init(struct trace_array *tr)
+{
+	ctx_trace = tr;
+
+	if (tr->ctrl)
+		start_sched_trace(tr);
+}
+
+static void sched_switch_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_sched_trace(tr);
+}
+
+static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+{
+	/* When starting a new trace, reset the buffers */
+	if (tr->ctrl)
+		start_sched_trace(tr);
+	else
+		stop_sched_trace(tr);
+}
+
+static struct tracer sched_switch_trace __read_mostly =
+{
+	.name		= "sched_switch",
+	.init		= sched_switch_trace_init,
+	.reset		= sched_switch_trace_reset,
+	.ctrl_update	= sched_switch_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_sched_switch,
+#endif
+};
+
+__init static int init_sched_switch_trace(void)
+{
+	int ret = 0;
+
+	if (atomic_read(&sched_ref))
+		ret = tracing_sched_register();
+	if (ret) {
+		pr_info("error registering scheduler trace\n");
+		return ret;
+	}
+	return register_tracer(&sched_switch_trace);
+}
+device_initcall(init_sched_switch_trace);
Index: linux-2.6.25.4-rt4/kernel/trace/trace_sched_wakeup.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_sched_wakeup.c	2008-05-29 09:46:34.000000000 -0400
@@ -0,0 +1,447 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/marker.h>
+
+#include "trace.h"
+
+static struct trace_array	*wakeup_trace;
+static int __read_mostly	tracer_enabled;
+
+static struct task_struct	*wakeup_task;
+static int			wakeup_cpu;
+static unsigned			wakeup_prio = -1;
+
+static DEFINE_RAW_SPINLOCK(wakeup_lock);
+
+static void __wakeup_reset(struct trace_array *tr);
+
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int cpu;
+
+	if (likely(!wakeup_task))
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	if (unlikely(!wakeup_task))
+		goto unlock;
+
+	/*
+	 * The task can't disappear because it needs to
+	 * wake up first, and we have the wakeup_lock.
+	 */
+	if (task_cpu(wakeup_task) != cpu)
+		goto unlock;
+
+	trace_function(tr, data, ip, parent_ip, flags);
+
+ unlock:
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+
+ out:
+	atomic_dec(&data->disabled);
+
+	/*
+	 * To prevent recursion from the scheduler, if the
+	 * resched flag was set before we entered, then
+	 * don't reschedule.
+	 */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = wakeup_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void notrace
+wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
+	struct task_struct *next)
+{
+	unsigned long latency = 0, t0 = 0, t1 = 0;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
+	struct trace_array_cpu *data;
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	/*
+	 * When we start a new trace, we set wakeup_task to NULL
+	 * and then set tracer_enabled = 1. We want to make sure
+	 * that another CPU does not see the tracer_enabled = 1
+	 * and the wakeup_task with an older task, that might
+	 * actually be the same as next.
+	 */
+	smp_rmb();
+
+	if (next != wakeup_task)
+		return;
+
+	/* The task we are waiting for is waking up */
+	data = tr->data[wakeup_cpu];
+
+	/* disable local data, not wakeup_cpu data */
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (likely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	/* We could race with grabbing wakeup_lock */
+	if (unlikely(!tracer_enabled || next != wakeup_task))
+		goto out_unlock;
+
+	trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = ftrace_now(cpu);
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	latency = nsecs_to_usecs(delta);
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	update_max_tr(tr, wakeup_task, wakeup_cpu);
+
+out_unlock:
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	wakeup_sched_switch(probe_data, __rq, prev, next);
+}
+
+static void __wakeup_reset(struct trace_array *tr)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	assert_spin_locked(&wakeup_lock);
+
+	for_each_possible_cpu(cpu) {
+		data = tr->data[cpu];
+		tracing_reset(data);
+	}
+
+	wakeup_cpu = -1;
+	wakeup_prio = -1;
+
+	if (wakeup_task)
+		put_task_struct(wakeup_task);
+
+	wakeup_task = NULL;
+}
+
+static void wakeup_reset(struct trace_array *tr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+static void
+wakeup_check_start(struct trace_array *tr, struct task_struct *p,
+		   struct task_struct *curr)
+{
+	int cpu = smp_processor_id();
+	unsigned long flags;
+	long disabled;
+
+	if (likely(!rt_task(p)) ||
+			p->prio >= wakeup_prio ||
+			p->prio >= curr->prio)
+		return;
+
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	/* interrupts should be off from try_to_wake_up */
+	spin_lock(&wakeup_lock);
+
+	/* check for races. */
+	if (!tracer_enabled || p->prio >= wakeup_prio)
+		goto out_locked;
+
+	/* reset the trace */
+	__wakeup_reset(tr);
+
+	wakeup_cpu = task_cpu(p);
+	wakeup_prio = p->prio;
+
+	wakeup_task = p;
+	get_task_struct(wakeup_task);
+
+	local_save_flags(flags);
+
+	tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
+	trace_function(tr, tr->data[wakeup_cpu],
+		       CALLER_ADDR1, CALLER_ADDR2, flags);
+
+out_locked:
+	spin_unlock(&wakeup_lock);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
+{
+	struct trace_array **ptr = probe_data;
+	struct trace_array *tr = *ptr;
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
+
+	tracing_record_cmdline(task);
+	tracing_record_cmdline(curr);
+
+	wakeup_check_start(tr, task, curr);
+}
+
+static void start_wakeup_tracer(struct trace_array *tr)
+{
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&wakeup_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
+	wakeup_reset(tr);
+
+	/*
+	 * Don't let the tracer_enabled = 1 show up before
+	 * the wakeup_task is reset. This may be overkill since
+	 * wakeup_reset does a spin_unlock after setting the
+	 * wakeup_task to NULL, but I want to be safe.
+	 * This is a slow path anyway.
+	 */
+	smp_wmb();
+
+	tracer_enabled = 1;
+	register_ftrace_function(&trace_ops);
+
+	return;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
+}
+
+static void stop_wakeup_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
+}
+
+static void wakeup_tracer_init(struct trace_array *tr)
+{
+	wakeup_trace = tr;
+
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+}
+
+static void wakeup_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl) {
+		stop_wakeup_tracer(tr);
+		/* make sure we put back any tasks we are tracing */
+		wakeup_reset(tr);
+	}
+}
+
+static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+	else
+		stop_wakeup_tracer(tr);
+}
+
+static void wakeup_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_wakeup_tracer(iter->tr);
+}
+
+static void wakeup_tracer_close(struct trace_iterator *iter)
+{
+	/* forget about any processes we were recording */
+	if (iter->tr->ctrl)
+		start_wakeup_tracer(iter->tr);
+}
+
+static struct tracer wakeup_tracer __read_mostly =
+{
+	.name		= "wakeup",
+	.init		= wakeup_tracer_init,
+	.reset		= wakeup_tracer_reset,
+	.open		= wakeup_tracer_open,
+	.close		= wakeup_tracer_close,
+	.ctrl_update	= wakeup_tracer_ctrl_update,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_wakeup,
+#endif
+};
+
+__init static int init_wakeup_tracer(void)
+{
+	int ret;
+
+	ret = register_tracer(&wakeup_tracer);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+device_initcall(init_wakeup_tracer);
Index: linux-2.6.25.4-rt4/kernel/trace/trace_selftest.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_selftest.c	2008-05-29 09:46:09.000000000 -0400
@@ -0,0 +1,572 @@
+/* Include in trace.c */
+
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+static inline int trace_valid_entry(struct trace_entry *entry)
+{
+	switch (entry->type) {
+	case TRACE_FN:
+	case TRACE_CTX:
+	case TRACE_WAKE:
+	case TRACE_STACK:
+	case TRACE_SPECIAL:
+	case TRACE_IRQ:
+	case TRACE_FAULT:
+	case TRACE_TIMER_SET:
+	case TRACE_TIMER_TRIG:
+	case TRACE_TIMESTAMP:
+	case TRACE_TASK_ACT:
+	case TRACE_TASK_DEACT:
+	case TRACE_SYSCALL:
+	case TRACE_SYSRET:
+		return 1;
+	}
+	return 0;
+}
+
+static int
+trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
+{
+	struct trace_entry *entries;
+	struct page *page;
+	int idx = 0;
+	int i;
+
+	BUG_ON(list_empty(&data->trace_pages));
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	entries = page_address(page);
+
+	check_pages(data);
+	if (head_page(data) != entries)
+		goto failed;
+
+	/*
+	 * The starting trace buffer always has valid elements,
+	 * if any element exists.
+	 */
+	entries = head_page(data);
+
+	for (i = 0; i < tr->entries; i++) {
+
+		if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
+			printk(KERN_CONT ".. invalid entry %d ",
+				entries[idx].type);
+			goto failed;
+		}
+
+		idx++;
+		if (idx >= ENTRIES_PER_PAGE) {
+			page = virt_to_page(entries);
+			if (page->lru.next == &data->trace_pages) {
+				if (i != tr->entries - 1) {
+					printk(KERN_CONT ".. entries buffer mismatch");
+					goto failed;
+				}
+			} else {
+				page = list_entry(page->lru.next, struct page, lru);
+				entries = page_address(page);
+			}
+			idx = 0;
+		}
+	}
+
+	page = virt_to_page(entries);
+	if (page->lru.next != &data->trace_pages) {
+		printk(KERN_CONT ".. too many entries");
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	/* disable tracing */
+	tracing_disabled = 1;
+	printk(KERN_CONT ".. corrupted trace buffer .. ");
+	return -1;
+}
+
+/*
+ * Test the trace buffer to see if all the elements
+ * are still sane.
+ */
+static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+{
+	unsigned long flags, cnt = 0;
+	int cpu, ret = 0;
+
+	/* Don't allow flipping of max traces now */
+	raw_local_irq_save(flags);
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_possible_cpu(cpu) {
+		if (!head_page(tr->data[cpu]))
+			continue;
+
+		cnt += tr->data[cpu]->trace_idx;
+
+		ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
+		if (ret)
+			break;
+	}
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_restore(flags);
+
+	if (count)
+		*count = cnt;
+
+	return ret;
+}
+
+#ifdef CONFIG_FTRACE
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
+/* Test dynamic code modification and ftrace filters */
+int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+					   struct trace_array *tr,
+					   int (*func)(void))
+{
+	unsigned long count;
+	int ret;
+	int save_ftrace_enabled = ftrace_enabled;
+	int save_tracer_enabled = tracer_enabled;
+	char *func_name;
+
+	/* The ftrace test PASSED */
+	printk(KERN_CONT "PASSED\n");
+	pr_info("Testing dynamic ftrace: ");
+
+	/* enable tracing, and record the filter function */
+	ftrace_enabled = 1;
+	tracer_enabled = 1;
+
+	/* passed in by parameter to fool gcc from optimizing */
+	func();
+
+	/* update the records */
+	ret = ftrace_force_update();
+	if (ret) {
+		printk(KERN_CONT ".. ftraced failed .. ");
+		return ret;
+	}
+
+	/*
+	 * Some archs *cough*PowerPC*cough* add charachters to the
+	 * start of the function names. We simply put a '*' to
+	 * accomodate them.
+	 */
+	func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+
+	/* filter only on our function */
+	ftrace_set_filter(func_name, strlen(func_name), 1);
+
+	/* enable tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+
+	/* we should have nothing in the buffer */
+	ret = trace_test_buffer(tr, &count);
+	if (ret)
+		goto out;
+
+	if (count) {
+		ret = -1;
+		printk(KERN_CONT ".. filter did not filter .. ");
+		goto out;
+	}
+
+	/* call our function again */
+	func();
+
+	/* sleep again */
+	msleep(100);
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	ftrace_enabled = 0;
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	/* we should only have one item */
+	if (!ret && count != 1) {
+		printk(KERN_CONT ".. filter failed count=%ld ..", count);
+		ret = -1;
+		goto out;
+	}
+ out:
+	ftrace_enabled = save_ftrace_enabled;
+	tracer_enabled = save_tracer_enabled;
+
+	/* Enable tracing on all functions again */
+	ftrace_set_filter(NULL, 0, 1);
+
+	return ret;
+}
+#else
+# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+#endif /* CONFIG_DYNAMIC_FTRACE */
+/*
+ * Simple verification test of ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace
+ * buffer to see if all is in order.
+ */
+int
+trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+	int save_ftrace_enabled = ftrace_enabled;
+	int save_tracer_enabled = tracer_enabled;
+
+	/* make sure msleep has been recorded */
+	msleep(1);
+
+	/* force the recorded functions to be traced */
+	ret = ftrace_force_update();
+	if (ret) {
+		printk(KERN_CONT ".. ftraced failed .. ");
+		return ret;
+	}
+
+	/* start the tracing */
+	ftrace_enabled = 1;
+	tracer_enabled = 1;
+
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	ftrace_enabled = 0;
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	ret = trace_selftest_startup_dynamic_tracing(trace, tr,
+						     DYN_FTRACE_TEST_NAME);
+
+ out:
+	ftrace_enabled = save_ftrace_enabled;
+	tracer_enabled = save_tracer_enabled;
+
+	/* kill ftrace totally if we failed */
+	if (ret)
+		ftrace_kill();
+
+	return ret;
+}
+#endif /* CONFIG_FTRACE */
+
+#ifdef CONFIG_IRQSOFF_TRACER
+int
+trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+	/* disable interrupts for a bit */
+	local_irq_disable();
+	udelay(100);
+	local_irq_enable();
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+int
+trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+	/* disable preemption for a bit */
+	preempt_disable();
+	udelay(100);
+	preempt_enable();
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+int
+trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+
+	/* reset the max latency */
+	tracing_max_latency = 0;
+
+	/* disable preemption and interrupts for a bit */
+	preempt_disable();
+	local_irq_disable();
+	udelay(100);
+	preempt_enable();
+	/* reverse the order of preempt vs irqs */
+	local_irq_enable();
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (ret)
+		goto out;
+
+	ret = trace_test_buffer(&max_tr, &count);
+	if (ret)
+		goto out;
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	/* do the test by disabling interrupts first this time */
+	tracing_max_latency = 0;
+	tr->ctrl = 1;
+	trace->ctrl_update(tr);
+	preempt_disable();
+	local_irq_disable();
+	udelay(100);
+	preempt_enable();
+	/* reverse the order of preempt vs irqs */
+	local_irq_enable();
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (ret)
+		goto out;
+
+	ret = trace_test_buffer(&max_tr, &count);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+ out:
+	trace->reset(tr);
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+
+#ifdef CONFIG_SCHED_TRACER
+static int trace_wakeup_test_thread(void *data)
+{
+	/* Make this a RT thread, doesn't need to be too high */
+	struct sched_param param = { .sched_priority = 5 };
+	struct completion *x = data;
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+
+	/* Make it know we have a new prio */
+	complete(x);
+
+	/* now go to sleep and let the test wake us up */
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule();
+
+	/* we are awake, now wait to disappear */
+	while (!kthread_should_stop()) {
+		/*
+		 * This is an RT task, do short sleeps to let
+		 * others run.
+		 */
+		msleep(100);
+	}
+
+	return 0;
+}
+
+int
+trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	struct task_struct *p;
+	struct completion isrt;
+	unsigned long count;
+	int ret;
+
+	init_completion(&isrt);
+
+	/* create a high prio thread */
+	p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
+	if (IS_ERR(p)) {
+		printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
+		return -1;
+	}
+
+	/* make sure the thread is running at an RT prio */
+	wait_for_completion(&isrt);
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+
+	/* sleep to let the RT thread sleep too */
+	msleep(100);
+
+	/*
+	 * Yes this is slightly racy. It is possible that for some
+	 * strange reason that the RT thread we created, did not
+	 * call schedule for 100ms after doing the completion,
+	 * and we do a wakeup on a task that already is awake.
+	 * But that is extremely unlikely, and the worst thing that
+	 * happens in such a case, is that we disable tracing.
+	 * Honestly, if this race does happen something is horrible
+	 * wrong with the system.
+	 */
+
+	wake_up_process(p);
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+
+
+	trace->reset(tr);
+
+	tracing_max_latency = save_max;
+
+	/* kill the thread */
+	kthread_stop(p);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_SCHED_TRACER */
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+int
+trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_SYSPROF_TRACER
+int
+trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	return ret;
+}
+#endif /* CONFIG_SYSPROF_TRACER */
Index: linux-2.6.25.4-rt4/kernel/trace/trace_selftest_dynamic.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_selftest_dynamic.c	2008-05-29 09:46:04.000000000 -0400
@@ -0,0 +1,7 @@
+#include "trace.h"
+
+int DYN_FTRACE_TEST_NAME(void)
+{
+	/* used to call mcount */
+	return 0;
+}
Index: linux-2.6.25.4-rt4/lib/Kconfig.debug
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/Kconfig.debug	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/Kconfig.debug	2008-05-29 09:46:43.000000000 -0400
@@ -237,6 +237,8 @@ config DEBUG_RT_MUTEXES
 	help
 	 This allows rt mutex semantics violations and rt mutex related
 	 deadlocks (lockups) to be detected and reported automatically.
+	 When realtime preemption is enabled this includes spinlocks,
+	 rwlocks, mutexes and (rw)semaphores
 
 config DEBUG_PI_LIST
 	bool
@@ -260,7 +262,7 @@ config DEBUG_SPINLOCK
 
 config DEBUG_MUTEXES
 	bool "Mutex debugging: basic checks"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && !PREEMPT_RT
 	help
 	 This feature allows mutex semantics violations to be detected and
 	 reported.
@@ -621,4 +623,6 @@ config PROVIDE_OHCI1394_DMA_INIT
 
 	  See Documentation/debugging-via-ohci1394.txt for more information.
 
+source kernel/trace/Kconfig
+
 source "samples/Kconfig"
Index: linux-2.6.25.4-rt4/lib/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/Makefile	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/Makefile	2008-05-29 09:46:53.000000000 -0400
@@ -3,11 +3,20 @@
 #
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
-	 rbtree.o radix-tree.o dump_stack.o \
+	 rbtree.o radix-tree.o dump_stack.o lock_list.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o
 
+ifdef CONFIG_FTRACE
+# Do not profile string.o, since it may be used in early boot or vdso
+CFLAGS_REMOVE_string.o = -pg
+# Also do not profile any debug utilities
+CFLAGS_REMOVE_spinlock_debug.o = -pg
+CFLAGS_REMOVE_list_debug.o = -pg
+CFLAGS_REMOVE_debugobjects.o = -pg
+endif
+
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 
@@ -27,7 +36,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o 
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_PREEMPT_RT) += plist.o
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
Index: linux-2.6.25.4-rt4/mm/page-writeback.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/page-writeback.c	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/page-writeback.c	2008-05-29 09:46:56.000000000 -0400
@@ -126,8 +126,6 @@ static void background_writeout(unsigned
 static struct prop_descriptor vm_completions;
 static struct prop_descriptor vm_dirties;
 
-static unsigned long determine_dirtyable_memory(void);
-
 /*
  * couple the period to the dirty_ratio:
  *
@@ -286,7 +284,13 @@ static unsigned long highmem_dirtyable_m
 #endif
 }
 
-static unsigned long determine_dirtyable_memory(void)
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the numebr of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
 
@@ -1016,9 +1020,11 @@ int __set_page_dirty_nobuffers(struct pa
 		if (!mapping)
 			return 1;
 
-		write_lock_irq(&mapping->tree_lock);
+		lock_page_ref_irq(page);
 		mapping2 = page_mapping(page);
 		if (mapping2) { /* Race with truncate? */
+			DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);
+
 			BUG_ON(mapping2 != mapping);
 			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
 			if (mapping_cap_account_dirty(mapping)) {
@@ -1027,10 +1033,12 @@ int __set_page_dirty_nobuffers(struct pa
 						BDI_RECLAIMABLE);
 				task_io_account_write(PAGE_CACHE_SIZE);
 			}
-			radix_tree_tag_set(&mapping->page_tree,
+			radix_tree_lock(&ctx);
+			radix_tree_tag_set(ctx.tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
+			radix_tree_unlock(&ctx);
 		}
-		write_unlock_irq(&mapping->tree_lock);
+		unlock_page_ref_irq(page);
 		if (mapping->host) {
 			/* !PageAnon && !swapper_space */
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1186,18 +1194,21 @@ int test_clear_page_writeback(struct pag
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		lock_page_ref_irqsave(page, flags);
 		ret = TestClearPageWriteback(page);
 		if (ret) {
-			radix_tree_tag_clear(&mapping->page_tree,
-						page_index(page),
+			DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);
+
+			radix_tree_lock(&ctx);
+			radix_tree_tag_clear(ctx.tree, page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			radix_tree_unlock(&ctx);
 			if (bdi_cap_writeback_dirty(bdi)) {
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
 				__bdi_writeout_inc(bdi);
 			}
 		}
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
+		unlock_page_ref_irqrestore(page, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
 	}
@@ -1214,21 +1225,25 @@ int test_set_page_writeback(struct page 
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
+		DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);
 
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		lock_page_ref_irqsave(page, flags);
 		ret = TestSetPageWriteback(page);
 		if (!ret) {
-			radix_tree_tag_set(&mapping->page_tree,
-						page_index(page),
+			radix_tree_lock(&ctx);
+			radix_tree_tag_set(ctx.tree, page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			radix_tree_unlock(&ctx);
 			if (bdi_cap_writeback_dirty(bdi))
 				__inc_bdi_stat(bdi, BDI_WRITEBACK);
 		}
-		if (!PageDirty(page))
-			radix_tree_tag_clear(&mapping->page_tree,
-						page_index(page),
+		if (!PageDirty(page)) {
+			radix_tree_lock(&ctx);
+			radix_tree_tag_clear(ctx.tree, page_index(page),
 						PAGECACHE_TAG_DIRTY);
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
+			radix_tree_unlock(&ctx);
+		}
+		unlock_page_ref_irqrestore(page, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
 	}
Index: linux-2.6.25.4-rt4/scripts/Makefile.lib
===================================================================
--- linux-2.6.25.4-rt4.orig/scripts/Makefile.lib	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/scripts/Makefile.lib	2008-05-29 09:46:04.000000000 -0400
@@ -96,7 +96,8 @@ basename_flags = -D"KBUILD_BASENAME=KBUI
 modname_flags  = $(if $(filter 1,$(words $(modname))),\
                  -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
 
-_c_flags       = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o)
+orig_c_flags   = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o)
+_c_flags       = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags))
 _a_flags       = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o)
 _cpp_flags     = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F))
 
Index: linux-2.6.25.4-rt4/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/Makefile	2008-05-29 09:45:59.000000000 -0400
+++ linux-2.6.25.4-rt4/Makefile	2008-05-29 09:47:34.000000000 -0400
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 25
-EXTRAVERSION = .4
+EXTRAVERSION = .4-rt4
 NAME = Funky Weasel is Jiggy wit it
 
 # *DOCUMENTATION*
@@ -507,6 +507,10 @@ else
 KBUILD_CFLAGS	+= -O2
 endif
 
+ifdef CONFIG_FTRACE
+KBUILD_CFLAGS	+= -pg
+endif
+
 # Force gcc to behave correct even for buggy distributions
 # Arch Makefiles may override this setting
 KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector)
Index: linux-2.6.25.4-rt4/arch/x86/kernel/i386_ksyms_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/i386_ksyms_32.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/i386_ksyms_32.c	2008-05-29 09:46:27.000000000 -0400
@@ -1,13 +1,21 @@
+#include <linux/ftrace.h>
 #include <linux/module.h>
 #include <asm/semaphore.h>
 #include <asm/checksum.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
+
+#ifdef CONFIG_ASM_SEMAPHORES
+EXPORT_SYMBOL(__compat_down_failed);
+EXPORT_SYMBOL(__compat_down_failed_interruptible);
+EXPORT_SYMBOL(__compat_down_failed_trylock);
+EXPORT_SYMBOL(__compat_up_wakeup);
+#endif
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_generic);
 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/x8664_ksyms_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/x8664_ksyms_64.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/x8664_ksyms_64.c	2008-05-29 09:46:27.000000000 -0400
@@ -1,6 +1,7 @@
 /* Exports for assembly files.
    All C exports should go in the respective C files. */
 
+#include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/smp.h>
 
@@ -10,12 +11,19 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
+
 EXPORT_SYMBOL(kernel_thread);
 
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
+EXPORT_SYMBOL(__compat_down_failed);
+EXPORT_SYMBOL(__compat_down_failed_interruptible);
+EXPORT_SYMBOL(__compat_down_failed_trylock);
+EXPORT_SYMBOL(__compat_up_wakeup);
+#endif
 
 EXPORT_SYMBOL(__get_user_1);
 EXPORT_SYMBOL(__get_user_2);
Index: linux-2.6.25.4-rt4/arch/x86/kernel/nmi_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/nmi_32.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/nmi_32.c	2008-05-29 09:47:32.000000000 -0400
@@ -25,6 +25,7 @@
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
+#include <asm/mach-default/mach_ipi.h>
 
 #include "mach_traps.h"
 
@@ -42,7 +43,7 @@ static cpumask_t backtrace_mask = CPU_MA
 atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
 
 unsigned int nmi_watchdog = NMI_DEFAULT;
-static unsigned int nmi_hz = HZ;
+static unsigned int nmi_hz = 1000;
 
 static DEFINE_PER_CPU(short, wd_enabled);
 
@@ -55,7 +56,12 @@ static int endflag __initdata = 0;
  */
 static __init void nmi_cpu_busy(void *data)
 {
+	/*
+	 * avoid a warning, on PREEMPT_RT this wont run in hardirq context:
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	local_irq_enable_in_hardirq();
+#endif
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
 	   even if there is a simulator or similar that catches the
@@ -92,7 +98,7 @@ static int __init check_nmi_watchdog(voi
 	for_each_possible_cpu(cpu)
 		prev_nmi_count[cpu] = nmi_count(cpu);
 	local_irq_enable();
-	mdelay((20*1000)/nmi_hz); // wait 20 ticks
+	mdelay((100*1000)/nmi_hz); // wait 100 ticks
 
 	for_each_possible_cpu(cpu) {
 #ifdef CONFIG_SMP
@@ -317,7 +323,60 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
 
 extern void die_nmi(struct pt_regs *, const char *msg);
 
-__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+int nmi_show_regs[NR_CPUS];
+
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
+
+notrace int irq_show_regs_callback(int cpu, struct pt_regs *regs)
+{
+	if (!nmi_show_regs[cpu])
+		return 0;
+
+	spin_lock(&nmi_print_lock);
+	printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu);
+	printk(KERN_WARNING "apic_timer_irqs: %d\n",
+		per_cpu(irq_stat, cpu).apic_timer_irqs);
+	show_regs(regs);
+	spin_unlock(&nmi_print_lock);
+	nmi_show_regs[cpu] = 0;
+	return 1;
+}
+
+void nmi_show_all_regs(void)
+{
+	struct pt_regs *regs;
+	int i, cpu;
+
+	if (system_state == SYSTEM_BOOTING)
+		return;
+
+	preempt_disable();
+
+	regs = get_irq_regs();
+	cpu = smp_processor_id();
+
+	printk(KERN_WARNING "nmi_show_all_regs(): start on CPU#%d.\n", cpu);
+	dump_stack();
+
+	for_each_online_cpu(i)
+		nmi_show_regs[i] = 1;
+
+	if (regs)
+		irq_show_regs_callback(cpu, regs);
+	else
+		nmi_show_regs[cpu] = 0;
+
+	smp_send_nmi_allbutself();
+	preempt_enable();
+
+	for_each_online_cpu(i) {
+		while (nmi_show_regs[i] == 1)
+			barrier();
+	}
+}
+
+notrace __kprobes int
+nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 {
 
 	/*
@@ -328,7 +387,10 @@ __kprobes int nmi_watchdog_tick(struct p
 	unsigned int sum;
 	int touched = 0;
 	int cpu = smp_processor_id();
-	int rc = 0;
+	int rc;
+
+	rc = irq_show_regs_callback(cpu, regs);
+	__profile_tick(CPU_PROFILING, regs);
 
 	/* check for other users first */
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
@@ -342,7 +404,7 @@ __kprobes int nmi_watchdog_tick(struct p
 
 		spin_lock(&lock);
 		printk("NMI backtrace for cpu %d\n", cpu);
-		dump_stack();
+		show_regs(regs);
 		spin_unlock(&lock);
 		cpu_clear(cpu, backtrace_mask);
 	}
@@ -354,6 +416,7 @@ __kprobes int nmi_watchdog_tick(struct p
 	sum = per_cpu(irq_stat, cpu).apic_timer_irqs +
 		per_cpu(irq_stat, cpu).irq0_irqs;
 
+	/* if the apic timer isn't firing, this cpu isn't doing much */
 	/* if the none of the timers isn't firing, this cpu isn't doing much */
 	if (!touched && last_irq_sums[cpu] == sum) {
 		/*
@@ -361,11 +424,36 @@ __kprobes int nmi_watchdog_tick(struct p
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
 		alert_counter[cpu]++;
-		if (alert_counter[cpu] == 5*nmi_hz)
-			/*
-			 * die_nmi will return ONLY if NOTIFY_STOP happens..
-			 */
-			die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
+		if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) {
+			int i;
+
+			spin_lock(&nmi_print_lock);
+			printk(KERN_WARNING "NMI watchdog detected lockup on "
+				"CPU#%d (%d/%d)\n", cpu, alert_counter[cpu],
+						    5*nmi_hz);
+			show_regs(regs);
+			spin_unlock(&nmi_print_lock);
+
+			for_each_online_cpu(i) {
+				if (i == cpu)
+					continue;
+				nmi_show_regs[i] = 1;
+			}
+
+			smp_send_nmi_allbutself();
+
+			for_each_online_cpu(i) {
+				if (i == cpu)
+					continue;
+				while (nmi_show_regs[i] == 1)
+					cpu_relax();
+			}
+			printk(KERN_WARNING "NMI watchdog running again ...\n");
+			for_each_online_cpu(i)
+				alert_counter[i] = 0;
+
+		}
+
 	} else {
 		last_irq_sums[cpu] = sum;
 		alert_counter[cpu] = 0;
@@ -463,5 +551,17 @@ void __trigger_all_cpu_backtrace(void)
 	}
 }
 
+void smp_send_nmi_allbutself(void)
+{
+#ifdef CONFIG_SMP
+	cpumask_t mask = cpu_online_map;
+	preempt_disable();
+	cpu_clear(safe_smp_processor_id(), mask);
+	if (!cpus_empty(mask))
+		send_IPI_mask(mask, NMI_VECTOR);
+	preempt_enable();
+#endif
+}
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
Index: linux-2.6.25.4-rt4/arch/x86/kernel/nmi_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/nmi_64.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/nmi_64.c	2008-05-29 09:47:32.000000000 -0400
@@ -20,11 +20,13 @@
 #include <linux/kprobes.h>
 #include <linux/cpumask.h>
 #include <linux/kdebug.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
 #include <asm/proto.h>
 #include <asm/mce.h>
+#include <asm/mach_apic.h>
 
 int unknown_nmi_panic;
 int nmi_watchdog_enabled;
@@ -42,12 +44,12 @@ atomic_t nmi_active = ATOMIC_INIT(0);		/
 static int panic_on_timeout;
 
 unsigned int nmi_watchdog = NMI_DEFAULT;
-static unsigned int nmi_hz = HZ;
+static unsigned int nmi_hz = 1000;
 
 static DEFINE_PER_CPU(short, wd_enabled);
 
 /* Run after command line and cpu_init init, but before all other checks */
-void nmi_watchdog_default(void)
+static void nmi_watchdog_default(void)
 {
 	if (nmi_watchdog != NMI_DEFAULT)
 		return;
@@ -63,7 +65,9 @@ static int endflag __initdata = 0;
  */
 static __init void nmi_cpu_busy(void *data)
 {
+#ifndef CONFIG_PREEMPT_RT
 	local_irq_enable_in_hardirq();
+#endif
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
 	   even if there is a simulator or similar that catches the
@@ -297,7 +301,7 @@ void touch_nmi_watchdog(void)
 		unsigned cpu;
 
 		/*
- 		 * Tell other CPUs to reset their alert counters. We cannot
+		 * Tell other CPUs to reset their alert counters. We cannot
 		 * do it ourselves because the alert count increase is not
 		 * atomic.
 		 */
@@ -311,12 +315,66 @@ void touch_nmi_watchdog(void)
 }
 EXPORT_SYMBOL(touch_nmi_watchdog);
 
-int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+int nmi_show_regs[NR_CPUS];
+
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
+
+notrace int irq_show_regs_callback(int cpu, struct pt_regs *regs)
+{
+	if (!nmi_show_regs[cpu])
+		return 0;
+
+	spin_lock(&nmi_print_lock);
+	printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu);
+	printk(KERN_WARNING "apic_timer_irqs: %d\n", read_pda(apic_timer_irqs));
+	show_regs(regs);
+	spin_unlock(&nmi_print_lock);
+	nmi_show_regs[cpu] = 0;
+	return 1;
+}
+
+void nmi_show_all_regs(void)
+{
+	struct pt_regs *regs;
+	int i, cpu;
+
+	if (system_state == SYSTEM_BOOTING)
+		return;
+
+	preempt_disable();
+
+	regs = get_irq_regs();
+	cpu = smp_processor_id();
+
+	printk(KERN_WARNING "nmi_show_all_regs(): start on CPU#%d.\n", cpu);
+	dump_stack();
+
+	for_each_online_cpu(i)
+		nmi_show_regs[i] = 1;
+
+	if (regs)
+		irq_show_regs_callback(cpu, regs);
+	else
+		nmi_show_regs[cpu] = 0;
+
+	smp_send_nmi_allbutself();
+	preempt_enable();
+
+	for_each_online_cpu(i) {
+		while (nmi_show_regs[i] == 1)
+			barrier();
+	}
+}
+
+notrace int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 {
 	int sum;
 	int touched = 0;
 	int cpu = smp_processor_id();
-	int rc = 0;
+	int rc;
+
+	rc = irq_show_regs_callback(cpu, regs);
+	__profile_tick(CPU_PROFILING, regs);
 
 	/* check for other users first */
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
@@ -325,7 +383,6 @@ int __kprobes nmi_watchdog_tick(struct p
 		touched = 1;
 	}
 
-	sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
 	if (__get_cpu_var(nmi_touch)) {
 		__get_cpu_var(nmi_touch) = 0;
 		touched = 1;
@@ -341,6 +398,12 @@ int __kprobes nmi_watchdog_tick(struct p
 		cpu_clear(cpu, backtrace_mask);
 	}
 
+	/*
+	 * Take the local apic timer and PIT/HPET into account. We don't
+	 * know which one is active, when we have highres/dyntick on
+	 */
+	sum = read_pda(apic_timer_irqs) + kstat_cpu(cpu).irqs[0];
+
 #ifdef CONFIG_X86_MCE
 	/* Could check oops_in_progress here too, but it's safer
 	   not too */
@@ -354,9 +417,26 @@ int __kprobes nmi_watchdog_tick(struct p
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
 		local_inc(&__get_cpu_var(alert_counter));
-		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
+		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
+			int i;
+
+			for_each_online_cpu(i) {
+				if (i == cpu)
+					continue;
+				nmi_show_regs[i] = 1;
+			}
+
+			smp_send_nmi_allbutself();
+
+			for_each_online_cpu(i) {
+				if (i == cpu)
+					continue;
+				while (nmi_show_regs[i] == 1)
+					cpu_relax();
+			}
 			die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
 				panic_on_timeout);
+		}
 	} else {
 		__get_cpu_var(last_irq_sum) = sum;
 		local_set(&__get_cpu_var(alert_counter), 0);
@@ -474,5 +554,14 @@ void __trigger_all_cpu_backtrace(void)
 	}
 }
 
+void smp_send_nmi_allbutself(void)
+{
+#ifdef CONFIG_SMP
+	preempt_disable();
+	send_IPI_allbutself(NMI_VECTOR);
+	preempt_enable();
+#endif
+}
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
Index: linux-2.6.25.4-rt4/arch/x86/lib/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/lib/Makefile	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/lib/Makefile	2008-05-29 09:46:04.000000000 -0400
@@ -5,6 +5,7 @@
 obj-$(CONFIG_SMP) := msr-on-cpu.o
 
 lib-y := delay_$(BITS).o
+lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
 lib-y += memcpy_$(BITS).o
 
@@ -20,7 +21,7 @@ else
         CFLAGS_csum-partial_64.o := -funroll-loops
 
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
-        lib-y += thunk_64.o clear_page_64.o copy_page_64.o
+        lib-y += clear_page_64.o copy_page_64.o
         lib-y += bitops_64.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
Index: linux-2.6.25.4-rt4/include/asm-x86/alternative.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/alternative.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/alternative.h	2008-05-29 09:46:04.000000000 -0400
@@ -72,6 +72,8 @@ static inline void alternatives_smp_modu
 static inline void alternatives_smp_switch(int smp) {}
 #endif	/* CONFIG_SMP */
 
+const unsigned char *const *find_nop_table(void);
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
Index: linux-2.6.25.4-rt4/include/asm-x86/irqflags.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/irqflags.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/irqflags.h	2008-05-29 09:46:04.000000000 -0400
@@ -212,8 +212,6 @@ static inline void trace_hardirqs_fixup(
  * have a reliable stack. x86_64 only.
  */
 #define SWAPGS_UNSAFE_STACK	swapgs
-#define ARCH_TRACE_IRQS_ON		call trace_hardirqs_on_thunk
-#define ARCH_TRACE_IRQS_OFF		call trace_hardirqs_off_thunk
 #define ARCH_LOCKDEP_SYS_EXIT		call lockdep_sys_exit_thunk
 #define ARCH_LOCKDEP_SYS_EXIT_IRQ	\
 	TRACE_IRQS_ON; \
@@ -225,24 +223,6 @@ static inline void trace_hardirqs_fixup(
 	TRACE_IRQS_OFF;
 
 #else
-#define ARCH_TRACE_IRQS_ON			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_on;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_off;		\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
 #define ARCH_LOCKDEP_SYS_EXIT			\
 	pushl %eax;				\
 	pushl %ecx;				\
@@ -256,8 +236,8 @@ static inline void trace_hardirqs_fixup(
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-#  define TRACE_IRQS_ON		ARCH_TRACE_IRQS_ON
-#  define TRACE_IRQS_OFF	ARCH_TRACE_IRQS_OFF
+#  define TRACE_IRQS_ON		call trace_hardirqs_on_thunk;
+#  define TRACE_IRQS_OFF	call trace_hardirqs_off_thunk;
 #else
 #  define TRACE_IRQS_ON
 #  define TRACE_IRQS_OFF
Index: linux-2.6.25.4-rt4/include/asm-x86/timer.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/timer.h	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/timer.h	2008-05-29 09:46:07.000000000 -0400
@@ -43,19 +43,16 @@ DECLARE_PER_CPU(unsigned long, cyc2ns);
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
-static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+static inline notrace unsigned long long __cycles_2_ns(unsigned long long cyc)
 {
 	return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
 }
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static inline notrace unsigned long long cycles_2_ns(unsigned long long cyc)
 {
 	unsigned long long ns;
-	unsigned long flags;
 
-	local_irq_save(flags);
 	ns = __cycles_2_ns(cyc);
-	local_irq_restore(flags);
 
 	return ns;
 }
Index: linux-2.6.25.4-rt4/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/Makefile	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/Makefile	2008-05-29 09:46:04.000000000 -0400
@@ -7,6 +7,13 @@ extra-$(CONFIG_X86_64) += head64.o
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
+ifdef CONFIG_FTRACE
+# Do not profile debug utilities
+CFLAGS_REMOVE_tsc_64.o = -pg
+CFLAGS_REMOVE_tsc_32.o = -pg
+CFLAGS_REMOVE_rtc.o = -pg
+endif
+
 #
 # vsyscalls (which work on the user stack) should have
 # no stack-protector checks:
@@ -55,6 +62,7 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse_$(B
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic_$(BITS).o nmi_$(BITS).o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic_$(BITS).o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
Index: linux-2.6.25.4-rt4/init/main.c
===================================================================
--- linux-2.6.25.4-rt4.orig/init/main.c	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/init/main.c	2008-05-29 09:47:31.000000000 -0400
@@ -34,6 +34,7 @@
 #include <linux/workqueue.h>
 #include <linux/profile.h>
 #include <linux/rcupdate.h>
+#include <linux/posix-timers.h>
 #include <linux/moduleparam.h>
 #include <linux/kallsyms.h>
 #include <linux/writeback.h>
@@ -47,6 +48,7 @@
 #include <linux/delayacct.h>
 #include <linux/unistd.h>
 #include <linux/rmap.h>
+#include <linux/irq.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
 #include <linux/unwind.h>
@@ -58,6 +60,7 @@
 #include <linux/kthread.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
+#include <linux/ftrace.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -97,6 +100,12 @@ static inline void acpi_early_init(void)
 #ifndef CONFIG_DEBUG_RODATA
 static inline void mark_rodata_ro(void) { }
 #endif
+#ifdef CONFIG_ALLOC_RTSJ_MEM
+extern void alloc_rtsj_mem_early_setup(void);
+#else
+static inline void alloc_rtsj_mem_early_setup(void) { }
+#endif
+
 
 #ifdef CONFIG_TC
 extern void tc_init(void);
@@ -434,6 +443,8 @@ static void noinline __init_refok rest_i
 {
 	int pid;
 
+	system_state = SYSTEM_BOOTING_SCHEDULER_OK;
+
 	kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
 	numa_default_policy();
 	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
@@ -445,7 +456,7 @@ static void noinline __init_refok rest_i
 	 * at least once to get things moving:
 	 */
 	init_idle_bootup_task(current);
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 	schedule();
 	preempt_disable();
 
@@ -550,8 +561,10 @@ asmlinkage void __init start_kernel(void
 	 * fragile until we cpu_idle() for the first time.
 	 */
 	preempt_disable();
+
 	build_all_zonelists();
 	page_alloc_init();
+	early_init_hardirqs();
 	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
 	parse_early_param();
 	parse_args("Booting kernel", static_command_line, __start___param,
@@ -572,6 +585,7 @@ asmlinkage void __init start_kernel(void
 	softirq_init();
 	timekeeping_init();
 	time_init();
+	sched_clock_init();
 	profile_init();
 	if (!irqs_disabled())
 		printk("start_kernel(): bug: interrupts were enabled early\n");
@@ -606,6 +620,7 @@ asmlinkage void __init start_kernel(void
 #endif
 	vfs_caches_init_early();
 	cpuset_init_early();
+	alloc_rtsj_mem_early_setup();
 	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
@@ -646,6 +661,9 @@ asmlinkage void __init start_kernel(void
 
 	acpi_early_init(); /* before LAPIC and SMP init */
 
+#ifdef CONFIG_PREEMPT_RT
+	WARN_ON(irqs_disabled());
+#endif
 	/* Do the rest non-__init'ed, we're now alive */
 	rest_init();
 }
@@ -750,11 +768,14 @@ __setup("nosoftlockup", nosoftlockup_set
 static void __init do_pre_smp_initcalls(void)
 {
 	extern int spawn_ksoftirqd(void);
+	extern int spawn_desched_task(void);
 
 	migration_init();
+	posix_cpu_thread_init();
 	spawn_ksoftirqd();
 	if (!nosoftlockup)
 		spawn_softlockup_task();
+	spawn_desched_task();
 }
 
 static void run_init_process(char *init_filename)
@@ -780,11 +801,17 @@ static int noinline init_post(void)
 	(void) sys_dup(0);
 	(void) sys_dup(0);
 
+#ifdef CONFIG_PREEMPT_RT
+	ftrace_disable_daemon();
+#endif
 	if (ramdisk_execute_command) {
 		run_init_process(ramdisk_execute_command);
 		printk(KERN_WARNING "Failed to execute %s\n",
 				ramdisk_execute_command);
 	}
+#ifdef CONFIG_PREEMPT_RT
+	WARN_ON(irqs_disabled());
+#endif
 
 	/*
 	 * We try each of these until one succeeds.
@@ -826,6 +853,8 @@ static int __init kernel_init(void * unu
 
 	smp_prepare_cpus(setup_max_cpus);
 
+	init_hardirqs();
+
 	do_pre_smp_initcalls();
 
 	smp_init();
@@ -847,12 +876,61 @@ static int __init kernel_init(void * unu
 		ramdisk_execute_command = NULL;
 		prepare_namespace();
 	}
+#ifdef CONFIG_PREEMPT_RT
+	WARN_ON(irqs_disabled());
+#endif
 
+#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_MUTEXES) + defined(CONFIG_IRQSOFF_TRACER) + defined(CONFIG_PREEMPT_TRACER) + defined(CONFIG_FTRACE) + defined(CONFIG_WAKEUP_LATENCY_HIST) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC) + defined(CONFIG_LOCKDEP))
+
+#if DEBUG_COUNT > 0
+	printk(KERN_ERR "*****************************************************************************\n");
+	printk(KERN_ERR "*                                                                           *\n");
+#if DEBUG_COUNT == 1
+	printk(KERN_ERR "*  REMINDER, the following debugging option is turned on in your .config:   *\n");
+#else
+	printk(KERN_ERR "*  REMINDER, the following debugging options are turned on in your .config: *\n");
+#endif
+	printk(KERN_ERR "*                                                                           *\n");
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	printk(KERN_ERR "*        CONFIG_DEBUG_RT_MUTEXES                                            *\n");
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+	printk(KERN_ERR "*        CONFIG_IRQSOFF_TRACER                                              *\n");
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+	printk(KERN_ERR "*        CONFIG_PREEMPT_TRACER                                              *\n");
+#endif
+#ifdef CONFIG_FTRACE
+	printk(KERN_ERR "*        CONFIG_FTRACE                                                      *\n");
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	printk(KERN_ERR "*        CONFIG_WAKEUP_LATENCY_HIST                                         *\n");
+#endif
+#ifdef CONFIG_DEBUG_SLAB
+	printk(KERN_ERR "*        CONFIG_DEBUG_SLAB                                                  *\n");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk(KERN_ERR "*        CONFIG_DEBUG_PAGEALLOC                                             *\n");
+#endif
+#ifdef CONFIG_LOCKDEP
+	printk(KERN_ERR "*        CONFIG_LOCKDEP                                                     *\n");
+#endif
+	printk(KERN_ERR "*                                                                           *\n");
+#if DEBUG_COUNT == 1
+	printk(KERN_ERR "*  it may increase runtime overhead and latencies.                          *\n");
+#else
+	printk(KERN_ERR "*  they may increase runtime overhead and latencies.                        *\n");
+#endif
+	printk(KERN_ERR "*                                                                           *\n");
+	printk(KERN_ERR "*****************************************************************************\n");
+#endif
 	/*
 	 * Ok, we have completed the initial bootup, and
 	 * we're essentially up and running. Get rid of the
 	 * initmem segments and start the user-mode stuff..
 	 */
 	init_post();
+	WARN_ON(debug_direct_keyboard);
+
 	return 0;
 }
Index: linux-2.6.25.4-rt4/kernel/sched_clock.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/sched_clock.c	2008-05-29 09:47:21.000000000 -0400
@@ -0,0 +1,272 @@
+/*
+ * sched_clock for unstable cpu clocks
+ *
+ *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Based on code by:
+ *   Ingo Molnar <mingo@redhat.com>
+ *   Guillaume Chazarain <guichaz@gmail.com>
+ *
+ * Create a semi stable clock from a mixture of other events, including:
+ *  - gtod
+ *  - jiffies
+ *  - sched_clock()
+ *  - explicit idle events
+ *
+ * We use gtod as base and the unstable clock deltas. The deltas are filtered,
+ * making it monotonic and keeping it within an expected window.  This window
+ * is set up using jiffies.
+ *
+ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
+ * that is otherwise invisible (TSC gets stopped).
+ *
+ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
+ * consistent between cpus (never more than 1 jiffies difference).
+ */
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/hardirq.h>
+
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+
+struct sched_clock_data {
+	/*
+	 * Raw spinlock - this is a special case: this might be called
+	 * from within instrumentation code so we dont want to do any
+	 * instrumentation ourselves.
+	 */
+	__raw_spinlock_t	lock;
+
+	unsigned long		prev_jiffies;
+	u64			prev_raw;
+	u64			tick_raw;
+	u64			tick_gtod;
+	u64			clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+	return &__get_cpu_var(sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+	return &per_cpu(sched_clock_data, cpu);
+}
+
+static __read_mostly int sched_clock_running;
+
+void sched_clock_init(void)
+{
+	int cpu;
+	unsigned long now_jiffies = jiffies;
+	u64 ktime = ktime_to_ns(ktime_get());
+
+	for_each_possible_cpu(cpu) {
+		struct sched_clock_data *scd = cpu_sdc(cpu);
+
+		scd->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		scd->prev_jiffies = now_jiffies;
+		scd->prev_raw = 0;
+		scd->tick_raw = 0;
+		scd->tick_gtod = 0;
+		scd->clock = ktime;
+	}
+
+	sched_clock_running = 1;
+}
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ *  - filter out backward motion
+ *  - use jiffies to generate a min,max window to clip the raw values
+ */
+static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
+{
+	unsigned long now_jiffies = jiffies;
+	long delta_jiffies = now_jiffies - scd->prev_jiffies;
+	u64 clock = scd->clock;
+	u64 min_clock, max_clock;
+	s64 delta = now - scd->prev_raw;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
+
+	if (unlikely(delta < 0)) {
+		clock++;
+		goto out;
+	}
+
+	max_clock = min_clock + TICK_NSEC;
+
+	if (unlikely(clock + delta > max_clock)) {
+		if (clock < max_clock)
+			clock = max_clock;
+		else
+			clock++;
+	} else {
+		clock += delta;
+	}
+
+ out:
+	if (unlikely(clock < min_clock))
+		clock = min_clock;
+
+	scd->prev_raw = now;
+	scd->prev_jiffies = now_jiffies;
+	scd->clock = clock;
+}
+
+static void lock_double_clock(struct sched_clock_data *data1,
+				struct sched_clock_data *data2)
+{
+	if (data1 < data2) {
+		__raw_spin_lock(&data1->lock);
+		__raw_spin_lock(&data2->lock);
+	} else {
+		__raw_spin_lock(&data2->lock);
+		__raw_spin_lock(&data1->lock);
+	}
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+	struct sched_clock_data *scd = cpu_sdc(cpu);
+	u64 now, clock;
+
+	if (unlikely(!sched_clock_running))
+		return 0ULL;
+
+	if (unlikely(in_nmi()))
+		return scd->clock;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	now = sched_clock();
+
+#if 0
+	if (cpu != smp_processor_id()) {
+		/*
+		 * in order to update a remote cpu's clock based on our
+		 * unstable raw time rebase it against:
+		 *   tick_raw		(offset between raw counters)
+		 *   tick_gotd          (tick offset between cpus)
+		 */
+		struct sched_clock_data *my_scd = this_scd();
+
+		lock_double_clock(scd, my_scd);
+
+		now -= my_scd->tick_raw;
+		now += scd->tick_raw;
+
+		now -= my_scd->tick_gtod;
+		now += scd->tick_gtod;
+
+		__raw_spin_unlock(&my_scd->lock);
+	} else {
+		__raw_spin_lock(&scd->lock);
+	}
+
+	__update_sched_clock(scd, now);
+	clock = scd->clock;
+
+	__raw_spin_unlock(&scd->lock);
+#else
+	if (cpu == smp_processor_id() && __raw_spin_trylock(&scd->lock)) {
+		__update_sched_clock(scd, now);
+		clock = scd->clock;
+		__raw_spin_unlock(&scd->lock);
+	} else
+		clock = scd->clock;
+#endif
+
+	return clock;
+}
+
+void sched_clock_tick(void)
+{
+	struct sched_clock_data *scd = this_scd();
+	u64 now, now_gtod;
+
+	if (unlikely(!sched_clock_running))
+		return;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	now = sched_clock();
+	now_gtod = ktime_to_ns(ktime_get());
+
+	__raw_spin_lock(&scd->lock);
+	__update_sched_clock(scd, now);
+	/*
+	 * update tick_gtod after __update_sched_clock() because that will
+	 * already observe 1 new jiffy; adding a new tick_gtod to that would
+	 * increase the clock 2 jiffies.
+	 */
+	scd->tick_raw = now;
+	scd->tick_gtod = now_gtod;
+	__raw_spin_unlock(&scd->lock);
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+	sched_clock_cpu(smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	struct sched_clock_data *scd = this_scd();
+	u64 now = sched_clock();
+
+	/*
+	 * Override the previous timestamp and ignore all
+	 * sched_clock() deltas that occured while we idled,
+	 * and use the PM-provided delta_ns to advance the
+	 * rq clock:
+	 */
+	__raw_spin_lock(&scd->lock);
+	scd->prev_raw = now;
+	scd->clock += delta_ns;
+	__raw_spin_unlock(&scd->lock);
+
+	touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+#endif
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+}
+
+unsigned long long cpu_clock(int cpu)
+{
+	unsigned long flags;
+	unsigned long long clock;
+
+	raw_local_irq_save(flags);
+	clock = sched_clock_cpu(cpu);
+	raw_local_irq_restore(flags);
+
+	return clock;
+}
+EXPORT_SYMBOL_GPL(cpu_clock);
Index: linux-2.6.25.4-rt4/kernel/sched_debug.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/sched_debug.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/sched_debug.c	2008-05-29 09:47:05.000000000 -0400
@@ -175,18 +175,24 @@ static void print_cpu(struct seq_file *m
 	PN(next_balance);
 	P(curr->pid);
 	PN(clock);
-	PN(idle_clock);
-	PN(prev_clock_raw);
-	P(clock_warps);
-	P(clock_overflows);
-	P(clock_underflows);
-	P(clock_deep_idle_events);
-	PN(clock_max_delta);
 	P(cpu_load[0]);
 	P(cpu_load[1]);
 	P(cpu_load[2]);
 	P(cpu_load[3]);
 	P(cpu_load[4]);
+#ifdef CONFIG_PREEMPT_RT
+	/* Print rt related rq stats */
+	P(rt.rt_nr_running);
+	P(rt.rt_nr_uninterruptible);
+# ifdef CONFIG_SCHEDSTATS
+	P(rto_schedule);
+	P(rto_schedule_tail);
+	P(rto_wakeup);
+	P(rto_pulled);
+	P(rto_pushed);
+# endif
+#endif
+
 #undef P
 #undef PN
 
Index: linux-2.6.25.4-rt4/kernel/sched_fair.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/sched_fair.c	2008-05-29 09:45:58.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/sched_fair.c	2008-05-29 09:47:32.000000000 -0400
@@ -901,7 +901,7 @@ static void yield_task_fair(struct rq *r
 		return;
 
 	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-		__update_rq_clock(rq);
+		update_rq_clock(rq);
 		/*
 		 * Update run-time statistics of the 'current'.
 		 */
@@ -951,7 +951,7 @@ static int wake_idle(int cpu, struct tas
 	 * sibling runqueue info. This will avoid the checks and cache miss
 	 * penalities associated with that.
 	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
 		return cpu;
 
 	for_each_domain(cpu, sd) {
Index: linux-2.6.25.4-rt4/arch/x86/kernel/apic_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/apic_32.c	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/apic_32.c	2008-05-29 09:46:15.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/acpi_pmtmr.h>
 #include <linux/module.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -598,6 +599,7 @@ void smp_apic_timer_interrupt(struct pt_
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	ftrace_event_irq(-1, user_mode(regs), regs->eip);
 	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
@@ -1285,6 +1287,7 @@ void smp_error_interrupt(struct pt_regs 
 	*/
 	printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
 		smp_processor_id(), v , v1);
+	dump_stack();
 	irq_exit();
 }
 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/irq_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/irq_32.c	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/irq_32.c	2008-05-29 09:46:51.000000000 -0400
@@ -16,6 +16,8 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 
+#include <linux/ftrace.h>
+
 #include <asm/apic.h>
 #include <asm/uaccess.h>
 
@@ -77,6 +79,10 @@ unsigned int do_IRQ(struct pt_regs *regs
 	u32 *isp;
 #endif
 
+#ifdef CONFIG_X86_LOCAL_APIC
+	irq_show_regs_callback(smp_processor_id(), regs);
+#endif
+
 	if (unlikely((unsigned)irq >= NR_IRQS)) {
 		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
 					__FUNCTION__, irq);
@@ -85,6 +91,7 @@ unsigned int do_IRQ(struct pt_regs *regs
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
+	ftrace_event_irq(irq, user_mode(regs), regs->eip);
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 1KB free? */
 	{
@@ -93,7 +100,7 @@ unsigned int do_IRQ(struct pt_regs *regs
 		__asm__ __volatile__("andl %%esp,%0" :
 					"=r" (sp) : "0" (THREAD_SIZE - 1));
 		if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
-			printk("do_IRQ: stack overflow: %ld\n",
+			printk("BUG: do_IRQ: stack overflow: %ld\n",
 				sp - sizeof(struct thread_info));
 			dump_stack();
 		}
Index: linux-2.6.25.4-rt4/arch/x86/kernel/irq_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/irq_64.c	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/irq_64.c	2008-05-29 09:46:16.000000000 -0400
@@ -13,6 +13,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/delay.h>
+#include <linux/ftrace.h>
 #include <asm/uaccess.h>
 #include <asm/io_apic.h>
 #include <asm/idle.h>
@@ -165,10 +166,14 @@ asmlinkage unsigned int do_IRQ(struct pt
 	unsigned vector = ~regs->orig_ax;
 	unsigned irq;
 
+	irq_show_regs_callback(smp_processor_id(), regs);
+
 	exit_idle();
 	irq_enter();
 	irq = __get_cpu_var(vector_irq)[vector];
 
+	ftrace_event_irq(irq, user_mode(regs), regs->ip);
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	stack_overflow_check(regs);
 #endif
Index: linux-2.6.25.4-rt4/arch/x86/kernel/traps_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/traps_32.c	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/traps_32.c	2008-05-29 09:47:00.000000000 -0400
@@ -30,6 +30,8 @@
 #include <linux/nmi.h>
 #include <linux/bug.h>
 
+#include <linux/ftrace.h>
+
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
 #include <linux/eisa.h>
@@ -255,6 +257,7 @@ show_trace_log_lvl(struct task_struct *t
 {
 	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
 	printk("%s =======================\n", log_lvl);
+	print_preempt_trace(task);
 }
 
 void show_trace(struct task_struct *task, struct pt_regs *regs,
@@ -284,8 +287,15 @@ static void show_stack_log_lvl(struct ta
 			printk("\n%s       ", log_lvl);
 		printk("%08lx ", *stack++);
 	}
+
+	pause_on_oops_head();
+
 	printk("\n%sCall Trace:\n", log_lvl);
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+	debug_show_held_locks(task);
+
+	pause_on_oops_tail();
+
 }
 
 void show_stack(struct task_struct *task, unsigned long *sp)
@@ -317,6 +327,12 @@ void dump_stack(void)
 
 EXPORT_SYMBOL(dump_stack);
 
+#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE)
+extern unsigned long worst_stack_left;
+#else
+# define worst_stack_left -1L
+#endif
+
 void show_registers(struct pt_regs *regs)
 {
 	int i;
@@ -425,7 +441,7 @@ void die(const char * str, struct pt_reg
 		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
-		.lock =			__RAW_SPIN_LOCK_UNLOCKED,
+		.lock =			RAW_SPIN_LOCK_UNLOCKED(die.lock),
 		.lock_owner =		-1,
 		.lock_owner_depth =	0
 	};
@@ -436,7 +452,7 @@ void die(const char * str, struct pt_reg
 	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
 		raw_local_irq_save(flags);
-		__raw_spin_lock(&die.lock);
+		spin_lock(&die.lock);
 		die.lock_owner = smp_processor_id();
 		die.lock_owner_depth = 0;
 		bust_spinlocks(1);
@@ -455,7 +471,7 @@ void die(const char * str, struct pt_reg
 	bust_spinlocks(0);
 	die.lock_owner = -1;
 	add_taint(TAINT_DIE);
-	__raw_spin_unlock(&die.lock);
+	spin_unlock(&die.lock);
 	raw_local_irq_restore(flags);
 
 	if (!regs)
@@ -495,6 +511,11 @@ static void __kprobes do_trap(int trapnr
 	if (!user_mode(regs))
 		goto kernel_trap;
 
+#ifdef CONFIG_PREEMPT_RT
+	local_irq_enable();
+	preempt_check_resched();
+#endif
+
 	trap_signal: {
 		/*
 		 * We want error_code and trap_no set for userspace faults and
@@ -755,6 +776,7 @@ void __kprobes die_nmi(struct pt_regs *r
 		crash_kexec(regs);
 	}
 
+	nmi_exit();
 	do_exit(SIGSEGV);
 }
 
@@ -804,6 +826,8 @@ __kprobes void do_nmi(struct pt_regs * r
 
 	nmi_enter();
 
+	ftrace_event_irq(-1, user_mode(regs), regs->eip);
+
 	cpu = smp_processor_id();
 
 	++nmi_count(cpu);
Index: linux-2.6.25.4-rt4/arch/x86/kernel/traps_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/traps_64.c	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/traps_64.c	2008-05-29 09:47:12.000000000 -0400
@@ -33,6 +33,8 @@
 #include <linux/kdebug.h>
 #include <linux/utsname.h>
 
+#include <linux/ftrace.h>
+
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
 #endif
@@ -82,20 +84,22 @@ static inline void conditional_sti(struc
 		local_irq_enable();
 }
 
-static inline void preempt_conditional_sti(struct pt_regs *regs)
+static inline void preempt_conditional_sti(struct pt_regs *regs, int stack)
 {
-	inc_preempt_count();
+	if (stack)
+		inc_preempt_count();
 	if (regs->flags & X86_EFLAGS_IF)
 		local_irq_enable();
 }
 
-static inline void preempt_conditional_cli(struct pt_regs *regs)
+static inline void preempt_conditional_cli(struct pt_regs *regs, int stack)
 {
 	if (regs->flags & X86_EFLAGS_IF)
 		local_irq_disable();
 	/* Make sure to not schedule here because we could be running
 	   on an exception stack. */
-	dec_preempt_count();
+	if (stack)
+		dec_preempt_count();
 }
 
 int kstack_depth_to_print = 12;
@@ -132,10 +136,14 @@ static unsigned long *in_exception_stack
 					unsigned *usedp, char **idp)
 {
 	static char ids[][8] = {
+#if DEBUG_STACK > 0
 		[DEBUG_STACK - 1] = "#DB",
+#endif
 		[NMI_STACK - 1] = "NMI",
 		[DOUBLEFAULT_STACK - 1] = "#DF",
+#if STACKFAULT_STACK > 0
 		[STACKFAULT_STACK - 1] = "#SS",
+#endif
 		[MCE_STACK - 1] = "#MC",
 #if DEBUG_STKSZ > EXCEPTION_STKSZ
 		[N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
@@ -260,7 +268,7 @@ void dump_trace(struct task_struct *tsk,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
-	const unsigned cpu = get_cpu();
+	const unsigned cpu = raw_smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
 	struct thread_info *tinfo;
@@ -344,7 +352,6 @@ void dump_trace(struct task_struct *tsk,
 	 * This handles the process stack:
 	 */
 	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
-	put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
 
@@ -383,9 +390,13 @@ void
 show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
 		unsigned long bp)
 {
+	pause_on_oops_head();
 	printk("\nCall Trace:\n");
 	dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
 	printk("\n");
+	debug_show_held_locks(tsk);
+	pause_on_oops_tail();
+	print_preempt_trace(tsk);
 }
 
 static void
@@ -394,7 +405,7 @@ _show_stack(struct task_struct *tsk, str
 {
 	unsigned long *stack;
 	int i;
-	const int cpu = smp_processor_id();
+	const int cpu = raw_smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
 	unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
 
@@ -513,7 +524,7 @@ int is_valid_bugaddr(unsigned long ip)
 	return ud2 == 0x0b0f;
 }
 
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static raw_spinlock_t die_lock = RAW_SPIN_LOCK_UNLOCKED(die_lock);
 static int die_owner = -1;
 static unsigned int die_nest_count;
 
@@ -527,11 +538,11 @@ unsigned __kprobes long oops_begin(void)
 	/* racy, but better than risking deadlock. */
 	raw_local_irq_save(flags);
 	cpu = smp_processor_id();
-	if (!__raw_spin_trylock(&die_lock)) {
+	if (!spin_trylock(&die_lock)) {
 		if (cpu == die_owner) 
 			/* nested oops. should stop eventually */;
 		else
-			__raw_spin_lock(&die_lock);
+			spin_lock(&die_lock);
 	}
 	die_nest_count++;
 	die_owner = cpu;
@@ -547,7 +558,7 @@ void __kprobes oops_end(unsigned long fl
 	die_nest_count--;
 	if (!die_nest_count)
 		/* Nest count reaches zero, release the lock. */
-		__raw_spin_unlock(&die_lock);
+		spin_unlock(&die_lock);
 	raw_local_irq_restore(flags);
 	if (!regs) {
 		oops_exit();
@@ -707,9 +718,9 @@ asmlinkage void do_stack_segment(struct 
 	if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
 			12, SIGBUS) == NOTIFY_STOP)
 		return;
-	preempt_conditional_sti(regs);
+	preempt_conditional_sti(regs, STACKFAULT_STACK);
 	do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
-	preempt_conditional_cli(regs);
+	preempt_conditional_cli(regs, STACKFAULT_STACK);
 }
 
 asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
@@ -825,6 +836,8 @@ asmlinkage __kprobes void default_do_nmi
 
 	cpu = smp_processor_id();
 
+	ftrace_event_irq(-1, user_mode(regs), regs->ip);
+
 	/* Only the BSP gets external NMIs from the system.  */
 	if (!cpu)
 		reason = get_nmi_reason();
@@ -863,9 +876,9 @@ asmlinkage void __kprobes do_int3(struct
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
 		return;
 	}
-	preempt_conditional_sti(regs);
+	preempt_conditional_sti(regs, DEBUG_STACK);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
-	preempt_conditional_cli(regs);
+	preempt_conditional_cli(regs, DEBUG_STACK);
 }
 
 /* Help handler running on IST stack to switch back to user stack
@@ -911,7 +924,7 @@ asmlinkage void __kprobes do_debug(struc
 						SIGTRAP) == NOTIFY_STOP)
 		return;
 
-	preempt_conditional_sti(regs);
+	preempt_conditional_sti(regs, DEBUG_STACK);
 
 	/* Mask out spurious debug traps due to lazy DR7 setting */
 	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -943,13 +956,13 @@ asmlinkage void __kprobes do_debug(struc
 
 clear_dr7:
 	set_debugreg(0UL, 7);
-	preempt_conditional_cli(regs);
+	preempt_conditional_cli(regs, DEBUG_STACK);
 	return;
 
 clear_TF_reenable:
 	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 	regs->flags &= ~X86_EFLAGS_TF;
-	preempt_conditional_cli(regs);
+	preempt_conditional_cli(regs, DEBUG_STACK);
 }
 
 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
Index: linux-2.6.25.4-rt4/arch/x86/mm/fault.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/mm/fault.c	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/mm/fault.c	2008-05-29 09:46:58.000000000 -0400
@@ -25,6 +25,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/ftrace.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -363,6 +364,7 @@ static int is_f00f_bug(struct pt_regs *r
 		nr = (address - idt_descr.address) >> 3;
 
 		if (nr == 6) {
+			zap_rt_locks();
 			do_invalid_op(regs, 0);
 			return 1;
 		}
@@ -597,6 +599,8 @@ void __kprobes do_page_fault(struct pt_r
 	/* get the address */
 	address = read_cr2();
 
+	ftrace_event_fault(regs->ip, error_code, address);
+
 	si_code = SEGV_MAPERR;
 
 	if (notify_page_fault(regs))
@@ -646,7 +650,7 @@ void __kprobes do_page_fault(struct pt_r
 	 * If we're in an interrupt, have no user context or are running in an
 	 * atomic region then we must not take the fault.
 	 */
-	if (in_atomic() || !mm)
+	if (unlikely(in_atomic() || !mm || current->pagefault_disabled))
 		goto bad_area_nosemaphore;
 #else /* CONFIG_X86_64 */
 	if (likely(regs->flags & X86_EFLAGS_IF))
Index: linux-2.6.25.4-rt4/kernel/hrtimer.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/hrtimer.c	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/hrtimer.c	2008-05-29 09:47:02.000000000 -0400
@@ -44,6 +44,8 @@
 #include <linux/seq_file.h>
 #include <linux/err.h>
 
+#include <linux/ftrace.h>
+
 #include <asm/uaccess.h>
 
 /**
@@ -393,9 +395,9 @@ static inline int hrtimer_is_hres_enable
 /*
  * Is the high resolution mode active ?
  */
-static inline int hrtimer_hres_active(void)
+static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
 {
-	return __get_cpu_var(hrtimer_bases).hres_active;
+	return cpu_base->hres_active;
 }
 
 /*
@@ -483,11 +485,12 @@ static int hrtimer_reprogram(struct hrti
  */
 static void retrigger_next_event(void *arg)
 {
-	struct hrtimer_cpu_base *base;
+	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+
 	struct timespec realtime_offset;
 	unsigned long seq;
 
-	if (!hrtimer_hres_active())
+	if (!hrtimer_hres_active(base))
 		return;
 
 	do {
@@ -497,8 +500,6 @@ static void retrigger_next_event(void *a
 					-wall_to_monotonic.tv_nsec);
 	} while (read_seqretry(&xtime_lock, seq));
 
-	base = &__get_cpu_var(hrtimer_bases);
-
 	/* Adjust CLOCK_REALTIME offset */
 	spin_lock(&base->lock);
 	base->clock_base[CLOCK_REALTIME].offset =
@@ -601,10 +602,8 @@ static inline int hrtimer_enqueue_reprog
 /*
  * Switch to high resolution mode
  */
-static int hrtimer_switch_to_hres(void)
+static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base)
 {
-	int cpu = smp_processor_id();
-	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
 	unsigned long flags;
 
 	if (base->hres_active)
@@ -615,7 +614,7 @@ static int hrtimer_switch_to_hres(void)
 	if (tick_init_highres()) {
 		local_irq_restore(flags);
 		printk(KERN_WARNING "Could not switch to high resolution "
-				    "mode on CPU %d\n", cpu);
+		       "mode on CPU %d\n", raw_smp_processor_id());
 		return 0;
 	}
 	base->hres_active = 1;
@@ -627,8 +626,6 @@ static int hrtimer_switch_to_hres(void)
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
 	local_irq_restore(flags);
-	printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
-	       smp_processor_id());
 	return 1;
 }
 
@@ -639,9 +636,15 @@ static inline void hrtimer_raise_softirq
 
 #else
 
-static inline int hrtimer_hres_active(void) { return 0; }
+static inline int hrtimer_hres_active(struct hrtimer_cpu_base *base)
+{
+	return 0;
+}
 static inline int hrtimer_is_hres_enabled(void) { return 0; }
-static inline int hrtimer_switch_to_hres(void) { return 0; }
+static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base)
+{
+	return 0;
+}
 static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 					    struct hrtimer_clock_base *base)
@@ -721,6 +724,28 @@ u64 hrtimer_forward(struct hrtimer *time
 }
 EXPORT_SYMBOL_GPL(hrtimer_forward);
 
+unsigned long
+hrtimer_overrun(struct hrtimer *timer, ktime_t now, ktime_t interval)
+{
+	unsigned long orun = 1;
+	ktime_t delta;
+
+	delta = ktime_sub(now, timer->expires);
+
+	if (delta.tv64 < 0)
+		return 0;
+
+	if (interval.tv64 < timer->base->resolution.tv64)
+		interval.tv64 = timer->base->resolution.tv64;
+
+	if (unlikely(delta.tv64 >= interval.tv64))
+		orun = ktime_divns(delta, ktime_to_ns(interval)) + 1;
+
+	return orun;
+}
+EXPORT_SYMBOL_GPL(hrtimer_overrun);
+
+
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
  *
@@ -735,6 +760,7 @@ static void enqueue_hrtimer(struct hrtim
 	struct hrtimer *entry;
 	int leftmost = 1;
 
+	ftrace_event_timer_set(&timer->expires, timer);
 	/*
 	 * Find the right place in the rbtree:
 	 */
@@ -806,7 +832,7 @@ static void __remove_hrtimer(struct hrti
 		if (base->first == &timer->node) {
 			base->first = rb_next(&timer->node);
 			/* Reprogram the clock event device. if enabled */
-			if (reprogram && hrtimer_hres_active())
+			if (reprogram && hrtimer_hres_active(base->cpu_base))
 				hrtimer_force_reprogram(base->cpu_base);
 		}
 		rb_erase(&timer->node, &base->active);
@@ -948,7 +974,7 @@ int hrtimer_cancel(struct hrtimer *timer
 
 		if (ret >= 0)
 			return ret;
-		cpu_relax();
+		hrtimer_wait_for_timer(timer);
 	}
 }
 EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -988,7 +1014,7 @@ ktime_t hrtimer_get_next_event(void)
 
 	spin_lock_irqsave(&cpu_base->lock, flags);
 
-	if (!hrtimer_hres_active()) {
+	if (!hrtimer_hres_active(cpu_base)) {
 		for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
 			struct hrtimer *timer;
 
@@ -1060,6 +1086,32 @@ int hrtimer_get_res(const clockid_t whic
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+# define wake_up_timer_waiters(b)	wake_up(&(b)->wait)
+
+/**
+ * hrtimer_wait_for_timer - Wait for a running timer
+ *
+ * @timer:	timer to wait for
+ *
+ * The function waits in case the timers callback function is
+ * currently executed on the waitqueue of the timer base. The
+ * waitqueue is woken up after the timer callback function has
+ * finished execution.
+ */
+void hrtimer_wait_for_timer(const struct hrtimer *timer)
+{
+	struct hrtimer_clock_base *base = timer->base;
+
+	if (base && base->cpu_base)
+		wait_event(base->cpu_base->wait,
+				!(timer->state & HRTIMER_STATE_CALLBACK));
+}
+
+#else
+# define wake_up_timer_waiters(b)	do { } while (0)
+#endif
+
 static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
 {
 	spin_lock_irq(&cpu_base->lock);
@@ -1111,6 +1163,8 @@ static void run_hrtimer_pending(struct h
 		}
 	}
 	spin_unlock_irq(&cpu_base->lock);
+
+	wake_up_timer_waiters(cpu_base);
 }
 
 static void __run_hrtimer(struct hrtimer *timer)
@@ -1169,6 +1223,7 @@ void hrtimer_interrupt(struct clock_even
 
  retry:
 	now = ktime_get();
+	ftrace_event_timestamp(&now);
 
 	expires_next.tv64 = KTIME_MAX;
 
@@ -1197,6 +1252,8 @@ void hrtimer_interrupt(struct clock_even
 				break;
 			}
 
+			ftrace_event_timer_triggered(&timer->expires, timer);
+
 			/* Move softirq callbacks to the pending list */
 			if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
 				__remove_hrtimer(timer, base,
@@ -1228,7 +1285,10 @@ void hrtimer_interrupt(struct clock_even
 
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
-	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
+	struct hrtimer_cpu_base *cpu_base;
+
+	cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id());
+	run_hrtimer_pending(cpu_base);
 }
 
 #endif	/* CONFIG_HIGH_RES_TIMERS */
@@ -1244,7 +1304,7 @@ void hrtimer_run_pending(void)
 {
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 
-	if (hrtimer_hres_active())
+	if (hrtimer_hres_active(cpu_base))
 		return;
 
 	/*
@@ -1256,7 +1316,7 @@ void hrtimer_run_pending(void)
 	 * deadlock vs. xtime_lock.
 	 */
 	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
-		hrtimer_switch_to_hres();
+		hrtimer_switch_to_hres(cpu_base);
 
 	run_hrtimer_pending(cpu_base);
 }
@@ -1299,10 +1359,11 @@ static inline void run_hrtimer_queue(str
 
 void hrtimer_run_queues(void)
 {
-	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+	struct hrtimer_cpu_base *cpu_base;
 	int i;
 
-	if (hrtimer_hres_active())
+	cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id());
+	if (hrtimer_hres_active(cpu_base))
 		return;
 
 	hrtimer_get_softirq_time(cpu_base);
@@ -1458,6 +1519,9 @@ static void __cpuinit init_hrtimers_cpu(
 
 	INIT_LIST_HEAD(&cpu_base->cb_pending);
 	hrtimer_init_hres(cpu_base);
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+	init_waitqueue_head(&cpu_base->wait);
+#endif
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1492,7 +1556,7 @@ static void migrate_hrtimers(int cpu)
 	tick_cancel_sched_timer(cpu);
 
 	local_irq_disable();
-	double_spin_lock(&new_base->lock, &old_base->lock,
+	raw_double_spin_lock(&new_base->lock, &old_base->lock,
 			 smp_processor_id() < cpu);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
@@ -1500,7 +1564,7 @@ static void migrate_hrtimers(int cpu)
 				     &new_base->clock_base[i]);
 	}
 
-	double_spin_unlock(&new_base->lock, &old_base->lock,
+	raw_double_spin_unlock(&new_base->lock, &old_base->lock,
 			   smp_processor_id() < cpu);
 	local_irq_enable();
 	put_cpu_var(hrtimer_bases);
Index: linux-2.6.25.4-rt4/kernel/trace/trace_events.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_events.c	2008-05-29 09:46:09.000000000 -0400
@@ -0,0 +1,658 @@
+/*
+ * trace task events
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array __read_mostly	*events_trace;
+static int __read_mostly	tracer_enabled;
+static atomic_t			event_ref;
+
+static void event_reset(struct trace_array *tr)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		data = tr->data[cpu];
+		tracing_reset(data);
+	}
+
+	tr->time_start = ftrace_now(raw_smp_processor_id());
+}
+
+/* HACK */
+void notrace
+sys_call(unsigned long nr, unsigned long p1, unsigned long p2, unsigned long p3)
+{
+	struct trace_array *tr;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	unsigned long ip;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	tr = events_trace;
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	atomic_inc(&data->disabled);
+	if (atomic_read(&data->disabled) != 1)
+		goto out;
+
+	ip = CALLER_ADDR0;
+
+	tracing_event_syscall(tr, data, flags, ip, nr, p1, p2, p3);
+
+ out:
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+void notrace
+sys_ia32_call(unsigned long nr, unsigned long p1, unsigned long p2,
+	      unsigned long p3)
+{
+	struct trace_array *tr;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	unsigned long ip;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	tr = events_trace;
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	atomic_inc(&data->disabled);
+	if (atomic_read(&data->disabled) != 1)
+		goto out;
+
+	ip = CALLER_ADDR0;
+	tracing_event_syscall(tr, data, flags, ip, nr | 0x80000000, p1, p2, p3);
+
+ out:
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+#endif
+
+void notrace
+sys_ret(unsigned long ret)
+{
+	struct trace_array *tr;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	unsigned long ip;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	tr = events_trace;
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	atomic_inc(&data->disabled);
+	if (atomic_read(&data->disabled) != 1)
+		goto out;
+
+	ip = CALLER_ADDR0;
+	tracing_event_sysret(tr, data, flags, ip, ret);
+
+ out:
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+#define getarg(arg, ap) arg = va_arg(ap, typeof(arg));
+
+static void
+event_irq_callback(void *probe_data, void *call_data,
+		   const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long ip, flags;
+	int irq, user, cpu;
+	long disable;
+
+	if (!tracer_enabled)
+		return;
+
+	getarg(irq, *args);
+	getarg(user, *args);
+	getarg(ip, *args);
+
+	/* interrupts should be off, we are in an interrupt */
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+
+	disable = atomic_inc_return(&data->disabled);
+	if (disable != 1)
+		goto out;
+
+	local_save_flags(flags);
+	tracing_event_irq(tr, data, flags, CALLER_ADDR1, irq, user, ip);
+
+ out:
+	atomic_dec(&data->disabled);
+}
+
+static void
+event_fault_callback(void *probe_data, void *call_data,
+		     const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long ip, flags, error, addr;
+	long disable;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	getarg(ip, *args);
+	getarg(error, *args);
+	getarg(addr, *args);
+
+	preempt_disable_notrace();
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+
+	disable = atomic_inc_return(&data->disabled);
+	if (disable != 1)
+		goto out;
+
+	local_save_flags(flags);
+	tracing_event_fault(tr, data, flags, CALLER_ADDR1, ip, error, addr);
+
+ out:
+	atomic_dec(&data->disabled);
+	preempt_enable_notrace();
+}
+
+static void
+event_timer_set_callback(void *probe_data, void *call_data,
+			 const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	ktime_t *expires;
+	void *timer;
+	long disable;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	getarg(expires, *args);
+	getarg(timer, *args);
+
+	/* interrupts should be off, we are in an interrupt */
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+
+	disable = atomic_inc_return(&data->disabled);
+	if (disable != 1)
+		goto out;
+
+	local_save_flags(flags);
+	tracing_event_timer_set(tr, data, flags, CALLER_ADDR1, expires, timer);
+
+ out:
+	atomic_dec(&data->disabled);
+}
+
+static void
+event_timer_triggered_callback(void *probe_data, void *call_data,
+			       const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	ktime_t *expired;
+	void *timer;
+	long disable;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	getarg(expired, *args);
+	getarg(timer, *args);
+
+	/* interrupts should be off, we are in an interrupt */
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+
+	disable = atomic_inc_return(&data->disabled);
+	if (disable != 1)
+		goto out;
+
+	local_save_flags(flags);
+	tracing_event_timer_triggered(tr, data, flags, CALLER_ADDR1, expired, timer);
+
+ out:
+	atomic_dec(&data->disabled);
+}
+
+static void
+event_hrtimer_callback(void *probe_data, void *call_data,
+		       const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	ktime_t *now;
+	long disable;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	getarg(now, *args);
+
+	/* interrupts should be off, we are in an interrupt */
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+
+	disable = atomic_inc_return(&data->disabled);
+	if (disable != 1)
+		goto out;
+
+	local_save_flags(flags);
+	tracing_event_timestamp(tr, data, flags, CALLER_ADDR1, now);
+
+ out:
+	atomic_dec(&data->disabled);
+}
+
+static void
+event_task_activate_callback(void *probe_data, void *call_data,
+			     const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	struct task_struct *p;
+	long disable;
+	int cpu, rqcpu;
+
+	if (!tracer_enabled)
+		return;
+
+	getarg(p, *args);
+	getarg(rqcpu, *args);
+
+	/* interrupts should be off, we are in an interrupt */
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+
+	disable = atomic_inc_return(&data->disabled);
+	if (disable != 1)
+		goto out;
+
+	local_save_flags(flags);
+	tracing_event_task_activate(tr, data, flags, CALLER_ADDR1, p, rqcpu);
+
+ out:
+	atomic_dec(&data->disabled);
+}
+
+static void
+event_task_deactivate_callback(void *probe_data, void *call_data,
+			       const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	struct task_struct *p;
+	long disable;
+	int cpu, rqcpu;
+
+	if (!tracer_enabled)
+		return;
+
+	getarg(p, *args);
+	getarg(rqcpu, *args);
+
+	/* interrupts should be off, we are in an interrupt */
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+
+	disable = atomic_inc_return(&data->disabled);
+	if (disable != 1)
+		goto out;
+
+	local_save_flags(flags);
+	tracing_event_task_deactivate(tr, data, flags, CALLER_ADDR1, p, rqcpu);
+
+ out:
+	atomic_dec(&data->disabled);
+}
+
+static void
+event_wakeup_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	struct task_struct *wakee, *curr;
+	long disable, ignore2;
+	void *ignore3;
+	int ignore1;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	getarg(ignore1, *args);
+	getarg(ignore2, *args);
+	getarg(ignore3, *args);
+
+	getarg(wakee, *args);
+	getarg(curr, *args);
+
+	/* interrupts should be disabled */
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
+		goto out;
+
+	local_save_flags(flags);
+	/* record process's command line */
+	tracing_record_cmdline(wakee);
+	tracing_record_cmdline(curr);
+
+	tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+
+ out:
+	atomic_dec(&data->disabled);
+}
+static void
+event_ctx_callback(void *probe_data, void *call_data,
+		   const char *format, va_list *args)
+{
+	struct trace_array *tr = probe_data;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	struct task_struct *prev;
+	struct task_struct *next;
+	long disable, ignore2;
+	void *ignore3;
+	int ignore1;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	getarg(ignore1, *args);
+	getarg(ignore1, *args);
+	getarg(ignore2, *args);
+	getarg(ignore3, *args);
+
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_record_cmdline(prev);
+	tracing_record_cmdline(next);
+
+	/* interrupts should be disabled */
+	cpu = smp_processor_id();
+	data = tr->data[cpu];
+	disable = atomic_inc_return(&data->disabled);
+
+	if (likely(disable != 1))
+		goto out;
+
+	local_save_flags(flags);
+	tracing_sched_switch_trace(tr, data, prev, next, flags);
+ out:
+	atomic_dec(&data->disabled);
+}
+
+static int event_register_marker(const char *name, const char *format,
+				 marker_probe_func *probe, void *data)
+{
+	int ret;
+
+	ret = marker_probe_register(name, format, probe, data);
+	if (ret) {
+		pr_info("event trace: Couldn't add marker"
+			" probe to %s\n", name);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void event_tracer_register(struct trace_array *tr)
+{
+	int ret;
+
+	ret = event_register_marker("ftrace_event_irq", "%d %d %ld",
+				    event_irq_callback, tr);
+	if (ret)
+		return;
+
+	ret = event_register_marker("ftrace_event_fault", "%ld %ld %ld",
+				    event_fault_callback, tr);
+	if (ret)
+		goto out1;
+
+	ret = event_register_marker("ftrace_event_timer_set", "%p %p",
+				    event_timer_set_callback, tr);
+	if (ret)
+		goto out2;
+
+	ret = event_register_marker("ftrace_event_timer_triggered", "%p %p",
+				    event_timer_triggered_callback, tr);
+	if (ret)
+		goto out3;
+
+	ret = event_register_marker("ftrace_event_hrtimer", "%p",
+				    event_hrtimer_callback, tr);
+	if (ret)
+		goto out4;
+
+	ret = event_register_marker("ftrace_event_task_activate", "%p %d",
+				    event_task_activate_callback, tr);
+	if (ret)
+		goto out5;
+
+	ret = event_register_marker("ftrace_event_task_deactivate", "%p %d",
+				    event_task_deactivate_callback, tr);
+	if (ret)
+		goto out6;
+
+	ret = event_register_marker("kernel_sched_wakeup",
+				    "pid %d state %ld ## rq %p task %p rq->curr %p",
+				    event_wakeup_callback, tr);
+	if (ret)
+		goto out7;
+
+	ret = event_register_marker("kernel_sched_wakeup_new",
+				    "pid %d state %ld ## rq %p task %p rq->curr %p",
+				    event_wakeup_callback, tr);
+	if (ret)
+		goto out8;
+
+	ret = event_register_marker("kernel_sched_schedule",
+				    "prev_pid %d next_pid %d prev_state %ld "
+				    "## rq %p prev %p next %p",
+				    event_ctx_callback, tr);
+	if (ret)
+		goto out9;
+
+	return;
+
+ out9:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				event_wakeup_callback, tr);
+ out8:
+	marker_probe_unregister("kernel_sched_wakeup",
+				event_wakeup_callback, tr);
+ out7:
+	marker_probe_unregister("ftrace_event_task_deactivate",
+				event_task_deactivate_callback, tr);
+ out6:
+	marker_probe_unregister("ftrace_event_task_activate",
+				event_task_activate_callback, tr);
+ out5:
+	marker_probe_unregister("ftrace_event_hrtimer",
+				event_hrtimer_callback, tr);
+ out4:
+	marker_probe_unregister("ftrace_event_timer_triggered",
+				event_timer_triggered_callback, tr);
+ out3:
+	marker_probe_unregister("ftrace_event_timer_set",
+				event_timer_set_callback, tr);
+ out2:
+	marker_probe_unregister("ftrace_event_fault",
+				event_fault_callback, tr);
+ out1:
+	marker_probe_unregister("ftrace_event_irq",
+				event_irq_callback, tr);
+}
+
+static void event_tracer_unregister(struct trace_array *tr)
+{
+	marker_probe_unregister("kernel_sched_schedule",
+				event_ctx_callback, tr);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				event_wakeup_callback, tr);
+	marker_probe_unregister("kernel_sched_wakeup",
+				event_wakeup_callback, tr);
+	marker_probe_unregister("ftrace_event_task_deactivate",
+			      event_task_deactivate_callback, tr);
+	marker_probe_unregister("ftrace_event_task_activate",
+				event_task_activate_callback, tr);
+	marker_probe_unregister("ftrace_event_hrtimer",
+				event_hrtimer_callback, tr);
+	marker_probe_unregister("ftrace_event_timer_triggered",
+				event_timer_triggered_callback, tr);
+	marker_probe_unregister("ftrace_event_timer_set",
+				event_timer_set_callback, tr);
+	marker_probe_unregister("ftrace_event_fault",
+				event_fault_callback, tr);
+	marker_probe_unregister("ftrace_event_irq",
+				event_irq_callback, tr);
+}
+
+void trace_event_register(struct trace_array *tr)
+{
+	long ref;
+
+	ref = atomic_inc_return(&event_ref);
+	if (ref == 1)
+		event_tracer_register(tr);
+}
+
+void trace_event_unregister(struct trace_array *tr)
+{
+	long ref;
+
+	ref = atomic_dec_and_test(&event_ref);
+	if (ref)
+		event_tracer_unregister(tr);
+}
+
+static void start_event_trace(struct trace_array *tr)
+{
+	event_reset(tr);
+	trace_event_register(tr);
+	tracing_start_function_trace();
+	tracer_enabled = 1;
+}
+
+static void stop_event_trace(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+	tracing_stop_function_trace();
+	trace_event_unregister(tr);
+}
+
+static void event_trace_init(struct trace_array *tr)
+{
+	events_trace = tr;
+
+	if (tr->ctrl)
+		start_event_trace(tr);
+}
+
+static void event_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_event_trace(tr);
+}
+
+static void event_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_event_trace(tr);
+	else
+		stop_event_trace(tr);
+}
+
+static void event_trace_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		tracer_enabled = 0;
+}
+
+static void event_trace_close(struct trace_iterator *iter)
+{
+	if (iter->tr->ctrl)
+		tracer_enabled = 1;
+}
+
+static struct tracer event_trace __read_mostly =
+{
+	.name = "events",
+	.init = event_trace_init,
+	.reset = event_trace_reset,
+	.open = event_trace_open,
+	.close = event_trace_close,
+	.ctrl_update = event_trace_ctrl_update,
+};
+
+__init static int init_event_trace(void)
+{
+	int ret;
+
+	ret = register_tracer(&event_trace);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+device_initcall(init_event_trace);
Index: linux-2.6.25.4-rt4/kernel/trace/trace_hist.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_hist.c	2008-05-29 09:47:33.000000000 -0400
@@ -0,0 +1,637 @@
+/*
+ * kernel/trace/trace_hist.c
+ *
+ * Add support for histograms of preemption-off latency and
+ * interrupt-off latency and wakeup latency, it depends on
+ * Real-Time Preemption Support.
+ *
+ *  Copyright (C) 2005 MontaVista Software, Inc.
+ *  Yi Yang <yyang@ch.mvista.com>
+ *
+ *  Converted to work with the new latency tracer.
+ *  Copyright (C) 2008 Red Hat, Inc.
+ *    Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/marker.h>
+#include <asm/atomic.h>
+#include <asm/div64.h>
+#include <asm/uaccess.h>
+
+#include "trace.h"
+#include "trace_hist.h"
+
+enum {
+	INTERRUPT_LATENCY = 0,
+	PREEMPT_LATENCY,
+	PREEMPT_INTERRUPT_LATENCY,
+	WAKEUP_LATENCY,
+};
+
+#define MAX_ENTRY_NUM 10240
+
+struct hist_data {
+	atomic_t hist_mode; /* 0 log, 1 don't log */
+	unsigned long min_lat;
+	unsigned long max_lat;
+	unsigned long long beyond_hist_bound_samples;
+	unsigned long long accumulate_lat;
+	unsigned long long total_samples;
+	unsigned long long hist_array[MAX_ENTRY_NUM];
+};
+
+static char *latency_hist_dir_root = "latency_hist";
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+static DEFINE_PER_CPU(struct hist_data, interrupt_off_hist);
+static char *interrupt_off_hist_dir = "interrupt_off_latency";
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+static DEFINE_PER_CPU(struct hist_data, preempt_off_hist);
+static char *preempt_off_hist_dir = "preempt_off_latency";
+#endif
+
+#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
+static DEFINE_PER_CPU(struct hist_data, preempt_irqs_off_hist);
+static char *preempt_irqs_off_hist_dir = "preempt_interrupts_off_latency";
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
+static char *wakeup_latency_hist_dir = "wakeup_latency";
+#endif
+
+void notrace latency_hist(int latency_type, int cpu, unsigned long latency)
+{
+	struct hist_data *my_hist;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY)
+			|| (latency_type > WAKEUP_LATENCY) || (latency < 0))
+		return;
+
+	switch (latency_type) {
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	case INTERRUPT_LATENCY:
+		my_hist = &per_cpu(interrupt_off_hist, cpu);
+		break;
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	case PREEMPT_LATENCY:
+		my_hist = &per_cpu(preempt_off_hist, cpu);
+		break;
+#endif
+
+#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
+	case PREEMPT_INTERRUPT_LATENCY:
+		my_hist = &per_cpu(preempt_irqs_off_hist, cpu);
+		break;
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	case WAKEUP_LATENCY:
+		my_hist = &per_cpu(wakeup_latency_hist, cpu);
+		break;
+#endif
+	default:
+		return;
+	}
+
+	if (atomic_read(&my_hist->hist_mode) == 0)
+		return;
+
+	if (latency >= MAX_ENTRY_NUM)
+		my_hist->beyond_hist_bound_samples++;
+	else
+		my_hist->hist_array[latency]++;
+
+	if (latency < my_hist->min_lat)
+		my_hist->min_lat = latency;
+	else if (latency > my_hist->max_lat)
+		my_hist->max_lat = latency;
+
+	my_hist->total_samples++;
+	my_hist->accumulate_lat += latency;
+	return;
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
+	loff_t index = *pos;
+	struct hist_data *my_hist = m->private;
+
+	if (!index_ptr)
+		return NULL;
+
+	if (index == 0) {
+		char avgstr[32];
+
+		atomic_dec(&my_hist->hist_mode);
+		if (likely(my_hist->total_samples)) {
+			unsigned long avg = (unsigned long)
+			    div64_64(my_hist->accumulate_lat,
+			    my_hist->total_samples);
+			sprintf(avgstr, "%lu", avg);
+		} else
+			strcpy(avgstr, "<undef>");
+
+		seq_printf(m, "#Minimum latency: %lu microseconds.\n"
+			   "#Average latency: %s microseconds.\n"
+			   "#Maximum latency: %lu microseconds.\n"
+			   "#Total samples: %llu\n"
+			   "#There are %llu samples greater or equal"
+			   " than %d microseconds\n"
+			   "#usecs\t%16s\n"
+			   , my_hist->min_lat
+			   , avgstr
+			   , my_hist->max_lat
+			   , my_hist->total_samples
+			   , my_hist->beyond_hist_bound_samples
+			   , MAX_ENTRY_NUM, "samples");
+	}
+	if (index >= MAX_ENTRY_NUM)
+		return NULL;
+
+	*index_ptr = index;
+	return index_ptr;
+}
+
+static void *l_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	loff_t *index_ptr = p;
+	struct hist_data *my_hist = m->private;
+
+	if (++*pos >= MAX_ENTRY_NUM) {
+		atomic_inc(&my_hist->hist_mode);
+		return NULL;
+	}
+	*index_ptr = *pos;
+	return index_ptr;
+}
+
+static void l_stop(struct seq_file *m, void *p)
+{
+	kfree(p);
+}
+
+static int l_show(struct seq_file *m, void *p)
+{
+	int index = *(loff_t *) p;
+	struct hist_data *my_hist = m->private;
+
+	seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]);
+	return 0;
+}
+
+static struct seq_operations latency_hist_seq_op = {
+	.start = l_start,
+	.next  = l_next,
+	.stop  = l_stop,
+	.show  = l_show
+};
+
+static int latency_hist_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &latency_hist_seq_op);
+	if (!ret) {
+		struct seq_file *seq = file->private_data;
+		seq->private = inode->i_private;
+	}
+	return ret;
+}
+
+static struct file_operations latency_hist_fops = {
+	.open = latency_hist_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static void hist_reset(struct hist_data *hist)
+{
+	atomic_dec(&hist->hist_mode);
+
+	memset(hist->hist_array, 0, sizeof(hist->hist_array));
+	hist->beyond_hist_bound_samples = 0ULL;
+	hist->min_lat = 0xFFFFFFFFUL;
+	hist->max_lat = 0UL;
+	hist->total_samples = 0ULL;
+	hist->accumulate_lat = 0ULL;
+
+	atomic_inc(&hist->hist_mode);
+}
+
+ssize_t latency_hist_reset(struct file *file, const char __user *a,
+			   size_t size, loff_t *off)
+{
+	int cpu;
+	struct hist_data *hist;
+	int latency_type = (long)file->private_data;
+
+	switch (latency_type) {
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	case WAKEUP_LATENCY:
+		for_each_online_cpu(cpu) {
+			hist = &per_cpu(wakeup_latency_hist, cpu);
+			hist_reset(hist);
+		}
+		break;
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	case PREEMPT_LATENCY:
+		for_each_online_cpu(cpu) {
+			hist = &per_cpu(preempt_off_hist, cpu);
+			hist_reset(hist);
+		}
+		break;
+#endif
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	case INTERRUPT_LATENCY:
+		for_each_online_cpu(cpu) {
+			hist = &per_cpu(interrupt_off_hist, cpu);
+			hist_reset(hist);
+		}
+		break;
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+	case PREEMPT_INTERRUPT_LATENCY:
+		for_each_online_cpu(cpu) {
+			hist = &per_cpu(preempt_irqs_off_hist, cpu);
+			hist_reset(hist);
+		}
+		break;
+#endif
+	}
+
+	return size;
+}
+
+static struct file_operations latency_hist_reset_fops = {
+	.open = tracing_open_generic,
+	.write = latency_hist_reset,
+};
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
+static DEFINE_PER_CPU(int, hist_irqsoff_tracing);
+#endif
+#ifdef CONFIG_PREEMPT_OFF_HIST
+static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
+static DEFINE_PER_CPU(int, hist_preemptoff_tracing);
+#endif
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
+static DEFINE_PER_CPU(int, hist_preemptirqsoff_tracing);
+#endif
+
+notrace void tracing_hist_preempt_start(void)
+{
+	cycle_t uninitialized_var(start);
+	int start_set = 0;
+	int cpu;
+
+	if (!preempt_count() && !irqs_disabled())
+		return;
+
+	/* cpu is only used if we are in atomic */
+	cpu = raw_smp_processor_id();
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	if (irqs_disabled() &&
+	    !per_cpu(hist_irqsoff_tracing, cpu)) {
+		per_cpu(hist_irqsoff_tracing, cpu) = 1;
+		start_set++;
+		start = ftrace_now(cpu);
+		per_cpu(hist_irqsoff_start, cpu) = start;
+	}
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	if (preempt_count() &&
+	    !per_cpu(hist_preemptoff_tracing, cpu)) {
+		per_cpu(hist_preemptoff_tracing, cpu) = 1;
+		if (1 || !(start_set++))
+			start = ftrace_now(cpu);
+		per_cpu(hist_preemptoff_start, cpu) = start;
+
+	}
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+	if (!per_cpu(hist_preemptirqsoff_tracing, cpu)) {
+		per_cpu(hist_preemptirqsoff_tracing, cpu) = 1;
+		if (1 || !(start_set))
+			start = ftrace_now(cpu);
+		per_cpu(hist_preemptirqsoff_start, cpu) = start;
+	}
+#endif
+}
+
+notrace void tracing_hist_preempt_stop(int irqs_on)
+{
+	long latency;
+	cycle_t start;
+	cycle_t uninitialized_var(stop);
+	int stop_set = 0;
+	int cpu;
+
+	/* irqs_on == TRACE_STOP if we must stop tracing. */
+
+	/* cpu is only used if we are in atomic */
+	cpu = raw_smp_processor_id();
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	if (irqs_on  &&
+	    per_cpu(hist_irqsoff_tracing, cpu)) {
+		stop = ftrace_now(cpu);
+		stop_set++;
+		start = per_cpu(hist_irqsoff_start, cpu);
+		latency = (long)nsecs_to_usecs(stop - start);
+		if (latency > 1000000) {
+			printk("%d: latency = %ld (%lu)\n", __LINE__, latency, latency);
+			printk("%d: start=%Ld  stop=%Ld\n", __LINE__, start, stop);
+		}
+		barrier();
+		per_cpu(hist_irqsoff_tracing, cpu) = 0;
+		latency_hist(INTERRUPT_LATENCY, cpu, latency);
+	}
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	if ((!irqs_on || irqs_on == TRACE_STOP) &&
+	    per_cpu(hist_preemptoff_tracing, cpu)) {
+		WARN_ON(!preempt_count());
+		if (1 || !(stop_set++))
+			stop = ftrace_now(cpu);
+		start = per_cpu(hist_preemptoff_start, cpu);
+		latency = (long)nsecs_to_usecs(stop - start);
+		if (latency > 1000000) {
+			printk("%d: latency = %ld (%lu)\n", __LINE__, latency, latency);
+			printk("%d: start=%Ld  stop=%Ld\n", __LINE__, start, stop);
+		}
+		barrier();
+		per_cpu(hist_preemptoff_tracing, cpu) = 0;
+		latency_hist(PREEMPT_LATENCY, cpu, latency);
+	}
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+	if (((!irqs_on && !irqs_disabled()) ||
+	     (irqs_on && !preempt_count()) ||
+	     (irqs_on == TRACE_STOP)) &&
+	    per_cpu(hist_preemptirqsoff_tracing, cpu)) {
+		WARN_ON(!preempt_count() && !irqs_disabled());
+		if (1 || !stop_set)
+			stop = ftrace_now(cpu);
+		start = per_cpu(hist_preemptirqsoff_start, cpu);
+		latency = (long)nsecs_to_usecs(stop - start);
+		if (latency > 1000000) {
+			printk("%d: latency = %ld (%lu)\n", __LINE__, latency, latency);
+			printk("%d: start=%Ld  stop=%Ld\n", __LINE__, start, stop);
+		}
+		barrier();
+		per_cpu(hist_preemptirqsoff_tracing, cpu) = 0;
+		latency_hist(PREEMPT_INTERRUPT_LATENCY, cpu, latency);
+	}
+#endif
+}
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+int tracing_wakeup_hist __read_mostly = 1;
+
+static unsigned wakeup_prio = (unsigned)-1 ;
+static struct task_struct *wakeup_task;
+static cycle_t wakeup_start;
+static DEFINE_RAW_SPINLOCK(wakeup_lock);
+
+notrace void tracing_hist_wakeup_start(struct task_struct *p,
+				       struct task_struct *curr)
+{
+	unsigned long flags;
+
+	if (likely(!rt_task(p)) ||
+	    p->prio >= wakeup_prio ||
+	    p->prio >= curr->prio)
+		return;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	if (wakeup_task)
+		put_task_struct(wakeup_task);
+
+	get_task_struct(p);
+	wakeup_task = p;
+	wakeup_prio = p->prio;
+	wakeup_start = ftrace_now(raw_smp_processor_id());
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+notrace void tracing_hist_wakeup_stop(struct task_struct *next)
+{
+	unsigned long flags;
+	long latency;
+	cycle_t stop;
+
+	if (next != wakeup_task)
+		return;
+
+	stop = ftrace_now(raw_smp_processor_id());
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	if (wakeup_task != next)
+		goto out;
+
+	latency = (long)nsecs_to_usecs(stop - wakeup_start);
+
+	latency_hist(WAKEUP_LATENCY, smp_processor_id(), latency);
+
+	put_task_struct(wakeup_task);
+	wakeup_task = NULL;
+	wakeup_prio = (unsigned)-1;
+ out:
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+
+}
+
+static void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_hist_wakeup_stop(next);
+}
+
+static void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
+{
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
+
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
+
+	tracing_hist_wakeup_start(task, curr);
+}
+
+#endif
+
+static __init int latency_hist_init(void)
+{
+	struct dentry *latency_hist_root = NULL;
+	struct dentry *dentry;
+	struct dentry *entry;
+	int i = 0, len = 0;
+	struct hist_data *my_hist;
+	char name[64];
+
+	dentry = tracing_init_dentry();
+
+	latency_hist_root =
+		debugfs_create_dir(latency_hist_dir_root, dentry);
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	dentry = debugfs_create_dir(interrupt_off_hist_dir,
+				    latency_hist_root);
+	for_each_possible_cpu(i) {
+		len = sprintf(name, "CPU%d", i);
+		name[len] = '\0';
+		entry = debugfs_create_file(name, 0444, dentry,
+					    &per_cpu(interrupt_off_hist, i),
+					    &latency_hist_fops);
+		my_hist = &per_cpu(interrupt_off_hist, i);
+		atomic_set(&my_hist->hist_mode, 1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+	}
+	entry = debugfs_create_file("reset", 0444, dentry,
+				    (void *)INTERRUPT_LATENCY,
+				    &latency_hist_reset_fops);
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	dentry = debugfs_create_dir(preempt_off_hist_dir,
+				    latency_hist_root);
+	for_each_possible_cpu(i) {
+		len = sprintf(name, "CPU%d", i);
+		name[len] = '\0';
+		entry = debugfs_create_file(name, 0444, dentry,
+					    &per_cpu(preempt_off_hist, i),
+					    &latency_hist_fops);
+		my_hist = &per_cpu(preempt_off_hist, i);
+		atomic_set(&my_hist->hist_mode, 1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+	}
+	entry = debugfs_create_file("reset", 0444, dentry,
+				    (void *)PREEMPT_LATENCY,
+				    &latency_hist_reset_fops);
+#endif
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
+	dentry = debugfs_create_dir(preempt_irqs_off_hist_dir,
+				    latency_hist_root);
+	for_each_possible_cpu(i) {
+		len = sprintf(name, "CPU%d", i);
+		name[len] = '\0';
+		entry = debugfs_create_file(name, 0444, dentry,
+					    &per_cpu(preempt_off_hist, i),
+					    &latency_hist_fops);
+		my_hist = &per_cpu(preempt_irqs_off_hist, i);
+		atomic_set(&my_hist->hist_mode, 1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+	}
+	entry = debugfs_create_file("reset", 0444, dentry,
+				    (void *)PREEMPT_INTERRUPT_LATENCY,
+				    &latency_hist_reset_fops);
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+
+	i = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			 wake_up_callback, NULL);
+	if (i) {
+		pr_info("wakeup hist: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		goto out_wake;
+	}
+
+	i = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			  wake_up_callback, NULL);
+	if (i) {
+		pr_info("wakeup hist: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	i = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		  sched_switch_callback, NULL);
+	if (i) {
+		pr_info("wakeup hist: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
+	dentry = debugfs_create_dir(wakeup_latency_hist_dir,
+				    latency_hist_root);
+	for_each_possible_cpu(i) {
+		len = sprintf(name, "CPU%d", i);
+		name[len] = '\0';
+		entry = debugfs_create_file(name, 0444, dentry,
+					    &per_cpu(wakeup_latency_hist, i),
+					    &latency_hist_fops);
+		my_hist = &per_cpu(wakeup_latency_hist, i);
+		atomic_set(&my_hist->hist_mode, 1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+	}
+	entry = debugfs_create_file("reset", 0444, dentry,
+				    (void *)WAKEUP_LATENCY,
+				    &latency_hist_reset_fops);
+
+	goto out_wake;
+
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback, NULL);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback, NULL);
+ out_wake:
+#endif
+	return 0;
+
+}
+
+__initcall(latency_hist_init);
Index: linux-2.6.25.4-rt4/kernel/trace/trace_hist.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/trace_hist.h	2008-05-29 09:46:08.000000000 -0400
@@ -0,0 +1,39 @@
+/*
+ * kernel/trace/trace_hist.h
+ *
+ * Add support for histograms of preemption-off latency and
+ * interrupt-off latency and wakeup latency, it depends on
+ * Real-Time Preemption Support.
+ *
+ *  Copyright (C) 2005 MontaVista Software, Inc.
+ *  Yi Yang <yyang@ch.mvista.com>
+ *
+ *  Converted to work with the new latency tracer.
+ *  Copyright (C) 2008 Red Hat, Inc.
+ *    Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#ifndef _LIB_TRACING_TRACER_HIST_H_
+#define _LIB_TRACING_TRACER_HIST_H_
+
+#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
+# define TRACE_STOP 2
+void tracing_hist_preempt_start(void);
+void tracing_hist_preempt_stop(int irqs_on);
+#else
+# define tracing_hist_preempt_start() do { } while (0)
+# define tracing_hist_preempt_stop(irqs_off) do { } while (0)
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+void tracing_hist_wakeup_start(struct task_struct *p,
+			       struct task_struct *curr);
+void tracing_hist_wakeup_stop(struct task_struct *next);
+extern int tracing_wakeup_hist;
+#else
+# define tracing_hist_wakeup_start(p, curr) do { } while (0)
+# define tracing_hist_wakeup_stop(next) do { } while (0)
+# define tracing_wakeup_hist 0
+#endif
+
+#endif /* ifndef _LIB_TRACING_TRACER_HIST_H_ */
Index: linux-2.6.25.4-rt4/arch/x86/ia32/ia32entry.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/ia32/ia32entry.S	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/ia32/ia32entry.S	2008-05-29 09:46:08.000000000 -0400
@@ -131,7 +131,9 @@ sysenter_do_call:	
 	cmpl	$(IA32_NR_syscalls-1),%eax
 	ja	ia32_badsys
 	IA32_ARG_FIXUP 1
+	TRACE_SYS_IA32_CALL
 	call	*ia32_sys_call_table(,%rax,8)
+	TRACE_SYS_RET
 	movq	%rax,RAX-ARGOFFSET(%rsp)
 	GET_THREAD_INFO(%r10)
 	cli
@@ -236,7 +238,9 @@ cstar_do_call:	
 	cmpl $IA32_NR_syscalls-1,%eax
 	ja  ia32_badsys
 	IA32_ARG_FIXUP 1
+	TRACE_SYS_IA32_CALL
 	call *ia32_sys_call_table(,%rax,8)
+	TRACE_SYS_RET
 	movq %rax,RAX-ARGOFFSET(%rsp)
 	GET_THREAD_INFO(%r10)
 	cli
@@ -327,8 +331,10 @@ ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls-1),%eax
 	ja  ia32_badsys
 	IA32_ARG_FIXUP
+	TRACE_SYS_IA32_CALL
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
+	TRACE_SYS_RET
 	movq %rax,RAX-ARGOFFSET(%rsp)
 	jmp int_ret_from_sys_call 
 
@@ -399,7 +405,7 @@ END(ia32_ptregs_common)
 
 	.section .rodata,"a"
 	.align 8
-ia32_sys_call_table:
+ENTRY(ia32_sys_call_table)
 	.quad sys_restart_syscall
 	.quad sys_exit
 	.quad stub32_fork
@@ -727,4 +733,5 @@ ia32_sys_call_table:
 	.quad sys32_fallocate
 	.quad compat_sys_timerfd_settime	/* 325 */
 	.quad compat_sys_timerfd_gettime
+.globl ia32_syscall_end
 ia32_syscall_end:
Index: linux-2.6.25.4-rt4/include/asm-x86/calling.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/calling.h	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/calling.h	2008-05-29 09:46:08.000000000 -0400
@@ -166,3 +166,53 @@
 	.byte 0xf1
 	.endm
 
+
+/*
+ * latency-tracing helpers:
+ */
+
+	.macro TRACE_SYS_CALL
+
+#ifdef CONFIG_EVENT_TRACER
+	SAVE_ARGS
+
+	mov     %rdx, %rcx
+	mov     %rsi, %rdx
+	mov     %rdi, %rsi
+	mov     %rax, %rdi
+
+	call sys_call
+
+	RESTORE_ARGS
+#endif
+	.endm
+
+
+	.macro TRACE_SYS_IA32_CALL
+
+#ifdef CONFIG_EVENT_TRACER
+	SAVE_ARGS
+
+	mov     %rdx, %rcx
+	mov     %rsi, %rdx
+	mov     %rdi, %rsi
+	mov     %rax, %rdi
+
+	call sys_ia32_call
+
+	RESTORE_ARGS
+#endif
+	.endm
+
+	.macro TRACE_SYS_RET
+
+#ifdef CONFIG_EVENT_TRACER
+	SAVE_ARGS
+
+	mov     %rax, %rdi
+
+	call sys_ret
+
+	RESTORE_ARGS
+#endif
+	.endm
Index: linux-2.6.25.4-rt4/include/asm-x86/unistd_64.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/unistd_64.h	2008-05-29 09:45:57.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/unistd_64.h	2008-05-29 09:46:08.000000000 -0400
@@ -11,6 +11,8 @@
  * Note: holes are not allowed.
  */
 
+#define NR_syscalls (__NR_syscall_max+1)
+
 /* at least 8 syscall per cacheline */
 #define __NR_read				0
 __SYSCALL(__NR_read, sys_read)
Index: linux-2.6.25.4-rt4/arch/arm/kernel/traps.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/traps.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/traps.c	2008-05-29 09:46:35.000000000 -0400
@@ -225,7 +225,7 @@ static void __die(const char *str, int e
 	}
 }
 
-DEFINE_SPINLOCK(die_lock);
+DEFINE_RAW_SPINLOCK(die_lock);
 
 /*
  * This function is protected against re-entrancy.
@@ -268,7 +268,7 @@ void arm_notify_die(const char *str, str
 }
 
 static LIST_HEAD(undef_hook);
-static DEFINE_SPINLOCK(undef_lock);
+static DEFINE_RAW_SPINLOCK(undef_lock);
 
 void register_undef_hook(struct undef_hook *hook)
 {
@@ -357,6 +357,7 @@ asmlinkage void do_unexp_fiq (struct pt_
 {
 	printk("Hmm.  Unexpected FIQ received, but trying to continue\n");
 	printk("You may have a hardware problem...\n");
+	print_preempt_trace(current);
 }
 
 /*
Index: linux-2.6.25.4-rt4/kernel/trace/preempt-trace.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/trace/preempt-trace.c	2008-05-29 09:46:09.000000000 -0400
@@ -0,0 +1,30 @@
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/kallsyms.h>
+
+void print_preempt_trace(struct task_struct *task)
+{
+	unsigned int count;
+	unsigned int i, lim;
+
+	if (!task)
+		task = current;
+
+	count = task_thread_info(task)->preempt_count;
+	lim = count & PREEMPT_MASK;
+
+	if (lim >= MAX_PREEMPT_TRACE)
+		lim = MAX_PREEMPT_TRACE-1;
+	printk("---------------------------\n");
+	printk("| preempt count: %08x ]\n", count);
+	printk("| %d-level deep critical section nesting:\n", lim);
+	printk("----------------------------------------\n");
+	for (i = 1; i <= lim; i++) {
+		printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]);
+		print_symbol("%s\n", task->preempt_trace_eip[i]);
+		printk(".....[<%08lx>] ..   ( <= ",
+				task->preempt_trace_parent_eip[i]);
+		print_symbol("%s)\n", task->preempt_trace_parent_eip[i]);
+	}
+	printk("\n");
+}
Index: linux-2.6.25.4-rt4/arch/arm/kernel/irq.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/irq.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/irq.c	2008-05-29 09:46:35.000000000 -0400
@@ -37,6 +37,8 @@
 #include <linux/kallsyms.h>
 #include <linux/proc_fs.h>
 
+#include <linux/ftrace.h>
+
 #include <asm/system.h>
 #include <asm/mach/time.h>
 
@@ -100,7 +102,7 @@ unlock:
 /* Handle bad interrupts */
 static struct irq_desc bad_irq_desc = {
 	.handle_irq = handle_bad_irq,
-	.lock = SPIN_LOCK_UNLOCKED
+	.lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock)
 };
 
 /*
@@ -108,11 +110,13 @@ static struct irq_desc bad_irq_desc = {
  * come via this function.  Instead, they should provide their
  * own 'handler'
  */
-asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
+asmlinkage void __exception notrace asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	struct irq_desc *desc = irq_desc + irq;
 
+	ftrace_event_irq(irq, user_mode(regs), instruction_pointer(regs));
+
 	/*
 	 * Some hardware gives randomly wrong interrupts.  Rather
 	 * than crashing, do something sensible.
Index: linux-2.6.25.4-rt4/arch/powerpc/xmon/xmon.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/xmon/xmon.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/xmon/xmon.c	2008-05-29 09:46:40.000000000 -0400
@@ -343,6 +343,7 @@ static int xmon_core(struct pt_regs *reg
 	unsigned long timeout;
 #endif
 
+	preempt_disable();
 	local_irq_save(flags);
 
 	bp = in_breakpoint_table(regs->nip, &offset);
@@ -519,6 +520,7 @@ static int xmon_core(struct pt_regs *reg
 	insert_cpu_bpts();
 
 	local_irq_restore(flags);
+	preempt_enable();
 
 	return cmd != 'X' && cmd != EOF;
 }
@@ -2137,7 +2139,7 @@ print_address(unsigned long addr)
 static unsigned long mdest;		/* destination address */
 static unsigned long msrc;		/* source address */
 static unsigned long mval;		/* byte value to set memory to */
-static unsigned long mcount;		/* # bytes to affect */
+static unsigned long xmon_mcount;	/* # bytes to affect */
 static unsigned long mdiffs;		/* max # differences to print */
 
 void
@@ -2149,19 +2151,20 @@ memops(int cmd)
 	scanhex((void *)(cmd == 's'? &mval: &msrc));
 	if( termch != '\n' )
 		termch = 0;
-	scanhex((void *)&mcount);
+	scanhex((void *)&xmon_mcount);
 	switch( cmd ){
 	case 'm':
-		memmove((void *)mdest, (void *)msrc, mcount);
+		memmove((void *)mdest, (void *)msrc, xmon_mcount);
 		break;
 	case 's':
-		memset((void *)mdest, mval, mcount);
+		memset((void *)mdest, mval, xmon_mcount);
 		break;
 	case 'd':
 		if( termch != '\n' )
 			termch = 0;
 		scanhex((void *)&mdiffs);
-		memdiffs((unsigned char *)mdest, (unsigned char *)msrc, mcount, mdiffs);
+		memdiffs((unsigned char *)mdest, (unsigned char *)msrc,
+			 xmon_mcount, mdiffs);
 		break;
 	}
 }
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/entry_64.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/entry_64.S	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/entry_64.S	2008-05-29 09:46:38.000000000 -0400
@@ -470,7 +470,8 @@ _GLOBAL(ret_from_except_lite)
 
 #ifdef CONFIG_PREEMPT
 	clrrdi	r9,r1,THREAD_SHIFT	/* current_thread_info() */
-	li	r0,_TIF_NEED_RESCHED	/* bits to check */
+	li	r0,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
+					/* bits to check */
 	ld	r3,_MSR(r1)
 	ld	r4,TI_FLAGS(r9)
 	/* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */
@@ -592,17 +593,22 @@ do_work:
 	rotldi	r10,r10,16
 	mtmsrd	r10,1
 	ld	r4,TI_FLAGS(r9)
-	andi.	r0,r4,_TIF_NEED_RESCHED
+	andi.	r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bne	1b
 	b	restore
 
 user_work:
 #endif
+	/* here we are preempting the current task */
+	li	r0,1
+	stb	r0,PACASOFTIRQEN(r13)
+	stb	r0,PACAHARDIRQEN(r13)
+
 	/* Enable interrupts */
 	ori	r10,r10,MSR_EE
 	mtmsrd	r10,1
 
-	andi.	r0,r4,_TIF_NEED_RESCHED
+	andi.	r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	beq	1f
 	bl	.schedule
 	b	.ret_from_except_lite
@@ -846,3 +852,65 @@ _GLOBAL(enter_prom)
 	ld	r0,16(r1)
 	mtlr    r0
         blr
+
+#ifdef CONFIG_MCOUNT
+/*
+ * code almost taken from entry_32.S
+ */
+#define MCOUNT_FRAME_SIZE 32
+_GLOBAL(mcount)
+	stdu	r1,-MCOUNT_FRAME_SIZE(r1)
+	mflr	r3
+
+	/* r3 contains lr (eip), put parent lr (parent_eip) in r4 */
+	ld	r4,MCOUNT_FRAME_SIZE(r1)
+	ld	r4,16(r4)
+	LOAD_REG_ADDR(r5,ftrace_trace_function)
+	ld	r5,0(r5)
+	ld	r5,0(r5)
+	mtctr	r5
+	bctrl
+	nop
+1:
+	ld	r0,MCOUNT_FRAME_SIZE+16(r1)
+	mtlr	r0
+	addi	r1,r1,MCOUNT_FRAME_SIZE
+	blr
+
+/*
+ * Based on glibc-2.4/sysdeps/powerpc/powerpc64/ppc-mcount.S
+ *
+ * We don't need to save the parameter-passing registers as gcc takes
+ * care of that for us.  Thus this function looks fairly normal.
+ * In fact, the generic code would work for us.
+ */
+_GLOBAL(_mcount)
+	/* return if we're in real mode. */
+	mfmsr	r3
+	andi.	r0,r3,MSR_IR|MSR_DR		/* see if relocation is on? */
+	beqlr					/* if not, do nothing. */
+	/* we're in translation mode. keep going. */
+	mflr	r3
+	ld	r11,0(r1)			/* load back chain ptr */
+	stdu	r1,-STACK_FRAME_OVERHEAD(r1)
+	std	r3,STACK_FRAME_OVERHEAD+16(r1)
+	ld	r4,16(r11)			/* LR in back chain */
+	LOAD_REG_ADDR(r5,mcount_enabled)
+	lwz	r5,0(r5)
+	cmpwi	r5,0				/* see if mcount_enabled? */
+	beq	1f				/* if disabled, then skip */
+
+	/* r3 contains lr (eip), put parent lr (parent_eip) in r4 */
+	LOAD_REG_ADDR(r5,mcount_trace_function)
+	ld      r5,0(r5)
+	ld      r5,0(r5)
+        mtctr   r5
+        bctrl
+	nop
+1:
+	ld	r0,STACK_FRAME_OVERHEAD+16(r1)	/* restore saved LR */
+	mtlr	r0
+	addi	r1,r1,STACK_FRAME_OVERHEAD
+	blr
+
+#endif /* CONFIG_MCOUNT */
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/setup_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/setup_64.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/setup_64.c	2008-05-29 09:46:10.000000000 -0400
@@ -607,3 +607,21 @@ struct ppc_pci_io ppc_pci_io;
 EXPORT_SYMBOL(ppc_pci_io);
 #endif /* CONFIG_PPC_INDIRECT_IO */
 
+#ifdef CONFIG_STACKTRACE
+#include <linux/stacktrace.h>
+void notrace save_stack_trace(struct stack_trace *trace)
+{
+}
+#endif /* CONFIG_STACKTRACE */
+
+#ifdef CONFIG_EARLY_PRINTK
+void notrace early_printk(const char *fmt, ...)
+{
+	BUG();
+}
+#endif /* CONFIG_EARLY_PRINTK */
+
+#ifdef CONFIG_MCOUNT
+extern void _mcount(void);
+EXPORT_SYMBOL(_mcount);
+#endif /* CONFIG_MCOUNT */
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/irq.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/irq.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/irq.c	2008-05-29 09:46:40.000000000 -0400
@@ -94,8 +94,6 @@ extern atomic_t ipi_sent;
 #endif
 
 #ifdef CONFIG_PPC64
-EXPORT_SYMBOL(irq_desc);
-
 int distribute_irqs = 1;
 
 static inline unsigned long get_hard_enabled(void)
@@ -114,7 +112,7 @@ static inline void set_soft_enabled(unsi
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
-void local_irq_restore(unsigned long en)
+void notrace raw_local_irq_restore(unsigned long en)
 {
 	/*
 	 * get_paca()->soft_enabled = en;
@@ -404,7 +402,7 @@ void do_softirq(void)
 #ifdef CONFIG_PPC_MERGE
 
 static LIST_HEAD(irq_hosts);
-static DEFINE_SPINLOCK(irq_big_lock);
+static DEFINE_RAW_SPINLOCK(irq_big_lock);
 static DEFINE_PER_CPU(unsigned int, irq_radix_reader);
 static unsigned int irq_radix_writer;
 struct irq_map_entry irq_map[NR_IRQS];
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/entry_32.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/entry_32.S	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/entry_32.S	2008-05-29 09:46:22.000000000 -0400
@@ -661,7 +661,7 @@ user_exc_return:		/* r10 contains MSR_KE
 	/* Check current_thread_info()->flags */
 	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
 	lwz	r9,TI_FLAGS(r9)
-	andi.	r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED)
+	andi.	r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bne	do_work
 
 restore_user:
@@ -896,7 +896,7 @@ global_dbcr0:
 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
 
 do_work:			/* r10 contains MSR_KERNEL here */
-	andi.	r0,r9,_TIF_NEED_RESCHED
+	andi.	r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	beq	do_user_signal
 
 do_resched:			/* r10 contains MSR_KERNEL here */
@@ -910,7 +910,7 @@ recheck:
 	MTMSRD(r10)		/* disable interrupts */
 	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
 	lwz	r9,TI_FLAGS(r9)
-	andi.	r0,r9,_TIF_NEED_RESCHED
+	andi.	r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bne-	do_resched
 	andi.	r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK
 	beq	restore_user
@@ -1022,3 +1022,86 @@ machine_check_in_rtas:
 	/* XXX load up BATs and panic */
 
 #endif /* CONFIG_PPC_RTAS */
+
+#ifdef CONFIG_MCOUNT
+/*
+ * mcount() is not the same as _mcount().  The callers of mcount() have a
+ * normal context.  The callers of _mcount() do not have a stack frame and
+ * have not saved the "caller saves" registers.
+ */
+_GLOBAL(mcount)
+	stwu	r1,-16(r1)
+	mflr	r3
+	/* r3 contains lr (eip), put parent lr (parent_eip) in r4 */
+	lwz	r4,16(r1)
+	lwz	r4,4(r4)
+	LOAD_REG_ADDR(r5,ftrace_trace_function)
+	lwz	r5,0(r5)
+	mtctr	r5
+	bctrl
+1:
+	lwz	r0,20(r1)
+	mtlr	r0
+	addi	r1,r1,16
+	blr
+
+/*
+ * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the
+ * C compiler to add a call to _mcount() at the start of each function
+ * preamble, before the stack frame is created.  An example of this preamble
+ * code is:
+ *
+ *	mflr	r0
+ *	lis	r12,-16354
+ *	stw	r0,4(r1)
+ *	addi	r0,r12,-19652
+ *	bl	0xc00034c8 <_mcount>
+ *	mflr	r0
+ *	stwu	r1,-16(r1)
+ */
+_GLOBAL(_mcount)
+#define M_STK_SIZE 48
+	/* Would not expect to need to save cr, but glibc version of */
+	/* _mcount() does, so cautiously saving it here too. */
+	stwu	r1,-M_STK_SIZE(r1)
+	stw	r3, 12(r1)
+	stw	r4, 16(r1)
+	stw	r5, 20(r1)
+	stw	r6, 24(r1)
+	mflr	r3	/* will use as first arg to __trace() */
+	mfcr	r4
+	lis	r5,mcount_enabled@ha
+	lwz	r5,mcount_enabled@l(r5)
+	cmpwi	r5,0
+	stw	r3, 44(r1)	/* lr */
+	stw	r4,  8(r1)	/* cr */
+	stw	r7, 28(r1)
+	stw	r8, 32(r1)
+	stw	r9, 36(r1)
+	stw	r10,40(r1)
+	beq	1f
+	/* r3 contains lr (eip), put parent lr (parent_eip) in r4 */
+	lwz	r4,M_STK_SIZE+4(r1)
+	LOAD_REG_ADDR(r5,mcount_trace_function)
+	lwz	r5,0(r5)
+	mtctr	r5
+	bctrl
+1:
+	lwz	r8,  8(r1)	/* cr */
+	lwz	r9, 44(r1)	/* lr */
+	lwz	r3, 12(r1)
+	lwz	r4, 16(r1)
+	lwz	r5, 20(r1)
+	mtcrf	0xff,r8
+	mtctr	r9
+	lwz	r0, 52(r1)
+	lwz	r6, 24(r1)
+	lwz	r7, 28(r1)
+	lwz	r8, 32(r1)
+	lwz	r9, 36(r1)
+	lwz	r10,40(r1)
+	addi	r1,r1,M_STK_SIZE
+	mtlr	r0
+	bctr
+
+#endif /* CONFIG_MCOUNT */
Index: linux-2.6.25.4-rt4/arch/powerpc/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/Kconfig	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/Kconfig	2008-05-29 09:46:28.000000000 -0400
@@ -49,13 +49,6 @@ config IRQ_PER_CPU
 	bool
 	default y
 
-config RWSEM_GENERIC_SPINLOCK
-	bool
-
-config RWSEM_XCHGADD_ALGORITHM
-	bool
-	default y
-
 config GENERIC_LOCKBREAK
 	bool
 	default y
@@ -90,6 +83,7 @@ config PPC
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_MCOUNT
 	select HAVE_KRETPROBES
 
 config EARLY_PRINTK
@@ -208,6 +202,18 @@ config HIGHMEM
 source kernel/time/Kconfig
 source kernel/Kconfig.hz
 source kernel/Kconfig.preempt
+
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	default y
+
+config ASM_SEMAPHORES
+	bool
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+
 source "fs/Kconfig.binfmt"
 
 # We optimistically allocate largepages from the VM, so make the limit
Index: linux-2.6.25.4-rt4/arch/arm/mach-ep93xx/core.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-ep93xx/core.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-ep93xx/core.c	2008-05-29 09:46:11.000000000 -0400
@@ -32,6 +32,8 @@
 #include <linux/termios.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/serial.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
 
 #include <asm/types.h>
 #include <asm/setup.h>
@@ -50,7 +52,6 @@
 
 #include <asm/hardware/vic.h>
 
-
 /*************************************************************************
  * Static I/O mappings that are needed for all EP93xx platforms
  *************************************************************************/
@@ -93,55 +94,123 @@ void __init ep93xx_map_io(void)
  * to use this timer for something else.  We also use timer 4 for keeping
  * track of lost jiffies.
  */
-static unsigned int last_jiffy_time;
-
-#define TIMER4_TICKS_PER_JIFFY		((CLOCK_TICK_RATE + (HZ/2)) / HZ)
+static struct clock_event_device clockevent_ep93xx;
 
 static int ep93xx_timer_interrupt(int irq, void *dev_id)
 {
-	__raw_writel(1, EP93XX_TIMER1_CLEAR);
-	while ((signed long)
-		(__raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time)
-						>= TIMER4_TICKS_PER_JIFFY) {
-		last_jiffy_time += TIMER4_TICKS_PER_JIFFY;
-		timer_tick();
-	}
+	__raw_writel(EP93XX_TC_CLEAR, EP93XX_TIMER1_CLEAR);
+
+	clockevent_ep93xx.event_handler(&clockevent_ep93xx);
 
 	return IRQ_HANDLED;
 }
 
+static int ep93xx_set_next_event(unsigned long evt,
+				  struct clock_event_device *unused)
+{
+	u32 tmode = __raw_readl(EP93XX_TIMER1_CONTROL);
+
+	/* stop timer */
+	__raw_writel(tmode & ~EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL);
+	/* program timer */
+	__raw_writel(evt, EP93XX_TIMER1_LOAD);
+	/* start timer */
+	__raw_writel(tmode | EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL);
+
+	return 0;
+}
+
+static void ep93xx_set_mode(enum clock_event_mode mode,
+			    struct clock_event_device *evt)
+{
+	u32 tmode = EP93XX_TC123_SEL_508KHZ;
+
+	/* Disable timer */
+	__raw_writel(tmode, EP93XX_TIMER1_CONTROL);
+
+	switch(mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		/* Set timer period  */
+		__raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD);
+		tmode |= EP93XX_TC123_PERIODIC;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+		tmode |= EP93XX_TC123_ENABLE;
+		__raw_writel(tmode, EP93XX_TIMER1_CONTROL);
+		break;
+
+	case CLOCK_EVT_MODE_SHUTDOWN:
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_RESUME:
+		return;
+	}
+}
+
+static struct clock_event_device clockevent_ep93xx = {
+	.name		= "ep93xx-timer1",
+	.features	= CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
+	.shift		= 32,
+	.set_mode	= ep93xx_set_mode,
+	.set_next_event	= ep93xx_set_next_event,
+};
 static struct irqaction ep93xx_timer_irq = {
 	.name		= "ep93xx timer",
 	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
 	.handler	= ep93xx_timer_interrupt,
 };
 
-static void __init ep93xx_timer_init(void)
+static void __init ep93xx_clockevent_init(void)
 {
-	/* Enable periodic HZ timer.  */
-	__raw_writel(0x48, EP93XX_TIMER1_CONTROL);
-	__raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD);
-	__raw_writel(0xc8, EP93XX_TIMER1_CONTROL);
+	setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq);
 
-	/* Enable lost jiffy timer.  */
-	__raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH);
+	clockevent_ep93xx.mult = div_sc(508469, NSEC_PER_SEC,
+					clockevent_ep93xx.shift);
+	clockevent_ep93xx.max_delta_ns =
+		clockevent_delta2ns(0xfffffffe, &clockevent_ep93xx);
+	clockevent_ep93xx.min_delta_ns =
+		clockevent_delta2ns(0xf, &clockevent_ep93xx);
+	clockevent_ep93xx.cpumask = cpumask_of_cpu(0);
+	clockevents_register_device(&clockevent_ep93xx);
+}
 
-	setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq);
+/*
+ * timer4 is a 40 Bit timer, separated in a 32bit and a 8 bit
+ * register, EP93XX_TIMER4_VALUE_LOW stores 32 bit word. The
+ * controlregister is in EP93XX_TIMER4_VALUE_HIGH
+ */
+
+cycle_t ep93xx_get_cycles(void)
+{
+	return __raw_readl(EP93XX_TIMER4_VALUE_LOW);
 }
 
-static unsigned long ep93xx_gettimeoffset(void)
+static struct clocksource clocksource_ep93xx = {
+	.name		= "ep93xx_timer4",
+	.rating		= 200,
+	.read		= ep93xx_get_cycles,
+	.mask		= 0xFFFFFFFF,
+	.shift		= 20,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static void __init ep93xx_clocksource_init(void)
 {
-	int offset;
+	/* Reset time-stamp counter */
+	__raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH);
 
-	offset = __raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time;
+	clocksource_ep93xx.mult =
+		clocksource_hz2mult(983040, clocksource_ep93xx.shift);
+	clocksource_register(&clocksource_ep93xx);
+}
 
-	/* Calculate (1000000 / 983040) * offset.  */
-	return offset + (53 * offset / 3072);
+static void __init ep93xx_timer_init(void)
+{
+	ep93xx_clocksource_init();
+	ep93xx_clockevent_init();
 }
 
 struct sys_timer ep93xx_timer = {
-	.init		= ep93xx_timer_init,
-	.offset		= ep93xx_gettimeoffset,
+ 	.init			= ep93xx_timer_init,
 };
 
 
@@ -549,7 +618,6 @@ static struct platform_device ep93xx_ohc
 	.resource	= ep93xx_ohci_resources,
 };
 
-
 void __init ep93xx_init_devices(void)
 {
 	unsigned int v;
Index: linux-2.6.25.4-rt4/include/asm-arm/arch-ep93xx/ep93xx-regs.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/arch-ep93xx/ep93xx-regs.h	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/arch-ep93xx/ep93xx-regs.h	2008-05-29 09:46:11.000000000 -0400
@@ -67,6 +67,12 @@
 #define EP93XX_TIMER3_CONTROL		EP93XX_TIMER_REG(0x88)
 #define EP93XX_TIMER3_CLEAR		EP93XX_TIMER_REG(0x8c)
 
+#define EP93XX_TC_CLEAR			0x00000001
+#define EP93XX_TC123_ENABLE		0x00000080
+#define EP93XX_TC123_PERIODIC		0x00000040
+#define EP93XX_TC123_SEL_508KHZ		0x00000008
+#define EP93XX_TC4_ENABLE		0x00000100
+
 #define EP93XX_I2S_BASE			(EP93XX_APB_VIRT_BASE + 0x00020000)
 
 #define EP93XX_SECURITY_BASE		(EP93XX_APB_VIRT_BASE + 0x00030000)
Index: linux-2.6.25.4-rt4/arch/arm/kernel/time.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/time.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/time.c	2008-05-29 09:46:12.000000000 -0400
@@ -225,6 +225,13 @@ static inline void do_leds(void)
 #define	do_leds()
 #endif
 
+void arch_tick_leds(void)
+{
+#ifdef CONFIG_LEDS_TIMER
+	do_leds();
+#endif
+}
+
 #ifndef CONFIG_GENERIC_TIME
 void do_gettimeofday(struct timeval *tv)
 {
Index: linux-2.6.25.4-rt4/drivers/net/sungem.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/sungem.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/sungem.c	2008-05-29 09:46:12.000000000 -0400
@@ -1033,10 +1033,8 @@ static int gem_start_xmit(struct sk_buff
 			(csum_stuff_off << 21));
 	}
 
-	local_irq_save(flags);
-	if (!spin_trylock(&gp->tx_lock)) {
+	if (!spin_trylock_irqsave(&gp->tx_lock, flags)) {
 		/* Tell upper layer to requeue */
-		local_irq_restore(flags);
 		return NETDEV_TX_LOCKED;
 	}
 	/* We raced with gem_do_stop() */
Index: linux-2.6.25.4-rt4/arch/x86/kernel/tsc_sync.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/tsc_sync.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/tsc_sync.c	2008-05-29 09:46:27.000000000 -0400
@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count
  * we want to have the fastest, inlined, non-debug version
  * of a critical section, to be able to prove TSC time-warps:
  */
-static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static __cpuinitdata __raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
 static __cpuinitdata cycles_t last_tsc;
 static __cpuinitdata cycles_t max_warp;
 static __cpuinitdata int nr_warps;
@@ -101,6 +101,7 @@ static __cpuinit void check_tsc_warp(voi
  */
 void __cpuinit check_tsc_sync_source(int cpu)
 {
+	unsigned long flags;
 	int cpus = 2;
 
 	/*
@@ -121,8 +122,11 @@ void __cpuinit check_tsc_sync_source(int
 	/*
 	 * Wait for the target to arrive:
 	 */
+	local_save_flags(flags);
+	local_irq_enable();
 	while (atomic_read(&start_count) != cpus-1)
 		cpu_relax();
+	local_irq_restore(flags);
 	/*
 	 * Trigger the target to continue into the measurement too:
 	 */
Index: linux-2.6.25.4-rt4/drivers/input/keyboard/atkbd.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/input/keyboard/atkbd.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/input/keyboard/atkbd.c	2008-05-29 09:46:12.000000000 -0400
@@ -1453,8 +1453,23 @@ static struct dmi_system_id atkbd_dmi_qu
 	{ }
 };
 
+static int __read_mostly noatkbd;
+
+static int __init noatkbd_setup(char *str)
+{
+        noatkbd = 1;
+        printk(KERN_INFO "debug: not setting up AT keyboard.\n");
+
+        return 1;
+}
+
+__setup("noatkbd", noatkbd_setup);
+
 static int __init atkbd_init(void)
 {
+	if (noatkbd)
+		return 0;
+
 	dmi_check_system(atkbd_dmi_quirk_table);
 
 	return serio_register_driver(&atkbd_drv);
Index: linux-2.6.25.4-rt4/drivers/input/mouse/psmouse-base.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/input/mouse/psmouse-base.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/input/mouse/psmouse-base.c	2008-05-29 09:46:12.000000000 -0400
@@ -1597,10 +1597,25 @@ static int psmouse_get_maxproto(char *bu
 	return sprintf(buffer, "%s\n", psmouse_protocol_by_type(type)->name);
 }
 
+static int __read_mostly nopsmouse;
+
+static int __init nopsmouse_setup(char *str)
+{
+        nopsmouse = 1;
+        printk(KERN_INFO "debug: not setting up psmouse.\n");
+
+        return 1;
+}
+
+__setup("nopsmouse", nopsmouse_setup);
+
 static int __init psmouse_init(void)
 {
 	int err;
 
+	if (nopsmouse)
+		return 0;
+
 	kpsmoused_wq = create_singlethread_workqueue("kpsmoused");
 	if (!kpsmoused_wq) {
 		printk(KERN_ERR "psmouse: failed to create kpsmoused workqueue\n");
Index: linux-2.6.25.4-rt4/kernel/rtmutex-debug.h
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rtmutex-debug.h	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rtmutex-debug.h	2008-05-29 09:46:12.000000000 -0400
@@ -17,17 +17,17 @@ extern void debug_rt_mutex_free_waiter(s
 extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
 extern void debug_rt_mutex_lock(struct rt_mutex *lock);
 extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
-				      struct task_struct *powner);
+extern void
+debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner);
 extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
 extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
 				    struct rt_mutex *lock);
 extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
-# define debug_rt_mutex_reset_waiter(w)			\
+# define debug_rt_mutex_reset_waiter(w) \
 	do { (w)->deadlock_lock = NULL; } while (0)
 
-static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
-						 int detect)
+static inline int
+debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, int detect)
 {
-	return (waiter != NULL);
+	return waiter != NULL;
 }
Index: linux-2.6.25.4-rt4/drivers/net/8139too.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/8139too.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/8139too.c	2008-05-29 09:46:13.000000000 -0400
@@ -2199,7 +2199,11 @@ static irqreturn_t rtl8139_interrupt (in
  */
 static void rtl8139_poll_controller(struct net_device *dev)
 {
-	disable_irq(dev->irq);
+	/*
+	 * use _nosync() variant - might be used by netconsole
+	 * from atomic contexts:
+	 */
+	disable_irq_nosync(dev->irq);
 	rtl8139_interrupt(dev->irq, dev);
 	enable_irq(dev->irq);
 }
Index: linux-2.6.25.4-rt4/arch/x86/kernel/kprobes.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/kprobes.c	2008-05-29 09:45:56.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/kprobes.c	2008-05-29 09:46:13.000000000 -0400
@@ -451,7 +451,7 @@ static void __kprobes setup_singlestep(s
 		/* Boost up -- we can execute copied instructions directly */
 		reset_current_kprobe();
 		regs->ip = (unsigned long)p->ainsn.insn;
-		preempt_enable_no_resched();
+		preempt_enable();
 		return;
 	}
 #endif
@@ -477,7 +477,7 @@ static int __kprobes reenter_kprobe(stru
 		arch_disarm_kprobe(p);
 		regs->ip = (unsigned long)p->addr;
 		reset_current_kprobe();
-		preempt_enable_no_resched();
+		preempt_enable();
 		break;
 #endif
 	case KPROBE_HIT_ACTIVE:
@@ -573,7 +573,7 @@ static int __kprobes kprobe_handler(stru
 		}
 	} /* else: not a kprobe fault; let the kernel handle it */
 
-	preempt_enable_no_resched();
+	preempt_enable();
 	return 0;
 }
 
@@ -874,7 +874,7 @@ static int __kprobes post_kprobe_handler
 	}
 	reset_current_kprobe();
 out:
-	preempt_enable_no_resched();
+	preempt_enable();
 
 	/*
 	 * if somebody else is singlestepping across a probe point, flags
@@ -908,7 +908,7 @@ int __kprobes kprobe_fault_handler(struc
 			restore_previous_kprobe(kcb);
 		else
 			reset_current_kprobe();
-		preempt_enable_no_resched();
+		preempt_enable();
 		break;
 	case KPROBE_HIT_ACTIVE:
 	case KPROBE_HIT_SSDONE:
@@ -1049,7 +1049,7 @@ int __kprobes longjmp_break_handler(stru
 		memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
 		       kcb->jprobes_stack,
 		       MIN_STACK_SIZE(kcb->jprobe_saved_sp));
-		preempt_enable_no_resched();
+		preempt_enable();
 		return 1;
 	}
 	return 0;
Index: linux-2.6.25.4-rt4/arch/x86/mm/highmem_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/mm/highmem_32.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/mm/highmem_32.c	2008-05-29 09:46:58.000000000 -0400
@@ -3,9 +3,9 @@
 
 void *kmap(struct page *page)
 {
-	might_sleep();
 	if (!PageHighMem(page))
 		return page_address(page);
+	might_sleep();
 	return kmap_high(page);
 }
 
@@ -18,6 +18,27 @@ void kunmap(struct page *page)
 	kunmap_high(page);
 }
 
+void kunmap_virt(void *ptr)
+{
+	struct page *page;
+
+	if ((unsigned long)ptr < PKMAP_ADDR(0))
+		return;
+	page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]);
+	kunmap(page);
+}
+
+struct page *kmap_to_page(void *ptr)
+{
+	struct page *page;
+
+	if ((unsigned long)ptr < PKMAP_ADDR(0))
+		return virt_to_page(ptr);
+	page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]);
+	return page;
+}
+EXPORT_SYMBOL_GPL(kmap_to_page); /* PREEMPT_RT converts some modules to use this */
+
 static void debug_kmap_atomic_prot(enum km_type type)
 {
 #ifdef CONFIG_DEBUG_HIGHMEM
@@ -69,12 +90,12 @@ static void debug_kmap_atomic_prot(enum 
  * However when holding an atomic kmap is is not legal to sleep, so atomic
  * kmaps are appropriate for short, tight code paths only.
  */
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
+void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 
 	if (!PageHighMem(page))
@@ -84,19 +105,19 @@ void *kmap_atomic_prot(struct page *page
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-	BUG_ON(!pte_none(*(kmap_pte-idx)));
+	WARN_ON_ONCE(!pte_none(*(kmap_pte-idx)));
 	set_pte(kmap_pte-idx, mk_pte(page, prot));
 	arch_flush_lazy_mmu_mode();
 
 	return (void *)vaddr;
 }
 
-void *kmap_atomic(struct page *page, enum km_type type)
+void *__kmap_atomic(struct page *page, enum km_type type)
 {
 	return kmap_atomic_prot(page, type, kmap_prot);
 }
 
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void __kunmap_atomic(void *kvaddr, enum km_type type)
 {
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
@@ -118,16 +139,18 @@ void kunmap_atomic(void *kvaddr, enum km
 
 	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
+	preempt_enable();
 }
 
 /* This is the same as kmap_atomic() but can map memory that doesn't
  * have a struct page associated with it.
  */
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
+	preempt_disable();
 	pagefault_disable();
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
@@ -138,7 +161,7 @@ void *kmap_atomic_pfn(unsigned long pfn,
 	return (void*) vaddr;
 }
 
-struct page *kmap_atomic_to_page(void *ptr)
+struct page *__kmap_atomic_to_page(void *ptr)
 {
 	unsigned long idx, vaddr = (unsigned long)ptr;
 	pte_t *pte;
@@ -153,6 +176,7 @@ struct page *kmap_atomic_to_page(void *p
 
 EXPORT_SYMBOL(kmap);
 EXPORT_SYMBOL(kunmap);
-EXPORT_SYMBOL(kmap_atomic);
-EXPORT_SYMBOL(kunmap_atomic);
-EXPORT_SYMBOL(kmap_atomic_to_page);
+EXPORT_SYMBOL(kunmap_virt);
+EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(__kmap_atomic_to_page);
Index: linux-2.6.25.4-rt4/include/asm-x86/atomic_32.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/atomic_32.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/atomic_32.h	2008-05-29 09:46:13.000000000 -0400
@@ -195,10 +195,10 @@ static __inline__ int atomic_add_return(
 
 #ifdef CONFIG_M386
 no_xadd: /* Legacy 386 processor */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	__i = atomic_read(v);
 	atomic_set(v, i + __i);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return i + __i;
 #endif
 }
Index: linux-2.6.25.4-rt4/drivers/pci/msi.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/pci/msi.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/pci/msi.c	2008-05-29 09:47:22.000000000 -0400
@@ -75,7 +75,7 @@ static void msi_set_enable(struct pci_de
 	int pos;
 	u16 control;
 
-	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+	pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSI);
 	if (pos) {
 		pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 		control &= ~PCI_MSI_FLAGS_ENABLE;
@@ -90,7 +90,7 @@ static void msix_set_enable(struct pci_d
 	int pos;
 	u16 control;
 
-	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+	pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX);
 	if (pos) {
 		pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control);
 		control &= ~PCI_MSIX_FLAGS_ENABLE;
@@ -285,6 +285,10 @@ static void __pci_restore_msi_state(stru
 		return;
 
 	entry = get_irq_msi(dev->irq);
+	if (!entry) {
+		WARN_ON(1);
+		return;
+	}
 	pos = entry->msi_attrib.pos;
 
 	pci_intx_for_msi(dev, 0);
@@ -351,7 +355,7 @@ static int msi_capability_init(struct pc
 
 	msi_set_enable(dev, 0);	/* Ensure msi is disabled as I set it up */
 
-   	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+   	pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSI);
 	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	/* MSI Entry Initialization */
 	entry = alloc_msi_entry();
@@ -424,7 +428,7 @@ static int msix_capability_init(struct p
 
 	msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
 
-   	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+   	pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX);
 	/* Request & Map MSI-X table region */
  	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	nr_entries = multi_msix_capable(control);
@@ -531,7 +535,7 @@ static int pci_msi_check_device(struct p
 	if (ret)
 		return ret;
 
-	if (!pci_find_capability(dev, type))
+	if (!pci_find_capability_cached(dev, type))
 		return -EINVAL;
 
 	return 0;
@@ -650,7 +654,7 @@ int pci_enable_msix(struct pci_dev* dev,
 	if (status)
 		return status;
 
-	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+	pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX);
 	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	nr_entries = multi_msix_capable(control);
 	if (nvec > nr_entries)
Index: linux-2.6.25.4-rt4/drivers/block/floppy.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/block/floppy.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/block/floppy.c	2008-05-29 09:46:14.000000000 -0400
@@ -4145,6 +4145,28 @@ static void floppy_device_release(struct
 {
 }
 
+static int floppy_suspend(struct platform_device *dev, pm_message_t state)
+{
+	floppy_release_irq_and_dma();
+
+	return 0;
+}
+
+static int floppy_resume(struct platform_device *dev)
+{
+	floppy_grab_irq_and_dma();
+
+	return 0;
+}
+
+static struct platform_driver floppy_driver = {
+	.suspend	= floppy_suspend,
+	.resume		= floppy_resume,
+	.driver		= {
+		.name	= "floppy",
+	},
+};
+
 static struct platform_device floppy_device[N_DRIVE];
 
 static struct kobject *floppy_find(dev_t dev, int *part, void *data)
@@ -4193,10 +4215,14 @@ static int __init floppy_init(void)
 	if (err)
 		goto out_put_disk;
 
+	err = platform_driver_register(&floppy_driver);
+	if (err)
+		goto out_unreg_blkdev;
+
 	floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
 	if (!floppy_queue) {
 		err = -ENOMEM;
-		goto out_unreg_blkdev;
+		goto out_unreg_driver;
 	}
 	blk_queue_max_sectors(floppy_queue, 64);
 
@@ -4345,6 +4371,8 @@ out_flush_work:
 out_unreg_region:
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	blk_cleanup_queue(floppy_queue);
+out_unreg_driver:
+	platform_driver_unregister(&floppy_driver);
 out_unreg_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_put_disk:
@@ -4540,6 +4568,7 @@ void cleanup_module(void)
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 
+	platform_driver_unregister(&floppy_driver);
 	for (drive = 0; drive < N_DRIVE; drive++) {
 		del_timer_sync(&motor_off_timer[drive]);
 
Index: linux-2.6.25.4-rt4/include/linux/hrtimer.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/hrtimer.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/hrtimer.h	2008-05-29 09:46:48.000000000 -0400
@@ -188,7 +188,7 @@ struct hrtimer_clock_base {
  * @nr_events:		Total number of timer interrupt events
  */
 struct hrtimer_cpu_base {
-	spinlock_t			lock;
+	raw_spinlock_t			lock;
 	struct lock_class_key		lock_key;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 	struct list_head		cb_pending;
@@ -197,6 +197,9 @@ struct hrtimer_cpu_base {
 	int				hres_active;
 	unsigned long			nr_events;
 #endif
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+	wait_queue_head_t		wait;
+#endif
 };
 
 #ifdef CONFIG_HIGH_RES_TIMERS
@@ -279,6 +282,13 @@ static inline int hrtimer_restart(struct
 	return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
 }
 
+/* Softirq preemption could deadlock timer removal */
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
+#else
+# define hrtimer_wait_for_timer(timer)	do { cpu_relax(); } while (0)
+#endif
+
 /* Query timers: */
 extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
@@ -314,6 +324,10 @@ static inline u64 hrtimer_forward_now(st
 	return hrtimer_forward(timer, timer->base->get_time(), interval);
 }
 
+/* Overrun count: */
+extern unsigned long
+hrtimer_overrun(struct hrtimer *timer, ktime_t now, ktime_t interval);
+
 /* Precise sleep: */
 extern long hrtimer_nanosleep(struct timespec *rqtp,
 			      struct timespec __user *rmtp,
Index: linux-2.6.25.4-rt4/mm/memory.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/memory.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/memory.c	2008-05-29 09:46:58.000000000 -0400
@@ -271,18 +271,52 @@ void free_pgd_range(struct mmu_gather **
 	} while (pgd++, addr = next, addr != end);
 }
 
+#ifdef CONFIG_IA64
+#define tlb_start_addr(tlb)	(tlb)->start_addr
+#define tlb_end_addr(tlb)	(tlb)->end_addr
+#else
+#define tlb_start_addr(tlb)	0UL	/* only ia64 really uses it */
+#define tlb_end_addr(tlb)	0UL	/* only ia64 really uses it */
+#endif
+
 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 		unsigned long floor, unsigned long ceiling)
 {
+#ifdef CONFIG_PREEMPT
+	struct vm_area_struct *unlink = vma;
+	int fullmm = (*tlb)->fullmm;
+
+	if (!vma)	/* Sometimes when exiting after an oops */
+		return;
+#ifndef CONFIG_PREEMPT_RT
+	if (vma->vm_next)
+#endif
+		tlb_finish_mmu(*tlb, tlb_start_addr(*tlb), tlb_end_addr(*tlb));
+	/*
+	 * Hide vma from rmap and vmtruncate before freeeing pgtables,
+	 * with preemption enabled, except when unmapping just one area.
+	 */
+	while (unlink) {
+		anon_vma_unlink(unlink);
+		unlink_file_vma(unlink);
+		unlink = unlink->vm_next;
+	}
+#ifndef CONFIG_PREEMPT_RT
+	if (vma->vm_next)
+#endif
+		*tlb = tlb_gather_mmu(vma->vm_mm, fullmm);
+#endif
 	while (vma) {
 		struct vm_area_struct *next = vma->vm_next;
 		unsigned long addr = vma->vm_start;
 
+#ifndef CONFIG_PREEMPT
 		/*
 		 * Hide vma from rmap and vmtruncate before freeing pgtables
 		 */
 		anon_vma_unlink(vma);
 		unlink_file_vma(vma);
+#endif
 
 		if (is_vm_hugetlb_page(vma)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
@@ -295,8 +329,10 @@ void free_pgtables(struct mmu_gather **t
 			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = vma->vm_next;
+#ifndef CONFIG_PREEMPT
 				anon_vma_unlink(vma);
 				unlink_file_vma(vma);
+#endif
 			}
 			free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
@@ -781,10 +817,13 @@ static unsigned long unmap_page_range(st
 	return addr;
 }
 
-#ifdef CONFIG_PREEMPT
+#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT)
 # define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
 #else
-/* No preempt: go for improved straight-line efficiency */
+/*
+ * No preempt: go for improved straight-line efficiency
+ * on PREEMPT_RT this is not a critical latency-path.
+ */
 # define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
 #endif
 
@@ -2503,6 +2542,28 @@ unlock:
 	return 0;
 }
 
+void pagefault_disable(void)
+{
+	current->pagefault_disabled++;
+	/*
+	 * make sure to have issued the store before a pagefault
+	 * can hit.
+	 */
+	barrier();
+}
+EXPORT_SYMBOL(pagefault_disable);
+
+void pagefault_enable(void)
+{
+	/*
+	 * make sure to issue those last loads/stores before enabling
+	 * the pagefault handler again.
+	 */
+	barrier();
+	current->pagefault_disabled--;
+}
+EXPORT_SYMBOL(pagefault_enable);
+
 /*
  * By the time we get here, we already hold the mm semaphore
  */
Index: linux-2.6.25.4-rt4/arch/x86/kernel/io_apic_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/io_apic_32.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/io_apic_32.c	2008-05-29 09:46:41.000000000 -0400
@@ -55,8 +55,8 @@ atomic_t irq_mis_count;
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_RAW_SPINLOCK(vector_lock);
 
 int timer_over_8254 __initdata = 1;
 
@@ -260,14 +260,14 @@ static void __unmask_IO_APIC_irq (unsign
 	__modify_IO_APIC_irq(irq, 0, 0x00010000);
 }
 
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+/* trigger = 0 (edge mode) */
+static void __pcix_mask_IO_APIC_irq (unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+	__modify_IO_APIC_irq(irq, 0, 0x00008000);
 }
 
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+/* mask = 0, trigger = 1 (level mode) */
+static void __pcix_unmask_IO_APIC_irq (unsigned int irq)
 {
 	__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
 }
@@ -290,6 +290,24 @@ static void unmask_IO_APIC_irq (unsigned
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+static void pcix_mask_IO_APIC_irq (unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__pcix_mask_IO_APIC_irq(irq);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void pcix_unmask_IO_APIC_irq (unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__pcix_unmask_IO_APIC_irq(irq);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
@@ -1235,23 +1253,28 @@ static int assign_irq_vector(int irq)
 
 	return vector;
 }
+
 static struct irq_chip ioapic_chip;
+static struct irq_chip pcix_ioapic_chip;
 
 #define IOAPIC_AUTO	-1
 #define IOAPIC_EDGE	0
 #define IOAPIC_LEVEL	1
 
-static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+static void ioapic_register_intr(int irq, int vector, unsigned long trigger,
+				 int pcix)
 {
+	struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip;
+
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL) {
 		irq_desc[irq].status |= IRQ_LEVEL;
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					 handle_fasteoi_irq, "fasteoi");
+		set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq,
+					      pcix ? "pcix-fasteoi" : "fasteoi");
 	} else {
 		irq_desc[irq].status &= ~IRQ_LEVEL;
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					 handle_edge_irq, "edge");
+		set_irq_chip_and_handler_name(irq, chip, handle_edge_irq,
+					      pcix ? "pcix-edge" : "edge");
 	}
 	set_intr_gate(vector, interrupt[irq]);
 }
@@ -1321,7 +1344,8 @@ static void __init setup_IO_APIC_irqs(vo
 		if (IO_APIC_IRQ(irq)) {
 			vector = assign_irq_vector(irq);
 			entry.vector = vector;
-			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+			ioapic_register_intr(irq, vector, IOAPIC_AUTO,
+					     apic > 0);
 		
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
@@ -1492,7 +1516,7 @@ void __init print_IO_APIC(void)
 	return;
 }
 
-#if 0
+#if 1
 
 static void print_APIC_bitfield (int base)
 {
@@ -1899,7 +1923,8 @@ static int __init timer_irq_works(void)
 	 * might have cached one ExtINT interrupt.  Finally, at
 	 * least one tick may be lost due to delays.
 	 */
-	if (time_after(jiffies, t1 + 4))
+	if (time_after(jiffies, t1 + 4) &&
+	    time_before(jiffies, t1 + 16))
 		return 1;
 
 	return 0;
@@ -1988,8 +2013,10 @@ static void ack_ioapic_quirk_irq(unsigne
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		/* mask = 1, trigger = 0 */
+		__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+		/* mask = 0, trigger = 1 */
+		__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
 		spin_unlock(&ioapic_lock);
 	}
 }
@@ -2014,6 +2041,18 @@ static struct irq_chip ioapic_chip __rea
 	.retrigger	= ioapic_retrigger_irq,
 };
 
+static struct irq_chip pcix_ioapic_chip __read_mostly = {
+	.name 		= "IO-APIC",
+	.startup 	= startup_ioapic_irq,
+	.mask	 	= pcix_mask_IO_APIC_irq,
+	.unmask	 	= pcix_unmask_IO_APIC_irq,
+	.ack 		= ack_ioapic_irq,
+	.eoi 		= ack_ioapic_irq,
+#ifdef CONFIG_SMP
+	.set_affinity 	= set_ioapic_affinity_irq,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
 
 static inline void init_IO_APIC_traps(void)
 {
@@ -2826,7 +2865,7 @@ int io_apic_set_pci_routing (int ioapic,
 		mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
 		edge_level, active_high_low);
 
-	ioapic_register_intr(irq, entry.vector, edge_level);
+	ioapic_register_intr(irq, entry.vector, edge_level, ioapic > 0);
 
 	if (!ioapic && (irq < 16))
 		disable_8259A_irq(irq);
Index: linux-2.6.25.4-rt4/arch/x86/pci/Makefile_32
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/pci/Makefile_32	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/pci/Makefile_32	2008-05-29 09:46:14.000000000 -0400
@@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS)		+= pcbios.o
 obj-$(CONFIG_PCI_MMCONFIG)	+= mmconfig_32.o direct.o mmconfig-shared.o
 obj-$(CONFIG_PCI_DIRECT)	+= direct.o
 
+obj-$(CONFIG_ACPI)		+= acpi.o
+
 pci-y				:= fixup.o
-pci-$(CONFIG_ACPI)		+= acpi.o
 pci-y				+= legacy.o irq.o
 
 pci-$(CONFIG_X86_VISWS)		:= visws.o fixup.o
Index: linux-2.6.25.4-rt4/include/linux/spinlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/spinlock.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/spinlock.h	2008-05-29 09:47:25.000000000 -0400
@@ -44,6 +44,42 @@
  *                        builds the _spin_*() APIs.
  *
  *  linux/spinlock.h:     builds the final spin_*() APIs.
+ *
+ *
+ * Public types and naming conventions:
+ * ------------------------------------
+ * spinlock_t:				type:  sleep-lock
+ * raw_spinlock_t:			type:  spin-lock (debug)
+ *
+ * spin_lock([raw_]spinlock_t):		API:   acquire lock, both types
+ *
+ *
+ * Internal types and naming conventions:
+ * -------------------------------------
+ * __raw_spinlock_t:			type: lowlevel spin-lock
+ *
+ * _spin_lock(struct rt_mutex):		API:  acquire sleep-lock
+ * __spin_lock(raw_spinlock_t):		API:  acquire spin-lock (highlevel)
+ * _raw_spin_lock(raw_spinlock_t):	API:  acquire spin-lock (debug)
+ * __raw_spin_lock(__raw_spinlock_t):	API:  acquire spin-lock (lowlevel)
+ *
+ *
+ * spin_lock(raw_spinlock_t) translates into the following chain of
+ * calls/inlines/macros, if spin-lock debugging is enabled:
+ *
+ *       spin_lock()			[include/linux/spinlock.h]
+ * ->    __spin_lock()			[kernel/spinlock.c]
+ *  ->   _raw_spin_lock()		[lib/spinlock_debug.c]
+ *   ->  __raw_spin_lock()		[include/asm/spinlock.h]
+ *
+ * spin_lock(spinlock_t) translates into the following chain of
+ * calls/inlines/macros:
+ *
+ *       spin_lock()			[include/linux/spinlock.h]
+ * ->    _spin_lock()			[include/linux/spinlock.h]
+ *  ->   rt_spin_lock()			[kernel/rtmutex.c]
+ *   ->  rt_spin_lock_fastlock()	[kernel/rtmutex.c]
+ *    -> rt_spin_lock_slowlock()	[kernel/rtmutex.c]
  */
 
 #include <linux/preempt.h>
@@ -51,29 +87,15 @@
 #include <linux/compiler.h>
 #include <linux/thread_info.h>
 #include <linux/kernel.h>
+#include <linux/cache.h>
 #include <linux/stringify.h>
 #include <linux/bottom_half.h>
+#include <linux/irqflags.h>
+#include <linux/pickop.h>
 
 #include <asm/system.h>
 
 /*
- * Must define these before including other files, inline functions need them
- */
-#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME
-
-#define LOCK_SECTION_START(extra)               \
-        ".subsection 1\n\t"                     \
-        extra                                   \
-        ".ifndef " LOCK_SECTION_NAME "\n\t"     \
-        LOCK_SECTION_NAME ":\n\t"               \
-        ".endif\n"
-
-#define LOCK_SECTION_END                        \
-        ".previous\n\t"
-
-#define __lockfunc __attribute__((section(".spinlock.text")))
-
-/*
  * Pull the raw_spinlock_t and raw_rwlock_t definitions:
  */
 #include <linux/spinlock_types.h>
@@ -89,36 +111,10 @@ extern int __lockfunc generic__raw_read_
 # include <linux/spinlock_up.h>
 #endif
 
-#ifdef CONFIG_DEBUG_SPINLOCK
-  extern void __spin_lock_init(spinlock_t *lock, const char *name,
-			       struct lock_class_key *key);
-# define spin_lock_init(lock)					\
-do {								\
-	static struct lock_class_key __key;			\
-								\
-	__spin_lock_init((lock), #lock, &__key);		\
-} while (0)
-
-#else
-# define spin_lock_init(lock)					\
-	do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0)
-#endif
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-  extern void __rwlock_init(rwlock_t *lock, const char *name,
-			    struct lock_class_key *key);
-# define rwlock_init(lock)					\
-do {								\
-	static struct lock_class_key __key;			\
-								\
-	__rwlock_init((lock), #lock, &__key);			\
-} while (0)
-#else
-# define rwlock_init(lock)					\
-	do { *(lock) = RW_LOCK_UNLOCKED; } while (0)
-#endif
-
-#define spin_is_locked(lock)	__raw_spin_is_locked(&(lock)->raw_lock)
+/*
+ * Pull the RT types:
+ */
+#include <linux/rt_lock.h>
 
 #ifdef CONFIG_GENERIC_LOCKBREAK
 #define spin_is_contended(lock) ((lock)->break_lock)
@@ -126,12 +122,6 @@ do {								\
 #define spin_is_contended(lock)	__raw_spin_is_contended(&(lock)->raw_lock)
 #endif
 
-/**
- * spin_unlock_wait - wait until the spinlock gets unlocked
- * @lock: the spinlock in question.
- */
-#define spin_unlock_wait(lock)	__raw_spin_unlock_wait(&(lock)->raw_lock)
-
 /*
  * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
  */
@@ -142,16 +132,16 @@ do {								\
 #endif
 
 #ifdef CONFIG_DEBUG_SPINLOCK
- extern void _raw_spin_lock(spinlock_t *lock);
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
- extern int _raw_spin_trylock(spinlock_t *lock);
- extern void _raw_spin_unlock(spinlock_t *lock);
- extern void _raw_read_lock(rwlock_t *lock);
- extern int _raw_read_trylock(rwlock_t *lock);
- extern void _raw_read_unlock(rwlock_t *lock);
- extern void _raw_write_lock(rwlock_t *lock);
- extern int _raw_write_trylock(rwlock_t *lock);
- extern void _raw_write_unlock(rwlock_t *lock);
+ extern __lockfunc void _raw_spin_lock(raw_spinlock_t *lock);
+# define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+ extern __lockfunc int _raw_spin_trylock(raw_spinlock_t *lock);
+ extern __lockfunc void _raw_spin_unlock(raw_spinlock_t *lock);
+ extern __lockfunc void _raw_read_lock(raw_rwlock_t *lock);
+ extern __lockfunc int _raw_read_trylock(raw_rwlock_t *lock);
+ extern __lockfunc void _raw_read_unlock(raw_rwlock_t *lock);
+ extern __lockfunc void _raw_write_lock(raw_rwlock_t *lock);
+ extern __lockfunc int _raw_write_trylock(raw_rwlock_t *lock);
+ extern __lockfunc void _raw_write_unlock(raw_rwlock_t *lock);
 #else
 # define _raw_spin_lock(lock)		__raw_spin_lock(&(lock)->raw_lock)
 # define _raw_spin_lock_flags(lock, flags) \
@@ -166,141 +156,460 @@ do {								\
 # define _raw_write_unlock(rwlock)	__raw_write_unlock(&(rwlock)->raw_lock)
 #endif
 
-#define read_can_lock(rwlock)		__raw_read_can_lock(&(rwlock)->raw_lock)
-#define write_can_lock(rwlock)		__raw_write_can_lock(&(rwlock)->raw_lock)
+extern int __bad_spinlock_type(void);
+extern int __bad_rwlock_type(void);
+
+extern void
+__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
+
+extern void __lockfunc rt_spin_lock(spinlock_t *lock);
+extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
+extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
+extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
+extern int __lockfunc
+rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
+extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
+extern int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic);
 
 /*
- * Define the various spin_lock and rw_lock methods.  Note we define these
- * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
- * methods are defined as nops in the case they are not required.
+ * lockdep-less calls, for derived types like rwlock:
+ * (for trylock they can use rt_mutex_trylock() directly.
  */
-#define spin_trylock(lock)		__cond_lock(lock, _spin_trylock(lock))
-#define read_trylock(lock)		__cond_lock(lock, _read_trylock(lock))
-#define write_trylock(lock)		__cond_lock(lock, _write_trylock(lock))
+extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
+extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
 
-#define spin_lock(lock)			_spin_lock(lock)
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass)
+#ifdef CONFIG_PREEMPT_RT
+# define _spin_lock(l)			rt_spin_lock(l)
+# define _spin_lock_nested(l, s)	rt_spin_lock_nested(l, s)
+# define _spin_lock_bh(l)		rt_spin_lock(l)
+# define _spin_lock_irq(l)		rt_spin_lock(l)
+# define _spin_unlock(l)		rt_spin_unlock(l)
+# define _spin_unlock_no_resched(l)	rt_spin_unlock(l)
+# define _spin_unlock_bh(l)		rt_spin_unlock(l)
+# define _spin_unlock_irq(l)		rt_spin_unlock(l)
+# define _spin_unlock_irqrestore(l, f)	rt_spin_unlock(l)
+static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+{
+	rt_spin_lock(lock);
+	return 0;
+}
+static inline unsigned long __lockfunc
+_spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
+{
+	rt_spin_lock_nested(lock, subclass);
+	return 0;
+}
 #else
-# define spin_lock_nested(lock, subclass) _spin_lock(lock)
+static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+{
+	return 0;
+}
+static inline unsigned long __lockfunc
+_spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
+{
+	return 0;
+}
+# define _spin_lock(l)			do { } while (0)
+# define _spin_lock_nested(l, s)	do { } while (0)
+# define _spin_lock_bh(l)		do { } while (0)
+# define _spin_lock_irq(l)		do { } while (0)
+# define _spin_unlock(l)		do { } while (0)
+# define _spin_unlock_no_resched(l)	do { } while (0)
+# define _spin_unlock_bh(l)		do { } while (0)
+# define _spin_unlock_irq(l)		do { } while (0)
+# define _spin_unlock_irqrestore(l, f)	do { } while (0)
 #endif
 
-#define write_lock(lock)		_write_lock(lock)
-#define read_lock(lock)			_read_lock(lock)
+#define _spin_lock_init(sl, n, f, l) \
+do {							\
+	static struct lock_class_key __key;		\
+							\
+	__rt_spin_lock_init(sl, n, &__key);		\
+} while (0)
 
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+# ifdef CONFIG_PREEMPT_RT
+#  define _spin_can_lock(l)		(!rt_mutex_is_locked(&(l)->lock))
+#  define _spin_is_locked(l)		rt_mutex_is_locked(&(l)->lock)
+#  define _spin_unlock_wait(l)		rt_spin_unlock_wait(l)
+
+#  define _spin_trylock(l)		rt_spin_trylock(l)
+#  define _spin_trylock_bh(l)		rt_spin_trylock(l)
+#  define _spin_trylock_irq(l)		rt_spin_trylock(l)
+#  define _spin_trylock_irqsave(l,f)	rt_spin_trylock_irqsave(l, f)
+# else
+
+   extern int this_should_never_be_called_on_non_rt(spinlock_t *lock);
+#  define TSNBCONRT(l) this_should_never_be_called_on_non_rt(l)
+#  define _spin_can_lock(l)		TSNBCONRT(l)
+#  define _spin_is_locked(l)		TSNBCONRT(l)
+#  define _spin_unlock_wait(l)		TSNBCONRT(l)
+
+#  define _spin_trylock(l)		TSNBCONRT(l)
+#  define _spin_trylock_bh(l)		TSNBCONRT(l)
+#  define _spin_trylock_irq(l)		TSNBCONRT(l)
+#  define _spin_trylock_irqsave(l,f)	TSNBCONRT(l)
+#endif
 
-#define spin_lock_irqsave(lock, flags)	flags = _spin_lock_irqsave(lock)
-#define read_lock_irqsave(lock, flags)	flags = _read_lock_irqsave(lock)
-#define write_lock_irqsave(lock, flags)	flags = _write_lock_irqsave(lock)
+extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
+extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
+extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
+extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock,
+					       unsigned long *flags);
+extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
+extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
+extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
+extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
+extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
+extern void
+__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
+
+#define _rwlock_init(rwl, n, f, l)			\
+do {							\
+	static struct lock_class_key __key;		\
+							\
+	__rt_rwlock_init(rwl, n, &__key);		\
+} while (0)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#define spin_lock_irqsave_nested(lock, flags, subclass) \
-	flags = _spin_lock_irqsave_nested(lock, subclass)
+#ifdef CONFIG_PREEMPT_RT
+# define rt_read_can_lock(rwl)	(!rt_mutex_is_locked(&(rwl)->lock))
+# define rt_write_can_lock(rwl)	((rwl)->owners.owner == NULL)
 #else
-#define spin_lock_irqsave_nested(lock, flags, subclass) \
-	flags = _spin_lock_irqsave(lock)
+ extern int rt_rwlock_can_lock_never_call_on_non_rt(rwlock_t *rwlock);
+# define rt_read_can_lock(rwl)	rt_rwlock_can_lock_never_call_on_non_rt(rwl)
+# define rt_write_can_lock(rwl)	rt_rwlock_can_lock_never_call_on_non_rt(rwl)
 #endif
 
+# define _read_can_lock(rwl)	rt_read_can_lock(rwl)
+# define _write_can_lock(rwl)	rt_write_can_lock(rwl)
+
+# define _read_trylock(rwl)	rt_read_trylock(rwl)
+# define _write_trylock(rwl)	rt_write_trylock(rwl)
+# define _write_trylock_irqsave(rwl, flags) \
+	rt_write_trylock_irqsave(rwl, flags)
+
+# define _read_lock(rwl)	rt_read_lock(rwl)
+# define _write_lock(rwl)	rt_write_lock(rwl)
+# define _read_unlock(rwl)	rt_read_unlock(rwl)
+# define _write_unlock(rwl)	rt_write_unlock(rwl)
+
+# define _read_lock_bh(rwl)	rt_read_lock(rwl)
+# define _write_lock_bh(rwl)	rt_write_lock(rwl)
+# define _read_unlock_bh(rwl)	rt_read_unlock(rwl)
+# define _write_unlock_bh(rwl)	rt_write_unlock(rwl)
+
+# define _read_lock_irq(rwl)	rt_read_lock(rwl)
+# define _write_lock_irq(rwl)	rt_write_lock(rwl)
+# define _read_unlock_irq(rwl)	rt_read_unlock(rwl)
+# define _write_unlock_irq(rwl)	rt_write_unlock(rwl)
+
+# define _read_lock_irqsave(rwl) 	rt_read_lock_irqsave(rwl)
+# define _write_lock_irqsave(rwl)	rt_write_lock_irqsave(rwl)
+
+# define _read_unlock_irqrestore(rwl, f)	rt_read_unlock(rwl)
+# define _write_unlock_irqrestore(rwl, f)	rt_write_unlock(rwl)
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+  extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
+				   struct lock_class_key *key);
+# define _raw_spin_lock_init(lock, name, file, line)		\
+do {								\
+	static struct lock_class_key __key;			\
+								\
+	__raw_spin_lock_init((lock), #lock, &__key);		\
+} while (0)
+
 #else
+#define __raw_spin_lock_init(lock) \
+	do { *(lock) = RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)
+# define _raw_spin_lock_init(lock, name, file, line) __raw_spin_lock_init(lock)
+#endif
 
-#define spin_lock_irqsave(lock, flags)	_spin_lock_irqsave(lock, flags)
-#define read_lock_irqsave(lock, flags)	_read_lock_irqsave(lock, flags)
-#define write_lock_irqsave(lock, flags)	_write_lock_irqsave(lock, flags)
-#define spin_lock_irqsave_nested(lock, flags, subclass)	\
-	spin_lock_irqsave(lock, flags)
+/*
+ * PICK_SPIN_OP()/PICK_RW_OP() are simple redirectors for PICK_FUNCTION
+ */
+#define PICK_SPIN_OP(...)	\
+	PICK_FUNCTION(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__)
+#define PICK_SPIN_OP_RET(...)	\
+	PICK_FUNCTION_RET(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__)
+#define PICK_RW_OP(...)	PICK_FUNCTION(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__)
+#define PICK_RW_OP_RET(...)	\
+	PICK_FUNCTION_RET(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__)
+
+#define spin_lock_init(lock) \
+	PICK_SPIN_OP(_raw_spin_lock_init, _spin_lock_init, lock, #lock,	\
+		__FILE__, __LINE__)
 
+#ifdef CONFIG_DEBUG_SPINLOCK
+  extern void __raw_rwlock_init(raw_rwlock_t *lock, const char *name,
+				struct lock_class_key *key);
+# define _raw_rwlock_init(lock, name, file, line)		\
+do {								\
+	static struct lock_class_key __key;			\
+								\
+	__raw_rwlock_init((lock), #lock, &__key);		\
+} while (0)
+#else
+#define __raw_rwlock_init(lock) \
+	do { *(lock) = RAW_RW_LOCK_UNLOCKED(lock); } while (0)
+# define _raw_rwlock_init(lock, name, file, line) __raw_rwlock_init(lock)
 #endif
 
-#define spin_lock_irq(lock)		_spin_lock_irq(lock)
-#define spin_lock_bh(lock)		_spin_lock_bh(lock)
+#define rwlock_init(lock) \
+	PICK_RW_OP(_raw_rwlock_init, _rwlock_init, lock, #lock,	\
+		__FILE__, __LINE__)
+
+#define __spin_is_locked(lock)	__raw_spin_is_locked(&(lock)->raw_lock)
 
-#define read_lock_irq(lock)		_read_lock_irq(lock)
-#define read_lock_bh(lock)		_read_lock_bh(lock)
+#define spin_is_locked(lock)	\
+	PICK_SPIN_OP_RET(__spin_is_locked, _spin_is_locked, lock)
 
-#define write_lock_irq(lock)		_write_lock_irq(lock)
-#define write_lock_bh(lock)		_write_lock_bh(lock)
+#define __spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock)
+
+#define spin_unlock_wait(lock) \
+	PICK_SPIN_OP(__spin_unlock_wait, _spin_unlock_wait, lock)
 
 /*
- * We inline the unlock functions in the nondebug case:
+ * Define the various spin_lock and rw_lock methods.  Note we define these
+ * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
+ * methods are defined as nops in the case they are not required.
  */
-#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \
-	!defined(CONFIG_SMP)
-# define spin_unlock(lock)		_spin_unlock(lock)
-# define read_unlock(lock)		_read_unlock(lock)
-# define write_unlock(lock)		_write_unlock(lock)
-# define spin_unlock_irq(lock)		_spin_unlock_irq(lock)
-# define read_unlock_irq(lock)		_read_unlock_irq(lock)
-# define write_unlock_irq(lock)		_write_unlock_irq(lock)
+#define spin_trylock(lock)	\
+	__cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock, _spin_trylock, lock))
+
+#define read_trylock(lock)	\
+	__cond_lock(lock, PICK_RW_OP_RET(__read_trylock, _read_trylock, lock))
+
+#define write_trylock(lock)	\
+	__cond_lock(lock, PICK_RW_OP_RET(__write_trylock, _write_trylock, lock))
+
+#define write_trylock_irqsave(lock, flags) \
+	__cond_lock(lock, PICK_RW_OP_RET(__write_trylock_irqsave, 	\
+		_write_trylock_irqsave, lock, &flags))
+
+#define __spin_can_lock(lock)	__raw_spin_can_lock(&(lock)->raw_lock)
+#define __read_can_lock(lock)	__raw_read_can_lock(&(lock)->raw_lock)
+#define __write_can_lock(lock)	__raw_write_can_lock(&(lock)->raw_lock)
+
+#define spin_can_lock(lock) \
+	__cond_lock(lock, PICK_SPIN_OP_RET(__spin_can_lock, _spin_can_lock,\
+		lock))
+
+#define read_can_lock(lock) \
+	__cond_lock(lock, PICK_RW_OP_RET(__read_can_lock, _read_can_lock, lock))
+
+#define write_can_lock(lock) \
+	__cond_lock(lock, PICK_RW_OP_RET(__write_can_lock, _write_can_lock,\
+		lock))
+
+#define spin_lock(lock) PICK_SPIN_OP(__spin_lock, _spin_lock, lock)
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define spin_lock_nested(lock, subclass)	\
+	PICK_SPIN_OP(__spin_lock_nested, _spin_lock_nested, lock, subclass)
 #else
-# define spin_unlock(lock) \
-    do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0)
-# define read_unlock(lock) \
-    do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0)
-# define write_unlock(lock) \
-    do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0)
-# define spin_unlock_irq(lock)			\
-do {						\
-	__raw_spin_unlock(&(lock)->raw_lock);	\
-	__release(lock);			\
-	local_irq_enable();			\
-} while (0)
-# define read_unlock_irq(lock)			\
-do {						\
-	__raw_read_unlock(&(lock)->raw_lock);	\
-	__release(lock);			\
-	local_irq_enable();			\
+# define spin_lock_nested(lock, subclass) spin_lock(lock)
+#endif
+
+#define write_lock(lock) PICK_RW_OP(__write_lock, _write_lock, lock)
+
+#define read_lock(lock)	PICK_RW_OP(__read_lock, _read_lock, lock)
+
+# define spin_lock_irqsave(lock, flags)				\
+do {								\
+	BUILD_CHECK_IRQ_FLAGS(flags);				\
+	flags = PICK_SPIN_OP_RET(__spin_lock_irqsave, _spin_lock_irqsave, \
+			lock);						\
 } while (0)
-# define write_unlock_irq(lock)			\
-do {						\
-	__raw_write_unlock(&(lock)->raw_lock);	\
-	__release(lock);			\
-	local_irq_enable();			\
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define spin_lock_irqsave_nested(lock, flags, subclass)		\
+do {									\
+	BUILD_CHECK_IRQ_FLAGS(flags);					\
+	flags = PICK_SPIN_OP_RET(__spin_lock_irqsave_nested, 		\
+		_spin_lock_irqsave_nested, lock, subclass);		\
 } while (0)
+#else
+# define spin_lock_irqsave_nested(lock, flags, subclass) \
+				spin_lock_irqsave(lock, flags)
 #endif
 
-#define spin_unlock_irqrestore(lock, flags) \
-					_spin_unlock_irqrestore(lock, flags)
-#define spin_unlock_bh(lock)		_spin_unlock_bh(lock)
+# define read_lock_irqsave(lock, flags)				\
+do {								\
+	BUILD_CHECK_IRQ_FLAGS(flags);				\
+	flags = PICK_RW_OP_RET(__read_lock_irqsave, _read_lock_irqsave, lock);\
+} while (0)
+
+# define write_lock_irqsave(lock, flags)			\
+do {								\
+	BUILD_CHECK_IRQ_FLAGS(flags);				\
+	flags = PICK_RW_OP_RET(__write_lock_irqsave, _write_lock_irqsave,lock);\
+} while (0)
+
+#define spin_lock_irq(lock) PICK_SPIN_OP(__spin_lock_irq, _spin_lock_irq, lock)
+
+#define spin_lock_bh(lock) PICK_SPIN_OP(__spin_lock_bh, _spin_lock_bh, lock)
+
+#define read_lock_irq(lock) PICK_RW_OP(__read_lock_irq, _read_lock_irq, lock)
 
-#define read_unlock_irqrestore(lock, flags) \
-					_read_unlock_irqrestore(lock, flags)
-#define read_unlock_bh(lock)		_read_unlock_bh(lock)
+#define read_lock_bh(lock) PICK_RW_OP(__read_lock_bh, _read_lock_bh, lock)
 
-#define write_unlock_irqrestore(lock, flags) \
-					_write_unlock_irqrestore(lock, flags)
-#define write_unlock_bh(lock)		_write_unlock_bh(lock)
+#define write_lock_irq(lock) PICK_RW_OP(__write_lock_irq, _write_lock_irq, lock)
 
-#define spin_trylock_bh(lock)	__cond_lock(lock, _spin_trylock_bh(lock))
+#define write_lock_bh(lock) PICK_RW_OP(__write_lock_bh, _write_lock_bh, lock)
 
-#define spin_trylock_irq(lock) \
-({ \
-	local_irq_disable(); \
-	spin_trylock(lock) ? \
-	1 : ({ local_irq_enable(); 0;  }); \
-})
+#define spin_unlock(lock) PICK_SPIN_OP(__spin_unlock, _spin_unlock, lock)
+
+#define read_unlock(lock) PICK_RW_OP(__read_unlock, _read_unlock, lock)
+
+#define write_unlock(lock) PICK_RW_OP(__write_unlock, _write_unlock, lock)
+
+#define spin_unlock_no_resched(lock) \
+	PICK_SPIN_OP(__spin_unlock_no_resched, _spin_unlock_no_resched, lock)
+
+#define spin_unlock_irqrestore(lock, flags)				\
+do {									\
+	BUILD_CHECK_IRQ_FLAGS(flags);					\
+	PICK_SPIN_OP(__spin_unlock_irqrestore, _spin_unlock_irqrestore,	\
+		     lock, flags);					\
+} while (0)
+
+#define spin_unlock_irq(lock)	\
+	PICK_SPIN_OP(__spin_unlock_irq, _spin_unlock_irq, lock)
+#define spin_unlock_bh(lock)	\
+	PICK_SPIN_OP(__spin_unlock_bh, _spin_unlock_bh, lock)
+
+#define read_unlock_irqrestore(lock, flags)				\
+do {									\
+	BUILD_CHECK_IRQ_FLAGS(flags);					\
+	PICK_RW_OP(__read_unlock_irqrestore, _read_unlock_irqrestore,	\
+		lock, flags);						\
+} while (0)
+
+#define read_unlock_irq(lock)	\
+	PICK_RW_OP(__read_unlock_irq, _read_unlock_irq, lock)
+#define read_unlock_bh(lock) PICK_RW_OP(__read_unlock_bh, _read_unlock_bh, lock)
+
+#define write_unlock_irqrestore(lock, flags)				\
+do {									\
+	BUILD_CHECK_IRQ_FLAGS(flags);					\
+	PICK_RW_OP(__write_unlock_irqrestore, _write_unlock_irqrestore, \
+		lock, flags);						\
+} while (0)
+#define write_unlock_irq(lock)	\
+	PICK_RW_OP(__write_unlock_irq, _write_unlock_irq, lock)
+
+#define write_unlock_bh(lock)	\
+	PICK_RW_OP(__write_unlock_bh, _write_unlock_bh, lock)
+
+#define spin_trylock_bh(lock)	\
+	__cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_bh, _spin_trylock_bh,\
+		lock))
+
+#define spin_trylock_irq(lock)	\
+	__cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irq,		\
+		_spin_trylock_irq, lock))
 
 #define spin_trylock_irqsave(lock, flags) \
-({ \
-	local_irq_save(flags); \
-	spin_trylock(lock) ? \
-	1 : ({ local_irq_restore(flags); 0; }); \
-})
+	__cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irqsave, 	\
+		_spin_trylock_irqsave, lock, &flags))
 
-#define write_trylock_irqsave(lock, flags) \
-({ \
-	local_irq_save(flags); \
-	write_trylock(lock) ? \
-	1 : ({ local_irq_restore(flags); 0; }); \
-})
+/* "lock on reference count zero" */
+#ifndef ATOMIC_DEC_AND_LOCK
+# include <asm/atomic.h>
+  extern int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic);
+#endif
+
+#define atomic_dec_and_lock(atomic, lock)				\
+	__cond_lock(lock, PICK_SPIN_OP_RET(__atomic_dec_and_spin_lock,	\
+		_atomic_dec_and_spin_lock, lock, atomic))
+
+/*
+ *  bit-based spin_lock()
+ *
+ * Don't use this unless you really need to: spin_lock() and spin_unlock()
+ * are significantly faster.
+ */
+static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+{
+	/*
+	 * Assuming the lock is uncontended, this never enters
+	 * the body of the outer loop. If it is contended, then
+	 * within the inner loop a non-atomic test is used to
+	 * busywait with less bus contention for a good time to
+	 * attempt to acquire the lock bit.
+	 */
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	while (test_and_set_bit(bitnum, addr))
+		while (test_bit(bitnum, addr))
+			cpu_relax();
+#endif
+	__acquire(bitlock);
+}
+
+/*
+ * Return true if it was acquired
+ */
+static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	if (test_and_set_bit(bitnum, addr))
+		return 0;
+#endif
+	__acquire(bitlock);
+	return 1;
+}
+
+/*
+ *  bit-based spin_unlock()
+ */
+static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	BUG_ON(!test_bit(bitnum, addr));
+	smp_mb__before_clear_bit();
+	clear_bit(bitnum, addr);
+#endif
+	__release(bitlock);
+}
+
+/*
+ * Return true if the lock is held.
+ */
+static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	return test_bit(bitnum, addr);
+#else
+	return 1;
+#endif
+}
+
+/**
+ * __raw_spin_can_lock - would __raw_spin_trylock() succeed?
+ * @lock: the spinlock in question.
+ */
+#define __raw_spin_can_lock(lock)            (!__raw_spin_is_locked(lock))
 
 /*
  * Locks two spinlocks l1 and l2.
  * l1_first indicates if spinlock l1 should be taken first.
  */
-static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2,
-				    bool l1_first)
+static inline void
+raw_double_spin_lock(raw_spinlock_t *l1, raw_spinlock_t *l2, bool l1_first)
+	__acquires(l1)
+	__acquires(l2)
+{
+	if (l1_first) {
+		spin_lock(l1);
+		spin_lock(l2);
+	} else {
+		spin_lock(l2);
+		spin_lock(l1);
+	}
+}
+
+static inline void
+double_spin_lock(spinlock_t *l1, spinlock_t *l2, bool l1_first)
 	__acquires(l1)
 	__acquires(l2)
 {
@@ -313,13 +622,15 @@ static inline void double_spin_lock(spin
 	}
 }
 
+
 /*
  * Unlocks two spinlocks l1 and l2.
  * l1_taken_first indicates if spinlock l1 was taken first and therefore
  * should be released after spinlock l2.
  */
-static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2,
-				      bool l1_taken_first)
+static inline void
+raw_double_spin_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2,
+		       bool l1_taken_first)
 	__releases(l1)
 	__releases(l2)
 {
@@ -332,27 +643,18 @@ static inline void double_spin_unlock(sp
 	}
 }
 
-/*
- * Pull the atomic_t declaration:
- * (asm-mips/atomic.h needs above definitions)
- */
-#include <asm/atomic.h>
-/**
- * atomic_dec_and_lock - lock on reaching reference count zero
- * @atomic: the atomic counter
- * @lock: the spinlock in question
- *
- * Decrements @atomic by 1.  If the result is 0, returns true and locks
- * @lock.  Returns false for all other cases.
- */
-extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
-#define atomic_dec_and_lock(atomic, lock) \
-		__cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
-
-/**
- * spin_can_lock - would spin_trylock() succeed?
- * @lock: the spinlock in question.
- */
-#define spin_can_lock(lock)	(!spin_is_locked(lock))
+static inline void
+double_spin_unlock(spinlock_t *l1, spinlock_t *l2, bool l1_taken_first)
+	__releases(l1)
+	__releases(l2)
+{
+	if (l1_taken_first) {
+		spin_unlock(l2);
+		spin_unlock(l1);
+	} else {
+		spin_unlock(l1);
+		spin_unlock(l2);
+	}
+}
 
 #endif /* __LINUX_SPINLOCK_H */
Index: linux-2.6.25.4-rt4/kernel/irq/migration.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/irq/migration.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/irq/migration.c	2008-05-29 09:46:15.000000000 -0400
@@ -61,6 +61,7 @@ void move_masked_irq(int irq)
 void move_native_irq(int irq)
 {
 	struct irq_desc *desc = irq_desc + irq;
+	int mask = 1;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -68,8 +69,17 @@ void move_native_irq(int irq)
 	if (unlikely(desc->status & IRQ_DISABLED))
 		return;
 
-	desc->chip->mask(irq);
+	/*
+	 * If the irq is already in progress, it should be masked.
+	 * If we unmask it, we might cause an interrupt storm on RT.
+	 */
+	if (unlikely(desc->status & IRQ_INPROGRESS))
+		mask = 0;
+
+	if (mask)
+		desc->chip->mask(irq);
 	move_masked_irq(irq);
-	desc->chip->unmask(irq);
+	if (mask)
+		desc->chip->unmask(irq);
 }
 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/io_apic_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/io_apic_64.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/io_apic_64.c	2008-05-29 09:47:18.000000000 -0400
@@ -93,8 +93,8 @@ int timer_over_8254 __initdata = 1;
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
-static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+DEFINE_RAW_SPINLOCK(vector_lock);
 
 /*
  * # of IRQ routing registers
@@ -207,6 +207,9 @@ static inline void io_apic_sync(unsigned
 		reg ACTION;						\
 		io_apic_modify(entry->apic, reg);			\
 		FINAL;							\
+		 /* Force POST flush by reading: */			\
+		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
+									\
 		if (!entry->next)					\
 			break;						\
 		entry = irq_2_pin + entry->next;			\
@@ -351,10 +354,11 @@ static void add_pin_to_irq(unsigned int 
 	static void name##_IO_APIC_irq (unsigned int irq)		\
 	__DO_ACTION(R, ACTION, FINAL)
 
-DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
-						/* mask = 1 */
-DO_ACTION( __unmask,           0, &= 0xfffeffff, )
-						/* mask = 0 */
+DO_ACTION( __mask,             0, |= 0x00010000, ) /* mask = 1 */
+DO_ACTION( __unmask,           0, &= 0xfffeffff, ) /* mask = 0 */
+
+DO_ACTION( __pcix_mask,   0, &= 0xffff7fff, ) /* edge */
+DO_ACTION( __pcix_unmask, 0, = (reg & 0xfffeffff) | 0x00008000, ) /* level */
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
@@ -373,6 +377,23 @@ static void unmask_IO_APIC_irq (unsigned
 	__unmask_IO_APIC_irq(irq);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
+static void pcix_mask_IO_APIC_irq (unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__pcix_mask_IO_APIC_irq(irq);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void pcix_unmask_IO_APIC_irq (unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__pcix_unmask_IO_APIC_irq(irq);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
 
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
@@ -798,17 +819,20 @@ void __setup_vector_irq(int cpu)
 
 
 static struct irq_chip ioapic_chip;
+static struct irq_chip pcix_ioapic_chip;
 
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, unsigned long trigger, int pcix)
 {
+	struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip;
+
 	if (trigger) {
 		irq_desc[irq].status |= IRQ_LEVEL;
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					      handle_fasteoi_irq, "fasteoi");
+		set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq,
+					      pcix ? "pcix-fasteoi" : "fasteoi");
 	} else {
 		irq_desc[irq].status &= ~IRQ_LEVEL;
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					      handle_edge_irq, "edge");
+		set_irq_chip_and_handler_name(irq, chip, handle_edge_irq,
+					      pcix ? "pcix-edge" : "edge");
 	}
 }
 
@@ -853,7 +877,7 @@ static void setup_IO_APIC_irq(int apic, 
 	if (trigger)
 		entry.mask = 1;
 
-	ioapic_register_intr(irq, trigger);
+	ioapic_register_intr(irq, trigger, apic > 0);
 	if (irq < 16)
 		disable_8259A_irq(irq);
 
@@ -1442,7 +1466,8 @@ static void ack_apic_level(unsigned int 
 	irq_complete_move(irq);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+	if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING) &&
+	    !(irq_desc[irq].status & IRQ_INPROGRESS)) {
 		do_unmask_irq = 1;
 		mask_IO_APIC_irq(irq);
 	}
@@ -1486,17 +1511,39 @@ static void ack_apic_level(unsigned int 
 			move_masked_irq(irq);
 		unmask_IO_APIC_irq(irq);
 	}
+#if (defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)) && \
+	defined(CONFIG_PREEMPT_HARDIRQS)
+	/*
+	 * With threaded interrupts, we always have IRQ_INPROGRESS
+	 * when acking.
+	 */
+	else if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING))
+		move_masked_irq(irq);
+#endif
 }
 
 static struct irq_chip ioapic_chip __read_mostly = {
-	.name 		= "IO-APIC",
-	.startup 	= startup_ioapic_irq,
-	.mask	 	= mask_IO_APIC_irq,
-	.unmask	 	= unmask_IO_APIC_irq,
-	.ack 		= ack_apic_edge,
-	.eoi 		= ack_apic_level,
+	.name		= "IO-APIC",
+	.startup	= startup_ioapic_irq,
+	.mask		= mask_IO_APIC_irq,
+	.unmask		= unmask_IO_APIC_irq,
+	.ack		= ack_apic_edge,
+	.eoi		= ack_apic_level,
+#ifdef CONFIG_SMP
+	.set_affinity	= set_ioapic_affinity_irq,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+
+static struct irq_chip pcix_ioapic_chip __read_mostly = {
+	.name		= "IO-APIC",
+	.startup	= startup_ioapic_irq,
+	.mask		= pcix_mask_IO_APIC_irq,
+	.unmask		= pcix_unmask_IO_APIC_irq,
+	.ack		= ack_apic_edge,
+	.eoi		= ack_apic_level,
 #ifdef CONFIG_SMP
-	.set_affinity 	= set_ioapic_affinity_irq,
+	.set_affinity	= set_ioapic_affinity_irq,
 #endif
 	.retrigger	= ioapic_retrigger_irq,
 };
@@ -1696,7 +1743,6 @@ static inline void __init check_timer(vo
 		 */
 		unmask_IO_APIC_irq(0);
 		if (!no_timer_check && timer_irq_works()) {
-			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
 				disable_8259A_irq(0);
 				setup_nmi();
@@ -1722,7 +1768,6 @@ static inline void __init check_timer(vo
 		setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
-			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
 			}
Index: linux-2.6.25.4-rt4/net/core/flow.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/core/flow.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/net/core/flow.c	2008-05-29 09:46:29.000000000 -0400
@@ -40,9 +40,10 @@ atomic_t flow_cache_genid = ATOMIC_INIT(
 
 static u32 flow_hash_shift;
 #define flow_hash_size	(1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
 
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
+static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables);
+
+#define flow_table(cpu) (per_cpu_var_locked(flow_tables, cpu))
 
 static struct kmem_cache *flow_cachep __read_mostly;
 
@@ -169,24 +170,24 @@ static int flow_key_compare(struct flowi
 void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 			flow_resolve_t resolver)
 {
-	struct flow_cache_entry *fle, **head;
+	struct flow_cache_entry **table, *fle, **head = NULL /* shut up GCC */;
 	unsigned int hash;
 	int cpu;
 
 	local_bh_disable();
-	cpu = smp_processor_id();
+	table = get_cpu_var_locked(flow_tables, &cpu);
 
 	fle = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
-	if (!flow_table(cpu))
+	if (!table)
 		goto nocache;
 
 	if (flow_hash_rnd_recalc(cpu))
 		flow_new_hash_rnd(cpu);
 	hash = flow_hash_code(key, cpu);
 
-	head = &flow_table(cpu)[hash];
+	head = &table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
@@ -196,6 +197,7 @@ void *flow_cache_lookup(struct flowi *ke
 
 				if (ret)
 					atomic_inc(fle->object_ref);
+				put_cpu_var_locked(flow_tables, cpu);
 				local_bh_enable();
 
 				return ret;
@@ -221,6 +223,8 @@ void *flow_cache_lookup(struct flowi *ke
 	}
 
 nocache:
+	put_cpu_var_locked(flow_tables, cpu);
+
 	{
 		int err;
 		void *obj;
@@ -250,14 +254,15 @@ nocache:
 static void flow_cache_flush_tasklet(unsigned long data)
 {
 	struct flow_flush_info *info = (void *)data;
+	struct flow_cache_entry **table;
 	int i;
 	int cpu;
 
-	cpu = smp_processor_id();
+	table = get_cpu_var_locked(flow_tables, &cpu);
 	for (i = 0; i < flow_hash_size; i++) {
 		struct flow_cache_entry *fle;
 
-		fle = flow_table(cpu)[i];
+		fle = table[i];
 		for (; fle; fle = fle->next) {
 			unsigned genid = atomic_read(&flow_cache_genid);
 
@@ -268,6 +273,7 @@ static void flow_cache_flush_tasklet(uns
 			atomic_dec(fle->object_ref);
 		}
 	}
+	put_cpu_var_locked(flow_tables, cpu);
 
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
Index: linux-2.6.25.4-rt4/net/sunrpc/svc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/sunrpc/svc.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/net/sunrpc/svc.c	2008-05-29 09:46:15.000000000 -0400
@@ -584,7 +584,7 @@ __svc_create_thread(svc_thread_fn func, 
 	struct svc_rqst	*rqstp;
 	int		error = -ENOMEM;
 	int		have_oldmask = 0;
-	cpumask_t	oldmask;
+	cpumask_t	uninitialized_var(oldmask);
 
 	rqstp = svc_prepare_thread(serv, pool);
 	if (IS_ERR(rqstp)) {
Index: linux-2.6.25.4-rt4/include/net/netfilter/nf_conntrack.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/net/netfilter/nf_conntrack.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/net/netfilter/nf_conntrack.h	2008-05-29 09:47:03.000000000 -0400
@@ -63,11 +63,14 @@ union nf_conntrack_help {
 #ifdef CONFIG_NETFILTER_DEBUG
 #define NF_CT_ASSERT(x)							\
 do {									\
-	if (!(x))							\
+	if (!(x)) {							\
 		/* Wooah!  I'm tripping my conntrack in a frenzy of	\
 		   netplay... */					\
 		printk("NF_CT_ASSERT: %s:%i(%s)\n",			\
 		       __FILE__, __LINE__, __FUNCTION__);		\
+		if (printk_ratelimit())					\
+			WARN_ON(1);					\
+	}								\
 } while(0)
 #else
 #define NF_CT_ASSERT(x)
@@ -255,11 +258,11 @@ extern atomic_t nf_conntrack_count;
 extern int nf_conntrack_max;
 
 DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
-#define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++)
+#define NF_CT_STAT_INC(count) (__raw_get_cpu_var(nf_conntrack_stat).count++)
 #define NF_CT_STAT_INC_ATOMIC(count)			\
 do {							\
 	local_bh_disable();				\
-	__get_cpu_var(nf_conntrack_stat).count++;	\
+	__raw_get_cpu_var(nf_conntrack_stat).count++;	\
 	local_bh_enable();				\
 } while (0)
 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/crash.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/crash.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/crash.c	2008-05-29 09:46:16.000000000 -0400
@@ -78,14 +78,6 @@ static int crash_nmi_callback(struct not
 	return 1;
 }
 
-static void smp_send_nmi_allbutself(void)
-{
-	cpumask_t mask = cpu_online_map;
-	cpu_clear(safe_smp_processor_id(), mask);
-	if (!cpus_empty(mask))
-		send_IPI_mask(mask, NMI_VECTOR);
-}
-
 static struct notifier_block crash_nmi_nb = {
 	.notifier_call = crash_nmi_callback,
 };
Index: linux-2.6.25.4-rt4/include/asm-x86/apic.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/apic.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/apic.h	2008-05-29 09:46:16.000000000 -0400
@@ -137,4 +137,6 @@ static inline void lapic_shutdown(void) 
 
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
+extern void smp_send_nmi_allbutself(void);
+
 #endif /* __ASM_APIC_H */
Index: linux-2.6.25.4-rt4/include/linux/profile.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/profile.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/profile.h	2008-05-29 09:46:43.000000000 -0400
@@ -6,16 +6,18 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
 #include <linux/cache.h>
 
 #include <asm/errno.h>
 
 extern int prof_on __read_mostly;
 
-#define CPU_PROFILING	1
-#define SCHED_PROFILING	2
-#define SLEEP_PROFILING	3
-#define KVM_PROFILING	4
+#define CPU_PROFILING		1
+#define SCHED_PROFILING		2
+#define SLEEP_PROFILING		3
+#define KVM_PROFILING		4
+#define PREEMPT_PROFILING	5
 
 struct proc_dir_entry;
 struct pt_regs;
@@ -23,6 +25,7 @@ struct notifier_block;
 
 /* init basic kernel profiler */
 void __init profile_init(void);
+void __profile_tick(int type, struct pt_regs *regs);
 void profile_tick(int);
 
 /*
@@ -53,6 +56,8 @@ enum profile_type {
 	PROFILE_MUNMAP
 };
 
+extern int prof_pid;
+
 #ifdef CONFIG_PROFILING
 
 struct task_struct;
Index: linux-2.6.25.4-rt4/kernel/profile.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/profile.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/profile.c	2008-05-29 09:46:47.000000000 -0400
@@ -22,6 +22,7 @@
 #include <linux/cpu.h>
 #include <linux/highmem.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
 #include <asm/sections.h>
 #include <asm/semaphore.h>
 #include <asm/irq_regs.h>
@@ -45,6 +46,7 @@ int prof_on __read_mostly;
 EXPORT_SYMBOL_GPL(prof_on);
 
 static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
+int prof_pid = -1;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
 static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -408,16 +410,20 @@ void profile_hits(int type, void *__pc, 
 #endif /* !CONFIG_SMP */
 EXPORT_SYMBOL_GPL(profile_hits);
 
-void profile_tick(int type)
+void __profile_tick(int type, struct pt_regs *regs)
 {
-	struct pt_regs *regs = get_irq_regs();
-
 	if (type == CPU_PROFILING && timer_hook)
 		timer_hook(regs);
-	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
+	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask) &&
+	    (prof_pid == -1 || prof_pid == current->pid))
 		profile_hit(type, (void *)profile_pc(regs));
 }
 
+void profile_tick(int type)
+{
+	return __profile_tick(type, get_irq_regs());
+}
+
 #ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
 #include <asm/uaccess.h>
Index: linux-2.6.25.4-rt4/kernel/time/tick-common.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/time/tick-common.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/time/tick-common.c	2008-05-29 09:46:48.000000000 -0400
@@ -32,7 +32,7 @@ DEFINE_PER_CPU(struct tick_device, tick_
 ktime_t tick_next_period;
 ktime_t tick_period;
 int tick_do_timer_cpu __read_mostly = -1;
-DEFINE_SPINLOCK(tick_device_lock);
+DEFINE_RAW_SPINLOCK(tick_device_lock);
 
 /*
  * Debugging: see timer_list.c
@@ -68,7 +68,6 @@ static void tick_periodic(int cpu)
 	}
 
 	update_process_times(user_mode(get_irq_regs()));
-	profile_tick(CPU_PROFILING);
 }
 
 /*
Index: linux-2.6.25.4-rt4/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/time/tick-sched.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/time/tick-sched.c	2008-05-29 09:46:49.000000000 -0400
@@ -219,10 +219,12 @@ void tick_nohz_stop_sched_tick(void)
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 		goto end;
 
-	if (need_resched())
+	if (need_resched() || need_resched_delayed())
 		goto end;
 
 	cpu = smp_processor_id();
+
+#ifndef CONFIG_PREEMPT_RT
 	if (unlikely(local_softirq_pending())) {
 		static int ratelimit;
 
@@ -232,6 +234,7 @@ void tick_nohz_stop_sched_tick(void)
 			ratelimit++;
 		}
 	}
+#endif
 
 	ts->idle_calls++;
 	/* Read jiffies and the time when jiffies were updated last */
@@ -476,7 +479,6 @@ static void tick_nohz_handler(struct clo
 	}
 
 	update_process_times(user_mode(regs));
-	profile_tick(CPU_PROFILING);
 
 	/* Do not restart, when we are in the idle loop */
 	if (ts->tick_stopped)
@@ -583,7 +585,6 @@ static enum hrtimer_restart tick_sched_t
 			ts->idle_jiffies++;
 		}
 		update_process_times(user_mode(regs));
-		profile_tick(CPU_PROFILING);
 	}
 
 	/* Do not restart, when we are in the idle loop */
Index: linux-2.6.25.4-rt4/arch/ppc/boot/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/boot/Makefile	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/boot/Makefile	2008-05-29 09:46:16.000000000 -0400
@@ -15,6 +15,15 @@
 
 # KBUILD_CFLAGS used when building rest of boot (takes effect recursively)
 KBUILD_CFLAGS 	+= -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include
+
+ifdef CONFIG_MCOUNT
+# do not trace the boot loader
+nullstring :=
+space      := $(nullstring) # end of the line
+pg_flag     = $(nullstring) -pg # end of the line
+KBUILD_CFLAGS     := $(subst ${pg_flag},${space},${KBUILD_CFLAGS})
+endif
+
 HOSTCFLAGS	+= -Iarch/$(ARCH)/boot/include
 
 BOOT_TARGETS	= zImage zImage.initrd znetboot znetboot.initrd
Index: linux-2.6.25.4-rt4/arch/arm/boot/compressed/head.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/boot/compressed/head.S	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/boot/compressed/head.S	2008-05-29 09:46:16.000000000 -0400
@@ -941,6 +941,19 @@ memdump:	mov	r12, r0
 #endif
 
 		.ltorg
+#ifdef CONFIG_MCOUNT
+/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this
+ * trampoline
+ */
+                .text
+                .align 0
+                .type mcount %function
+                .global mcount
+mcount:
+		mov pc, lr	@ just return
+#endif
+
+
 reloc_end:
 
 		.align
Index: linux-2.6.25.4-rt4/arch/arm/kernel/entry-common.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/entry-common.S	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/entry-common.S	2008-05-29 09:46:28.000000000 -0400
@@ -3,6 +3,8 @@
  *
  *  Copyright (C) 2000 Russell King
  *
+ * FUNCTION_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -44,7 +46,7 @@ ret_fast_syscall:
 fast_work_pending:
 	str	r0, [sp, #S_R0+S_OFF]!		@ returned r0
 work_pending:
-	tst	r1, #_TIF_NEED_RESCHED
+	tst	r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	bne	work_resched
 	tst	r1, #_TIF_SIGPENDING
 	beq	no_work_pending
@@ -54,7 +56,8 @@ work_pending:
 	b	ret_slow_syscall		@ Check work again
 
 work_resched:
-	bl	schedule
+	bl	__schedule
+
 /*
  * "slow" syscall return path.  "why" tells us if this was a real syscall.
  */
@@ -394,6 +397,114 @@ ENTRY(sys_oabi_call_table)
 #include "calls.S"
 #undef ABI
 #undef OBSOLETE
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+
+#ifdef CONFIG_MCOUNT
+/*
+ * At the point where we are in mcount() we maintain the
+ * frame of the prologue code and keep the call to mcount()
+ * out of the stack frame list:
+
+        saved pc          <---\     caller of instrumented routine
+        saved lr              |
+        ip/prev_sp            |
+        fp        -----^      |
+         :                    |
+                              |
+     -> saved pc              |     instrumented routine
+    |   saved lr              |
+    |   ip/prev_sp            |
+    |   fp           ---------/
+    |     :
+    |
+    |                             mcount
+    |	saved pc
+    |	saved lr
+    |	ip/prev sp
+     --	fp
+        r3
+        r2
+        r1
+   sp-> r0
+         :
+ */
+
+	.text
+	.align 0
+	.type mcount %function
+	.global mcount
+
+/* gcc -pg generated FUNCTION_PROLOGUE references mcount()
+ * and has already created the stack frame invocation for
+ * the routine we have been called to instrument. We create
+ * a complete frame nevertheless, as we want to use the same
+ * call to mcount() from c code.
+ */
+mcount:
+
+	ldr	ip, =mcount_enabled	@ leave early, if disabled
+	ldr	ip, [ip]
+	cmp	ip, #0
+	moveq	pc, lr
+
+	mov	ip,  sp
+	stmdb   sp!, {r0 - r3, fp, ip, lr, pc}	@ create stack frame
+
+	mov	r2, =mcount_trace_function
+
+	ldr	r1, [fp, #-4]		@ get lr (the return address
+					@ of the caller of the
+					@ instrumented function)
+	mov	r0, lr			@ get lr - (the return address
+					@ of the instrumented function)
+
+	sub	fp, ip, #4		@ point fp at this frame
+
+	bl	r2
+1:
+	ldmdb   fp, {r0 - r3, fp, sp, pc}	@ pop entry frame and return
+
+#endif
+
+/* ARM replacement for unsupported gcc __builtin_return_address(n)
+ * where 0 < n.  n == 0 is supported here as well.
+ *
+ * Walk up the stack frame until the desired frame is found or a NULL
+ * fp is encountered, return NULL in the latter case.
+ *
+ * Note: it is possible under code optimization for the stack invocation
+ * of an ancestor function (level N) to be removed before calling a
+ * descendant function (level N+1).  No easy means is available to deduce
+ * this scenario with the result being [for example] caller_addr(0) when
+ * called from level N+1 returning level N-1 rather than the expected
+ * level N.  This optimization issue appears isolated to the case of
+ * a call to a level N+1 routine made at the tail end of a level N
+ * routine -- the level N frame is deleted and a simple branch is made
+ * to the level N+1 routine.
+ */
+
+	.text
+	.align 0
+	.type arm_return_addr %function
+	.global arm_return_addr
+
+arm_return_addr:
+	mov	ip, r0
+	mov	r0, fp
+3:
+	cmp	r0, #0
+	beq	1f		@ frame list hit end, bail
+	cmp	ip, #0
+	beq	2f		@ reached desired frame
+	ldr	r0, [r0, #-12]  @ else continue, get next fp
+	sub	ip, ip, #1
+	b	 3b
+2:
+	ldr	r0, [r0, #-4]   @ get target return address
+1:
+	mov	pc, lr
 
 #endif
 
Index: linux-2.6.25.4-rt4/arch/arm/kernel/fiq.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/fiq.c	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/fiq.c	2008-05-29 09:46:16.000000000 -0400
@@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign
  * disable irqs for the duration.  Note - these functions are almost
  * entirely coded in assembly.
  */
-void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs)
+void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs)
 {
 	register unsigned long tmp;
 	asm volatile (
@@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs
 	: "r" (&regs->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE));
 }
 
-void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs)
+void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs)
 {
 	register unsigned long tmp;
 	asm volatile (
Index: linux-2.6.25.4-rt4/arch/arm/mm/copypage-v4mc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mm/copypage-v4mc.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mm/copypage-v4mc.c	2008-05-29 09:46:35.000000000 -0400
@@ -30,7 +30,7 @@
 #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \
 				  L_PTE_CACHEABLE)
 
-static DEFINE_SPINLOCK(minicache_lock);
+static DEFINE_RAW_SPINLOCK(minicache_lock);
 
 /*
  * ARMv4 mini-dcache optimised copy_user_page
@@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock);
  * instruction.  If your processor does not supply this, you have to write your
  * own copy_user_page that does the right thing.
  */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 mc_copy_user_page(void *from, void *to)
 {
 	asm volatile(
@@ -88,7 +88,7 @@ void v4_mc_copy_user_page(void *kto, con
 /*
  * ARMv4 optimised clear_user_page
  */
-void __attribute__((naked))
+void notrace __attribute__((naked))
 v4_mc_clear_user_page(void *kaddr, unsigned long vaddr)
 {
 	asm volatile(
Index: linux-2.6.25.4-rt4/arch/arm/mm/copypage-xscale.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mm/copypage-xscale.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mm/copypage-xscale.c	2008-05-29 09:46:35.000000000 -0400
@@ -32,7 +32,7 @@
 #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \
 				  L_PTE_CACHEABLE)
 
-static DEFINE_SPINLOCK(minicache_lock);
+static DEFINE_RAW_SPINLOCK(minicache_lock);
 
 /*
  * XScale mini-dcache optimised copy_user_page
@@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock);
  * Dcache aliasing issue.  The writes will be forwarded to the write buffer,
  * and merged as appropriate.
  */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 mc_copy_user_page(void *from, void *to)
 {
 	/*
@@ -110,7 +110,7 @@ void xscale_mc_copy_user_page(void *kto,
 /*
  * XScale optimised clear_user_page
  */
-void __attribute__((naked))
+void notrace __attribute__((naked))
 xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr)
 {
 	asm volatile(
Index: linux-2.6.25.4-rt4/arch/arm/mm/fault.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mm/fault.c	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mm/fault.c	2008-05-29 09:46:58.000000000 -0400
@@ -239,7 +239,7 @@ out:
 	return fault;
 }
 
-static int __kprobes
+static int notrace __kprobes
 do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	struct task_struct *tsk;
@@ -256,7 +256,7 @@ do_page_fault(unsigned long addr, unsign
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (in_atomic() || !mm || current->pagefault_disabled)
 		goto no_context;
 
 	/*
@@ -338,7 +338,7 @@ no_context:
  * interrupt or a critical region, and should only copy the information
  * from the master page table, nothing more.
  */
-static int __kprobes
+static int notrace __kprobes
 do_translation_fault(unsigned long addr, unsigned int fsr,
 		     struct pt_regs *regs)
 {
@@ -381,7 +381,7 @@ bad_area:
  * Some section permission faults need to be handled gracefully.
  * They can happen due to a __{get,put}_user during an oops.
  */
-static int
+static notrace int
 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	do_bad_area(addr, fsr, regs);
@@ -391,7 +391,7 @@ do_sect_fault(unsigned long addr, unsign
 /*
  * This abort handler always returns "fault".
  */
-static int
+static notrace int
 do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	return 1;
@@ -446,7 +446,7 @@ static struct fsr_info {
 	{ do_bad,		SIGBUS,  0,		"unknown 31"			   }
 };
 
-void __init
+void __init notrace
 hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *),
 		int sig, const char *name)
 {
@@ -460,7 +460,7 @@ hook_fault_code(int nr, int (*fn)(unsign
 /*
  * Dispatch a data abort to the relevant handler.
  */
-asmlinkage void __exception
+asmlinkage void __exception notrace
 do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6);
@@ -479,7 +479,7 @@ do_DataAbort(unsigned long addr, unsigne
 	arm_notify_die("", regs, &info, fsr, 0);
 }
 
-asmlinkage void __exception
+asmlinkage void __exception notrace
 do_PrefetchAbort(unsigned long addr, struct pt_regs *regs)
 {
 	do_translation_fault(addr, 0, regs);
Index: linux-2.6.25.4-rt4/include/asm-arm/pgalloc.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/pgalloc.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/pgalloc.h	2008-05-29 09:46:16.000000000 -0400
@@ -111,7 +111,7 @@ static inline void __pmd_populate(pmd_t 
  *
  * Ensure that we always set both PMD entries.
  */
-static inline void
+static inline void notrace
 pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
 {
 	unsigned long pte_ptr = (unsigned long)ptep;
@@ -124,7 +124,7 @@ pmd_populate_kernel(struct mm_struct *mm
 	__pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE);
 }
 
-static inline void
+static inline void notrace
 pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
 {
 	__pmd_populate(pmdp, page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE);
Index: linux-2.6.25.4-rt4/include/asm-arm/timex.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/timex.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/timex.h	2008-05-29 09:47:14.000000000 -0400
@@ -16,9 +16,17 @@
 
 typedef unsigned long cycles_t;
 
+#ifndef mach_read_cycles
+ #define mach_read_cycles() (0)
+#ifdef CONFIG_EVENT_TRACE
+ #define mach_cycles_to_usecs(d) (d)
+ #define mach_usecs_to_cycles(d) (d)
+#endif
+#endif
+
 static inline cycles_t get_cycles (void)
 {
-	return 0;
+	return mach_read_cycles();
 }
 
 #endif
Index: linux-2.6.25.4-rt4/include/asm-arm/unistd.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/unistd.h	2008-05-29 09:45:55.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/unistd.h	2008-05-29 09:46:16.000000000 -0400
@@ -380,6 +380,10 @@
 #define __NR_eventfd			(__NR_SYSCALL_BASE+351)
 #define __NR_fallocate			(__NR_SYSCALL_BASE+352)
 
+#ifndef __ASSEMBLY__
+#define NR_syscalls			(__NR_fallocate + 1 - __NR_SYSCALL_BASE)
+#endif
+
 /*
  * The following SWIs are ARM private.
  */
Index: linux-2.6.25.4-rt4/arch/arm/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/Kconfig	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/Kconfig	2008-05-29 09:46:19.000000000 -0400
@@ -41,6 +41,10 @@ config GENERIC_CLOCKEVENTS_BROADCAST
 	depends on GENERIC_CLOCKEVENTS
 	default y if SMP && !LOCAL_TIMERS
 
+config STACKTRACE_SUPPORT
+	bool
+	default y
+
 config MMU
 	bool
 	default y
@@ -691,18 +695,7 @@ config LOCAL_TIMERS
 	  accounting to be spread across the timer interval, preventing a
 	  "thundering herd" at every timer tick.
 
-config PREEMPT
-	bool "Preemptible Kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load.
-
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
+source kernel/Kconfig.preempt
 
 config NO_IDLE_HZ
 	bool "Dynamic tick timer"
Index: linux-2.6.25.4-rt4/arch/arm/lib/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/lib/Makefile	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/lib/Makefile	2008-05-29 09:46:17.000000000 -0400
@@ -41,6 +41,7 @@ lib-$(CONFIG_ARCH_RPC)		+= ecard.o io-ac
 lib-$(CONFIG_ARCH_CLPS7500)	+= io-acorn.o
 lib-$(CONFIG_ARCH_L7200)	+= io-acorn.o
 lib-$(CONFIG_ARCH_SHARK)	+= io-shark.o
+lib-$(CONFIG_STACKTRACE)	+= stacktrace.o
 
 $(obj)/csumpartialcopy.o:	$(obj)/csumpartialcopygeneric.S
 $(obj)/csumpartialcopyuser.o:	$(obj)/csumpartialcopygeneric.S
Index: linux-2.6.25.4-rt4/arch/arm/lib/stacktrace.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/arch/arm/lib/stacktrace.c	2008-05-29 09:46:17.000000000 -0400
@@ -0,0 +1,7 @@
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+
+void save_stack_trace(struct stack_trace *trace)
+{
+}
+
Index: linux-2.6.25.4-rt4/include/asm-arm/arch-ep93xx/timex.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/arch-ep93xx/timex.h	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/arch-ep93xx/timex.h	2008-05-29 09:46:17.000000000 -0400
@@ -1,5 +1,11 @@
 /*
  * linux/include/asm-arm/arch-ep93xx/timex.h
  */
+#include <asm-arm/arch-ep93xx/ep93xx-regs.h>
+#include <asm-arm/io.h>
 
 #define CLOCK_TICK_RATE		983040
+
+#define mach_read_cycles() __raw_readl(EP93XX_TIMER4_VALUE_LOW)
+#define mach_cycles_to_usecs(d) (((d) * ((1000000LL << 32) / CLOCK_TICK_RATE)) >> 32)
+#define mach_usecs_to_cycles(d) (((d) * (((long long)CLOCK_TICK_RATE << 32) / 1000000)) >> 32)
Index: linux-2.6.25.4-rt4/drivers/char/random.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/char/random.c	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/char/random.c	2008-05-29 09:46:17.000000000 -0400
@@ -580,8 +580,11 @@ static void add_timer_randomness(struct 
 	preempt_disable();
 	/* if over the trickle threshold, use only 1 in 4096 samples */
 	if (input_pool.entropy_count > trickle_thresh &&
-	    (__get_cpu_var(trickle_count)++ & 0xfff))
-		goto out;
+	    (__get_cpu_var(trickle_count)++ & 0xfff)) {
+		preempt_enable();
+		return;
+	}
+	preempt_enable();
 
 	sample.jiffies = jiffies;
 	sample.cycles = get_cycles();
@@ -626,9 +629,6 @@ static void add_timer_randomness(struct 
 
 	if(input_pool.entropy_count >= random_read_wakeup_thresh)
 		wake_up_interruptible(&random_read_wait);
-
-out:
-	preempt_enable();
 }
 
 void add_input_randomness(unsigned int type, unsigned int code,
Index: linux-2.6.25.4-rt4/drivers/char/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/char/Kconfig	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/char/Kconfig	2008-05-29 09:47:00.000000000 -0400
@@ -752,6 +752,46 @@ config JS_RTC
 	  To compile this driver as a module, choose M here: the
 	  module will be called js-rtc.
 
+config RTC_HISTOGRAM
+	bool "Real Time Clock Histogram Support"
+	default n
+	depends on RTC
+	---help---
+	  If you say Y here then the kernel will track the delivery and
+	  wakeup latency of /dev/rtc using tasks and will report a
+	  histogram to the kernel log when the application closes /dev/rtc.
+
+config BLOCKER
+	tristate "Priority Inheritance Debugging (Blocker) Device Support"
+	depends on X86
+	default y
+	---help---
+	  If you say Y here then a device will be created that the userspace
+	  pi_test suite uses to test and measure kernel locking primitives.
+
+config LPPTEST
+	tristate "Parallel Port Based Latency Measurement Device"
+	depends on !PARPORT && X86
+	default y
+	---help---
+	  If you say Y here then a device will be created that the userspace
+	  testlpp utility uses to measure IRQ latencies of a target system
+	  from an independent measurement system.
+
+	  NOTE: this code assumes x86 PCs and that the parallel port is
+	  bidirectional and is on IRQ 7.
+
+	  to use the device, both the target and the source system needs to
+	  run a kernel with CONFIG_LPPTEST enabled. To measure latencies,
+	  use the scripts/testlpp utility in your kernel source directory,
+	  and run it (as root) on the source system - it will start printing
+	  out the latencies it took to get a response from the target system:
+
+	    Latency of response: 12.2 usecs (121265 cycles)
+
+	  then generate various workloads on the target system to see how
+	  (worst-case-) latencies are impacted.
+
 config SGI_DS1286
 	tristate "SGI DS1286 RTC support"
 	depends on SGI_HAS_DS1286
@@ -1041,6 +1081,24 @@ config TELCLOCK
 	  /sys/devices/platform/telco_clock, with a number of files for
 	  controlling the behavior of this hardware.
 
+config RMEM
+	tristate "Access to physical memory via /dev/rmem"
+	default m
+	help
+	  The /dev/mem device only allows mmap() memory available to
+	  I/O mapped memory; it does not allow access to "real"
+	  physical memory.  The /dev/rmem device is a hack which does
+	  allow access to physical memory.  We use this instead of
+	  patching /dev/mem because we don't expect this functionality
+	  to ever be accepted into mainline.
+
+config ALLOC_RTSJ_MEM
+	tristate "RTSJ-specific hack to reserve memory"
+	default m
+	help
+	  The RTSJ TCK conformance test requires reserving some physical
+	  memory for testing /dev/rmem.
+
 config DEVPORT
 	bool
 	depends on !M68K
Index: linux-2.6.25.4-rt4/drivers/char/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/char/Makefile	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/char/Makefile	2008-05-29 09:47:00.000000000 -0400
@@ -86,6 +86,8 @@ obj-$(CONFIG_TOSHIBA)		+= toshiba.o
 obj-$(CONFIG_I8K)		+= i8k.o
 obj-$(CONFIG_DS1620)		+= ds1620.o
 obj-$(CONFIG_HW_RANDOM)		+= hw_random/
+obj-$(CONFIG_BLOCKER)		+= blocker.o
+obj-$(CONFIG_LPPTEST)		+= lpptest.o
 obj-$(CONFIG_COBALT_LCD)	+= lcd.o
 obj-$(CONFIG_PPDEV)		+= ppdev.o
 obj-$(CONFIG_NWBUTTON)		+= nwbutton.o
@@ -97,6 +99,7 @@ obj-$(CONFIG_CS5535_GPIO)	+= cs5535_gpio
 obj-$(CONFIG_GPIO_VR41XX)	+= vr41xx_giu.o
 obj-$(CONFIG_GPIO_TB0219)	+= tb0219.o
 obj-$(CONFIG_TELCLOCK)		+= tlclk.o
+obj-$(CONFIG_RMEM)		+= rmem.o
 
 obj-$(CONFIG_MWAVE)		+= mwave/
 obj-$(CONFIG_AGP)		+= agp/
@@ -112,6 +115,8 @@ obj-$(CONFIG_PS3_FLASH)		+= ps3flash.o
 obj-$(CONFIG_JS_RTC)		+= js-rtc.o
 js-rtc-y = rtc.o
 
+obj-$(CONFIG_ALLOC_RTSJ_MEM) += alloc_rtsj_mem.o
+
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
 
Index: linux-2.6.25.4-rt4/drivers/char/blocker.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/drivers/char/blocker.c	2008-05-29 09:46:17.000000000 -0400
@@ -0,0 +1,109 @@
+/*
+ * priority inheritance testing device
+ */
+
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/timex.h>
+#include <linux/sched.h>
+
+#define BLOCKER_MINOR		221
+
+#define BLOCK_IOCTL		4245
+#define BLOCK_SET_DEPTH		4246
+
+#define BLOCKER_MAX_LOCK_DEPTH		10
+
+void loop(int loops)
+{
+	int i;
+
+	for (i = 0; i < loops; i++)
+		get_cycles();
+}
+
+static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH];
+
+static unsigned int lock_depth = 1;
+
+void do_the_lock_and_loop(unsigned int args)
+{
+	int i, max;
+
+	if (rt_task(current))
+		max = lock_depth;
+	else if (lock_depth > 1)
+		max = (current->pid % lock_depth) + 1;
+	else
+		max = 1;
+
+	/* Always lock from the top down */
+	for (i = max-1; i >= 0; i--)
+		 spin_lock(&blocker_lock[i]);
+	loop(args);
+	for (i = 0; i < max; i++)
+		spin_unlock(&blocker_lock[i]);
+}
+
+static int blocker_open(struct inode *in, struct file *file)
+{
+	printk(KERN_INFO "blocker_open called\n");
+
+	return 0;
+}
+
+static long blocker_ioctl(struct file *file,
+			  unsigned int cmd, unsigned long args)
+{
+	switch(cmd) {
+	case BLOCK_IOCTL:
+		do_the_lock_and_loop(args);
+		return 0;
+	case BLOCK_SET_DEPTH:
+		if (args >= BLOCKER_MAX_LOCK_DEPTH)
+			return -EINVAL;
+		lock_depth = args;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct file_operations blocker_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.unlocked_ioctl = blocker_ioctl,
+	.open		= blocker_open,
+};
+
+static struct miscdevice blocker_dev =
+{
+	BLOCKER_MINOR,
+	"blocker",
+	&blocker_fops
+};
+
+static int __init blocker_init(void)
+{
+	int i;
+
+	if (misc_register(&blocker_dev))
+		return -ENODEV;
+
+	for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++)
+		spin_lock_init(blocker_lock + i);
+
+	return 0;
+}
+
+void __exit blocker_exit(void)
+{
+	printk(KERN_INFO "blocker device uninstalled\n");
+	misc_deregister(&blocker_dev);
+}
+
+module_init(blocker_init);
+module_exit(blocker_exit);
+
+MODULE_LICENSE("GPL");
+
Index: linux-2.6.25.4-rt4/drivers/char/lpptest.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/drivers/char/lpptest.c	2008-05-29 09:46:17.000000000 -0400
@@ -0,0 +1,178 @@
+/*
+ * /dev/lpptest device: test IRQ handling latencies over parallel port
+ *
+ *      Copyright (C) 2005 Thomas Gleixner, Ingo Molnar
+ *
+ * licensed under the GPL
+ *
+ * You need to have CONFIG_PARPORT disabled for this device, it is a
+ * completely self-contained device that assumes sole ownership of the
+ * parallel port.
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/rtc.h>
+
+/*
+ * API wrappers so that the code can be shared with the -rt tree:
+ */
+#ifndef local_irq_disable
+# define local_irq_disable	local_irq_disable
+# define local_irq_enable	local_irq_enable
+#endif
+
+#ifndef IRQ_NODELAY
+# define IRQ_NODELAY		0
+# define IRQF_NODELAY		0
+#endif
+
+/*
+ * Driver:
+ */
+#define LPPTEST_CHAR_MAJOR 245
+#define LPPTEST_DEVICE_NAME "lpptest"
+
+#define LPPTEST_IRQ 7
+
+#define LPPTEST_TEST    _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long)
+#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long)
+#define LPPTEST_ENABLE  _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long)
+
+static char dev_id[] = "lpptest";
+
+#define INIT_PORT()	outb(0x04, 0x37a)
+#define ENABLE_IRQ()	outb(0x10, 0x37a)
+#define DISABLE_IRQ()	outb(0, 0x37a)
+
+static unsigned char out = 0x5a;
+
+/**
+ * Interrupt handler. Flip a bit in the reply.
+ */
+static int lpptest_irq (int irq, void *dev_id)
+{
+	out ^= 0xff;
+	outb(out, 0x378);
+
+	return IRQ_HANDLED;
+}
+
+static cycles_t test_response(void)
+{
+	cycles_t now, end;
+	unsigned char in;
+	int timeout = 0;
+
+	local_irq_disable();
+	in = inb(0x379);
+	inb(0x378);
+	outb(0x08, 0x378);
+	now = get_cycles();
+	while(1) {
+		if (inb(0x379) != in)
+			break;
+		if (timeout++ > 1000000) {
+			outb(0x00, 0x378);
+			local_irq_enable();
+
+			return 0;
+		}
+	}
+	end = get_cycles();
+	outb(0x00, 0x378);
+	local_irq_enable();
+
+	return end - now;
+}
+
+static int lpptest_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static int lpptest_close(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param)
+{
+	int retval = 0;
+
+	switch (ioctl_num) {
+
+	case LPPTEST_DISABLE:
+		DISABLE_IRQ();
+		break;
+
+	case LPPTEST_ENABLE:
+		ENABLE_IRQ();
+		break;
+
+	case LPPTEST_TEST: {
+
+		cycles_t diff = test_response();
+		if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff)))
+			goto errcpy;
+		break;
+	}
+	default: retval = -EINVAL;
+	}
+
+	return retval;
+
+ errcpy:
+	return -EFAULT;
+}
+
+static struct file_operations lpptest_dev_fops = {
+	.ioctl = lpptest_ioctl,
+	.open = lpptest_open,
+	.release = lpptest_close,
+};
+
+static int __init lpptest_init (void)
+{
+	if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops))
+	{
+		printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n",
+		       LPPTEST_CHAR_MAJOR);
+		return -EAGAIN;
+	}
+
+	if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) {
+		printk (KERN_WARNING "lpptest: irq %d in use. Unload parport module!\n", LPPTEST_IRQ);
+		unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME);
+		return -EAGAIN;
+	}
+	irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY;
+	irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED;
+
+	INIT_PORT();
+	ENABLE_IRQ();
+
+	return 0;
+}
+module_init (lpptest_init);
+
+static void __exit lpptest_exit (void)
+{
+	DISABLE_IRQ();
+
+	free_irq(LPPTEST_IRQ, dev_id);
+	unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME);
+}
+module_exit (lpptest_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("lpp test module");
+
Index: linux-2.6.25.4-rt4/drivers/char/rtc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/char/rtc.c	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/char/rtc.c	2008-05-29 09:46:50.000000000 -0400
@@ -94,6 +94,32 @@ static unsigned long rtc_port;
 static int rtc_irq = PCI_IRQ_NONE;
 #endif
 
+#ifdef CONFIG_MIPS
+# include <asm/time.h>
+#endif
+
+#ifdef CONFIG_RTC_HISTOGRAM
+
+static cycles_t last_interrupt_time;
+
+#include <asm/timex.h>
+
+#define CPU_MHZ		(cpu_khz / 1000)
+
+#define HISTSIZE	10000
+static int histogram[HISTSIZE];
+
+static int rtc_state;
+
+enum rtc_states {
+	S_STARTUP,		/* First round - let the application start */
+	S_IDLE,			/* Waiting for an interrupt */
+	S_WAITING_FOR_READ,	/* Signal delivered. waiting for rtc_read() */
+	S_READ_MISSED,		/* Signal delivered, read() deadline missed */
+};
+
+#endif
+
 #ifdef	CONFIG_HPET_RTC_IRQ
 #undef	RTC_IRQ
 #endif
@@ -224,7 +250,146 @@ static inline unsigned char rtc_is_updat
 	return uip;
 }
 
+#ifndef RTC_IRQ
+# undef CONFIG_RTC_HISTOGRAM
+#endif
+
+static inline void rtc_open_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	int i;
+
+	last_interrupt_time = 0;
+	rtc_state = S_STARTUP;
+	rtc_irq_data = 0;
+
+	for (i = 0; i < HISTSIZE; i++)
+		histogram[i] = 0;
+#endif
+}
+
+static inline void rtc_wake_event(void)
+{
+#ifndef CONFIG_RTC_HISTOGRAM
+	kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+#else
+	if (!(rtc_status & RTC_IS_OPEN))
+		return;
+
+	switch (rtc_state) {
+	/* Startup */
+	case S_STARTUP:
+		kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+		break;
+	/* Waiting for an interrupt */
+	case S_IDLE:
+		kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+		last_interrupt_time = get_cycles();
+		rtc_state = S_WAITING_FOR_READ;
+		break;
+
+	/* Signal has been delivered. waiting for rtc_read() */
+	case S_WAITING_FOR_READ:
+		/*
+		 * Well foo.  The usermode application didn't
+		 * schedule and read in time.
+		 */
+		last_interrupt_time = get_cycles();
+		rtc_state = S_READ_MISSED;
+		printk("Read missed before next interrupt\n");
+		break;
+	/* Signal has been delivered, read() deadline was missed */
+	case S_READ_MISSED:
+		/*
+		 * Not much we can do here.  We're waiting for the usermode
+		 * application to read the rtc
+		 */
+		last_interrupt_time = get_cycles();
+		break;
+	}
+#endif
+}
+
+static inline void rtc_read_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	cycles_t now = get_cycles();
+
+	switch (rtc_state) {
+	/* Startup */
+	case S_STARTUP:
+		rtc_state = S_IDLE;
+		break;
+
+	/* Waiting for an interrupt */
+	case S_IDLE:
+		printk("bug in rtc_read(): called in state S_IDLE!\n");
+		break;
+	case S_WAITING_FOR_READ:	/*
+					 * Signal has been delivered.
+					 * waiting for rtc_read()
+					 */
+		/*
+		 * Well done
+		 */
+	case S_READ_MISSED:		/*
+					 * Signal has been delivered, read()
+					 * deadline was missed
+					 */
+		/*
+		 * So, you finally got here.
+		 */
+		if (!last_interrupt_time)
+			printk("bug in rtc_read(): last_interrupt_time = 0\n");
+		rtc_state = S_IDLE;
+		{
+			cycles_t latency = now - last_interrupt_time;
+			unsigned long delta;	/* Microseconds */
+
+			delta = latency;
+			delta /= CPU_MHZ;
+
+			if (delta > 1000 * 1000) {
+				printk("rtc: eek\n");
+			} else {
+				unsigned long slot = delta;
+				if (slot >= HISTSIZE)
+					slot = HISTSIZE - 1;
+				histogram[slot]++;
+				if (delta > 2000)
+					printk("wow!  That was a "
+							"%ld millisec bump\n",
+						delta / 1000);
+			}
+		}
+		rtc_state = S_IDLE;
+		break;
+	}
+#endif
+}
+
+static inline void rtc_close_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	int i = 0;
+	unsigned long total = 0;
+
+	for (i = 0; i < HISTSIZE; i++)
+		total += histogram[i];
+	if (!total)
+		return;
+
+	printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n",
+		current->comm, current->pid, total);
+	for (i = 0; i < HISTSIZE; i++) {
+		if (histogram[i])
+			printk("%d %d\n", i, histogram[i]);
+	}
+#endif
+}
+
 #ifdef RTC_IRQ
+
 /*
  *	A very tiny interrupt handler. It runs with IRQF_DISABLED set,
  *	but there is possibility of conflicting with the set_rtc_mmss()
@@ -268,9 +433,9 @@ irqreturn_t rtc_interrupt(int irq, void 
 	if (rtc_callback)
 		rtc_callback->func(rtc_callback->private_data);
 	spin_unlock(&rtc_task_lock);
-	wake_up_interruptible(&rtc_wait);
 
-	kill_fasync(&rtc_async_queue, SIGIO, POLL_IN);
+	rtc_wake_event();
+	wake_up_interruptible(&rtc_wait);
 
 	return IRQ_HANDLED;
 }
@@ -380,6 +545,8 @@ static ssize_t rtc_read(struct file *fil
 		schedule();
 	} while (1);
 
+	rtc_read_event();
+
 	if (count == sizeof(unsigned int)) {
 		retval = put_user(data,
 				  (unsigned int __user *)buf) ?: sizeof(int);
@@ -627,6 +794,11 @@ static int rtc_do_ioctl(unsigned int cmd
 		save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
 		CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
 
+		/*
+		 * Make CMOS date writes nonpreemptible even on PREEMPT_RT.
+		 * There's a limit to everything! =B-)
+		 */
+		preempt_disable();
 #ifdef CONFIG_MACH_DECSTATION
 		CMOS_WRITE(real_yrs, RTC_DEC_YEAR);
 #endif
@@ -636,6 +808,7 @@ static int rtc_do_ioctl(unsigned int cmd
 		CMOS_WRITE(hrs, RTC_HOURS);
 		CMOS_WRITE(min, RTC_MINUTES);
 		CMOS_WRITE(sec, RTC_SECONDS);
+		preempt_enable();
 
 		CMOS_WRITE(save_control, RTC_CONTROL);
 		CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
@@ -737,6 +910,7 @@ static int rtc_open(struct inode *inode,
 	if (rtc_status & RTC_IS_OPEN)
 		goto out_busy;
 
+	rtc_open_event();
 	rtc_status |= RTC_IS_OPEN;
 
 	rtc_irq_data = 0;
@@ -790,6 +964,7 @@ no_irq:
 	rtc_irq_data = 0;
 	rtc_status &= ~RTC_IS_OPEN;
 	spin_unlock_irq(&rtc_lock);
+	rtc_close_event();
 
 	return 0;
 }
@@ -1199,10 +1374,12 @@ static void rtc_dropped_irq(unsigned lon
 
 	spin_unlock_irq(&rtc_lock);
 
+#ifndef CONFIG_PREEMPT_RT
 	if (printk_ratelimit()) {
 		printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
 			freq);
 	}
+#endif
 
 	/* Now we have new data */
 	wake_up_interruptible(&rtc_wait);
Index: linux-2.6.25.4-rt4/scripts/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/scripts/Makefile	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/scripts/Makefile	2008-05-29 09:47:05.000000000 -0400
@@ -12,6 +12,12 @@ hostprogs-$(CONFIG_LOGO)         += pnmt
 hostprogs-$(CONFIG_VT)           += conmakehash
 hostprogs-$(CONFIG_PROM_CONSOLE) += conmakehash
 hostprogs-$(CONFIG_IKCONFIG)     += bin2c
+HOST_OS := $(shell uname)
+ifeq ($(HOST_OS),Linux)
+ifdef CONFIG_LPPTEST
+hostprogs-y      += testlpp
+endif
+endif
 
 always		:= $(hostprogs-y) $(hostprogs-m)
 
Index: linux-2.6.25.4-rt4/scripts/testlpp.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/scripts/testlpp.c	2008-05-29 09:46:17.000000000 -0400
@@ -0,0 +1,159 @@
+/*
+ * testlpp.c: use the /dev/lpptest device to test IRQ handling
+ *            latencies over parallel port
+ *
+ *      Copyright (C) 2005 Thomas Gleixner
+ *
+ * licensed under the GPL
+ */
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define LPPTEST_CHAR_MAJOR 245
+#define LPPTEST_DEVICE_NAME "lpptest"
+
+#define LPPTEST_TEST    _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long)
+#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long)
+#define LPPTEST_ENABLE  _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long)
+
+#define HIST_SIZE 10000
+
+static int hist_total;
+static unsigned long hist[HIST_SIZE];
+
+static void hist_hit(unsigned long usecs)
+{
+	hist_total++;
+	if (usecs >= HIST_SIZE-1)
+		hist[HIST_SIZE-1]++;
+	else
+		hist[usecs]++;
+}
+
+static void print_hist(void)
+{
+	int i;
+
+	printf("LPP latency histogram:\n");
+
+	for (i = 0; i < HIST_SIZE; i++) {
+		if (hist[i])
+			printf("%3d usecs: %9ld\n", i, hist[i]);
+	}
+}
+
+static inline unsigned long long int rdtsc(void)
+{
+	unsigned long long int x, y;
+	for (;;) {
+		__asm__ volatile ("rdtsc" : "=A" (x));
+		__asm__ volatile ("rdtsc" : "=A" (y));
+		if (y - x < 1000)
+			return y;
+	}
+}
+
+static unsigned long long calibrate_loop(void)
+{
+	unsigned long long mytime1, mytime2;
+
+	mytime1 = rdtsc();
+	usleep(500000);
+	mytime2 = rdtsc();
+
+	return (mytime2 - mytime1) * 2;
+}
+
+#define time_to_usecs(time) ((double)time*1000000.0/(double)cycles_per_sec)
+
+#define time_to_usecs_l(time) (long)(time*1000000/cycles_per_sec)
+
+int fd, total;
+unsigned long long tim, sum_tim, min_tim = -1ULL, max_tim, cycles_per_sec;
+
+void cleanup(int sig)
+{
+	ioctl (fd, LPPTEST_ENABLE, &tim);
+	if (sig)
+		printf("[ interrupted - exiting ]\n");
+	printf("\ntotal number of responses: %d\n", total);
+	printf("average reponse latency:   %.2lf usecs\n",
+		time_to_usecs(sum_tim/total));
+	printf("minimum latency:           %.2lf usecs\n",
+			time_to_usecs(min_tim));
+	printf("maximum latency:           %.2lf usecs\n",
+			time_to_usecs(max_tim));
+	print_hist();
+	exit(0);
+}
+
+#define HZ 3000
+
+int main (int argc, char **argv)
+{
+	unsigned int nr_requests = 0;
+
+	if (argc > 2) {
+		fprintf(stderr, "usage: testlpp [<nr_of_requests>]\n");
+		exit(-1);
+	}
+	if (argc == 2)
+		nr_requests = atol(argv[1]);
+
+	if (getuid() != 0) {
+		fprintf(stderr, "need to run as root!\n");
+		exit(-1);
+	}
+	mknod("/dev/lpptest", S_IFCHR|0666, makedev(245, 1));
+
+	fd = open("/dev/lpptest", O_RDWR);
+	if (fd == -1) {
+		fprintf(stderr, "could not open /dev/lpptest, your kernel doesnt have CONFIG_LPPTEST enabled?\n");
+		exit(-1);
+	}
+
+	signal(SIGINT,&cleanup);
+
+	ioctl (fd, LPPTEST_DISABLE, &tim);
+
+	fprintf(stderr, "calibrating cycles to usecs: ");
+	cycles_per_sec = calibrate_loop();
+	fprintf(stderr, "%lld cycles per usec\n", cycles_per_sec/1000000);
+	if (nr_requests)
+		fprintf(stderr, "[max # of requests: %u]\n", nr_requests);
+	fprintf(stderr, "starting %dHz test, hit Ctrl-C to stop:\n\n", HZ);
+
+	while(1) {
+		ioctl (fd, LPPTEST_TEST, &tim);
+		if (tim == 0)
+			printf ("No response from target.\n");
+		else {
+			hist_hit(time_to_usecs_l(tim));
+			if (tim > max_tim) {
+				printf ("new max latency: %.2lf usecs (%Ld cycles)\n", time_to_usecs(tim), tim);
+				max_tim = tim;
+			}
+			if (tim < min_tim)
+				min_tim = tim;
+			total++;
+			if (total == nr_requests)
+				break;
+			sum_tim += tim;
+		}
+		usleep(1000000/HZ);
+	}
+	cleanup(0);
+
+	return 0;
+}
+
+
Index: linux-2.6.25.4-rt4/include/linux/lockdep.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/lockdep.h	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/lockdep.h	2008-05-29 09:47:25.000000000 -0400
@@ -304,6 +304,9 @@ extern void lock_acquire(struct lockdep_
 extern void lock_release(struct lockdep_map *lock, int nested,
 			 unsigned long ip);
 
+extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass,
+			      unsigned long ip);
+
 # define INIT_LOCKDEP				.lockdep_recursion = 0,
 
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
@@ -320,6 +323,7 @@ static inline void lockdep_on(void)
 
 # define lock_acquire(l, s, t, r, c, i)		do { } while (0)
 # define lock_release(l, n, i)			do { } while (0)
+# define lock_set_subclass(l, s, i)		do { } while (0)
 # define lockdep_init()				do { } while (0)
 # define lockdep_info()				do { } while (0)
 # define lockdep_init_map(lock, name, key, sub)	do { (void)(key); } while (0)
@@ -357,6 +361,38 @@ do {								\
 	lock_acquired(&(_lock)->dep_map);			\
 } while (0)
 
+#define LOCK_CONTENDED_RT(_lock, f_try, f_lock)			\
+do {								\
+	if (!f_try(&(_lock)->lock)) {				\
+		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
+		f_lock(&(_lock)->lock);				\
+	}							\
+	lock_acquired(&(_lock)->dep_map);			\
+} while (0)
+
+
+#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock)		\
+({								\
+ 	int ret = 0;						\
+	if (!f_try(&(_lock)->lock)) {				\
+		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
+		ret = f_lock(&(_lock)->lock);			\
+	}							\
+	if (!ret)						\
+		lock_acquired(&(_lock)->dep_map);		\
+ 	ret;							\
+})
+
+#define LOCK_CONTENDED_RT_RW(_lock, f_try, f_lock)		\
+do {								\
+	if (!f_try(&(_lock)->owners)) {				\
+		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
+		f_lock(&(_lock)->owners);			\
+	}							\
+	lock_acquired(&(_lock)->dep_map);			\
+} while (0)
+
+
 #else /* CONFIG_LOCK_STAT */
 
 #define lock_contended(lockdep_map, ip) do {} while (0)
@@ -365,6 +401,15 @@ do {								\
 #define LOCK_CONTENDED(_lock, try, lock) \
 	lock(_lock)
 
+#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \
+	f_lock(&(_lock)->lock)
+
+#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \
+	f_lock(&(_lock)->lock)
+
+#define LOCK_CONTENDED_RT_RW(_lock, f_try, f_lock) \
+	f_lock(&(_lock)->owners)
+
 #endif /* CONFIG_LOCK_STAT */
 
 #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS)
Index: linux-2.6.25.4-rt4/kernel/lockdep_internals.h
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/lockdep_internals.h	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/lockdep_internals.h	2008-05-29 09:46:18.000000000 -0400
@@ -15,12 +15,12 @@
  * table (if it's not there yet), and we check it for lock order
  * conflicts and deadlocks.
  */
-#define MAX_LOCKDEP_ENTRIES	8192UL
+#define MAX_LOCKDEP_ENTRIES	16384UL
 
 #define MAX_LOCKDEP_KEYS_BITS	11
 #define MAX_LOCKDEP_KEYS	(1UL << MAX_LOCKDEP_KEYS_BITS)
 
-#define MAX_LOCKDEP_CHAINS_BITS	14
+#define MAX_LOCKDEP_CHAINS_BITS	15
 #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
 
 /*
Index: linux-2.6.25.4-rt4/drivers/net/loopback.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/loopback.c	2008-05-29 09:45:54.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/loopback.c	2008-05-29 09:46:50.000000000 -0400
@@ -152,13 +152,13 @@ static int loopback_xmit(struct sk_buff 
 #endif
 	dev->last_rx = jiffies;
 
-	/* it's OK to use per_cpu_ptr() because BHs are off */
 	pcpu_lstats = netdev_priv(dev);
-	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
+	lb_stats = per_cpu_ptr(pcpu_lstats, get_cpu());
 	lb_stats->bytes += skb->len;
 	lb_stats->packets++;
+	put_cpu();
 
-	netif_rx(skb);
+	netif_rx_ni(skb);
 
 	return 0;
 }
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/time.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/time.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/time.c	2008-05-29 09:46:18.000000000 -0400
@@ -796,7 +796,7 @@ static cycle_t rtc_read(void)
 	return (cycle_t)get_rtc();
 }
 
-static cycle_t timebase_read(void)
+static cycle_t notrace timebase_read(void)
 {
 	return (cycle_t)get_tb();
 }
Index: linux-2.6.25.4-rt4/include/asm-arm/atomic.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/atomic.h	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/atomic.h	2008-05-29 09:47:33.000000000 -0400
@@ -114,6 +114,46 @@ static inline void atomic_clear_mask(uns
 	: "cc");
 }
 
+/*
+ * Atomic compare and exchange.
+ */
+#define __HAVE_ARCH_CMPXCHG     1
+
+extern unsigned long wrong_size_cmpxchg(volatile void *ptr);
+
+static inline unsigned long __cmpxchg(volatile void *ptr,
+				unsigned long old,
+				unsigned long new, int size)
+{
+	volatile unsigned long *p = ptr;
+
+	if (size == 4) {
+		unsigned long oldval, res;
+
+		do {
+			__asm__ __volatile__("@ atomic_cmpxchg\n"
+			"ldrex  %1, [%2]\n"
+			"mov    %0, #0\n"
+			"teq    %1, %3\n"
+			"strexeq %0, %4, [%2]\n"
+			: "=&r" (res), "=&r" (oldval)
+			: "r" (p), "Ir" (old), "r" (new)
+			: "cc");
+		} while (res);
+
+		return oldval;
+	} else
+		return wrong_size_cmpxchg(ptr);
+}
+
+#define cmpxchg__disabled(ptr,o,n)						\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,	\
+		 (unsigned long)_n_, sizeof(*(ptr)));			\
+})
+
 #else /* ARM_ARCH_6 */
 
 #include <asm/system.h>
@@ -173,6 +213,41 @@ static inline void atomic_clear_mask(uns
 	raw_local_irq_restore(flags);
 }
 
+#ifndef CONFIG_SMP
+/*
+ * Atomic compare and exchange.
+ */
+#define __HAVE_ARCH_CMPXCHG	1
+
+extern unsigned long wrong_size_cmpxchg(volatile void *ptr);
+
+static inline unsigned long __cmpxchg(volatile void *ptr,
+				    unsigned long old,
+				    unsigned long new, int size)
+{
+	unsigned long flags, prev;
+	volatile unsigned long *p = ptr;
+
+	if (size == 4) {
+		raw_local_irq_save(flags);
+		if ((prev = *p) == old)
+			*p = new;
+		raw_local_irq_restore(flags);
+		return(prev);
+	} else
+		return wrong_size_cmpxchg(ptr);
+}
+
+#define cmpxchg(ptr,o,n)					  	\
+({									\
+     __typeof__(*(ptr)) _o_ = (o);					\
+     __typeof__(*(ptr)) _n_ = (n);					\
+     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,		\
+				(unsigned long)_n_, sizeof(*(ptr)));	\
+})
+
+#endif
+
 #endif /* __LINUX_ARM_ARCH__ */
 
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
Index: linux-2.6.25.4-rt4/include/asm-arm/futex.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/futex.h	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/futex.h	2008-05-29 09:46:35.000000000 -0400
@@ -1,6 +1,125 @@
-#ifndef _ASM_FUTEX_H
-#define _ASM_FUTEX_H
+#ifndef _ASM_ARM_FUTEX_H
+#define _ASM_ARM_FUTEX_H
 
-#include <asm-generic/futex.h>
+#ifdef __KERNEL__
 
+#include <linux/futex.h>
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+
+extern raw_spinlock_t  futex_atomic_lock;
+
+#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
+	__asm__ __volatile__ (						\
+	"1:	ldrt	%1, [%2]				\n"	\
+		insn							\
+	"2:	strt	%0, [%2]				\n"	\
+	"	mov	%0, #0					\n"	\
+	"3:							\n"	\
+	"	.section __ex_table, \"a\"			\n"	\
+	"	.align	3					\n"	\
+	"	.long	1b, 4f, 2b, 4f				\n"	\
+	"	.previous					\n"	\
+	"	.section .fixup,\"ax\"				\n"	\
+	"4:	mov	%0, %4					\n"	\
+	"	b	3b					\n"	\
+	"	.previous"						\
+	: "=&r" (ret), "=&r" (oldval)					\
+	: "r" (uaddr), "r" (oparg), "Ir" (-EFAULT)			\
+	: "cc", "memory")
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (!access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	pagefault_disable();
+
+	spin_lock(&futex_atomic_lock);
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		__futex_atomic_op("	mov	%0, %3\n",
+				  ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ADD:
+		__futex_atomic_op("	add	%0, %1, %3\n",
+				  ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_OR:
+		__futex_atomic_op("	orr	%0, %1, %3\n",
+				  ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ANDN:
+		__futex_atomic_op("	and	%0, %1, %3\n",
+				  ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_XOR:
+		__futex_atomic_op("	eor	%0, %1, %3\n",
+				  ret, oldval, uaddr, oparg);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	spin_unlock(&futex_atomic_lock);
+
+	pagefault_enable();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+static inline int
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+{
+	int val;
+
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	spin_lock(&futex_atomic_lock);
+
+	__asm__ __volatile__( "@futex_atomic_cmpxchg_inatomic	\n"
+	"1:     ldrt    %0, [%3]				\n"
+	"       teq     %0, %1					\n"
+	"2:     streqt  %2, [%3]				\n"
+	"3:							\n"
+	"       .section __ex_table, \"a\"			\n"
+	"       .align 3					\n"
+	"       .long   1b, 4f, 2b, 4f				\n"
+	"       .previous					\n"
+	"       .section .fixup,\"ax\"				\n"
+	"4:     mov     %0, %4					\n"
+	"       b       3b					\n"
+	"       .previous"
+	: "=&r" (val)
+	: "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT)
+	: "cc");
+
+	spin_unlock(&futex_atomic_lock);
+
+	return val;
+}
+
+#endif
 #endif
Index: linux-2.6.25.4-rt4/arch/arm/kernel/process.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/process.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/process.c	2008-05-29 09:47:27.000000000 -0400
@@ -36,6 +36,8 @@
 #include <asm/uaccess.h>
 #include <asm/mach/time.h>
 
+DEFINE_RAW_SPINLOCK(futex_atomic_lock);
+
 static const char *processor_modes[] = {
   "USER_26", "FIQ_26" , "IRQ_26" , "SVC_26" , "UK4_26" , "UK5_26" , "UK6_26" , "UK7_26" ,
   "UK8_26" , "UK9_26" , "UK10_26", "UK11_26", "UK12_26", "UK13_26", "UK14_26", "UK15_26",
@@ -133,7 +135,7 @@ static void default_idle(void)
 		cpu_relax();
 	else {
 		local_irq_disable();
-		if (!need_resched()) {
+		if (!need_resched() && !need_resched_delayed()) {
 			timer_dyn_reprogram();
 			arch_idle();
 		}
@@ -165,13 +167,15 @@ void cpu_idle(void)
 			idle = default_idle;
 		leds_event(led_idle_start);
 		tick_nohz_stop_sched_tick();
-		while (!need_resched())
+		while (!need_resched() && !need_resched_delayed())
 			idle();
 		leds_event(led_idle_end);
+		local_irq_disable();
 		tick_nohz_restart_sched_tick();
-		preempt_enable_no_resched();
-		schedule();
+		__preempt_enable_no_resched();
+		__schedule();
 		preempt_disable();
+		local_irq_enable();
 	}
 }
 
Index: linux-2.6.25.4-rt4/include/linux/bottom_half.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/bottom_half.h	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/bottom_half.h	2008-05-29 09:47:02.000000000 -0400
@@ -1,10 +1,17 @@
 #ifndef _LINUX_BH_H
 #define _LINUX_BH_H
 
+#ifdef CONFIG_PREEMPT_HARDIRQS
+# define local_bh_disable()		do { } while (0)
+# define __local_bh_disable(ip)		do { } while (0)
+# define _local_bh_enable()		do { } while (0)
+# define local_bh_enable()		do { } while (0)
+# define local_bh_enable_ip(ip)		do { } while (0)
+#else
 extern void local_bh_disable(void);
-extern void __local_bh_enable(void);
 extern void _local_bh_enable(void);
 extern void local_bh_enable(void);
 extern void local_bh_enable_ip(unsigned long ip);
+#endif
 
 #endif /* _LINUX_BH_H */
Index: linux-2.6.25.4-rt4/include/linux/interrupt.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/interrupt.h	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/interrupt.h	2008-05-29 09:46:31.000000000 -0400
@@ -50,10 +50,12 @@
 #define IRQF_SAMPLE_RANDOM	0x00000040
 #define IRQF_SHARED		0x00000080
 #define IRQF_PROBE_SHARED	0x00000100
-#define IRQF_TIMER		0x00000200
+#define __IRQF_TIMER		0x00000200
 #define IRQF_PERCPU		0x00000400
 #define IRQF_NOBALANCING	0x00000800
 #define IRQF_IRQPOLL		0x00001000
+#define IRQF_NODELAY		0x00002000
+#define IRQF_TIMER		(__IRQF_TIMER | IRQF_NODELAY)
 
 typedef irqreturn_t (*irq_handler_t)(int, void *);
 
@@ -65,7 +67,7 @@ struct irqaction {
 	void *dev_id;
 	struct irqaction *next;
 	int irq;
-	struct proc_dir_entry *dir;
+	struct proc_dir_entry *dir, *threaded;
 };
 
 extern irqreturn_t no_action(int cpl, void *dev_id);
@@ -95,7 +97,7 @@ extern void devm_free_irq(struct device 
 #ifdef CONFIG_LOCKDEP
 # define local_irq_enable_in_hardirq()	do { } while (0)
 #else
-# define local_irq_enable_in_hardirq()	local_irq_enable()
+# define local_irq_enable_in_hardirq()	local_irq_enable_nort()
 #endif
 
 extern void disable_irq_nosync(unsigned int irq);
@@ -196,6 +198,7 @@ static inline int disable_irq_wake(unsig
 
 #ifndef __ARCH_SET_SOFTIRQ_PENDING
 #define set_softirq_pending(x) (local_softirq_pending() = (x))
+// FIXME: PREEMPT_RT: set_bit()?
 #define or_softirq_pending(x)  (local_softirq_pending() |= (x))
 #endif
 
@@ -257,6 +260,8 @@ enum
 	HRTIMER_SOFTIRQ,
 #endif
 	RCU_SOFTIRQ, 	/* Preferable RCU should always be the last softirq */
+	/* Entries after this are ignored in split softirq mode */
+	MAX_SOFTIRQ,
 };
 
 /* softirq mask and active fields moved to irq_cpustat_t in
@@ -269,13 +274,26 @@ struct softirq_action
 	void	*data;
 };
 
+#ifdef CONFIG_PREEMPT_HARDIRQS
+# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr)
+# define __do_raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
+#else
+# define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
+# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr)
+#endif
+
 asmlinkage void do_softirq(void);
 extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data);
 extern void softirq_init(void);
-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
+extern void wakeup_irqd(void);
 
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+extern void wait_for_softirq(int softirq);
+#else
+# define wait_for_softirq(x) do {} while(0)
+#endif
 
 /* Tasklets --- multithreaded analogue of BHs.
 
@@ -290,8 +308,9 @@ extern void raise_softirq(unsigned int n
      to be executed on some cpu at least once after this.
    * If the tasklet is already scheduled, but its excecution is still not
      started, it will be executed only once.
-   * If this tasklet is already running on another CPU (or schedule is called
-     from tasklet itself), it is rescheduled for later.
+   * If this tasklet is already running on another CPU, it is rescheduled
+     for later.
+   * Schedule must not be called from the tasklet itself (a lockup occurs)
    * Tasklet is strictly serialized wrt itself, but not
      wrt another tasklets. If client needs some intertask synchronization,
      he makes it with spinlocks.
@@ -316,27 +335,36 @@ struct tasklet_struct name = { NULL, 0, 
 enum
 {
 	TASKLET_STATE_SCHED,	/* Tasklet is scheduled for execution */
-	TASKLET_STATE_RUN	/* Tasklet is running (SMP only) */
+	TASKLET_STATE_RUN,	/* Tasklet is running (SMP only) */
+	TASKLET_STATE_PENDING	/* Tasklet is pending */
 };
 
-#ifdef CONFIG_SMP
+#define TASKLET_STATEF_SCHED	(1 << TASKLET_STATE_SCHED)
+#define TASKLET_STATEF_RUN	(1 << TASKLET_STATE_RUN)
+#define TASKLET_STATEF_PENDING	(1 << TASKLET_STATE_PENDING)
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
 static inline int tasklet_trylock(struct tasklet_struct *t)
 {
 	return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
 }
 
+static inline int tasklet_tryunlock(struct tasklet_struct *t)
+{
+	return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
+}
+
 static inline void tasklet_unlock(struct tasklet_struct *t)
 {
 	smp_mb__before_clear_bit(); 
 	clear_bit(TASKLET_STATE_RUN, &(t)->state);
 }
 
-static inline void tasklet_unlock_wait(struct tasklet_struct *t)
-{
-	while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
-}
+extern void tasklet_unlock_wait(struct tasklet_struct *t);
+
 #else
 #define tasklet_trylock(t) 1
+#define tasklet_tryunlock(t)	1
 #define tasklet_unlock_wait(t) do { } while (0)
 #define tasklet_unlock(t) do { } while (0)
 #endif
@@ -371,22 +399,14 @@ static inline void tasklet_disable(struc
 	smp_mb();
 }
 
-static inline void tasklet_enable(struct tasklet_struct *t)
-{
-	smp_mb__before_atomic_dec();
-	atomic_dec(&t->count);
-}
-
-static inline void tasklet_hi_enable(struct tasklet_struct *t)
-{
-	smp_mb__before_atomic_dec();
-	atomic_dec(&t->count);
-}
+extern  void tasklet_enable(struct tasklet_struct *t);
+extern  void tasklet_hi_enable(struct tasklet_struct *t);
 
 extern void tasklet_kill(struct tasklet_struct *t);
 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
 extern void tasklet_init(struct tasklet_struct *t,
 			 void (*func)(unsigned long), unsigned long data);
+void takeover_tasklets(unsigned int cpu);
 
 /*
  * Autoprobing for irqs:
@@ -446,4 +466,37 @@ static inline void init_irq_proc(void)
 
 int show_interrupts(struct seq_file *p, void *v);
 
+#ifdef CONFIG_PREEMPT_RT
+# define local_irq_disable_nort()	do { } while (0)
+# define local_irq_enable_nort()	do { } while (0)
+# define local_irq_enable_rt()		local_irq_enable()
+# define local_irq_save_nort(flags)	do { local_save_flags(flags); } while (0)
+# define local_irq_restore_nort(flags)	do { (void)(flags); } while (0)
+# define spin_lock_nort(lock)		do { } while (0)
+# define spin_unlock_nort(lock)		do { } while (0)
+# define spin_lock_bh_nort(lock)	do { } while (0)
+# define spin_unlock_bh_nort(lock)	do { } while (0)
+# define spin_lock_rt(lock)		spin_lock(lock)
+# define spin_unlock_rt(lock)		spin_unlock(lock)
+# define smp_processor_id_rt(cpu)	(cpu)
+# define in_atomic_rt()			(!oops_in_progress && \
+					  (in_atomic() || irqs_disabled()))
+# define read_trylock_rt(lock)		({read_lock(lock); 1; })
+#else
+# define local_irq_disable_nort()	local_irq_disable()
+# define local_irq_enable_nort()	local_irq_enable()
+# define local_irq_enable_rt()		do { } while (0)
+# define local_irq_save_nort(flags)	local_irq_save(flags)
+# define local_irq_restore_nort(flags)	local_irq_restore(flags)
+# define spin_lock_rt(lock)		do { } while (0)
+# define spin_unlock_rt(lock)		do { } while (0)
+# define spin_lock_nort(lock)		spin_lock(lock)
+# define spin_unlock_nort(lock)		spin_unlock(lock)
+# define spin_lock_bh_nort(lock)	spin_lock_bh(lock)
+# define spin_unlock_bh_nort(lock)	spin_unlock_bh(lock)
+# define smp_processor_id_rt(cpu)	smp_processor_id()
+# define in_atomic_rt()			0
+# define read_trylock_rt(lock)		read_trylock(lock)
+#endif
+
 #endif
Index: linux-2.6.25.4-rt4/kernel/softirq.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/softirq.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/softirq.c	2008-05-29 09:47:23.000000000 -0400
@@ -6,15 +6,23 @@
  *	Distribute under GPLv2.
  *
  *	Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ *
+ *	Softirq-split implemetation by
+ *	Copyright (C) 2005 Thomas Gleixner, Ingo Molnar
  */
 
 #include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
 #include <linux/kernel_stat.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/delay.h>
 #include <linux/mm.h>
 #include <linux/notifier.h>
 #include <linux/percpu.h>
+#include <linux/delay.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
@@ -48,7 +56,41 @@ EXPORT_SYMBOL(irq_stat);
 
 static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
 
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+struct softirqdata {
+	int			nr;
+	unsigned long		cpu;
+	struct task_struct	*tsk;
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+	wait_queue_head_t	wait;
+	int			running;
+#endif
+};
+
+static DEFINE_PER_CPU(struct softirqdata [MAX_SOFTIRQ], ksoftirqd);
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+/*
+ * Preempting the softirq causes cases that would not be a
+ * problem when the softirq is not preempted. That is a
+ * process may have code to spin while waiting for a softirq
+ * to finish on another CPU.  But if it happens that the
+ * process has preempted the softirq, this could cause a
+ * deadlock.
+ */
+void wait_for_softirq(int softirq)
+{
+	struct softirqdata *data = &__get_cpu_var(ksoftirqd)[softirq];
+	if (data->running) {
+		DECLARE_WAITQUEUE(wait, current);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		add_wait_queue(&data->wait, &wait);
+		if (data->running)
+			schedule();
+		remove_wait_queue(&data->wait, &wait);
+		__set_current_state(TASK_RUNNING);
+	}
+}
+#endif
 
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
@@ -56,15 +98,37 @@ static DEFINE_PER_CPU(struct task_struct
  * to the pending events, so lets the scheduler to balance
  * the softirq load for us.
  */
-static inline void wakeup_softirqd(void)
+static void wakeup_softirqd(int softirq)
 {
 	/* Interrupts are disabled: no need to stop preemption */
-	struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+	struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk;
 
-	if (tsk && tsk->state != TASK_RUNNING)
-		wake_up_process(tsk);
+	if (unlikely(!tsk))
+		return;
+	/*
+	 * Wake up the softirq task:
+	 */
+	wake_up_process(tsk);
+}
+
+/*
+ * Wake up the softirq threads which have work
+ */
+static void trigger_softirqs(void)
+{
+	u32 pending = local_softirq_pending();
+	int curr = 0;
+
+	while (pending) {
+		if (pending & 1)
+			wakeup_softirqd(curr);
+		pending >>= 1;
+		curr++;
+	}
 }
 
+#ifndef CONFIG_PREEMPT_HARDIRQS
+
 /*
  * This one is for softirq.c-internal use,
  * where hardirqs are disabled legitimately:
@@ -100,20 +164,6 @@ void local_bh_disable(void)
 
 EXPORT_SYMBOL(local_bh_disable);
 
-void __local_bh_enable(void)
-{
-	WARN_ON_ONCE(in_irq());
-
-	/*
-	 * softirqs should never be enabled by __local_bh_enable(),
-	 * it always nests inside local_bh_enable() sections:
-	 */
-	WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
-
-	sub_preempt_count(SOFTIRQ_OFFSET);
-}
-EXPORT_SYMBOL_GPL(__local_bh_enable);
-
 /*
  * Special-case - softirqs can safely be enabled in
  * cond_resched_softirq(), or by __do_softirq(),
@@ -138,7 +188,6 @@ void local_bh_enable(void)
 
 	WARN_ON_ONCE(in_irq());
 #endif
-	WARN_ON_ONCE(irqs_disabled());
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 	local_irq_save(flags);
@@ -196,6 +245,8 @@ void local_bh_enable_ip(unsigned long ip
 }
 EXPORT_SYMBOL(local_bh_enable_ip);
 
+#endif
+
 /*
  * We restart softirq processing MAX_SOFTIRQ_RESTART times,
  * and we fall back to softirqd after that.
@@ -205,52 +256,130 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  * we want to handle softirqs as soon as possible, but they
  * should not be able to lock up the box.
  */
-#define MAX_SOFTIRQ_RESTART 10
+#define MAX_SOFTIRQ_RESTART 20
 
-asmlinkage void __do_softirq(void)
+static DEFINE_PER_CPU(u32, softirq_running);
+
+static void ___do_softirq(const int same_prio_only)
 {
+	int max_restart = MAX_SOFTIRQ_RESTART, max_loops = MAX_SOFTIRQ_RESTART;
+	__u32 pending, available_mask, same_prio_skipped;
 	struct softirq_action *h;
-	__u32 pending;
-	int max_restart = MAX_SOFTIRQ_RESTART;
-	int cpu;
+	struct task_struct *tsk;
+	int cpu, softirq;
 
 	pending = local_softirq_pending();
 	account_system_vtime(current);
 
-	__local_bh_disable((unsigned long)__builtin_return_address(0));
-	trace_softirq_enter();
-
 	cpu = smp_processor_id();
 restart:
+	available_mask = -1;
+	softirq = 0;
+	same_prio_skipped = 0;
 	/* Reset the pending bitmask before enabling irqs */
 	set_softirq_pending(0);
 
-	local_irq_enable();
-
 	h = softirq_vec;
 
 	do {
+		u32 softirq_mask = 1 << softirq;
+
 		if (pending & 1) {
+			u32 preempt_count = preempt_count();
+
+#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS)
+			/*
+			 * If executed by a same-prio hardirq thread
+			 * then skip pending softirqs that belong
+			 * to softirq threads with different priority:
+			 */
+			if (same_prio_only) {
+				tsk = __get_cpu_var(ksoftirqd)[softirq].tsk;
+				if (tsk && tsk->normal_prio !=
+						current->normal_prio) {
+					same_prio_skipped |= softirq_mask;
+					available_mask &= ~softirq_mask;
+					goto next;
+				}
+			}
+#endif
+			/*
+			 * Is this softirq already being processed?
+			 */
+			if (per_cpu(softirq_running, cpu) & softirq_mask) {
+				available_mask &= ~softirq_mask;
+				goto next;
+			}
+			per_cpu(softirq_running, cpu) |= softirq_mask;
+			local_irq_enable();
+
 			h->action(h);
+			if (preempt_count != preempt_count()) {
+				print_symbol("BUG: softirq exited %s with wrong preemption count!\n", (unsigned long) h->action);
+				printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+				preempt_count() = preempt_count;
+			}
 			rcu_bh_qsctr_inc(cpu);
+			cond_resched_softirq_context();
+			local_irq_disable();
+			per_cpu(softirq_running, cpu) &= ~softirq_mask;
 		}
+next:
 		h++;
+		softirq++;
 		pending >>= 1;
 	} while (pending);
 
-	local_irq_disable();
-
+	or_softirq_pending(same_prio_skipped);
 	pending = local_softirq_pending();
-	if (pending && --max_restart)
-		goto restart;
+	if (pending & available_mask) {
+		if (--max_restart)
+			goto restart;
+		/*
+		 * With softirq threading there's no reason not to
+		 * finish the workload we have:
+		 */
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+		if (--max_loops) {
+			if (printk_ratelimit())
+				printk("INFO: softirq overload: %08x\n", pending);
+			max_restart = MAX_SOFTIRQ_RESTART;
+			goto restart;
+		}
+		if (printk_ratelimit())
+			printk("BUG: softirq loop! %08x\n", pending);
+#endif
+	}
 
 	if (pending)
-		wakeup_softirqd();
+		trigger_softirqs();
+}
+
+asmlinkage void __do_softirq(void)
+{
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+	/*
+	 * 'preempt harder'. Push all softirq processing off to ksoftirqd.
+	 */
+	if (softirq_preemption) {
+		if (local_softirq_pending())
+			trigger_softirqs();
+		return;
+	}
+#endif
+	/*
+	 * 'immediate' softirq execution:
+	 */
+	__local_bh_disable((unsigned long)__builtin_return_address(0));
+	trace_softirq_enter();
+
+	___do_softirq(0);
 
 	trace_softirq_exit();
 
 	account_system_vtime(current);
 	_local_bh_enable();
+
 }
 
 #ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -315,7 +444,7 @@ void irq_exit(void)
 		tick_nohz_stop_sched_tick();
 	rcu_irq_exit();
 #endif
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 }
 
 /*
@@ -323,19 +452,11 @@ void irq_exit(void)
  */
 inline void raise_softirq_irqoff(unsigned int nr)
 {
-	__raise_softirq_irqoff(nr);
+	__do_raise_softirq_irqoff(nr);
 
-	/*
-	 * If we're in an interrupt or softirq, we're done
-	 * (this also catches softirq-disabled code). We will
-	 * actually run the softirq once we return from
-	 * the irq or softirq.
-	 *
-	 * Otherwise we wake up ksoftirqd to make sure we
-	 * schedule the softirq soon.
-	 */
-	if (!in_interrupt())
-		wakeup_softirqd();
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+	wakeup_softirqd(nr);
+#endif
 }
 
 void raise_softirq(unsigned int nr)
@@ -364,14 +485,45 @@ struct tasklet_head
 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
 
+static void inline
+__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
+{
+	if (tasklet_trylock(t)) {
+again:
+		/* We may have been preempted before tasklet_trylock
+		 * and __tasklet_action may have already run.
+		 * So double check the sched bit while the takslet
+		 * is locked before adding it to the list.
+		 */
+		if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
+			WARN_ON(t->next != NULL);
+			t->next = head->list;
+			head->list = t;
+			raise_softirq_irqoff(nr);
+			tasklet_unlock(t);
+		} else {
+			/* This is subtle. If we hit the corner case above
+			 * It is possible that we get preempted right here,
+			 * and another task has successfully called
+			 * tasklet_schedule(), then this function, and
+			 * failed on the trylock. Thus we must be sure
+			 * before releasing the tasklet lock, that the
+			 * SCHED_BIT is clear. Otherwise the tasklet
+			 * may get its SCHED_BIT set, but not added to the
+			 * list
+			 */
+			if (!tasklet_tryunlock(t))
+				goto again;
+		}
+	}
+}
+
 void __tasklet_schedule(struct tasklet_struct *t)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	t->next = __get_cpu_var(tasklet_vec).list;
-	__get_cpu_var(tasklet_vec).list = t;
-	raise_softirq_irqoff(TASKLET_SOFTIRQ);
+	__tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ);
 	local_irq_restore(flags);
 }
 
@@ -382,81 +534,130 @@ void __tasklet_hi_schedule(struct taskle
 	unsigned long flags;
 
 	local_irq_save(flags);
-	t->next = __get_cpu_var(tasklet_hi_vec).list;
-	__get_cpu_var(tasklet_hi_vec).list = t;
-	raise_softirq_irqoff(HI_SOFTIRQ);
+	__tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ);
 	local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(__tasklet_hi_schedule);
 
-static void tasklet_action(struct softirq_action *a)
+void  tasklet_enable(struct tasklet_struct *t)
 {
-	struct tasklet_struct *list;
+	if (!atomic_dec_and_test(&t->count))
+		return;
+	if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
+		tasklet_schedule(t);
+}
 
-	local_irq_disable();
-	list = __get_cpu_var(tasklet_vec).list;
-	__get_cpu_var(tasklet_vec).list = NULL;
-	local_irq_enable();
+EXPORT_SYMBOL(tasklet_enable);
+
+void  tasklet_hi_enable(struct tasklet_struct *t)
+{
+	if (!atomic_dec_and_test(&t->count))
+		return;
+	if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
+		tasklet_hi_schedule(t);
+}
+
+EXPORT_SYMBOL(tasklet_hi_enable);
+
+static void
+__tasklet_action(struct softirq_action *a, struct tasklet_struct *list)
+{
+	int loops = 1000000;
 
 	while (list) {
 		struct tasklet_struct *t = list;
 
 		list = list->next;
+		/*
+		 * Should always succeed - after a tasklist got on the
+		 * list (after getting the SCHED bit set from 0 to 1),
+		 * nothing but the tasklet softirq it got queued to can
+		 * lock it:
+		 */
+		if (!tasklet_trylock(t)) {
+			WARN_ON(1);
+			continue;
+		}
+
+		t->next = NULL;
+
+		/*
+		 * If we cannot handle the tasklet because it's disabled,
+		 * mark it as pending. tasklet_enable() will later
+		 * re-schedule the tasklet.
+		 */
+		if (unlikely(atomic_read(&t->count))) {
+out_disabled:
+			/* implicit unlock: */
+			wmb();
+			t->state = TASKLET_STATEF_PENDING;
+			continue;
+		}
 
-		if (tasklet_trylock(t)) {
-			if (!atomic_read(&t->count)) {
-				if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
-					BUG();
-				t->func(t->data);
+		/*
+		 * After this point on the tasklet might be rescheduled
+		 * on another CPU, but it can only be added to another
+		 * CPU's tasklet list if we unlock the tasklet (which we
+		 * dont do yet).
+		 */
+		if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+			WARN_ON(1);
+
+again:
+		t->func(t->data);
+
+		/*
+		 * Try to unlock the tasklet. We must use cmpxchg, because
+		 * another CPU might have scheduled or disabled the tasklet.
+		 * We only allow the STATE_RUN -> 0 transition here.
+		 */
+		while (!tasklet_tryunlock(t)) {
+			/*
+			 * If it got disabled meanwhile, bail out:
+			 */
+			if (atomic_read(&t->count))
+				goto out_disabled;
+			/*
+			 * If it got scheduled meanwhile, re-execute
+			 * the tasklet function:
+			 */
+			if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+				goto again;
+			if (!--loops) {
+				printk("hm, tasklet state: %08lx\n", t->state);
+				WARN_ON(1);
 				tasklet_unlock(t);
-				continue;
+				break;
 			}
-			tasklet_unlock(t);
 		}
-
-		local_irq_disable();
-		t->next = __get_cpu_var(tasklet_vec).list;
-		__get_cpu_var(tasklet_vec).list = t;
-		__raise_softirq_irqoff(TASKLET_SOFTIRQ);
-		local_irq_enable();
 	}
 }
 
-static void tasklet_hi_action(struct softirq_action *a)
+static void tasklet_action(struct softirq_action *a)
 {
 	struct tasklet_struct *list;
 
 	local_irq_disable();
-	list = __get_cpu_var(tasklet_hi_vec).list;
-	__get_cpu_var(tasklet_hi_vec).list = NULL;
+	list = __get_cpu_var(tasklet_vec).list;
+	__get_cpu_var(tasklet_vec).list = NULL;
 	local_irq_enable();
 
-	while (list) {
-		struct tasklet_struct *t = list;
+	__tasklet_action(a, list);
+}
 
-		list = list->next;
+static void tasklet_hi_action(struct softirq_action *a)
+{
+	struct tasklet_struct *list;
 
-		if (tasklet_trylock(t)) {
-			if (!atomic_read(&t->count)) {
-				if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
-					BUG();
-				t->func(t->data);
-				tasklet_unlock(t);
-				continue;
-			}
-			tasklet_unlock(t);
-		}
+	local_irq_disable();
+	list = __get_cpu_var(tasklet_hi_vec).list;
+	__get_cpu_var(tasklet_hi_vec).list = NULL;
+	local_irq_enable();
 
-		local_irq_disable();
-		t->next = __get_cpu_var(tasklet_hi_vec).list;
-		__get_cpu_var(tasklet_hi_vec).list = t;
-		__raise_softirq_irqoff(HI_SOFTIRQ);
-		local_irq_enable();
-	}
+	__tasklet_action(a, list);
 }
 
-
 void tasklet_init(struct tasklet_struct *t,
 		  void (*func)(unsigned long), unsigned long data)
 {
@@ -476,7 +677,7 @@ void tasklet_kill(struct tasklet_struct 
 
 	while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
 		do
-			yield();
+			msleep(1);
 		while (test_bit(TASKLET_STATE_SCHED, &t->state));
 	}
 	tasklet_unlock_wait(t);
@@ -491,33 +692,98 @@ void __init softirq_init(void)
 	open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
 }
 
-static int ksoftirqd(void * __bind_cpu)
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+
+void tasklet_unlock_wait(struct tasklet_struct *t)
 {
+	while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
+		/*
+		 * Hack for now to avoid this busy-loop:
+		 */
+#ifdef CONFIG_PREEMPT_RT
+		msleep(1);
+#else
+		barrier();
+#endif
+	}
+}
+EXPORT_SYMBOL(tasklet_unlock_wait);
+
+#endif
+
+static int ksoftirqd(void * __data)
+{
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 };
+	struct softirqdata *data = __data;
+	u32 softirq_mask = (1 << data->nr);
+	struct softirq_action *h;
+	int cpu = data->cpu;
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+	init_waitqueue_head(&data->wait);
+#endif
+
+	sys_sched_setscheduler(current->pid, SCHED_FIFO, &param);
+	current->flags |= PF_SOFTIRQ;
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	while (!kthread_should_stop()) {
 		preempt_disable();
-		if (!local_softirq_pending()) {
-			preempt_enable_no_resched();
+		if (!(local_softirq_pending() & softirq_mask)) {
+sleep_more:
+			__preempt_enable_no_resched();
 			schedule();
 			preempt_disable();
 		}
 
 		__set_current_state(TASK_RUNNING);
 
-		while (local_softirq_pending()) {
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+		data->running = 1;
+#endif
+
+		while (local_softirq_pending() & softirq_mask) {
 			/* Preempt disable stops cpu going offline.
 			   If already offline, we'll be on wrong CPU:
 			   don't process */
-			if (cpu_is_offline((long)__bind_cpu))
+			if (cpu_is_offline(cpu))
 				goto wait_to_die;
-			do_softirq();
-			preempt_enable_no_resched();
+
+			local_irq_disable();
+			/*
+			 * Is the softirq already being executed by
+			 * a hardirq context?
+			 */
+			if (per_cpu(softirq_running, cpu) & softirq_mask) {
+				local_irq_enable();
+				set_current_state(TASK_INTERRUPTIBLE);
+				goto sleep_more;
+			}
+			per_cpu(softirq_running, cpu) |= softirq_mask;
+			__preempt_enable_no_resched();
+			set_softirq_pending(local_softirq_pending() & ~softirq_mask);
+			local_bh_disable();
+			local_irq_enable();
+
+			h = &softirq_vec[data->nr];
+			if (h)
+				h->action(h);
+			rcu_bh_qsctr_inc(data->cpu);
+
+			local_irq_disable();
+			per_cpu(softirq_running, cpu) &= ~softirq_mask;
+			_local_bh_enable();
+			local_irq_enable();
+
 			cond_resched();
 			preempt_disable();
 		}
 		preempt_enable();
 		set_current_state(TASK_INTERRUPTIBLE);
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+		data->running = 0;
+		wake_up(&data->wait);
+#endif
 	}
 	__set_current_state(TASK_RUNNING);
 	return 0;
@@ -564,7 +830,7 @@ void tasklet_kill_immediate(struct taskl
 	BUG();
 }
 
-static void takeover_tasklets(unsigned int cpu)
+void takeover_tasklets(unsigned int cpu)
 {
 	struct tasklet_struct **i;
 
@@ -586,49 +852,83 @@ static void takeover_tasklets(unsigned i
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+static const char *softirq_names [] =
+{
+  [HI_SOFTIRQ]		= "high",
+  [SCHED_SOFTIRQ]	= "sched",
+  [TIMER_SOFTIRQ]	= "timer",
+  [NET_TX_SOFTIRQ]	= "net-tx",
+  [NET_RX_SOFTIRQ]	= "net-rx",
+  [BLOCK_SOFTIRQ]	= "block",
+  [TASKLET_SOFTIRQ]	= "tasklet",
+#ifdef CONFIG_HIGH_RES_TIMERS
+  [HRTIMER_SOFTIRQ]	= "hrtimer",
+#endif
+  [RCU_SOFTIRQ]		= "rcu",
+};
+
 static int __cpuinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
-	int hotcpu = (unsigned long)hcpu;
+	int hotcpu = (unsigned long)hcpu, i;
 	struct task_struct *p;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
-		if (IS_ERR(p)) {
-			printk("ksoftirqd for %i failed\n", hotcpu);
-			return NOTIFY_BAD;
-		}
-		kthread_bind(p, hotcpu);
-  		per_cpu(ksoftirqd, hotcpu) = p;
- 		break;
+		for (i = 0; i < MAX_SOFTIRQ; i++) {
+			per_cpu(ksoftirqd, hotcpu)[i].nr = i;
+			per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu;
+			per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL;
+		}
+		for (i = 0; i < MAX_SOFTIRQ; i++) {
+			p = kthread_create(ksoftirqd,
+					   &per_cpu(ksoftirqd, hotcpu)[i],
+					   "sirq-%s/%d", softirq_names[i],
+					   hotcpu);
+			if (IS_ERR(p)) {
+				printk("ksoftirqd %d for %i failed\n", i,
+				       hotcpu);
+				return NOTIFY_BAD;
+			}
+			kthread_bind(p, hotcpu);
+			per_cpu(ksoftirqd, hotcpu)[i].tsk = p;
+		}
+		break;
+	break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		wake_up_process(per_cpu(ksoftirqd, hotcpu));
+		for (i = 0; i < MAX_SOFTIRQ; i++)
+			wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
-		if (!per_cpu(ksoftirqd, hotcpu))
-			break;
-		/* Unbind so it can run.  Fall thru. */
-		kthread_bind(per_cpu(ksoftirqd, hotcpu),
-			     any_online_cpu(cpu_online_map));
+#if 0
+		for (i = 0; i < MAX_SOFTIRQ; i++) {
+			if (!per_cpu(ksoftirqd, hotcpu)[i].tsk)
+				continue;
+			kthread_bind(per_cpu(ksoftirqd, hotcpu)[i].tsk,
+				     any_online_cpu(cpu_online_map));
+		}
+#endif
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN: {
-		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+		struct sched_param param;
 
-		p = per_cpu(ksoftirqd, hotcpu);
-		per_cpu(ksoftirqd, hotcpu) = NULL;
-		sched_setscheduler(p, SCHED_FIFO, &param);
-		kthread_stop(p);
+		for (i = 0; i < MAX_SOFTIRQ; i++) {
+			param.sched_priority = MAX_RT_PRIO-1;
+			p = per_cpu(ksoftirqd, hotcpu)[i].tsk;
+			sched_setscheduler(p, SCHED_FIFO, &param);
+			per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL;
+			kthread_stop(p);
+		}
 		takeover_tasklets(hotcpu);
 		break;
-	}
-#endif /* CONFIG_HOTPLUG_CPU */
  	}
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
 	return NOTIFY_OK;
 }
 
@@ -647,6 +947,34 @@ __init int spawn_ksoftirqd(void)
 	return 0;
 }
 
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+
+int softirq_preemption = 1;
+
+EXPORT_SYMBOL(softirq_preemption);
+
+/*
+ * Real-Time Preemption depends on softirq threading:
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+static int __init softirq_preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3))
+		softirq_preemption = 0;
+	else
+		get_option(&str, &softirq_preemption);
+	if (!softirq_preemption)
+		printk("turning off softirq preemption!\n");
+
+	return 1;
+}
+
+__setup("softirq-preempt=", softirq_preempt_setup);
+#endif
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * Call a function on all processors
Index: linux-2.6.25.4-rt4/include/linux/irq.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/irq.h	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/irq.h	2008-05-29 09:46:47.000000000 -0400
@@ -19,10 +19,12 @@
 #include <linux/cpumask.h>
 #include <linux/irqreturn.h>
 #include <linux/errno.h>
+#include <linux/wait.h>
 
 #include <asm/irq.h>
 #include <asm/ptrace.h>
 #include <asm/irq_regs.h>
+#include <asm/timex.h>
 
 struct irq_desc;
 typedef	void (*irq_flow_handler_t)(unsigned int irq,
@@ -61,6 +63,7 @@ typedef	void (*irq_flow_handler_t)(unsig
 #define IRQ_WAKEUP		0x00100000	/* IRQ triggers system wakeup */
 #define IRQ_MOVE_PENDING	0x00200000	/* need to re-target IRQ destination */
 #define IRQ_NO_BALANCING	0x00400000	/* IRQ is excluded from balancing */
+#define IRQ_NODELAY		0x40000000	/* IRQ must run immediately */
 
 #ifdef CONFIG_IRQ_PER_CPU
 # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
@@ -141,6 +144,8 @@ struct irq_chip {
  * @irq_count:		stats field to detect stalled irqs
  * @irqs_unhandled:	stats field for spurious unhandled interrupts
  * @last_unhandled:	aging timer for unhandled count
+ * @thread:		Thread pointer for threaded preemptible irq handling
+ * @wait_for_handler:	Waitqueue to wait for a running preemptible handler
  * @lock:		locking for SMP
  * @affinity:		IRQ affinity on SMP
  * @cpu:		cpu index useful for balancing
@@ -163,7 +168,10 @@ struct irq_desc {
 	unsigned int		irq_count;	/* For detecting broken IRQs */
 	unsigned int		irqs_unhandled;
 	unsigned long		last_unhandled;	/* Aging timer for unhandled count */
-	spinlock_t		lock;
+	struct task_struct	*thread;
+	wait_queue_head_t	wait_for_handler;
+	cycles_t		timestamp;
+	raw_spinlock_t		lock;
 #ifdef CONFIG_SMP
 	cpumask_t		affinity;
 	unsigned int		cpu;
@@ -396,7 +404,21 @@ extern int set_irq_msi(unsigned int irq,
 #define get_irq_data(irq)	(irq_desc[irq].handler_data)
 #define get_irq_msi(irq)	(irq_desc[irq].msi_desc)
 
-#endif /* CONFIG_GENERIC_HARDIRQS */
+/* Early initialization of irqs */
+extern void early_init_hardirqs(void);
+
+#if defined(CONFIG_PREEMPT_HARDIRQS)
+extern void init_hardirqs(void);
+#else
+static inline void init_hardirqs(void) { }
+#endif
+
+#else	/* end GENERIC HARDIRQS */
+
+static inline void early_init_hardirqs(void) { }
+static inline void init_hardirqs(void) { }
+
+#endif /* !CONFIG_GENERIC_HARDIRQS */
 
 #endif /* !CONFIG_S390 */
 
Index: linux-2.6.25.4-rt4/kernel/irq/autoprobe.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/irq/autoprobe.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/irq/autoprobe.c	2008-05-29 09:46:20.000000000 -0400
@@ -7,6 +7,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
Index: linux-2.6.25.4-rt4/kernel/irq/chip.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/irq/chip.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/irq/chip.c	2008-05-29 09:47:06.000000000 -0400
@@ -287,8 +287,10 @@ static inline void mask_ack_irq(struct i
 	if (desc->chip->mask_ack)
 		desc->chip->mask_ack(irq);
 	else {
-		desc->chip->mask(irq);
-		desc->chip->ack(irq);
+		if (desc->chip->mask)
+			desc->chip->mask(irq);
+		if (desc->chip->ack)
+			desc->chip->ack(irq);
 	}
 }
 
@@ -313,8 +315,10 @@ handle_simple_irq(unsigned int irq, stru
 
 	spin_lock(&desc->lock);
 
-	if (unlikely(desc->status & IRQ_INPROGRESS))
+	if (unlikely(desc->status & IRQ_INPROGRESS)) {
+		desc->status |= IRQ_PENDING;
 		goto out_unlock;
+	}
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
 	kstat_cpu(cpu).irqs[irq]++;
 
@@ -323,6 +327,11 @@ handle_simple_irq(unsigned int irq, stru
 		goto out_unlock;
 
 	desc->status |= IRQ_INPROGRESS;
+	/*
+	 * hardirq redirection to the irqd process context:
+	 */
+	if (redirect_hardirq(desc))
+		goto out_unlock;
 	spin_unlock(&desc->lock);
 
 	action_ret = handle_IRQ_event(irq, action);
@@ -331,6 +340,8 @@ handle_simple_irq(unsigned int irq, stru
 
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
+	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+		desc->chip->unmask(irq);
 out_unlock:
 	spin_unlock(&desc->lock);
 }
@@ -369,6 +380,13 @@ handle_level_irq(unsigned int irq, struc
 		goto out_unlock;
 
 	desc->status |= IRQ_INPROGRESS;
+
+	/*
+	 * hardirq redirection to the irqd process context:
+	 */
+	if (redirect_hardirq(desc))
+		goto out_unlock;
+
 	spin_unlock(&desc->lock);
 
 	action_ret = handle_IRQ_event(irq, action);
@@ -402,18 +420,16 @@ handle_fasteoi_irq(unsigned int irq, str
 
 	spin_lock(&desc->lock);
 
-	if (unlikely(desc->status & IRQ_INPROGRESS))
-		goto out;
-
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
 	kstat_cpu(cpu).irqs[irq]++;
 
 	/*
-	 * If its disabled or no action available
+	 * If it's running, disabled or no action available
 	 * then mask it and get out of here:
 	 */
 	action = desc->action;
-	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
+	if (unlikely(!action || (desc->status & (IRQ_INPROGRESS |
+						 IRQ_DISABLED)))) {
 		desc->status |= IRQ_PENDING;
 		if (desc->chip->mask)
 			desc->chip->mask(irq);
@@ -421,6 +437,15 @@ handle_fasteoi_irq(unsigned int irq, str
 	}
 
 	desc->status |= IRQ_INPROGRESS;
+	/*
+	 * In the threaded case we fall back to a mask+eoi sequence:
+	 */
+	if (redirect_hardirq(desc)) {
+		if (desc->chip->mask)
+			desc->chip->mask(irq);
+		goto out;
+	}
+
 	desc->status &= ~IRQ_PENDING;
 	spin_unlock(&desc->lock);
 
@@ -430,9 +455,10 @@ handle_fasteoi_irq(unsigned int irq, str
 
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
+	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+		desc->chip->unmask(irq);
 out:
 	desc->chip->eoi(irq);
-
 	spin_unlock(&desc->lock);
 }
 
@@ -481,6 +507,12 @@ handle_edge_irq(unsigned int irq, struct
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
 
+	/*
+	 * hardirq redirection to the irqd process context:
+	 */
+	if (redirect_hardirq(desc))
+		goto out_unlock;
+
 	do {
 		struct irqaction *action = desc->action;
 		irqreturn_t action_ret;
Index: linux-2.6.25.4-rt4/kernel/irq/handle.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/irq/handle.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/irq/handle.c	2008-05-29 09:46:47.000000000 -0400
@@ -13,6 +13,7 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/kallsyms.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 
@@ -53,12 +54,13 @@ struct irq_desc irq_desc[NR_IRQS] __cach
 		.chip = &no_irq_chip,
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
-		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
+		.lock = RAW_SPIN_LOCK_UNLOCKED(irq_desc),
 #ifdef CONFIG_SMP
 		.affinity = CPU_MASK_ALL
 #endif
 	}
 };
+EXPORT_SYMBOL_GPL(irq_desc);
 
 /*
  * What should we do if we get a hw irq event on an illegal vector?
@@ -131,26 +133,87 @@ irqreturn_t handle_IRQ_event(unsigned in
 	irqreturn_t ret, retval = IRQ_NONE;
 	unsigned int status = 0;
 
+#ifdef __i386__
+	if (debug_direct_keyboard && irq == 1)
+		lockdep_off();
+#endif
+
 	handle_dynamic_tick(action);
 
-	if (!(action->flags & IRQF_DISABLED))
-		local_irq_enable_in_hardirq();
+	/*
+	 * Unconditionally enable interrupts for threaded
+	 * IRQ handlers:
+	 */
+	if (!hardirq_count() || !(action->flags & IRQF_DISABLED))
+		local_irq_enable();
 
 	do {
+		unsigned int preempt_count = preempt_count();
+
 		ret = action->handler(irq, action->dev_id);
+		if (preempt_count() != preempt_count) {
+			print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler);
+			printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+			dump_stack();
+			preempt_count() = preempt_count;
+		}
 		if (ret == IRQ_HANDLED)
 			status |= action->flags;
 		retval |= ret;
 		action = action->next;
 	} while (action);
 
-	if (status & IRQF_SAMPLE_RANDOM)
+	if (status & IRQF_SAMPLE_RANDOM) {
+		local_irq_enable();
 		add_interrupt_randomness(irq);
+	}
 	local_irq_disable();
 
+#ifdef __i386__
+	if (debug_direct_keyboard && irq == 1)
+		lockdep_on();
+#endif
 	return retval;
 }
 
+/*
+ * Hack - used for development only.
+ */
+int __read_mostly debug_direct_keyboard = 0;
+
+int __init debug_direct_keyboard_setup(char *str)
+{
+	debug_direct_keyboard = 1;
+	printk(KERN_INFO "Switching IRQ 1 (keyboard) to to direct!\n");
+#ifdef CONFIG_PREEMPT_RT
+	printk(KERN_INFO "WARNING: kernel may easily crash this way!\n");
+#endif
+	return 1;
+}
+
+__setup("debug_direct_keyboard", debug_direct_keyboard_setup);
+
+int redirect_hardirq(struct irq_desc *desc)
+{
+	/*
+	 * Direct execution:
+	 */
+	if (!hardirq_preemption || (desc->status & IRQ_NODELAY) ||
+							!desc->thread)
+		return 0;
+
+#ifdef __i386__
+	if (debug_direct_keyboard && (desc - irq_desc == 1))
+		return 0;
+#endif
+
+	BUG_ON(!irqs_disabled());
+	if (desc->thread && desc->thread->state != TASK_RUNNING)
+		wake_up_process(desc->thread);
+
+	return 1;
+}
+
 #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 /**
  * __do_IRQ - original all in one highlevel IRQ handler
@@ -186,6 +249,13 @@ unsigned int __do_IRQ(unsigned int irq)
 		desc->chip->end(irq);
 		return 1;
 	}
+	/*
+	 * If the task is currently running in user mode, don't
+	 * detect soft lockups.  If CONFIG_DETECT_SOFTLOCKUP is not
+	 * configured, this should be optimized out.
+	 */
+	if (user_mode(get_irq_regs()))
+		touch_softlockup_watchdog();
 
 	spin_lock(&desc->lock);
 	if (desc->chip->ack)
Index: linux-2.6.25.4-rt4/kernel/irq/internals.h
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/irq/internals.h	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/irq/internals.h	2008-05-29 09:46:20.000000000 -0400
@@ -10,6 +10,10 @@ extern void irq_chip_set_defaults(struct
 /* Set default handler: */
 extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
 
+extern int redirect_hardirq(struct irq_desc *desc);
+
+void recalculate_desc_flags(struct irq_desc *desc);
+
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
Index: linux-2.6.25.4-rt4/kernel/irq/manage.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/irq/manage.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/irq/manage.c	2008-05-29 09:47:16.000000000 -0400
@@ -8,8 +8,10 @@
  */
 
 #include <linux/irq.h>
-#include <linux/module.h>
 #include <linux/random.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/syscalls.h>
 #include <linux/interrupt.h>
 
 #include "internals.h"
@@ -41,8 +43,12 @@ void synchronize_irq(unsigned int irq)
 		 * Wait until we're out of the critical section.  This might
 		 * give the wrong answer due to the lack of memory barriers.
 		 */
-		while (desc->status & IRQ_INPROGRESS)
-			cpu_relax();
+		if (hardirq_preemption && !(desc->status & IRQ_NODELAY))
+			wait_event(desc->wait_for_handler,
+				   !(desc->status & IRQ_INPROGRESS));
+		else
+			while (desc->status & IRQ_INPROGRESS)
+				cpu_relax();
 
 		/* Ok, that indicated we're done: double-check carefully. */
 		spin_lock_irqsave(&desc->lock, flags);
@@ -185,6 +191,14 @@ void enable_irq(unsigned int irq)
 		desc->depth--;
 	}
 	spin_unlock_irqrestore(&desc->lock, flags);
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+	/*
+	 * Do a bh disable/enable pair to trigger any pending
+	 * irq resend logic:
+	 */
+	local_bh_disable();
+	local_bh_enable();
+#endif
 }
 EXPORT_SYMBOL(enable_irq);
 
@@ -234,6 +248,21 @@ int set_irq_wake(unsigned int irq, unsig
 EXPORT_SYMBOL(set_irq_wake);
 
 /*
+ * If any action has IRQF_NODELAY then turn IRQ_NODELAY on:
+ */
+void recalculate_desc_flags(struct irq_desc *desc)
+{
+	struct irqaction *action;
+
+	desc->status &= ~IRQ_NODELAY;
+	for (action = desc->action ; action; action = action->next)
+		if (action->flags & IRQF_NODELAY)
+			desc->status |= IRQ_NODELAY;
+}
+
+static int start_irq_thread(int irq, struct irq_desc *desc);
+
+/*
  * Internal function that tells the architecture code whether a
  * particular irq has been exclusively allocated or is available
  * for driver use.
@@ -298,6 +327,9 @@ int setup_irq(unsigned int irq, struct i
 		rand_initialize_irq(irq);
 	}
 
+	if (!(new->flags & IRQF_NODELAY))
+		if (start_irq_thread(irq, desc))
+			return -ENOMEM;
 	/*
 	 * The following block of code has to be executed atomically
 	 */
@@ -338,6 +370,11 @@ int setup_irq(unsigned int irq, struct i
 	if (new->flags & IRQF_NOBALANCING)
 		desc->status |= IRQ_NO_BALANCING;
 
+	/*
+	 * Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY:
+	 */
+	recalculate_desc_flags(desc);
+
 	if (!shared) {
 		irq_chip_set_defaults(desc->chip);
 
@@ -384,7 +421,7 @@ int setup_irq(unsigned int irq, struct i
 
 	new->irq = irq;
 	register_irq_proc(irq);
-	new->dir = NULL;
+	new->dir = new->threaded = NULL;
 	register_handler_proc(irq, new);
 
 	return 0;
@@ -455,6 +492,7 @@ void free_irq(unsigned int irq, void *de
 				else
 					desc->chip->disable(irq);
 			}
+			recalculate_desc_flags(desc);
 			spin_unlock_irqrestore(&desc->lock, flags);
 			unregister_handler_proc(irq, action);
 
@@ -470,9 +508,9 @@ void free_irq(unsigned int irq, void *de
 			 * parallel with our fake
 			 */
 			if (action->flags & IRQF_SHARED) {
-				local_irq_save(flags);
+				local_irq_save_nort(flags);
 				action->handler(irq, dev_id);
-				local_irq_restore(flags);
+				local_irq_restore_nort(flags);
 			}
 #endif
 			kfree(action);
@@ -567,9 +605,9 @@ int request_irq(unsigned int irq, irq_ha
 		 */
 		unsigned long flags;
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		handler(irq, dev_id);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 #endif
 
@@ -580,3 +618,264 @@ int request_irq(unsigned int irq, irq_ha
 	return retval;
 }
 EXPORT_SYMBOL(request_irq);
+
+#ifdef CONFIG_PREEMPT_HARDIRQS
+
+int hardirq_preemption = 1;
+
+EXPORT_SYMBOL(hardirq_preemption);
+
+/*
+ * Real-Time Preemption depends on hardirq threading:
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+static int __init hardirq_preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3))
+		hardirq_preemption = 0;
+	else
+		get_option(&str, &hardirq_preemption);
+	if (!hardirq_preemption)
+		printk("turning off hardirq preemption!\n");
+
+	return 1;
+}
+
+__setup("hardirq-preempt=", hardirq_preempt_setup);
+
+#endif
+
+/*
+ * threaded simple handler
+ */
+static void thread_simple_irq(irq_desc_t *desc)
+{
+	struct irqaction *action = desc->action;
+	unsigned int irq = desc - irq_desc;
+	irqreturn_t action_ret;
+
+	do {
+		if (!action || desc->depth)
+			break;
+		desc->status &= ~IRQ_PENDING;
+		spin_unlock(&desc->lock);
+		action_ret = handle_IRQ_event(irq, action);
+		cond_resched_hardirq_context();
+		spin_lock_irq(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+	} while (desc->status & IRQ_PENDING);
+	desc->status &= ~IRQ_INPROGRESS;
+}
+
+/*
+ * threaded level type irq handler
+ */
+static void thread_level_irq(irq_desc_t *desc)
+{
+	unsigned int irq = desc - irq_desc;
+
+	thread_simple_irq(desc);
+	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+		desc->chip->unmask(irq);
+}
+
+/*
+ * threaded fasteoi type irq handler
+ */
+static void thread_fasteoi_irq(irq_desc_t *desc)
+{
+	unsigned int irq = desc - irq_desc;
+
+	thread_simple_irq(desc);
+	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+		desc->chip->unmask(irq);
+}
+
+/*
+ * threaded edge type IRQ handler
+ */
+static void thread_edge_irq(irq_desc_t *desc)
+{
+	unsigned int irq = desc - irq_desc;
+
+	do {
+		struct irqaction *action = desc->action;
+		irqreturn_t action_ret;
+
+		if (unlikely(!action)) {
+			desc->status &= ~IRQ_INPROGRESS;
+			desc->chip->mask(irq);
+			return;
+		}
+
+		/*
+		 * When another irq arrived while we were handling
+		 * one, we could have masked the irq.
+		 * Renable it, if it was not disabled in meantime.
+		 */
+		if (unlikely(((desc->status & (IRQ_PENDING | IRQ_MASKED)) ==
+			    (IRQ_PENDING | IRQ_MASKED)) && !desc->depth))
+			desc->chip->unmask(irq);
+
+		desc->status &= ~IRQ_PENDING;
+		spin_unlock(&desc->lock);
+		action_ret = handle_IRQ_event(irq, action);
+		spin_lock_irq(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+	} while ((desc->status & IRQ_PENDING) && !desc->depth);
+
+	desc->status &= ~IRQ_INPROGRESS;
+}
+
+/*
+ * threaded edge type IRQ handler
+ */
+static void thread_do_irq(irq_desc_t *desc)
+{
+	unsigned int irq = desc - irq_desc;
+
+	do {
+		struct irqaction *action = desc->action;
+		irqreturn_t action_ret;
+
+		if (unlikely(!action)) {
+			desc->status &= ~IRQ_INPROGRESS;
+			desc->chip->disable(irq);
+			return;
+		}
+
+		desc->status &= ~IRQ_PENDING;
+		spin_unlock(&desc->lock);
+		action_ret = handle_IRQ_event(irq, action);
+		spin_lock_irq(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+	} while ((desc->status & IRQ_PENDING) && !desc->depth);
+
+	desc->status &= ~IRQ_INPROGRESS;
+	desc->chip->end(irq);
+}
+
+static void do_hardirq(struct irq_desc *desc)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+
+	if (!(desc->status & IRQ_INPROGRESS))
+		goto out;
+
+	if (desc->handle_irq == handle_simple_irq)
+		thread_simple_irq(desc);
+	else if (desc->handle_irq == handle_level_irq)
+		thread_level_irq(desc);
+	else if (desc->handle_irq == handle_fasteoi_irq)
+		thread_fasteoi_irq(desc);
+	else if (desc->handle_irq == handle_edge_irq)
+		thread_edge_irq(desc);
+	else
+		thread_do_irq(desc);
+ out:
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	if (waitqueue_active(&desc->wait_for_handler))
+		wake_up(&desc->wait_for_handler);
+}
+
+static int do_irqd(void * __desc)
+{
+	struct sched_param param = { 0, };
+	struct irq_desc *desc = __desc;
+
+#ifdef CONFIG_SMP
+	cpumask_t cpus_allowed;
+
+	cpus_allowed = desc->affinity;
+#endif
+	current->flags |= PF_NOFREEZE | PF_HARDIRQ;
+
+	/*
+	 * Set irq thread priority to SCHED_FIFO/50:
+	 */
+	param.sched_priority = MAX_USER_RT_PRIO/2;
+
+	sys_sched_setscheduler(current->pid, SCHED_FIFO, &param);
+
+	while (!kthread_should_stop()) {
+		local_irq_disable_nort();
+		do {
+			set_current_state(TASK_INTERRUPTIBLE);
+			do_hardirq(desc);
+		} while (current->state == TASK_RUNNING);
+
+		local_irq_enable_nort();
+#ifdef CONFIG_SMP
+		/*
+		 * Did IRQ affinities change?
+		 */
+		if (!cpus_equal(cpus_allowed, desc->affinity))
+			cpus_allowed = desc->affinity;
+#endif
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+static int ok_to_create_irq_threads;
+
+static int start_irq_thread(int irq, struct irq_desc *desc)
+{
+	if (desc->thread || !ok_to_create_irq_threads)
+		return 0;
+
+	desc->thread = kthread_create(do_irqd, desc, "IRQ-%d", irq);
+	if (!desc->thread) {
+		printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq);
+		return -ENOMEM;
+	}
+
+	/*
+	 * An interrupt may have come in before the thread pointer was
+	 * stored in desc->thread; make sure the thread gets woken up in
+	 * such a case:
+	 */
+	smp_mb();
+	wake_up_process(desc->thread);
+
+	return 0;
+}
+
+void __init init_hardirqs(void)
+{
+	int i;
+	ok_to_create_irq_threads = 1;
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc_t *desc = irq_desc + i;
+
+		if (desc->action && !(desc->status & IRQ_NODELAY))
+			start_irq_thread(i, desc);
+	}
+}
+
+#else
+
+static int start_irq_thread(int irq, struct irq_desc *desc)
+{
+	return 0;
+}
+
+#endif
+
+void __init early_init_hardirqs(void)
+{
+	int i;
+
+	for (i = 0; i < NR_IRQS; i++)
+		init_waitqueue_head(&irq_desc[i].wait_for_handler);
+}
Index: linux-2.6.25.4-rt4/kernel/irq/proc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/irq/proc.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/irq/proc.c	2008-05-29 09:46:20.000000000 -0400
@@ -7,6 +7,8 @@
  */
 
 #include <linux/irq.h>
+#include <asm/uaccess.h>
+#include <linux/profile.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
 
@@ -87,44 +89,6 @@ static int irq_spurious_read(char *page,
 			jiffies_to_msecs(d->last_unhandled));
 }
 
-#define MAX_NAMELEN 128
-
-static int name_unique(unsigned int irq, struct irqaction *new_action)
-{
-	struct irq_desc *desc = irq_desc + irq;
-	struct irqaction *action;
-	unsigned long flags;
-	int ret = 1;
-
-	spin_lock_irqsave(&desc->lock, flags);
-	for (action = desc->action ; action; action = action->next) {
-		if ((action != new_action) && action->name &&
-				!strcmp(new_action->name, action->name)) {
-			ret = 0;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&desc->lock, flags);
-	return ret;
-}
-
-void register_handler_proc(unsigned int irq, struct irqaction *action)
-{
-	char name [MAX_NAMELEN];
-
-	if (!irq_desc[irq].dir || action->dir || !action->name ||
-					!name_unique(irq, action))
-		return;
-
-	memset(name, 0, MAX_NAMELEN);
-	snprintf(name, MAX_NAMELEN, "%s", action->name);
-
-	/* create /proc/irq/1234/handler/ */
-	action->dir = proc_mkdir(name, irq_desc[irq].dir);
-}
-
-#undef MAX_NAMELEN
-
 #define MAX_NAMELEN 10
 
 void register_irq_proc(unsigned int irq)
@@ -167,10 +131,96 @@ void register_irq_proc(unsigned int irq)
 
 void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 {
+	if (action->threaded)
+		remove_proc_entry(action->threaded->name, action->dir);
 	if (action->dir)
 		remove_proc_entry(action->dir->name, irq_desc[irq].dir);
 }
 
+#ifndef CONFIG_PREEMPT_RT
+
+static int threaded_read_proc(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	return sprintf(page, "%c\n",
+		((struct irqaction *)data)->flags & IRQF_NODELAY ? '0' : '1');
+}
+
+static int threaded_write_proc(struct file *file, const char __user *buffer,
+			       unsigned long count, void *data)
+{
+	int c;
+	struct irqaction *action = data;
+	irq_desc_t *desc = irq_desc + action->irq;
+
+	if (get_user(c, buffer))
+		return -EFAULT;
+	if (c != '0' && c != '1')
+		return -EINVAL;
+
+	spin_lock_irq(&desc->lock);
+
+	if (c == '0')
+		action->flags |= IRQF_NODELAY;
+	if (c == '1')
+		action->flags &= ~IRQF_NODELAY;
+	recalculate_desc_flags(desc);
+
+	spin_unlock_irq(&desc->lock);
+
+	return 1;
+}
+
+#endif
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+	struct irq_desc *desc = irq_desc + irq;
+	struct irqaction *action;
+
+	for (action = desc->action ; action; action = action->next)
+		if ((action != new_action) && action->name &&
+				!strcmp(new_action->name, action->name))
+			return 0;
+	return 1;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+	char name [MAX_NAMELEN];
+
+	if (!irq_desc[irq].dir || action->dir || !action->name ||
+					!name_unique(irq, action))
+		return;
+
+	memset(name, 0, MAX_NAMELEN);
+	snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+	/* create /proc/irq/1234/handler/ */
+	action->dir = proc_mkdir(name, irq_desc[irq].dir);
+
+	if (!action->dir)
+		return;
+#ifndef CONFIG_PREEMPT_RT
+	{
+		struct proc_dir_entry *entry;
+		/* create /proc/irq/1234/handler/threaded */
+		entry = create_proc_entry("threaded", 0600, action->dir);
+		if (!entry)
+			return;
+		entry->nlink = 1;
+		entry->data = (void *)action;
+		entry->read_proc = threaded_read_proc;
+		entry->write_proc = threaded_write_proc;
+		action->threaded = entry;
+	}
+#endif
+}
+
+#undef MAX_NAMELEN
+
 void init_irq_proc(void)
 {
 	int i;
@@ -180,6 +230,9 @@ void init_irq_proc(void)
 	if (!root_irq_dir)
 		return;
 
+	/* create /proc/irq/prof_cpu_mask */
+	create_prof_cpu_mask(root_irq_dir);
+
 	/*
 	 * Create entries for all existing IRQs.
 	 */
Index: linux-2.6.25.4-rt4/kernel/irq/spurious.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/irq/spurious.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/irq/spurious.c	2008-05-29 09:46:47.000000000 -0400
@@ -13,6 +13,11 @@
 #include <linux/interrupt.h>
 #include <linux/moduleparam.h>
 
+#ifdef CONFIG_X86_IO_APIC
+# include <asm/apicdef.h>
+# include <asm/io_apic.h>
+#endif
+
 static int irqfixup __read_mostly;
 
 /*
@@ -57,9 +62,8 @@ static int misrouted_irq(int irq)
 			}
 			action = action->next;
 		}
-		local_irq_disable();
 		/* Now clean up the flags */
-		spin_lock(&desc->lock);
+		spin_lock_irq(&desc->lock);
 		action = desc->action;
 
 		/*
@@ -205,6 +209,12 @@ void note_interrupt(unsigned int irq, st
 		 * The interrupt is stuck
 		 */
 		__report_bad_irq(irq, desc, action_ret);
+#ifdef CONFIG_X86_IO_APIC
+		if (!sis_apic_bug) {
+			sis_apic_bug = 1;
+			printk(KERN_ERR "turning off IO-APIC fast mode.\n");
+		}
+#else
 		/*
 		 * Now kill the IRQ
 		 */
@@ -212,6 +222,7 @@ void note_interrupt(unsigned int irq, st
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 		desc->chip->disable(irq);
+#endif
 	}
 	desc->irqs_unhandled = 0;
 }
@@ -232,6 +243,11 @@ MODULE_PARM_DESC(noirqdebug, "Disable ir
 
 static int __init irqfixup_setup(char *str)
 {
+#ifdef CONFIG_PREEMPT_RT
+	printk(KERN_WARNING "irqfixup boot option not supported "
+		"w/ CONFIG_PREEMPT_RT\n");
+	return 1;
+#endif
 	irqfixup = 1;
 	printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
 	printk(KERN_WARNING "This may impact system performance.\n");
@@ -245,6 +261,11 @@ MODULE_PARM_DESC("irqfixup", "0: No fixu
 
 static int __init irqpoll_setup(char *str)
 {
+#ifdef CONFIG_PREEMPT_RT
+	printk(KERN_WARNING "irqpoll boot option not supported "
+		"w/ CONFIG_PREEMPT_RT\n");
+	return 1;
+#endif
 	irqfixup = 2;
 	printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
 				"enabled\n");
Index: linux-2.6.25.4-rt4/include/linux/timer.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/timer.h	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/timer.h	2008-05-29 09:46:20.000000000 -0400
@@ -144,10 +144,12 @@ static inline void add_timer(struct time
 	__mod_timer(timer, timer->expires);
 }
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
+  extern int timer_pending_sync(struct timer_list *timer);
   extern int try_to_del_timer_sync(struct timer_list *timer);
   extern int del_timer_sync(struct timer_list *timer);
 #else
+# define timer_pending_sync(t)		timer_pending(t)
 # define try_to_del_timer_sync(t)	del_timer(t)
 # define del_timer_sync(t)		del_timer(t)
 #endif
Index: linux-2.6.25.4-rt4/kernel/timer.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/timer.c	2008-05-29 09:45:53.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/timer.c	2008-05-29 09:47:23.000000000 -0400
@@ -34,6 +34,7 @@
 #include <linux/posix-timers.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/kallsyms.h>
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
@@ -69,6 +70,7 @@ struct tvec_root {
 struct tvec_base {
 	spinlock_t lock;
 	struct timer_list *running_timer;
+	wait_queue_head_t wait_for_running_timer;
 	unsigned long timer_jiffies;
 	struct tvec_root tv1;
 	struct tvec tv2;
@@ -247,9 +249,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_relative
 static inline void set_running_timer(struct tvec_base *base,
 					struct timer_list *timer)
 {
-#ifdef CONFIG_SMP
 	base->running_timer = timer;
-#endif
 }
 
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
@@ -393,7 +393,7 @@ int __mod_timer(struct timer_list *timer
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
-	int ret = 0;
+	int ret = 0, cpu;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
@@ -405,7 +405,8 @@ int __mod_timer(struct timer_list *timer
 		ret = 1;
 	}
 
-	new_base = __get_cpu_var(tvec_bases);
+	cpu = raw_smp_processor_id();
+	new_base = per_cpu(tvec_bases, cpu);
 
 	if (base != new_base) {
 		/*
@@ -463,6 +464,18 @@ void add_timer_on(struct timer_list *tim
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
+/*
+ * Wait for a running timer
+ */
+void wait_for_running_timer(struct timer_list *timer)
+{
+	struct tvec_base *base = timer->base;
+
+	if (base->running_timer == timer)
+		wait_event(base->wait_for_running_timer,
+			   base->running_timer != timer);
+}
+
 /**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
@@ -533,7 +546,35 @@ int del_timer(struct timer_list *timer)
 
 EXPORT_SYMBOL(del_timer);
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
+/*
+ * This function checks whether a timer is active and not running on any
+ * CPU. Upon successful (ret >= 0) exit the timer is not queued and the
+ * handler is not running on any CPU.
+ *
+ * It must not be called from interrupt contexts.
+ */
+int timer_pending_sync(struct timer_list *timer)
+{
+	struct tvec_base *base;
+	unsigned long flags;
+	int ret = -1;
+
+	base = lock_timer_base(timer, &flags);
+
+	if (base->running_timer == timer)
+		goto out;
+
+	ret = 0;
+	if (timer_pending(timer))
+		ret = 1;
+out:
+	spin_unlock_irqrestore(&base->lock, flags);
+
+	return ret;
+}
+
+
 /**
  * try_to_del_timer_sync - Try to deactivate a timer
  * @timer: timer do del
@@ -590,7 +631,7 @@ int del_timer_sync(struct timer_list *ti
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
 			return ret;
-		cpu_relax();
+		wait_for_running_timer(timer);
 	}
 }
 
@@ -636,6 +677,20 @@ static inline void __run_timers(struct t
 		struct list_head *head = &work_list;
 		int index = base->timer_jiffies & TVR_MASK;
 
+		if (softirq_need_resched()) {
+			spin_unlock_irq(&base->lock);
+			wake_up(&base->wait_for_running_timer);
+			cond_resched_softirq_context();
+			cpu_relax();
+			spin_lock_irq(&base->lock);
+			/*
+			 * We can simply continue after preemption, nobody
+			 * else can touch timer_jiffies so 'index' is still
+			 * valid. Any new jiffy will be taken care of in
+			 * subsequent loops:
+			 */
+		}
+
 		/*
 		 * Cascade timers:
 		 */
@@ -663,18 +718,17 @@ static inline void __run_timers(struct t
 				int preempt_count = preempt_count();
 				fn(data);
 				if (preempt_count != preempt_count()) {
-					printk(KERN_ERR "huh, entered %p "
-					       "with preempt_count %08x, exited"
-					       " with %08x?\n",
-					       fn, preempt_count,
-					       preempt_count());
-					BUG();
+					print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn);
+					printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+					preempt_count() = preempt_count;
 				}
 			}
+			set_running_timer(base, NULL);
+			cond_resched_softirq_context();
 			spin_lock_irq(&base->lock);
 		}
 	}
-	set_running_timer(base, NULL);
+	wake_up(&base->wait_for_running_timer);
 	spin_unlock_irq(&base->lock);
 }
 
@@ -804,9 +858,22 @@ unsigned long get_next_timer_interrupt(u
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 	unsigned long expires;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * On PREEMPT_RT we cannot sleep here. If the trylock does not
+	 * succeed then we return the worst-case 'expires in 1 tick'
+	 * value:
+	 */
+	if (spin_trylock(&base->lock)) {
+		expires = __next_timer_interrupt(base);
+		spin_unlock(&base->lock);
+	} else
+		expires = now + 1;
+#else
 	spin_lock(&base->lock);
 	expires = __next_timer_interrupt(base);
 	spin_unlock(&base->lock);
+#endif
 
 	if (time_before_eq(expires, now))
 		return now;
@@ -849,10 +916,10 @@ void update_process_times(int user_tick)
 
 	/* Note: this timer irq context must be accounted for as well. */
 	account_process_tick(p, user_tick);
+	scheduler_tick();
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);
-	scheduler_tick();
 	run_posix_cpu_timers(p);
 }
 
@@ -861,8 +928,29 @@ void update_process_times(int user_tick)
  */
 static unsigned long count_active_tasks(void)
 {
+	/*
+	 * On PREEMPT_RT, we are running in the timer softirq thread,
+	 * so consider 1 less running tasks:
+	 */
+#ifdef CONFIG_PREEMPT_RT
+	return (nr_active() - 1) * FIXED_1;
+#else
 	return nr_active() * FIXED_1;
+#endif
+}
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Nr of active tasks - counted in fixed-point numbers
+ */
+static unsigned long count_active_rt_tasks(void)
+{
+	extern unsigned long rt_nr_running(void);
+	extern unsigned long rt_nr_uninterruptible(void);
+
+	return (rt_nr_running() + rt_nr_uninterruptible()) * FIXED_1;
 }
+#endif
 
 /*
  * Hmm.. Changed this, as the GNU make sources (load.c) seems to
@@ -876,6 +964,8 @@ unsigned long avenrun[3];
 
 EXPORT_SYMBOL(avenrun);
 
+unsigned long avenrun_rt[3];
+
 /*
  * calc_load - given tick count, update the avenrun load estimates.
  * This is called while holding a write_lock on xtime_lock.
@@ -884,33 +974,32 @@ static inline void calc_load(unsigned lo
 {
 	unsigned long active_tasks; /* fixed-point */
 	static int count = LOAD_FREQ;
+#ifdef CONFIG_PREEMPT_RT
+	unsigned long active_rt_tasks; /* fixed-point */
+#endif
 
 	count -= ticks;
 	if (unlikely(count < 0)) {
 		active_tasks = count_active_tasks();
+#ifdef CONFIG_PREEMPT_RT
+		active_rt_tasks = count_active_rt_tasks();
+#endif
 		do {
 			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+#ifdef CONFIG_PREEMPT_RT
+			CALC_LOAD(avenrun_rt[0], EXP_1, active_rt_tasks);
+			CALC_LOAD(avenrun_rt[1], EXP_5, active_rt_tasks);
+			CALC_LOAD(avenrun_rt[2], EXP_15, active_rt_tasks);
+#endif
 			count += LOAD_FREQ;
+
 		} while (count < 0);
 	}
 }
 
 /*
- * This function runs timers and the timer-tq in bottom half context.
- */
-static void run_timer_softirq(struct softirq_action *h)
-{
-	struct tvec_base *base = __get_cpu_var(tvec_bases);
-
-	hrtimer_run_pending();
-
-	if (time_after_eq(jiffies, base->timer_jiffies))
-		__run_timers(base);
-}
-
-/*
  * Called by the local, per-CPU timer interrupt on SMP.
  */
 void run_local_timers(void)
@@ -921,13 +1010,42 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
+ * Time of day handling:
  */
-static inline void update_times(unsigned long ticks)
+static inline void update_times(void)
 {
-	update_wall_time();
-	calc_load(ticks);
+	static unsigned long last_tick = INITIAL_JIFFIES;
+	unsigned long ticks, flags;
+
+	/*
+	 * Dont take the xtime_lock from every CPU in
+	 * every tick - only when needed:
+	 */
+	if (jiffies == last_tick)
+		return;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	ticks = jiffies - last_tick;
+	if (ticks) {
+		last_tick += ticks;
+		calc_load(ticks);
+	}
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+
+/*
+ * This function runs timers and the timer-tq in bottom half context.
+ */
+static void run_timer_softirq(struct softirq_action *h)
+{
+	struct tvec_base *base = per_cpu(tvec_bases, raw_smp_processor_id());
+
+	update_times();
+	hrtimer_run_pending();
+
+	if (time_after_eq(jiffies, base->timer_jiffies))
+		__run_timers(base);
 }
 
 /*
@@ -939,7 +1057,7 @@ static inline void update_times(unsigned
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1278,6 +1396,7 @@ static int __cpuinit init_timers_cpu(int
 
 	spin_lock_init(&base->lock);
 	lockdep_set_class(&base->lock, base_lock_keys + cpu);
+	init_waitqueue_head(&base->wait_for_running_timer);
 
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1315,7 +1434,7 @@ static void __cpuinit migrate_timers(int
 	old_base = per_cpu(tvec_bases, cpu);
 	new_base = get_cpu_var(tvec_bases);
 
-	local_irq_disable();
+	local_irq_disable_nort();
 	double_spin_lock(&new_base->lock, &old_base->lock,
 			 smp_processor_id() < cpu);
 
@@ -1332,7 +1451,7 @@ static void __cpuinit migrate_timers(int
 
 	double_spin_unlock(&new_base->lock, &old_base->lock,
 			   smp_processor_id() < cpu);
-	local_irq_enable();
+	local_irq_enable_nort();
 	put_cpu_var(tvec_bases);
 }
 #endif /* CONFIG_HOTPLUG_CPU */
Index: linux-2.6.25.4-rt4/kernel/itimer.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/itimer.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/itimer.c	2008-05-29 09:46:21.000000000 -0400
@@ -170,6 +170,7 @@ again:
 		/* We are sharing ->siglock with it_real_fn() */
 		if (hrtimer_try_to_cancel(timer) < 0) {
 			spin_unlock_irq(&tsk->sighand->siglock);
+			hrtimer_wait_for_timer(&tsk->signal->real_timer);
 			goto again;
 		}
 		expires = timeval_to_ktime(value->it_value);
Index: linux-2.6.25.4-rt4/kernel/posix-timers.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/posix-timers.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/posix-timers.c	2008-05-29 09:46:21.000000000 -0400
@@ -810,6 +810,7 @@ retry:
 
 	unlock_timer(timr, flag);
 	if (error == TIMER_RETRY) {
+		hrtimer_wait_for_timer(&timr->it.real.timer);
 		rtn = NULL;	// We already got the old time...
 		goto retry;
 	}
@@ -849,6 +850,7 @@ retry_delete:
 
 	if (timer_delete_hook(timer) == TIMER_RETRY) {
 		unlock_timer(timer, flags);
+		hrtimer_wait_for_timer(&timer->it.real.timer);
 		goto retry_delete;
 	}
 
@@ -881,6 +883,7 @@ retry_delete:
 
 	if (timer_delete_hook(timer) == TIMER_RETRY) {
 		unlock_timer(timer, flags);
+		hrtimer_wait_for_timer(&timer->it.real.timer);
 		goto retry_delete;
 	}
 	list_del(&timer->list);
Index: linux-2.6.25.4-rt4/arch/x86/kernel/i8259_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/i8259_32.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/i8259_32.c	2008-05-29 09:46:41.000000000 -0400
@@ -29,7 +29,7 @@
  */
 
 static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
 
 static struct irq_chip i8259A_chip = {
@@ -165,6 +165,8 @@ static void mask_and_ack_8259A(unsigned 
 	 */
 	if (cached_irq_mask & irqmask)
 		goto spurious_8259A_irq;
+	if (irq & 8)
+		outb(0x60+(irq&7),PIC_SLAVE_CMD); /* 'Specific EOI' to slave */
 	cached_irq_mask |= irqmask;
 
 handle_real_irq:
@@ -292,10 +294,10 @@ void init_8259A(int auto_eoi)
 	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
 	outb_pic(0x20 + 0, PIC_MASTER_IMR);	/* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
 	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);	/* 8259A-1 (the master) has a slave on IR2 */
-	if (auto_eoi)	/* master does Auto EOI */
-		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
-	else		/* master expects normal EOI */
-		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+	if (!auto_eoi)	/* master expects normal EOI */
+		outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+	else		/* master does Auto EOI */
+		outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
 
 	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
 	outb_pic(0x20 + 8, PIC_SLAVE_IMR);	/* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
@@ -347,6 +349,7 @@ static irqreturn_t math_error_irq(int cp
  */
 static struct irqaction fpu_irq = {
 	.handler = math_error_irq,
+	.flags = IRQF_NODELAY,
 	.mask = CPU_MASK_NONE,
 	.name = "fpu",
 };
Index: linux-2.6.25.4-rt4/arch/x86/mach-default/setup.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/mach-default/setup.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/mach-default/setup.c	2008-05-29 09:46:21.000000000 -0400
@@ -37,6 +37,7 @@ void __init pre_intr_init_hook(void)
  */
 static struct irqaction irq2 = {
 	.handler = no_action,
+	.flags = IRQF_NODELAY,
 	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
@@ -85,7 +86,7 @@ void __init trap_init_hook(void)
 
 static struct irqaction irq0  = {
 	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
+	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY,
 	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
Index: linux-2.6.25.4-rt4/arch/x86/mach-visws/visws_apic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/mach-visws/visws_apic.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/mach-visws/visws_apic.c	2008-05-29 09:46:21.000000000 -0400
@@ -257,11 +257,13 @@ out_unlock:
 static struct irqaction master_action = {
 	.handler =	piix4_master_intr,
 	.name =		"PIIX4-8259",
+	.flags =	IRQF_NODELAY,
 };
 
 static struct irqaction cascade_action = {
 	.handler = 	no_action,
 	.name =		"cascade",
+	.flags =	IRQF_NODELAY,
 };
 
 
Index: linux-2.6.25.4-rt4/arch/x86/mach-voyager/setup.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/mach-voyager/setup.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/mach-voyager/setup.c	2008-05-29 09:46:21.000000000 -0400
@@ -20,6 +20,7 @@ void __init pre_intr_init_hook(void)
  */
 static struct irqaction irq2 = {
 	.handler = no_action,
+	.flags = IRQF_NODELAY,
 	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
@@ -46,7 +47,7 @@ void __init trap_init_hook(void)
 
 static struct irqaction irq0 = {
 	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
+	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY,
 	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
Index: linux-2.6.25.4-rt4/arch/mips/kernel/i8253.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/i8253.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/i8253.c	2008-05-29 09:46:21.000000000 -0400
@@ -97,7 +97,7 @@ static irqreturn_t timer_interrupt(int i
 
 static struct irqaction irq0  = {
 	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED | IRQF_NOBALANCING,
+	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_NODELAY,
 	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
Index: linux-2.6.25.4-rt4/arch/x86/kernel/i8259_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/i8259_64.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/i8259_64.c	2008-05-29 09:46:37.000000000 -0400
@@ -97,8 +97,8 @@ static void (*__initdata interrupt[NR_VE
  */
 
 static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
 
 static struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
@@ -405,6 +405,7 @@ void init_8259A(int auto_eoi)
 
 static struct irqaction irq2 = {
 	.handler = no_action,
+	.flags = IRQF_NODELAY,
 	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
Index: linux-2.6.25.4-rt4/arch/x86/kernel/time_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/time_64.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/time_64.c	2008-05-29 09:46:22.000000000 -0400
@@ -101,7 +101,8 @@ unsigned long __init native_calculate_cp
 
 static struct irqaction irq0 = {
 	.handler	= timer_event_interrupt,
-	.flags		= IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
+	.flags		= IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING |
+			  IRQF_NODELAY,
 	.mask		= CPU_MASK_NONE,
 	.name		= "timer"
 };
Index: linux-2.6.25.4-rt4/arch/arm/common/time-acorn.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/common/time-acorn.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/common/time-acorn.c	2008-05-29 09:46:22.000000000 -0400
@@ -75,7 +75,7 @@ ioc_timer_interrupt(int irq, void *dev_i
 
 static struct irqaction ioc_timer_irq = {
 	.name		= "timer",
-	.flags		= IRQF_DISABLED,
+	.flags		= IRQF_DISABLED | IRQF_NODELAY,
 	.handler	= ioc_timer_interrupt
 };
 
Index: linux-2.6.25.4-rt4/arch/arm/oprofile/op_model_xscale.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/oprofile/op_model_xscale.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/oprofile/op_model_xscale.c	2008-05-29 09:46:22.000000000 -0400
@@ -381,8 +381,9 @@ static int xscale_pmu_start(void)
 {
 	int ret;
 	u32 pmnc = read_pmnc();
+	int irq_flags = IRQF_DISABLED | IRQF_NODELAY;
 
-	ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED,
+	ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, irq_flags,
 			"XScale PMU", (void *)results);
 
 	if (ret < 0) {
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/ppc_ksyms.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/ppc_ksyms.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/ppc_ksyms.c	2008-05-29 09:46:39.000000000 -0400
@@ -15,7 +15,6 @@
 #include <linux/bitops.h>
 
 #include <asm/page.h>
-#include <asm/semaphore.h>
 #include <asm/processor.h>
 #include <asm/cacheflush.h>
 #include <asm/uaccess.h>
@@ -46,7 +45,7 @@
 #include <asm/dcr.h>
 
 #ifdef CONFIG_PPC64
-EXPORT_SYMBOL(local_irq_restore);
+EXPORT_SYMBOL(raw_local_irq_restore);
 #endif
 
 #ifdef CONFIG_PPC32
@@ -166,7 +165,6 @@ EXPORT_SYMBOL(screen_info);
 
 #ifdef CONFIG_PPC32
 EXPORT_SYMBOL(timer_interrupt);
-EXPORT_SYMBOL(irq_desc);
 EXPORT_SYMBOL(tb_ticks_per_jiffy);
 EXPORT_SYMBOL(console_drivers);
 EXPORT_SYMBOL(cacheable_memcpy);
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/iseries/setup.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/iseries/setup.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/iseries/setup.c	2008-05-29 09:46:22.000000000 -0400
@@ -562,12 +562,14 @@ static void iseries_shared_idle(void)
 {
 	while (1) {
 		tick_nohz_stop_sched_tick();
-		while (!need_resched() && !hvlpevent_is_pending()) {
+		while (!need_resched() && !need_resched_delayed()
+				&& !hvlpevent_is_pending()) {
 			local_irq_disable();
 			ppc64_runlatch_off();
 
 			/* Recheck with irqs off */
-			if (!need_resched() && !hvlpevent_is_pending())
+			if (!need_resched() && !need_resched_delayed()
+					&& !hvlpevent_is_pending())
 				yield_shared_processor();
 
 			HMT_medium();
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/pseries/setup.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/pseries/setup.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/pseries/setup.c	2008-05-29 09:46:22.000000000 -0400
@@ -413,7 +413,8 @@ static void pseries_dedicated_idle_sleep
 		set_thread_flag(TIF_POLLING_NRFLAG);
 
 		while (get_tb() < start_snooze) {
-			if (need_resched() || cpu_is_offline(cpu))
+			if (need_resched() || need_resched_delayed() ||
+			    cpu_is_offline(cpu))
 				goto out;
 			ppc64_runlatch_off();
 			HMT_low();
@@ -424,7 +425,8 @@ static void pseries_dedicated_idle_sleep
 		clear_thread_flag(TIF_POLLING_NRFLAG);
 		smp_mb();
 		local_irq_disable();
-		if (need_resched() || cpu_is_offline(cpu))
+		if (need_resched() || need_resched_delayed() ||
+		    cpu_is_offline(cpu))
 			goto out;
 	}
 
Index: linux-2.6.25.4-rt4/include/asm-powerpc/thread_info.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/thread_info.h	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/thread_info.h	2008-05-29 09:47:07.000000000 -0400
@@ -121,9 +121,12 @@ static inline struct thread_info *curren
 #define TIF_RESTOREALL		11	/* Restore all regs (implies NOERROR) */
 #define TIF_NOERROR		12	/* Force successful syscall return */
 #define TIF_RESTORE_SIGMASK	13	/* Restore signal mask in do_signal */
-#define TIF_FREEZE		14	/* Freezing for suspend */
-#define TIF_RUNLATCH		15	/* Is the runlatch enabled? */
-#define TIF_ABI_PENDING		16	/* 32/64 bit switch needed */
+#define TIF_NEED_RESCHED_DELAYED \
+				14	/* reschedule on return to userspace */
+#define TIF_FREEZE		15	/* Freezing for suspend */
+#define TIF_RUNLATCH		16	/* Is the runlatch enabled? */
+#define TIF_ABI_PENDING		17	/* 32/64 bit switch needed */
+
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -139,13 +142,16 @@ static inline struct thread_info *curren
 #define _TIF_RESTOREALL		(1<<TIF_RESTOREALL)
 #define _TIF_NOERROR		(1<<TIF_NOERROR)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
+#define _TIF_NEED_RESCHED_DELAYED (1<<TIF_NEED_RESCHED_DELAYED)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)
 #define _TIF_RUNLATCH		(1<<TIF_RUNLATCH)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
+
 #define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP)
 
 #define _TIF_USER_WORK_MASK	( _TIF_SIGPENDING | \
-				 _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK)
+				 _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK | \
+				 _TIF_NEED_RESCHED_DELAYED)
 #define _TIF_PERSYSCALL_MASK	(_TIF_RESTOREALL|_TIF_NOERROR)
 
 /* Bits in local_flags */
Index: linux-2.6.25.4-rt4/arch/powerpc/sysdev/mpic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/sysdev/mpic.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/sysdev/mpic.c	2008-05-29 09:46:41.000000000 -0400
@@ -46,7 +46,7 @@
 
 static struct mpic *mpics;
 static struct mpic *mpic_primary;
-static DEFINE_SPINLOCK(mpic_lock);
+static DEFINE_RAW_SPINLOCK(mpic_lock);
 
 #ifdef CONFIG_PPC32	/* XXX for now */
 #ifdef CONFIG_IRQ_ALL_CPUS
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/celleb/interrupt.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/celleb/interrupt.c	2008-05-29 09:45:52.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/celleb/interrupt.c	2008-05-29 09:46:41.000000000 -0400
@@ -29,8 +29,12 @@
 #include "interrupt.h"
 #include "beat_wrapper.h"
 
+#ifdef CONFIG_PREEMPT_HARDIRQS
+extern int hardirq_preemption;
+#endif /* CONFIG_PREEMPT_HARDIRQS */
+
 #define	MAX_IRQS	NR_IRQS
-static DEFINE_SPINLOCK(beatic_irq_mask_lock);
+static DEFINE_RAW_SPINLOCK(beatic_irq_mask_lock);
 static uint64_t	beatic_irq_mask_enable[(MAX_IRQS+255)/64];
 static uint64_t	beatic_irq_mask_ack[(MAX_IRQS+255)/64];
 
@@ -71,12 +75,35 @@ static void beatic_mask_irq(unsigned int
 	spin_unlock_irqrestore(&beatic_irq_mask_lock, flags);
 }
 
+static void __beatic_eoi_irq(unsigned int irq_plug)
+{
+	s64 err;
+
+	if ((err = beat_downcount_of_interrupt(irq_plug)) != 0) {
+		if ((err & 0xFFFFFFFF) != 0xFFFFFFF5) /* -11: wrong state */
+			panic("Failed to downcount IRQ! Error = %16lx", err);
+
+		printk(KERN_ERR "IRQ over-downcounted, plug %d\n", irq_plug);
+	}
+}
+
 static void beatic_unmask_irq(unsigned int irq_plug)
 {
 	unsigned long flags;
 
+#ifdef CONFIG_PREEMPT_HARDIRQS
+	if (hardirq_preemption)
+		__beatic_eoi_irq(irq_plug);
+#endif /* CONFIG_PREEMPT_HARDIRQS */
+
 	spin_lock_irqsave(&beatic_irq_mask_lock, flags);
 	beatic_irq_mask_enable[irq_plug/64] |= 1UL << (63 - (irq_plug%64));
+
+#ifdef CONFIG_PREEMPT_HARDIRQS
+	if (hardirq_preemption)
+		beatic_irq_mask_ack[irq_plug/64] |= 1UL << (63 - (irq_plug%64));
+#endif /* CONFIG_PREEMPT_HARDIRQS */
+
 	beatic_update_irq_mask(irq_plug);
 	spin_unlock_irqrestore(&beatic_irq_mask_lock, flags);
 }
@@ -93,15 +120,15 @@ static void beatic_ack_irq(unsigned int 
 
 static void beatic_end_irq(unsigned int irq_plug)
 {
-	s64 err;
 	unsigned long flags;
 
-	if ((err = beat_downcount_of_interrupt(irq_plug)) != 0) {
-		if ((err & 0xFFFFFFFF) != 0xFFFFFFF5) /* -11: wrong state */
-			panic("Failed to downcount IRQ! Error = %16lx", err);
+#ifdef CONFIG_PREEMPT_HARDIRQS
+	if (hardirq_preemption)
+		return;
+#endif /* CONFIG_PREEMPT_HARDIRQS */
+
+	__beatic_eoi_irq(irq_plug);
 
-		printk(KERN_ERR "IRQ over-downcounted, plug %d\n", irq_plug);
-	}
 	spin_lock_irqsave(&beatic_irq_mask_lock, flags);
 	beatic_irq_mask_ack[irq_plug/64] |= 1UL << (63 - (irq_plug%64));
 	beatic_update_irq_mask(irq_plug);
Index: linux-2.6.25.4-rt4/mm/slab.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/slab.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/slab.c	2008-05-29 09:47:19.000000000 -0400
@@ -116,6 +116,63 @@
 #include	<asm/page.h>
 
 /*
+ * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking
+ * mechanism.
+ *
+ * On PREEMPT_RT, we use per-CPU locks for this. That's why the
+ * calling convention is changed slightly: a new 'flags' argument
+ * is passed to 'irq disable/enable' - the PREEMPT_RT code stores
+ * the CPU number of the lock there.
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define slab_irq_disable(cpu) \
+	do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0)
+# define slab_irq_enable(cpu)		local_irq_enable()
+# define slab_irq_save(flags, cpu) \
+	do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0)
+# define slab_irq_restore(flags, cpu)	local_irq_restore(flags)
+/*
+ * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT,
+ * which has no per-CPU locking effect since we are holding the cache
+ * lock in that case already.
+ *
+ * (On PREEMPT_RT, these are NOPs, but we have to drop/get the irq locks.)
+ */
+# define slab_irq_disable_nort(cpu)	slab_irq_disable(cpu)
+# define slab_irq_enable_nort(cpu)	slab_irq_enable(cpu)
+# define slab_irq_disable_rt(flags)	do { (void)(flags); } while (0)
+# define slab_irq_enable_rt(flags)	do { (void)(flags); } while (0)
+# define slab_spin_lock_irq(lock, cpu) \
+	do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0)
+# define slab_spin_unlock_irq(lock, cpu) \
+					spin_unlock_irq(lock)
+# define slab_spin_lock_irqsave(lock, flags, cpu) \
+	do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0)
+# define slab_spin_unlock_irqrestore(lock, flags, cpu) \
+	do { spin_unlock_irqrestore(lock, flags); } while (0)
+#else
+DEFINE_PER_CPU_LOCKED(int, slab_irq_locks) = { 0, };
+# define slab_irq_disable(cpu)		(void)get_cpu_var_locked(slab_irq_locks, &(cpu))
+# define slab_irq_enable(cpu)		put_cpu_var_locked(slab_irq_locks, cpu)
+# define slab_irq_save(flags, cpu) \
+	do { slab_irq_disable(cpu); (void) (flags); } while (0)
+# define slab_irq_restore(flags, cpu) \
+	do { slab_irq_enable(cpu); (void) (flags); } while (0)
+# define slab_irq_disable_rt(cpu)	slab_irq_disable(cpu)
+# define slab_irq_enable_rt(cpu)	slab_irq_enable(cpu)
+# define slab_irq_disable_nort(cpu)	do { } while (0)
+# define slab_irq_enable_nort(cpu)	do { } while (0)
+# define slab_spin_lock_irq(lock, cpu) \
+		do { slab_irq_disable(cpu); spin_lock(lock); } while (0)
+# define slab_spin_unlock_irq(lock, cpu) \
+		do { spin_unlock(lock); slab_irq_enable(cpu); } while (0)
+# define slab_spin_lock_irqsave(lock, flags, cpu) \
+	do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0)
+# define slab_spin_unlock_irqrestore(lock, flags, cpu) \
+	do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0)
+#endif
+
+/*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
  *		  0 for faster, smaller code (especially in the critical paths).
  *
@@ -313,7 +370,7 @@ struct kmem_list3 __initdata initkmem_li
 static int drain_freelist(struct kmem_cache *cache,
 			struct kmem_list3 *l3, int tofree);
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
-			int node);
+			int node, int *this_cpu);
 static int enable_cpucache(struct kmem_cache *cachep);
 static void cache_reap(struct work_struct *unused);
 
@@ -756,9 +813,10 @@ int slab_is_available(void)
 
 static DEFINE_PER_CPU(struct delayed_work, reap_work);
 
-static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
+static inline struct array_cache *
+cpu_cache_get(struct kmem_cache *cachep, int this_cpu)
 {
-	return cachep->array[smp_processor_id()];
+	return cachep->array[this_cpu];
 }
 
 static inline struct kmem_cache *__find_general_cachep(size_t size,
@@ -992,7 +1050,7 @@ static int transfer_objects(struct array
 #ifndef CONFIG_NUMA
 
 #define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, l3) do { } while (0)
+#define reap_alien(cachep, l3, this_cpu) 0
 
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -1003,27 +1061,29 @@ static inline void free_alien_cache(stru
 {
 }
 
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+static inline int
+cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu)
 {
 	return 0;
 }
 
 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
-		gfp_t flags)
+		gfp_t flags, int *this_cpu)
 {
 	return NULL;
 }
 
 static inline void *____cache_alloc_node(struct kmem_cache *cachep,
-		 gfp_t flags, int nodeid)
+		 gfp_t flags, int nodeid, int *this_cpu)
 {
 	return NULL;
 }
 
 #else	/* CONFIG_NUMA */
 
-static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
-static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+				int nodeid, int *this_cpu);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t, int *);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -1064,7 +1124,8 @@ static void free_alien_cache(struct arra
 }
 
 static void __drain_alien_cache(struct kmem_cache *cachep,
-				struct array_cache *ac, int node)
+				struct array_cache *ac, int node,
+				int *this_cpu)
 {
 	struct kmem_list3 *rl3 = cachep->nodelists[node];
 
@@ -1078,7 +1139,7 @@ static void __drain_alien_cache(struct k
 		if (rl3->shared)
 			transfer_objects(rl3->shared, ac, ac->limit);
 
-		free_block(cachep, ac->entry, ac->avail, node);
+		free_block(cachep, ac->entry, ac->avail, node, this_cpu);
 		ac->avail = 0;
 		spin_unlock(&rl3->list_lock);
 	}
@@ -1087,38 +1148,42 @@ static void __drain_alien_cache(struct k
 /*
  * Called from cache_reap() to regularly drain alien caches round robin.
  */
-static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static int
+reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu)
 {
-	int node = __get_cpu_var(reap_node);
+	int node = per_cpu(reap_node, *this_cpu);
 
 	if (l3->alien) {
 		struct array_cache *ac = l3->alien[node];
 
 		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
-			__drain_alien_cache(cachep, ac, node);
+			__drain_alien_cache(cachep, ac, node, this_cpu);
 			spin_unlock_irq(&ac->lock);
+			return 1;
 		}
 	}
+	return 0;
 }
 
 static void drain_alien_cache(struct kmem_cache *cachep,
 				struct array_cache **alien)
 {
-	int i = 0;
+	int i = 0, this_cpu;
 	struct array_cache *ac;
 	unsigned long flags;
 
 	for_each_online_node(i) {
 		ac = alien[i];
 		if (ac) {
-			spin_lock_irqsave(&ac->lock, flags);
-			__drain_alien_cache(cachep, ac, i);
-			spin_unlock_irqrestore(&ac->lock, flags);
+			slab_spin_lock_irqsave(&ac->lock, flags, this_cpu);
+			__drain_alien_cache(cachep, ac, i, &this_cpu);
+			slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu);
 		}
 	}
 }
 
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+static inline int
+cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu)
 {
 	struct slab *slabp = virt_to_slab(objp);
 	int nodeid = slabp->nodeid;
@@ -1126,7 +1191,7 @@ static inline int cache_free_alien(struc
 	struct array_cache *alien = NULL;
 	int node;
 
-	node = numa_node_id();
+	node = cpu_to_node(*this_cpu);
 
 	/*
 	 * Make sure we are not freeing a object from another node to the array
@@ -1142,13 +1207,13 @@ static inline int cache_free_alien(struc
 		spin_lock(&alien->lock);
 		if (unlikely(alien->avail == alien->limit)) {
 			STATS_INC_ACOVERFLOW(cachep);
-			__drain_alien_cache(cachep, alien, nodeid);
+			__drain_alien_cache(cachep, alien, nodeid, this_cpu);
 		}
 		alien->entry[alien->avail++] = objp;
 		spin_unlock(&alien->lock);
 	} else {
 		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
-		free_block(cachep, &objp, 1, nodeid);
+		free_block(cachep, &objp, 1, nodeid, this_cpu);
 		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
 	}
 	return 1;
@@ -1165,6 +1230,7 @@ static void __cpuinit cpuup_canceled(lon
 		struct array_cache *nc;
 		struct array_cache *shared;
 		struct array_cache **alien;
+		int this_cpu;
 		cpumask_t mask;
 
 		mask = node_to_cpumask(node);
@@ -1176,29 +1242,31 @@ static void __cpuinit cpuup_canceled(lon
 		if (!l3)
 			goto free_array_cache;
 
-		spin_lock_irq(&l3->list_lock);
+		slab_spin_lock_irq(&l3->list_lock, this_cpu);
 
 		/* Free limit for this kmem_list3 */
 		l3->free_limit -= cachep->batchcount;
 		if (nc)
-			free_block(cachep, nc->entry, nc->avail, node);
+			free_block(cachep, nc->entry, nc->avail, node,
+				   &this_cpu);
 
 		if (!cpus_empty(mask)) {
-			spin_unlock_irq(&l3->list_lock);
+			slab_spin_unlock_irq(&l3->list_lock,
+					     this_cpu);
 			goto free_array_cache;
 		}
 
 		shared = l3->shared;
 		if (shared) {
 			free_block(cachep, shared->entry,
-				   shared->avail, node);
+				   shared->avail, node, &this_cpu);
 			l3->shared = NULL;
 		}
 
 		alien = l3->alien;
 		l3->alien = NULL;
 
-		spin_unlock_irq(&l3->list_lock);
+		slab_spin_unlock_irq(&l3->list_lock, this_cpu);
 
 		kfree(shared);
 		if (alien) {
@@ -1227,6 +1295,7 @@ static int __cpuinit cpuup_prepare(long 
 	struct kmem_list3 *l3 = NULL;
 	int node = cpu_to_node(cpu);
 	const int memsize = sizeof(struct kmem_list3);
+	int this_cpu;
 
 	/*
 	 * We need to do this right in the beginning since
@@ -1257,11 +1326,11 @@ static int __cpuinit cpuup_prepare(long 
 			cachep->nodelists[node] = l3;
 		}
 
-		spin_lock_irq(&cachep->nodelists[node]->list_lock);
+		slab_spin_lock_irq(&cachep->nodelists[node]->list_lock, this_cpu);
 		cachep->nodelists[node]->free_limit =
 			(1 + nr_cpus_node(node)) *
 			cachep->batchcount + cachep->num;
-		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+		slab_spin_unlock_irq(&cachep->nodelists[node]->list_lock, this_cpu);
 	}
 
 	/*
@@ -1298,7 +1367,7 @@ static int __cpuinit cpuup_prepare(long 
 		l3 = cachep->nodelists[node];
 		BUG_ON(!l3);
 
-		spin_lock_irq(&l3->list_lock);
+		slab_spin_lock_irq(&l3->list_lock, this_cpu);
 		if (!l3->shared) {
 			/*
 			 * We are serialised from CPU_DEAD or
@@ -1313,7 +1382,7 @@ static int __cpuinit cpuup_prepare(long 
 			alien = NULL;
 		}
 #endif
-		spin_unlock_irq(&l3->list_lock);
+		slab_spin_unlock_irq(&l3->list_lock, this_cpu);
 		kfree(shared);
 		free_alien_cache(alien);
 	}
@@ -1390,11 +1459,13 @@ static void init_list(struct kmem_cache 
 			int nodeid)
 {
 	struct kmem_list3 *ptr;
+	int this_cpu;
 
 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
 	BUG_ON(!ptr);
 
-	local_irq_disable();
+	WARN_ON(spin_is_locked(&list->list_lock));
+	slab_irq_disable(this_cpu);
 	memcpy(ptr, list, sizeof(struct kmem_list3));
 	/*
 	 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1403,7 +1474,7 @@ static void init_list(struct kmem_cache 
 
 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
 	cachep->nodelists[nodeid] = ptr;
-	local_irq_enable();
+	slab_irq_enable(this_cpu);
 }
 
 /*
@@ -1566,36 +1637,34 @@ void __init kmem_cache_init(void)
 	/* 4) Replace the bootstrap head arrays */
 	{
 		struct array_cache *ptr;
+		int this_cpu;
 
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 
-		local_irq_disable();
-		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
-		memcpy(ptr, cpu_cache_get(&cache_cache),
-		       sizeof(struct arraycache_init));
+		slab_irq_disable(this_cpu);
+		BUG_ON(cpu_cache_get(&cache_cache, this_cpu) != &initarray_cache.cache);
+		memcpy(ptr, cpu_cache_get(&cache_cache, this_cpu),
+				sizeof(struct arraycache_init));
 		/*
 		 * Do not assume that spinlocks can be initialized via memcpy:
 		 */
 		spin_lock_init(&ptr->lock);
-
-		cache_cache.array[smp_processor_id()] = ptr;
-		local_irq_enable();
+		cache_cache.array[this_cpu] = ptr;
+		slab_irq_enable(this_cpu);
 
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 
-		local_irq_disable();
-		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
-		       != &initarray_generic.cache);
-		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
-		       sizeof(struct arraycache_init));
+		slab_irq_disable(this_cpu);
+		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu)
+				!= &initarray_generic.cache);
+		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu),
+				sizeof(struct arraycache_init));
 		/*
 		 * Do not assume that spinlocks can be initialized via memcpy:
 		 */
 		spin_lock_init(&ptr->lock);
-
-		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
-		    ptr;
-		local_irq_enable();
+		malloc_sizes[INDEX_AC].cs_cachep->array[this_cpu] = ptr;
+		slab_irq_enable(this_cpu);
 	}
 	/* 5) Replace the bootstrap kmem_list3's */
 	{
@@ -1747,7 +1816,7 @@ static void store_stackinfo(struct kmem_
 
 	*addr++ = 0x12345678;
 	*addr++ = caller;
-	*addr++ = smp_processor_id();
+	*addr++ = raw_smp_processor_id();
 	size -= 3 * sizeof(unsigned long);
 	{
 		unsigned long *sptr = &caller;
@@ -1902,7 +1971,11 @@ static void check_poison_obj(struct kmem
 }
 #endif
 
+static void
+__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu);
+
 #if DEBUG
+
 /**
  * slab_destroy_objs - destroy a slab and its objects
  * @cachep: cache pointer being destroyed
@@ -1911,7 +1984,8 @@ static void check_poison_obj(struct kmem
  * Call the registered destructor for each object in a slab that is being
  * destroyed.
  */
-static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
+static void
+slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 {
 	int i;
 	for (i = 0; i < cachep->num; i++) {
@@ -1954,7 +2028,8 @@ static void slab_destroy_objs(struct kme
  * Before calling the slab must have been unlinked from the cache.  The
  * cache-lock is not held/needed.
  */
-static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
+static void
+slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu)
 {
 	void *addr = slabp->s_mem - slabp->colouroff;
 
@@ -1968,8 +2043,12 @@ static void slab_destroy(struct kmem_cac
 		call_rcu(&slab_rcu->head, kmem_rcu_free);
 	} else {
 		kmem_freepages(cachep, addr);
-		if (OFF_SLAB(cachep))
-			kmem_cache_free(cachep->slabp_cache, slabp);
+		if (OFF_SLAB(cachep)) {
+			if (this_cpu)
+				__cache_free(cachep->slabp_cache, slabp, this_cpu);
+			else
+				kmem_cache_free(cachep->slabp_cache, slabp);
+		}
 	}
 }
 
@@ -2066,6 +2145,8 @@ static size_t calculate_slab_order(struc
 
 static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
 {
+	int this_cpu;
+
 	if (g_cpucache_up == FULL)
 		return enable_cpucache(cachep);
 
@@ -2109,10 +2190,12 @@ static int __init_refok setup_cpu_cache(
 			jiffies + REAPTIMEOUT_LIST3 +
 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
-	cpu_cache_get(cachep)->avail = 0;
-	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-	cpu_cache_get(cachep)->batchcount = 1;
-	cpu_cache_get(cachep)->touched = 0;
+	this_cpu = raw_smp_processor_id();
+
+	cpu_cache_get(cachep, this_cpu)->avail = 0;
+	cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES;
+	cpu_cache_get(cachep, this_cpu)->batchcount = 1;
+	cpu_cache_get(cachep, this_cpu)->touched = 0;
 	cachep->batchcount = 1;
 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
 	return 0;
@@ -2402,19 +2485,19 @@ EXPORT_SYMBOL(kmem_cache_create);
 #if DEBUG
 static void check_irq_off(void)
 {
+/*
+ * On PREEMPT_RT we use locks to protect the per-CPU lists,
+ * and keep interrupts enabled.
+ */
+#ifndef CONFIG_PREEMPT_RT
 	BUG_ON(!irqs_disabled());
+#endif
 }
 
 static void check_irq_on(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	BUG_ON(irqs_disabled());
-}
-
-static void check_spinlock_acquired(struct kmem_cache *cachep)
-{
-#ifdef CONFIG_SMP
-	check_irq_off();
-	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
 #endif
 }
 
@@ -2429,34 +2512,67 @@ static void check_spinlock_acquired_node
 #else
 #define check_irq_off()	do { } while(0)
 #define check_irq_on()	do { } while(0)
-#define check_spinlock_acquired(x) do { } while(0)
 #define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
 
-static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
 			struct array_cache *ac,
 			int force, int node);
 
-static void do_drain(void *arg)
+static void __do_drain(void *arg, int this_cpu)
 {
 	struct kmem_cache *cachep = arg;
+	int node = cpu_to_node(this_cpu);
 	struct array_cache *ac;
-	int node = numa_node_id();
 
 	check_irq_off();
-	ac = cpu_cache_get(cachep);
+	ac = cpu_cache_get(cachep, this_cpu);
 	spin_lock(&cachep->nodelists[node]->list_lock);
-	free_block(cachep, ac->entry, ac->avail, node);
+	free_block(cachep, ac->entry, ac->avail, node, &this_cpu);
 	spin_unlock(&cachep->nodelists[node]->list_lock);
 	ac->avail = 0;
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static void do_drain(void *arg, int this_cpu)
+{
+	__do_drain(arg, this_cpu);
+}
+#else
+static void do_drain(void *arg)
+{
+	__do_drain(arg, smp_processor_id());
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * execute func() for all CPUs. On PREEMPT_RT we dont actually have
+ * to run on the remote CPUs - we only have to take their CPU-locks.
+ * (This is a rare operation, so cacheline bouncing is not an issue.)
+ */
+static void
+slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg)
+{
+	unsigned int i;
+
+	check_irq_on();
+	for_each_online_cpu(i) {
+		spin_lock(&__get_cpu_lock(slab_irq_locks, i));
+		func(arg, i);
+		spin_unlock(&__get_cpu_lock(slab_irq_locks, i));
+	}
+}
+#else
+# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1, 1)
+#endif
+
 static void drain_cpu_caches(struct kmem_cache *cachep)
 {
 	struct kmem_list3 *l3;
 	int node;
 
-	on_each_cpu(do_drain, cachep, 1, 1);
+	slab_on_each_cpu(do_drain, cachep);
 	check_irq_on();
 	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
@@ -2481,16 +2597,16 @@ static int drain_freelist(struct kmem_ca
 			struct kmem_list3 *l3, int tofree)
 {
 	struct list_head *p;
-	int nr_freed;
+	int nr_freed, this_cpu;
 	struct slab *slabp;
 
 	nr_freed = 0;
 	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
 
-		spin_lock_irq(&l3->list_lock);
+		slab_spin_lock_irq(&l3->list_lock, this_cpu);
 		p = l3->slabs_free.prev;
 		if (p == &l3->slabs_free) {
-			spin_unlock_irq(&l3->list_lock);
+			slab_spin_unlock_irq(&l3->list_lock, this_cpu);
 			goto out;
 		}
 
@@ -2499,13 +2615,9 @@ static int drain_freelist(struct kmem_ca
 		BUG_ON(slabp->inuse);
 #endif
 		list_del(&slabp->list);
-		/*
-		 * Safe to drop the lock. The slab is no longer linked
-		 * to the cache.
-		 */
 		l3->free_objects -= cache->num;
-		spin_unlock_irq(&l3->list_lock);
-		slab_destroy(cache, slabp);
+		slab_destroy(cache, slabp, &this_cpu);
+		slab_spin_unlock_irq(&l3->list_lock, this_cpu);
 		nr_freed++;
 	}
 out:
@@ -2761,8 +2873,8 @@ static void slab_map_pages(struct kmem_c
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep,
-		gfp_t flags, int nodeid, void *objp)
+static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+		      void *objp, int *this_cpu)
 {
 	struct slab *slabp;
 	size_t offset;
@@ -2791,7 +2903,8 @@ static int cache_grow(struct kmem_cache 
 	offset *= cachep->colour_off;
 
 	if (local_flags & __GFP_WAIT)
-		local_irq_enable();
+		slab_irq_enable_nort(*this_cpu);
+	slab_irq_enable_rt(*this_cpu);
 
 	/*
 	 * The test for missing atomic flag is performed here, rather than
@@ -2820,8 +2933,10 @@ static int cache_grow(struct kmem_cache 
 
 	cache_init_objs(cachep, slabp);
 
+	slab_irq_disable_rt(*this_cpu);
 	if (local_flags & __GFP_WAIT)
-		local_irq_disable();
+		slab_irq_disable_nort(*this_cpu);
+
 	check_irq_off();
 	spin_lock(&l3->list_lock);
 
@@ -2834,8 +2949,9 @@ static int cache_grow(struct kmem_cache 
 opps1:
 	kmem_freepages(cachep, objp);
 failed:
+	slab_irq_disable_rt(*this_cpu);
 	if (local_flags & __GFP_WAIT)
-		local_irq_disable();
+		slab_irq_disable_nort(*this_cpu);
 	return 0;
 }
 
@@ -2957,7 +3073,8 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *
+cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
@@ -2967,7 +3084,7 @@ static void *cache_alloc_refill(struct k
 retry:
 	check_irq_off();
 	node = numa_node_id();
-	ac = cpu_cache_get(cachep);
+	ac = cpu_cache_get(cachep, *this_cpu);
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
 		/*
@@ -2977,7 +3094,7 @@ retry:
 		 */
 		batchcount = BATCHREFILL_LIMIT;
 	}
-	l3 = cachep->nodelists[node];
+	l3 = cachep->nodelists[cpu_to_node(*this_cpu)];
 
 	BUG_ON(ac->avail > 0 || !l3);
 	spin_lock(&l3->list_lock);
@@ -3000,7 +3117,7 @@ retry:
 
 		slabp = list_entry(entry, struct slab, list);
 		check_slabp(cachep, slabp);
-		check_spinlock_acquired(cachep);
+		check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu));
 
 		/*
 		 * The slab was either on partial or free list so
@@ -3014,8 +3131,9 @@ retry:
 			STATS_INC_ACTIVE(cachep);
 			STATS_SET_HIGH(cachep);
 
-			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
-							    node);
+			ac->entry[ac->avail++] =
+				slab_get_obj(cachep, slabp,
+					     cpu_to_node(*this_cpu));
 		}
 		check_slabp(cachep, slabp);
 
@@ -3034,10 +3152,10 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+		x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
-		ac = cpu_cache_get(cachep);
+		ac = cpu_cache_get(cachep, *this_cpu);
 		if (!x && ac->avail == 0)	/* no objects in sight? abort */
 			return NULL;
 
@@ -3189,21 +3307,22 @@ static inline int should_failslab(struct
 
 #endif /* CONFIG_FAILSLAB */
 
-static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+static inline void *
+____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
 {
 	void *objp;
 	struct array_cache *ac;
 
 	check_irq_off();
 
-	ac = cpu_cache_get(cachep);
+	ac = cpu_cache_get(cachep, *this_cpu);
 	if (likely(ac->avail)) {
 		STATS_INC_ALLOCHIT(cachep);
 		ac->touched = 1;
 		objp = ac->entry[--ac->avail];
 	} else {
 		STATS_INC_ALLOCMISS(cachep);
-		objp = cache_alloc_refill(cachep, flags);
+		objp = cache_alloc_refill(cachep, flags, this_cpu);
 	}
 	return objp;
 }
@@ -3215,7 +3334,8 @@ static inline void *____cache_alloc(stru
  * If we are in_interrupt, then process context, including cpusets and
  * mempolicy, may not apply and should not be used for allocation policy.
  */
-static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags,
+				int *this_cpu)
 {
 	int nid_alloc, nid_here;
 
@@ -3227,7 +3347,7 @@ static void *alternate_node_alloc(struct
 	else if (current->mempolicy)
 		nid_alloc = slab_node(current->mempolicy);
 	if (nid_alloc != nid_here)
-		return ____cache_alloc_node(cachep, flags, nid_alloc);
+		return ____cache_alloc_node(cachep, flags, nid_alloc, this_cpu);
 	return NULL;
 }
 
@@ -3239,7 +3359,7 @@ static void *alternate_node_alloc(struct
  * allocator to do its reclaim / fallback magic. We then insert the
  * slab into the proper nodelist and then allocate from it.
  */
-static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu)
 {
 	struct zonelist *zonelist;
 	gfp_t local_flags;
@@ -3265,8 +3385,10 @@ retry:
 		if (cpuset_zone_allowed_hardwall(*z, flags) &&
 			cache->nodelists[nid] &&
 			cache->nodelists[nid]->free_objects)
-				obj = ____cache_alloc_node(cache,
-					flags | GFP_THISNODE, nid);
+
+			obj = ____cache_alloc_node(cache,
+						   flags | GFP_THISNODE, nid,
+						   this_cpu);
 	}
 
 	if (!obj) {
@@ -3277,19 +3399,24 @@ retry:
 		 * set and go into memory reserves if necessary.
 		 */
 		if (local_flags & __GFP_WAIT)
-			local_irq_enable();
+			slab_irq_enable_nort(*this_cpu);
+		slab_irq_enable_rt(*this_cpu);
+
 		kmem_flagcheck(cache, flags);
 		obj = kmem_getpages(cache, local_flags, -1);
+
+		slab_irq_disable_rt(*this_cpu);
 		if (local_flags & __GFP_WAIT)
-			local_irq_disable();
+			slab_irq_disable_nort(*this_cpu);
+
 		if (obj) {
 			/*
 			 * Insert into the appropriate per node queues
 			 */
 			nid = page_to_nid(virt_to_page(obj));
-			if (cache_grow(cache, flags, nid, obj)) {
+			if (cache_grow(cache, flags, nid, obj, this_cpu)) {
 				obj = ____cache_alloc_node(cache,
-					flags | GFP_THISNODE, nid);
+					flags | GFP_THISNODE, nid, this_cpu);
 				if (!obj)
 					/*
 					 * Another processor may allocate the
@@ -3310,7 +3437,7 @@ retry:
  * A interface to enable slab creation on nodeid
  */
 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
-				int nodeid)
+				int nodeid, int *this_cpu)
 {
 	struct list_head *entry;
 	struct slab *slabp;
@@ -3358,11 +3485,11 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
-	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu);
 	if (x)
 		goto retry;
 
-	return fallback_alloc(cachep, flags);
+	return fallback_alloc(cachep, flags, this_cpu);
 
 done:
 	return obj;
@@ -3384,39 +3511,41 @@ static __always_inline void *
 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 		   void *caller)
 {
-	unsigned long save_flags;
+	unsigned long irqflags;
+	int this_cpu;
 	void *ptr;
 
 	if (should_failslab(cachep, flags))
 		return NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
-	local_irq_save(save_flags);
+
+	slab_irq_save(irqflags, this_cpu);
 
 	if (unlikely(nodeid == -1))
-		nodeid = numa_node_id();
+		nodeid = cpu_to_node(this_cpu);
 
 	if (unlikely(!cachep->nodelists[nodeid])) {
 		/* Node not bootstrapped yet */
-		ptr = fallback_alloc(cachep, flags);
+		ptr = fallback_alloc(cachep, flags, &this_cpu);
 		goto out;
 	}
 
-	if (nodeid == numa_node_id()) {
+	if (nodeid == cpu_to_node(this_cpu)) {
 		/*
 		 * Use the locally cached objects if possible.
 		 * However ____cache_alloc does not allow fallback
 		 * to other nodes. It may fail while we still have
 		 * objects on other nodes available.
 		 */
-		ptr = ____cache_alloc(cachep, flags);
+		ptr = ____cache_alloc(cachep, flags, &this_cpu);
 		if (ptr)
 			goto out;
 	}
 	/* ___cache_alloc_node can fall back to other nodes */
-	ptr = ____cache_alloc_node(cachep, flags, nodeid);
+	ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu);
   out:
-	local_irq_restore(save_flags);
+	slab_irq_restore(irqflags, this_cpu);
 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
 
 	if (unlikely((flags & __GFP_ZERO) && ptr))
@@ -3426,33 +3555,33 @@ __cache_alloc_node(struct kmem_cache *ca
 }
 
 static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
+__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu)
 {
 	void *objp;
 
 	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
-		objp = alternate_node_alloc(cache, flags);
+		objp = alternate_node_alloc(cache, flags, this_cpu);
 		if (objp)
 			goto out;
 	}
-	objp = ____cache_alloc(cache, flags);
 
+	objp = ____cache_alloc(cache, flags, this_cpu);
 	/*
 	 * We may just have run out of memory on the local node.
 	 * ____cache_alloc_node() knows how to locate memory on other nodes
 	 */
- 	if (!objp)
- 		objp = ____cache_alloc_node(cache, flags, numa_node_id());
-
+	if (!objp)
+		objp = ____cache_alloc_node(cache, flags,
+					    cpu_to_node(*this_cpu), this_cpu);
   out:
 	return objp;
 }
 #else
 
 static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
 {
-	return ____cache_alloc(cachep, flags);
+	return ____cache_alloc(cachep, flags, this_cpu);
 }
 
 #endif /* CONFIG_NUMA */
@@ -3461,15 +3590,16 @@ static __always_inline void *
 __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 {
 	unsigned long save_flags;
+	int this_cpu;
 	void *objp;
 
 	if (should_failslab(cachep, flags))
 		return NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
-	local_irq_save(save_flags);
-	objp = __do_cache_alloc(cachep, flags);
-	local_irq_restore(save_flags);
+	slab_irq_save(save_flags, this_cpu);
+	objp = __do_cache_alloc(cachep, flags, &this_cpu);
+	slab_irq_restore(save_flags, this_cpu);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
 	prefetchw(objp);
 
@@ -3483,7 +3613,7 @@ __cache_alloc(struct kmem_cache *cachep,
  * Caller needs to acquire correct kmem_list's list_lock
  */
 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
-		       int node)
+		       int node, int *this_cpu)
 {
 	int i;
 	struct kmem_list3 *l3;
@@ -3512,7 +3642,7 @@ static void free_block(struct kmem_cache
 				 * a different cache, refer to comments before
 				 * alloc_slabmgmt.
 				 */
-				slab_destroy(cachep, slabp);
+				slab_destroy(cachep, slabp, this_cpu);
 			} else {
 				list_add(&slabp->list, &l3->slabs_free);
 			}
@@ -3526,11 +3656,12 @@ static void free_block(struct kmem_cache
 	}
 }
 
-static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
+static void
+cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
-	int node = numa_node_id();
+	int node = cpu_to_node(*this_cpu);
 
 	batchcount = ac->batchcount;
 #if DEBUG
@@ -3552,7 +3683,7 @@ static void cache_flusharray(struct kmem
 		}
 	}
 
-	free_block(cachep, ac->entry, batchcount, node);
+	free_block(cachep, ac->entry, batchcount, node, this_cpu);
 free_done:
 #if STATS
 	{
@@ -3581,9 +3712,9 @@ free_done:
  * Release an obj back to its cache. If the obj has a constructed state, it must
  * be in this state _before_ it is released.  Called with disabled ints.
  */
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static void __cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu)
 {
-	struct array_cache *ac = cpu_cache_get(cachep);
+	struct array_cache *ac = cpu_cache_get(cachep, *this_cpu);
 
 	check_irq_off();
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
@@ -3595,7 +3726,7 @@ static inline void __cache_free(struct k
 	 * variable to skip the call, which is mostly likely to be present in
 	 * the cache.
 	 */
-	if (numa_platform && cache_free_alien(cachep, objp))
+	if (numa_platform && cache_free_alien(cachep, objp, this_cpu))
 		return;
 
 	if (likely(ac->avail < ac->limit)) {
@@ -3604,7 +3735,7 @@ static inline void __cache_free(struct k
 		return;
 	} else {
 		STATS_INC_FREEMISS(cachep);
-		cache_flusharray(cachep, ac);
+		cache_flusharray(cachep, ac, this_cpu);
 		ac->entry[ac->avail++] = objp;
 	}
 }
@@ -3761,11 +3892,12 @@ EXPORT_SYMBOL(__kmalloc);
 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 {
 	unsigned long flags;
+	int this_cpu;
 
-	local_irq_save(flags);
+	slab_irq_save(flags, this_cpu);
 	debug_check_no_locks_freed(objp, obj_size(cachep));
-	__cache_free(cachep, objp);
-	local_irq_restore(flags);
+	__cache_free(cachep, objp, &this_cpu);
+	slab_irq_restore(flags, this_cpu);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
@@ -3782,15 +3914,16 @@ void kfree(const void *objp)
 {
 	struct kmem_cache *c;
 	unsigned long flags;
+	int this_cpu;
 
 	if (unlikely(ZERO_OR_NULL_PTR(objp)))
 		return;
-	local_irq_save(flags);
+	slab_irq_save(flags, this_cpu);
 	kfree_debugcheck(objp);
 	c = virt_to_cache(objp);
 	debug_check_no_locks_freed(objp, obj_size(c));
-	__cache_free(c, (void *)objp);
-	local_irq_restore(flags);
+	__cache_free(c, (void *)objp, &this_cpu);
+	slab_irq_restore(flags, this_cpu);
 }
 EXPORT_SYMBOL(kfree);
 
@@ -3811,7 +3944,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
  */
 static int alloc_kmemlist(struct kmem_cache *cachep)
 {
-	int node;
+	int node, this_cpu;
 	struct kmem_list3 *l3;
 	struct array_cache *new_shared;
 	struct array_cache **new_alien = NULL;
@@ -3839,11 +3972,11 @@ static int alloc_kmemlist(struct kmem_ca
 		if (l3) {
 			struct array_cache *shared = l3->shared;
 
-			spin_lock_irq(&l3->list_lock);
+			slab_spin_lock_irq(&l3->list_lock, this_cpu);
 
 			if (shared)
 				free_block(cachep, shared->entry,
-						shared->avail, node);
+					   shared->avail, node, &this_cpu);
 
 			l3->shared = new_shared;
 			if (!l3->alien) {
@@ -3852,7 +3985,7 @@ static int alloc_kmemlist(struct kmem_ca
 			}
 			l3->free_limit = (1 + nr_cpus_node(node)) *
 					cachep->batchcount + cachep->num;
-			spin_unlock_irq(&l3->list_lock);
+			slab_spin_unlock_irq(&l3->list_lock, this_cpu);
 			kfree(shared);
 			free_alien_cache(new_alien);
 			continue;
@@ -3899,42 +4032,50 @@ struct ccupdate_struct {
 	struct array_cache *new[NR_CPUS];
 };
 
-static void do_ccupdate_local(void *info)
+static void __do_ccupdate_local(void *info, int this_cpu)
 {
 	struct ccupdate_struct *new = info;
 	struct array_cache *old;
 
 	check_irq_off();
-	old = cpu_cache_get(new->cachep);
+	old = cpu_cache_get(new->cachep, this_cpu);
+
+	new->cachep->array[this_cpu] = new->new[this_cpu];
+	new->new[this_cpu] = old;
+}
 
-	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
-	new->new[smp_processor_id()] = old;
+#ifdef CONFIG_PREEMPT_RT
+static void do_ccupdate_local(void *arg, int this_cpu)
+{
+	__do_ccupdate_local(arg, this_cpu);
 }
+#else
+static void do_ccupdate_local(void *arg)
+{
+	__do_ccupdate_local(arg, smp_processor_id());
+}
+#endif
 
 /* Always called with the cache_chain_mutex held */
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 				int batchcount, int shared)
 {
-	struct ccupdate_struct *new;
-	int i;
-
-	new = kzalloc(sizeof(*new), GFP_KERNEL);
-	if (!new)
-		return -ENOMEM;
+	struct ccupdate_struct new;
+	int i, this_cpu;
 
+	memset(&new.new, 0, sizeof(new.new));
 	for_each_online_cpu(i) {
-		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
+		new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
 						batchcount);
-		if (!new->new[i]) {
+		if (!new.new[i]) {
 			for (i--; i >= 0; i--)
-				kfree(new->new[i]);
-			kfree(new);
+				kfree(new.new[i]);
 			return -ENOMEM;
 		}
 	}
-	new->cachep = cachep;
+	new.cachep = cachep;
 
-	on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
+	slab_on_each_cpu(do_ccupdate_local, (void *)&new);
 
 	check_irq_on();
 	cachep->batchcount = batchcount;
@@ -3942,15 +4083,15 @@ static int do_tune_cpucache(struct kmem_
 	cachep->shared = shared;
 
 	for_each_online_cpu(i) {
-		struct array_cache *ccold = new->new[i];
+		struct array_cache *ccold = new.new[i];
 		if (!ccold)
 			continue;
-		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
-		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
-		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+		slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu);
+		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i), &this_cpu);
+		slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu);
 		kfree(ccold);
 	}
-	kfree(new);
+
 	return alloc_kmemlist(cachep);
 }
 
@@ -4012,29 +4153,31 @@ static int enable_cpucache(struct kmem_c
  * Drain an array if it contains any elements taking the l3 lock only if
  * necessary. Note that the l3 listlock also protects the array_cache
  * if drain_array() is used on the shared array.
+ * returns non-zero if some work is done
  */
-void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
-			 struct array_cache *ac, int force, int node)
+int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+		 struct array_cache *ac, int force, int node)
 {
-	int tofree;
+	int tofree, this_cpu;
 
 	if (!ac || !ac->avail)
-		return;
+		return 0;
 	if (ac->touched && !force) {
 		ac->touched = 0;
 	} else {
-		spin_lock_irq(&l3->list_lock);
+		slab_spin_lock_irq(&l3->list_lock, this_cpu);
 		if (ac->avail) {
 			tofree = force ? ac->avail : (ac->limit + 4) / 5;
 			if (tofree > ac->avail)
 				tofree = (ac->avail + 1) / 2;
-			free_block(cachep, ac->entry, tofree, node);
+			free_block(cachep, ac->entry, tofree, node, &this_cpu);
 			ac->avail -= tofree;
 			memmove(ac->entry, &(ac->entry[tofree]),
 				sizeof(void *) * ac->avail);
 		}
-		spin_unlock_irq(&l3->list_lock);
+		slab_spin_unlock_irq(&l3->list_lock, this_cpu);
 	}
+	return 1;
 }
 
 /**
@@ -4051,11 +4194,12 @@ void drain_array(struct kmem_cache *cach
  */
 static void cache_reap(struct work_struct *w)
 {
+	int this_cpu = raw_smp_processor_id(), node = cpu_to_node(this_cpu);
 	struct kmem_cache *searchp;
 	struct kmem_list3 *l3;
-	int node = numa_node_id();
 	struct delayed_work *work =
 		container_of(w, struct delayed_work, work);
+	int work_done = 0;
 
 	if (!mutex_trylock(&cache_chain_mutex))
 		/* Give up. Setup the next iteration. */
@@ -4071,9 +4215,12 @@ static void cache_reap(struct work_struc
 		 */
 		l3 = searchp->nodelists[node];
 
-		reap_alien(searchp, l3);
+		work_done += reap_alien(searchp, l3, &this_cpu);
+
+		node = cpu_to_node(this_cpu);
 
-		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
+		work_done += drain_array(searchp, l3,
+			    cpu_cache_get(searchp, this_cpu), 0, node);
 
 		/*
 		 * These are racy checks but it does not matter
@@ -4084,7 +4231,7 @@ static void cache_reap(struct work_struc
 
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
 
-		drain_array(searchp, l3, l3->shared, 0, node);
+		work_done += drain_array(searchp, l3, l3->shared, 0, node);
 
 		if (l3->free_touched)
 			l3->free_touched = 0;
@@ -4103,7 +4250,8 @@ next:
 	next_reap_node();
 out:
 	/* Set up the next iteration */
-	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
+	schedule_delayed_work(work,
+		round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC));
 }
 
 #ifdef CONFIG_SLABINFO
@@ -4162,7 +4310,7 @@ static int s_show(struct seq_file *m, vo
 	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
 	const char *name;
 	char *error = NULL;
-	int node;
+	int this_cpu, node;
 	struct kmem_list3 *l3;
 
 	active_objs = 0;
@@ -4173,7 +4321,7 @@ static int s_show(struct seq_file *m, vo
 			continue;
 
 		check_irq_on();
-		spin_lock_irq(&l3->list_lock);
+		slab_spin_lock_irq(&l3->list_lock, this_cpu);
 
 		list_for_each_entry(slabp, &l3->slabs_full, list) {
 			if (slabp->inuse != cachep->num && !error)
@@ -4198,7 +4346,7 @@ static int s_show(struct seq_file *m, vo
 		if (l3->shared)
 			shared_avail += l3->shared->avail;
 
-		spin_unlock_irq(&l3->list_lock);
+		slab_spin_unlock_irq(&l3->list_lock, this_cpu);
 	}
 	num_slabs += active_slabs;
 	num_objs = num_slabs * cachep->num;
@@ -4394,7 +4542,7 @@ static int leaks_show(struct seq_file *m
 	struct kmem_list3 *l3;
 	const char *name;
 	unsigned long *n = m->private;
-	int node;
+	int node, this_cpu;
 	int i;
 
 	if (!(cachep->flags & SLAB_STORE_USER))
@@ -4412,13 +4560,13 @@ static int leaks_show(struct seq_file *m
 			continue;
 
 		check_irq_on();
-		spin_lock_irq(&l3->list_lock);
+		slab_spin_lock_irq(&l3->list_lock, this_cpu);
 
 		list_for_each_entry(slabp, &l3->slabs_full, list)
 			handle_slab(n, cachep, slabp);
 		list_for_each_entry(slabp, &l3->slabs_partial, list)
 			handle_slab(n, cachep, slabp);
-		spin_unlock_irq(&l3->list_lock);
+		slab_spin_unlock_irq(&l3->list_lock, this_cpu);
 	}
 	name = cachep->name;
 	if (n[0] == n[1]) {
Index: linux-2.6.25.4-rt4/mm/page_alloc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/page_alloc.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/page_alloc.c	2008-05-29 09:47:14.000000000 -0400
@@ -161,6 +161,53 @@ static unsigned long __meminitdata dma_r
   EXPORT_SYMBOL(movable_zone);
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
+#endif
+
+static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	spin_lock(&__get_cpu_lock(pcp_locks, cpu));
+	flags = 0;
+#else
+	local_irq_save(*flags);
+#endif
+}
+
+static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	(void)get_cpu_var_locked(pcp_locks, this_cpu);
+	flags = 0;
+#else
+	local_irq_save(*flags);
+	*this_cpu = smp_processor_id();
+#endif
+}
+
+static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	put_cpu_var_locked(pcp_locks, this_cpu);
+#else
+	local_irq_restore(flags);
+#endif
+}
+
+static struct per_cpu_pageset *
+get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
+{
+	lock_cpu_pcp(flags, this_cpu);
+	return zone_pcp(zone, *this_cpu);
+}
+
+static void
+put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
+{
+	unlock_cpu_pcp(flags, this_cpu);
+}
+
 #if MAX_NUMNODES > 1
 int nr_node_ids __read_mostly = MAX_NUMNODES;
 EXPORT_SYMBOL(nr_node_ids);
@@ -524,8 +571,9 @@ static void free_one_page(struct zone *z
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
-	int i;
 	int reserved = 0;
+	int this_cpu;
+	int i;
 
 	for (i = 0 ; i < (1 << order) ; ++i)
 		reserved += free_pages_check(page + i);
@@ -537,10 +585,10 @@ static void __free_pages_ok(struct page 
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
-	local_irq_save(flags);
-	__count_vm_events(PGFREE, 1 << order);
+	lock_cpu_pcp(&flags, &this_cpu);
+	count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, order);
-	local_irq_restore(flags);
+	unlock_cpu_pcp(flags, this_cpu);
 }
 
 /*
@@ -888,15 +936,16 @@ void drain_zone_pages(struct zone *zone,
 {
 	unsigned long flags;
 	int to_drain;
+	int this_cpu;
 
-	local_irq_save(flags);
+	lock_cpu_pcp(&flags, &this_cpu);
 	if (pcp->count >= pcp->batch)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
 	free_pages_bulk(zone, to_drain, &pcp->list, 0);
 	pcp->count -= to_drain;
-	local_irq_restore(flags);
+	unlock_cpu_pcp(flags, this_cpu);
 }
 #endif
 
@@ -920,12 +969,15 @@ static void drain_pages(unsigned int cpu
 			continue;
 
 		pset = zone_pcp(zone, cpu);
-
+		if (!pset) {
+			WARN_ON(1);
+			continue;
+		}
 		pcp = &pset->pcp;
-		local_irq_save(flags);
+		lock_cpu_pcp(&flags, &cpu);
 		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
 		pcp->count = 0;
-		local_irq_restore(flags);
+		unlock_cpu_pcp(flags, cpu);
 	}
 }
 
@@ -942,7 +994,40 @@ void drain_local_pages(void *arg)
  */
 void drain_all_pages(void)
 {
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * HACK!!!!!
+	 *  For RT we can't use IPIs to run drain_local_pages, since
+	 *  that code will call spin_locks that will now sleep.
+	 *  But, schedule_on_each_cpu will call kzalloc, which will
+	 *  call page_alloc which was what calls this.
+	 *
+	 *  Luckily, there's a condition to get here, and that is if
+	 *  the order passed in to alloc_pages is greater than 0
+	 *  (alloced more than a page size).  The slabs only allocate
+	 *  what is needed, and the allocation made by schedule_on_each_cpu
+	 *  does an alloc of "sizeof(void *)*nr_cpu_ids".
+	 *
+	 *  So we can safely call schedule_on_each_cpu if that number
+	 *  is less than a page. Otherwise don't bother. At least warn of
+	 *  this issue.
+	 *
+	 * And yes, this is one big hack.  Please fix ;-)
+	 */
+	if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
+		schedule_on_each_cpu(drain_local_pages, NULL, 0, 1);
+	else {
+		static int once;
+		if (!once) {
+			printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
+			once = 1;
+		}
+		drain_local_pages(NULL);
+	}
+
+#else
 	on_each_cpu(drain_local_pages, NULL, 0, 1);
+#endif
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -987,8 +1072,10 @@ void mark_free_pages(struct zone *zone)
 static void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
+	struct per_cpu_pageset *pset;
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
+	int this_cpu;
 
 	if (PageAnon(page))
 		page->mapping = NULL;
@@ -1000,9 +1087,11 @@ static void free_hot_cold_page(struct pa
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp;
-	local_irq_save(flags);
-	__count_vm_event(PGFREE);
+	pset = get_zone_pcp(zone, &flags, &this_cpu);
+	pcp = &pset->pcp;
+
+	count_vm_event(PGFREE);
+
 	if (cold)
 		list_add_tail(&page->lru, &pcp->list);
 	else
@@ -1013,8 +1102,7 @@ static void free_hot_cold_page(struct pa
 		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 		pcp->count -= pcp->batch;
 	}
-	local_irq_restore(flags);
-	put_cpu();
+	put_zone_pcp(zone, flags, this_cpu);
 }
 
 void free_hot_page(struct page *page)
@@ -1056,16 +1144,15 @@ static struct page *buffered_rmqueue(str
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
+	struct per_cpu_pageset *pset;
 	int migratetype = allocflags_to_migratetype(gfp_flags);
+	int this_cpu;
 
 again:
-	cpu  = get_cpu();
+	pset = get_zone_pcp(zone, &flags, &this_cpu);
 	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
+		struct per_cpu_pages *pcp = &pset->pcp;
 
-		pcp = &zone_pcp(zone, cpu)->pcp;
-		local_irq_save(flags);
 		if (!pcp->count) {
 			pcp->count = rmqueue_bulk(zone, 0,
 					pcp->batch, &pcp->list, migratetype);
@@ -1094,7 +1181,7 @@ again:
 		list_del(&page->lru);
 		pcp->count--;
 	} else {
-		spin_lock_irqsave(&zone->lock, flags);
+		spin_lock(&zone->lock);
 		page = __rmqueue(zone, order, migratetype);
 		spin_unlock(&zone->lock);
 		if (!page)
@@ -1103,8 +1190,7 @@ again:
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
-	local_irq_restore(flags);
-	put_cpu();
+	put_zone_pcp(zone, flags, this_cpu);
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1112,8 +1198,7 @@ again:
 	return page;
 
 failed:
-	local_irq_restore(flags);
-	put_cpu();
+	put_zone_pcp(zone, flags, this_cpu);
 	return NULL;
 }
 
Index: linux-2.6.25.4-rt4/include/linux/smp.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/smp.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/smp.h	2008-05-29 09:47:05.000000000 -0400
@@ -7,6 +7,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/cpumask.h>
 
 extern void cpu_idle(void);
 
@@ -33,6 +34,24 @@ extern void smp_send_stop(void);
  */
 extern void smp_send_reschedule(int cpu);
 
+/*
+ * trigger a reschedule on all other CPUs:
+ */
+extern void smp_send_reschedule_allbutself(void);
+
+/*
+ * trigger a reschedule on all other CPUs:
+ */
+extern void smp_send_reschedule_allbutself(void);
+
+#ifdef HAVE_RESCHEDULE_ALLBUTSELF_CPUMASK
+extern void smp_send_reschedule_allbutself_cpumask(cpumask_t);
+#else
+static inline void smp_send_reschedule_allbutself_cpumask(cpumask_t mask) {
+	smp_send_reschedule_allbutself();
+}
+#endif
+
 
 /*
  * Prepare machine for booting other CPUs.
@@ -100,6 +119,8 @@ static inline int up_smp_call_function(v
 		0;				\
 	})
 static inline void smp_send_reschedule(int cpu) { }
+static inline void smp_send_reschedule_allbutself(void) { }
+static inline void smp_send_reschedule_allbutself_cpumask(cpumask_t mask) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
 #define smp_call_function_single(cpuid, func, info, retry, wait) \
@@ -139,7 +160,7 @@ static inline void smp_send_reschedule(i
 
 #define get_cpu()		({ preempt_disable(); smp_processor_id(); })
 #define put_cpu()		preempt_enable()
-#define put_cpu_no_resched()	preempt_enable_no_resched()
+#define put_cpu_no_resched()	__preempt_enable_no_resched()
 
 void smp_setup_processor_id(void);
 
Index: linux-2.6.25.4-rt4/kernel/stop_machine.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/stop_machine.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/stop_machine.c	2008-05-29 09:46:43.000000000 -0400
@@ -62,7 +62,7 @@ static int stopmachine(void *cpu)
 		/* Yield in first stage: migration threads need to
 		 * help our sisters onto their CPUs. */
 		if (!prepared && !irqs_disabled)
-			yield();
+			__yield();
 		else
 			cpu_relax();
 	}
@@ -108,7 +108,7 @@ static int stop_machine(void)
 
 	/* Wait for them all to come to life. */
 	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-		yield();
+		__yield();
 
 	/* If some failed, kill them all. */
 	if (ret < 0) {
@@ -132,7 +132,7 @@ static void restart_machine(void)
 {
 	stopmachine_set_state(STOPMACHINE_EXIT);
 	local_irq_enable();
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 }
 
 struct stop_machine_data
Index: linux-2.6.25.4-rt4/net/ipv4/tcp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/ipv4/tcp.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/net/ipv4/tcp.c	2008-05-29 09:46:25.000000000 -0400
@@ -1307,11 +1307,11 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
 		    !sysctl_tcp_low_latency &&
 		    __get_cpu_var(softnet_data).net_dma) {
-			preempt_enable_no_resched();
+			preempt_enable();
 			tp->ucopy.pinned_list =
 					dma_pin_iovec_pages(msg->msg_iov, len);
 		} else {
-			preempt_enable_no_resched();
+			preempt_enable();
 		}
 	}
 #endif
Index: linux-2.6.25.4-rt4/net/ipv4/route.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/ipv4/route.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/net/ipv4/route.c	2008-05-29 09:46:49.000000000 -0400
@@ -207,13 +207,13 @@ struct rt_hash_bucket {
 	struct rtable	*chain;
 };
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
+	defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT)
 /*
  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
  * The size of this table is a power of two and depends on the number of CPUS.
  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
  */
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
 # define RT_HASH_LOCK_SZ	256
 #else
 # if NR_CPUS >= 32
@@ -245,7 +245,7 @@ static __init void rt_hash_lock_init(voi
 		spin_lock_init(&rt_hash_locks[i]);
 }
 #else
-# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_addr(slot) ((spinlock_t *)NULL)
 
 static inline void rt_hash_lock_init(void)
 {
Index: linux-2.6.25.4-rt4/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/acpi/processor_idle.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/acpi/processor_idle.c	2008-05-29 09:46:44.000000000 -0400
@@ -216,7 +216,7 @@ static void acpi_safe_halt(void)
 	 * test NEED_RESCHED:
 	 */
 	smp_mb();
-	if (!need_resched()) {
+	if (!need_resched() || !need_resched_delayed()) {
 		safe_halt();
 		local_irq_disable();
 	}
@@ -1485,7 +1485,7 @@ static int acpi_idle_enter_simple(struct
 	 */
 	smp_mb();
 
-	if (unlikely(need_resched())) {
+	if (unlikely(need_resched() || need_resched_delayed())) {
 		current_thread_info()->status |= TS_POLLING;
 		local_irq_enable();
 		return 0;
@@ -1530,7 +1530,7 @@ static int acpi_idle_enter_simple(struct
 }
 
 static int c3_cpu_count;
-static DEFINE_SPINLOCK(c3_lock);
+static DEFINE_RAW_SPINLOCK(c3_lock);
 
 /**
  * acpi_idle_enter_bm - enters C3 with proper BM handling
@@ -1574,7 +1574,7 @@ static int acpi_idle_enter_bm(struct cpu
 	 */
 	smp_mb();
 
-	if (unlikely(need_resched())) {
+	if (unlikely(need_resched() || need_resched_delayed())) {
 		current_thread_info()->status |= TS_POLLING;
 		local_irq_enable();
 		return 0;
Index: linux-2.6.25.4-rt4/drivers/input/ff-memless.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/input/ff-memless.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/input/ff-memless.c	2008-05-29 09:46:26.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/input.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/jiffies.h>
 
Index: linux-2.6.25.4-rt4/fs/proc/array.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/proc/array.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/proc/array.c	2008-05-29 09:46:26.000000000 -0400
@@ -136,12 +136,13 @@ static inline void task_name(struct seq_
  */
 static const char *task_state_array[] = {
 	"R (running)",		/*  0 */
-	"S (sleeping)",		/*  1 */
-	"D (disk sleep)",	/*  2 */
-	"T (stopped)",		/*  4 */
-	"T (tracing stop)",	/*  8 */
-	"Z (zombie)",		/* 16 */
-	"X (dead)"		/* 32 */
+	"M (running-mutex)",	/*  1 */
+	"S (sleeping)",		/*  2 */
+	"D (disk sleep)",	/*  4 */
+	"T (stopped)",		/*  8 */
+	"T (tracing stop)",	/* 16 */
+	"Z (zombie)",		/* 32 */
+	"X (dead)"		/* 64 */
 };
 
 static inline const char *get_task_state(struct task_struct *tsk)
@@ -308,6 +309,19 @@ static inline void task_context_switch_c
 			p->nivcsw);
 }
 
+#define get_blocked_on(t)	(-1)
+
+static inline void show_blocked_on(struct seq_file *m, struct task_struct *p)
+{
+	pid_t pid = get_blocked_on(p);
+
+	if (pid < 0)
+		return;
+
+	seq_printf(m, "BlckOn: %d\n", pid);
+}
+
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -327,6 +341,7 @@ int proc_pid_status(struct seq_file *m, 
 	task_show_regs(m, task);
 #endif
 	task_context_switch_counts(m, task);
+	show_blocked_on(m, task);
 	return 0;
 }
 
Index: linux-2.6.25.4-rt4/include/linux/bit_spinlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/bit_spinlock.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/bit_spinlock.h	2008-05-29 09:46:26.000000000 -0400
@@ -1,6 +1,8 @@
 #ifndef __LINUX_BIT_SPINLOCK_H
 #define __LINUX_BIT_SPINLOCK_H
 
+#if 0
+
 /*
  *  bit-based spin_lock()
  *
@@ -91,5 +93,7 @@ static inline int bit_spin_is_locked(int
 #endif
 }
 
+#endif
+
 #endif /* __LINUX_BIT_SPINLOCK_H */
 
Index: linux-2.6.25.4-rt4/include/linux/mutex.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/mutex.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/mutex.h	2008-05-29 09:47:03.000000000 -0400
@@ -12,11 +12,82 @@
 
 #include <linux/list.h>
 #include <linux/spinlock_types.h>
+#include <linux/rt_lock.h>
 #include <linux/linkage.h>
 #include <linux/lockdep.h>
 
 #include <asm/atomic.h>
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
+		, .dep_map = { .name = #lockname }
+#else
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+
+#include <linux/rtmutex.h>
+
+struct mutex {
+	struct rt_mutex		lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+
+#define __MUTEX_INITIALIZER(mutexname)					\
+	{								\
+		.lock = __RT_MUTEX_INITIALIZER(mutexname.lock)		\
+		__DEP_MAP_MUTEX_INITIALIZER(mutexname)			\
+	}
+
+#define DEFINE_MUTEX(mutexname)						\
+	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
+
+extern void
+_mutex_init(struct mutex *lock, char *name, struct lock_class_key *key);
+
+extern void __lockfunc _mutex_lock(struct mutex *lock);
+extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
+extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
+extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
+extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
+extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
+extern int __lockfunc _mutex_trylock(struct mutex *lock);
+extern void __lockfunc _mutex_unlock(struct mutex *lock);
+
+#define mutex_is_locked(l)		rt_mutex_is_locked(&(l)->lock)
+#define mutex_lock(l)			_mutex_lock(l)
+#define mutex_lock_interruptible(l)	_mutex_lock_interruptible(l)
+#define mutex_lock_killable(l)		_mutex_lock_killable(l)
+#define mutex_trylock(l)		_mutex_trylock(l)
+#define mutex_unlock(l)			_mutex_unlock(l)
+#define mutex_destroy(l)		rt_mutex_destroy(&(l)->lock)
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define mutex_lock_nested(l, s)	_mutex_lock_nested(l, s)
+# define mutex_lock_interruptible_nested(l, s) \
+					_mutex_lock_interruptible_nested(l, s)
+# define mutex_lock_killable_nested(l, s) \
+					_mutex_lock_killable_nested(l, s)
+#else
+# define mutex_lock_nested(l, s)	_mutex_lock(l)
+# define mutex_lock_interruptible_nested(l, s) \
+					_mutex_lock_interruptible(l)
+# define mutex_lock_killable_nested(l, s) \
+					_mutex_lock_killable(l)
+#endif
+
+# define mutex_init(mutex)				\
+do {							\
+	static struct lock_class_key __key;		\
+							\
+	_mutex_init((mutex), #mutex, &__key);		\
+} while (0)
+
+#else
 /*
  * Simple, straightforward mutexes with strict semantics:
  *
@@ -86,13 +157,6 @@ do {							\
 # define mutex_destroy(mutex)				do { } while (0)
 #endif
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
-		, .dep_map = { .name = #lockname }
-#else
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
-#endif
-
 #define __MUTEX_INITIALIZER(lockname) \
 		{ .count = ATOMIC_INIT(1) \
 		, .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
@@ -149,3 +213,4 @@ extern int mutex_trylock(struct mutex *l
 extern void mutex_unlock(struct mutex *lock);
 
 #endif
+#endif
Index: linux-2.6.25.4-rt4/include/linux/plist.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/plist.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/plist.h	2008-05-29 09:47:11.000000000 -0400
@@ -81,7 +81,7 @@ struct plist_head {
 	struct list_head prio_list;
 	struct list_head node_list;
 #ifdef CONFIG_DEBUG_PI_LIST
-	spinlock_t *lock;
+	raw_spinlock_t *lock;
 #endif
 };
 
@@ -99,13 +99,13 @@ struct plist_node {
 /**
  * PLIST_HEAD_INIT - static struct plist_head initializer
  * @head:	struct plist_head variable name
- * @_lock:	lock to initialize for this list
+ * @_lock:	lock * to initialize for this list
  */
 #define PLIST_HEAD_INIT(head, _lock)			\
 {							\
 	.prio_list = LIST_HEAD_INIT((head).prio_list),	\
 	.node_list = LIST_HEAD_INIT((head).node_list),	\
-	PLIST_HEAD_LOCK_INIT(&(_lock))			\
+	PLIST_HEAD_LOCK_INIT(_lock)			\
 }
 
 /**
@@ -125,7 +125,7 @@ struct plist_node {
  * @lock:	list spinlock, remembered for debugging
  */
 static inline void
-plist_head_init(struct plist_head *head, spinlock_t *lock)
+plist_head_init(struct plist_head *head, raw_spinlock_t *lock)
 {
 	INIT_LIST_HEAD(&head->prio_list);
 	INIT_LIST_HEAD(&head->node_list);
@@ -148,6 +148,8 @@ static inline void plist_node_init(struc
 extern void plist_add(struct plist_node *node, struct plist_head *head);
 extern void plist_del(struct plist_node *node, struct plist_head *head);
 
+extern void plist_head_splice(struct plist_head *src, struct plist_head *dst);
+
 /**
  * plist_for_each - iterate over the plist
  * @pos:	the type * to use as a loop counter
Index: linux-2.6.25.4-rt4/include/linux/rt_lock.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/linux/rt_lock.h	2008-05-29 09:47:26.000000000 -0400
@@ -0,0 +1,282 @@
+#ifndef __LINUX_RT_LOCK_H
+#define __LINUX_RT_LOCK_H
+
+/*
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains the main data structure definitions.
+ */
+#include <linux/rtmutex.h>
+#include <asm/atomic.h>
+#include <linux/spinlock_types.h>
+#include <linux/sched_prio.h>
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * spinlocks - an RT mutex plus lock-break field:
+ */
+typedef struct {
+	struct rt_mutex		lock;
+	unsigned int		break_lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+} spinlock_t;
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define __RT_SPIN_INITIALIZER(name)					\
+	{ .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),		\
+	  .save_state = 1,						\
+	  .file = __FILE__,						\
+	  .line = __LINE__, }
+#else
+# define __RT_SPIN_INITIALIZER(name)					\
+	{ .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) }
+#endif
+
+#define __SPIN_LOCK_UNLOCKED(name) (spinlock_t)				\
+	{ .lock = __RT_SPIN_INITIALIZER(name),				\
+	  SPIN_DEP_MAP_INIT(name) }
+
+#else /* !PREEMPT_RT */
+
+typedef raw_spinlock_t spinlock_t;
+
+#define __SPIN_LOCK_UNLOCKED	_RAW_SPIN_LOCK_UNLOCKED
+
+#endif
+
+#define SPIN_LOCK_UNLOCKED	__SPIN_LOCK_UNLOCKED(spin_old_style)
+
+
+#define __DEFINE_SPINLOCK(name) \
+	spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_SPINLOCK(name) \
+	spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
+
+#ifdef CONFIG_PREEMPT_RT
+
+struct rw_mutex {
+	struct task_struct	*owner;
+	struct rt_mutex		mutex;
+	atomic_t		count;	/* number of times held for read */
+	atomic_t		owners; /* number of owners as readers */
+	struct list_head	readers;
+	int prio;
+};
+
+/*
+ * RW-semaphores are a spinlock plus a reader-depth count.
+ *
+ * Note that the semantics are different from the usual
+ * Linux rw-sems, in PREEMPT_RT mode we do not allow
+ * multiple readers to hold the lock at once, we only allow
+ * a read-lock owner to read-lock recursively. This is
+ * better for latency, makes the implementation inherently
+ * fair and makes it simpler as well:
+ */
+struct rw_semaphore {
+	struct rw_mutex	owners;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+/*
+ * rwlocks - an RW semaphore plus lock-break field:
+ */
+typedef struct {
+	struct rw_mutex	owners;
+	unsigned int		break_lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+} rwlock_t;
+
+#define __RW_LOCK_UNLOCKED(name) (rwlock_t) \
+	{ .owners.mutex = __RT_SPIN_INITIALIZER(name.owners.mutex),	\
+	  .owners.prio = MAX_PRIO,					\
+	  RW_DEP_MAP_INIT(name) }
+#else /* !PREEMPT_RT */
+
+typedef raw_rwlock_t rwlock_t;
+
+#define __RW_LOCK_UNLOCKED	_RAW_RW_LOCK_UNLOCKED
+
+#endif
+
+#define RW_LOCK_UNLOCKED	__RW_LOCK_UNLOCKED(rw_old_style)
+
+
+#define DEFINE_RWLOCK(name) \
+	rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
+
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * Semaphores - a spinlock plus the semaphore count:
+ */
+struct semaphore {
+	atomic_t		count;
+	struct rt_mutex		lock;
+};
+
+#define DECLARE_MUTEX(name) \
+struct semaphore name = \
+	{ .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) }
+
+extern void
+__sema_init(struct semaphore *sem, int val, char *name, char *file, int line);
+
+#define rt_sema_init(sem, val) \
+		__sema_init(sem, val, #sem, __FILE__, __LINE__)
+
+extern void
+__init_MUTEX(struct semaphore *sem, char *name, char *file, int line);
+#define rt_init_MUTEX(sem) \
+		__init_MUTEX(sem, #sem, __FILE__, __LINE__)
+
+extern void there_is_no_init_MUTEX_LOCKED_for_RT_semaphores(void);
+
+/*
+ * No locked initialization for RT semaphores
+ */
+#define rt_init_MUTEX_LOCKED(sem) \
+		there_is_no_init_MUTEX_LOCKED_for_RT_semaphores()
+extern void  rt_down(struct semaphore *sem);
+extern int  rt_down_interruptible(struct semaphore *sem);
+extern int  rt_down_trylock(struct semaphore *sem);
+extern void  rt_up(struct semaphore *sem);
+
+#define rt_sem_is_locked(s)	rt_mutex_is_locked(&(s)->lock)
+#define rt_sema_count(s)	atomic_read(&(s)->count)
+
+extern int __bad_func_type(void);
+
+#include <linux/pickop.h>
+
+/*
+ * PICK_SEM_OP() is a small redirector to allow less typing of the lock
+ * types struct compat_semaphore, struct semaphore, at the front of the
+ * PICK_FUNCTION macro.
+ */
+#define PICK_SEM_OP(...) PICK_FUNCTION(struct compat_semaphore *,	\
+	struct semaphore *, ##__VA_ARGS__)
+#define PICK_SEM_OP_RET(...) PICK_FUNCTION_RET(struct compat_semaphore *,\
+	struct semaphore *, ##__VA_ARGS__)
+
+#define sema_init(sem, val) \
+	PICK_SEM_OP(compat_sema_init, rt_sema_init, sem, val)
+
+#define init_MUTEX(sem) PICK_SEM_OP(compat_init_MUTEX, rt_init_MUTEX, sem)
+
+#define init_MUTEX_LOCKED(sem) \
+	PICK_SEM_OP(compat_init_MUTEX_LOCKED, rt_init_MUTEX_LOCKED, sem)
+
+#define down(sem) PICK_SEM_OP(compat_down, rt_down, sem)
+
+#define down_interruptible(sem) \
+	PICK_SEM_OP_RET(compat_down_interruptible, rt_down_interruptible, sem)
+
+#define down_trylock(sem) \
+	PICK_SEM_OP_RET(compat_down_trylock, rt_down_trylock, sem)
+
+#define up(sem) PICK_SEM_OP(compat_up, rt_up, sem)
+
+#define sem_is_locked(sem) \
+	PICK_SEM_OP_RET(compat_sem_is_locked, rt_sem_is_locked, sem)
+
+#define sema_count(sem) PICK_SEM_OP_RET(compat_sema_count, rt_sema_count, sem)
+
+/*
+ * rwsems:
+ */
+
+#define __RWSEM_INITIALIZER(name) \
+	{ .owners.mutex = __RT_MUTEX_INITIALIZER(name.owners.mutex),	\
+	  .owners.prio = MAX_PRIO,					\
+	  RW_DEP_MAP_INIT(name) }
+
+#define DECLARE_RWSEM(lockname) \
+	struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
+
+extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, char *name,
+				     struct lock_class_key *key);
+
+# define rt_init_rwsem(sem)				\
+do {							\
+	static struct lock_class_key __key;		\
+							\
+	__rt_rwsem_init((sem), #sem, &__key);		\
+} while (0)
+
+extern void __dont_do_this_in_rt(struct rw_semaphore *rwsem);
+
+#define rt_down_read_non_owner(rwsem)	__dont_do_this_in_rt(rwsem)
+#define rt_up_read_non_owner(rwsem)	__dont_do_this_in_rt(rwsem)
+
+extern void  rt_down_write(struct rw_semaphore *rwsem);
+extern void
+rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
+extern void
+rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
+extern void  rt_down_read(struct rw_semaphore *rwsem);
+extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
+extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
+extern void  rt_up_read(struct rw_semaphore *rwsem);
+extern void  rt_up_write(struct rw_semaphore *rwsem);
+extern void  rt_downgrade_write(struct rw_semaphore *rwsem);
+
+# define rt_rwsem_is_locked(rws)	((rws)->owners.owner != NULL)
+
+#define PICK_RWSEM_OP(...) PICK_FUNCTION(struct compat_rw_semaphore *,	\
+	struct rw_semaphore *, ##__VA_ARGS__)
+#define PICK_RWSEM_OP_RET(...) PICK_FUNCTION_RET(struct compat_rw_semaphore *,\
+	struct rw_semaphore *, ##__VA_ARGS__)
+
+#define init_rwsem(rwsem) PICK_RWSEM_OP(compat_init_rwsem, rt_init_rwsem, rwsem)
+
+#define down_read(rwsem) PICK_RWSEM_OP(compat_down_read, rt_down_read, rwsem)
+
+#define down_read_non_owner(rwsem) \
+	PICK_RWSEM_OP(compat_down_read_non_owner, rt_down_read_non_owner, rwsem)
+
+#define down_read_trylock(rwsem) \
+	PICK_RWSEM_OP_RET(compat_down_read_trylock, rt_down_read_trylock, rwsem)
+
+#define down_write(rwsem) PICK_RWSEM_OP(compat_down_write, rt_down_write, rwsem)
+
+#define down_read_nested(rwsem, subclass) \
+	PICK_RWSEM_OP(compat_down_read_nested, rt_down_read_nested,	\
+		rwsem, subclass)
+
+#define down_write_nested(rwsem, subclass) \
+	PICK_RWSEM_OP(compat_down_write_nested, rt_down_write_nested,	\
+		rwsem, subclass)
+
+#define down_write_trylock(rwsem) \
+	PICK_RWSEM_OP_RET(compat_down_write_trylock, rt_down_write_trylock,\
+		rwsem)
+
+#define up_read(rwsem) PICK_RWSEM_OP(compat_up_read, rt_up_read, rwsem)
+
+#define up_read_non_owner(rwsem) \
+	PICK_RWSEM_OP(compat_up_read_non_owner, rt_up_read_non_owner, rwsem)
+
+#define up_write(rwsem) PICK_RWSEM_OP(compat_up_write, rt_up_write, rwsem)
+
+#define downgrade_write(rwsem) \
+	PICK_RWSEM_OP(compat_downgrade_write, rt_downgrade_write, rwsem)
+
+#define rwsem_is_locked(rwsem) \
+	PICK_RWSEM_OP_RET(compat_rwsem_is_locked, rt_rwsem_is_locked, rwsem)
+
+#endif /* CONFIG_PREEMPT_RT */
+
+#endif
+
Index: linux-2.6.25.4-rt4/include/linux/rtmutex.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/rtmutex.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/rtmutex.h	2008-05-29 09:47:11.000000000 -0400
@@ -24,7 +24,7 @@
  * @owner:	the mutex owner
  */
 struct rt_mutex {
-	spinlock_t		wait_lock;
+	raw_spinlock_t		wait_lock;
 	struct plist_head	wait_list;
 	struct task_struct	*owner;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -63,8 +63,8 @@ struct hrtimer_sleeper;
 #endif
 
 #define __RT_MUTEX_INITIALIZER(mutexname) \
-	{ .wait_lock = __SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
-	, .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \
+	{ .wait_lock = RAW_SPIN_LOCK_UNLOCKED(mutexname) \
+	, .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, &mutexname.wait_lock) \
 	, .owner = NULL \
 	__DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
 
@@ -88,6 +88,8 @@ extern void rt_mutex_destroy(struct rt_m
 extern void rt_mutex_lock(struct rt_mutex *lock);
 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock,
 						int detect_deadlock);
+extern int rt_mutex_lock_killable(struct rt_mutex *lock,
+				  int detect_deadlock);
 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
 					struct hrtimer_sleeper *timeout,
 					int detect_deadlock);
@@ -98,7 +100,7 @@ extern void rt_mutex_unlock(struct rt_mu
 
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
-	.pi_waiters	= PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock),	\
+	.pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, &tsk.pi_lock),	\
 	INIT_RT_MUTEX_DEBUG(tsk)
 #else
 # define INIT_RT_MUTEXES(tsk)
Index: linux-2.6.25.4-rt4/include/linux/rwsem-spinlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/rwsem-spinlock.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/rwsem-spinlock.h	2008-05-29 09:46:26.000000000 -0400
@@ -28,7 +28,7 @@ struct rwsem_waiter;
  * - if activity is -1 then there is one active writer
  * - if wait_list is not empty, then there are processes waiting for the semaphore
  */
-struct rw_semaphore {
+struct compat_rw_semaphore {
 	__s32			activity;
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
@@ -43,33 +43,32 @@ struct rw_semaphore {
 # define __RWSEM_DEP_MAP_INIT(lockname)
 #endif
 
-#define __RWSEM_INITIALIZER(name) \
-{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \
-  __RWSEM_DEP_MAP_INIT(name) }
+#define __COMPAT_RWSEM_INITIALIZER(name) \
+{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
 
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+#define COMPAT_DECLARE_RWSEM(name) \
+	struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name)
 
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
+extern void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name,
 			 struct lock_class_key *key);
 
-#define init_rwsem(sem)						\
+#define compat_init_rwsem(sem)					\
 do {								\
 	static struct lock_class_key __key;			\
 								\
-	__init_rwsem((sem), #sem, &__key);			\
+	__compat_init_rwsem((sem), #sem, &__key);		\
 } while (0)
 
-extern void __down_read(struct rw_semaphore *sem);
-extern int __down_read_trylock(struct rw_semaphore *sem);
-extern void __down_write(struct rw_semaphore *sem);
-extern void __down_write_nested(struct rw_semaphore *sem, int subclass);
-extern int __down_write_trylock(struct rw_semaphore *sem);
-extern void __up_read(struct rw_semaphore *sem);
-extern void __up_write(struct rw_semaphore *sem);
-extern void __downgrade_write(struct rw_semaphore *sem);
+extern void __down_read(struct compat_rw_semaphore *sem);
+extern int __down_read_trylock(struct compat_rw_semaphore *sem);
+extern void __down_write(struct compat_rw_semaphore *sem);
+extern void __down_write_nested(struct compat_rw_semaphore *sem, int subclass);
+extern int __down_write_trylock(struct compat_rw_semaphore *sem);
+extern void __up_read(struct compat_rw_semaphore *sem);
+extern void __up_write(struct compat_rw_semaphore *sem);
+extern void __downgrade_write(struct compat_rw_semaphore *sem);
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
+static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem)
 {
 	return (sem->activity != 0);
 }
Index: linux-2.6.25.4-rt4/include/linux/rwsem.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/rwsem.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/rwsem.h	2008-05-29 09:46:26.000000000 -0400
@@ -9,6 +9,10 @@
 
 #include <linux/linkage.h>
 
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#endif
+
 #ifdef __KERNEL__
 
 #include <linux/types.h>
@@ -16,48 +20,59 @@
 #include <asm/system.h>
 #include <asm/atomic.h>
 
-struct rw_semaphore;
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * On !PREEMPT_RT all rw-semaphores are compat:
+ */
+#define compat_rw_semaphore rw_semaphore
+#endif
+
+struct compat_rw_semaphore;
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
-#include <linux/rwsem-spinlock.h> /* use a generic implementation */
+# include <linux/rwsem-spinlock.h> /* use a generic implementation */
+#  ifndef CONFIG_PREEMPT_RT
+#  define __RWSEM_INITIALIZER __COMPAT_RWSEM_INITIALIZER
+#  define DECLARE_RWSEM COMPAT_DECLARE_RWSEM
+# endif
 #else
-#include <asm/rwsem.h> /* use an arch-specific implementation */
+# include <asm/rwsem.h> /* use an arch-specific implementation */
 #endif
 
 /*
  * lock for reading
  */
-extern void down_read(struct rw_semaphore *sem);
+extern void compat_down_read(struct compat_rw_semaphore *sem);
 
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-extern int down_read_trylock(struct rw_semaphore *sem);
+extern int compat_down_read_trylock(struct compat_rw_semaphore *sem);
 
 /*
  * lock for writing
  */
-extern void down_write(struct rw_semaphore *sem);
+extern void compat_down_write(struct compat_rw_semaphore *sem);
 
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-extern int down_write_trylock(struct rw_semaphore *sem);
+extern int compat_down_write_trylock(struct compat_rw_semaphore *sem);
 
 /*
  * release a read lock
  */
-extern void up_read(struct rw_semaphore *sem);
+extern void compat_up_read(struct compat_rw_semaphore *sem);
 
 /*
  * release a write lock
  */
-extern void up_write(struct rw_semaphore *sem);
+extern void compat_up_write(struct compat_rw_semaphore *sem);
 
 /*
  * downgrade write lock to read lock
  */
-extern void downgrade_write(struct rw_semaphore *sem);
+extern void compat_downgrade_write(struct compat_rw_semaphore *sem);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
@@ -73,22 +88,79 @@ extern void downgrade_write(struct rw_se
  * lockdep_set_class() at lock initialization time.
  * See Documentation/lockdep-design.txt for more details.)
  */
-extern void down_read_nested(struct rw_semaphore *sem, int subclass);
-extern void down_write_nested(struct rw_semaphore *sem, int subclass);
+extern void
+compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass);
+extern void
+compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass);
 /*
  * Take/release a lock when not the owner will release it.
  *
  * [ This API should be avoided as much as possible - the
  *   proper abstraction for this case is completions. ]
  */
-extern void down_read_non_owner(struct rw_semaphore *sem);
-extern void up_read_non_owner(struct rw_semaphore *sem);
+extern void
+compat_down_read_non_owner(struct compat_rw_semaphore *sem);
+extern void
+compat_up_read_non_owner(struct compat_rw_semaphore *sem);
 #else
-# define down_read_nested(sem, subclass)		down_read(sem)
-# define down_write_nested(sem, subclass)	down_write(sem)
-# define down_read_non_owner(sem)		down_read(sem)
-# define up_read_non_owner(sem)			up_read(sem)
+# define compat_down_read_nested(sem, subclass)		compat_down_read(sem)
+# define compat_down_write_nested(sem, subclass)	compat_down_write(sem)
+# define compat_down_read_non_owner(sem)		compat_down_read(sem)
+# define compat_up_read_non_owner(sem)			compat_up_read(sem)
 #endif
 
+#ifndef CONFIG_PREEMPT_RT
+
+#define DECLARE_RWSEM COMPAT_DECLARE_RWSEM
+
+/*
+ * NOTE, lockdep: this has to be a macro, so that separate class-keys
+ * get generated by the compiler, if the same function does multiple
+ * init_rwsem() calls to different rwsems.
+ */
+#define init_rwsem(rwsem)	compat_init_rwsem(rwsem)
+
+static inline void down_read(struct compat_rw_semaphore *rwsem)
+{
+	compat_down_read(rwsem);
+}
+static inline int down_read_trylock(struct compat_rw_semaphore *rwsem)
+{
+	return compat_down_read_trylock(rwsem);
+}
+static inline void down_write(struct compat_rw_semaphore *rwsem)
+{
+	compat_down_write(rwsem);
+}
+static inline int down_write_trylock(struct compat_rw_semaphore *rwsem)
+{
+	return compat_down_write_trylock(rwsem);
+}
+static inline void up_read(struct compat_rw_semaphore *rwsem)
+{
+	compat_up_read(rwsem);
+}
+static inline void up_write(struct compat_rw_semaphore *rwsem)
+{
+	compat_up_write(rwsem);
+}
+static inline void downgrade_write(struct compat_rw_semaphore *rwsem)
+{
+	compat_downgrade_write(rwsem);
+}
+static inline int rwsem_is_locked(struct compat_rw_semaphore *sem)
+{
+	return compat_rwsem_is_locked(sem);
+}
+# define down_read_nested(sem, subclass) \
+		compat_down_read_nested(sem, subclass)
+# define down_write_nested(sem, subclass) \
+		compat_down_write_nested(sem, subclass)
+# define down_read_non_owner(sem) \
+		compat_down_read_non_owner(sem)
+# define up_read_non_owner(sem) \
+		compat_up_read_non_owner(sem)
+#endif /* !CONFIG_PREEMPT_RT */
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_RWSEM_H */
Index: linux-2.6.25.4-rt4/include/linux/semaphore.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/linux/semaphore.h	2008-05-29 09:46:26.000000000 -0400
@@ -0,0 +1,49 @@
+#ifndef _LINUX_SEMAPHORE_H
+#define _LINUX_SEMAPHORE_H
+
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#else
+
+#define DECLARE_MUTEX COMPAT_DECLARE_MUTEX
+
+static inline void sema_init(struct compat_semaphore *sem, int val)
+{
+	compat_sema_init(sem, val);
+}
+static inline void init_MUTEX(struct compat_semaphore *sem)
+{
+	compat_init_MUTEX(sem);
+}
+static inline void init_MUTEX_LOCKED(struct compat_semaphore *sem)
+{
+	compat_init_MUTEX_LOCKED(sem);
+}
+static inline void down(struct compat_semaphore *sem)
+{
+	compat_down(sem);
+}
+static inline int down_interruptible(struct compat_semaphore *sem)
+{
+	return compat_down_interruptible(sem);
+}
+static inline int down_trylock(struct compat_semaphore *sem)
+{
+	return compat_down_trylock(sem);
+}
+static inline void up(struct compat_semaphore *sem)
+{
+	compat_up(sem);
+}
+static inline int sem_is_locked(struct compat_semaphore *sem)
+{
+	return compat_sem_is_locked(sem);
+}
+static inline int sema_count(struct compat_semaphore *sem)
+{
+	return compat_sema_count(sem);
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
+#endif /* _LINUX_SEMAPHORE_H */
Index: linux-2.6.25.4-rt4/include/linux/seqlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/seqlock.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/seqlock.h	2008-05-29 09:47:09.000000000 -0400
@@ -32,46 +32,83 @@
 typedef struct {
 	unsigned sequence;
 	spinlock_t lock;
-} seqlock_t;
+} __seqlock_t;
+
+typedef struct {
+	unsigned sequence;
+	raw_spinlock_t lock;
+} __raw_seqlock_t;
+
+#define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock)
+
+#ifdef CONFIG_PREEMPT_RT
+typedef __seqlock_t seqlock_t;
+#else
+typedef __raw_seqlock_t seqlock_t;
+#endif
+
+typedef __raw_seqlock_t raw_seqlock_t;
 
 /*
  * These macros triggered gcc-3.x compile-time problems.  We think these are
  * OK now.  Be cautious.
  */
-#define __SEQLOCK_UNLOCKED(lockname) \
-		 { 0, __SPIN_LOCK_UNLOCKED(lockname) }
+#define __RAW_SEQLOCK_UNLOCKED(lockname) \
+		{ 0, RAW_SPIN_LOCK_UNLOCKED(lockname) }
+
+#ifdef CONFIG_PREEMPT_RT
+# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) }
+#else
+# define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname)
+#endif
 
 #define SEQLOCK_UNLOCKED \
 		 __SEQLOCK_UNLOCKED(old_style_seqlock_init)
 
-#define seqlock_init(x)					\
-	do {						\
-		(x)->sequence = 0;			\
-		spin_lock_init(&(x)->lock);		\
-	} while (0)
+#define raw_seqlock_init(x) \
+	do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
+
+#define seqlock_init(x) \
+		do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0)
 
 #define DEFINE_SEQLOCK(x) \
 		seqlock_t x = __SEQLOCK_UNLOCKED(x)
 
+#define DEFINE_RAW_SEQLOCK(name) \
+	raw_seqlock_t name __cacheline_aligned_in_smp = \
+					__RAW_SEQLOCK_UNLOCKED(name)
+
+
 /* Lock out other writers and update the count.
  * Acts like a normal spin_lock/unlock.
  * Don't need preempt_disable() because that is in the spin_lock already.
  */
-static inline void write_seqlock(seqlock_t *sl)
+static inline void __write_seqlock(seqlock_t *sl)
 {
 	spin_lock(&sl->lock);
 	++sl->sequence;
 	smp_wmb();
 }
 
-static inline void write_sequnlock(seqlock_t *sl)
+static __always_inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+	__write_seqlock(sl);
+	return flags;
+}
+
+static inline void __write_sequnlock(seqlock_t *sl)
 {
 	smp_wmb();
 	sl->sequence++;
 	spin_unlock(&sl->lock);
 }
 
-static inline int write_tryseqlock(seqlock_t *sl)
+#define __write_sequnlock_irqrestore(sl, flags)	__write_sequnlock(sl)
+
+static inline int __write_tryseqlock(seqlock_t *sl)
 {
 	int ret = spin_trylock(&sl->lock);
 
@@ -83,7 +120,7 @@ static inline int write_tryseqlock(seqlo
 }
 
 /* Start of read calculation -- fetch last complete writer token */
-static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
+static __always_inline unsigned __read_seqbegin(const seqlock_t *sl)
 {
 	unsigned ret = sl->sequence;
 	smp_rmb();
@@ -98,12 +135,195 @@ static __always_inline unsigned read_seq
  *    
  * Using xor saves one conditional branch.
  */
-static __always_inline int read_seqretry(const seqlock_t *sl, unsigned iv)
+static inline int __read_seqretry(seqlock_t *sl, unsigned iv)
+{
+	int ret;
+
+	smp_rmb();
+	ret = (iv & 1) | (sl->sequence ^ iv);
+	/*
+	 * If invalid then serialize with the writer, to make sure we
+	 * are not livelocking it:
+	 */
+	if (unlikely(ret)) {
+		unsigned long flags;
+		spin_lock_irqsave(&sl->lock, flags);
+		spin_unlock_irqrestore(&sl->lock, flags);
+	}
+	return ret;
+}
+
+static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl)
+{
+	spin_lock(&sl->lock);
+	++sl->sequence;
+	smp_wmb();
+}
+
+static __always_inline unsigned long
+__write_seqlock_irqsave_raw(raw_seqlock_t *sl)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__write_seqlock_raw(sl);
+	return flags;
+}
+
+static __always_inline void __write_seqlock_irq_raw(raw_seqlock_t *sl)
+{
+	local_irq_disable();
+	__write_seqlock_raw(sl);
+}
+
+static __always_inline void __write_seqlock_bh_raw(raw_seqlock_t *sl)
+{
+	local_bh_disable();
+	__write_seqlock_raw(sl);
+}
+
+static __always_inline void __write_sequnlock_raw(raw_seqlock_t *sl)
+{
+	smp_wmb();
+	sl->sequence++;
+	spin_unlock(&sl->lock);
+}
+
+static __always_inline void
+__write_sequnlock_irqrestore_raw(raw_seqlock_t *sl, unsigned long flags)
+{
+	__write_sequnlock_raw(sl);
+	local_irq_restore(flags);
+	preempt_check_resched();
+}
+
+static __always_inline void __write_sequnlock_irq_raw(raw_seqlock_t *sl)
+{
+	__write_sequnlock_raw(sl);
+	local_irq_enable();
+	preempt_check_resched();
+}
+
+static __always_inline void __write_sequnlock_bh_raw(raw_seqlock_t *sl)
+{
+	__write_sequnlock_raw(sl);
+	local_bh_enable();
+}
+
+static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
+{
+	int ret = spin_trylock(&sl->lock);
+
+	if (ret) {
+		++sl->sequence;
+		smp_wmb();
+	}
+	return ret;
+}
+
+static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl)
+{
+	unsigned ret = sl->sequence;
+	smp_rmb();
+	return ret;
+}
+
+static __always_inline int __read_seqretry_raw(const raw_seqlock_t *sl, unsigned iv)
 {
 	smp_rmb();
 	return (iv & 1) | (sl->sequence ^ iv);
 }
 
+extern int __bad_seqlock_type(void);
+
+/*
+ * PICK_SEQ_OP() is a small redirector to allow less typing of the lock
+ * types raw_seqlock_t, seqlock_t, at the front of the PICK_FUNCTION
+ * macro.
+ */
+#define PICK_SEQ_OP(...) 	\
+	PICK_FUNCTION(raw_seqlock_t *, seqlock_t *, ##__VA_ARGS__)
+#define PICK_SEQ_OP_RET(...) \
+	PICK_FUNCTION_RET(raw_seqlock_t *, seqlock_t *, ##__VA_ARGS__)
+
+#define write_seqlock(sl) PICK_SEQ_OP(__write_seqlock_raw, __write_seqlock, sl)
+
+#define write_sequnlock(sl)	\
+	PICK_SEQ_OP(__write_sequnlock_raw, __write_sequnlock, sl)
+
+#define write_tryseqlock(sl)	\
+	PICK_SEQ_OP_RET(__write_tryseqlock_raw, __write_tryseqlock, sl)
+
+#define read_seqbegin(sl) 	\
+	PICK_SEQ_OP_RET(__read_seqbegin_raw, __read_seqbegin, sl)
+
+#define read_seqretry(sl, iv)	\
+	PICK_SEQ_OP_RET(__read_seqretry_raw, __read_seqretry, sl, iv)
+
+#define write_seqlock_irqsave(lock, flags)			\
+do {								\
+	flags = PICK_SEQ_OP_RET(__write_seqlock_irqsave_raw,	\
+		__write_seqlock_irqsave, lock);			\
+} while (0)
+
+#define write_seqlock_irq(lock)	\
+	PICK_SEQ_OP(__write_seqlock_irq_raw, __write_seqlock, lock)
+
+#define write_seqlock_bh(lock)	\
+	PICK_SEQ_OP(__write_seqlock_bh_raw, __write_seqlock, lock)
+
+#define write_sequnlock_irqrestore(lock, flags)		\
+	PICK_SEQ_OP(__write_sequnlock_irqrestore_raw,	\
+		__write_sequnlock_irqrestore, lock, flags)
+
+#define write_sequnlock_bh(lock)	\
+	PICK_SEQ_OP(__write_sequnlock_bh_raw, __write_sequnlock, lock)
+
+#define write_sequnlock_irq(lock)	\
+	PICK_SEQ_OP(__write_sequnlock_irq_raw, __write_sequnlock, lock)
+
+static __always_inline
+unsigned long __seq_irqsave_raw(raw_seqlock_t *sl)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	return flags;
+}
+
+static __always_inline unsigned long __seq_irqsave(seqlock_t *sl)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+	return flags;
+}
+
+#define read_seqbegin_irqsave(lock, flags)				\
+({									\
+	flags = PICK_SEQ_OP_RET(__seq_irqsave_raw, __seq_irqsave, lock);\
+	read_seqbegin(lock);						\
+})
+
+static __always_inline int
+__read_seqretry_irqrestore(seqlock_t *sl, unsigned iv, unsigned long flags)
+{
+	return __read_seqretry(sl, iv);
+}
+
+static __always_inline int
+__read_seqretry_irqrestore_raw(raw_seqlock_t *sl, unsigned iv,
+			       unsigned long flags)
+{
+	int ret = read_seqretry(sl, iv);
+	local_irq_restore(flags);
+	preempt_check_resched();
+	return ret;
+}
+
+#define read_seqretry_irqrestore(lock, iv, flags)			\
+	PICK_SEQ_OP_RET(__read_seqretry_irqrestore_raw, 		\
+		__read_seqretry_irqrestore, lock, iv, flags)
 
 /*
  * Version using sequence counter only.
@@ -154,32 +374,4 @@ static inline void write_seqcount_end(se
 	smp_wmb();
 	s->sequence++;
 }
-
-/*
- * Possible sw/hw IRQ protected versions of the interfaces.
- */
-#define write_seqlock_irqsave(lock, flags)				\
-	do { local_irq_save(flags); write_seqlock(lock); } while (0)
-#define write_seqlock_irq(lock)						\
-	do { local_irq_disable();   write_seqlock(lock); } while (0)
-#define write_seqlock_bh(lock)						\
-        do { local_bh_disable();    write_seqlock(lock); } while (0)
-
-#define write_sequnlock_irqrestore(lock, flags)				\
-	do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
-#define write_sequnlock_irq(lock)					\
-	do { write_sequnlock(lock); local_irq_enable(); } while(0)
-#define write_sequnlock_bh(lock)					\
-	do { write_sequnlock(lock); local_bh_enable(); } while(0)
-
-#define read_seqbegin_irqsave(lock, flags)				\
-	({ local_irq_save(flags);   read_seqbegin(lock); })
-
-#define read_seqretry_irqrestore(lock, iv, flags)			\
-	({								\
-		int ret = read_seqretry(lock, iv);			\
-		local_irq_restore(flags);				\
-		ret;							\
-	})
-
 #endif /* __LINUX_SEQLOCK_H */
Index: linux-2.6.25.4-rt4/include/linux/spinlock_api_smp.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/spinlock_api_smp.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/spinlock_api_smp.h	2008-05-29 09:46:26.000000000 -0400
@@ -19,43 +19,58 @@ int in_lock_functions(unsigned long addr
 
 #define assert_spin_locked(x)	BUG_ON(!spin_is_locked(x))
 
-void __lockfunc _spin_lock(spinlock_t *lock)		__acquires(lock);
-void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
-							__acquires(lock);
-void __lockfunc _read_lock(rwlock_t *lock)		__acquires(lock);
-void __lockfunc _write_lock(rwlock_t *lock)		__acquires(lock);
-void __lockfunc _spin_lock_bh(spinlock_t *lock)		__acquires(lock);
-void __lockfunc _read_lock_bh(rwlock_t *lock)		__acquires(lock);
-void __lockfunc _write_lock_bh(rwlock_t *lock)		__acquires(lock);
-void __lockfunc _spin_lock_irq(spinlock_t *lock)	__acquires(lock);
-void __lockfunc _read_lock_irq(rwlock_t *lock)		__acquires(lock);
-void __lockfunc _write_lock_irq(rwlock_t *lock)		__acquires(lock);
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
-							__acquires(lock);
-unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
-							__acquires(lock);
-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
-							__acquires(lock);
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
-							__acquires(lock);
-int __lockfunc _spin_trylock(spinlock_t *lock);
-int __lockfunc _read_trylock(rwlock_t *lock);
-int __lockfunc _write_trylock(rwlock_t *lock);
-int __lockfunc _spin_trylock_bh(spinlock_t *lock);
-void __lockfunc _spin_unlock(spinlock_t *lock)		__releases(lock);
-void __lockfunc _read_unlock(rwlock_t *lock)		__releases(lock);
-void __lockfunc _write_unlock(rwlock_t *lock)		__releases(lock);
-void __lockfunc _spin_unlock_bh(spinlock_t *lock)	__releases(lock);
-void __lockfunc _read_unlock_bh(rwlock_t *lock)		__releases(lock);
-void __lockfunc _write_unlock_bh(rwlock_t *lock)	__releases(lock);
-void __lockfunc _spin_unlock_irq(spinlock_t *lock)	__releases(lock);
-void __lockfunc _read_unlock_irq(rwlock_t *lock)	__releases(lock);
-void __lockfunc _write_unlock_irq(rwlock_t *lock)	__releases(lock);
-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
-							__releases(lock);
-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
-							__releases(lock);
-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
-							__releases(lock);
+#define ACQUIRE_SPIN		__acquires(lock)
+#define ACQUIRE_RW		__acquires(lock)
+#define RELEASE_SPIN		__releases(lock)
+#define RELEASE_RW		__releases(lock)
+
+void __lockfunc __spin_lock(raw_spinlock_t *lock)		ACQUIRE_SPIN;
+void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass)
+								ACQUIRE_SPIN;
+void __lockfunc __read_lock(raw_rwlock_t *lock)			ACQUIRE_RW;
+void __lockfunc __write_lock(raw_rwlock_t *lock)		ACQUIRE_RW;
+void __lockfunc __spin_lock_bh(raw_spinlock_t *lock)		ACQUIRE_SPIN;
+void __lockfunc __read_lock_bh(raw_rwlock_t *lock)		ACQUIRE_RW;
+void __lockfunc __write_lock_bh(raw_rwlock_t *lock)		ACQUIRE_RW;
+void __lockfunc __spin_lock_irq(raw_spinlock_t *lock)		ACQUIRE_SPIN;
+void __lockfunc __read_lock_irq(raw_rwlock_t *lock)		ACQUIRE_RW;
+void __lockfunc __write_lock_irq(raw_rwlock_t *lock)		ACQUIRE_RW;
+unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock)
+								ACQUIRE_SPIN;
+unsigned long __lockfunc
+__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass)	ACQUIRE_SPIN;
+unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock)
+								ACQUIRE_RW;
+unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock)
+								ACQUIRE_RW;
+int __lockfunc __spin_trylock(raw_spinlock_t *lock);
+int __lockfunc
+__spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags);
+int __lockfunc __read_trylock(raw_rwlock_t *lock);
+int __lockfunc __write_trylock(raw_rwlock_t *lock);
+int __lockfunc
+__write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags);
+int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock);
+int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock);
+void __lockfunc __spin_unlock(raw_spinlock_t *lock)		RELEASE_SPIN;
+void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock)
+								RELEASE_SPIN;
+void __lockfunc __read_unlock(raw_rwlock_t *lock)		RELEASE_RW;
+void __lockfunc __write_unlock(raw_rwlock_t *lock)		RELEASE_RW;
+void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock)		RELEASE_SPIN;
+void __lockfunc __read_unlock_bh(raw_rwlock_t *lock)		RELEASE_RW;
+void __lockfunc __write_unlock_bh(raw_rwlock_t *lock)		RELEASE_RW;
+void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock)		RELEASE_SPIN;
+void __lockfunc __read_unlock_irq(raw_rwlock_t *lock)		RELEASE_RW;
+void __lockfunc __write_unlock_irq(raw_rwlock_t *lock)		RELEASE_RW;
+void __lockfunc
+__spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+								RELEASE_SPIN;
+void __lockfunc
+__read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
+								RELEASE_RW;
+void
+__lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
+								RELEASE_RW;
 
 #endif /* __LINUX_SPINLOCK_API_SMP_H */
Index: linux-2.6.25.4-rt4/include/linux/spinlock_api_up.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/spinlock_api_up.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/spinlock_api_up.h	2008-05-29 09:46:26.000000000 -0400
@@ -33,12 +33,22 @@
 #define __LOCK_IRQ(lock) \
   do { local_irq_disable(); __LOCK(lock); } while (0)
 
-#define __LOCK_IRQSAVE(lock, flags) \
-  do { local_irq_save(flags); __LOCK(lock); } while (0)
+#define __LOCK_IRQSAVE(lock) \
+  ({ unsigned long __flags; local_irq_save(__flags); __LOCK(lock); __flags; })
+
+#define __TRYLOCK_IRQSAVE(lock, flags) \
+	({ local_irq_save(*(flags)); __LOCK(lock); 1; })
+
+#define __spin_trylock_irqsave(lock, flags)	__TRYLOCK_IRQSAVE(lock, flags)
+
+#define __write_trylock_irqsave(lock, flags)	__TRYLOCK_IRQSAVE(lock, flags)
 
 #define __UNLOCK(lock) \
   do { preempt_enable(); __release(lock); (void)(lock); } while (0)
 
+#define __UNLOCK_NO_RESCHED(lock) \
+  do { __preempt_enable_no_resched(); __release(lock); (void)(lock); } while (0)
+
 #define __UNLOCK_BH(lock) \
   do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0)
 
@@ -48,34 +58,36 @@
 #define __UNLOCK_IRQRESTORE(lock, flags) \
   do { local_irq_restore(flags); __UNLOCK(lock); } while (0)
 
-#define _spin_lock(lock)			__LOCK(lock)
-#define _spin_lock_nested(lock, subclass)	__LOCK(lock)
-#define _read_lock(lock)			__LOCK(lock)
-#define _write_lock(lock)			__LOCK(lock)
-#define _spin_lock_bh(lock)			__LOCK_BH(lock)
-#define _read_lock_bh(lock)			__LOCK_BH(lock)
-#define _write_lock_bh(lock)			__LOCK_BH(lock)
-#define _spin_lock_irq(lock)			__LOCK_IRQ(lock)
-#define _read_lock_irq(lock)			__LOCK_IRQ(lock)
-#define _write_lock_irq(lock)			__LOCK_IRQ(lock)
-#define _spin_lock_irqsave(lock, flags)		__LOCK_IRQSAVE(lock, flags)
-#define _read_lock_irqsave(lock, flags)		__LOCK_IRQSAVE(lock, flags)
-#define _write_lock_irqsave(lock, flags)	__LOCK_IRQSAVE(lock, flags)
-#define _spin_trylock(lock)			({ __LOCK(lock); 1; })
-#define _read_trylock(lock)			({ __LOCK(lock); 1; })
-#define _write_trylock(lock)			({ __LOCK(lock); 1; })
-#define _spin_trylock_bh(lock)			({ __LOCK_BH(lock); 1; })
-#define _spin_unlock(lock)			__UNLOCK(lock)
-#define _read_unlock(lock)			__UNLOCK(lock)
-#define _write_unlock(lock)			__UNLOCK(lock)
-#define _spin_unlock_bh(lock)			__UNLOCK_BH(lock)
-#define _write_unlock_bh(lock)			__UNLOCK_BH(lock)
-#define _read_unlock_bh(lock)			__UNLOCK_BH(lock)
-#define _spin_unlock_irq(lock)			__UNLOCK_IRQ(lock)
-#define _read_unlock_irq(lock)			__UNLOCK_IRQ(lock)
-#define _write_unlock_irq(lock)			__UNLOCK_IRQ(lock)
-#define _spin_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
-#define _read_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
-#define _write_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
+#define __spin_lock(lock)			__LOCK(lock)
+#define __spin_lock_nested(lock, subclass)	__LOCK(lock)
+#define __read_lock(lock)			__LOCK(lock)
+#define __write_lock(lock)			__LOCK(lock)
+#define __spin_lock_bh(lock)			__LOCK_BH(lock)
+#define __read_lock_bh(lock)			__LOCK_BH(lock)
+#define __write_lock_bh(lock)			__LOCK_BH(lock)
+#define __spin_lock_irq(lock)			__LOCK_IRQ(lock)
+#define __read_lock_irq(lock)			__LOCK_IRQ(lock)
+#define __write_lock_irq(lock)			__LOCK_IRQ(lock)
+#define __spin_lock_irqsave(lock)		__LOCK_IRQSAVE(lock)
+#define __read_lock_irqsave(lock)		__LOCK_IRQSAVE(lock)
+#define __write_lock_irqsave(lock)		__LOCK_IRQSAVE(lock)
+#define __spin_trylock(lock)			({ __LOCK(lock); 1; })
+#define __read_trylock(lock)			({ __LOCK(lock); 1; })
+#define __write_trylock(lock)			({ __LOCK(lock); 1; })
+#define __spin_trylock_bh(lock)			({ __LOCK_BH(lock); 1; })
+#define __spin_trylock_irq(lock)		({ __LOCK_IRQ(lock); 1; })
+#define __spin_unlock(lock)			__UNLOCK(lock)
+#define __spin_unlock_no_resched(lock)		__UNLOCK_NO_RESCHED(lock)
+#define __read_unlock(lock)			__UNLOCK(lock)
+#define __write_unlock(lock)			__UNLOCK(lock)
+#define __spin_unlock_bh(lock)			__UNLOCK_BH(lock)
+#define __write_unlock_bh(lock)			__UNLOCK_BH(lock)
+#define __read_unlock_bh(lock)			__UNLOCK_BH(lock)
+#define __spin_unlock_irq(lock)			__UNLOCK_IRQ(lock)
+#define __read_unlock_irq(lock)			__UNLOCK_IRQ(lock)
+#define __write_unlock_irq(lock)		__UNLOCK_IRQ(lock)
+#define __spin_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
+#define __read_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
+#define __write_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
 
 #endif /* __LINUX_SPINLOCK_API_UP_H */
Index: linux-2.6.25.4-rt4/include/linux/spinlock_types.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/spinlock_types.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/spinlock_types.h	2008-05-29 09:46:26.000000000 -0400
@@ -15,10 +15,27 @@
 # include <linux/spinlock_types_up.h>
 #endif
 
+/*
+ * Must define these before including other files, inline functions need them
+ */
+#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME
+
+#define LOCK_SECTION_START(extra)               \
+        ".subsection 1\n\t"                     \
+        extra                                   \
+        ".ifndef " LOCK_SECTION_NAME "\n\t"     \
+        LOCK_SECTION_NAME ":\n\t"               \
+        ".endif\n"
+
+#define LOCK_SECTION_END                        \
+        ".previous\n\t"
+
+#define __lockfunc  __attribute__((section(".spinlock.text")))
+
 #include <linux/lockdep.h>
 
 typedef struct {
-	raw_spinlock_t raw_lock;
+	__raw_spinlock_t raw_lock;
 #ifdef CONFIG_GENERIC_LOCKBREAK
 	unsigned int break_lock;
 #endif
@@ -29,12 +46,12 @@ typedef struct {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif
-} spinlock_t;
+} raw_spinlock_t;
 
 #define SPINLOCK_MAGIC		0xdead4ead
 
 typedef struct {
-	raw_rwlock_t raw_lock;
+	__raw_rwlock_t raw_lock;
 #ifdef CONFIG_GENERIC_LOCKBREAK
 	unsigned int break_lock;
 #endif
@@ -45,7 +62,7 @@ typedef struct {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif
-} rwlock_t;
+} raw_rwlock_t;
 
 #define RWLOCK_MAGIC		0xdeaf1eed
 
@@ -64,24 +81,24 @@ typedef struct {
 #endif
 
 #ifdef CONFIG_DEBUG_SPINLOCK
-# define __SPIN_LOCK_UNLOCKED(lockname)					\
-	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
+# define _RAW_SPIN_LOCK_UNLOCKED(lockname)				\
+			{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
 				.magic = SPINLOCK_MAGIC,		\
 				.owner = SPINLOCK_OWNER_INIT,		\
 				.owner_cpu = -1,			\
 				SPIN_DEP_MAP_INIT(lockname) }
-#define __RW_LOCK_UNLOCKED(lockname)					\
-	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
+#define _RAW_RW_LOCK_UNLOCKED(lockname)					\
+			{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
 				.magic = RWLOCK_MAGIC,			\
 				.owner = SPINLOCK_OWNER_INIT,		\
 				.owner_cpu = -1,			\
 				RW_DEP_MAP_INIT(lockname) }
 #else
-# define __SPIN_LOCK_UNLOCKED(lockname) \
-	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
+# define _RAW_SPIN_LOCK_UNLOCKED(lockname)				\
+			{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
 				SPIN_DEP_MAP_INIT(lockname) }
-#define __RW_LOCK_UNLOCKED(lockname) \
-	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
+# define _RAW_RW_LOCK_UNLOCKED(lockname)				\
+			{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
 				RW_DEP_MAP_INIT(lockname) }
 #endif
 
@@ -91,10 +108,22 @@ typedef struct {
  * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or
  * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate.
  */
-#define SPIN_LOCK_UNLOCKED	__SPIN_LOCK_UNLOCKED(old_style_spin_init)
-#define RW_LOCK_UNLOCKED	__RW_LOCK_UNLOCKED(old_style_rw_init)
 
-#define DEFINE_SPINLOCK(x)	spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
-#define DEFINE_RWLOCK(x)	rwlock_t x = __RW_LOCK_UNLOCKED(x)
+# define RAW_SPIN_LOCK_UNLOCKED(lockname) \
+	(raw_spinlock_t) _RAW_SPIN_LOCK_UNLOCKED(lockname)
+
+# define RAW_RW_LOCK_UNLOCKED(lockname)	\
+	(raw_rwlock_t) _RAW_RW_LOCK_UNLOCKED(lockname)
+
+#define DEFINE_RAW_SPINLOCK(name) \
+	raw_spinlock_t name __cacheline_aligned_in_smp = \
+		RAW_SPIN_LOCK_UNLOCKED(name)
+
+#define __DEFINE_RAW_SPINLOCK(name) \
+	raw_spinlock_t name = RAW_SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_RAW_RWLOCK(name) \
+	raw_rwlock_t name __cacheline_aligned_in_smp = \
+		RAW_RW_LOCK_UNLOCKED(name)
 
 #endif /* __LINUX_SPINLOCK_TYPES_H */
Index: linux-2.6.25.4-rt4/include/linux/spinlock_types_up.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/spinlock_types_up.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/spinlock_types_up.h	2008-05-29 09:46:26.000000000 -0400
@@ -16,13 +16,13 @@
 
 typedef struct {
 	volatile unsigned int slock;
-} raw_spinlock_t;
+} __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED { 1 }
 
 #else
 
-typedef struct { } raw_spinlock_t;
+typedef struct { } __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED { }
 
@@ -30,7 +30,7 @@ typedef struct { } raw_spinlock_t;
 
 typedef struct {
 	/* no debug version on UP */
-} raw_rwlock_t;
+} __raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED { }
 
Index: linux-2.6.25.4-rt4/include/linux/spinlock_up.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/spinlock_up.h	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/spinlock_up.h	2008-05-29 09:46:26.000000000 -0400
@@ -20,19 +20,19 @@
 #ifdef CONFIG_DEBUG_SPINLOCK
 #define __raw_spin_is_locked(x)		((x)->slock == 0)
 
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
+static inline void __raw_spin_lock(__raw_spinlock_t *lock)
 {
 	lock->slock = 0;
 }
 
 static inline void
-__raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+__raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags)
 {
 	local_irq_save(flags);
 	lock->slock = 0;
 }
 
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static inline int __raw_spin_trylock(__raw_spinlock_t *lock)
 {
 	char oldval = lock->slock;
 
@@ -41,7 +41,7 @@ static inline int __raw_spin_trylock(raw
 	return oldval > 0;
 }
 
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static inline void __raw_spin_unlock(__raw_spinlock_t *lock)
 {
 	lock->slock = 1;
 }
Index: linux-2.6.25.4-rt4/kernel/futex.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/futex.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/futex.c	2008-05-29 09:47:09.000000000 -0400
@@ -129,12 +129,14 @@ static struct futex_hash_bucket futex_qu
 /* Futex-fs vfsmount entry: */
 static struct vfsmount *futex_mnt;
 
+int futex_performance_hack;
+
 /*
  * Take mm->mmap_sem, when futex is shared
  */
 static inline void futex_lock_mm(struct rw_semaphore *fshared)
 {
-	if (fshared)
+	if (fshared && !futex_performance_hack)
 		down_read(fshared);
 }
 
@@ -143,7 +145,7 @@ static inline void futex_lock_mm(struct 
  */
 static inline void futex_unlock_mm(struct rw_semaphore *fshared)
 {
-	if (fshared)
+	if (fshared && !futex_performance_hack)
 		up_read(fshared);
 }
 
@@ -961,8 +963,12 @@ static int futex_requeue(u32 __user *uad
 				plist_add(&this->list, &hb2->chain);
 				this->lock_ptr = &hb2->lock;
 #ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_PREEMPT_RT
+				this->list.plist.lock = NULL;
+#else
 				this->list.plist.lock = &hb2->lock;
 #endif
+#endif
 			}
 			this->key = key2;
 			get_futex_key_refs(&key2);
@@ -1022,8 +1028,12 @@ static inline void __queue_me(struct fut
 
 	plist_node_init(&q->list, prio);
 #ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_PREEMPT_RT
+	q->list.plist.lock = NULL;
+#else
 	q->list.plist.lock = &hb->lock;
 #endif
+#endif
 	plist_add(&q->list, &hb->chain);
 	q->task = current;
 	spin_unlock(&hb->lock);
@@ -1263,6 +1273,10 @@ static int futex_wait(u32 __user *uaddr,
 	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
 	 */
 	if (likely(!plist_node_empty(&q.list))) {
+		unsigned long nosched_flag = current->flags & PF_NOSCHED;
+
+		current->flags &= ~PF_NOSCHED;
+
 		if (!abs_time)
 			schedule();
 		else {
@@ -1287,6 +1301,8 @@ static int futex_wait(u32 __user *uaddr,
 			/* Flag if a timeout occured */
 			rem = (t.task == NULL);
 		}
+
+		current->flags |= nosched_flag;
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -2178,7 +2194,11 @@ static int __init futex_init(void)
 		futex_cmpxchg_enabled = 1;
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+#ifdef CONFIG_PREEMPT_RT
+		plist_head_init(&futex_queues[i].chain, NULL);
+#else
 		plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+#endif
 		spin_lock_init(&futex_queues[i].lock);
 	}
 
Index: linux-2.6.25.4-rt4/kernel/rt.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/kernel/rt.c	2008-05-29 09:47:27.000000000 -0400
@@ -0,0 +1,479 @@
+/*
+ * kernel/rt.c
+ *
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * historic credit for proving that Linux spinlocks can be implemented via
+ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
+ * and others) who prototyped it on 2.4 and did lots of comparative
+ * research and analysis; TimeSys, for proving that you can implement a
+ * fully preemptible kernel via the use of IRQ threading and mutexes;
+ * Bill Huey for persuasively arguing on lkml that the mutex model is the
+ * right one; and to MontaVista, who ported pmutexes to 2.6.
+ *
+ * This code is a from-scratch implementation and is not based on pmutexes,
+ * but the idea of converting spinlocks to mutexes is used here too.
+ *
+ * lock debugging, locking tree, deadlock detection:
+ *
+ *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ *  Released under the General Public License (GPL).
+ *
+ * Includes portions of the generic R/W semaphore implementation from:
+ *
+ *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
+ *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ *  - Derived also from comments by Linus
+ *
+ * Pending ownership of locks and ownership stealing:
+ *
+ *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
+ *
+ *   (also by Steven Rostedt)
+ *    - Converted single pi_lock to individual task locks.
+ *
+ * By Esben Nielsen:
+ *    Doing priority inheritance with help of the scheduler.
+ *
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *  - major rework based on Esben Nielsens initial patch
+ *  - replaced thread_info references by task_struct refs
+ *  - removed task->pending_owner dependency
+ *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
+ *    in the scheduler return path as discussed with Steven Rostedt
+ *
+ *  Copyright (C) 2006, Kihon Technologies Inc.
+ *    Steven Rostedt <rostedt@goodmis.org>
+ *  - debugged and patched Thomas Gleixner's rework.
+ *  - added back the cmpxchg to the rework.
+ *  - turned atomic require back on for SMP.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/rt_lock.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+#include <linux/futex.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Unlock these on crash:
+ */
+void zap_rt_locks(void)
+{
+	//trace_lock_init();
+}
+#endif
+
+/*
+ * struct mutex functions
+ */
+void _mutex_init(struct mutex *lock, char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/*
+	 * Make sure we are not reinitializing a held lock:
+	 */
+	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+	lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+	__rt_mutex_init(&lock->lock, name);
+}
+EXPORT_SYMBOL(_mutex_init);
+
+void __lockfunc _mutex_lock(struct mutex *lock)
+{
+	mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+	LOCK_CONTENDED_RT(lock, rt_mutex_trylock, rt_mutex_lock);
+}
+EXPORT_SYMBOL(_mutex_lock);
+
+static int __lockfunc __rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+	return rt_mutex_lock_interruptible(lock, 0);
+}
+
+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
+{
+	int ret;
+
+	mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+	ret = LOCK_CONTENDED_RT_RET(lock, rt_mutex_trylock,
+			__rt_mutex_lock_interruptible);
+	if (ret)
+		mutex_release(&lock->dep_map, 1, _RET_IP_);
+	return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible);
+
+int __lockfunc _mutex_lock_killable(struct mutex *lock)
+{
+	int ret;
+
+	mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+	ret = rt_mutex_lock_killable(&lock->lock, 0);
+	if (ret)
+		mutex_release(&lock->dep_map, 1, _RET_IP_);
+	return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
+{
+	mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED_RT(lock, rt_mutex_trylock, rt_mutex_lock);
+}
+EXPORT_SYMBOL(_mutex_lock_nested);
+
+int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
+{
+	int ret;
+
+	mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	ret = LOCK_CONTENDED_RT_RET(lock, rt_mutex_trylock,
+			__rt_mutex_lock_interruptible);
+	if (ret)
+		mutex_release(&lock->dep_map, 1, _RET_IP_);
+	return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
+
+int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
+{
+	int ret;
+
+	mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	ret = rt_mutex_lock_killable(&lock->lock, 0);
+	if (ret)
+		mutex_release(&lock->dep_map, 1, _RET_IP_);
+	return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable_nested);
+#endif
+
+int __lockfunc _mutex_trylock(struct mutex *lock)
+{
+	int ret = rt_mutex_trylock(&lock->lock);
+
+	if (ret)
+		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+	return ret;
+}
+EXPORT_SYMBOL(_mutex_trylock);
+
+void __lockfunc _mutex_unlock(struct mutex *lock)
+{
+	mutex_release(&lock->dep_map, 1, _RET_IP_);
+	rt_mutex_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_unlock);
+
+/*
+ * rwlock_t functions
+ */
+int __lockfunc rt_write_trylock(rwlock_t *rwlock)
+{
+	int ret = rt_mutex_down_write_trylock(&rwlock->owners);
+
+	if (ret)
+		rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+
+	return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
+{
+	*flags = 0;
+	return rt_write_trylock(rwlock);
+}
+EXPORT_SYMBOL(rt_write_trylock_irqsave);
+
+int __lockfunc rt_read_trylock(rwlock_t *rwlock)
+{
+	int ret;
+
+	ret = rt_mutex_down_read_trylock(&rwlock->owners);
+	if (ret)
+		rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+
+	return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+void __lockfunc rt_write_lock(rwlock_t *rwlock)
+{
+	rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+	LOCK_CONTENDED_RT_RW(rwlock, rt_mutex_down_write_trylock, rt_rwlock_write_lock);
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+void __lockfunc rt_read_lock(rwlock_t *rwlock)
+{
+	rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+	LOCK_CONTENDED_RT_RW(rwlock, rt_mutex_down_read_trylock, rt_rwlock_read_lock);
+}
+
+EXPORT_SYMBOL(rt_read_lock);
+
+void __lockfunc rt_write_unlock(rwlock_t *rwlock)
+{
+	/* NOTE: we always pass in '1' for nested, for simplicity */
+	rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+	rt_rwlock_write_unlock(&rwlock->owners);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+void __lockfunc rt_read_unlock(rwlock_t *rwlock)
+{
+	rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+	rt_rwlock_read_unlock(&rwlock->owners);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
+{
+	rt_write_lock(rwlock);
+
+	return 0;
+}
+EXPORT_SYMBOL(rt_write_lock_irqsave);
+
+unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
+{
+	rt_read_lock(rwlock);
+
+	return 0;
+}
+EXPORT_SYMBOL(rt_read_lock_irqsave);
+
+void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/*
+	 * Make sure we are not reinitializing a held lock:
+	 */
+	debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+	lockdep_init_map(&rwlock->dep_map, name, key, 0);
+#endif
+	rt_mutex_rwsem_init(&rwlock->owners, name);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+
+/*
+ * rw_semaphores
+ */
+
+void  rt_up_write(struct rw_semaphore *rwsem)
+{
+	rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+	rt_mutex_up_write(&rwsem->owners);
+}
+EXPORT_SYMBOL(rt_up_write);
+
+void  rt_up_read(struct rw_semaphore *rwsem)
+{
+	rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+	rt_mutex_up_read(&rwsem->owners);
+}
+EXPORT_SYMBOL(rt_up_read);
+
+/*
+ * downgrade a write lock into a read lock
+ */
+void  rt_downgrade_write(struct rw_semaphore *rwsem)
+{
+	rt_mutex_downgrade_write(&rwsem->owners);
+}
+EXPORT_SYMBOL(rt_downgrade_write);
+
+int  rt_down_write_trylock(struct rw_semaphore *rwsem)
+{
+	int ret = rt_mutex_down_write_trylock(&rwsem->owners);
+
+	if (ret)
+		rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+	return ret;
+}
+EXPORT_SYMBOL(rt_down_write_trylock);
+
+static void __rt_down_write(struct rw_semaphore *rwsem, int subclass)
+{
+	rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED_RT_RW(rwsem, rt_mutex_down_write_trylock, rt_mutex_down_write);
+}
+
+void  rt_down_write(struct rw_semaphore *rwsem)
+{
+	__rt_down_write(rwsem, 0);
+}
+EXPORT_SYMBOL(rt_down_write);
+
+void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
+{
+	__rt_down_write(rwsem, subclass);
+}
+EXPORT_SYMBOL(rt_down_write_nested);
+
+int  rt_down_read_trylock(struct rw_semaphore *rwsem)
+{
+	int ret;
+
+	ret = rt_mutex_down_read_trylock(&rwsem->owners);
+	if (ret)
+		rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+	return ret;
+}
+EXPORT_SYMBOL(rt_down_read_trylock);
+
+static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
+{
+	rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED_RT_RW(rwsem, rt_mutex_down_read_trylock, rt_mutex_down_read);
+}
+
+void  rt_down_read(struct rw_semaphore *rwsem)
+{
+	__rt_down_read(rwsem, 0);
+}
+EXPORT_SYMBOL(rt_down_read);
+
+void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
+{
+	__rt_down_read(rwsem, subclass);
+}
+EXPORT_SYMBOL(rt_down_read_nested);
+
+void  __rt_rwsem_init(struct rw_semaphore *rwsem, char *name,
+			      struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/*
+	 * Make sure we are not reinitializing a held lock:
+	 */
+	debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
+	lockdep_init_map(&rwsem->dep_map, name, key, 0);
+#endif
+	rt_mutex_rwsem_init(&rwsem->owners, name);
+}
+EXPORT_SYMBOL(__rt_rwsem_init);
+
+/*
+ * Semaphores
+ */
+/*
+ * Linux Semaphores implemented via RT-mutexes.
+ *
+ * In the down() variants we use the mutex as the semaphore blocking
+ * object: we always acquire it, decrease the counter and keep the lock
+ * locked if we did the 1->0 transition. The next down() will then block.
+ *
+ * In the up() path we atomically increase the counter and do the
+ * unlock if we were the one doing the 0->1 transition.
+ */
+
+static inline void __down_complete(struct semaphore *sem)
+{
+	int count = atomic_dec_return(&sem->count);
+
+	if (unlikely(count > 0))
+		rt_mutex_unlock(&sem->lock);
+}
+
+void  rt_down(struct semaphore *sem)
+{
+	rt_mutex_lock(&sem->lock);
+	__down_complete(sem);
+}
+EXPORT_SYMBOL(rt_down);
+
+int  rt_down_interruptible(struct semaphore *sem)
+{
+	int ret;
+
+	ret = rt_mutex_lock_interruptible(&sem->lock, 0);
+	if (ret)
+		return ret;
+	__down_complete(sem);
+	return 0;
+}
+EXPORT_SYMBOL(rt_down_interruptible);
+
+/*
+ * try to down the semaphore, 0 on success and 1 on failure. (inverted)
+ */
+int  rt_down_trylock(struct semaphore *sem)
+{
+	/*
+	 * Here we are a tiny bit different from ordinary Linux semaphores,
+	 * because we can get 'transient' locking-failures when say a
+	 * process decreases the count from 9 to 8 and locks/releases the
+	 * embedded mutex internally. It would be quite complex to remove
+	 * these transient failures so lets try it the simple way first:
+	 */
+	if (rt_mutex_trylock(&sem->lock)) {
+		__down_complete(sem);
+		return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(rt_down_trylock);
+
+void  rt_up(struct semaphore *sem)
+{
+	int count;
+
+	/*
+	 * Disable preemption to make sure a highprio trylock-er cannot
+	 * preempt us here and get into an infinite loop:
+	 */
+	preempt_disable();
+	count = atomic_inc_return(&sem->count);
+	/*
+	 * If we did the 0 -> 1 transition then we are the ones to unlock it:
+	 */
+	if (likely(count == 1))
+		rt_mutex_unlock(&sem->lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(rt_up);
+
+void  __sema_init(struct semaphore *sem, int val,
+			  char *name, char *file, int line)
+{
+	atomic_set(&sem->count, val);
+	switch (val) {
+	case 0:
+		__rt_mutex_init(&sem->lock, name);
+		rt_mutex_lock(&sem->lock);
+		break;
+	default:
+		__rt_mutex_init(&sem->lock, name);
+		break;
+	}
+}
+EXPORT_SYMBOL(__sema_init);
+
+void  __init_MUTEX(struct semaphore *sem, char *name, char *file,
+			   int line)
+{
+	__sema_init(sem, 1, name, file, line);
+}
+EXPORT_SYMBOL(__init_MUTEX);
+
Index: linux-2.6.25.4-rt4/kernel/rtmutex-debug.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rtmutex-debug.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rtmutex-debug.c	2008-05-29 09:46:26.000000000 -0400
@@ -16,6 +16,7 @@
  *
  * See rt.c in preempt-rt for proper credits and further information
  */
+#include <linux/rt_lock.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/module.h>
@@ -29,61 +30,6 @@
 
 #include "rtmutex_common.h"
 
-# define TRACE_WARN_ON(x)			WARN_ON(x)
-# define TRACE_BUG_ON(x)			BUG_ON(x)
-
-# define TRACE_OFF()						\
-do {								\
-	if (rt_trace_on) {					\
-		rt_trace_on = 0;				\
-		console_verbose();				\
-		if (spin_is_locked(&current->pi_lock))		\
-			spin_unlock(&current->pi_lock);		\
-	}							\
-} while (0)
-
-# define TRACE_OFF_NOLOCK()					\
-do {								\
-	if (rt_trace_on) {					\
-		rt_trace_on = 0;				\
-		console_verbose();				\
-	}							\
-} while (0)
-
-# define TRACE_BUG_LOCKED()			\
-do {						\
-	TRACE_OFF();				\
-	BUG();					\
-} while (0)
-
-# define TRACE_WARN_ON_LOCKED(c)		\
-do {						\
-	if (unlikely(c)) {			\
-		TRACE_OFF();			\
-		WARN_ON(1);			\
-	}					\
-} while (0)
-
-# define TRACE_BUG_ON_LOCKED(c)			\
-do {						\
-	if (unlikely(c))			\
-		TRACE_BUG_LOCKED();		\
-} while (0)
-
-#ifdef CONFIG_SMP
-# define SMP_TRACE_BUG_ON_LOCKED(c)	TRACE_BUG_ON_LOCKED(c)
-#else
-# define SMP_TRACE_BUG_ON_LOCKED(c)	do { } while (0)
-#endif
-
-/*
- * deadlock detection flag. We turn it off when we detect
- * the first problem because we dont want to recurse back
- * into the tracing code when doing error printk or
- * executing a BUG():
- */
-static int rt_trace_on = 1;
-
 static void printk_task(struct task_struct *p)
 {
 	if (p)
@@ -111,8 +57,8 @@ static void printk_lock(struct rt_mutex 
 
 void rt_mutex_debug_task_free(struct task_struct *task)
 {
-	WARN_ON(!plist_head_empty(&task->pi_waiters));
-	WARN_ON(task->pi_blocked_on);
+	DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
+	DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
 }
 
 /*
@@ -125,7 +71,7 @@ void debug_rt_mutex_deadlock(int detect,
 {
 	struct task_struct *task;
 
-	if (!rt_trace_on || detect || !act_waiter)
+	if (!debug_locks || detect || !act_waiter)
 		return;
 
 	task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +85,7 @@ void debug_rt_mutex_print_deadlock(struc
 {
 	struct task_struct *task;
 
-	if (!waiter->deadlock_lock || !rt_trace_on)
+	if (!waiter->deadlock_lock || !debug_locks)
 		return;
 
 	rcu_read_lock();
@@ -149,7 +95,8 @@ void debug_rt_mutex_print_deadlock(struc
 		return;
 	}
 
-	TRACE_OFF_NOLOCK();
+	if (!debug_locks_off())
+		return;
 
 	printk("\n============================================\n");
 	printk(  "[ BUG: circular locking deadlock detected! ]\n");
@@ -180,7 +127,6 @@ void debug_rt_mutex_print_deadlock(struc
 
 	printk("[ turning off deadlock detection."
 	       "Please report this trace. ]\n\n");
-	local_irq_disable();
 }
 
 void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +135,8 @@ void debug_rt_mutex_lock(struct rt_mutex
 
 void debug_rt_mutex_unlock(struct rt_mutex *lock)
 {
-	TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+	if (debug_locks)
+		DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
 }
 
 void
@@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mute
 
 void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
 {
-	TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
+	DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
 }
 
 void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,9 +160,9 @@ void debug_rt_mutex_init_waiter(struct r
 void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
 {
 	put_pid(waiter->deadlock_task_pid);
-	TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
-	TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
-	TRACE_WARN_ON(waiter->task);
+	DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
+	DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+	DEBUG_LOCKS_WARN_ON(waiter->task);
 	memset(waiter, 0x22, sizeof(*waiter));
 }
 
@@ -231,9 +178,36 @@ void debug_rt_mutex_init(struct rt_mutex
 void
 rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (task->lock_count >= MAX_LOCK_STACK) {
+		if (!debug_locks_off())
+			return;
+		printk("BUG: %s/%d: lock count overflow!\n",
+			task->comm, task->pid);
+		dump_stack();
+		return;
+	}
+#ifdef CONFIG_PREEMPT_RT
+	task->owned_lock[task->lock_count] = lock;
+#endif
+	task->lock_count++;
+#endif
 }
 
 void rt_mutex_deadlock_account_unlock(struct task_struct *task)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (!task->lock_count) {
+		if (!debug_locks_off())
+			return;
+		printk("BUG: %s/%d: lock count underflow!\n",
+			task->comm, task->pid);
+		dump_stack();
+		return;
+	}
+	task->lock_count--;
+#ifdef CONFIG_PREEMPT_RT
+	task->owned_lock[task->lock_count] = NULL;
+#endif
+#endif
 }
-
Index: linux-2.6.25.4-rt4/kernel/rwsem.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rwsem.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rwsem.c	2008-05-29 09:46:26.000000000 -0400
@@ -16,7 +16,7 @@
 /*
  * lock for reading
  */
-void __sched down_read(struct rw_semaphore *sem)
+void __sched compat_down_read(struct compat_rw_semaphore *sem)
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -24,12 +24,12 @@ void __sched down_read(struct rw_semapho
 	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
 }
 
-EXPORT_SYMBOL(down_read);
+EXPORT_SYMBOL(compat_down_read);
 
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-int down_read_trylock(struct rw_semaphore *sem)
+int compat_down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	int ret = __down_read_trylock(sem);
 
@@ -38,12 +38,12 @@ int down_read_trylock(struct rw_semaphor
 	return ret;
 }
 
-EXPORT_SYMBOL(down_read_trylock);
+EXPORT_SYMBOL(compat_down_read_trylock);
 
 /*
  * lock for writing
  */
-void __sched down_write(struct rw_semaphore *sem)
+void __sched compat_down_write(struct compat_rw_semaphore *sem)
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
@@ -51,12 +51,12 @@ void __sched down_write(struct rw_semaph
 	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
 }
 
-EXPORT_SYMBOL(down_write);
+EXPORT_SYMBOL(compat_down_write);
 
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-int down_write_trylock(struct rw_semaphore *sem)
+int compat_down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	int ret = __down_write_trylock(sem);
 
@@ -65,36 +65,36 @@ int down_write_trylock(struct rw_semapho
 	return ret;
 }
 
-EXPORT_SYMBOL(down_write_trylock);
+EXPORT_SYMBOL(compat_down_write_trylock);
 
 /*
  * release a read lock
  */
-void up_read(struct rw_semaphore *sem)
+void compat_up_read(struct compat_rw_semaphore *sem)
 {
 	rwsem_release(&sem->dep_map, 1, _RET_IP_);
 
 	__up_read(sem);
 }
 
-EXPORT_SYMBOL(up_read);
+EXPORT_SYMBOL(compat_up_read);
 
 /*
  * release a write lock
  */
-void up_write(struct rw_semaphore *sem)
+void compat_up_write(struct compat_rw_semaphore *sem)
 {
 	rwsem_release(&sem->dep_map, 1, _RET_IP_);
 
 	__up_write(sem);
 }
 
-EXPORT_SYMBOL(up_write);
+EXPORT_SYMBOL(compat_up_write);
 
 /*
  * downgrade write lock to read lock
  */
-void downgrade_write(struct rw_semaphore *sem)
+void compat_downgrade_write(struct compat_rw_semaphore *sem)
 {
 	/*
 	 * lockdep: a downgraded write will live on as a write
@@ -103,11 +103,11 @@ void downgrade_write(struct rw_semaphore
 	__downgrade_write(sem);
 }
 
-EXPORT_SYMBOL(downgrade_write);
+EXPORT_SYMBOL(compat_downgrade_write);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-void down_read_nested(struct rw_semaphore *sem, int subclass)
+void compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass)
 {
 	might_sleep();
 	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -115,18 +115,18 @@ void down_read_nested(struct rw_semaphor
 	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
 }
 
-EXPORT_SYMBOL(down_read_nested);
+EXPORT_SYMBOL(compat_down_read_nested);
 
-void down_read_non_owner(struct rw_semaphore *sem)
+void compat_down_read_non_owner(struct compat_rw_semaphore *sem)
 {
 	might_sleep();
 
 	__down_read(sem);
 }
 
-EXPORT_SYMBOL(down_read_non_owner);
+EXPORT_SYMBOL(compat_down_read_non_owner);
 
-void down_write_nested(struct rw_semaphore *sem, int subclass)
+void compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass)
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
@@ -134,14 +134,14 @@ void down_write_nested(struct rw_semapho
 	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
 }
 
-EXPORT_SYMBOL(down_write_nested);
+EXPORT_SYMBOL(compat_down_write_nested);
 
-void up_read_non_owner(struct rw_semaphore *sem)
+void compat_up_read_non_owner(struct compat_rw_semaphore *sem)
 {
 	__up_read(sem);
 }
 
-EXPORT_SYMBOL(up_read_non_owner);
+EXPORT_SYMBOL(compat_up_read_non_owner);
 
 #endif
 
Index: linux-2.6.25.4-rt4/kernel/spinlock.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/spinlock.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/spinlock.c	2008-05-29 09:46:26.000000000 -0400
@@ -21,7 +21,7 @@
 #include <linux/debug_locks.h>
 #include <linux/module.h>
 
-int __lockfunc _spin_trylock(spinlock_t *lock)
+int __lockfunc __spin_trylock(raw_spinlock_t *lock)
 {
 	preempt_disable();
 	if (_raw_spin_trylock(lock)) {
@@ -32,9 +32,46 @@ int __lockfunc _spin_trylock(spinlock_t 
 	preempt_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_spin_trylock);
+EXPORT_SYMBOL(__spin_trylock);
 
-int __lockfunc _read_trylock(rwlock_t *lock)
+int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock)
+{
+	local_irq_disable();
+	preempt_disable();
+
+	if (_raw_spin_trylock(lock)) {
+		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+		return 1;
+	}
+
+	__preempt_enable_no_resched();
+	local_irq_enable();
+	preempt_check_resched();
+
+	return 0;
+}
+EXPORT_SYMBOL(__spin_trylock_irq);
+
+int __lockfunc __spin_trylock_irqsave(raw_spinlock_t *lock,
+					 unsigned long *flags)
+{
+	local_irq_save(*flags);
+	preempt_disable();
+
+	if (_raw_spin_trylock(lock)) {
+		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+		return 1;
+	}
+
+	__preempt_enable_no_resched();
+	local_irq_restore(*flags);
+	preempt_check_resched();
+
+	return 0;
+}
+EXPORT_SYMBOL(__spin_trylock_irqsave);
+
+int __lockfunc __read_trylock(raw_rwlock_t *lock)
 {
 	preempt_disable();
 	if (_raw_read_trylock(lock)) {
@@ -45,9 +82,9 @@ int __lockfunc _read_trylock(rwlock_t *l
 	preempt_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_read_trylock);
+EXPORT_SYMBOL(__read_trylock);
 
-int __lockfunc _write_trylock(rwlock_t *lock)
+int __lockfunc __write_trylock(raw_rwlock_t *lock)
 {
 	preempt_disable();
 	if (_raw_write_trylock(lock)) {
@@ -58,7 +95,21 @@ int __lockfunc _write_trylock(rwlock_t *
 	preempt_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_write_trylock);
+EXPORT_SYMBOL(__write_trylock);
+
+int __lockfunc __write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags)
+{
+	int ret;
+
+	local_irq_save(*flags);
+	ret = __write_trylock(lock);
+	if (ret)
+		return ret;
+
+	local_irq_restore(*flags);
+	return 0;
+}
+EXPORT_SYMBOL(__write_trylock_irqsave);
 
 /*
  * If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,15 +118,15 @@ EXPORT_SYMBOL(_write_trylock);
  */
 #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
 
-void __lockfunc _read_lock(rwlock_t *lock)
+void __lockfunc __read_lock(raw_rwlock_t *lock)
 {
 	preempt_disable();
 	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
 }
-EXPORT_SYMBOL(_read_lock);
+EXPORT_SYMBOL(__read_lock);
 
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock)
 {
 	unsigned long flags;
 
@@ -94,27 +145,27 @@ unsigned long __lockfunc _spin_lock_irqs
 #endif
 	return flags;
 }
-EXPORT_SYMBOL(_spin_lock_irqsave);
+EXPORT_SYMBOL(__spin_lock_irqsave);
 
-void __lockfunc _spin_lock_irq(spinlock_t *lock)
+void __lockfunc __spin_lock_irq(raw_spinlock_t *lock)
 {
 	local_irq_disable();
 	preempt_disable();
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
 }
-EXPORT_SYMBOL(_spin_lock_irq);
+EXPORT_SYMBOL(__spin_lock_irq);
 
-void __lockfunc _spin_lock_bh(spinlock_t *lock)
+void __lockfunc __spin_lock_bh(raw_spinlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
 }
-EXPORT_SYMBOL(_spin_lock_bh);
+EXPORT_SYMBOL(__spin_lock_bh);
 
-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
+unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock)
 {
 	unsigned long flags;
 
@@ -124,27 +175,27 @@ unsigned long __lockfunc _read_lock_irqs
 	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
 	return flags;
 }
-EXPORT_SYMBOL(_read_lock_irqsave);
+EXPORT_SYMBOL(__read_lock_irqsave);
 
-void __lockfunc _read_lock_irq(rwlock_t *lock)
+void __lockfunc __read_lock_irq(raw_rwlock_t *lock)
 {
 	local_irq_disable();
 	preempt_disable();
 	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
 }
-EXPORT_SYMBOL(_read_lock_irq);
+EXPORT_SYMBOL(__read_lock_irq);
 
-void __lockfunc _read_lock_bh(rwlock_t *lock)
+void __lockfunc __read_lock_bh(raw_rwlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
 	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
 }
-EXPORT_SYMBOL(_read_lock_bh);
+EXPORT_SYMBOL(__read_lock_bh);
 
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
+unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock)
 {
 	unsigned long flags;
 
@@ -154,43 +205,43 @@ unsigned long __lockfunc _write_lock_irq
 	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
 	return flags;
 }
-EXPORT_SYMBOL(_write_lock_irqsave);
+EXPORT_SYMBOL(__write_lock_irqsave);
 
-void __lockfunc _write_lock_irq(rwlock_t *lock)
+void __lockfunc __write_lock_irq(raw_rwlock_t *lock)
 {
 	local_irq_disable();
 	preempt_disable();
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
 }
-EXPORT_SYMBOL(_write_lock_irq);
+EXPORT_SYMBOL(__write_lock_irq);
 
-void __lockfunc _write_lock_bh(rwlock_t *lock)
+void __lockfunc __write_lock_bh(raw_rwlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
 }
-EXPORT_SYMBOL(_write_lock_bh);
+EXPORT_SYMBOL(__write_lock_bh);
 
-void __lockfunc _spin_lock(spinlock_t *lock)
+void __lockfunc __spin_lock(raw_spinlock_t *lock)
 {
 	preempt_disable();
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
 }
 
-EXPORT_SYMBOL(_spin_lock);
+EXPORT_SYMBOL(__spin_lock);
 
-void __lockfunc _write_lock(rwlock_t *lock)
+void __lockfunc __write_lock(raw_rwlock_t *lock)
 {
 	preempt_disable();
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
 }
 
-EXPORT_SYMBOL(_write_lock);
+EXPORT_SYMBOL(__write_lock);
 
 #else /* CONFIG_PREEMPT: */
 
@@ -203,7 +254,7 @@ EXPORT_SYMBOL(_write_lock);
  */
 
 #define BUILD_LOCK_OPS(op, locktype)					\
-void __lockfunc _##op##_lock(locktype##_t *lock)			\
+void __lockfunc __##op##_lock(locktype##_t *lock)			\
 {									\
 	for (;;) {							\
 		preempt_disable();					\
@@ -213,15 +264,16 @@ void __lockfunc _##op##_lock(locktype##_
 									\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
-		while (!op##_can_lock(lock) && (lock)->break_lock)	\
-			_raw_##op##_relax(&lock->raw_lock);		\
+		while (!__raw_##op##_can_lock(&(lock)->raw_lock) &&	\
+					(lock)->break_lock)		\
+			__raw_##op##_relax(&lock->raw_lock);		\
 	}								\
 	(lock)->break_lock = 0;						\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock);						\
+EXPORT_SYMBOL(__##op##_lock);						\
 									\
-unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)	\
+unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock)	\
 {									\
 	unsigned long flags;						\
 									\
@@ -235,23 +287,24 @@ unsigned long __lockfunc _##op##_lock_ir
 									\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
-		while (!op##_can_lock(lock) && (lock)->break_lock)	\
-			_raw_##op##_relax(&lock->raw_lock);		\
+		while (!__raw_##op##_can_lock(&(lock)->raw_lock) &&	\
+						 (lock)->break_lock)	\
+			__raw_##op##_relax(&lock->raw_lock);		\
 	}								\
 	(lock)->break_lock = 0;						\
 	return flags;							\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_irqsave);					\
+EXPORT_SYMBOL(__##op##_lock_irqsave);					\
 									\
-void __lockfunc _##op##_lock_irq(locktype##_t *lock)			\
+void __lockfunc __##op##_lock_irq(locktype##_t *lock)			\
 {									\
-	_##op##_lock_irqsave(lock);					\
+	__##op##_lock_irqsave(lock);					\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_irq);					\
+EXPORT_SYMBOL(__##op##_lock_irq);					\
 									\
-void __lockfunc _##op##_lock_bh(locktype##_t *lock)			\
+void __lockfunc __##op##_lock_bh(locktype##_t *lock)			\
 {									\
 	unsigned long flags;						\
 									\
@@ -260,39 +313,40 @@ void __lockfunc _##op##_lock_bh(locktype
 	/* irq-disabling. We use the generic preemption-aware	*/	\
 	/* function:						*/	\
 	/**/								\
-	flags = _##op##_lock_irqsave(lock);				\
+	flags = __##op##_lock_irqsave(lock);				\
 	local_bh_disable();						\
 	local_irq_restore(flags);					\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_bh)
+EXPORT_SYMBOL(__##op##_lock_bh)
 
 /*
  * Build preemption-friendly versions of the following
  * lock-spinning functions:
  *
- *         _[spin|read|write]_lock()
- *         _[spin|read|write]_lock_irq()
- *         _[spin|read|write]_lock_irqsave()
- *         _[spin|read|write]_lock_bh()
+ *         __[spin|read|write]_lock()
+ *         __[spin|read|write]_lock_irq()
+ *         __[spin|read|write]_lock_irqsave()
+ *         __[spin|read|write]_lock_bh()
  */
-BUILD_LOCK_OPS(spin, spinlock);
-BUILD_LOCK_OPS(read, rwlock);
-BUILD_LOCK_OPS(write, rwlock);
+BUILD_LOCK_OPS(spin, raw_spinlock);
+BUILD_LOCK_OPS(read, raw_rwlock);
+BUILD_LOCK_OPS(write, raw_rwlock);
 
 #endif /* CONFIG_PREEMPT */
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
+void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass)
 {
 	preempt_disable();
 	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
 }
+EXPORT_SYMBOL(__spin_lock_nested);
 
-EXPORT_SYMBOL(_spin_lock_nested);
-unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
+unsigned long __lockfunc
+__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass)
 {
 	unsigned long flags;
 
@@ -311,117 +365,130 @@ unsigned long __lockfunc _spin_lock_irqs
 #endif
 	return flags;
 }
-
-EXPORT_SYMBOL(_spin_lock_irqsave_nested);
+EXPORT_SYMBOL(__spin_lock_irqsave_nested);
 
 #endif
 
-void __lockfunc _spin_unlock(spinlock_t *lock)
+void __lockfunc __spin_unlock(raw_spinlock_t *lock)
 {
 	spin_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_spin_unlock(lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_spin_unlock);
+EXPORT_SYMBOL(__spin_unlock);
+
+void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock)
+{
+	spin_release(&lock->dep_map, 1, _RET_IP_);
+	_raw_spin_unlock(lock);
+	__preempt_enable_no_resched();
+}
+/* not exported */
 
-void __lockfunc _write_unlock(rwlock_t *lock)
+void __lockfunc __write_unlock(raw_rwlock_t *lock)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_write_unlock(lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_write_unlock);
+EXPORT_SYMBOL(__write_unlock);
 
-void __lockfunc _read_unlock(rwlock_t *lock)
+void __lockfunc __read_unlock(raw_rwlock_t *lock)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_read_unlock(lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_read_unlock);
+EXPORT_SYMBOL(__read_unlock);
 
-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+void __lockfunc __spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
 {
 	spin_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_spin_unlock(lock);
+	__preempt_enable_no_resched();
 	local_irq_restore(flags);
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_spin_unlock_irqrestore);
+EXPORT_SYMBOL(__spin_unlock_irqrestore);
 
-void __lockfunc _spin_unlock_irq(spinlock_t *lock)
+void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock)
 {
 	spin_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_spin_unlock(lock);
+	__preempt_enable_no_resched();
 	local_irq_enable();
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_spin_unlock_irq);
+EXPORT_SYMBOL(__spin_unlock_irq);
 
-void __lockfunc _spin_unlock_bh(spinlock_t *lock)
+void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock)
 {
 	spin_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_spin_unlock(lock);
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
 }
-EXPORT_SYMBOL(_spin_unlock_bh);
+EXPORT_SYMBOL(__spin_unlock_bh);
 
-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+void __lockfunc __read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_read_unlock(lock);
+	__preempt_enable_no_resched();
 	local_irq_restore(flags);
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_read_unlock_irqrestore);
+EXPORT_SYMBOL(__read_unlock_irqrestore);
 
-void __lockfunc _read_unlock_irq(rwlock_t *lock)
+void __lockfunc __read_unlock_irq(raw_rwlock_t *lock)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_read_unlock(lock);
+	__preempt_enable_no_resched();
 	local_irq_enable();
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_read_unlock_irq);
+EXPORT_SYMBOL(__read_unlock_irq);
 
-void __lockfunc _read_unlock_bh(rwlock_t *lock)
+void __lockfunc __read_unlock_bh(raw_rwlock_t *lock)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_read_unlock(lock);
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
 }
-EXPORT_SYMBOL(_read_unlock_bh);
+EXPORT_SYMBOL(__read_unlock_bh);
 
-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+void __lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_write_unlock(lock);
+	__preempt_enable_no_resched();
 	local_irq_restore(flags);
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_write_unlock_irqrestore);
+EXPORT_SYMBOL(__write_unlock_irqrestore);
 
-void __lockfunc _write_unlock_irq(rwlock_t *lock)
+void __lockfunc __write_unlock_irq(raw_rwlock_t *lock)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_write_unlock(lock);
+	__preempt_enable_no_resched();
 	local_irq_enable();
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_write_unlock_irq);
+EXPORT_SYMBOL(__write_unlock_irq);
 
-void __lockfunc _write_unlock_bh(rwlock_t *lock)
+void __lockfunc __write_unlock_bh(raw_rwlock_t *lock)
 {
 	rwlock_release(&lock->dep_map, 1, _RET_IP_);
 	_raw_write_unlock(lock);
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
 }
-EXPORT_SYMBOL(_write_unlock_bh);
+EXPORT_SYMBOL(__write_unlock_bh);
 
-int __lockfunc _spin_trylock_bh(spinlock_t *lock)
+int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
@@ -430,11 +497,11 @@ int __lockfunc _spin_trylock_bh(spinlock
 		return 1;
 	}
 
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
 	return 0;
 }
-EXPORT_SYMBOL(_spin_trylock_bh);
+EXPORT_SYMBOL(__spin_trylock_bh);
 
 int in_lock_functions(unsigned long addr)
 {
@@ -442,6 +509,17 @@ int in_lock_functions(unsigned long addr
 	extern char __lock_text_start[], __lock_text_end[];
 
 	return addr >= (unsigned long)__lock_text_start
-	&& addr < (unsigned long)__lock_text_end;
+		&& addr < (unsigned long)__lock_text_end;
 }
 EXPORT_SYMBOL(in_lock_functions);
+
+void notrace __debug_atomic_dec_and_test(atomic_t *v)
+{
+	static int warn_once = 1;
+
+	if (!atomic_read(v) && warn_once) {
+		warn_once = 0;
+		printk("BUG: atomic counter underflow!\n");
+		WARN_ON(1);
+	}
+}
Index: linux-2.6.25.4-rt4/lib/dec_and_lock.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/dec_and_lock.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/dec_and_lock.c	2008-05-29 09:46:26.000000000 -0400
@@ -17,7 +17,7 @@
  * because the spin-lock and the decrement must be
  * "atomic".
  */
-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic)
 {
 #ifdef CONFIG_SMP
 	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
@@ -32,4 +32,4 @@ int _atomic_dec_and_lock(atomic_t *atomi
 	return 0;
 }
 
-EXPORT_SYMBOL(_atomic_dec_and_lock);
+EXPORT_SYMBOL(__atomic_dec_and_spin_lock);
Index: linux-2.6.25.4-rt4/lib/kernel_lock.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/kernel_lock.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/kernel_lock.c	2008-05-29 09:46:43.000000000 -0400
@@ -23,7 +23,7 @@
  *
  * Don't use in new code.
  */
-static DECLARE_MUTEX(kernel_sem);
+DECLARE_MUTEX(kernel_sem);
 
 /*
  * Re-acquire the kernel semaphore.
@@ -34,22 +34,25 @@ static DECLARE_MUTEX(kernel_sem);
  * about recursion, both due to the down() and due to the enabling of
  * preemption. schedule() will re-check the preemption flag after
  * reacquiring the semaphore.
+ *
+ * Called with interrupts disabled.
  */
 int __lockfunc __reacquire_kernel_lock(void)
 {
 	struct task_struct *task = current;
 	int saved_lock_depth = task->lock_depth;
 
+	local_irq_enable();
 	BUG_ON(saved_lock_depth < 0);
 
 	task->lock_depth = -1;
-	preempt_enable_no_resched();
 
 	down(&kernel_sem);
 
-	preempt_disable();
 	task->lock_depth = saved_lock_depth;
 
+	local_irq_disable();
+
 	return 0;
 }
 
@@ -66,11 +69,15 @@ void __lockfunc lock_kernel(void)
 	struct task_struct *task = current;
 	int depth = task->lock_depth + 1;
 
-	if (likely(!depth))
+	if (likely(!depth)) {
 		/*
 		 * No recursion worries - we set up lock_depth _after_
 		 */
 		down(&kernel_sem);
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+		current->last_kernel_lock = __builtin_return_address(0);
+#endif
+	}
 
 	task->lock_depth = depth;
 }
@@ -81,8 +88,12 @@ void __lockfunc unlock_kernel(void)
 
 	BUG_ON(task->lock_depth < 0);
 
-	if (likely(--task->lock_depth < 0))
+	if (likely(--task->lock_depth == -1)) {
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+		current->last_kernel_lock = NULL;
+#endif
 		up(&kernel_sem);
+	}
 }
 
 EXPORT_SYMBOL(lock_kernel);
Index: linux-2.6.25.4-rt4/lib/locking-selftest.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/locking-selftest.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/locking-selftest.c	2008-05-29 09:46:43.000000000 -0400
@@ -158,7 +158,7 @@ static void init_shared_classes(void)
 		local_bh_disable();		\
 		local_irq_disable();		\
 		trace_softirq_enter();		\
-		WARN_ON(!in_softirq());
+		/* FIXME: preemptible softirqs. WARN_ON(!in_softirq()); */
 
 #define SOFTIRQ_EXIT()				\
 		trace_softirq_exit();		\
@@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem)
 #undef E
 
 /*
+ * FIXME: turns these into raw-spinlock tests on -rt
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+/*
  * locking an irq-safe lock with irqs enabled:
  */
 #define E1()				\
@@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_
 #include "locking-selftest-softirq.h"
 // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft)
 
+#endif /* !CONFIG_PREEMPT_RT */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define I_SPINLOCK(x)	lockdep_reset_lock(&lock_##x.dep_map)
 # define I_RWLOCK(x)	lockdep_reset_lock(&rwlock_##x.dep_map)
@@ -940,6 +947,9 @@ static void dotest(void (*testcase_fn)(v
 {
 	unsigned long saved_preempt_count = preempt_count();
 	int expected_failure = 0;
+#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES)
+        int saved_lock_count = current->lock_count;
+#endif
 
 	WARN_ON(irqs_disabled());
 
@@ -989,6 +999,9 @@ static void dotest(void (*testcase_fn)(v
 #endif
 
 	reset_locks();
+#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES)
+        current->lock_count = saved_lock_count;
+#endif
 }
 
 static inline void print_testname(const char *testname)
@@ -998,7 +1011,7 @@ static inline void print_testname(const 
 
 #define DO_TESTCASE_1(desc, name, nr)				\
 	print_testname(desc"/"#nr);				\
-	dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK);		\
+	dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK);	\
 	printk("\n");
 
 #define DO_TESTCASE_1B(desc, name, nr)				\
@@ -1006,17 +1019,17 @@ static inline void print_testname(const 
 	dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK);		\
 	printk("\n");
 
-#define DO_TESTCASE_3(desc, name, nr)				\
-	print_testname(desc"/"#nr);				\
-	dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN);	\
-	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);	\
+#define DO_TESTCASE_3(desc, name, nr)					\
+	print_testname(desc"/"#nr);					\
+	dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN);		\
+	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);		\
 	dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK);	\
 	printk("\n");
 
-#define DO_TESTCASE_3RW(desc, name, nr)				\
-	print_testname(desc"/"#nr);				\
+#define DO_TESTCASE_3RW(desc, name, nr)					\
+	print_testname(desc"/"#nr);					\
 	dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\
-	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);	\
+	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);		\
 	dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK);	\
 	printk("\n");
 
@@ -1047,7 +1060,7 @@ static inline void print_testname(const 
 	print_testname(desc);					\
 	dotest(name##_spin, FAILURE, LOCKTYPE_SPIN);		\
 	dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK);		\
-	dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK);		\
+	dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK);	\
 	dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX);		\
 	dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM);		\
 	dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM);		\
@@ -1179,6 +1192,7 @@ void locking_selftest(void)
 	/*
 	 * irq-context testcases:
 	 */
+#ifndef CONFIG_PREEMPT_RT
 	DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1);
 	DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A);
 	DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B);
@@ -1188,6 +1202,7 @@ void locking_selftest(void)
 
 	DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
 //	DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
+#endif
 
 	if (unexpected_testcase_failures) {
 		printk("-----------------------------------------------------------------\n");
Index: linux-2.6.25.4-rt4/lib/plist.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/plist.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/plist.c	2008-05-29 09:47:10.000000000 -0400
@@ -53,7 +53,9 @@ static void plist_check_list(struct list
 
 static void plist_check_head(struct plist_head *head)
 {
+#ifndef CONFIG_PREEMPT_RT
 	WARN_ON(!head->lock);
+#endif
 	if (head->lock)
 		WARN_ON_SMP(!spin_is_locked(head->lock));
 	plist_check_list(&head->prio_list);
@@ -64,6 +66,30 @@ static void plist_check_head(struct plis
 # define plist_check_head(h)	do { } while (0)
 #endif
 
+static inline struct plist_node *prev_node(struct plist_node *iter)
+{
+	return list_entry(iter->plist.node_list.prev, struct plist_node,
+			plist.node_list);
+}
+
+static inline struct plist_node *next_node(struct plist_node *iter)
+{
+	return list_entry(iter->plist.node_list.next, struct plist_node,
+			plist.node_list);
+}
+
+static inline struct plist_node *prev_prio(struct plist_node *iter)
+{
+	return list_entry(iter->plist.prio_list.prev, struct plist_node,
+			plist.prio_list);
+}
+
+static inline struct plist_node *next_prio(struct plist_node *iter)
+{
+	return list_entry(iter->plist.prio_list.next, struct plist_node,
+			plist.prio_list);
+}
+
 /**
  * plist_add - add @node to @head
  *
@@ -81,8 +107,7 @@ void plist_add(struct plist_node *node, 
 		if (node->prio < iter->prio)
 			goto lt_prio;
 		else if (node->prio == iter->prio) {
-			iter = list_entry(iter->plist.prio_list.next,
-					struct plist_node, plist.prio_list);
+			iter = next_prio(iter);
 			goto eq_prio;
 		}
 	}
@@ -116,3 +141,44 @@ void plist_del(struct plist_node *node, 
 
 	plist_check_head(head);
 }
+
+void plist_head_splice(struct plist_head *src, struct plist_head *dst)
+{
+	struct plist_node *src_iter_first, *src_iter_last, *dst_iter;
+	struct plist_node *tail = container_of(dst, struct plist_node, plist);
+
+	dst_iter = next_prio(tail);
+
+	while (!plist_head_empty(src) && dst_iter != tail) {
+		src_iter_first = plist_first(src);
+
+		src_iter_last = next_prio(src_iter_first);
+		src_iter_last = prev_node(src_iter_last);
+
+		WARN_ON(src_iter_first->prio != src_iter_last->prio);
+		WARN_ON(list_empty(&src_iter_first->plist.prio_list));
+
+		while (src_iter_first->prio > dst_iter->prio) {
+			dst_iter = next_prio(dst_iter);
+			if (dst_iter == tail)
+				goto tail;
+		}
+
+		list_del_init(&src_iter_first->plist.prio_list);
+
+		if (src_iter_first->prio < dst_iter->prio) {
+			list_add_tail(&src_iter_first->plist.prio_list,
+					&dst_iter->plist.prio_list);
+		} else if (src_iter_first->prio == dst_iter->prio) {
+			dst_iter = next_prio(dst_iter);
+		} else BUG();
+
+		list_splice2_tail(&src_iter_first->plist.node_list,
+			       	  &src_iter_last->plist.node_list,
+				  &dst_iter->plist.node_list);
+	}
+
+tail:
+	list_splice_tail_init(&src->prio_list, &dst->prio_list);
+	list_splice_tail_init(&src->node_list, &dst->node_list);
+}
Index: linux-2.6.25.4-rt4/lib/rwsem-spinlock.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/rwsem-spinlock.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/rwsem-spinlock.c	2008-05-29 09:46:26.000000000 -0400
@@ -20,7 +20,7 @@ struct rwsem_waiter {
 /*
  * initialise the semaphore
  */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
+void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name,
 		  struct lock_class_key *key)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -44,8 +44,8 @@ void __init_rwsem(struct rw_semaphore *s
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only woken if wakewrite is non-zero
  */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
+static inline struct compat_rw_semaphore *
+__rwsem_do_wake(struct compat_rw_semaphore *sem, int wakewrite)
 {
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
@@ -103,8 +103,8 @@ __rwsem_do_wake(struct rw_semaphore *sem
 /*
  * wake a single writer
  */
-static inline struct rw_semaphore *
-__rwsem_wake_one_writer(struct rw_semaphore *sem)
+static inline struct compat_rw_semaphore *
+__rwsem_wake_one_writer(struct compat_rw_semaphore *sem)
 {
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
@@ -125,7 +125,7 @@ __rwsem_wake_one_writer(struct rw_semaph
 /*
  * get a read lock on the semaphore
  */
-void __sched __down_read(struct rw_semaphore *sem)
+void __sched __down_read(struct compat_rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 	struct task_struct *tsk;
@@ -168,7 +168,7 @@ void __sched __down_read(struct rw_semap
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-int __down_read_trylock(struct rw_semaphore *sem)
+int __down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -191,7 +191,8 @@ int __down_read_trylock(struct rw_semaph
  * get a write lock on the semaphore
  * - we increment the waiting count anyway to indicate an exclusive lock
  */
-void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
+void __sched
+__down_write_nested(struct compat_rw_semaphore *sem, int subclass)
 {
 	struct rwsem_waiter waiter;
 	struct task_struct *tsk;
@@ -231,7 +232,7 @@ void __sched __down_write_nested(struct 
 	;
 }
 
-void __sched __down_write(struct rw_semaphore *sem)
+void __sched __down_write(struct compat_rw_semaphore *sem)
 {
 	__down_write_nested(sem, 0);
 }
@@ -239,7 +240,7 @@ void __sched __down_write(struct rw_sema
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-int __down_write_trylock(struct rw_semaphore *sem)
+int __down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -260,7 +261,7 @@ int __down_write_trylock(struct rw_semap
 /*
  * release a read lock on the semaphore
  */
-void __up_read(struct rw_semaphore *sem)
+void __up_read(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 
@@ -275,7 +276,7 @@ void __up_read(struct rw_semaphore *sem)
 /*
  * release a write lock on the semaphore
  */
-void __up_write(struct rw_semaphore *sem)
+void __up_write(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 
@@ -292,7 +293,7 @@ void __up_write(struct rw_semaphore *sem
  * downgrade a write lock into a read lock
  * - just wake up any readers at the front of the queue
  */
-void __downgrade_write(struct rw_semaphore *sem)
+void __downgrade_write(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 
@@ -305,7 +306,7 @@ void __downgrade_write(struct rw_semapho
 	spin_unlock_irqrestore(&sem->wait_lock, flags);
 }
 
-EXPORT_SYMBOL(__init_rwsem);
+EXPORT_SYMBOL(__compat_init_rwsem);
 EXPORT_SYMBOL(__down_read);
 EXPORT_SYMBOL(__down_read_trylock);
 EXPORT_SYMBOL(__down_write_nested);
Index: linux-2.6.25.4-rt4/lib/rwsem.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/rwsem.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/rwsem.c	2008-05-29 09:46:26.000000000 -0400
@@ -11,8 +11,8 @@
 /*
  * Initialize an rwsem:
  */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
-		  struct lock_class_key *key)
+void __compat_init_rwsem(struct rw_semaphore *sem, const char *name,
+			 struct lock_class_key *key)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	/*
@@ -26,7 +26,7 @@ void __init_rwsem(struct rw_semaphore *s
 	INIT_LIST_HEAD(&sem->wait_list);
 }
 
-EXPORT_SYMBOL(__init_rwsem);
+EXPORT_SYMBOL(__compat_init_rwsem);
 
 struct rwsem_waiter {
 	struct list_head list;
Index: linux-2.6.25.4-rt4/lib/semaphore-sleepers.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/semaphore-sleepers.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/semaphore-sleepers.c	2008-05-29 09:46:26.000000000 -0400
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <asm/semaphore.h>
 
 /*
@@ -48,12 +49,12 @@
  *    we cannot lose wakeup events.
  */
 
-void __up(struct semaphore *sem)
+void __compat_up(struct compat_semaphore *sem)
 {
 	wake_up(&sem->wait);
 }
 
-void __sched __down(struct semaphore *sem)
+void __sched __compat_down(struct compat_semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -90,7 +91,7 @@ void __sched __down(struct semaphore *se
 	tsk->state = TASK_RUNNING;
 }
 
-int __sched __down_interruptible(struct semaphore *sem)
+int __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -153,7 +154,7 @@ int __sched __down_interruptible(struct 
  * single "cmpxchg" without failure cases,
  * but then it wouldn't work on a 386.
  */
-int __down_trylock(struct semaphore *sem)
+int __compat_down_trylock(struct compat_semaphore * sem)
 {
 	int sleepers;
 	unsigned long flags;
@@ -174,3 +175,10 @@ int __down_trylock(struct semaphore *sem
 	spin_unlock_irqrestore(&sem->wait.lock, flags);
 	return 1;
 }
+
+int  compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
+
+EXPORT_SYMBOL(compat_sem_is_locked);
Index: linux-2.6.25.4-rt4/lib/spinlock_debug.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/spinlock_debug.c	2008-05-29 09:45:51.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/spinlock_debug.c	2008-05-29 09:46:26.000000000 -0400
@@ -13,8 +13,8 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 
-void __spin_lock_init(spinlock_t *lock, const char *name,
-		      struct lock_class_key *key)
+void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
+			  struct lock_class_key *key)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	/*
@@ -23,16 +23,16 @@ void __spin_lock_init(spinlock_t *lock, 
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
 	lockdep_init_map(&lock->dep_map, name, key, 0);
 #endif
-	lock->raw_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+	lock->raw_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	lock->magic = SPINLOCK_MAGIC;
 	lock->owner = SPINLOCK_OWNER_INIT;
 	lock->owner_cpu = -1;
 }
 
-EXPORT_SYMBOL(__spin_lock_init);
+EXPORT_SYMBOL(__raw_spin_lock_init);
 
-void __rwlock_init(rwlock_t *lock, const char *name,
-		   struct lock_class_key *key)
+void __raw_rwlock_init(raw_rwlock_t *lock, const char *name,
+		       struct lock_class_key *key)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	/*
@@ -41,15 +41,15 @@ void __rwlock_init(rwlock_t *lock, const
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
 	lockdep_init_map(&lock->dep_map, name, key, 0);
 #endif
-	lock->raw_lock = (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED;
+	lock->raw_lock = (__raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED;
 	lock->magic = RWLOCK_MAGIC;
 	lock->owner = SPINLOCK_OWNER_INIT;
 	lock->owner_cpu = -1;
 }
 
-EXPORT_SYMBOL(__rwlock_init);
+EXPORT_SYMBOL(__raw_rwlock_init);
 
-static void spin_bug(spinlock_t *lock, const char *msg)
+static void spin_bug(raw_spinlock_t *lock, const char *msg)
 {
 	struct task_struct *owner = NULL;
 
@@ -73,7 +73,7 @@ static void spin_bug(spinlock_t *lock, c
 #define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
 
 static inline void
-debug_spin_lock_before(spinlock_t *lock)
+debug_spin_lock_before(raw_spinlock_t *lock)
 {
 	SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
 	SPIN_BUG_ON(lock->owner == current, lock, "recursion");
@@ -81,13 +81,13 @@ debug_spin_lock_before(spinlock_t *lock)
 							lock, "cpu recursion");
 }
 
-static inline void debug_spin_lock_after(spinlock_t *lock)
+static inline void debug_spin_lock_after(raw_spinlock_t *lock)
 {
 	lock->owner_cpu = raw_smp_processor_id();
 	lock->owner = current;
 }
 
-static inline void debug_spin_unlock(spinlock_t *lock)
+static inline void debug_spin_unlock(raw_spinlock_t *lock)
 {
 	SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
 	SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked");
@@ -98,7 +98,7 @@ static inline void debug_spin_unlock(spi
 	lock->owner_cpu = -1;
 }
 
-static void __spin_lock_debug(spinlock_t *lock)
+static void __spin_lock_debug(raw_spinlock_t *lock)
 {
 	u64 i;
 	u64 loops = loops_per_jiffy * HZ;
@@ -125,7 +125,7 @@ static void __spin_lock_debug(spinlock_t
 	}
 }
 
-void _raw_spin_lock(spinlock_t *lock)
+void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
 {
 	debug_spin_lock_before(lock);
 	if (unlikely(!__raw_spin_trylock(&lock->raw_lock)))
@@ -133,7 +133,7 @@ void _raw_spin_lock(spinlock_t *lock)
 	debug_spin_lock_after(lock);
 }
 
-int _raw_spin_trylock(spinlock_t *lock)
+int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
 {
 	int ret = __raw_spin_trylock(&lock->raw_lock);
 
@@ -148,13 +148,13 @@ int _raw_spin_trylock(spinlock_t *lock)
 	return ret;
 }
 
-void _raw_spin_unlock(spinlock_t *lock)
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
 {
 	debug_spin_unlock(lock);
 	__raw_spin_unlock(&lock->raw_lock);
 }
 
-static void rwlock_bug(rwlock_t *lock, const char *msg)
+static void rwlock_bug(raw_rwlock_t *lock, const char *msg)
 {
 	if (!debug_locks_off())
 		return;
@@ -167,8 +167,8 @@ static void rwlock_bug(rwlock_t *lock, c
 
 #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
 
-#if 0		/* __write_lock_debug() can lock up - maybe this can too? */
-static void __read_lock_debug(rwlock_t *lock)
+#if 1		/* __write_lock_debug() can lock up - maybe this can too? */
+static void __raw_read_lock_debug(raw_rwlock_t *lock)
 {
 	u64 i;
 	u64 loops = loops_per_jiffy * HZ;
@@ -193,13 +193,13 @@ static void __read_lock_debug(rwlock_t *
 }
 #endif
 
-void _raw_read_lock(rwlock_t *lock)
+void __lockfunc _raw_read_lock(raw_rwlock_t *lock)
 {
 	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
-	__raw_read_lock(&lock->raw_lock);
+	__raw_read_lock_debug(lock);
 }
 
-int _raw_read_trylock(rwlock_t *lock)
+int __lockfunc _raw_read_trylock(raw_rwlock_t *lock)
 {
 	int ret = __raw_read_trylock(&lock->raw_lock);
 
@@ -212,13 +212,13 @@ int _raw_read_trylock(rwlock_t *lock)
 	return ret;
 }
 
-void _raw_read_unlock(rwlock_t *lock)
+void __lockfunc _raw_read_unlock(raw_rwlock_t *lock)
 {
 	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
 	__raw_read_unlock(&lock->raw_lock);
 }
 
-static inline void debug_write_lock_before(rwlock_t *lock)
+static inline void debug_write_lock_before(raw_rwlock_t *lock)
 {
 	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
 	RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
@@ -226,13 +226,13 @@ static inline void debug_write_lock_befo
 							lock, "cpu recursion");
 }
 
-static inline void debug_write_lock_after(rwlock_t *lock)
+static inline void debug_write_lock_after(raw_rwlock_t *lock)
 {
 	lock->owner_cpu = raw_smp_processor_id();
 	lock->owner = current;
 }
 
-static inline void debug_write_unlock(rwlock_t *lock)
+static inline void debug_write_unlock(raw_rwlock_t *lock)
 {
 	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
 	RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
@@ -242,8 +242,8 @@ static inline void debug_write_unlock(rw
 	lock->owner_cpu = -1;
 }
 
-#if 0		/* This can cause lockups */
-static void __write_lock_debug(rwlock_t *lock)
+#if 1		/* This can cause lockups */
+static void __raw_write_lock_debug(raw_rwlock_t *lock)
 {
 	u64 i;
 	u64 loops = loops_per_jiffy * HZ;
@@ -268,14 +268,14 @@ static void __write_lock_debug(rwlock_t 
 }
 #endif
 
-void _raw_write_lock(rwlock_t *lock)
+void __lockfunc _raw_write_lock(raw_rwlock_t *lock)
 {
 	debug_write_lock_before(lock);
-	__raw_write_lock(&lock->raw_lock);
+	__raw_write_lock_debug(lock);
 	debug_write_lock_after(lock);
 }
 
-int _raw_write_trylock(rwlock_t *lock)
+int __lockfunc _raw_write_trylock(raw_rwlock_t *lock)
 {
 	int ret = __raw_write_trylock(&lock->raw_lock);
 
@@ -290,7 +290,7 @@ int _raw_write_trylock(rwlock_t *lock)
 	return ret;
 }
 
-void _raw_write_unlock(rwlock_t *lock)
+void __lockfunc _raw_write_unlock(raw_rwlock_t *lock)
 {
 	debug_write_unlock(lock);
 	__raw_write_unlock(&lock->raw_lock);
Index: linux-2.6.25.4-rt4/include/linux/pickop.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/linux/pickop.h	2008-05-29 09:46:26.000000000 -0400
@@ -0,0 +1,32 @@
+#ifndef _LINUX_PICKOP_H
+#define _LINUX_PICKOP_H
+
+#undef PICK_TYPE_EQUAL
+#define PICK_TYPE_EQUAL(var, type) \
+		__builtin_types_compatible_p(typeof(var), type)
+
+extern int __bad_func_type(void);
+
+#define PICK_FUNCTION(type1, type2, func1, func2, arg0, ...)		\
+do {									\
+	if (PICK_TYPE_EQUAL((arg0), type1))				\
+		func1((type1)(arg0), ##__VA_ARGS__);			\
+	else if (PICK_TYPE_EQUAL((arg0), type2))			\
+		func2((type2)(arg0), ##__VA_ARGS__);			\
+	else __bad_func_type();						\
+} while (0)
+
+#define PICK_FUNCTION_RET(type1, type2, func1, func2, arg0, ...)	\
+({									\
+	unsigned long __ret;						\
+									\
+	if (PICK_TYPE_EQUAL((arg0), type1))				\
+		__ret = func1((type1)(arg0), ##__VA_ARGS__);		\
+	else if (PICK_TYPE_EQUAL((arg0), type2))			\
+		__ret = func2((type2)(arg0), ##__VA_ARGS__);		\
+	else __ret = __bad_func_type();					\
+									\
+	__ret;								\
+})
+
+#endif /* _LINUX_PICKOP_H */
Index: linux-2.6.25.4-rt4/arch/x86/kernel/apm_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/apm_32.c	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/apm_32.c	2008-05-29 09:46:27.000000000 -0400
@@ -783,7 +783,7 @@ static int apm_do_idle(void)
 		 */
 		smp_mb();
 	}
-	if (!need_resched()) {
+	if (!need_resched() && !need_resched_delayed()) {
 		idled = 1;
 		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
 	}
Index: linux-2.6.25.4-rt4/arch/x86/lib/semaphore_32.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/lib/semaphore_32.S	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/lib/semaphore_32.S	2008-05-29 09:46:27.000000000 -0400
@@ -30,7 +30,7 @@
  * value or just clobbered..
  */
 	.section .sched.text, "ax"
-ENTRY(__down_failed)
+ENTRY(__compat_down_failed)
 	CFI_STARTPROC
 	FRAME
 	pushl %edx
@@ -39,7 +39,7 @@ ENTRY(__down_failed)
 	pushl %ecx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ecx,0
-	call __down
+	call __compat_down
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	CFI_RESTORE ecx
@@ -50,8 +50,9 @@ ENTRY(__down_failed)
 	ret
 	CFI_ENDPROC
 	ENDPROC(__down_failed)
+	ENDPROC(__compat_down_failed)
 
-ENTRY(__down_failed_interruptible)
+ENTRY(__compat_down_failed_interruptible)
 	CFI_STARTPROC
 	FRAME
 	pushl %edx
@@ -60,7 +61,7 @@ ENTRY(__down_failed_interruptible)
 	pushl %ecx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ecx,0
-	call __down_interruptible
+	call __compat_down_interruptible
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	CFI_RESTORE ecx
@@ -71,8 +72,9 @@ ENTRY(__down_failed_interruptible)
 	ret
 	CFI_ENDPROC
 	ENDPROC(__down_failed_interruptible)
+	ENDPROC(__compat_down_failed_interruptible)
 
-ENTRY(__down_failed_trylock)
+ENTRY(__compat_down_failed_trylock)
 	CFI_STARTPROC
 	FRAME
 	pushl %edx
@@ -81,7 +83,7 @@ ENTRY(__down_failed_trylock)
 	pushl %ecx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ecx,0
-	call __down_trylock
+	call __compat_down_trylock
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	CFI_RESTORE ecx
@@ -91,9 +93,9 @@ ENTRY(__down_failed_trylock)
 	ENDFRAME
 	ret
 	CFI_ENDPROC
-	ENDPROC(__down_failed_trylock)
+	ENDPROC(__compat_down_failed_trylock)
 
-ENTRY(__up_wakeup)
+ENTRY(__compat_up_wakeup)
 	CFI_STARTPROC
 	FRAME
 	pushl %edx
@@ -102,7 +104,7 @@ ENTRY(__up_wakeup)
 	pushl %ecx
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ecx,0
-	call __up
+	call __compat_up
 	popl %ecx
 	CFI_ADJUST_CFA_OFFSET -4
 	CFI_RESTORE ecx
@@ -112,7 +114,7 @@ ENTRY(__up_wakeup)
 	ENDFRAME
 	ret
 	CFI_ENDPROC
-	ENDPROC(__up_wakeup)
+	ENDPROC(__compat_up_wakeup)
 
 /*
  * rw spinlock fallbacks
Index: linux-2.6.25.4-rt4/include/asm-x86/rwsem.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/rwsem.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/rwsem.h	2008-05-29 09:46:27.000000000 -0400
@@ -44,19 +44,19 @@
 
 struct rwsem_waiter;
 
-extern asmregparm struct rw_semaphore *
- rwsem_down_read_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *
- rwsem_down_write_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *
- rwsem_wake(struct rw_semaphore *);
-extern asmregparm struct rw_semaphore *
- rwsem_downgrade_wake(struct rw_semaphore *sem);
+extern asmregparm struct compat_rw_semaphore *
+rwsem_down_read_failed(struct compat_rw_semaphore *sem);
+extern asmregparm struct compat_rw_semaphore *
+rwsem_down_write_failed(struct compat_rw_semaphore *sem);
+extern asmregparm struct compat_rw_semaphore *
+rwsem_wake(struct compat_rw_semaphore *);
+extern asmregparm struct compat_rw_semaphore *
+rwsem_downgrade_wake(struct compat_rw_semaphore *sem);
 
 /*
  * the semaphore definition
  */
-struct rw_semaphore {
+struct compat_rw_semaphore {
 	signed long		count;
 #define RWSEM_UNLOCKED_VALUE		0x00000000
 #define RWSEM_ACTIVE_BIAS		0x00000001
@@ -82,23 +82,23 @@ struct rw_semaphore {
 { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
   LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
 
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+#define COMPAT_DECLARE_RWSEM(name) \
+	struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name)
 
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
+extern void __compat_init_rwsem(struct rw_semaphore *sem, const char *name,
 			 struct lock_class_key *key);
 
-#define init_rwsem(sem)						\
+#define compat_init_rwsem(sem)					\
 do {								\
 	static struct lock_class_key __key;			\
 								\
-	__init_rwsem((sem), #sem, &__key);			\
+	__compat_init_rwsem((sem), #sem, &__key);		\
 } while (0)
 
 /*
  * lock for reading
  */
-static inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"# beginning down_read\n\t"
@@ -115,7 +115,7 @@ LOCK_PREFIX	"  incl      (%%eax)\n\t" /*
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-static inline int __down_read_trylock(struct rw_semaphore *sem)
+static inline int __down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	__s32 result, tmp;
 	__asm__ __volatile__(
@@ -138,7 +138,8 @@ LOCK_PREFIX	"  cmpxchgl  %2,%0\n\t"
 /*
  * lock for writing
  */
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+static inline void
+__down_write_nested(struct compat_rw_semaphore *sem, int subclass)
 {
 	int tmp;
 
@@ -164,7 +165,7 @@ static inline void __down_write(struct r
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-static inline int __down_write_trylock(struct rw_semaphore *sem)
+static inline int __down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	signed long ret = cmpxchg(&sem->count,
 				  RWSEM_UNLOCKED_VALUE, 
@@ -177,7 +178,7 @@ static inline int __down_write_trylock(s
 /*
  * unlock after reading
  */
-static inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct compat_rw_semaphore *sem)
 {
 	__s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
 	__asm__ __volatile__(
@@ -195,7 +196,7 @@ LOCK_PREFIX	"  xadd      %%edx,(%%eax)\n
 /*
  * unlock after writing
  */
-static inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"# beginning __up_write\n\t"
@@ -213,7 +214,7 @@ LOCK_PREFIX	"  xaddl     %%edx,(%%eax)\n
 /*
  * downgrade write lock to read lock
  */
-static inline void __downgrade_write(struct rw_semaphore *sem)
+static inline void __downgrade_write(struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"# beginning __downgrade_write\n\t"
@@ -230,7 +231,7 @@ LOCK_PREFIX	"  addl      %2,(%%eax)\n\t"
 /*
  * implement atomic add functionality
  */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 LOCK_PREFIX	"addl %1,%0"
@@ -241,7 +242,7 @@ LOCK_PREFIX	"addl %1,%0"
 /*
  * implement exchange and add functionality
  */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
+static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem)
 {
 	int tmp = delta;
 
@@ -253,7 +254,7 @@ LOCK_PREFIX	"xadd %0,%1"
 	return tmp+delta;
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
+static inline int compat_rwsem_is_locked(struct rw_semaphore *sem)
 {
 	return (sem->count != 0);
 }
Index: linux-2.6.25.4-rt4/include/asm-x86/semaphore_32.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/semaphore_32.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/semaphore_32.h	2008-05-29 09:46:27.000000000 -0400
@@ -3,8 +3,6 @@
 
 #include <linux/linkage.h>
 
-#ifdef __KERNEL__
-
 /*
  * SMP- and interrupt-safe semaphores..
  *
@@ -41,29 +39,39 @@
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 
-struct semaphore {
+/*
+ * On !PREEMPT_RT all semaphores are compat:
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
+struct compat_semaphore {
 	atomic_t count;
 	int sleepers;
 	wait_queue_head_t wait;
 };
 
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.sleepers	= 0,						\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name,1)
 
-#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-static inline void sema_init (struct semaphore *sem, int val)
+#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1)
+
+static inline void compat_sema_init (struct compat_semaphore *sem, int val)
 {
 /*
- *	*sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
+ *	*sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
  *
  * i'd rather use the more flexible initialization above, but sadly
  * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well.
@@ -73,27 +81,27 @@ static inline void sema_init (struct sem
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX (struct semaphore *sem)
+static inline void compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-extern asmregparm void __down_failed(atomic_t *count_ptr);
-extern asmregparm int  __down_failed_interruptible(atomic_t *count_ptr);
-extern asmregparm int  __down_failed_trylock(atomic_t *count_ptr);
-extern asmregparm void __up_wakeup(atomic_t *count_ptr);
+extern asmregparm void __compat_down_failed(void);
+extern asmregparm int  __compat_down_failed_interruptible(void);
+extern asmregparm int  __compat_down_failed_trylock(void);
+extern asmregparm void __compat_up_wakeup(void);
 
 /*
  * This is ugly, but we want the default case to fall through.
  * "__down_failed" is a special asm handler that calls the C
  * routine that actually waits. See arch/i386/kernel/semaphore.c
  */
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 	__asm__ __volatile__(
@@ -101,7 +109,7 @@ static inline void down(struct semaphore
 		LOCK_PREFIX "decl %0\n\t"     /* --sem->count */
 		"jns 2f\n"
 		"\tlea %0,%%eax\n\t"
-		"call __down_failed\n"
+		"call __compat_down_failed\n"
 		"2:"
 		:"+m" (sem->count)
 		:
@@ -112,7 +120,7 @@ static inline void down(struct semaphore
  * Interruptible try to acquire a semaphore.  If we obtained
  * it, return zero.  If we were interrupted, returns -EINTR
  */
-static inline int down_interruptible(struct semaphore * sem)
+static inline int compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int result;
 
@@ -123,7 +131,7 @@ static inline int down_interruptible(str
 		LOCK_PREFIX "decl %1\n\t"     /* --sem->count */
 		"jns 2f\n\t"
 		"lea %1,%%eax\n\t"
-		"call __down_failed_interruptible\n"
+		"call __compat_down_failed_interruptible\n"
 		"2:"
 		:"=&a" (result), "+m" (sem->count)
 		:
@@ -135,7 +143,7 @@ static inline int down_interruptible(str
  * Non-blockingly attempt to down() a semaphore.
  * Returns zero if we acquired it
  */
-static inline int down_trylock(struct semaphore * sem)
+static inline int compat_down_trylock(struct compat_semaphore * sem)
 {
 	int result;
 
@@ -145,7 +153,7 @@ static inline int down_trylock(struct se
 		LOCK_PREFIX "decl %1\n\t"     /* --sem->count */
 		"jns 2f\n\t"
 		"lea %1,%%eax\n\t"
-		"call __down_failed_trylock\n\t"
+		"call __compat_down_failed_trylock\n\t"
 		"2:\n"
 		:"=&a" (result), "+m" (sem->count)
 		:
@@ -157,19 +165,24 @@ static inline int down_trylock(struct se
  * Note! This is subtle. We jump to wake people up only if
  * the semaphore was negative (== somebody was waiting on it).
  */
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	__asm__ __volatile__(
 		"# atomic up operation\n\t"
 		LOCK_PREFIX "incl %0\n\t"     /* ++sem->count */
 		"jg 1f\n\t"
 		"lea %0,%%eax\n\t"
-		"call __up_wakeup\n"
+		"call __compat_up_wakeup\n"
 		"1:"
 		:"+m" (sem->count)
 		:
 		:"memory","ax");
 }
 
-#endif
+extern int compat_sem_is_locked(struct compat_semaphore *sem);
+
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
+
+#include <linux/semaphore.h>
+
 #endif
Index: linux-2.6.25.4-rt4/include/asm-x86/spinlock_types.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/spinlock_types.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/spinlock_types.h	2008-05-29 09:46:27.000000000 -0400
@@ -7,13 +7,13 @@
 
 typedef struct {
 	unsigned int slock;
-} raw_spinlock_t;
+} __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
 
 typedef struct {
 	unsigned int lock;
-} raw_rwlock_t;
+} __raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
 
Index: linux-2.6.25.4-rt4/include/asm-x86/thread_info_32.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/thread_info_32.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/thread_info_32.h	2008-05-29 09:46:27.000000000 -0400
@@ -133,6 +133,7 @@ static inline struct thread_info *curren
 #define TIF_SECCOMP		7	/* secure computing */
 #define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal() */
 #define TIF_HRTICK_RESCHED	9	/* reprogram hrtick timer */
+#define TIF_NEED_RESCHED_DELAYED 10	/* reschedule on return to userspace */
 #define TIF_MEMDIE		16
 #define TIF_DEBUG		17	/* uses debug registers */
 #define TIF_IO_BITMAP		18	/* uses I/O bitmap */
@@ -153,6 +154,7 @@ static inline struct thread_info *curren
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_HRTICK_RESCHED	(1<<TIF_HRTICK_RESCHED)
+#define _TIF_NEED_RESCHED_DELAYED (1<<TIF_NEED_RESCHED_DELAYED)
 #define _TIF_DEBUG		(1<<TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1<<TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)
Index: linux-2.6.25.4-rt4/include/asm-x86/spinlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/spinlock.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/spinlock.h	2008-05-29 09:46:37.000000000 -0400
@@ -64,21 +64,21 @@ typedef int _slock_t;
  * much between them in performance though, especially as locks are out of line.
  */
 #if (NR_CPUS < 256)
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __raw_spin_is_locked(__raw_spinlock_t *lock)
 {
 	int tmp = *(volatile signed int *)(&(lock)->slock);
 
 	return (((tmp >> 8) & 0xff) != (tmp & 0xff));
 }
 
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __raw_spin_is_contended(__raw_spinlock_t *lock)
 {
 	int tmp = *(volatile signed int *)(&(lock)->slock);
 
 	return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1;
 }
 
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
+static inline void __raw_spin_lock(__raw_spinlock_t *lock)
 {
 	short inc = 0x0100;
 
@@ -99,7 +99,7 @@ static inline void __raw_spin_lock(raw_s
 
 #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static inline int __raw_spin_trylock(__raw_spinlock_t *lock)
 {
 	int tmp;
 	short new;
@@ -121,7 +121,7 @@ static inline int __raw_spin_trylock(raw
 	return tmp;
 }
 
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static inline void __raw_spin_unlock(__raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
 		UNLOCK_LOCK_PREFIX "incb %0"
@@ -130,21 +130,21 @@ static inline void __raw_spin_unlock(raw
 		:"memory", "cc");
 }
 #else
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __raw_spin_is_locked(__raw_spinlock_t *lock)
 {
 	int tmp = *(volatile signed int *)(&(lock)->slock);
 
 	return (((tmp >> 16) & 0xffff) != (tmp & 0xffff));
 }
 
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __raw_spin_is_contended(__raw_spinlock_t *lock)
 {
 	int tmp = *(volatile signed int *)(&(lock)->slock);
 
 	return (((tmp >> 16) & 0xffff) - (tmp & 0xffff)) > 1;
 }
 
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
+static inline void __raw_spin_lock(__raw_spinlock_t *lock)
 {
 	int inc = 0x00010000;
 	int tmp;
@@ -168,7 +168,7 @@ static inline void __raw_spin_lock(raw_s
 
 #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static inline int __raw_spin_trylock(__raw_spinlock_t *lock)
 {
 	int tmp;
 	int new;
@@ -191,7 +191,7 @@ static inline int __raw_spin_trylock(raw
 	return tmp;
 }
 
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static inline void __raw_spin_unlock(__raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
 		UNLOCK_LOCK_PREFIX "incw %0"
@@ -201,7 +201,7 @@ static inline void __raw_spin_unlock(raw
 }
 #endif
 
-static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+static inline void __raw_spin_unlock_wait(__raw_spinlock_t *lock)
 {
 	while (__raw_spin_is_locked(lock))
 		cpu_relax();
@@ -225,7 +225,7 @@ static inline void __raw_spin_unlock_wai
  * read_can_lock - would read_trylock() succeed?
  * @lock: the rwlock in question.
  */
-static inline int __raw_read_can_lock(raw_rwlock_t *lock)
+static inline int __raw_read_can_lock(__raw_rwlock_t *lock)
 {
 	return (int)(lock)->lock > 0;
 }
@@ -234,12 +234,12 @@ static inline int __raw_read_can_lock(ra
  * write_can_lock - would write_trylock() succeed?
  * @lock: the rwlock in question.
  */
-static inline int __raw_write_can_lock(raw_rwlock_t *lock)
+static inline int __raw_write_can_lock(__raw_rwlock_t *lock)
 {
 	return (lock)->lock == RW_LOCK_BIAS;
 }
 
-static inline void __raw_read_lock(raw_rwlock_t *rw)
+static inline void __raw_read_lock(__raw_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
 		     "jns 1f\n"
@@ -248,7 +248,7 @@ static inline void __raw_read_lock(raw_r
 		     ::LOCK_PTR_REG (rw) : "memory");
 }
 
-static inline void __raw_write_lock(raw_rwlock_t *rw)
+static inline void __raw_write_lock(__raw_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
 		     "jz 1f\n"
@@ -257,7 +257,7 @@ static inline void __raw_write_lock(raw_
 		     ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
 }
 
-static inline int __raw_read_trylock(raw_rwlock_t *lock)
+static inline int __raw_read_trylock(__raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 
@@ -268,7 +268,7 @@ static inline int __raw_read_trylock(raw
 	return 0;
 }
 
-static inline int __raw_write_trylock(raw_rwlock_t *lock)
+static inline int __raw_write_trylock(__raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 
@@ -278,19 +278,19 @@ static inline int __raw_write_trylock(ra
 	return 0;
 }
 
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
+static inline void __raw_read_unlock(__raw_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
 }
 
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
+static inline void __raw_write_unlock(__raw_rwlock_t *rw)
 {
 	asm volatile(LOCK_PREFIX "addl %1, %0"
 		     : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
 }
 
-#define _raw_spin_relax(lock)	cpu_relax()
-#define _raw_read_relax(lock)	cpu_relax()
-#define _raw_write_relax(lock)	cpu_relax()
+#define __raw_spin_relax(lock)	cpu_relax()
+#define __raw_read_relax(lock)	cpu_relax()
+#define __raw_write_relax(lock)	cpu_relax()
 
 #endif
Index: linux-2.6.25.4-rt4/include/asm-x86/semaphore_64.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/semaphore_64.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/semaphore_64.h	2008-05-29 09:46:27.000000000 -0400
@@ -5,6 +5,10 @@
 
 #ifdef __KERNEL__
 
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
 /*
  * SMP- and interrupt-safe semaphores..
  *
@@ -43,28 +47,33 @@
 #include <linux/rwsem.h>
 #include <linux/stringify.h>
 
-struct semaphore {
+struct compat_semaphore {
 	atomic_t count;
 	int sleepers;
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.sleepers	= 0,						\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name,1)
+
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
+#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1)
 
-static inline void sema_init (struct semaphore *sem, int val)
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
+
+static inline void compat_sema_init (struct compat_semaphore *sem, int val)
 {
 /*
- *	*sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
+ *	*sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
  *
  * i'd rather use the more flexible initialization above, but sadly
  * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well.
@@ -74,32 +83,33 @@ static inline void sema_init (struct sem
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX (struct semaphore *sem)
+static inline void compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-asmlinkage void __down_failed(void /* special register calling convention */);
-asmlinkage int  __down_failed_interruptible(void  /* params in registers */);
-asmlinkage int  __down_failed_trylock(void  /* params in registers */);
-asmlinkage void __up_wakeup(void /* special register calling convention */);
+asmlinkage void __compat_down_failed(void /* special register calling convention */);
+asmlinkage int  __compat_down_failed_interruptible(void  /* params in registers */);
+asmlinkage int  __compat_down_failed_trylock(void  /* params in registers */);
+asmlinkage void __compat_up_wakeup(void /* special register calling convention */);
 
-asmlinkage void __down(struct semaphore * sem);
-asmlinkage int  __down_interruptible(struct semaphore * sem);
-asmlinkage int  __down_trylock(struct semaphore * sem);
-asmlinkage void __up(struct semaphore * sem);
+asmlinkage void __compat_down(struct compat_semaphore * sem);
+asmlinkage int  __compat_down_interruptible(struct compat_semaphore * sem);
+asmlinkage int  __compat_down_trylock(struct compat_semaphore * sem);
+asmlinkage void __compat_up(struct compat_semaphore * sem);
+asmlinkage int compat_sem_is_locked(struct compat_semaphore *sem);
 
 /*
  * This is ugly, but we want the default case to fall through.
  * "__down_failed" is a special asm handler that calls the C
  * routine that actually waits. See arch/x86_64/kernel/semaphore.c
  */
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 
@@ -107,7 +117,7 @@ static inline void down(struct semaphore
 		"# atomic down operation\n\t"
 		LOCK_PREFIX "decl %0\n\t"     /* --sem->count */
 		"jns 1f\n\t"
-		"call __down_failed\n"
+		"call __compat_down_failed\n"
 		"1:"
 		:"=m" (sem->count)
 		:"D" (sem)
@@ -118,7 +128,7 @@ static inline void down(struct semaphore
  * Interruptible try to acquire a semaphore.  If we obtained
  * it, return zero.  If we were interrupted, returns -EINTR
  */
-static inline int down_interruptible(struct semaphore * sem)
+static inline int compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int result;
 
@@ -129,7 +139,7 @@ static inline int down_interruptible(str
 		"xorl %0,%0\n\t"
 		LOCK_PREFIX "decl %1\n\t"     /* --sem->count */
 		"jns 2f\n\t"
-		"call __down_failed_interruptible\n"
+		"call __compat_down_failed_interruptible\n"
 		"2:\n"
 		:"=&a" (result), "=m" (sem->count)
 		:"D" (sem)
@@ -141,7 +151,7 @@ static inline int down_interruptible(str
  * Non-blockingly attempt to down() a semaphore.
  * Returns zero if we acquired it
  */
-static inline int down_trylock(struct semaphore * sem)
+static inline int compat_down_trylock(struct compat_semaphore * sem)
 {
 	int result;
 
@@ -150,7 +160,7 @@ static inline int down_trylock(struct se
 		"xorl %0,%0\n\t"
 		LOCK_PREFIX "decl %1\n\t"     /* --sem->count */
 		"jns 2f\n\t"
-		"call __down_failed_trylock\n\t"
+		"call __compat_down_failed_trylock\n\t"
 		"2:\n"
 		:"=&a" (result), "=m" (sem->count)
 		:"D" (sem)
@@ -164,17 +174,20 @@ static inline int down_trylock(struct se
  * The default case (no contention) will result in NO
  * jumps for both down() and up().
  */
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	__asm__ __volatile__(
 		"# atomic up operation\n\t"
 		LOCK_PREFIX "incl %0\n\t"     /* ++sem->count */
 		"jg 1f\n\t"
-		"call __up_wakeup\n"
+		"call __compat_up_wakeup\n"
 		"1:"
 		:"=m" (sem->count)
 		:"D" (sem)
 		:"memory");
 }
+
+#include <linux/semaphore.h>
+
 #endif /* __KERNEL__ */
 #endif
Index: linux-2.6.25.4-rt4/include/asm-x86/thread_info_64.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/thread_info_64.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/thread_info_64.h	2008-05-29 09:46:27.000000000 -0400
@@ -108,6 +108,7 @@ static inline struct thread_info *stack_
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
 #define TIF_IRET		5	/* force IRET */
+#define TIF_NEED_RESCHED_DELAYED 6	/* reschedul on return to userspace */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
@@ -131,6 +132,7 @@ static inline struct thread_info *stack_
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_NEED_RESCHED_DELAYED (1<<TIF_NEED_RESCHED_DELAYED)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
Index: linux-2.6.25.4-rt4/arch/mips/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/Kconfig	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/Kconfig	2008-05-29 09:47:17.000000000 -0400
@@ -44,6 +44,7 @@ config BCM47XX
 	select CEVT_R4K
 	select CSRC_R4K
 	select DMA_NONCOHERENT
+	select NO_SPINLOCK
 	select HW_HAS_PCI
 	select IRQ_CPU
 	select SYS_HAS_CPU_MIPS32_R1
@@ -708,6 +709,10 @@ config RWSEM_GENERIC_SPINLOCK
 config RWSEM_XCHGADD_ALGORITHM
 	bool
 
+config ASM_SEMAPHORES
+	bool
+	default y
+
 config ARCH_HAS_ILOG2_U32
 	bool
 	default n
@@ -804,6 +809,9 @@ config DMA_NONCOHERENT
 config DMA_NEED_PCI_MAP_STATE
 	bool
 
+config NO_SPINLOCK
+	bool
+
 config EARLY_PRINTK
 	bool "Early printk" if EMBEDDED && DEBUG_KERNEL
 	depends on SYS_HAS_EARLY_PRINTK
@@ -1811,8 +1819,6 @@ config NR_CPUS
 	  performance should round up your number of processors to the next
 	  power of two.
 
-source "kernel/time/Kconfig"
-
 #
 # Timer Interrupt Frequency Configuration
 #
@@ -1930,12 +1936,17 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
-endmenu
-
-config RWSEM_GENERIC_SPINLOCK
+config GENERIC_TIME
 	bool
 	default y
 
+source "kernel/time/Kconfig"
+
+config CPU_SPEED
+	int "CPU speed used for clocksource/clockevent calculations"
+	default 600
+endmenu
+
 config LOCKDEP_SUPPORT
 	bool
 	default y
Index: linux-2.6.25.4-rt4/arch/mips/kernel/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/Makefile	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/Makefile	2008-05-29 09:46:27.000000000 -0400
@@ -5,7 +5,7 @@
 extra-y		:= head.o init_task.o vmlinux.lds
 
 obj-y		+= cpu-probe.o branch.o entry.o genex.o irq.o process.o \
-		   ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \
+		   ptrace.o reset.o setup.o signal.o syscall.o \
 		   time.o topology.o traps.o unaligned.o
 
 obj-$(CONFIG_CEVT_BCM1480)	+= cevt-bcm1480.o
@@ -26,6 +26,8 @@ obj-$(CONFIG_MODULES)		+= mips_ksyms.o m
 obj-$(CONFIG_CPU_LOONGSON2)	+= r4k_fpu.o r4k_switch.o
 obj-$(CONFIG_CPU_MIPS32)	+= r4k_fpu.o r4k_switch.o
 obj-$(CONFIG_CPU_MIPS64)	+= r4k_fpu.o r4k_switch.o
+obj-$(CONFIG_ASM_SEMAPHORES)	+= semaphore.o
+
 obj-$(CONFIG_CPU_R3000)		+= r2300_fpu.o r2300_switch.o
 obj-$(CONFIG_CPU_R4000)		+= r4k_fpu.o r4k_switch.o
 obj-$(CONFIG_CPU_R4300)		+= r4k_fpu.o r4k_switch.o
Index: linux-2.6.25.4-rt4/include/asm-mips/atomic.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/atomic.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/atomic.h	2008-05-29 09:46:37.000000000 -0400
@@ -171,7 +171,9 @@ static __inline__ int atomic_add_return(
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		raw_local_irq_save(flags);
@@ -180,6 +182,7 @@ static __inline__ int atomic_add_return(
 		v->counter = result;
 		raw_local_irq_restore(flags);
 	}
+#endif
 
 	smp_llsc_mb();
 
@@ -223,7 +226,9 @@ static __inline__ int atomic_sub_return(
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		raw_local_irq_save(flags);
@@ -232,6 +237,7 @@ static __inline__ int atomic_sub_return(
 		v->counter = result;
 		raw_local_irq_restore(flags);
 	}
+#endif
 
 	smp_llsc_mb();
 
@@ -291,7 +297,9 @@ static __inline__ int atomic_sub_if_posi
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		raw_local_irq_save(flags);
@@ -301,6 +309,7 @@ static __inline__ int atomic_sub_if_posi
 			v->counter = result;
 		raw_local_irq_restore(flags);
 	}
+#endif
 
 	smp_llsc_mb();
 
@@ -552,7 +561,9 @@ static __inline__ long atomic64_add_retu
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		raw_local_irq_save(flags);
@@ -561,6 +572,7 @@ static __inline__ long atomic64_add_retu
 		v->counter = result;
 		raw_local_irq_restore(flags);
 	}
+#endif
 
 	smp_llsc_mb();
 
@@ -604,7 +616,9 @@ static __inline__ long atomic64_sub_retu
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		raw_local_irq_save(flags);
@@ -682,6 +696,7 @@ static __inline__ long atomic64_sub_if_p
 			v->counter = result;
 		raw_local_irq_restore(flags);
 	}
+#endif
 
 	smp_llsc_mb();
 
Index: linux-2.6.25.4-rt4/include/asm-mips/semaphore.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/semaphore.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/semaphore.h	2008-05-29 09:46:37.000000000 -0400
@@ -24,12 +24,20 @@
 
 #ifdef __KERNEL__
 
-#include <asm/atomic.h>
-#include <asm/system.h>
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 
-struct semaphore {
+/*
+ * On !PREEMPT_RT all semaphores are compat:
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+
+struct compat_semaphore {
 	/*
 	 * Note that any negative value of count is equivalent to 0,
 	 * but additionally indicates that some process(es) might be
@@ -39,38 +47,41 @@ struct semaphore {
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __DECLARE_SEMAPHORE_GENERIC(name, count) \
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name, count)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name, 1)
+
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define DECLARE_MUTEX(name)		__DECLARE_SEMAPHORE_GENERIC(name, 1)
+#define COMPAT_DECLARE_MUTEX(name)		__COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1)
 
-static inline void sema_init(struct semaphore *sem, int val)
+static inline void compat_sema_init (struct compat_semaphore *sem, int val)
 {
 	atomic_set(&sem->count, val);
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX(struct semaphore *sem)
+static inline void compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
-static inline void init_MUTEX_LOCKED(struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-extern void __down(struct semaphore * sem);
-extern int  __down_interruptible(struct semaphore * sem);
-extern void __up(struct semaphore * sem);
+extern void __compat_down(struct compat_semaphore * sem);
+extern int  __compat_down_interruptible(struct compat_semaphore * sem);
+extern void __compat_up(struct compat_semaphore * sem);
 
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 
@@ -78,31 +89,37 @@ static inline void down(struct semaphore
 	 * Try to get the semaphore, take the slow path if we fail.
 	 */
 	if (unlikely(atomic_dec_return(&sem->count) < 0))
-		__down(sem);
+		__compat_down(sem);
 }
 
-static inline int down_interruptible(struct semaphore * sem)
+static inline int compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int ret = 0;
 
 	might_sleep();
 
 	if (unlikely(atomic_dec_return(&sem->count) < 0))
-		ret = __down_interruptible(sem);
+		ret = __compat_down_interruptible(sem);
 	return ret;
 }
 
-static inline int down_trylock(struct semaphore * sem)
+static inline int compat_down_trylock(struct compat_semaphore * sem)
 {
 	return atomic_dec_if_positive(&sem->count) < 0;
 }
 
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	if (unlikely(atomic_inc_return(&sem->count) <= 0))
-		__up(sem);
+		__compat_up(sem);
 }
 
+extern int compat_sem_is_locked(struct compat_semaphore *sem);
+
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
+
+#include <linux/semaphore.h>
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASM_SEMAPHORE_H */
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/Makefile	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/Makefile	2008-05-29 09:46:28.000000000 -0400
@@ -12,11 +12,12 @@ CFLAGS_prom_init.o      += -fPIC
 CFLAGS_btext.o		+= -fPIC
 endif
 
-obj-y				:= semaphore.o cputable.o ptrace.o syscalls.o \
+obj-y				:= cputable.o ptrace.o syscalls.o \
 				   irq.o align.o signal_32.o pmc.o vdso.o \
 				   init_task.o process.o systbl.o idle.o \
 				   signal.o
 obj-y				+= vdso32/
+obj-$(CONFIG_ASM_SEMAPHORES)	+= semaphore.o
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
 				   paca.o cpu_setup_ppc970.o \
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/semaphore.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/semaphore.c	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/semaphore.c	2008-05-29 09:46:28.000000000 -0400
@@ -31,7 +31,7 @@
  *	sem->count = tmp;
  *	return old_count;
  */
-static inline int __sem_update_count(struct semaphore *sem, int incr)
+static inline int __sem_update_count(struct compat_semaphore *sem, int incr)
 {
 	int old_count, tmp;
 
@@ -50,7 +50,7 @@ static inline int __sem_update_count(str
 	return old_count;
 }
 
-void __up(struct semaphore *sem)
+void __compat_up(struct compat_semaphore *sem)
 {
 	/*
 	 * Note that we incremented count in up() before we came here,
@@ -63,7 +63,7 @@ void __up(struct semaphore *sem)
 	__sem_update_count(sem, 1);
 	wake_up(&sem->wait);
 }
-EXPORT_SYMBOL(__up);
+EXPORT_SYMBOL(__compat_up);
 
 /*
  * Note that when we come in to __down or __down_interruptible,
@@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up);
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __sched __down(struct semaphore *sem)
+void __sched __compat_down(struct compat_semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se
 	 */
 	wake_up(&sem->wait);
 }
-EXPORT_SYMBOL(__down);
+EXPORT_SYMBOL(__compat_down);
 
-int __sched __down_interruptible(struct semaphore * sem)
+int __sched __compat_down_interruptible(struct compat_semaphore *sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -132,4 +132,10 @@ int __sched __down_interruptible(struct 
 	wake_up(&sem->wait);
 	return retval;
 }
-EXPORT_SYMBOL(__down_interruptible);
+EXPORT_SYMBOL(__compat_down_interruptible);
+
+int compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
+EXPORT_SYMBOL(compat_sem_is_locked);
Index: linux-2.6.25.4-rt4/arch/powerpc/lib/locks.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/lib/locks.c	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/lib/locks.c	2008-05-29 09:47:07.000000000 -0400
@@ -25,7 +25,7 @@
 #include <asm/smp.h>
 #include <asm/firmware.h>
 
-void __spin_yield(raw_spinlock_t *lock)
+void __spin_yield(__raw_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
 
@@ -55,7 +55,7 @@ void __spin_yield(raw_spinlock_t *lock)
  * This turns out to be the same for read and write locks, since
  * we only know the holder if it is write-locked.
  */
-void __rw_yield(raw_rwlock_t *rw)
+void __rw_yield(__raw_rwlock_t *rw)
 {
 	int lock_value;
 	unsigned int holder_cpu, yield_count;
@@ -82,7 +82,7 @@ void __rw_yield(raw_rwlock_t *rw)
 }
 #endif
 
-void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+void __raw_spin_unlock_wait(__raw_spinlock_t *lock)
 {
 	while (lock->slock) {
 		HMT_low();
Index: linux-2.6.25.4-rt4/arch/ppc/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/Kconfig	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/Kconfig	2008-05-29 09:46:28.000000000 -0400
@@ -16,13 +16,6 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
-config RWSEM_GENERIC_SPINLOCK
-	bool
-
-config RWSEM_XCHGADD_ALGORITHM
-	bool
-	default y
-
 config ARCH_HAS_ILOG2_U32
 	bool
 	default y
@@ -926,6 +919,18 @@ config ARCH_POPULATES_NODE_MAP
 
 source kernel/Kconfig.hz
 source kernel/Kconfig.preempt
+
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	default y
+
+config ASM_SEMAPHORES
+	bool
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+
 source "mm/Kconfig"
 
 source "fs/Kconfig.binfmt"
Index: linux-2.6.25.4-rt4/arch/ppc/kernel/entry.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/kernel/entry.S	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/kernel/entry.S	2008-05-29 09:46:28.000000000 -0400
@@ -882,7 +882,7 @@ global_dbcr0:
 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
 
 do_work:			/* r10 contains MSR_KERNEL here */
-	andi.	r0,r9,_TIF_NEED_RESCHED
+	andi.	r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	beq	do_user_signal
 
 do_resched:			/* r10 contains MSR_KERNEL here */
@@ -896,7 +896,7 @@ recheck:
 	MTMSRD(r10)		/* disable interrupts */
 	rlwinm	r9,r1,0,0,18
 	lwz	r9,TI_FLAGS(r9)
-	andi.	r0,r9,_TIF_NEED_RESCHED
+	andi.	r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bne-	do_resched
 	andi.	r0,r9,_TIF_SIGPENDING
 	beq	restore_user
Index: linux-2.6.25.4-rt4/arch/ppc/kernel/semaphore.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/kernel/semaphore.c	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/kernel/semaphore.c	2008-05-29 09:46:28.000000000 -0400
@@ -29,7 +29,7 @@
  *	sem->count = tmp;
  *	return old_count;
  */
-static inline int __sem_update_count(struct semaphore *sem, int incr)
+static inline int __sem_update_count(struct compat_semaphore *sem, int incr)
 {
 	int old_count, tmp;
 
@@ -48,7 +48,7 @@ static inline int __sem_update_count(str
 	return old_count;
 }
 
-void __up(struct semaphore *sem)
+void __compat_up(struct compat_semaphore *sem)
 {
 	/*
 	 * Note that we incremented count in up() before we came here,
@@ -70,7 +70,7 @@ void __up(struct semaphore *sem)
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __sched __down(struct semaphore *sem)
+void __sched __compat_down(struct compat_semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se
 	wake_up(&sem->wait);
 }
 
-int __sched __down_interruptible(struct semaphore * sem)
+int __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -129,3 +129,8 @@ int __sched __down_interruptible(struct 
 	wake_up(&sem->wait);
 	return retval;
 }
+
+int compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
Index: linux-2.6.25.4-rt4/arch/ppc/lib/locks.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/lib/locks.c	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/lib/locks.c	2008-05-29 09:46:28.000000000 -0400
@@ -42,7 +42,7 @@ static inline unsigned long __spin_trylo
 	return ret;
 }
 
-void _raw_spin_lock(spinlock_t *lock)
+void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	int cpu = smp_processor_id();
 	unsigned int stuck = INIT_STUCK;
@@ -62,9 +62,9 @@ void _raw_spin_lock(spinlock_t *lock)
 	lock->owner_pc = (unsigned long)__builtin_return_address(0);
 	lock->owner_cpu = cpu;
 }
-EXPORT_SYMBOL(_raw_spin_lock);
+EXPORT_SYMBOL(__raw_spin_lock);
 
-int _raw_spin_trylock(spinlock_t *lock)
+int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	if (__spin_trylock(&lock->lock))
 		return 0;
@@ -72,9 +72,9 @@ int _raw_spin_trylock(spinlock_t *lock)
 	lock->owner_pc = (unsigned long)__builtin_return_address(0);
 	return 1;
 }
-EXPORT_SYMBOL(_raw_spin_trylock);
+EXPORT_SYMBOL(__raw_spin_trylock);
 
-void _raw_spin_unlock(spinlock_t *lp)
+void __raw_spin_unlock(raw_spinlock_t *lp)
 {
   	if ( !lp->lock )
 		printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n",
@@ -88,13 +88,13 @@ void _raw_spin_unlock(spinlock_t *lp)
 	wmb();
 	lp->lock = 0;
 }
-EXPORT_SYMBOL(_raw_spin_unlock);
+EXPORT_SYMBOL(__raw_spin_unlock);
 
 /*
  * For rwlocks, zero is unlocked, -1 is write-locked,
  * positive is read-locked.
  */
-static __inline__ int __read_trylock(rwlock_t *rw)
+static __inline__ int __read_trylock(raw_rwlock_t *rw)
 {
 	signed int tmp;
 
@@ -114,13 +114,13 @@ static __inline__ int __read_trylock(rwl
 	return tmp;
 }
 
-int _raw_read_trylock(rwlock_t *rw)
+int __raw_read_trylock(raw_rwlock_t *rw)
 {
 	return __read_trylock(rw) > 0;
 }
-EXPORT_SYMBOL(_raw_read_trylock);
+EXPORT_SYMBOL(__raw_read_trylock);
 
-void _raw_read_lock(rwlock_t *rw)
+void __raw_read_lock(rwlock_t *rw)
 {
 	unsigned int stuck;
 
@@ -135,9 +135,9 @@ void _raw_read_lock(rwlock_t *rw)
 		}
 	}
 }
-EXPORT_SYMBOL(_raw_read_lock);
+EXPORT_SYMBOL(__raw_read_lock);
 
-void _raw_read_unlock(rwlock_t *rw)
+void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	if ( rw->lock == 0 )
 		printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n",
@@ -146,9 +146,9 @@ void _raw_read_unlock(rwlock_t *rw)
 	wmb();
 	atomic_dec((atomic_t *) &(rw)->lock);
 }
-EXPORT_SYMBOL(_raw_read_unlock);
+EXPORT_SYMBOL(__raw_read_unlock);
 
-void _raw_write_lock(rwlock_t *rw)
+void __raw_write_lock(raw_rwlock_t *rw)
 {
 	unsigned int stuck;
 
@@ -164,18 +164,18 @@ void _raw_write_lock(rwlock_t *rw)
 	}
 	wmb();
 }
-EXPORT_SYMBOL(_raw_write_lock);
+EXPORT_SYMBOL(__raw_write_lock);
 
-int _raw_write_trylock(rwlock_t *rw)
+int __raw_write_trylock(raw_rwlock_t *rw)
 {
 	if (cmpxchg(&rw->lock, 0, -1) != 0)
 		return 0;
 	wmb();
 	return 1;
 }
-EXPORT_SYMBOL(_raw_write_trylock);
+EXPORT_SYMBOL(__raw_write_trylock);
 
-void _raw_write_unlock(rwlock_t *rw)
+void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	if (rw->lock >= 0)
 		printk("_write_lock(): %s/%d (nip %08lX) lock %d\n",
@@ -184,6 +184,6 @@ void _raw_write_unlock(rwlock_t *rw)
 	wmb();
 	rw->lock = 0;
 }
-EXPORT_SYMBOL(_raw_write_unlock);
+EXPORT_SYMBOL(__raw_write_unlock);
 
 #endif
Index: linux-2.6.25.4-rt4/drivers/macintosh/adb.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/macintosh/adb.c	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/macintosh/adb.c	2008-05-29 09:46:28.000000000 -0400
@@ -84,7 +84,7 @@ struct adb_driver *adb_controller;
 BLOCKING_NOTIFIER_HEAD(adb_client_list);
 static int adb_got_sleep;
 static int adb_inited;
-static DECLARE_MUTEX(adb_probe_mutex);
+static COMPAT_DECLARE_MUTEX(adb_probe_mutex);
 static int sleepy_trackpad;
 static int autopoll_devs;
 int __adb_probe_sync;
Index: linux-2.6.25.4-rt4/include/asm-powerpc/rwsem.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/rwsem.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/rwsem.h	2008-05-29 09:46:28.000000000 -0400
@@ -21,7 +21,7 @@
 /*
  * the semaphore definition
  */
-struct rw_semaphore {
+struct compat_rw_semaphore {
 	/* XXX this should be able to be an atomic_t  -- paulus */
 	signed int		count;
 #define RWSEM_UNLOCKED_VALUE		0x00000000
@@ -30,7 +30,7 @@ struct rw_semaphore {
 #define RWSEM_WAITING_BIAS		(-0x00010000)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
+	raw_spinlock_t		wait_lock;
 	struct list_head	wait_list;
 };
 
@@ -38,15 +38,15 @@ struct rw_semaphore {
 	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
 	  LIST_HEAD_INIT((name).wait_list) }
 
-#define DECLARE_RWSEM(name)		\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+#define COMPAT_DECLARE_RWSEM(name)		\
+	struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem);
 
-static inline void init_rwsem(struct rw_semaphore *sem)
+static inline void compat_init_rwsem(struct compat_rw_semaphore *sem)
 {
 	sem->count = RWSEM_UNLOCKED_VALUE;
 	spin_lock_init(&sem->wait_lock);
@@ -56,13 +56,13 @@ static inline void init_rwsem(struct rw_
 /*
  * lock for reading
  */
-static inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct compat_rw_semaphore *sem)
 {
 	if (unlikely(atomic_inc_return((atomic_t *)(&sem->count)) <= 0))
 		rwsem_down_read_failed(sem);
 }
 
-static inline int __down_read_trylock(struct rw_semaphore *sem)
+static inline int __down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -78,7 +78,7 @@ static inline int __down_read_trylock(st
 /*
  * lock for writing
  */
-static inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -88,7 +88,7 @@ static inline void __down_write(struct r
 		rwsem_down_write_failed(sem);
 }
 
-static inline int __down_write_trylock(struct rw_semaphore *sem)
+static inline int __down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -100,7 +100,7 @@ static inline int __down_write_trylock(s
 /*
  * unlock after reading
  */
-static inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -112,7 +112,7 @@ static inline void __up_read(struct rw_s
 /*
  * unlock after writing
  */
-static inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct compat_rw_semaphore *sem)
 {
 	if (unlikely(atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
 			      (atomic_t *)(&sem->count)) < 0))
@@ -122,7 +122,7 @@ static inline void __up_write(struct rw_
 /*
  * implement atomic add functionality
  */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem)
 {
 	atomic_add(delta, (atomic_t *)(&sem->count));
 }
@@ -130,7 +130,7 @@ static inline void rwsem_atomic_add(int 
 /*
  * downgrade write lock to read lock
  */
-static inline void __downgrade_write(struct rw_semaphore *sem)
+static inline void __downgrade_write(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -142,12 +142,12 @@ static inline void __downgrade_write(str
 /*
  * implement exchange and add functionality
  */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
+static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem)
 {
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
+static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem)
 {
 	return (sem->count != 0);
 }
Index: linux-2.6.25.4-rt4/include/asm-powerpc/semaphore.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/semaphore.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/semaphore.h	2008-05-29 09:46:28.000000000 -0400
@@ -15,48 +15,58 @@
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 
-struct semaphore {
+/*
+ * On !PREEMPT_RT all sempahores are compat
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
+struct compat_semaphore {
 	/*
 	 * Note that any negative value of count is equivalent to 0,
 	 * but additionally indicates that some process(es) might be
 	 * sleeping on `wait'.
 	 */
 	atomic_t count;
+	int sleepers;
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __DECLARE_SEMAPHORE_GENERIC(name, count) \
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define DECLARE_MUTEX(name)		__DECLARE_SEMAPHORE_GENERIC(name, 1)
+#define COMPAT_DECLARE_MUTEX(name)		__COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1)
 
-static inline void sema_init (struct semaphore *sem, int val)
+static inline void compat_sema_init (struct compat_semaphore *sem, int val)
 {
 	atomic_set(&sem->count, val);
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX (struct semaphore *sem)
+static inline void compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-extern void __down(struct semaphore * sem);
-extern int  __down_interruptible(struct semaphore * sem);
-extern void __up(struct semaphore * sem);
+extern void __compat_down(struct compat_semaphore * sem);
+extern int  __compat_down_interruptible(struct compat_semaphore * sem);
+extern void __compat_up(struct compat_semaphore * sem);
+
+extern int compat_sem_is_locked(struct compat_semaphore *sem);
 
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 
@@ -64,31 +74,35 @@ static inline void down(struct semaphore
 	 * Try to get the semaphore, take the slow path if we fail.
 	 */
 	if (unlikely(atomic_dec_return(&sem->count) < 0))
-		__down(sem);
+		__compat_down(sem);
 }
 
-static inline int down_interruptible(struct semaphore * sem)
+static inline int compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int ret = 0;
 
 	might_sleep();
 
 	if (unlikely(atomic_dec_return(&sem->count) < 0))
-		ret = __down_interruptible(sem);
+		ret = __compat_down_interruptible(sem);
 	return ret;
 }
 
-static inline int down_trylock(struct semaphore * sem)
+static inline int compat_down_trylock(struct compat_semaphore * sem)
 {
 	return atomic_dec_if_positive(&sem->count) < 0;
 }
 
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	if (unlikely(atomic_inc_return(&sem->count) <= 0))
-		__up(sem);
+		__compat_up(sem);
 }
 
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
+
+#include <linux/semaphore.h>
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_POWERPC_SEMAPHORE_H */
Index: linux-2.6.25.4-rt4/include/asm-powerpc/spinlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/spinlock.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/spinlock.h	2008-05-29 09:46:40.000000000 -0400
@@ -53,7 +53,7 @@
  * This returns the old value in the lock, so we succeeded
  * in getting the lock if the return value is 0.
  */
-static __inline__ unsigned long __spin_trylock(raw_spinlock_t *lock)
+static __inline__ unsigned long ___raw_spin_trylock(__raw_spinlock_t *lock)
 {
 	unsigned long tmp, token;
 
@@ -72,10 +72,10 @@ static __inline__ unsigned long __spin_t
 	return tmp;
 }
 
-static int __inline__ __raw_spin_trylock(raw_spinlock_t *lock)
+static int __inline__ __raw_spin_trylock(__raw_spinlock_t *lock)
 {
 	CLEAR_IO_SYNC;
-	return __spin_trylock(lock) == 0;
+	return ___raw_spin_trylock(lock) == 0;
 }
 
 /*
@@ -95,19 +95,19 @@ static int __inline__ __raw_spin_trylock
 #if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (get_lppaca()->shared_proc)
-extern void __spin_yield(raw_spinlock_t *lock);
-extern void __rw_yield(raw_rwlock_t *lock);
+extern void __spin_yield(__raw_spinlock_t *lock);
+extern void __rw_yield(__raw_rwlock_t *lock);
 #else /* SPLPAR || ISERIES */
 #define __spin_yield(x)	barrier()
 #define __rw_yield(x)	barrier()
 #define SHARED_PROCESSOR	0
 #endif
 
-static void __inline__ __raw_spin_lock(raw_spinlock_t *lock)
+static void __inline__ __raw_spin_lock(__raw_spinlock_t *lock)
 {
 	CLEAR_IO_SYNC;
 	while (1) {
-		if (likely(__spin_trylock(lock) == 0))
+		if (likely(___raw_spin_trylock(lock) == 0))
 			break;
 		do {
 			HMT_low();
@@ -118,13 +118,13 @@ static void __inline__ __raw_spin_lock(r
 	}
 }
 
-static void __inline__ __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+static void __inline__ __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags)
 {
 	unsigned long flags_dis;
 
 	CLEAR_IO_SYNC;
 	while (1) {
-		if (likely(__spin_trylock(lock) == 0))
+		if (likely(___raw_spin_trylock(lock) == 0))
 			break;
 		local_save_flags(flags_dis);
 		local_irq_restore(flags);
@@ -138,7 +138,7 @@ static void __inline__ __raw_spin_lock_f
 	}
 }
 
-static __inline__ void __raw_spin_unlock(raw_spinlock_t *lock)
+static __inline__ void __raw_spin_unlock(__raw_spinlock_t *lock)
 {
 	SYNC_IO;
 	__asm__ __volatile__("# __raw_spin_unlock\n\t"
@@ -147,7 +147,7 @@ static __inline__ void __raw_spin_unlock
 }
 
 #ifdef CONFIG_PPC64
-extern void __raw_spin_unlock_wait(raw_spinlock_t *lock);
+extern void __raw_spin_unlock_wait(__raw_spinlock_t *lock);
 #else
 #define __raw_spin_unlock_wait(lock) \
 	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
@@ -179,7 +179,7 @@ extern void __raw_spin_unlock_wait(raw_s
  * This returns the old value in the lock + 1,
  * so we got a read lock if the return value is > 0.
  */
-static long __inline__ __read_trylock(raw_rwlock_t *rw)
+static long __inline__ ___raw_read_trylock(__raw_rwlock_t *rw)
 {
 	long tmp;
 
@@ -203,7 +203,7 @@ static long __inline__ __read_trylock(ra
  * This returns the old value in the lock,
  * so we got the write lock if the return value is 0.
  */
-static __inline__ long __write_trylock(raw_rwlock_t *rw)
+static __inline__ long ___raw_write_trylock(__raw_rwlock_t *rw)
 {
 	long tmp, token;
 
@@ -223,10 +223,10 @@ static __inline__ long __write_trylock(r
 	return tmp;
 }
 
-static void __inline__ __raw_read_lock(raw_rwlock_t *rw)
+static void __inline__ __raw_read_lock(__raw_rwlock_t *rw)
 {
 	while (1) {
-		if (likely(__read_trylock(rw) > 0))
+		if (likely(___raw_read_trylock(rw) > 0))
 			break;
 		do {
 			HMT_low();
@@ -237,10 +237,10 @@ static void __inline__ __raw_read_lock(r
 	}
 }
 
-static void __inline__ __raw_write_lock(raw_rwlock_t *rw)
+static void __inline__ __raw_write_lock(__raw_rwlock_t *rw)
 {
 	while (1) {
-		if (likely(__write_trylock(rw) == 0))
+		if (likely(___raw_write_trylock(rw) == 0))
 			break;
 		do {
 			HMT_low();
@@ -251,17 +251,17 @@ static void __inline__ __raw_write_lock(
 	}
 }
 
-static int __inline__ __raw_read_trylock(raw_rwlock_t *rw)
+static int __inline__ __raw_read_trylock(__raw_rwlock_t *rw)
 {
-	return __read_trylock(rw) > 0;
+	return ___raw_read_trylock(rw) > 0;
 }
 
-static int __inline__ __raw_write_trylock(raw_rwlock_t *rw)
+static int __inline__ __raw_write_trylock(__raw_rwlock_t *rw)
 {
-	return __write_trylock(rw) == 0;
+	return ___raw_write_trylock(rw) == 0;
 }
 
-static void __inline__ __raw_read_unlock(raw_rwlock_t *rw)
+static void __inline__ __raw_read_unlock(__raw_rwlock_t *rw)
 {
 	long tmp;
 
@@ -278,7 +278,7 @@ static void __inline__ __raw_read_unlock
 	: "cr0", "memory");
 }
 
-static __inline__ void __raw_write_unlock(raw_rwlock_t *rw)
+static __inline__ void __raw_write_unlock(__raw_rwlock_t *rw)
 {
 	__asm__ __volatile__("# write_unlock\n\t"
 				LWSYNC_ON_SMP: : :"memory");
@@ -289,5 +289,9 @@ static __inline__ void __raw_write_unloc
 #define _raw_read_relax(lock)	__rw_yield(lock)
 #define _raw_write_relax(lock)	__rw_yield(lock)
 
+#define __raw_spin_relax(lock)  cpu_relax()
+#define __raw_read_relax(lock)  cpu_relax()
+#define __raw_write_relax(lock) cpu_relax()
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
Index: linux-2.6.25.4-rt4/include/asm-powerpc/spinlock_types.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/spinlock_types.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/spinlock_types.h	2008-05-29 09:46:28.000000000 -0400
@@ -7,13 +7,13 @@
 
 typedef struct {
 	volatile unsigned int slock;
-} raw_spinlock_t;
+} __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
 
 typedef struct {
 	volatile signed int lock;
-} raw_rwlock_t;
+} __raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED		{ 0 }
 
Index: linux-2.6.25.4-rt4/arch/arm/kernel/entry-armv.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/entry-armv.S	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/entry-armv.S	2008-05-29 09:46:28.000000000 -0400
@@ -210,7 +210,7 @@ __irq_svc:
 	irq_handler
 #ifdef CONFIG_PREEMPT
 	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
-	tst	r0, #_TIF_NEED_RESCHED
+	tst	r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	blne	svc_preempt
 preempt_return:
 	ldr	r0, [tsk, #TI_PREEMPT]		@ read preempt value
@@ -241,7 +241,7 @@ svc_preempt:
 	str	r7, [tsk, #TI_PREEMPT]		@ expects preempt_count == 0
 1:	bl	preempt_schedule_irq		@ irq en/disable is done inside
 	ldr	r0, [tsk, #TI_FLAGS]		@ get new tasks TI_FLAGS
-	tst	r0, #_TIF_NEED_RESCHED
+	tst	r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	beq	preempt_return			@ go again
 	b	1b
 #endif
Index: linux-2.6.25.4-rt4/arch/arm/kernel/semaphore.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/semaphore.c	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/semaphore.c	2008-05-29 09:47:33.000000000 -0400
@@ -49,14 +49,16 @@
  *    we cannot lose wakeup events.
  */
 
-void __up(struct semaphore *sem)
+ void __used __compat_up(struct compat_semaphore *sem)
 {
 	wake_up(&sem->wait);
 }
 
+EXPORT_SYMBOL(__compat_up);
+
 static DEFINE_SPINLOCK(semaphore_lock);
 
-void __sched __down(struct semaphore * sem)
+ void __used __sched __compat_down(struct compat_semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s
 	wake_up(&sem->wait);
 }
 
-int __sched __down_interruptible(struct semaphore * sem)
+EXPORT_SYMBOL(__compat_down);
+
+ int __used __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -140,6 +144,8 @@ int __sched __down_interruptible(struct 
 	return retval;
 }
 
+EXPORT_SYMBOL(__compat_down_interruptible);
+
 /*
  * Trylock failed - make sure we correct for
  * having decremented the count.
@@ -148,7 +154,7 @@ int __sched __down_interruptible(struct 
  * single "cmpxchg" without failure cases,
  * but then it wouldn't work on a 386.
  */
-int __down_trylock(struct semaphore * sem)
+ int __used __sched __compat_down_trylock(struct compat_semaphore * sem)
 {
 	int sleepers;
 	unsigned long flags;
@@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se
 	return 1;
 }
 
+EXPORT_SYMBOL(__compat_down_trylock);
+
+ int __sched compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
+
+EXPORT_SYMBOL(compat_sem_is_locked);
+
 /*
  * The semaphore operations have a special calling sequence that
  * allow us to do a simpler in-line version of them. These routines
@@ -185,7 +200,7 @@ asm("	.section .sched.text,\"ax\",%progb
 __down_failed:					\n\
 	stmfd	sp!, {r0 - r4, lr}		\n\
 	mov	r0, ip				\n\
-	bl	__down				\n\
+	bl	__compat_down			\n\
 	ldmfd	sp!, {r0 - r4, pc}		\n\
 						\n\
 	.align	5				\n\
@@ -193,7 +208,7 @@ __down_failed:					\n\
 __down_interruptible_failed:			\n\
 	stmfd	sp!, {r0 - r4, lr}		\n\
 	mov	r0, ip				\n\
-	bl	__down_interruptible		\n\
+	bl	__compat_down_interruptible	\n\
 	mov	ip, r0				\n\
 	ldmfd	sp!, {r0 - r4, pc}		\n\
 						\n\
@@ -202,7 +217,7 @@ __down_interruptible_failed:			\n\
 __down_trylock_failed:				\n\
 	stmfd	sp!, {r0 - r4, lr}		\n\
 	mov	r0, ip				\n\
-	bl	__down_trylock			\n\
+	bl	__compat_down_trylock		\n\
 	mov	ip, r0				\n\
 	ldmfd	sp!, {r0 - r4, pc}		\n\
 						\n\
@@ -211,7 +226,7 @@ __down_trylock_failed:				\n\
 __up_wakeup:					\n\
 	stmfd	sp!, {r0 - r4, lr}		\n\
 	mov	r0, ip				\n\
-	bl	__up				\n\
+	bl	__compat_up			\n\
 	ldmfd	sp!, {r0 - r4, pc}		\n\
 	");
 
Index: linux-2.6.25.4-rt4/include/asm-arm/semaphore.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/semaphore.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/semaphore.h	2008-05-29 09:46:28.000000000 -0400
@@ -5,45 +5,65 @@
 #define __ASM_ARM_SEMAPHORE_H
 
 #include <linux/linkage.h>
+
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#endif
+
 #include <linux/spinlock.h>
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 
+/*
+ * On !PREEMPT_RT all semaphores are compat:
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define semaphore compat_semaphore
+#endif
+
 #include <asm/atomic.h>
 #include <asm/locks.h>
 
-struct semaphore {
+struct compat_semaphore {
 	atomic_t count;
 	int sleepers;
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INIT(name, cnt)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, cnt)				\
 {								\
 	.count	= ATOMIC_INIT(cnt),				\
 	.wait	= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait),	\
 }
 
-#define __DECLARE_SEMAPHORE_GENERIC(name,count)	\
-	struct semaphore name = __SEMAPHORE_INIT(name,count)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name,1)
+
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define DECLARE_MUTEX(name)		__DECLARE_SEMAPHORE_GENERIC(name,1)
+#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1)
 
-static inline void sema_init(struct semaphore *sem, int val)
+static inline void compat_sema_init(struct compat_semaphore *sem, int val)
 {
 	atomic_set(&sem->count, val);
 	sem->sleepers = 0;
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX(struct semaphore *sem)
+static inline void compat_init_MUTEX(struct compat_semaphore *sem)
+{
+	compat_sema_init(sem, 1);
+}
+
+static inline void compat_init_MUTEX_LOCKED(struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 0);
 }
 
-static inline void init_MUTEX_LOCKED(struct semaphore *sem)
+static inline int compat_sema_count(struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	return atomic_read(&sem->count);
 }
 
 /*
@@ -54,16 +74,18 @@ asmlinkage int  __down_interruptible_fai
 asmlinkage int  __down_trylock_failed(void);
 asmlinkage void __up_wakeup(void);
 
-extern void __down(struct semaphore * sem);
-extern int  __down_interruptible(struct semaphore * sem);
-extern int  __down_trylock(struct semaphore * sem);
-extern void __up(struct semaphore * sem);
+extern void __compat_up(struct compat_semaphore *sem);
+extern int __compat_down_interruptible(struct compat_semaphore * sem);
+extern int __compat_down_trylock(struct compat_semaphore * sem);
+extern void __compat_down(struct compat_semaphore * sem);
+
+extern int compat_sem_is_locked(struct compat_semaphore *sem);
 
 /*
  * This is ugly, but we want the default case to fall through.
  * "__down" is the actual routine that waits...
  */
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 	__down_op(sem, __down_failed);
@@ -73,13 +95,13 @@ static inline void down(struct semaphore
  * This is ugly, but we want the default case to fall through.
  * "__down_interruptible" is the actual routine that waits...
  */
-static inline int down_interruptible (struct semaphore * sem)
+static inline int compat_down_interruptible (struct compat_semaphore * sem)
 {
 	might_sleep();
 	return __down_op_ret(sem, __down_interruptible_failed);
 }
 
-static inline int down_trylock(struct semaphore *sem)
+static inline int compat_down_trylock(struct compat_semaphore *sem)
 {
 	return __down_op_ret(sem, __down_trylock_failed);
 }
@@ -90,9 +112,10 @@ static inline int down_trylock(struct se
  * The default case (no contention) will result in NO
  * jumps for both down() and up().
  */
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	__up_op(sem, __up_wakeup);
 }
 
+#include <linux/semaphore.h>
 #endif
Index: linux-2.6.25.4-rt4/include/asm-arm/thread_info.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/thread_info.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/thread_info.h	2008-05-29 09:46:28.000000000 -0400
@@ -141,6 +141,7 @@ extern void iwmmxt_task_switch(struct th
  */
 #define TIF_SIGPENDING		0
 #define TIF_NEED_RESCHED	1
+#define TIF_NEED_RESCHED_DELAYED 3
 #define TIF_SYSCALL_TRACE	8
 #define TIF_POLLING_NRFLAG	16
 #define TIF_USING_IWMMXT	17
@@ -149,6 +150,7 @@ extern void iwmmxt_task_switch(struct th
 
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+#define _TIF_NEED_RESCHED_DELAYED (1<<TIF_NEED_RESCHED_DELAYED)
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_USING_IWMMXT	(1 << TIF_USING_IWMMXT)
Index: linux-2.6.25.4-rt4/drivers/acpi/osl.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/acpi/osl.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/acpi/osl.c	2008-05-29 09:46:29.000000000 -0400
@@ -762,13 +762,13 @@ void acpi_os_delete_lock(acpi_spinlock h
 acpi_status
 acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle)
 {
-	struct semaphore *sem = NULL;
+	struct compat_semaphore *sem = NULL;
 
 
-	sem = acpi_os_allocate(sizeof(struct semaphore));
+	sem = acpi_os_allocate(sizeof(struct compat_semaphore));
 	if (!sem)
 		return AE_NO_MEMORY;
-	memset(sem, 0, sizeof(struct semaphore));
+	memset(sem, 0, sizeof(struct compat_semaphore));
 
 	sema_init(sem, initial_units);
 
@@ -789,7 +789,7 @@ acpi_os_create_semaphore(u32 max_units, 
 
 acpi_status acpi_os_delete_semaphore(acpi_handle handle)
 {
-	struct semaphore *sem = (struct semaphore *)handle;
+	struct compat_semaphore *sem = (struct compat_semaphore *)handle;
 
 
 	if (!sem)
@@ -815,7 +815,7 @@ acpi_status acpi_os_delete_semaphore(acp
 acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout)
 {
 	acpi_status status = AE_OK;
-	struct semaphore *sem = (struct semaphore *)handle;
+	struct compat_semaphore *sem = (struct compat_semaphore *)handle;
 	int ret = 0;
 
 
@@ -900,7 +900,7 @@ acpi_status acpi_os_wait_semaphore(acpi_
  */
 acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units)
 {
-	struct semaphore *sem = (struct semaphore *)handle;
+	struct compat_semaphore *sem = (struct compat_semaphore *)handle;
 
 
 	if (!sem || (units < 1))
Index: linux-2.6.25.4-rt4/drivers/media/dvb/dvb-core/dvb_frontend.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/media/dvb/dvb-core/dvb_frontend.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/media/dvb/dvb-core/dvb_frontend.c	2008-05-29 09:46:29.000000000 -0400
@@ -97,7 +97,7 @@ struct dvb_frontend_private {
 	struct dvb_device *dvbdev;
 	struct dvb_frontend_parameters parameters;
 	struct dvb_fe_events events;
-	struct semaphore sem;
+	struct compat_semaphore sem;
 	struct list_head list_head;
 	wait_queue_head_t wait_queue;
 	struct task_struct *thread;
Index: linux-2.6.25.4-rt4/drivers/net/3c527.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/3c527.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/3c527.c	2008-05-29 09:46:29.000000000 -0400
@@ -182,7 +182,7 @@ struct mc32_local
 
 	u16 rx_ring_tail;       /* index to rx de-queue end */
 
-	struct semaphore cmd_mutex;    /* Serialises issuing of execute commands */
+	struct compat_semaphore cmd_mutex;    /* Serialises issuing of execute commands */
         struct completion execution_cmd; /* Card has completed an execute command */
 	struct completion xceiver_cmd;   /* Card has completed a tx or rx command */
 };
Index: linux-2.6.25.4-rt4/drivers/net/hamradio/6pack.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/hamradio/6pack.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/hamradio/6pack.c	2008-05-29 09:46:29.000000000 -0400
@@ -123,7 +123,7 @@ struct sixpack {
 	struct timer_list	tx_t;
 	struct timer_list	resync_t;
 	atomic_t		refcnt;
-	struct semaphore	dead_sem;
+	struct compat_semaphore	dead_sem;
 	spinlock_t		lock;
 };
 
Index: linux-2.6.25.4-rt4/drivers/net/hamradio/mkiss.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/hamradio/mkiss.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/hamradio/mkiss.c	2008-05-29 09:46:29.000000000 -0400
@@ -84,7 +84,7 @@ struct mkiss {
 #define CRC_MODE_SMACK_TEST	4
 
 	atomic_t		refcnt;
-	struct semaphore	dead_sem;
+	struct compat_semaphore	dead_sem;
 };
 
 /*---------------------------------------------------------------------------*/
Index: linux-2.6.25.4-rt4/drivers/net/ppp_async.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/ppp_async.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/ppp_async.c	2008-05-29 09:46:29.000000000 -0400
@@ -67,7 +67,7 @@ struct asyncppp {
 	struct tasklet_struct tsk;
 
 	atomic_t	refcnt;
-	struct semaphore dead_sem;
+	struct compat_semaphore dead_sem;
 	struct ppp_channel chan;	/* interface to generic ppp layer */
 	unsigned char	obuf[OBUFSIZE];
 };
Index: linux-2.6.25.4-rt4/drivers/pci/hotplug/ibmphp_hpc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/pci/hotplug/ibmphp_hpc.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/pci/hotplug/ibmphp_hpc.c	2008-05-29 09:46:29.000000000 -0400
@@ -104,7 +104,7 @@ static int to_debug = 0;
 static struct mutex sem_hpcaccess;	// lock access to HPC
 static struct semaphore semOperations;	// lock all operations and
 					// access to data structures
-static struct semaphore sem_exit;	// make sure polling thread goes away
+static struct compat_semaphore sem_exit;	// make sure polling thread goes away
 static struct task_struct *ibmphp_poll_thread;
 //----------------------------------------------------------------------------
 // local function prototypes
Index: linux-2.6.25.4-rt4/drivers/scsi/aacraid/aacraid.h
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/scsi/aacraid/aacraid.h	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/scsi/aacraid/aacraid.h	2008-05-29 09:47:20.000000000 -0400
@@ -717,7 +717,7 @@ struct aac_fib_context {
 	u32			unique;		// unique value representing this context
 	ulong			jiffies;	// used for cleanup - dmb changed to ulong
 	struct list_head	next;		// used to link context's into a linked list
-	struct semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
+	struct compat_semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
 	int			wait;		// Set to true when thread is in WaitForSingleObject
 	unsigned long		count;		// total number of FIBs on FibList
 	struct list_head	fib_list;	// this holds fibs and their attachd hw_fibs
@@ -787,7 +787,7 @@ struct fib {
 	 *	This is the event the sendfib routine will wait on if the
 	 *	caller did not pass one and this is synch io.
 	 */
-	struct semaphore	event_wait;
+	struct compat_semaphore	event_wait;
 	spinlock_t		event_lock;
 
 	u32			done;	/* gets set to 1 when fib is complete */
Index: linux-2.6.25.4-rt4/drivers/usb/storage/usb.h
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/usb/storage/usb.h	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/usb/storage/usb.h	2008-05-29 09:46:29.000000000 -0400
@@ -147,7 +147,7 @@ struct us_data {
 	struct task_struct	*ctl_thread;	 /* the control thread   */
 
 	/* mutual exclusion and synchronization structures */
-	struct semaphore	sema;		 /* to sleep thread on	    */
+	struct compat_semaphore	sema;		 /* to sleep thread on	    */
 	struct completion	notify;		 /* thread begin/end	    */
 	wait_queue_head_t	delay_wait;	 /* wait during scan, reset */
 	struct completion	scanning_done;	 /* wait for scan thread    */
Index: linux-2.6.25.4-rt4/fs/jffs2/jffs2_fs_i.h
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/jffs2/jffs2_fs_i.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/jffs2/jffs2_fs_i.h	2008-05-29 09:46:29.000000000 -0400
@@ -24,7 +24,7 @@ struct jffs2_inode_info {
 	   before letting GC proceed. Or we'd have to put ugliness
 	   into the GC code so it didn't attempt to obtain the i_mutex
 	   for the inode(s) which are already locked */
-	struct semaphore sem;
+	struct compat_semaphore sem;
 
 	/* The highest (datanode) version number used for this ino */
 	uint32_t highest_version;
Index: linux-2.6.25.4-rt4/fs/xfs/linux-2.6/sema.h
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/xfs/linux-2.6/sema.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/xfs/linux-2.6/sema.h	2008-05-29 09:46:29.000000000 -0400
@@ -27,7 +27,7 @@
  * sema_t structure just maps to struct semaphore in Linux kernel.
  */
 
-typedef struct semaphore sema_t;
+typedef struct compat_semaphore sema_t;
 
 #define initnsema(sp, val, name)	sema_init(sp, val)
 #define psema(sp, b)			down(sp)
@@ -36,7 +36,12 @@ typedef struct semaphore sema_t;
 
 static inline int issemalocked(sema_t *sp)
 {
-	return down_trylock(sp) || (up(sp), 0);
+	int rv;
+
+	if ((rv = down_trylock(sp)))
+		return (rv);
+	up(sp);
+	return (0);
 }
 
 /*
Index: linux-2.6.25.4-rt4/fs/xfs/linux-2.6/xfs_buf.h
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/xfs/linux-2.6/xfs_buf.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/xfs/linux-2.6/xfs_buf.h	2008-05-29 09:46:29.000000000 -0400
@@ -118,7 +118,7 @@ typedef int (*xfs_buf_bdstrat_t)(struct 
 #define XB_PAGES	2
 
 typedef struct xfs_buf {
-	struct semaphore	b_sema;		/* semaphore for lockables */
+	struct compat_semaphore	b_sema;		/* semaphore for lockables */
 	unsigned long		b_queuetime;	/* time buffer was queued */
 	atomic_t		b_pin_count;	/* pin count */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
@@ -138,7 +138,7 @@ typedef struct xfs_buf {
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
 	xfs_buf_relse_t		b_relse;	/* releasing function */
 	xfs_buf_bdstrat_t	b_strat;	/* pre-write function */
-	struct semaphore	b_iodonesema;	/* Semaphore for I/O waiters */
+	struct compat_semaphore	b_iodonesema;	/* Semaphore for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
 	void			*b_fspriv3;
Index: linux-2.6.25.4-rt4/include/linux/parport.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/parport.h	2008-05-29 09:45:50.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/parport.h	2008-05-29 09:46:29.000000000 -0400
@@ -266,7 +266,7 @@ enum ieee1284_phase {
 struct ieee1284_info {
 	int mode;
 	volatile enum ieee1284_phase phase;
-	struct semaphore irq;
+	struct compat_semaphore irq;
 };
 
 /* A parallel port */
Index: linux-2.6.25.4-rt4/arch/ppc/mm/init.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/mm/init.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/mm/init.c	2008-05-29 09:46:29.000000000 -0400
@@ -55,7 +55,7 @@
 #endif
 #define MAX_LOW_MEM	CONFIG_LOWMEM_SIZE
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 unsigned long total_memory;
 unsigned long total_lowmem;
Index: linux-2.6.25.4-rt4/include/asm-generic/percpu.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-generic/percpu.h	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-generic/percpu.h	2008-05-29 09:46:29.000000000 -0400
@@ -9,6 +9,9 @@
  */
 #define per_cpu_var(var) per_cpu__##var
 
+#define __per_cpu_var_lock(var)	per_cpu_lock__##var##_locked
+#define __per_cpu_var_lock_var(var) per_cpu__##var##_locked
+
 #ifdef CONFIG_SMP
 
 /*
@@ -60,6 +63,14 @@ extern unsigned long __per_cpu_offset[NR
 #define __raw_get_cpu_var(var) \
 	(*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset))
 
+#define per_cpu_lock(var, cpu) \
+	(*SHIFT_PERCPU_PTR(&__per_cpu_var_lock(var), per_cpu_offset(cpu)))
+#define per_cpu_var_locked(var, cpu) \
+	(*SHIFT_PERCPU_PTR(&__per_cpu_var_lock_var(var), per_cpu_offset(cpu)))
+#define __get_cpu_lock(var, cpu) \
+		per_cpu_lock(var, cpu)
+#define __get_cpu_var_locked(var, cpu) \
+		per_cpu_var_locked(var, cpu)
 
 #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
 extern void setup_per_cpu_areas(void);
@@ -68,9 +79,11 @@ extern void setup_per_cpu_areas(void);
 #else /* ! SMP */
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu_var(var)))
+#define per_cpu_var_locked(var, cpu)		(*((void)(cpu), &__per_cpu_var_lock_var(var)))
 #define __get_cpu_var(var)			per_cpu_var(var)
 #define __raw_get_cpu_var(var)			per_cpu_var(var)
-
+#define __get_cpu_lock(var, cpu)		__per_cpu_var_lock(var)
+#define __get_cpu_var_locked(var, cpu)		__per_cpu_var_lock_var(var)
 #endif	/* SMP */
 
 #ifndef PER_CPU_ATTRIBUTES
@@ -79,5 +92,8 @@ extern void setup_per_cpu_areas(void);
 
 #define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
 					__typeof__(type) per_cpu_var(name)
+#define DECLARE_PER_CPU_LOCKED(type, name)					\
+	extern PER_CPU_ATTRIBUTES spinlock_t __per_cpu_var_lock(name);		\
+	extern PER_CPU_ATTRIBUTES __typeof__(type) __per_cpu_var_lock_var(name)
 
 #endif /* _ASM_GENERIC_PERCPU_H_ */
Index: linux-2.6.25.4-rt4/include/asm-generic/tlb.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-generic/tlb.h	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-generic/tlb.h	2008-05-29 09:46:29.000000000 -0400
@@ -41,11 +41,12 @@ struct mmu_gather {
 	unsigned int		nr;	/* set to ~0U means fast mode */
 	unsigned int		need_flush;/* Really unmapped some ptes? */
 	unsigned int		fullmm; /* non-zero means full mm flush */
+	int			cpu;
 	struct page *		pages[FREE_PTE_NR];
 };
 
 /* Users of the generic TLB shootdown code must declare this storage space. */
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 /* tlb_gather_mmu
  *	Return a pointer to an initialized struct mmu_gather.
@@ -53,8 +54,10 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_g
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+	int cpu;
+	struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu);
 
+	tlb->cpu = cpu;
 	tlb->mm = mm;
 
 	/* Use fast mode if only one CPU is online */
@@ -90,7 +93,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
 
-	put_cpu_var(mmu_gathers);
+	put_cpu_var_locked(mmu_gathers, tlb->cpu);
 }
 
 /* tlb_remove_page
Index: linux-2.6.25.4-rt4/include/linux/percpu.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/percpu.h	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/percpu.h	2008-05-29 09:46:29.000000000 -0400
@@ -18,16 +18,33 @@
 	__attribute__((__section__(".data.percpu.shared_aligned")))	\
 	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name		\
 	____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_LOCKED(type, name)					\
+	__attribute__((__section__(".data.percpu")))				\
+	PER_CPU_ATTRIBUTES __DEFINE_SPINLOCK(per_cpu_lock__##name##_locked);	\
+	__attribute__((__section__(".data.percpu")))				\
+	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name##_locked
+
 #else
 #define DEFINE_PER_CPU(type, name)					\
 	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
 
 #define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		      \
 	DEFINE_PER_CPU(type, name)
+
+#define DEFINE_PER_CPU_LOCKED(type, name)					\
+	PER_CPU_ATTRIBUTES __DEFINE_SPINLOCK(per_cpu_lock__##name##_locked);	\
+	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name##_locked
 #endif
 
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+#define EXPORT_PER_CPU_LOCKED_SYMBOL(var)		\
+	EXPORT_SYMBOL(per_cpu_lock__##var##_locked);	\
+	EXPORT_SYMBOL(per_cpu__##var##_locked)
+#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var)			\
+	EXPORT_SYMBOL_GPL(per_cpu_lock__##var##_locked);	\
+	EXPORT_SYMBOL_GPL(per_cpu__##var##_locked)
 
 /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
 #ifndef PERCPU_ENOUGH_ROOM
@@ -51,6 +68,29 @@
 	&__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
 
+/*
+ * Per-CPU data structures with an additional lock - useful for
+ * PREEMPT_RT code that wants to reschedule but also wants
+ * per-CPU data structures.
+ *
+ * 'cpu' gets updated with the CPU the task is currently executing on.
+ *
+ * NOTE: on normal !PREEMPT_RT kernels these per-CPU variables
+ * are the same as the normal per-CPU variables, so there no
+ * runtime overhead.
+ */
+#define get_cpu_var_locked(var, cpuptr)			\
+(*({							\
+	int __cpu = raw_smp_processor_id();		\
+							\
+	*(cpuptr) = __cpu;				\
+	spin_lock(&__get_cpu_lock(var, __cpu));		\
+	&__get_cpu_var_locked(var, __cpu);		\
+}))
+
+#define put_cpu_var_locked(var, cpu) \
+	 do { (void)cpu; spin_unlock(&__get_cpu_lock(var, cpu)); } while (0)
+
 #ifdef CONFIG_SMP
 
 struct percpu_data {
Index: linux-2.6.25.4-rt4/mm/swap.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/swap.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/swap.c	2008-05-29 09:47:14.000000000 -0400
@@ -34,10 +34,64 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
+/*
+ * On PREEMPT_RT we don't want to disable preemption for cpu variables.
+ * We grab a cpu and then use that cpu to lock the variables accordingly.
+ */
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_pvecs) = { 0, };
+static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_active_pvecs) = { 0, };
+static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs) = { 0, };
+
+#define swap_get_cpu_var_irq_save(var, flags, cpu)	\
+	({						\
+		(void)flags;				\
+		&get_cpu_var_locked(var, &cpu);		\
+	})
+#define swap_put_cpu_var_irq_restore(var, flags, cpu)	\
+	put_cpu_var_locked(var, cpu)
+#define swap_get_cpu_var(var, cpu) \
+	&get_cpu_var_locked(var, &cpu)
+#define swap_put_cpu_var(var, cpu)		\
+	put_cpu_var_locked(var, cpu)
+#define swap_per_cpu_lock(var, cpu)				\
+	({							\
+		spin_lock(&__get_cpu_lock(var, cpu));		\
+		&__get_cpu_var_locked(var, cpu);	\
+	})
+#define swap_per_cpu_unlock(var, cpu)			\
+		spin_unlock(&__get_cpu_lock(var, cpu));
+#define swap_get_cpu() raw_smp_processor_id();
+#define swap_put_cpu()
+#else
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
 static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
 
+#define swap_get_cpu_var_irq_save(var, flags, cpu)	\
+	({						\
+		(void)cpu;				\
+		local_irq_save(flags);			\
+		&__get_cpu_var(var);			\
+	})
+#define swap_put_cpu_var_irq_restore(var, flags, cpu)	\
+	local_irq_restore(flags)
+#define swap_get_cpu_var(var, cpu) \
+	({ (void)cpu; &get_cpu_var(var); })
+#define swap_put_cpu_var(var, cpu)		\
+	do {					\
+		(void)cpu;			\
+		put_cpu_var(var);		\
+	} while(0)
+#define swap_per_cpu_lock(var, cpu)		\
+	&per_cpu(lru_add_pvecs, cpu)
+#define swap_per_cpu_unlock(var, cpu)		\
+	do { } while(0)
+#define swap_get_cpu()  get_cpu()
+#define swap_put_cpu() put_cpu();
+
+#endif /* CONFIG_PREEMPT_RT */
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
@@ -139,6 +193,7 @@ int rotate_reclaimable_page(struct page 
 {
 	struct pagevec *pvec;
 	unsigned long flags;
+	int cpu;
 
 	if (PageLocked(page))
 		return 1;
@@ -150,11 +205,10 @@ int rotate_reclaimable_page(struct page 
 		return 1;
 
 	page_cache_get(page);
-	local_irq_save(flags);
-	pvec = &__get_cpu_var(lru_rotate_pvecs);
+	pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu);
 	if (!pagevec_add(pvec, page))
 		pagevec_move_tail(pvec);
-	local_irq_restore(flags);
+	swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu);
 
 	if (!test_clear_page_writeback(page))
 		BUG();
@@ -205,22 +259,24 @@ EXPORT_SYMBOL(mark_page_accessed);
  */
 void lru_cache_add(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	int cpu;
+	struct pagevec *pvec = swap_get_cpu_var(lru_add_pvecs, cpu);
 
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
 		__pagevec_lru_add(pvec);
-	put_cpu_var(lru_add_pvecs);
+	swap_put_cpu_var(lru_add_pvecs, cpu);
 }
 
 void lru_cache_add_active(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
+	int cpu;
+	struct pagevec *pvec = swap_get_cpu_var(lru_add_active_pvecs, cpu);
 
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
 		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_active_pvecs);
+	swap_put_cpu_var(lru_add_active_pvecs, cpu);
 }
 
 /*
@@ -232,33 +288,38 @@ static void drain_cpu_pagevecs(int cpu)
 {
 	struct pagevec *pvec;
 
-	pvec = &per_cpu(lru_add_pvecs, cpu);
+	pvec = swap_per_cpu_lock(lru_add_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
+	swap_per_cpu_unlock(lru_add_pvecs, cpu);
 
-	pvec = &per_cpu(lru_add_active_pvecs, cpu);
+	pvec = swap_per_cpu_lock(lru_add_active_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_active(pvec);
+	swap_per_cpu_unlock(lru_add_active_pvecs, cpu);
 
-	pvec = &per_cpu(lru_rotate_pvecs, cpu);
+	pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu);
 	if (pagevec_count(pvec)) {
 		unsigned long flags;
 
 		/* No harm done if a racing interrupt already did this */
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		pagevec_move_tail(pvec);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
+	swap_per_cpu_unlock(lru_rotate_pvecs, cpu);
 }
 
 void lru_add_drain(void)
 {
-	drain_cpu_pagevecs(get_cpu());
-	put_cpu();
+	int cpu;
+	cpu = swap_get_cpu();
+	drain_cpu_pagevecs(cpu);
+	swap_put_cpu();
 }
 
 #ifdef CONFIG_NUMA
-static void lru_add_drain_per_cpu(struct work_struct *dummy)
+static void lru_add_drain_per_cpu(void *info)
 {
 	lru_add_drain();
 }
@@ -268,7 +329,7 @@ static void lru_add_drain_per_cpu(struct
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL, 0, 1);
 }
 
 #else
Index: linux-2.6.25.4-rt4/net/ipv4/netfilter/arp_tables.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/ipv4/netfilter/arp_tables.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/net/ipv4/netfilter/arp_tables.c	2008-05-29 09:46:29.000000000 -0400
@@ -244,7 +244,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 	read_lock_bh(&table->lock);
 	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+	table_base = (void *)private->entries[raw_smp_processor_id()];
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
 
@@ -1151,7 +1151,7 @@ static int do_add_counters(struct net *n
 
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
Index: linux-2.6.25.4-rt4/net/ipv4/netfilter/ip_tables.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/ipv4/netfilter/ip_tables.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/net/ipv4/netfilter/ip_tables.c	2008-05-29 09:46:29.000000000 -0400
@@ -353,7 +353,7 @@ ipt_do_table(struct sk_buff *skb,
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+	table_base = (void *)private->entries[raw_smp_processor_id()];
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
Index: linux-2.6.25.4-rt4/include/net/netfilter/nf_conntrack_ecache.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/net/netfilter/nf_conntrack_ecache.h	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/include/net/netfilter/nf_conntrack_ecache.h	2008-05-29 09:46:30.000000000 -0400
@@ -15,16 +15,15 @@ struct nf_conntrack_ecache {
 	struct nf_conn *ct;
 	unsigned int events;
 };
-DECLARE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
-
-#define CONNTRACK_ECACHE(x)	(__get_cpu_var(nf_conntrack_ecache).x)
+DECLARE_PER_CPU_LOCKED(struct nf_conntrack_ecache, nf_conntrack_ecache);
 
 extern struct atomic_notifier_head nf_conntrack_chain;
 extern int nf_conntrack_register_notifier(struct notifier_block *nb);
 extern int nf_conntrack_unregister_notifier(struct notifier_block *nb);
 
 extern void nf_ct_deliver_cached_events(const struct nf_conn *ct);
-extern void __nf_ct_event_cache_init(struct nf_conn *ct);
+extern void __nf_ct_event_cache_init(struct nf_conntrack_ecache *ecache,
+				     struct nf_conn *ct);
 extern void nf_ct_event_cache_flush(void);
 
 static inline void
@@ -33,12 +32,14 @@ nf_conntrack_event_cache(enum ip_conntra
 {
 	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
 	struct nf_conntrack_ecache *ecache;
+	int cpu;
 
 	local_bh_disable();
-	ecache = &__get_cpu_var(nf_conntrack_ecache);
+	ecache = &get_cpu_var_locked(nf_conntrack_ecache, &cpu);
 	if (ct != ecache->ct)
-		__nf_ct_event_cache_init(ct);
+		__nf_ct_event_cache_init(ecache, ct);
 	ecache->events |= event;
+	put_cpu_var_locked(nf_conntrack_ecache, cpu);
 	local_bh_enable();
 }
 
Index: linux-2.6.25.4-rt4/net/netfilter/nf_conntrack_ecache.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/netfilter/nf_conntrack_ecache.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/net/netfilter/nf_conntrack_ecache.c	2008-05-29 09:46:30.000000000 -0400
@@ -29,8 +29,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_chain);
 ATOMIC_NOTIFIER_HEAD(nf_ct_expect_chain);
 EXPORT_SYMBOL_GPL(nf_ct_expect_chain);
 
-DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
-EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache);
+DEFINE_PER_CPU_LOCKED(struct nf_conntrack_ecache, nf_conntrack_ecache);
+EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(nf_conntrack_ecache);
 
 /* deliver cached events and clear cache entry - must be called with locally
  * disabled softirqs */
@@ -52,22 +52,22 @@ __nf_ct_deliver_cached_events(struct nf_
 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
 {
 	struct nf_conntrack_ecache *ecache;
+	int cpu;
 
 	local_bh_disable();
-	ecache = &__get_cpu_var(nf_conntrack_ecache);
+	ecache = &get_cpu_var_locked(nf_conntrack_ecache, &cpu);
 	if (ecache->ct == ct)
 		__nf_ct_deliver_cached_events(ecache);
+	put_cpu_var_locked(nf_conntrack_ecache, cpu);
 	local_bh_enable();
 }
 EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
 
 /* Deliver cached events for old pending events, if current conntrack != old */
-void __nf_ct_event_cache_init(struct nf_conn *ct)
+void
+__nf_ct_event_cache_init(struct nf_conntrack_ecache *ecache, struct nf_conn *ct)
 {
-	struct nf_conntrack_ecache *ecache;
-
 	/* take care of delivering potentially old events */
-	ecache = &__get_cpu_var(nf_conntrack_ecache);
 	BUG_ON(ecache->ct == ct);
 	if (ecache->ct)
 		__nf_ct_deliver_cached_events(ecache);
@@ -85,7 +85,7 @@ void nf_ct_event_cache_flush(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		ecache = &per_cpu(nf_conntrack_ecache, cpu);
+		ecache = &__get_cpu_var_locked(nf_conntrack_ecache, cpu);
 		if (ecache->ct)
 			nf_ct_put(ecache->ct);
 	}
Index: linux-2.6.25.4-rt4/arch/powerpc/mm/init_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/mm/init_32.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/mm/init_32.c	2008-05-29 09:46:30.000000000 -0400
@@ -54,7 +54,7 @@
 #endif
 #define MAX_LOW_MEM	CONFIG_LOWMEM_SIZE
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 unsigned long total_memory;
 unsigned long total_lowmem;
Index: linux-2.6.25.4-rt4/arch/powerpc/mm/tlb_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/mm/tlb_64.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/mm/tlb_64.c	2008-05-29 09:47:15.000000000 -0400
@@ -30,13 +30,14 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/bug.h>
+#include <asm/machdep.h>
 
 DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 
 /* This is declared as we are using the more or less generic
  * include/asm-powerpc/tlb.h file -- tgall
  */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 unsigned long pte_freelist_forced_free;
 
@@ -91,8 +92,11 @@ static void pte_free_submit(struct pte_f
 
 void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 {
-	/* This is safe since tlb_gather_mmu has disabled preemption */
-        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+	/*
+	 * This is safe since tlb_gather_mmu has disabled preemption.
+	 * tlb->cpu is set by tlb_gather_mmu as well.
+	 */
+        cpumask_t local_cpumask = cpumask_of_cpu(tlb->cpu);
 	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
@@ -127,7 +131,7 @@ void pgtable_free_tlb(struct mmu_gather 
 void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, unsigned long pte, int huge)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
 	unsigned long vsid, vaddr;
 	unsigned int psize;
 	int ssize;
@@ -178,6 +182,7 @@ void hpte_need_flush(struct mm_struct *m
 	 */
 	if (!batch->active) {
 		flush_hash_page(vaddr, rpte, psize, ssize, 0);
+		put_cpu_var(ppc64_tlb_batch);
 		return;
 	}
 
@@ -204,8 +209,22 @@ void hpte_need_flush(struct mm_struct *m
 	batch->pte[i] = rpte;
 	batch->vaddr[i] = vaddr;
 	batch->index = ++i;
+
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Since flushing tlb needs expensive hypervisor call(s) on celleb,
+	 * always flush it on RT to reduce scheduling latency.
+	 */
+	if (machine_is(celleb)) {
+		__flush_tlb_pending(batch);
+		put_cpu_var(ppc64_tlb_batch);
+		return;
+	}
+#endif /* CONFIG_PREEMPT_RT */
+
 	if (i >= PPC64_TLB_BATCH_NR)
 		__flush_tlb_pending(batch);
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 /*
Index: linux-2.6.25.4-rt4/net/core/dev.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/core/dev.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/net/core/dev.c	2008-05-29 09:47:15.000000000 -0400
@@ -1696,11 +1696,10 @@ gso:
 	   Either shot noqueue qdisc, it is even simpler 8)
 	 */
 	if (dev->flags & IFF_UP) {
-		int cpu = smp_processor_id(); /* ok because BHs are off */
 
-		if (dev->xmit_lock_owner != cpu) {
+		if (dev->xmit_lock_owner != (void *)current) {
 
-			HARD_TX_LOCK(dev, cpu);
+			HARD_TX_LOCK(dev);
 
 			if (!netif_queue_stopped(dev) &&
 			    !netif_subqueue_stopped(dev, skb)) {
@@ -1805,8 +1804,8 @@ int netif_rx_ni(struct sk_buff *skb)
 {
 	int err;
 
-	preempt_disable();
 	err = netif_rx(skb);
+	preempt_disable();
 	if (local_softirq_pending())
 		do_softirq();
 	preempt_enable();
@@ -1834,7 +1833,8 @@ static inline struct net_device *skb_bon
 
 static void net_tx_action(struct softirq_action *h)
 {
-	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	struct softnet_data *sd = &per_cpu(softnet_data,
+					   raw_smp_processor_id());
 
 	if (sd->completion_queue) {
 		struct sk_buff *clist;
@@ -1850,6 +1850,11 @@ static void net_tx_action(struct softirq
 
 			BUG_TRAP(!atomic_read(&skb->users));
 			__kfree_skb(skb);
+			/*
+			 * Safe to reschedule - the list is private
+			 * at this point.
+			 */
+			cond_resched_softirq_context();
 		}
 	}
 
@@ -1868,12 +1873,27 @@ static void net_tx_action(struct softirq
 			smp_mb__before_clear_bit();
 			clear_bit(__LINK_STATE_SCHED, &dev->state);
 
+			/*
+			 * We are executing in softirq context here, and
+			 * if softirqs are preemptible, we must avoid
+			 * infinite reactivation of the softirq by
+			 * either the tx handler, or by netif_schedule().
+			 * (it would result in an infinitely looping
+			 *  softirq context)
+			 * So we take the spinlock unconditionally.
+			 */
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+			spin_lock(&dev->queue_lock);
+			qdisc_run(dev);
+			spin_unlock(&dev->queue_lock);
+#else
 			if (spin_trylock(&dev->queue_lock)) {
 				qdisc_run(dev);
 				spin_unlock(&dev->queue_lock);
 			} else {
 				netif_schedule(dev);
 			}
+#endif
 		}
 	}
 }
@@ -2041,7 +2061,7 @@ int netif_receive_skb(struct sk_buff *sk
 	if (!orig_dev)
 		return NET_RX_DROP;
 
-	__get_cpu_var(netdev_rx_stat).total++;
+	per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++;
 
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
@@ -2109,9 +2129,10 @@ out:
 static int process_backlog(struct napi_struct *napi, int quota)
 {
 	int work = 0;
-	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct softnet_data *queue;
 	unsigned long start_time = jiffies;
 
+	queue = &per_cpu(softnet_data, raw_smp_processor_id());
 	napi->weight = weight_p;
 	do {
 		struct sk_buff *skb;
@@ -2149,7 +2170,7 @@ void __napi_schedule(struct napi_struct 
 
 	local_irq_save(flags);
 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	raise_softirq_irqoff(NET_RX_SOFTIRQ);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(__napi_schedule);
@@ -2243,7 +2264,7 @@ out:
 
 softnet_break:
 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	raise_softirq_irqoff(NET_RX_SOFTIRQ);
 	goto out;
 }
 
@@ -3694,7 +3715,7 @@ int register_netdevice(struct net_device
 	spin_lock_init(&dev->queue_lock);
 	spin_lock_init(&dev->_xmit_lock);
 	netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
-	dev->xmit_lock_owner = -1;
+	dev->xmit_lock_owner = (void *)-1;
 	spin_lock_init(&dev->ingress_lock);
 
 	dev->iflink = -1;
Index: linux-2.6.25.4-rt4/fs/buffer.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/buffer.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/buffer.c	2008-05-29 09:46:56.000000000 -0400
@@ -40,7 +40,6 @@
 #include <linux/cpu.h>
 #include <linux/bitops.h>
 #include <linux/mpage.h>
-#include <linux/bit_spinlock.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -348,7 +347,7 @@ void invalidate_bdev(struct block_device
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	if (mapping->nrpages == 0)
+	if (mapping_nrpages(mapping) == 0)
 		return;
 
 	invalidate_bh_lrus();
@@ -403,8 +402,7 @@ static void end_buffer_async_read(struct
 	 * decide that the page is now completely done.
 	 */
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -417,8 +415,7 @@ static void end_buffer_async_read(struct
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 
 	/*
 	 * If none of the buffers had errors and they are all
@@ -430,8 +427,7 @@ static void end_buffer_async_read(struct
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 
@@ -466,8 +462,7 @@ static void end_buffer_async_write(struc
 	}
 
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 
 	clear_buffer_async_write(bh);
 	unlock_buffer(bh);
@@ -479,14 +474,12 @@ static void end_buffer_async_write(struc
 		}
 		tmp = tmp->b_this_page;
 	}
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	end_page_writeback(page);
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 
@@ -703,8 +696,9 @@ static int __set_page_dirty(struct page 
 	if (TestSetPageDirty(page))
 		return 0;
 
-	write_lock_irq(&mapping->tree_lock);
+	lock_page_ref_irq(page);
 	if (page->mapping) {	/* Race with truncate? */
+		DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 
 		if (mapping_cap_account_dirty(mapping)) {
@@ -713,10 +707,12 @@ static int __set_page_dirty(struct page 
 					BDI_RECLAIMABLE);
 			task_io_account_write(PAGE_CACHE_SIZE);
 		}
-		radix_tree_tag_set(&mapping->page_tree,
+		radix_tree_lock(&ctx);
+		radix_tree_tag_set(ctx.tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
+		radix_tree_unlock(&ctx);
 	}
-	write_unlock_irq(&mapping->tree_lock);
+	unlock_page_ref_irq(page);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	return 1;
@@ -3184,6 +3180,8 @@ struct buffer_head *alloc_buffer_head(gf
 				set_migrateflags(gfp_flags, __GFP_RECLAIMABLE));
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
+		spin_lock_init(&ret->b_uptodate_lock);
+		spin_lock_init(&ret->b_state_lock);
 		get_cpu_var(bh_accounting).nr++;
 		recalc_bh_state();
 		put_cpu_var(bh_accounting);
@@ -3195,6 +3193,8 @@ EXPORT_SYMBOL(alloc_buffer_head);
 void free_buffer_head(struct buffer_head *bh)
 {
 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
+	BUG_ON(spin_is_locked(&bh->b_uptodate_lock));
+	BUG_ON(spin_is_locked(&bh->b_state_lock));
 	kmem_cache_free(bh_cachep, bh);
 	get_cpu_var(bh_accounting).nr--;
 	recalc_bh_state();
Index: linux-2.6.25.4-rt4/fs/ntfs/aops.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/ntfs/aops.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/ntfs/aops.c	2008-05-29 09:47:12.000000000 -0400
@@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(s
 				"0x%llx.", (unsigned long long)bh->b_blocknr);
 	}
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(s
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	/*
 	 * If none of the buffers had errors then we can set the page uptodate,
 	 * but we first have to perform the post read mst fixups, if the
@@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(s
 		recs = PAGE_CACHE_SIZE / rec_size;
 		/* Should have been verified before we got here... */
 		BUG_ON(!recs);
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
 		for (i = 0; i < recs; i++)
 			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
 					i * rec_size), rec_size);
 		kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		flush_dcache_page(page);
 		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
@@ -159,8 +157,7 @@ static void ntfs_end_buffer_async_read(s
 	unlock_page(page);
 	return;
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 
Index: linux-2.6.25.4-rt4/include/linux/buffer_head.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/buffer_head.h	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/buffer_head.h	2008-05-29 09:46:31.000000000 -0400
@@ -21,10 +21,6 @@ enum bh_state_bits {
 	BH_Dirty,	/* Is dirty */
 	BH_Lock,	/* Is locked */
 	BH_Req,		/* Has been submitted for I/O */
-	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
-			  * IO completion of other buffers in the page
-			  */
-
 	BH_Mapped,	/* Has a disk mapping */
 	BH_New,		/* Disk mapping was newly created by get_block */
 	BH_Async_Read,	/* Is under end_buffer_async_read I/O */
@@ -73,6 +69,8 @@ struct buffer_head {
 	struct address_space *b_assoc_map;	/* mapping this buffer is
 						   associated with */
 	atomic_t b_count;		/* users using this buffer_head */
+	spinlock_t b_uptodate_lock;
+	spinlock_t b_state_lock;
 };
 
 /*
Index: linux-2.6.25.4-rt4/include/linux/jbd.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/jbd.h	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/jbd.h	2008-05-29 09:46:31.000000000 -0400
@@ -262,6 +262,15 @@ void buffer_assertion_failure(struct buf
 #define J_ASSERT_JH(jh, expr)	J_ASSERT(expr)
 #endif
 
+/*
+ * For assertions that are only valid on SMP (e.g. spin_is_locked()):
+ */
+#ifdef CONFIG_SMP
+# define J_ASSERT_JH_SMP(jh, expr)	J_ASSERT_JH(jh, expr)
+#else
+# define J_ASSERT_JH_SMP(jh, assert)	do { } while (0)
+#endif
+
 #if defined(JBD_PARANOID_IOFAIL)
 #define J_EXPECT(expr, why...)		J_ASSERT(expr)
 #define J_EXPECT_BH(bh, expr, why...)	J_ASSERT_BH(bh, expr)
@@ -317,32 +326,32 @@ static inline struct journal_head *bh2jh
 
 static inline void jbd_lock_bh_state(struct buffer_head *bh)
 {
-	bit_spin_lock(BH_State, &bh->b_state);
+	spin_lock(&bh->b_state_lock);
 }
 
 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
 {
-	return bit_spin_trylock(BH_State, &bh->b_state);
+	return spin_trylock(&bh->b_state_lock);
 }
 
 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
 {
-	return bit_spin_is_locked(BH_State, &bh->b_state);
+	return spin_is_locked(&bh->b_state_lock);
 }
 
 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
 {
-	bit_spin_unlock(BH_State, &bh->b_state);
+	spin_unlock(&bh->b_state_lock);
 }
 
 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
 {
-	bit_spin_lock(BH_JournalHead, &bh->b_state);
+	spin_lock_irq(&bh->b_uptodate_lock);
 }
 
 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 {
-	bit_spin_unlock(BH_JournalHead, &bh->b_state);
+	spin_unlock_irq(&bh->b_uptodate_lock);
 }
 
 struct jbd_revoke_table_s;
Index: linux-2.6.25.4-rt4/fs/jbd/transaction.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/jbd/transaction.c	2008-05-29 09:45:49.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/jbd/transaction.c	2008-05-29 09:46:31.000000000 -0400
@@ -1518,7 +1518,7 @@ static void __journal_temp_unlink_buffer
 	transaction_t *transaction;
 	struct buffer_head *bh = jh2bh(jh);
 
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
 	transaction = jh->b_transaction;
 	if (transaction)
 		assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1960,7 +1960,7 @@ void __journal_file_buffer(struct journa
 	int was_dirty = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
 	assert_spin_locked(&transaction->t_journal->j_list_lock);
 
 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2049,7 +2049,7 @@ void __journal_refile_buffer(struct jour
 	int was_dirty;
 	struct buffer_head *bh = jh2bh(jh);
 
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
 	if (jh->b_transaction)
 		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
 
Index: linux-2.6.25.4-rt4/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/proc/proc_misc.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/proc/proc_misc.c	2008-05-29 09:46:57.000000000 -0400
@@ -103,6 +103,27 @@ static int loadavg_read_proc(char *page,
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static int loadavg_rt_read_proc(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{
+	extern unsigned long avenrun_rt[];
+	extern unsigned long rt_nr_running(void);
+	int a, b, c;
+	int len;
+
+	a = avenrun_rt[0] + (FIXED_1/200);
+	b = avenrun_rt[1] + (FIXED_1/200);
+	c = avenrun_rt[2] + (FIXED_1/200);
+	len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
+		LOAD_INT(a), LOAD_FRAC(a),
+		LOAD_INT(b), LOAD_FRAC(b),
+		LOAD_INT(c), LOAD_FRAC(c),
+		rt_nr_running(), nr_threads, current->nsproxy->pid_ns->last_pid);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+#endif
+
 static int uptime_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -288,6 +309,25 @@ static const struct file_operations proc
 	.release	= seq_release,
 };
 
+#ifdef CONFIG_RADIX_TREE_OPTIMISTIC
+extern struct seq_operations optimistic_op;
+static int optimistic_open(struct inode *inode, struct file *file)
+{
+	(void)inode;
+	return seq_open(file, &optimistic_op);
+}
+
+extern ssize_t optimistic_write(struct file *, const char __user *, size_t, loff_t *);
+
+static struct file_operations optimistic_file_operations = {
+	.open	= optimistic_open,
+	.read	= seq_read,
+	.llseek	= seq_lseek,
+	.release = seq_release,
+	.write	= optimistic_write,
+};
+#endif
+
 static int devinfo_show(struct seq_file *f, void *v)
 {
 	int i = *(loff_t *) v;
@@ -460,7 +500,8 @@ static int show_stat(struct seq_file *p,
 {
 	int i;
 	unsigned long jif;
-	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
+	cputime64_t user_rt, user, nice, system_rt, system, idle,
+		    iowait, irq, softirq, steal;
 	cputime64_t guest;
 	u64 sum = 0;
 	struct timespec boottime;
@@ -470,7 +511,7 @@ static int show_stat(struct seq_file *p,
 	if (!per_irq_sum)
 		return -ENOMEM;
 
-	user = nice = system = idle = iowait =
+	user_rt = user = nice = system_rt = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
 	guest = cputime64_zero;
 	getboottime(&boottime);
@@ -487,6 +528,8 @@ static int show_stat(struct seq_file *p,
 		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
+		user_rt = cputime64_add(user_rt, kstat_cpu(i).cpustat.user_rt);
+		system_rt = cputime64_add(system_rt, kstat_cpu(i).cpustat.system_rt);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 		for (j = 0; j < NR_IRQS; j++) {
 			unsigned int temp = kstat_cpu(i).irqs[j];
@@ -495,7 +538,10 @@ static int show_stat(struct seq_file *p,
 		}
 	}
 
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+	user = cputime64_add(user_rt, user);
+	system = cputime64_add(system_rt, system);
+
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -504,13 +550,17 @@ static int show_stat(struct seq_file *p,
 		(unsigned long long)cputime64_to_clock_t(irq),
 		(unsigned long long)cputime64_to_clock_t(softirq),
 		(unsigned long long)cputime64_to_clock_t(steal),
+		(unsigned long long)cputime64_to_clock_t(user_rt),
+		(unsigned long long)cputime64_to_clock_t(system_rt),
 		(unsigned long long)cputime64_to_clock_t(guest));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-		user = kstat_cpu(i).cpustat.user;
+		user_rt = kstat_cpu(i).cpustat.user_rt;
+		system_rt = kstat_cpu(i).cpustat.system_rt;
+		user = cputime64_add(user_rt, kstat_cpu(i).cpustat.user);
 		nice = kstat_cpu(i).cpustat.nice;
-		system = kstat_cpu(i).cpustat.system;
+ 		system = cputime64_add(system_rt, kstat_cpu(i).cpustat.system);
 		idle = kstat_cpu(i).cpustat.idle;
 		iowait = kstat_cpu(i).cpustat.iowait;
 		irq = kstat_cpu(i).cpustat.irq;
@@ -518,7 +568,7 @@ static int show_stat(struct seq_file *p,
 		steal = kstat_cpu(i).cpustat.steal;
 		guest = kstat_cpu(i).cpustat.guest;
 		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -528,6 +578,8 @@ static int show_stat(struct seq_file *p,
 			(unsigned long long)cputime64_to_clock_t(irq),
 			(unsigned long long)cputime64_to_clock_t(softirq),
 			(unsigned long long)cputime64_to_clock_t(steal),
+			(unsigned long long)cputime64_to_clock_t(user_rt),
+			(unsigned long long)cputime64_to_clock_t(system_rt),
 			(unsigned long long)cputime64_to_clock_t(guest));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
@@ -548,6 +600,38 @@ static int show_stat(struct seq_file *p,
 		nr_iowait());
 
 	kfree(per_irq_sum);
+#ifdef CONFIG_PREEMPT_RT
+	{
+		unsigned long nr_uninterruptible_cpu(int cpu);
+		extern int pi_initialized;
+		unsigned long rt_nr_running(void);
+		unsigned long rt_nr_running_cpu(int cpu);
+		unsigned long rt_nr_uninterruptible(void);
+		unsigned long rt_nr_uninterruptible_cpu(int cpu);
+
+		int i;
+
+		seq_printf(p, "pi_init: %d\n", pi_initialized);
+		seq_printf(p, "nr_running(): %ld\n",
+			nr_running());
+		seq_printf(p, "nr_uninterruptible(): %ld\n",
+			nr_uninterruptible());
+		for_each_online_cpu(i)
+			seq_printf(p, "nr_uninterruptible(%d): %ld\n",
+				i, nr_uninterruptible_cpu(i));
+		seq_printf(p, "rt_nr_running(): %ld\n",
+			rt_nr_running());
+		for_each_online_cpu(i)
+			seq_printf(p, "rt_nr_running(%d): %ld\n",
+				i, rt_nr_running_cpu(i));
+		seq_printf(p, "nr_rt_uninterruptible(): %ld\n",
+			   rt_nr_uninterruptible());
+		for_each_online_cpu(i)
+			seq_printf(p, "nr_rt_uninterruptible(%d): %ld\n",
+				   i, rt_nr_uninterruptible_cpu(i));
+	}
+#endif
+
 	return 0;
 }
 
@@ -827,6 +911,9 @@ void __init proc_misc_init(void)
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
 		{"loadavg",     loadavg_read_proc},
+#ifdef CONFIG_PREEMPT_RT
+		{"loadavgrt",   loadavg_rt_read_proc},
+#endif
 		{"uptime",	uptime_read_proc},
 		{"meminfo",	meminfo_read_proc},
 		{"version",	version_read_proc},
@@ -855,6 +942,9 @@ void __init proc_misc_init(void)
 			entry->proc_fops = &proc_kmsg_operations;
 	}
 #endif
+#ifdef CONFIG_RADIX_TREE_OPTIMISTIC
+	create_seq_entry("radix_optimistic", 0, &optimistic_file_operations);
+#endif
 	create_seq_entry("locks", 0, &proc_locks_operations);
 	create_seq_entry("devices", 0, &proc_devinfo_operations);
 	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
Index: linux-2.6.25.4-rt4/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/kernel_stat.h	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/kernel_stat.h	2008-05-29 09:46:32.000000000 -0400
@@ -23,6 +23,8 @@ struct cpu_usage_stat {
 	cputime64_t idle;
 	cputime64_t iowait;
 	cputime64_t steal;
+	cputime64_t user_rt;
+	cputime64_t system_rt;
 	cputime64_t guest;
 };
 
Index: linux-2.6.25.4-rt4/include/asm-generic/bug.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-generic/bug.h	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-generic/bug.h	2008-05-29 09:46:49.000000000 -0400
@@ -3,6 +3,10 @@
 
 #include <linux/compiler.h>
 
+#ifndef __ASSEMBLY__
+extern void __WARN_ON(const char *func, const char *file, const int line);
+#endif /* __ASSEMBLY__ */
+
 #ifdef CONFIG_BUG
 
 #ifdef CONFIG_GENERIC_BUG
@@ -81,4 +85,16 @@ extern void warn_on_slowpath(const char 
 # define WARN_ON_SMP(x)			do { } while (0)
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+# define BUG_ON_RT(c)			BUG_ON(c)
+# define BUG_ON_NONRT(c)		do { } while (0)
+# define WARN_ON_RT(condition)		WARN_ON(condition)
+# define WARN_ON_NONRT(condition)	do { } while (0)
+#else
+# define BUG_ON_RT(c)			do { } while (0)
+# define BUG_ON_NONRT(c)		BUG_ON(c)
+# define WARN_ON_RT(condition)		do { } while (0)
+# define WARN_ON_NONRT(condition)	WARN_ON(condition)
+#endif
+
 #endif
Index: linux-2.6.25.4-rt4/include/linux/posix-timers.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/posix-timers.h	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/posix-timers.h	2008-05-29 09:46:33.000000000 -0400
@@ -115,4 +115,6 @@ void set_process_cpu_timer(struct task_s
 
 long clock_nanosleep_restart(struct restart_block *restart_block);
 
+int posix_cpu_thread_init(void);
+
 #endif
Index: linux-2.6.25.4-rt4/kernel/posix-cpu-timers.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/posix-cpu-timers.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/posix-cpu-timers.c	2008-05-29 09:46:33.000000000 -0400
@@ -578,7 +578,7 @@ static void arm_timer(struct k_itimer *t
 		p->cpu_timers : p->signal->cpu_timers);
 	head += CPUCLOCK_WHICH(timer->it_clock);
 
-	BUG_ON(!irqs_disabled());
+	BUG_ON_NONRT(!irqs_disabled());
 	spin_lock(&p->sighand->siglock);
 
 	listpos = head;
@@ -735,7 +735,7 @@ int posix_cpu_timer_set(struct k_itimer 
 	/*
 	 * Disarm any old timer after extracting its expiry time.
 	 */
-	BUG_ON(!irqs_disabled());
+	BUG_ON_NONRT(!irqs_disabled());
 
 	ret = 0;
 	spin_lock(&p->sighand->siglock);
@@ -1317,28 +1317,21 @@ out:
  * already updated our counts.  We need to check if any timers fire now.
  * Interrupts are disabled.
  */
-void run_posix_cpu_timers(struct task_struct *tsk)
+void __run_posix_cpu_timers(struct task_struct *tsk)
 {
 	LIST_HEAD(firing);
 	struct k_itimer *timer, *next;
 
-	BUG_ON(!irqs_disabled());
-
-#define UNEXPIRED(clock) \
-		(cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
-		 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
-
-	if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
-	    (tsk->it_sched_expires == 0 ||
-	     tsk->se.sum_exec_runtime < tsk->it_sched_expires))
-		return;
-
-#undef	UNEXPIRED
-
 	/*
 	 * Double-check with locks held.
 	 */
 	read_lock(&tasklist_lock);
+	/* Make sure the task doesn't exit under us. */
+	if (unlikely(tsk->exit_state)) {
+		read_unlock(&tasklist_lock);
+		return;
+	}
+
 	if (likely(tsk->signal != NULL)) {
 		spin_lock(&tsk->sighand->siglock);
 
@@ -1385,6 +1378,182 @@ void run_posix_cpu_timers(struct task_st
 	}
 }
 
+#include <linux/kthread.h>
+#include <linux/cpu.h>
+DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
+DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
+
+static int posix_cpu_timers_thread(void *data)
+{
+	int cpu = (long)data;
+
+	BUG_ON(per_cpu(posix_timer_task,cpu) != current);
+
+
+	while (!kthread_should_stop()) {
+		struct task_struct *tsk = NULL;
+		struct task_struct *next = NULL;
+
+		if (cpu_is_offline(cpu)) {
+			goto wait_to_die;
+		}
+
+		/* grab task list */
+		raw_local_irq_disable();
+		tsk = per_cpu(posix_timer_tasklist, cpu);
+		per_cpu(posix_timer_tasklist, cpu) = NULL;
+		raw_local_irq_enable();
+
+
+		/* its possible the list is empty, just return */
+		if (!tsk) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			__set_current_state(TASK_RUNNING);
+			continue;
+		}
+
+		/* Process task list */
+		while (1) {
+			/* save next */
+			next = tsk->posix_timer_list;
+
+			/* run the task timers, clear its ptr and
+			 * unreference it
+			 */
+			__run_posix_cpu_timers(tsk);
+			tsk->posix_timer_list = NULL;
+			put_task_struct(tsk);
+
+			/* check if this is the last on the list */
+			if (next == tsk)
+				break;
+			tsk = next;
+		}
+	}
+	return 0;
+
+wait_to_die:
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+	unsigned long cpu = smp_processor_id();
+	struct task_struct *tasklist;
+
+	BUG_ON(!irqs_disabled());
+	if(!per_cpu(posix_timer_task, cpu))
+		return;
+
+
+#define UNEXPIRED(clock) \
+		(cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
+		 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
+
+	if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
+	    (tsk->it_sched_expires == 0 ||
+	     tsk->se.sum_exec_runtime < tsk->it_sched_expires))
+		return;
+
+#undef	UNEXPIRED
+
+	/* get per-cpu references */
+	tasklist = per_cpu(posix_timer_tasklist, cpu);
+
+	/* check to see if we're already queued */
+	if (!tsk->posix_timer_list) {
+		get_task_struct(tsk);
+		if (tasklist) {
+			tsk->posix_timer_list = tasklist;
+		} else {
+			/*
+			 * The list is terminated by a self-pointing
+			 * task_struct
+			 */
+			tsk->posix_timer_list = tsk;
+		}
+		per_cpu(posix_timer_tasklist, cpu) = tsk;
+	}
+	/* XXX signal the thread somehow */
+	wake_up_process(per_cpu(posix_timer_task, cpu));
+}
+
+
+
+
+/*
+ * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+static int posix_cpu_thread_call(struct notifier_block *nfb, unsigned long action,
+			  void *hcpu)
+{
+	int cpu = (long)hcpu;
+	struct task_struct *p;
+	struct sched_param param;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		p = kthread_create(posix_cpu_timers_thread, hcpu,
+					"posix_cpu_timers/%d",cpu);
+		if (IS_ERR(p))
+			return NOTIFY_BAD;
+		p->flags |= PF_NOFREEZE;
+		kthread_bind(p, cpu);
+		/* Must be high prio to avoid getting starved */
+		param.sched_priority = MAX_RT_PRIO-1;
+		sched_setscheduler(p, SCHED_FIFO, &param);
+		per_cpu(posix_timer_task,cpu) = p;
+		break;
+	case CPU_ONLINE:
+		/* Strictly unneccessary, as first user will wake it. */
+		wake_up_process(per_cpu(posix_timer_task,cpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind it from offline cpu so it can run.  Fall thru. */
+		kthread_bind(per_cpu(posix_timer_task,cpu),
+			     any_online_cpu(cpu_online_map));
+		kthread_stop(per_cpu(posix_timer_task,cpu));
+		per_cpu(posix_timer_task,cpu) = NULL;
+		break;
+	case CPU_DEAD:
+		kthread_stop(per_cpu(posix_timer_task,cpu));
+		per_cpu(posix_timer_task,cpu) = NULL;
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
+static struct notifier_block __devinitdata posix_cpu_thread_notifier = {
+	.notifier_call = posix_cpu_thread_call,
+	.priority = 10
+};
+
+int __init posix_cpu_thread_init(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	/* Start one for boot CPU. */
+	posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu);
+	posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu);
+	register_cpu_notifier(&posix_cpu_thread_notifier);
+	return 0;
+}
+
+
+
 /*
  * Set one of the process-wide special case CPU timers.
  * The tasklist_lock and tsk->sighand->siglock must be held by the caller.
@@ -1650,6 +1819,12 @@ static __init int init_posix_cpu_timers(
 		.nsleep = thread_cpu_nsleep,
 		.nsleep_restart = thread_cpu_nsleep_restart,
 	};
+	unsigned long cpu;
+
+	/* init the per-cpu posix_timer_tasklets */
+	for_each_cpu_mask(cpu, cpu_possible_map) {
+		per_cpu(posix_timer_tasklist, cpu) = NULL;
+	}
 
 	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
 	register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
Index: linux-2.6.25.4-rt4/drivers/net/3c59x.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/3c59x.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/3c59x.c	2008-05-29 09:46:33.000000000 -0400
@@ -792,9 +792,9 @@ static void poll_vortex(struct net_devic
 {
 	struct vortex_private *vp = netdev_priv(dev);
 	unsigned long flags;
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	(vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 #endif
 
@@ -1739,6 +1739,7 @@ vortex_timer(unsigned long data)
 	int next_tick = 60*HZ;
 	int ok = 0;
 	int media_status, old_window;
+	unsigned long flags;
 
 	if (vortex_debug > 2) {
 		printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n",
@@ -1746,7 +1747,7 @@ vortex_timer(unsigned long data)
 		printk(KERN_DEBUG "dev->watchdog_timeo=%d\n", dev->watchdog_timeo);
 	}
 
-	disable_irq_lockdep(dev->irq);
+	spin_lock_irqsave(&vp->lock, flags);
 	old_window = ioread16(ioaddr + EL3_CMD) >> 13;
 	EL3WINDOW(4);
 	media_status = ioread16(ioaddr + Wn4_Media);
@@ -1769,9 +1770,7 @@ vortex_timer(unsigned long data)
 	case XCVR_MII: case XCVR_NWAY:
 		{
 			ok = 1;
-			spin_lock_bh(&vp->lock);
 			vortex_check_media(dev, 0);
-			spin_unlock_bh(&vp->lock);
 		}
 		break;
 	  default:					/* Other media types handled by Tx timeouts. */
@@ -1827,7 +1826,7 @@ leave_media_alone:
 			 dev->name, media_tbl[dev->if_port].name);
 
 	EL3WINDOW(old_window);
-	enable_irq_lockdep(dev->irq);
+	spin_unlock_irqrestore(&vp->lock, flags);
 	mod_timer(&vp->timer, RUN_AT(next_tick));
 	if (vp->deferred)
 		iowrite16(FakeIntr, ioaddr + EL3_CMD);
@@ -1861,12 +1860,12 @@ static void vortex_tx_timeout(struct net
 			 * Block interrupts because vortex_interrupt does a bare spin_lock()
 			 */
 			unsigned long flags;
-			local_irq_save(flags);
+			local_irq_save_nort(flags);
 			if (vp->full_bus_master_tx)
 				boomerang_interrupt(dev->irq, dev);
 			else
 				vortex_interrupt(dev->irq, dev);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 		}
 	}
 
Index: linux-2.6.25.4-rt4/drivers/serial/8250.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/serial/8250.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/serial/8250.c	2008-05-29 09:46:34.000000000 -0400
@@ -1455,7 +1455,10 @@ static irqreturn_t serial8250_interrupt(
 {
 	struct irq_info *i = dev_id;
 	struct list_head *l, *end = NULL;
-	int pass_counter = 0, handled = 0;
+#ifndef CONFIG_PREEMPT_RT
+	int pass_counter = 0;
+#endif
+	int handled = 0;
 
 	DEBUG_INTR("serial8250_interrupt(%d)...", irq);
 
@@ -1493,12 +1496,18 @@ static irqreturn_t serial8250_interrupt(
 
 		l = l->next;
 
+		/*
+		 * On preempt-rt we can be preempted and run in our
+		 * own thread.
+		 */
+#ifndef CONFIG_PREEMPT_RT
 		if (l == i->head && pass_counter++ > PASS_LIMIT) {
 			/* If we hit this, we're dead. */
 			printk(KERN_ERR "serial8250: too much work for "
 				"irq%d\n", irq);
 			break;
 		}
+#endif
 	} while (l != end);
 
 	spin_unlock(&i->lock);
@@ -2475,14 +2484,10 @@ serial8250_console_write(struct console 
 
 	touch_nmi_watchdog();
 
-	local_irq_save(flags);
-	if (up->port.sysrq) {
-		/* serial8250_handle_port() already took the lock */
-		locked = 0;
-	} else if (oops_in_progress) {
-		locked = spin_trylock(&up->port.lock);
-	} else
-		spin_lock(&up->port.lock);
+	if (up->port.sysrq || oops_in_progress)
+		locked = spin_trylock_irqsave(&up->port.lock, flags);
+	else
+		spin_lock_irqsave(&up->port.lock, flags);
 
 	/*
 	 *	First save the IER then disable the interrupts
@@ -2514,8 +2519,7 @@ serial8250_console_write(struct console 
 		check_modem_status(up);
 
 	if (locked)
-		spin_unlock(&up->port.lock);
-	local_irq_restore(flags);
+		spin_unlock_irqrestore(&up->port.lock, flags);
 }
 
 static int __init serial8250_console_setup(struct console *co, char *options)
Index: linux-2.6.25.4-rt4/drivers/net/ibm_emac/ibm_emac_core.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/ibm_emac/ibm_emac_core.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/ibm_emac/ibm_emac_core.c	2008-05-29 09:46:34.000000000 -0400
@@ -1058,6 +1058,8 @@ static inline int emac_xmit_finish(struc
 	++dev->stats.tx_packets;
 	dev->stats.tx_bytes += len;
 
+	spin_unlock(&dev->tx_lock);
+
 	return 0;
 }
 
@@ -1071,6 +1073,7 @@ static int emac_start_xmit(struct sk_buf
 	u16 ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY |
 	    MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb);
 
+	spin_lock(&dev->tx_lock);
 	slot = dev->tx_slot++;
 	if (dev->tx_slot == NUM_TX_BUFF) {
 		dev->tx_slot = 0;
@@ -1133,6 +1136,8 @@ static int emac_start_xmit_sg(struct sk_
 	if (likely(!nr_frags && len <= MAL_MAX_TX_SIZE))
 		return emac_start_xmit(skb, ndev);
 
+	spin_lock(&dev->tx_lock);
+
 	len -= skb->data_len;
 
 	/* Note, this is only an *estimation*, we can still run out of empty
@@ -1201,6 +1206,7 @@ static int emac_start_xmit_sg(struct sk_
       stop_queue:
 	netif_stop_queue(ndev);
 	DBG2("%d: stopped TX queue" NL, dev->def->index);
+	spin_unlock(&dev->tx_lock);
 	return 1;
 }
 #else
@@ -1240,6 +1246,7 @@ static void emac_poll_tx(void *param)
 	DBG2("%d: poll_tx, %d %d" NL, dev->def->index, dev->tx_cnt,
 	     dev->ack_slot);
 
+	spin_lock(&dev->tx_lock);
 	if (dev->tx_cnt) {
 		u16 ctrl;
 		int slot = dev->ack_slot, n = 0;
@@ -1249,6 +1256,7 @@ static void emac_poll_tx(void *param)
 			struct sk_buff *skb = dev->tx_skb[slot];
 			++n;
 
+			spin_unlock(&dev->tx_lock);
 			if (skb) {
 				dev_kfree_skb(skb);
 				dev->tx_skb[slot] = NULL;
@@ -1258,6 +1266,7 @@ static void emac_poll_tx(void *param)
 			if (unlikely(EMAC_IS_BAD_TX(ctrl)))
 				emac_parse_tx_error(dev, ctrl);
 
+			spin_lock(&dev->tx_lock);
 			if (--dev->tx_cnt)
 				goto again;
 		}
@@ -1270,6 +1279,7 @@ static void emac_poll_tx(void *param)
 			DBG2("%d: tx %d pkts" NL, dev->def->index, n);
 		}
 	}
+	spin_unlock(&dev->tx_lock);
 }
 
 static inline void emac_recycle_rx_skb(struct ocp_enet_private *dev, int slot,
@@ -1964,6 +1974,7 @@ static int __init emac_probe(struct ocp_
 	dev->ndev = ndev;
 	dev->ldev = &ocpdev->dev;
 	dev->def = ocpdev->def;
+	spin_lock_init(&dev->tx_lock);
 
 	/* Find MAL device we are connected to */
 	maldev =
Index: linux-2.6.25.4-rt4/drivers/net/ibm_emac/ibm_emac_core.h
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/ibm_emac/ibm_emac_core.h	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/ibm_emac/ibm_emac_core.h	2008-05-29 09:46:34.000000000 -0400
@@ -193,6 +193,8 @@ struct ocp_enet_private {
 	struct ibm_emac_error_stats	estats;
 	struct net_device_stats		nstats;
 
+	spinlock_t			tx_lock;
+
 	struct device*			ldev;
 };
 
Index: linux-2.6.25.4-rt4/drivers/char/tty_io.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/char/tty_io.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/char/tty_io.c	2008-05-29 09:46:53.000000000 -0400
@@ -244,14 +244,13 @@ int tty_paranoia_check(struct tty_struct
 static int check_tty_count(struct tty_struct *tty, const char *routine)
 {
 #ifdef CHECK_TTY_COUNT
-	struct list_head *p;
+	struct file *filp;
 	int count = 0;
 
-	file_list_lock();
-	list_for_each(p, &tty->tty_files) {
+	percpu_list_fold(&tty->tty_files);
+	lock_list_for_each_entry(filp, percpu_list_head(&tty->tty_files), f_u.fu_llist)
 		count++;
-	}
-	file_list_unlock();
+
 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 	    tty->driver->subtype == PTY_TYPE_SLAVE &&
 	    tty->link && tty->link->count)
@@ -260,6 +259,7 @@ static int check_tty_count(struct tty_st
 		printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) "
 				    "!= #fd's(%d) in %s\n",
 		       tty->name, tty->count, count, routine);
+		dump_stack();
 		return count;
 	}
 #endif
@@ -1377,9 +1377,8 @@ static void do_tty_hangup(struct work_st
 	spin_unlock(&redirect_lock);
 
 	check_tty_count(tty, "do_tty_hangup");
-	file_list_lock();
 	/* This breaks for file handles being sent over AF_UNIX sockets ? */
-	list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) {
+	lock_list_for_each_entry(filp, percpu_list_head(&tty->tty_files), f_u.fu_llist) {
 		if (filp->f_op->write == redirected_tty_write)
 			cons_filp = filp;
 		if (filp->f_op->write != tty_write)
@@ -1388,7 +1387,6 @@ static void do_tty_hangup(struct work_st
 		tty_fasync(-1, filp, 0);	/* can't block */
 		filp->f_op = &hung_up_tty_fops;
 	}
-	file_list_unlock();
 	/*
 	 * FIXME! What are the locking issues here? This may me overdoing
 	 * things... This question is especially important now that we've
@@ -2269,9 +2267,9 @@ static void release_one_tty(struct tty_s
 	tty->magic = 0;
 	tty->driver->refcount--;
 
-	file_list_lock();
-	list_del_init(&tty->tty_files);
-	file_list_unlock();
+	percpu_list_fold(&tty->tty_files);
+	lock_list_del_init(percpu_list_head(&tty->tty_files));
+	percpu_list_destroy(&tty->tty_files);
 
 	free_tty_struct(tty);
 }
@@ -3694,10 +3692,14 @@ void tty_flip_buffer_push(struct tty_str
 		tty->buf.tail->commit = tty->buf.tail->used;
 	spin_unlock_irqrestore(&tty->buf.lock, flags);
 
+#ifndef CONFIG_PREEMPT_RT
 	if (tty->low_latency)
 		flush_to_ldisc(&tty->buf.work.work);
 	else
 		schedule_delayed_work(&tty->buf.work, 1);
+#else
+	flush_to_ldisc(&tty->buf.work.work);
+#endif
 }
 
 EXPORT_SYMBOL(tty_flip_buffer_push);
@@ -3731,7 +3733,7 @@ static void initialize_tty_struct(struct
 	mutex_init(&tty->atomic_read_lock);
 	mutex_init(&tty->atomic_write_lock);
 	spin_lock_init(&tty->read_lock);
-	INIT_LIST_HEAD(&tty->tty_files);
+	percpu_list_init(&tty->tty_files);
 	INIT_WORK(&tty->SAK_work, do_SAK_work);
 }
 
Index: linux-2.6.25.4-rt4/arch/arm/kernel/dma.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/dma.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/dma.c	2008-05-29 09:46:35.000000000 -0400
@@ -20,7 +20,7 @@
 
 #include <asm/mach/dma.h>
 
-DEFINE_SPINLOCK(dma_spin_lock);
+DEFINE_RAW_SPINLOCK(dma_spin_lock);
 EXPORT_SYMBOL(dma_spin_lock);
 
 static dma_t dma_chan[MAX_DMA_CHANNELS];
Index: linux-2.6.25.4-rt4/arch/arm/kernel/signal.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/signal.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/signal.c	2008-05-29 09:46:35.000000000 -0400
@@ -623,6 +623,14 @@ static int do_signal(sigset_t *oldset, s
 	siginfo_t info;
 	int signr;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	local_irq_enable();
+	preempt_check_resched();
+#endif
+
 	/*
 	 * We want the common case to go fast, which
 	 * is why we may in certain cases get here from
Index: linux-2.6.25.4-rt4/arch/arm/kernel/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/kernel/smp.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/kernel/smp.c	2008-05-29 09:46:35.000000000 -0400
@@ -542,7 +542,7 @@ static void ipi_call_function(unsigned i
 		cpu_clear(cpu, data->unfinished);
 }
 
-static DEFINE_SPINLOCK(stop_lock);
+static DEFINE_RAW_SPINLOCK(stop_lock);
 
 /*
  * ipi_cpu_stop - handle IPI from smp_send_stop()
Index: linux-2.6.25.4-rt4/arch/arm/mm/consistent.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mm/consistent.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mm/consistent.c	2008-05-29 09:46:35.000000000 -0400
@@ -40,7 +40,7 @@
  * These are the page tables (2MB each) covering uncached, DMA consistent allocations
  */
 static pte_t *consistent_pte[NUM_CONSISTENT_PTES];
-static DEFINE_SPINLOCK(consistent_lock);
+static DEFINE_RAW_SPINLOCK(consistent_lock);
 
 /*
  * VM region handling support.
Index: linux-2.6.25.4-rt4/arch/arm/mm/copypage-v6.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mm/copypage-v6.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mm/copypage-v6.c	2008-05-29 09:46:35.000000000 -0400
@@ -26,7 +26,7 @@
 #define from_address	(0xffff8000)
 #define to_address	(0xffffc000)
 
-static DEFINE_SPINLOCK(v6_lock);
+static DEFINE_RAW_SPINLOCK(v6_lock);
 
 /*
  * Copy the user page.  No aliasing to deal with so we can just
Index: linux-2.6.25.4-rt4/arch/arm/mm/mmu.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mm/mmu.c	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mm/mmu.c	2008-05-29 09:46:35.000000000 -0400
@@ -25,7 +25,7 @@
 
 #include "mm.h"
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 extern void _stext, _etext, __data_start, _end;
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
Index: linux-2.6.25.4-rt4/include/asm-arm/dma.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/dma.h	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/dma.h	2008-05-29 09:46:35.000000000 -0400
@@ -27,7 +27,7 @@ typedef unsigned int dmamode_t;
 #define DMA_MODE_CASCADE 2
 #define DMA_AUTOINIT	 4
 
-extern spinlock_t  dma_spin_lock;
+extern raw_spinlock_t  dma_spin_lock;
 
 static inline unsigned long claim_dma_lock(void)
 {
Index: linux-2.6.25.4-rt4/include/asm-arm/tlb.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/tlb.h	2008-05-29 09:45:48.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/tlb.h	2008-05-29 09:46:35.000000000 -0400
@@ -36,15 +36,18 @@
 struct mmu_gather {
 	struct mm_struct	*mm;
 	unsigned int		fullmm;
+	int			cpu;
 };
 
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+	int cpu;
+	struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu);
 
+	tlb->cpu = cpu;
 	tlb->mm = mm;
 	tlb->fullmm = full_mm_flush;
 
@@ -60,7 +63,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
 
-	put_cpu_var(mmu_gathers);
+	put_cpu_var_locked(mmu_gathers, tlb->cpu);
 }
 
 #define tlb_remove_tlb_entry(tlb,ptep,address)	do { } while (0)
Index: linux-2.6.25.4-rt4/arch/arm/mm/context.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mm/context.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mm/context.c	2008-05-29 09:46:35.000000000 -0400
@@ -14,7 +14,7 @@
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
 
-static DEFINE_SPINLOCK(cpu_asid_lock);
+static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
 unsigned int cpu_last_asid = ASID_FIRST_VERSION;
 
 /*
Index: linux-2.6.25.4-rt4/arch/arm/mach-sa1100/badge4.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-sa1100/badge4.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-sa1100/badge4.c	2008-05-29 09:46:35.000000000 -0400
@@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i
 	/* detect on->off and off->on transitions */
 	if ((!old_5V_bitmap) && (badge4_5V_bitmap)) {
 		/* was off, now on */
-		printk(KERN_INFO "%s: enabling 5V supply rail\n", __func__);
 		GPSR = BADGE4_GPIO_PCMEN5V;
 	} else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) {
 		/* was on, now off */
-		printk(KERN_INFO "%s: disabling 5V supply rail\n", __func__);
 		GPCR = BADGE4_GPIO_PCMEN5V;
 	}
 
 	local_irq_restore(flags);
+
+	/* detect on->off and off->on transitions */
+	if ((!old_5V_bitmap) && (badge4_5V_bitmap)) {
+		/* was off, now on */
+		printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__);
+	} else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) {
+		/* was on, now off */
+		printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__);
+	}
 }
 EXPORT_SYMBOL(badge4_set_5V);
 
Index: linux-2.6.25.4-rt4/arch/arm/mach-footbridge/netwinder-hw.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-footbridge/netwinder-hw.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-footbridge/netwinder-hw.c	2008-05-29 09:46:35.000000000 -0400
@@ -67,7 +67,7 @@ static inline void wb977_ww(int reg, int
 /*
  * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE
  */
-DEFINE_SPINLOCK(gpio_lock);
+DEFINE_RAW_SPINLOCK(gpio_lock);
 
 static unsigned int current_gpio_op;
 static unsigned int current_gpio_io;
Index: linux-2.6.25.4-rt4/arch/arm/mach-footbridge/netwinder-leds.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-footbridge/netwinder-leds.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-footbridge/netwinder-leds.c	2008-05-29 09:46:35.000000000 -0400
@@ -32,7 +32,7 @@ static char led_state;
 static char hw_led_state;
 
 static DEFINE_SPINLOCK(leds_lock);
-extern spinlock_t gpio_lock;
+extern raw_spinlock_t gpio_lock;
 
 static void netwinder_leds_event(led_event_t evt)
 {
Index: linux-2.6.25.4-rt4/arch/arm/mach-integrator/core.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-integrator/core.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-integrator/core.c	2008-05-29 09:46:36.000000000 -0400
@@ -164,7 +164,7 @@ static struct amba_pl010_data integrator
 
 #define CM_CTRL	IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET
 
-static DEFINE_SPINLOCK(cm_lock);
+static DEFINE_RAW_SPINLOCK(cm_lock);
 
 /**
  * cm_control - update the CM_CTRL register.
Index: linux-2.6.25.4-rt4/arch/arm/mach-integrator/pci_v3.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-integrator/pci_v3.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-integrator/pci_v3.c	2008-05-29 09:46:36.000000000 -0400
@@ -162,7 +162,7 @@
  *	 7:2	register number
  *  
  */
-static DEFINE_SPINLOCK(v3_lock);
+static DEFINE_RAW_SPINLOCK(v3_lock);
 
 #define PCI_BUS_NONMEM_START	0x00000000
 #define PCI_BUS_NONMEM_SIZE	SZ_256M
Index: linux-2.6.25.4-rt4/arch/arm/mach-ixp4xx/common-pci.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-ixp4xx/common-pci.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-ixp4xx/common-pci.c	2008-05-29 09:46:36.000000000 -0400
@@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0;
  * these transactions are atomic or we will end up
  * with corrupt data on the bus or in a driver.
  */
-static DEFINE_SPINLOCK(ixp4xx_pci_lock);
+static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock);
 
 /*
  * Read from PCI config space
Index: linux-2.6.25.4-rt4/include/asm-arm/arch-pxa/timex.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/arch-pxa/timex.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/arch-pxa/timex.h	2008-05-29 09:46:36.000000000 -0400
@@ -16,6 +16,8 @@
 #define CLOCK_TICK_RATE 3686400
 #elif defined(CONFIG_PXA27x)
 /* PXA27x timer base */
+#include <asm-arm/arch-pxa/hardware.h>
+#include <asm-arm/arch-pxa/pxa-regs.h>
 #ifdef CONFIG_MACH_MAINSTONE
 #define CLOCK_TICK_RATE 3249600
 #else
@@ -24,3 +26,7 @@
 #else
 #define CLOCK_TICK_RATE 3250000
 #endif
+
+#define mach_read_cycles() OSCR
+#define mach_cycles_to_usecs(d) (((d) * ((1000000LL << 32) / CLOCK_TICK_RATE)) >> 32)
+#define mach_usecs_to_cycles(d) (((d) * (((long long)CLOCK_TICK_RATE << 32) / 1000000)) >> 32)
Index: linux-2.6.25.4-rt4/arch/arm/mach-shark/leds.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-shark/leds.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-shark/leds.c	2008-05-29 09:46:36.000000000 -0400
@@ -32,7 +32,7 @@ static char led_state;
 static short hw_led_state;
 static short saved_state;
 
-static DEFINE_SPINLOCK(leds_lock);
+static DEFINE_RAW_SPINLOCK(leds_lock);
 
 short sequoia_read(int addr) {
   outw(addr,0x24);
Index: linux-2.6.25.4-rt4/arch/mips/kernel/asm-offsets.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/asm-offsets.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/asm-offsets.c	2008-05-29 09:46:37.000000000 -0400
@@ -10,9 +10,11 @@
  */
 #include <linux/compat.h>
 #include <linux/types.h>
+#include <linux/linkage.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
+#include <linux/irqflags.h>
 
 #include <asm/ptrace.h>
 #include <asm/processor.h>
Index: linux-2.6.25.4-rt4/arch/mips/kernel/entry.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/entry.S	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/entry.S	2008-05-29 09:46:37.000000000 -0400
@@ -30,7 +30,7 @@
 	.align	5
 #ifndef CONFIG_PREEMPT
 FEXPORT(ret_from_exception)
-	local_irq_disable			# preempt stop
+	raw_local_irq_disable			# preempt stop
 	b	__ret_from_irq
 #endif
 FEXPORT(ret_from_irq)
@@ -41,7 +41,7 @@ FEXPORT(__ret_from_irq)
 	beqz	t0, resume_kernel
 
 resume_userspace:
-	local_irq_disable		# make sure we dont miss an
+	raw_local_irq_disable	# make sure we dont miss an
 					# interrupt setting need_resched
 					# between sampling and return
 	LONG_L	a2, TI_FLAGS($28)	# current->work
@@ -51,7 +51,9 @@ resume_userspace:
 
 #ifdef CONFIG_PREEMPT
 resume_kernel:
-	local_irq_disable
+	raw_local_irq_disable
+	lw	t0, kernel_preemption
+	beqz	t0, restore_all
 	lw	t0, TI_PRE_COUNT($28)
 	bnez	t0, restore_all
 need_resched:
@@ -61,7 +63,9 @@ need_resched:
 	LONG_L	t0, PT_STATUS(sp)		# Interrupts off?
 	andi	t0, 1
 	beqz	t0, restore_all
+	raw_local_irq_disable
 	jal	preempt_schedule_irq
+	sw      zero, TI_PRE_COUNT($28)
 	b	need_resched
 #endif
 
@@ -69,7 +73,7 @@ FEXPORT(ret_from_fork)
 	jal	schedule_tail		# a0 = struct task_struct *prev
 
 FEXPORT(syscall_exit)
-	local_irq_disable		# make sure need_resched and
+	raw_local_irq_disable	# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L	a2, TI_FLAGS($28)	# current->work
@@ -142,19 +146,21 @@ FEXPORT(restore_partial)		# restore part
 	.set	at
 
 work_pending:
-	andi	t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS
+					# a2 is preloaded with TI_FLAGS
+	andi	t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	beqz	t0, work_notifysig
 work_resched:
+	raw_local_irq_enable  t0
 	jal	schedule
 
-	local_irq_disable		# make sure need_resched and
+	raw_local_irq_disable	# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L	a2, TI_FLAGS($28)
 	andi	t0, a2, _TIF_WORK_MASK	# is there any work to be done
 					# other than syscall tracing?
 	beqz	t0, restore_all
-	andi	t0, a2, _TIF_NEED_RESCHED
+	andi	t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bnez	t0, work_resched
 
 work_notifysig:				# deal with pending signals and
@@ -170,7 +176,7 @@ syscall_exit_work:
 	li	t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
 	and	t0, a2			# a2 is preloaded with TI_FLAGS
 	beqz	t0, work_pending	# trace bit set?
-	local_irq_enable		# could let do_syscall_trace()
+	raw_local_irq_enable	# could let do_syscall_trace()
 					# call schedule() instead
 	move	a0, sp
 	li	a1, 1
Index: linux-2.6.25.4-rt4/arch/mips/kernel/i8259.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/i8259.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/i8259.c	2008-05-29 09:46:37.000000000 -0400
@@ -29,7 +29,7 @@
  */
 
 static int i8259A_auto_eoi = -1;
-DEFINE_SPINLOCK(i8259A_lock);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
 static void disable_8259A_irq(unsigned int irq);
 static void enable_8259A_irq(unsigned int irq);
 static void mask_and_ack_8259A(unsigned int irq);
Index: linux-2.6.25.4-rt4/arch/mips/kernel/module.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/module.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/module.c	2008-05-29 09:46:37.000000000 -0400
@@ -40,7 +40,7 @@ struct mips_hi16 {
 static struct mips_hi16 *mips_hi16_list;
 
 static LIST_HEAD(dbe_list);
-static DEFINE_SPINLOCK(dbe_lock);
+static DEFINE_RAW_SPINLOCK(dbe_lock);
 
 void *module_alloc(unsigned long size)
 {
Index: linux-2.6.25.4-rt4/arch/mips/kernel/process.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/process.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/process.c	2008-05-29 09:46:37.000000000 -0400
@@ -54,7 +54,7 @@ void __noreturn cpu_idle(void)
 	/* endless idle loop with no priority at all */
 	while (1) {
 		tick_nohz_stop_sched_tick();
-		while (!need_resched()) {
+		while (!need_resched() && !need_resched_delayed()) {
 #ifdef CONFIG_SMTC_IDLE_HOOK_DEBUG
 			extern void smtc_idle_loop_hook(void);
 
@@ -64,9 +64,11 @@ void __noreturn cpu_idle(void)
 				(*cpu_wait)();
 		}
 		tick_nohz_restart_sched_tick();
-		preempt_enable_no_resched();
-		schedule();
+		local_irq_disable();
+		__preempt_enable_no_resched();
+		__schedule();
 		preempt_disable();
+		local_irq_enable();
 	}
 }
 
Index: linux-2.6.25.4-rt4/arch/mips/kernel/scall32-o32.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/scall32-o32.S	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/scall32-o32.S	2008-05-29 09:46:37.000000000 -0400
@@ -73,7 +73,7 @@ stack_done:
 1:	sw	v0, PT_R2(sp)		# result
 
 o32_syscall_exit:
-	local_irq_disable		# make sure need_resched and
+	raw_local_irq_disable	# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	lw	a2, TI_FLAGS($28)	# current->work
Index: linux-2.6.25.4-rt4/arch/mips/kernel/scall64-64.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/scall64-64.S	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/scall64-64.S	2008-05-29 09:46:37.000000000 -0400
@@ -72,7 +72,7 @@ NESTED(handle_sys64, PT_SIZE, sp)
 1:	sd	v0, PT_R2(sp)		# result
 
 n64_syscall_exit:
-	local_irq_disable		# make sure need_resched and
+	raw_local_irq_disable		# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L	a2, TI_FLAGS($28)	# current->work
Index: linux-2.6.25.4-rt4/arch/mips/kernel/scall64-n32.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/scall64-n32.S	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/scall64-n32.S	2008-05-29 09:46:37.000000000 -0400
@@ -69,7 +69,7 @@ NESTED(handle_sysn32, PT_SIZE, sp)
 	sd	v0, PT_R0(sp)		# set flag for syscall restarting
 1:	sd	v0, PT_R2(sp)		# result
 
-	local_irq_disable		# make sure need_resched and
+	raw_local_irq_disable		# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L  a2, TI_FLAGS($28)	# current->work
Index: linux-2.6.25.4-rt4/arch/mips/kernel/scall64-o32.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/scall64-o32.S	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/scall64-o32.S	2008-05-29 09:46:37.000000000 -0400
@@ -98,7 +98,7 @@ NESTED(handle_sys, PT_SIZE, sp)
 1:	sd	v0, PT_R2(sp)		# result
 
 o32_syscall_exit:
-	local_irq_disable		# make need_resched and
+	raw_local_irq_disable		# make need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L	a2, TI_FLAGS($28)
Index: linux-2.6.25.4-rt4/arch/mips/kernel/semaphore.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/semaphore.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/semaphore.c	2008-05-29 09:46:37.000000000 -0400
@@ -36,7 +36,7 @@
  * sem->count and sem->waking atomic.  Scalability isn't an issue because
  * this lock is used on UP only so it's just an empty variable.
  */
-static inline int __sem_update_count(struct semaphore *sem, int incr)
+static inline int __sem_update_count(struct compat_semaphore *sem, int incr)
 {
 	int old_count, tmp;
 
@@ -67,7 +67,7 @@ static inline int __sem_update_count(str
 		: "=&r" (old_count), "=&r" (tmp), "=m" (sem->count)
 		: "r" (incr), "m" (sem->count));
 	} else {
-		static DEFINE_SPINLOCK(semaphore_lock);
+		static DEFINE_RAW_SPINLOCK(semaphore_lock);
 		unsigned long flags;
 
 		spin_lock_irqsave(&semaphore_lock, flags);
@@ -80,7 +80,7 @@ static inline int __sem_update_count(str
 	return old_count;
 }
 
-void __up(struct semaphore *sem)
+void __compat_up(struct compat_semaphore *sem)
 {
 	/*
 	 * Note that we incremented count in up() before we came here,
@@ -94,7 +94,7 @@ void __up(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-EXPORT_SYMBOL(__up);
+EXPORT_SYMBOL(__compat_up);
 
 /*
  * Note that when we come in to __down or __down_interruptible,
@@ -104,7 +104,7 @@ EXPORT_SYMBOL(__up);
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __sched __down(struct semaphore *sem)
+void __sched __compat_down(struct compat_semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -133,9 +133,9 @@ void __sched __down(struct semaphore *se
 	wake_up(&sem->wait);
 }
 
-EXPORT_SYMBOL(__down);
+EXPORT_SYMBOL(__compat_down);
 
-int __sched __down_interruptible(struct semaphore * sem)
+int __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -165,4 +165,10 @@ int __sched __down_interruptible(struct 
 	return retval;
 }
 
-EXPORT_SYMBOL(__down_interruptible);
+EXPORT_SYMBOL(__compat_down_interruptible);
+
+int  compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
+EXPORT_SYMBOL(compat_sem_is_locked);
Index: linux-2.6.25.4-rt4/arch/mips/kernel/signal.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/signal.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/signal.c	2008-05-29 09:46:37.000000000 -0400
@@ -629,6 +629,10 @@ static void do_signal(struct pt_regs *re
 	siginfo_t info;
 	int signr;
 
+#ifdef CONFIG_PREEMPT_RT
+	local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * We want the common case to go fast, which is why we may in certain
 	 * cases get here from kernel mode. Just return without doing anything
Index: linux-2.6.25.4-rt4/arch/mips/kernel/signal32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/signal32.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/signal32.c	2008-05-29 09:46:37.000000000 -0400
@@ -655,6 +655,10 @@ static int setup_rt_frame_32(struct k_si
 	if (err)
 		goto give_sigsegv;
 
+#ifdef CONFIG_PREEMPT_RT
+	local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * Arguments to signal handler:
 	 *
Index: linux-2.6.25.4-rt4/arch/mips/kernel/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/smp.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/smp.c	2008-05-29 09:46:37.000000000 -0400
@@ -128,7 +128,22 @@ asmlinkage __cpuinit void start_secondar
 	cpu_idle();
 }
 
-DEFINE_SPINLOCK(smp_call_lock);
+DEFINE_RAW_SPINLOCK(smp_call_lock);
+
+/*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them.
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		if (cpu_online(i) && i != cpu)
+			core_send_ipi(i, SMP_RESCHEDULE_YOURSELF);
+}
 
 struct call_data_struct *call_data;
 
@@ -352,6 +367,8 @@ int setup_profiling_timer(unsigned int m
 	return 0;
 }
 
+static DEFINE_RAW_SPINLOCK(tlbstate_lock);
+
 static void flush_tlb_all_ipi(void *info)
 {
 	local_flush_tlb_all();
@@ -409,6 +426,7 @@ static inline void smp_on_each_tlb(void 
 void flush_tlb_mm(struct mm_struct *mm)
 {
 	preempt_disable();
+	spin_lock(&tlbstate_lock);
 
 	if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) {
 		smp_on_other_tlbs(flush_tlb_mm_ipi, mm);
@@ -421,6 +439,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 			if (cpu_context(cpu, mm))
 				cpu_context(cpu, mm) = 0;
 	}
+	spin_unlock(&tlbstate_lock);
 	local_flush_tlb_mm(mm);
 
 	preempt_enable();
@@ -444,6 +463,8 @@ void flush_tlb_range(struct vm_area_stru
 	struct mm_struct *mm = vma->vm_mm;
 
 	preempt_disable();
+	spin_lock(&tlbstate_lock);
+
 	if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) {
 		struct flush_tlb_data fd = {
 			.vma = vma,
@@ -461,6 +482,7 @@ void flush_tlb_range(struct vm_area_stru
 			if (cpu_context(cpu, mm))
 				cpu_context(cpu, mm) = 0;
 	}
+	spin_unlock(&tlbstate_lock);
 	local_flush_tlb_range(vma, start, end);
 	preempt_enable();
 }
@@ -492,6 +514,8 @@ static void flush_tlb_page_ipi(void *inf
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 {
 	preempt_disable();
+	spin_lock(&tlbstate_lock);
+
 	if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) {
 		struct flush_tlb_data fd = {
 			.vma = vma,
@@ -508,6 +532,7 @@ void flush_tlb_page(struct vm_area_struc
 			if (cpu_context(cpu, vma->vm_mm))
 				cpu_context(cpu, vma->vm_mm) = 0;
 	}
+	spin_unlock(&tlbstate_lock);
 	local_flush_tlb_page(vma, page);
 	preempt_enable();
 }
Index: linux-2.6.25.4-rt4/arch/mips/kernel/traps.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/traps.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/traps.c	2008-05-29 09:46:37.000000000 -0400
@@ -320,7 +320,7 @@ void show_registers(const struct pt_regs
 	printk("\n");
 }
 
-static DEFINE_SPINLOCK(die_lock);
+static DEFINE_RAW_SPINLOCK(die_lock);
 
 void __noreturn die(const char * str, const struct pt_regs * regs)
 {
Index: linux-2.6.25.4-rt4/arch/mips/mm/init.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/mm/init.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/mm/init.c	2008-05-29 09:46:37.000000000 -0400
@@ -61,7 +61,7 @@
 
 #endif /* CONFIG_MIPS_MT_SMTC */
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 /*
  * We have up to 8 empty zeroed pages so we can map one of the right colour
Index: linux-2.6.25.4-rt4/arch/mips/sibyte/sb1250/irq.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/sibyte/sb1250/irq.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/sibyte/sb1250/irq.c	2008-05-29 09:46:37.000000000 -0400
@@ -82,7 +82,7 @@ static struct irq_chip sb1250_irq_type =
 /* Store the CPU id (not the logical number) */
 int sb1250_irq_owner[SB1250_NR_IRQS];
 
-DEFINE_SPINLOCK(sb1250_imr_lock);
+DEFINE_RAW_SPINLOCK(sb1250_imr_lock);
 
 void sb1250_mask_irq(int cpu, int irq)
 {
@@ -316,6 +316,10 @@ void __init arch_init_irq(void)
 #ifdef CONFIG_KGDB
 	imask |= STATUSF_IP6;
 #endif
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+	imask |= STATUSF_IP7;
+#endif
 	/* Enable necessary IPs, disable the rest */
 	change_c0_status(ST0_IM, imask);
 
Index: linux-2.6.25.4-rt4/arch/mips/sibyte/sb1250/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/sibyte/sb1250/smp.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/sibyte/sb1250/smp.c	2008-05-29 09:46:37.000000000 -0400
@@ -97,7 +97,7 @@ static void __cpuinit sb1250_smp_finish(
 	extern void sb1250_clockevent_init(void);
 
 	sb1250_clockevent_init();
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 /*
Index: linux-2.6.25.4-rt4/arch/mips/sibyte/swarm/setup.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/sibyte/swarm/setup.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/sibyte/swarm/setup.c	2008-05-29 09:46:37.000000000 -0400
@@ -136,6 +136,12 @@ void __init plat_mem_setup(void)
 	if (m41t81_probe())
 		swarm_rtc_type = RTC_M4LT81;
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+	/*
+	 * set the mips_hpt_frequency here
+	 */
+	mips_hpt_frequency = CONFIG_CPU_SPEED * 1000000;
+#endif
 	printk("This kernel optimized for "
 #ifdef CONFIG_SIMULATION
 	       "simulation"
Index: linux-2.6.25.4-rt4/include/asm-mips/asmmacro.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/asmmacro.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/asmmacro.h	2008-05-29 09:46:37.000000000 -0400
@@ -21,7 +21,7 @@
 #endif
 
 #ifdef CONFIG_MIPS_MT_SMTC
-	.macro	local_irq_enable reg=t0
+	.macro	raw_local_irq_enable reg=t0
 	mfc0	\reg, CP0_TCSTATUS
 	ori	\reg, \reg, TCSTATUS_IXMT
 	xori	\reg, \reg, TCSTATUS_IXMT
@@ -29,21 +29,21 @@
 	_ehb
 	.endm
 
-	.macro	local_irq_disable reg=t0
+	.macro	raw_local_irq_disable reg=t0
 	mfc0	\reg, CP0_TCSTATUS
 	ori	\reg, \reg, TCSTATUS_IXMT
 	mtc0	\reg, CP0_TCSTATUS
 	_ehb
 	.endm
 #else
-	.macro	local_irq_enable reg=t0
+	.macro	raw_local_irq_enable reg=t0
 	mfc0	\reg, CP0_STATUS
 	ori	\reg, \reg, 1
 	mtc0	\reg, CP0_STATUS
 	irq_enable_hazard
 	.endm
 
-	.macro	local_irq_disable reg=t0
+	.macro	raw_local_irq_disable reg=t0
 	mfc0	\reg, CP0_STATUS
 	ori	\reg, \reg, 1
 	xori	\reg, \reg, 1
Index: linux-2.6.25.4-rt4/include/asm-mips/bitops.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/bitops.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/bitops.h	2008-05-29 09:46:37.000000000 -0400
@@ -606,9 +606,6 @@ static inline unsigned long __ffs(unsign
 }
 
 /*
- * fls - find last bit set.
- * @word: The word to search
- *
  * This is defined the same way as ffs.
  * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
  */
@@ -626,6 +623,8 @@ static inline int fls64(__u64 word)
 
 	return 64 - word;
 }
+#define __bi_local_irq_save(x)		raw_local_irq_save(x)
+#define __bi_local_irq_restore(x)	raw_local_irq_restore(x)
 #else
 #include <asm-generic/bitops/fls64.h>
 #endif
Index: linux-2.6.25.4-rt4/include/asm-mips/hw_irq.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/hw_irq.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/hw_irq.h	2008-05-29 09:46:37.000000000 -0400
@@ -9,6 +9,7 @@
 #define __ASM_HW_IRQ_H
 
 #include <asm/atomic.h>
+#include <linux/irqflags.h>
 
 extern atomic_t irq_err_count;
 
Index: linux-2.6.25.4-rt4/include/asm-mips/i8259.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/i8259.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/i8259.h	2008-05-29 09:46:37.000000000 -0400
@@ -35,7 +35,7 @@
 #define SLAVE_ICW4_DEFAULT	0x01
 #define PIC_ICW4_AEOI		2
 
-extern spinlock_t i8259A_lock;
+extern raw_spinlock_t i8259A_lock;
 
 extern int i8259A_irq_pending(unsigned int irq);
 extern void make_8259A_irq(unsigned int irq);
Index: linux-2.6.25.4-rt4/include/asm-mips/io.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/io.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/io.h	2008-05-29 09:46:37.000000000 -0400
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
+#include <linux/irqflags.h>
 
 #include <asm/addrspace.h>
 #include <asm/byteorder.h>
Index: linux-2.6.25.4-rt4/include/asm-mips/linkage.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/linkage.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/linkage.h	2008-05-29 09:46:37.000000000 -0400
@@ -3,6 +3,11 @@
 
 #ifdef __ASSEMBLY__
 #include <asm/asm.h>
+
+/*  stuff */
+#define (x)	x
+#define
+
 #endif
 
 #define __weak __attribute__((weak))
Index: linux-2.6.25.4-rt4/include/asm-mips/m48t35.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/m48t35.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/m48t35.h	2008-05-29 09:46:37.000000000 -0400
@@ -6,7 +6,7 @@
 
 #include <linux/spinlock.h>
 
-extern spinlock_t rtc_lock;
+extern raw_spinlock_t rtc_lock;
 
 struct m48t35_rtc {
 	volatile u8	pad[0x7ff8];    /* starts at 0x7ff8 */
Index: linux-2.6.25.4-rt4/include/asm-mips/rwsem.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/asm-mips/rwsem.h	2008-05-29 09:46:37.000000000 -0400
@@ -0,0 +1,176 @@
+/*
+ * include/asm-mips/rwsem.h: R/W semaphores for MIPS using the stuff
+ * in lib/rwsem.c.  Adapted largely from include/asm-ppc/rwsem.h
+ * by john.cooper@timesys.com
+ */
+
+#ifndef _MIPS_RWSEM_H
+#define _MIPS_RWSEM_H
+
+#ifndef _LINUX_RWSEM_H
+#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
+#endif
+
+#ifdef __KERNEL__
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+
+/*
+ * the semaphore definition
+ */
+struct compat_rw_semaphore {
+	/* XXX this should be able to be an atomic_t  -- paulus */
+	signed long		count;
+#define RWSEM_UNLOCKED_VALUE		0x00000000
+#define RWSEM_ACTIVE_BIAS		0x00000001
+#define RWSEM_ACTIVE_MASK		0x0000ffff
+#define RWSEM_WAITING_BIAS		(-0x00010000)
+#define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+	raw_spinlock_t		wait_lock;
+	struct list_head	wait_list;
+#if RWSEM_DEBUG
+	int			debug;
+#endif
+};
+
+/*
+ * initialisation
+ */
+#if RWSEM_DEBUG
+#define __RWSEM_DEBUG_INIT      , 0
+#else
+#define __RWSEM_DEBUG_INIT	/* */
+#endif
+
+#define __COMPAT_RWSEM_INITIALIZER(name) \
+	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
+	  LIST_HEAD_INIT((name).wait_list) \
+	  __RWSEM_DEBUG_INIT }
+
+#define COMPAT_DECLARE_RWSEM(name)		\
+	struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name)
+
+extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem);
+
+static inline void compat_init_rwsem(struct compat_rw_semaphore *sem)
+{
+	sem->count = RWSEM_UNLOCKED_VALUE;
+	spin_lock_init(&sem->wait_lock);
+	INIT_LIST_HEAD(&sem->wait_list);
+#if RWSEM_DEBUG
+	sem->debug = 0;
+#endif
+}
+
+/*
+ * lock for reading
+ */
+static inline void __down_read(struct compat_rw_semaphore *sem)
+{
+	if (atomic_inc_return((atomic_t *)(&sem->count)) > 0)
+		smp_wmb();
+	else
+		rwsem_down_read_failed(sem);
+}
+
+static inline int __down_read_trylock(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	while ((tmp = sem->count) >= 0) {
+		if (tmp == cmpxchg(&sem->count, tmp,
+				   tmp + RWSEM_ACTIVE_READ_BIAS)) {
+			smp_wmb();
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * lock for writing
+ */
+static inline void __down_write(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS,
+				(atomic_t *)(&sem->count));
+	if (tmp == RWSEM_ACTIVE_WRITE_BIAS)
+		smp_wmb();
+	else
+		rwsem_down_write_failed(sem);
+}
+
+static inline int __down_write_trylock(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+		      RWSEM_ACTIVE_WRITE_BIAS);
+	smp_wmb();
+	return tmp == RWSEM_UNLOCKED_VALUE;
+}
+
+/*
+ * unlock after reading
+ */
+static inline void __up_read(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	smp_wmb();
+	tmp = atomic_dec_return((atomic_t *)(&sem->count));
+	if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)
+		rwsem_wake(sem);
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct compat_rw_semaphore *sem)
+{
+	smp_wmb();
+	if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
+			      (atomic_t *)(&sem->count)) < 0)
+		rwsem_wake(sem);
+}
+
+/*
+ * implement atomic add functionality
+ */
+static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem)
+{
+	atomic_add(delta, (atomic_t *)(&sem->count));
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	smp_wmb();
+	tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count));
+	if (tmp < 0)
+		rwsem_downgrade_wake(sem);
+}
+
+/*
+ * implement exchange and add functionality
+ */
+static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem)
+{
+	smp_mb();
+	return atomic_add_return(delta, (atomic_t *)(&sem->count));
+}
+
+#endif /* __KERNEL__ */
+#endif /* _MIPS_RWSEM_H */
Index: linux-2.6.25.4-rt4/include/asm-mips/spinlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/spinlock.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/spinlock.h	2008-05-29 09:46:37.000000000 -0400
@@ -28,7 +28,7 @@
  * We make no fairness assumptions.  They have a cost.
  */
 
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
+static inline void __raw_spin_lock(__raw_spinlock_t *lock)
 {
 	unsigned int tmp;
 
@@ -70,7 +70,7 @@ static inline void __raw_spin_lock(raw_s
 	smp_llsc_mb();
 }
 
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static inline void __raw_spin_unlock(__raw_spinlock_t *lock)
 {
 	smp_mb();
 
@@ -83,7 +83,7 @@ static inline void __raw_spin_unlock(raw
 	: "memory");
 }
 
-static inline unsigned int __raw_spin_trylock(raw_spinlock_t *lock)
+static inline unsigned int __raw_spin_trylock(__raw_spinlock_t *lock)
 {
 	unsigned int temp, res;
 
@@ -144,7 +144,7 @@ static inline unsigned int __raw_spin_tr
  */
 #define __raw_write_can_lock(rw)	(!(rw)->lock)
 
-static inline void __raw_read_lock(raw_rwlock_t *rw)
+static inline void __raw_read_lock(__raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 
@@ -189,7 +189,7 @@ static inline void __raw_read_lock(raw_r
 /* Note the use of sub, not subu which will make the kernel die with an
    overflow exception if we ever try to unlock an rwlock that is already
    unlocked or is being held by a writer.  */
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
+static inline void __raw_read_unlock(__raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 
@@ -223,7 +223,7 @@ static inline void __raw_read_unlock(raw
 	}
 }
 
-static inline void __raw_write_lock(raw_rwlock_t *rw)
+static inline void __raw_write_lock(__raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 
@@ -265,7 +265,7 @@ static inline void __raw_write_lock(raw_
 	smp_llsc_mb();
 }
 
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
+static inline void __raw_write_unlock(__raw_rwlock_t *rw)
 {
 	smp_mb();
 
@@ -277,7 +277,7 @@ static inline void __raw_write_unlock(ra
 	: "memory");
 }
 
-static inline int __raw_read_trylock(raw_rwlock_t *rw)
+static inline int __raw_read_trylock(__raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 	int ret;
@@ -321,7 +321,7 @@ static inline int __raw_read_trylock(raw
 	return ret;
 }
 
-static inline int __raw_write_trylock(raw_rwlock_t *rw)
+static inline int __raw_write_trylock(__raw_rwlock_t *rw)
 {
 	unsigned int tmp;
 	int ret;
Index: linux-2.6.25.4-rt4/include/asm-mips/spinlock_types.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/spinlock_types.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/spinlock_types.h	2008-05-29 09:46:37.000000000 -0400
@@ -7,13 +7,13 @@
 
 typedef struct {
 	volatile unsigned int lock;
-} raw_spinlock_t;
+} __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
 
 typedef struct {
 	volatile unsigned int lock;
-} raw_rwlock_t;
+} __raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED		{ 0 }
 
Index: linux-2.6.25.4-rt4/include/asm-mips/thread_info.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/thread_info.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/thread_info.h	2008-05-29 09:46:37.000000000 -0400
@@ -112,6 +112,7 @@ register struct thread_info *__current_t
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
 #define TIF_SYSCALL_AUDIT	3	/* syscall auditing active */
 #define TIF_SECCOMP		4	/* secure computing */
+#define TIF_NEED_RESCHED_DELAYED 6	/* reschedule on return to userspace */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal() */
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
@@ -129,6 +130,7 @@ register struct thread_info *__current_t
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
+#define _TIF_NEED_RESCHED_DELAYED (1<<TIF_NEED_RESCHED_DELAYED)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_USEDFPU		(1<<TIF_USEDFPU)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
Index: linux-2.6.25.4-rt4/include/asm-mips/time.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/time.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/time.h	2008-05-29 09:47:17.000000000 -0400
@@ -19,8 +19,6 @@
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
 
-extern spinlock_t rtc_lock;
-
 /*
  * RTC ops.  By default, they point to weak no-op RTC functions.
  *	rtc_mips_set_time - reverse the above translation and set time to RTC.
Index: linux-2.6.25.4-rt4/include/asm-mips/timeofday.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/asm-mips/timeofday.h	2008-05-29 09:46:37.000000000 -0400
@@ -0,0 +1,5 @@
+#ifndef _ASM_MIPS_TIMEOFDAY_H
+#define _ASM_MIPS_TIMEOFDAY_H
+#include <asm-generic/timeofday.h>
+#endif
+
Index: linux-2.6.25.4-rt4/include/asm-mips/uaccess.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/uaccess.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/uaccess.h	2008-05-29 09:46:37.000000000 -0400
@@ -434,7 +434,6 @@ extern size_t __copy_user(void *__to, co
 	const void *__cu_from;						\
 	long __cu_len;							\
 									\
-	might_sleep();							\
 	__cu_to = (to);							\
 	__cu_from = (from);						\
 	__cu_len = (n);							\
@@ -490,7 +489,6 @@ extern size_t __copy_user_inatomic(void 
 	const void *__cu_from;						\
 	long __cu_len;							\
 									\
-	might_sleep();							\
 	__cu_to = (to);							\
 	__cu_from = (from);						\
 	__cu_len = (n);							\
@@ -569,7 +567,6 @@ extern size_t __copy_user_inatomic(void 
 	const void __user *__cu_from;					\
 	long __cu_len;							\
 									\
-	might_sleep();							\
 	__cu_to = (to);							\
 	__cu_from = (from);						\
 	__cu_len = (n);							\
@@ -600,7 +597,6 @@ extern size_t __copy_user_inatomic(void 
 	const void __user *__cu_from;					\
 	long __cu_len;							\
 									\
-	might_sleep();							\
 	__cu_to = (to);							\
 	__cu_from = (from);						\
 	__cu_len = (n);							\
@@ -618,7 +614,6 @@ extern size_t __copy_user_inatomic(void 
 	const void __user *__cu_from;					\
 	long __cu_len;							\
 									\
-	might_sleep();							\
 	__cu_to = (to);							\
 	__cu_from = (from);						\
 	__cu_len = (n);							\
@@ -645,7 +640,6 @@ __clear_user(void __user *addr, __kernel
 {
 	__kernel_size_t res;
 
-	might_sleep();
 	__asm__ __volatile__(
 		"move\t$4, %1\n\t"
 		"move\t$5, $0\n\t"
@@ -694,7 +688,6 @@ __strncpy_from_user(char *__to, const ch
 {
 	long res;
 
-	might_sleep();
 	__asm__ __volatile__(
 		"move\t$4, %1\n\t"
 		"move\t$5, %2\n\t"
@@ -731,7 +724,6 @@ strncpy_from_user(char *__to, const char
 {
 	long res;
 
-	might_sleep();
 	__asm__ __volatile__(
 		"move\t$4, %1\n\t"
 		"move\t$5, %2\n\t"
@@ -750,7 +742,6 @@ static inline long __strlen_user(const c
 {
 	long res;
 
-	might_sleep();
 	__asm__ __volatile__(
 		"move\t$4, %1\n\t"
 		__MODULE_JAL(__strlen_user_nocheck_asm)
@@ -780,7 +771,6 @@ static inline long strlen_user(const cha
 {
 	long res;
 
-	might_sleep();
 	__asm__ __volatile__(
 		"move\t$4, %1\n\t"
 		__MODULE_JAL(__strlen_user_asm)
@@ -797,7 +787,6 @@ static inline long __strnlen_user(const 
 {
 	long res;
 
-	might_sleep();
 	__asm__ __volatile__(
 		"move\t$4, %1\n\t"
 		"move\t$5, %2\n\t"
@@ -828,7 +817,6 @@ static inline long strnlen_user(const ch
 {
 	long res;
 
-	might_sleep();
 	__asm__ __volatile__(
 		"move\t$4, %1\n\t"
 		"move\t$5, %2\n\t"
Index: linux-2.6.25.4-rt4/arch/x86/kernel/early_printk.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/early_printk.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/early_printk.c	2008-05-29 09:47:18.000000000 -0400
@@ -51,7 +51,7 @@ static void early_vga_write(struct conso
 static struct console early_vga_console = {
 	.name =		"earlyvga",
 	.write =	early_vga_write,
-	.flags =	CON_PRINTBUFFER,
+	.flags =	CON_PRINTBUFFER | CON_ATOMIC,
 	.index =	-1,
 };
 
@@ -147,7 +147,7 @@ static __init void early_serial_init(cha
 static struct console early_serial_console = {
 	.name =		"earlyser",
 	.write =	early_serial_write,
-	.flags =	CON_PRINTBUFFER,
+	.flags =	CON_PRINTBUFFER | CON_ATOMIC,
 	.index =	-1,
 };
 
@@ -188,7 +188,7 @@ static void simnow_write(struct console 
 static struct console simnow_console = {
 	.name =		"simnow",
 	.write =	simnow_write,
-	.flags =	CON_PRINTBUFFER,
+	.flags =	CON_PRINTBUFFER | CON_ATOMIC,
 	.index =	-1,
 };
 
@@ -198,7 +198,7 @@ static int early_console_initialized = 0
 
 void early_printk(const char *fmt, ...)
 {
-	char buf[512];
+	static char buf[512];
 	int n;
 	va_list ap;
 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/head64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/head64.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/head64.c	2008-05-29 09:46:37.000000000 -0400
@@ -27,7 +27,11 @@ static void __init zap_identity_mappings
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
 	pgd_clear(pgd);
-	__flush_tlb_all();
+	/*
+	 * preempt_disable/enable does not work this early in the
+	 * bootup yet:
+	 */
+	write_cr3(read_cr3());
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
Index: linux-2.6.25.4-rt4/arch/x86/kernel/signal_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/signal_64.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/signal_64.c	2008-05-29 09:46:37.000000000 -0400
@@ -419,6 +419,13 @@ static void do_signal(struct pt_regs *re
 	int signr;
 	sigset_t *oldset;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * We want the common case to go fast, which
 	 * is why we may in certain cases get here from
Index: linux-2.6.25.4-rt4/arch/x86/kernel/smp_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/smp_64.c	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/smp_64.c	2008-05-29 09:47:05.000000000 -0400
@@ -15,6 +15,7 @@
 #include <linux/delay.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
+#include <linux/cpumask.h>
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/interrupt.h>
@@ -55,7 +56,7 @@ union smp_flush_state {
 		cpumask_t flush_cpumask;
 		struct mm_struct *flush_mm;
 		unsigned long flush_va;
-		spinlock_t tlbstate_lock;
+		raw_spinlock_t tlbstate_lock;
 	};
 	char pad[SMP_CACHE_BYTES];
 } ____cacheline_aligned;
@@ -296,10 +297,28 @@ void smp_send_reschedule(int cpu)
 }
 
 /*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	send_IPI_allbutself(RESCHEDULE_VECTOR);
+}
+
+void smp_send_reschedule_allbutself_cpumask(cpumask_t mask)
+{
+	cpu_clear(smp_processor_id(), mask);
+	cpus_and(mask, mask, cpu_online_map);
+	if (!cpus_empty(mask))
+		send_IPI_mask(mask, RESCHEDULE_VECTOR);
+}
+
+/*
  * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
-static DEFINE_SPINLOCK(call_lock);
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 struct call_data_struct {
 	void (*func) (void *info);
Index: linux-2.6.25.4-rt4/include/asm-x86/acpi.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/acpi.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/acpi.h	2008-05-29 09:46:37.000000000 -0400
@@ -49,8 +49,8 @@
 
 #define ACPI_ASM_MACROS
 #define BREAKPOINT3
-#define ACPI_DISABLE_IRQS() local_irq_disable()
-#define ACPI_ENABLE_IRQS()  local_irq_enable()
+#define ACPI_DISABLE_IRQS() local_irq_disable_nort()
+#define ACPI_ENABLE_IRQS()  local_irq_enable_nort()
 #define ACPI_FLUSH_CPU_CACHE()	wbinvd()
 
 int __acpi_acquire_global_lock(unsigned int *lock);
Index: linux-2.6.25.4-rt4/include/asm-x86/hw_irq_64.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/hw_irq_64.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/hw_irq_64.h	2008-05-29 09:46:37.000000000 -0400
@@ -118,7 +118,7 @@ void i8254_timer_resume(void);
 typedef int vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
 extern void __setup_vector_irq(int cpu);
-extern spinlock_t vector_lock;
+extern raw_spinlock_t vector_lock;
 
 /*
  * Various low-level irq details needed by irq.c, process.c,
Index: linux-2.6.25.4-rt4/include/asm-x86/tlbflush.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/tlbflush.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/tlbflush.h	2008-05-29 09:46:41.000000000 -0400
@@ -7,6 +7,21 @@
 #include <asm/processor.h>
 #include <asm/system.h>
 
+/*
+ * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the
+ * following complex race scenario:
+ *
+ * if the current task is lazy-TLB and does a TLB flush and
+ * gets preempted after the movl %%r3, %0 but before the
+ * movl %0, %%cr3 then its ->active_mm might change and it will
+ * install the wrong cr3 when it switches back. This is not a
+ * problem for the lazy-TLB task itself, but if the next task it
+ * switches to has an ->mm that is also the lazy-TLB task's
+ * new ->active_mm, then the scheduler will assume that cr3 is
+ * the new one, while we overwrote it with the old one. The result
+ * is the wrong cr3 in the new (non-lazy-TLB) task, which typically
+ * causes an infinite pagefault upon the next userspace access.
+ */
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -17,17 +32,22 @@
 
 static inline void __native_flush_tlb(void)
 {
+	preempt_disable();
 	write_cr3(read_cr3());
+	preempt_enable();
 }
 
 static inline void __native_flush_tlb_global(void)
 {
-	unsigned long cr4 = read_cr4();
+	unsigned long cr4;
 
+	preempt_disable();
+	cr4 = read_cr4();
 	/* clear PGE */
 	write_cr4(cr4 & ~X86_CR4_PGE);
 	/* write old PGE again and flush TLBs */
 	write_cr4(cr4);
+	preempt_enable();
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
@@ -84,6 +104,13 @@ static inline void __flush_tlb_one(unsig
 
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
+	/*
+	 * This is safe on PREEMPT_RT because if we preempt
+	 * right after the check but before the __flush_tlb(),
+	 * and if ->active_mm changes, then we might miss a
+	 * TLB flush, but that TLB flush happened already when
+	 * ->active_mm was changed:
+	 */
 	if (mm == current->active_mm)
 		__flush_tlb();
 }
Index: linux-2.6.25.4-rt4/include/asm-x86/vgtod.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/vgtod.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/vgtod.h	2008-05-29 09:46:37.000000000 -0400
@@ -5,7 +5,7 @@
 #include <linux/clocksource.h>
 
 struct vsyscall_gtod_data {
-	seqlock_t	lock;
+	raw_seqlock_t	lock;
 
 	/* open coded 'struct timespec' */
 	time_t		wall_time_sec;
Index: linux-2.6.25.4-rt4/include/asm-x86/i8259.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/i8259.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/i8259.h	2008-05-29 09:46:37.000000000 -0400
@@ -22,7 +22,7 @@ extern unsigned int cached_irq_mask;
 #define SLAVE_ICW4_DEFAULT	0x01
 #define PIC_ICW4_AEOI		2
 
-extern spinlock_t i8259A_lock;
+extern raw_spinlock_t i8259A_lock;
 
 extern void init_8259A(int auto_eoi);
 extern void enable_8259A_irq(unsigned int irq);
Index: linux-2.6.25.4-rt4/arch/ia64/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/Kconfig	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/Kconfig	2008-05-29 09:46:38.000000000 -0400
@@ -53,6 +53,7 @@ config GENERIC_LOCKBREAK
 
 config RWSEM_XCHGADD_ALGORITHM
 	bool
+	depends on !PREEMPT_RT
 	default y
 
 config ARCH_HAS_ILOG2_U32
@@ -301,6 +302,69 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+
+config GENERIC_TIME
+       bool
+       default y
+
+config HIGH_RES_TIMERS
+	bool "High-Resolution Timers"
+	help
+
+	  POSIX timers are available by default.  This option enables
+	  high-resolution POSIX timers.  With this option the resolution
+	  is at least 1 microsecond.  High resolution is not free.  If
+	  enabled this option will add a small overhead each time a
+	  timer expires that is not on a 1/HZ tick boundary.  If no such
+	  timers are used the overhead is nil.
+
+	  This option enables two additional POSIX CLOCKS,
+	  CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR.  Note that this
+	  option does not change the resolution of CLOCK_REALTIME or
+	  CLOCK_MONOTONIC which remain at 1/HZ resolution.
+
+config HIGH_RES_RESOLUTION
+	int "High-Resolution-Timer resolution (nanoseconds)"
+	depends on HIGH_RES_TIMERS
+	default 1000
+	help
+
+	  This sets the resolution of timers accessed with
+          CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR.  Too
+	  fine a resolution (small a number) will usually not
+          be observable due to normal system latencies.  For an
+          800 MHZ processor about 10,000 is the recommended maximum
+	  (smallest number).  If you don't need that sort of resolution,
+	  higher numbers may generate less overhead.
+
+choice
+	prompt "Clock source"
+	depends on HIGH_RES_TIMERS
+	default HIGH_RES_TIMER_ITC
+	help
+	  This option allows you to choose the hardware source in charge
+	  of generating high precision interruptions on your system.
+	  On IA-64 these are:
+
+	  <timer>				<resolution>
+	  ITC Interval Time Counter		1/CPU clock
+	  HPET High Precision Event Timer	~ (XXX:have to check the spec)
+
+	  The ITC timer is available on all the ia64 computers because
+	  it is integrated directly into the processor. However it may not
+	  give correct results on MP machines with processors running
+	  at different clock rates. In this case you may want to use
+	  the HPET if available on your machine.
+
+
+config HIGH_RES_TIMER_ITC
+	bool "Interval Time Counter/ITC"
+
+config HIGH_RES_TIMER_HPET
+	bool "High Precision Event Timer/HPET"
+
+endchoice
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-1024)"
 	range 2 1024
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/asm-offsets.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/asm-offsets.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/asm-offsets.c	2008-05-29 09:46:38.000000000 -0400
@@ -257,6 +257,7 @@ void foo(void)
 	       offsetof (struct pal_min_state_area_s, pmsa_xip));
 	BLANK();
 
+#ifdef CONFIG_TIME_INTERPOLATION
 	/* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */
 	DEFINE(IA64_GTOD_LOCK_OFFSET,
 		offsetof (struct fsyscall_gtod_data_t, lock));
@@ -278,4 +279,5 @@ void foo(void)
 		offsetof (struct itc_jitter_data_t, itc_jitter));
 	DEFINE(IA64_ITC_LASTCYCLE_OFFSET,
 		offsetof (struct itc_jitter_data_t, itc_lastcycle));
+#endif
 }
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/entry.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/entry.S	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/entry.S	2008-05-29 09:46:38.000000000 -0400
@@ -1098,23 +1098,24 @@ skip_rbs_switch:
 	st8 [r2]=r8
 	st8 [r3]=r10
 .work_pending:
-	tbit.z p6,p0=r31,TIF_NEED_RESCHED		// current_thread_info()->need_resched==0?
+	tbit.nz p6,p0=r31,TIF_NEED_RESCHED		// current_thread_info()->need_resched==0?
+(p6)	br.cond.sptk.few .needresched
+	tbit.z p6,p0=r31,TIF_NEED_RESCHED_DELAYED	// current_thread_info()->need_resched_delayed==0?
 (p6)	br.cond.sptk.few .notify
-#ifdef CONFIG_PREEMPT
-(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
+
+.needresched:
+
+(pKStk) br.cond.sptk.many .fromkernel
 	;;
-(pKStk) st4 [r20]=r21
 	ssm psr.i		// enable interrupts
-#endif
 	br.call.spnt.many rp=schedule
-.ret9:	cmp.eq p6,p0=r0,r0				// p6 <- 1
-	rsm psr.i		// disable interrupts
-	;;
-#ifdef CONFIG_PREEMPT
-(pKStk)	adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
+.ret9a:	rsm psr.i		// disable interrupts
 	;;
-(pKStk)	st4 [r20]=r0		// preempt_count() <- 0
-#endif
+	br.cond.sptk.many .endpreemptdep
+.fromkernel:
+	br.call.spnt.many rp=preempt_schedule_irq
+.ret9b:	rsm psr.i		// disable interrupts
+.endpreemptdep:
 (pLvSys)br.cond.sptk.few  .work_pending_syscall_end
 	br.cond.sptk.many .work_processed_kernel	// re-check
 
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/fsys.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/fsys.S	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/fsys.S	2008-05-29 09:46:38.000000000 -0400
@@ -26,6 +26,7 @@
 
 #include "entry.h"
 
+#ifdef CONFIG_TIME_INTERPOLATION
 /*
  * See Documentation/ia64/fsys.txt for details on fsyscalls.
  *
@@ -349,6 +350,26 @@ ENTRY(fsys_clock_gettime)
 	br.many .gettime
 END(fsys_clock_gettime)
 
+
+#else // !CONFIG_TIME_INTERPOLATION
+
+# define fsys_gettimeofday 0
+# define fsys_clock_gettime 0
+
+.fail_einval:
+	mov r8 = EINVAL
+	mov r10 = -1
+	FSYS_RETURN
+
+.fail_efault:
+	mov r8 = EFAULT
+	mov r10 = -1
+	FSYS_RETURN
+
+#endif
+
+
+
 /*
  * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize).
  */
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/iosapic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/iosapic.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/iosapic.c	2008-05-29 09:46:38.000000000 -0400
@@ -111,7 +111,7 @@
 	(PAGE_SIZE / sizeof(struct iosapic_rte_info))
 #define RTE_PREALLOCATED	(1)
 
-static DEFINE_SPINLOCK(iosapic_lock);
+static DEFINE_RAW_SPINLOCK(iosapic_lock);
 
 /*
  * These tables map IA-64 vectors to the IOSAPIC pin that generates this
@@ -390,6 +390,34 @@ iosapic_startup_level_irq (unsigned int 
 	return 0;
 }
 
+/*
+ * In the preemptible case mask the IRQ first then handle it and ack it.
+ */
+#ifdef CONFIG_PREEMPT_HARDIRQS
+
+static void
+iosapic_ack_level_irq (unsigned int irq)
+{
+	ia64_vector vec = irq_to_vector(irq);
+	struct iosapic_rte_info *rte;
+
+	move_irq(irq);
+	mask_irq(irq);
+	list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list)
+		iosapic_eoi(rte->addr, vec);
+}
+
+static void
+iosapic_end_level_irq (unsigned int irq)
+{
+	if (!(irq_desc[irq].status & IRQ_INPROGRESS))
+		unmask_irq(irq);
+}
+
+#else /* !CONFIG_PREEMPT_HARDIRQS */
+
+#define iosapic_ack_level_irq		nop
+
 static void
 iosapic_end_level_irq (unsigned int irq)
 {
@@ -412,10 +440,11 @@ iosapic_end_level_irq (unsigned int irq)
 	}
 }
 
+#endif
+
 #define iosapic_shutdown_level_irq	mask_irq
 #define iosapic_enable_level_irq	unmask_irq
 #define iosapic_disable_level_irq	mask_irq
-#define iosapic_ack_level_irq		nop
 
 static struct irq_chip irq_type_iosapic_level = {
 	.name =		"IO-SAPIC-level",
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/mca.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/mca.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/mca.c	2008-05-29 09:46:38.000000000 -0400
@@ -331,7 +331,7 @@ ia64_mca_spin(const char *func)
 
 typedef struct ia64_state_log_s
 {
-	spinlock_t	isl_lock;
+	raw_spinlock_t	isl_lock;
 	int		isl_index;
 	unsigned long	isl_count;
 	ia64_err_rec_t  *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/perfmon.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/perfmon.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/perfmon.c	2008-05-29 09:46:38.000000000 -0400
@@ -280,7 +280,7 @@ typedef struct {
  */
 
 typedef struct pfm_context {
-	spinlock_t		ctx_lock;		/* context protection */
+	raw_spinlock_t		ctx_lock;		/* context protection */
 
 	pfm_context_flags_t	ctx_flags;		/* bitmask of flags  (block reason incl.) */
 	unsigned int		ctx_state;		/* state: active/inactive (no bitfield) */
@@ -369,7 +369,7 @@ typedef struct pfm_context {
  * mostly used to synchronize between system wide and per-process
  */
 typedef struct {
-	spinlock_t		pfs_lock;		   /* lock the structure */
+	raw_spinlock_t		pfs_lock;		   /* lock the structure */
 
 	unsigned int		pfs_task_sessions;	   /* number of per task sessions */
 	unsigned int		pfs_sys_sessions;	   /* number of per system wide sessions */
@@ -510,7 +510,7 @@ static pfm_intr_handler_desc_t  *pfm_alt
 static struct proc_dir_entry 	*perfmon_dir;
 static pfm_uuid_t		pfm_null_uuid = {0,};
 
-static spinlock_t		pfm_buffer_fmt_lock;
+static raw_spinlock_t		pfm_buffer_fmt_lock;
 static LIST_HEAD(pfm_buffer_fmt_list);
 
 static pmu_config_t		*pmu_conf;
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/process.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/process.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/process.c	2008-05-29 09:46:38.000000000 -0400
@@ -94,6 +94,9 @@ show_stack (struct task_struct *task, un
 void
 dump_stack (void)
 {
+	if (irqs_disabled()) {
+		printk("Uh oh.. entering dump_stack() with irqs disabled.\n");
+	}
 	show_stack(NULL, NULL);
 }
 
@@ -214,7 +217,7 @@ void
 default_idle (void)
 {
 	local_irq_enable();
-	while (!need_resched()) {
+	while (!need_resched() && !need_resched_delayed()) {
 		if (can_do_pal_halt) {
 			local_irq_disable();
 			if (!need_resched()) {
@@ -292,7 +295,7 @@ cpu_idle (void)
 			current_thread_info()->status |= TS_POLLING;
 		}
 
-		if (!need_resched()) {
+		if (!need_resched() && !need_resched_delayed()) {
 			void (*idle)(void);
 #ifdef CONFIG_SMP
 			min_xtp();
@@ -311,10 +314,11 @@ cpu_idle (void)
 			normal_xtp();
 #endif
 		}
-		preempt_enable_no_resched();
-		schedule();
+		__preempt_enable_no_resched();
+		__schedule();
+
 		preempt_disable();
-		check_pgt_cache();
+
 		if (cpu_is_offline(cpu))
 			play_dead();
 	}
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/sal.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/sal.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/sal.c	2008-05-29 09:46:38.000000000 -0400
@@ -18,7 +18,7 @@
 #include <asm/sal.h>
 #include <asm/pal.h>
 
- __cacheline_aligned DEFINE_SPINLOCK(sal_lock);
+ __cacheline_aligned DEFINE_RAW_SPINLOCK(sal_lock);
 unsigned long sal_platform_features;
 
 unsigned short sal_revision;
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/salinfo.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/salinfo.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/salinfo.c	2008-05-29 09:46:38.000000000 -0400
@@ -140,7 +140,7 @@ enum salinfo_state {
 
 struct salinfo_data {
 	cpumask_t		cpu_event;	/* which cpus have outstanding events */
-	struct semaphore	mutex;
+	struct compat_semaphore	mutex;
 	u8			*log_buffer;
 	u64			log_size;
 	u8			*oemdata;	/* decoded oem data */
@@ -156,8 +156,8 @@ struct salinfo_data {
 
 static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)];
 
-static DEFINE_SPINLOCK(data_lock);
-static DEFINE_SPINLOCK(data_saved_lock);
+static DEFINE_RAW_SPINLOCK(data_lock);
+static DEFINE_RAW_SPINLOCK(data_saved_lock);
 
 /** salinfo_platform_oemdata - optional callback to decode oemdata from an error
  * record.
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/semaphore.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/semaphore.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/semaphore.c	2008-05-29 09:46:38.000000000 -0400
@@ -40,12 +40,12 @@
  */
 
 void
-__up (struct semaphore *sem)
+__up (struct compat_semaphore *sem)
 {
 	wake_up(&sem->wait);
 }
 
-void __sched __down (struct semaphore *sem)
+void __sched __down (struct compat_semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -82,7 +82,7 @@ void __sched __down (struct semaphore *s
 	tsk->state = TASK_RUNNING;
 }
 
-int __sched __down_interruptible (struct semaphore * sem)
+int __sched __down_interruptible (struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -142,7 +142,7 @@ int __sched __down_interruptible (struct
  * count.
  */
 int
-__down_trylock (struct semaphore *sem)
+__down_trylock (struct compat_semaphore *sem)
 {
 	unsigned long flags;
 	int sleepers;
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/signal.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/signal.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/signal.c	2008-05-29 09:46:38.000000000 -0400
@@ -456,6 +456,14 @@ ia64_do_signal (struct sigscratch *scr, 
 	long errno = scr->pt.r8;
 #	define ERR_CODE(c)	(IS_IA32_PROCESS(&scr->pt) ? -(c) : (c))
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	local_irq_enable();
+	preempt_check_resched();
+#endif
+
 	/*
 	 * In the ia64_leave_kernel code path, we want the common case to go fast, which
 	 * is why we may in certain cases get here from kernel mode. Just return without
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/smp.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/smp.c	2008-05-29 09:46:38.000000000 -0400
@@ -261,6 +261,22 @@ smp_send_reschedule (int cpu)
 }
 
 /*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	unsigned int cpu;
+
+	for_each_online_cpu(cpu) {
+		if (cpu != smp_processor_id())
+			platform_send_ipi(cpu, IA64_IPI_RESCHEDULE,
+					  IA64_IPI_DM_INT, 0);
+	}
+}
+
+/*
  * Called with preemption disabled.
  */
 static void
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/smpboot.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/smpboot.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/smpboot.c	2008-05-29 09:46:38.000000000 -0400
@@ -371,6 +371,8 @@ smp_setup_percpu_timer (void)
 {
 }
 
+extern void register_itc_clockevent(void);
+
 static void __cpuinit
 smp_callin (void)
 {
@@ -449,6 +451,7 @@ smp_callin (void)
 #ifdef CONFIG_IA32_SUPPORT
 	ia32_gdt_init();
 #endif
+	register_itc_clockevent();
 
 	/*
 	 * Allow the master to continue.
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/time.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/time.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/time.c	2008-05-29 09:46:38.000000000 -0400
@@ -70,6 +70,7 @@ timer_interrupt (int irq, void *dev_id)
 
 	platform_timer_interrupt(irq, dev_id);
 
+#if 0
 	new_itm = local_cpu_data->itm_next;
 
 	if (!time_after(ia64_get_itc(), new_itm))
@@ -77,29 +78,48 @@ timer_interrupt (int irq, void *dev_id)
 		       ia64_get_itc(), new_itm);
 
 	profile_tick(CPU_PROFILING);
+#endif
+
+	if (time_after(ia64_get_itc(), local_cpu_data->itm_tick_next)) {
 
-	while (1) {
-		update_process_times(user_mode(get_irq_regs()));
+		unsigned long new_tick_itm;
+		new_tick_itm = local_cpu_data->itm_tick_next;
 
-		new_itm += local_cpu_data->itm_delta;
+		profile_tick(CPU_PROFILING, get_irq_regs());
 
-		if (smp_processor_id() == time_keeper_id) {
-			/*
-			 * Here we are in the timer irq handler. We have irqs locally
-			 * disabled, but we don't know if the timer_bh is running on
-			 * another CPU. We need to avoid to SMP race by acquiring the
-			 * xtime_lock.
-			 */
-			write_seqlock(&xtime_lock);
-			do_timer(1);
-			local_cpu_data->itm_next = new_itm;
-			write_sequnlock(&xtime_lock);
-		} else
-			local_cpu_data->itm_next = new_itm;
+		while (1) {
+			update_process_times(user_mode(get_irq_regs()));
+
+			new_tick_itm += local_cpu_data->itm_tick_delta;
+
+			if (smp_processor_id() == time_keeper_id) {
+				/*
+				 * Here we are in the timer irq handler. We have irqs locally
+				 * disabled, but we don't know if the timer_bh is running on
+				 * another CPU. We need to avoid to SMP race by acquiring the
+				 * xtime_lock.
+				 */
+				write_seqlock(&xtime_lock);
+				do_timer(get_irq_regs());
+				local_cpu_data->itm_tick_next = new_tick_itm;
+				write_sequnlock(&xtime_lock);
+			} else
+				local_cpu_data->itm_tick_next = new_tick_itm;
+
+			if (time_after(new_tick_itm, ia64_get_itc()))
+				break;
+		}
+	}
 
-		if (time_after(new_itm, ia64_get_itc()))
-			break;
+	if (time_after(ia64_get_itc(), local_cpu_data->itm_timer_next)) {
+		if (itc_clockevent.event_handler)
+			itc_clockevent.event_handler(get_irq_regs());
 
+		// FIXME, really, please
+		new_itm = local_cpu_data->itm_tick_next;
+
+		if (time_after(new_itm, local_cpu_data->itm_timer_next))
+			new_itm = local_cpu_data->itm_timer_next;
 		/*
 		 * Allow IPIs to interrupt the timer loop.
 		 */
@@ -117,8 +137,8 @@ timer_interrupt (int irq, void *dev_id)
 		 * too fast (with the potentially devastating effect
 		 * of losing monotony of time).
 		 */
-		while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2))
-			new_itm += local_cpu_data->itm_delta;
+		while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_tick_delta/2))
+			new_itm += local_cpu_data->itm_tick_delta;
 		ia64_set_itm(new_itm);
 		/* double check, in case we got hit by a (slow) PMI: */
 	} while (time_after_eq(ia64_get_itc(), new_itm));
@@ -137,7 +157,7 @@ ia64_cpu_local_tick (void)
 	/* arrange for the cycle counter to generate a timer interrupt: */
 	ia64_set_itv(IA64_TIMER_VECTOR);
 
-	delta = local_cpu_data->itm_delta;
+	delta = local_cpu_data->itm_tick_delta;
 	/*
 	 * Stagger the timer tick for each CPU so they don't occur all at (almost) the
 	 * same time:
@@ -146,8 +166,8 @@ ia64_cpu_local_tick (void)
 		unsigned long hi = 1UL << ia64_fls(cpu);
 		shift = (2*(cpu - hi) + 1) * delta/hi/2;
 	}
-	local_cpu_data->itm_next = ia64_get_itc() + delta + shift;
-	ia64_set_itm(local_cpu_data->itm_next);
+	local_cpu_data->itm_tick_next = ia64_get_itc() + delta + shift;
+	ia64_set_itm(local_cpu_data->itm_tick_next);
 }
 
 static int nojitter;
@@ -205,7 +225,7 @@ ia64_init_itm (void)
 
 	itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den;
 
-	local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ;
+	local_cpu_data->itm_tick_delta = (itc_freq + HZ/2) / HZ;
 	printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, "
 	       "ITC freq=%lu.%03luMHz", smp_processor_id(),
 	       platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000,
@@ -225,6 +245,7 @@ ia64_init_itm (void)
 	local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT)
 					+ itc_freq/2)/itc_freq;
 
+#ifdef CONFIG_TIME_INTERPOLATION
 	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
 #ifdef CONFIG_SMP
 		/* On IA64 in an SMP configuration ITCs are never accurately synchronized.
@@ -297,7 +318,7 @@ static cycle_t itc_get_cycles(void)
 
 static struct irqaction timer_irqaction = {
 	.handler =	timer_interrupt,
-	.flags =	IRQF_DISABLED | IRQF_IRQPOLL,
+	.flags =	IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NODELAY,
 	.name =		"timer"
 };
 
@@ -318,6 +339,8 @@ time_init (void)
 	 * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC).
 	 */
 	set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec);
+	register_itc_clocksource();
+	register_itc_clockevent();
 }
 
 /*
@@ -375,6 +398,7 @@ void update_vsyscall(struct timespec *wa
 		fsyscall_gtod_data.monotonic_time.tv_nsec -= NSEC_PER_SEC;
 		fsyscall_gtod_data.monotonic_time.tv_sec++;
 	}
+#endif
 
         write_sequnlock_irqrestore(&fsyscall_gtod_data.lock, flags);
 }
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/traps.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/traps.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/traps.c	2008-05-29 09:46:38.000000000 -0400
@@ -39,11 +39,11 @@ int
 die (const char *str, struct pt_regs *regs, long err)
 {
 	static struct {
-		spinlock_t lock;
+		raw_spinlock_t lock;
 		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
-		.lock =	__SPIN_LOCK_UNLOCKED(die.lock),
+		.lock =	RAW_SPIN_LOCK_UNLOCKED(die.lock),
 		.lock_owner = -1,
 		.lock_owner_depth = 0
 	};
@@ -191,7 +191,7 @@ __kprobes ia64_bad_break (unsigned long 
  * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes
  * care of clearing psr.dfh.
  */
-static inline void
+void
 disabled_fph_fault (struct pt_regs *regs)
 {
 	struct ia64_psr *psr = ia64_psr(regs);
@@ -210,7 +210,7 @@ disabled_fph_fault (struct pt_regs *regs
 			= (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER);
 
 		if (ia64_is_local_fpu_owner(current)) {
-			preempt_enable_no_resched();
+			__preempt_enable_no_resched();
 			return;
 		}
 
@@ -230,7 +230,7 @@ disabled_fph_fault (struct pt_regs *regs
 		 */
 		psr->mfh = 1;
 	}
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 }
 
 static inline int
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/unwind.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/unwind.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/unwind.c	2008-05-29 09:46:38.000000000 -0400
@@ -82,7 +82,7 @@ typedef unsigned long unw_word;
 typedef unsigned char unw_hash_index_t;
 
 static struct {
-	spinlock_t lock;			/* spinlock for unwind data */
+	raw_spinlock_t lock;			/* spinlock for unwind data */
 
 	/* list of unwind tables (one per load-module) */
 	struct unw_table *tables;
@@ -146,7 +146,7 @@ static struct {
 # endif
 } unw = {
 	.tables = &unw.kernel_table,
-	.lock = __SPIN_LOCK_UNLOCKED(unw.lock),
+	.lock = RAW_SPIN_LOCK_UNLOCKED(unw.lock),
 	.save_order = {
 		UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR,
 		UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR
Index: linux-2.6.25.4-rt4/arch/ia64/kernel/unwind_i.h
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/kernel/unwind_i.h	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/kernel/unwind_i.h	2008-05-29 09:46:38.000000000 -0400
@@ -154,7 +154,7 @@ struct unw_script {
 	unsigned long ip;		/* ip this script is for */
 	unsigned long pr_mask;		/* mask of predicates script depends on */
 	unsigned long pr_val;		/* predicate values this script is for */
-	rwlock_t lock;
+	raw_rwlock_t lock;
 	unsigned int flags;		/* see UNW_FLAG_* in unwind.h */
 	unsigned short lru_chain;	/* used for least-recently-used chain */
 	unsigned short coll_chain;	/* used for hash collisions */
Index: linux-2.6.25.4-rt4/arch/ia64/mm/init.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/mm/init.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/mm/init.c	2008-05-29 09:46:38.000000000 -0400
@@ -37,7 +37,7 @@
 #include <asm/unistd.h>
 #include <asm/mca.h>
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 extern void ia64_tlb_init (void);
 
Index: linux-2.6.25.4-rt4/arch/ia64/mm/tlb.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ia64/mm/tlb.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ia64/mm/tlb.c	2008-05-29 09:46:38.000000000 -0400
@@ -33,7 +33,7 @@ static struct {
 } purge;
 
 struct ia64_ctx ia64_ctx = {
-	.lock =	__SPIN_LOCK_UNLOCKED(ia64_ctx.lock),
+	.lock =	RAW_SPIN_LOCK_UNLOCKED(ia64_ctx.lock),
 	.next =	1,
 	.max_ctx = ~0U
 };
Index: linux-2.6.25.4-rt4/include/asm-ia64/irqflags.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/asm-ia64/irqflags.h	2008-05-29 09:46:38.000000000 -0400
@@ -0,0 +1,95 @@
+
+/*
+ * include/asm-i64/irqflags.h
+ *
+ * IRQ flags handling
+ *
+ * This file gets included from lowlevel asm headers too, to provide
+ * wrapped versions of the local_irq_*() APIs, based on the
+ * raw_local_irq_*() macros from the lowlevel headers.
+ */
+#ifndef _ASM_IRQFLAGS_H
+#define _ASM_IRQFLAGS_H
+
+/* For spinlocks etc */
+
+/*
+ * - clearing psr.i is implicitly serialized (visible by next insn)
+ * - setting psr.i requires data serialization
+ * - we need a stop-bit before reading PSR because we sometimes
+ *   write a floating-point register right before reading the PSR
+ *   and that writes to PSR.mfl
+ */
+#define __local_irq_save(x)			\
+do {						\
+	ia64_stop();				\
+	(x) = ia64_getreg(_IA64_REG_PSR);	\
+	ia64_stop();				\
+	ia64_rsm(IA64_PSR_I);			\
+} while (0)
+
+#define __local_irq_disable()			\
+do {						\
+	ia64_stop();				\
+	ia64_rsm(IA64_PSR_I);			\
+} while (0)
+
+#define __local_irq_restore(x)	ia64_intrin_local_irq_restore((x) & IA64_PSR_I)
+
+#ifdef CONFIG_IA64_DEBUG_IRQ
+
+  extern unsigned long last_cli_ip;
+
+# define __save_ip()		last_cli_ip = ia64_getreg(_IA64_REG_IP)
+
+# define raw_local_irq_save(x)					\
+do {								\
+	unsigned long psr;					\
+								\
+	__local_irq_save(psr);					\
+	if (psr & IA64_PSR_I)					\
+		__save_ip();					\
+	(x) = psr;						\
+} while (0)
+
+# define raw_local_irq_disable()	do { unsigned long x; local_irq_save(x); } while (0)
+
+# define raw_local_irq_restore(x)					\
+do {								\
+	unsigned long old_psr, psr = (x);			\
+								\
+	local_save_flags(old_psr);				\
+	__local_irq_restore(psr);				\
+	if ((old_psr & IA64_PSR_I) && !(psr & IA64_PSR_I))	\
+		__save_ip();					\
+} while (0)
+
+#else /* !CONFIG_IA64_DEBUG_IRQ */
+# define raw_local_irq_save(x)	__local_irq_save(x)
+# define raw_local_irq_disable()	__local_irq_disable()
+# define raw_local_irq_restore(x)	__local_irq_restore(x)
+#endif /* !CONFIG_IA64_DEBUG_IRQ */
+
+#define raw_local_irq_enable()	({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); })
+#define raw_local_save_flags(flags)	({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); })
+
+#define raw_irqs_disabled()				\
+({						\
+	unsigned long __ia64_id_flags;		\
+	local_save_flags(__ia64_id_flags);	\
+	(__ia64_id_flags & IA64_PSR_I) == 0;	\
+})
+
+#define raw_irqs_disabled_flags(flags) ((flags & IA64_PSR_I) == 0)
+
+
+#define raw_safe_halt()         ia64_pal_halt_light()    /* PAL_HALT_LIGHT */
+
+/* TBD... */
+# define TRACE_IRQS_ON
+# define TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_STR
+# define TRACE_IRQS_OFF_STR
+
+#endif
+
Index: linux-2.6.25.4-rt4/include/asm-ia64/mmu_context.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/mmu_context.h	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/mmu_context.h	2008-05-29 09:46:38.000000000 -0400
@@ -32,7 +32,7 @@
 #include <asm-generic/mm_hooks.h>
 
 struct ia64_ctx {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	unsigned int next;	/* next context number to use */
 	unsigned int limit;     /* available free range */
 	unsigned int max_ctx;   /* max. context value supported by all CPUs */
Index: linux-2.6.25.4-rt4/include/asm-ia64/processor.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/processor.h	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/processor.h	2008-05-29 09:46:38.000000000 -0400
@@ -125,8 +125,10 @@ struct ia64_psr {
  */
 struct cpuinfo_ia64 {
 	__u32 softirq_pending;
-	__u64 itm_delta;	/* # of clock cycles between clock ticks */
-	__u64 itm_next;		/* interval timer mask value to use for next clock tick */
+	__u64 itm_tick_delta;	/* # of clock cycles between clock ticks */
+	__u64 itm_tick_next;	/* interval timer mask value to use for next clock tick */
+	__u64 itm_timer_next;
+	__u64 __itm_next;
 	__u64 nsec_per_cyc;	/* (1000000000<<IA64_NSEC_PER_CYC_SHIFT)/itc_freq */
 	__u64 unimpl_va_mask;	/* mask of unimplemented virtual address bits (from PAL) */
 	__u64 unimpl_pa_mask;	/* mask of unimplemented physical address bits (from PAL) */
Index: linux-2.6.25.4-rt4/include/asm-ia64/rtc.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/asm-ia64/rtc.h	2008-05-29 09:46:38.000000000 -0400
@@ -0,0 +1,7 @@
+#ifndef _IA64_RTC_H
+#define _IA64_RTC_H
+
+#error "no asm/rtc.h on IA64 !"
+
+#endif
+
Index: linux-2.6.25.4-rt4/include/asm-ia64/rwsem.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/rwsem.h	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/rwsem.h	2008-05-29 09:46:38.000000000 -0400
@@ -33,7 +33,7 @@
 /*
  * the semaphore definition
  */
-struct rw_semaphore {
+struct compat_rw_semaphore {
 	signed long		count;
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
@@ -50,16 +50,16 @@ struct rw_semaphore {
 	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
 	  LIST_HEAD_INIT((name).wait_list) }
 
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+#define COMPAT_DECLARE_RWSEM(name) \
+	struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem);
 
 static inline void
-init_rwsem (struct rw_semaphore *sem)
+compat_init_rwsem (struct compat_rw_semaphore *sem)
 {
 	sem->count = RWSEM_UNLOCKED_VALUE;
 	spin_lock_init(&sem->wait_lock);
@@ -70,7 +70,7 @@ init_rwsem (struct rw_semaphore *sem)
  * lock for reading
  */
 static inline void
-__down_read (struct rw_semaphore *sem)
+__down_read (struct compat_rw_semaphore *sem)
 {
 	long result = ia64_fetchadd8_acq((unsigned long *)&sem->count, 1);
 
@@ -82,7 +82,7 @@ __down_read (struct rw_semaphore *sem)
  * lock for writing
  */
 static inline void
-__down_write (struct rw_semaphore *sem)
+__down_write (struct compat_rw_semaphore *sem)
 {
 	long old, new;
 
@@ -99,7 +99,7 @@ __down_write (struct rw_semaphore *sem)
  * unlock after reading
  */
 static inline void
-__up_read (struct rw_semaphore *sem)
+__up_read (struct compat_rw_semaphore *sem)
 {
 	long result = ia64_fetchadd8_rel((unsigned long *)&sem->count, -1);
 
@@ -111,7 +111,7 @@ __up_read (struct rw_semaphore *sem)
  * unlock after writing
  */
 static inline void
-__up_write (struct rw_semaphore *sem)
+__up_write (struct compat_rw_semaphore *sem)
 {
 	long old, new;
 
@@ -128,7 +128,7 @@ __up_write (struct rw_semaphore *sem)
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
 static inline int
-__down_read_trylock (struct rw_semaphore *sem)
+__down_read_trylock (struct compat_rw_semaphore *sem)
 {
 	long tmp;
 	while ((tmp = sem->count) >= 0) {
@@ -143,7 +143,7 @@ __down_read_trylock (struct rw_semaphore
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
 static inline int
-__down_write_trylock (struct rw_semaphore *sem)
+__down_write_trylock (struct compat_rw_semaphore *sem)
 {
 	long tmp = cmpxchg_acq(&sem->count, RWSEM_UNLOCKED_VALUE,
 			      RWSEM_ACTIVE_WRITE_BIAS);
@@ -154,7 +154,7 @@ __down_write_trylock (struct rw_semaphor
  * downgrade write lock to read lock
  */
 static inline void
-__downgrade_write (struct rw_semaphore *sem)
+__downgrade_write (struct compat_rw_semaphore *sem)
 {
 	long old, new;
 
@@ -174,7 +174,7 @@ __downgrade_write (struct rw_semaphore *
 #define rwsem_atomic_add(delta, sem)	atomic64_add(delta, (atomic64_t *)(&(sem)->count))
 #define rwsem_atomic_update(delta, sem)	atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
+static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem)
 {
 	return (sem->count != 0);
 }
Index: linux-2.6.25.4-rt4/include/asm-ia64/sal.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/sal.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/sal.h	2008-05-29 09:46:38.000000000 -0400
@@ -43,7 +43,7 @@
 #include <asm/system.h>
 #include <asm/fpu.h>
 
-extern spinlock_t sal_lock;
+extern raw_spinlock_t sal_lock;
 
 /* SAL spec _requires_ eight args for each call. */
 #define __IA64_FW_CALL(entry,result,a0,a1,a2,a3,a4,a5,a6,a7)	\
Index: linux-2.6.25.4-rt4/include/asm-ia64/semaphore.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/semaphore.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/semaphore.h	2008-05-29 09:46:38.000000000 -0400
@@ -11,53 +11,64 @@
 
 #include <asm/atomic.h>
 
-struct semaphore {
+/*
+ * On !PREEMPT_RT all semaphores are compat:
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
+struct compat_semaphore {
 	atomic_t count;
 	int sleepers;
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.sleepers	= 0,						\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __DECLARE_SEMAPHORE_GENERIC(name,count)					\
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name, count)
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count)					\
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name, count)
 
-#define DECLARE_MUTEX(name)		__DECLARE_SEMAPHORE_GENERIC(name, 1)
+#define COMPAT_DECLARE_MUTEX(name)		__COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1)
+
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
+
+asmlinkage int compat_sem_is_locked(struct compat_semaphore *sem);
 
 static inline void
-sema_init (struct semaphore *sem, int val)
+compat_sema_init (struct compat_semaphore *sem, int val)
 {
-	*sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
+	*sem = (struct compat_semaphore) __COMPAT_SEMAPHORE_INITIALIZER(*sem, val);
 }
 
 static inline void
-init_MUTEX (struct semaphore *sem)
+compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
 static inline void
-init_MUTEX_LOCKED (struct semaphore *sem)
+compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-extern void __down (struct semaphore * sem);
-extern int  __down_interruptible (struct semaphore * sem);
-extern int  __down_trylock (struct semaphore * sem);
-extern void __up (struct semaphore * sem);
+extern void __down (struct compat_semaphore * sem);
+extern int  __down_interruptible (struct compat_semaphore * sem);
+extern int  __down_trylock (struct compat_semaphore * sem);
+extern void __up (struct compat_semaphore * sem);
 
 /*
  * Atomically decrement the semaphore's count.  If it goes negative,
  * block the calling thread in the TASK_UNINTERRUPTIBLE state.
  */
 static inline void
-down (struct semaphore *sem)
+compat_down (struct compat_semaphore *sem)
 {
 	might_sleep();
 	if (ia64_fetchadd(-1, &sem->count.counter, acq) < 1)
@@ -69,7 +80,7 @@ down (struct semaphore *sem)
  * block the calling thread in the TASK_INTERRUPTIBLE state.
  */
 static inline int
-down_interruptible (struct semaphore * sem)
+compat_down_interruptible (struct compat_semaphore * sem)
 {
 	int ret = 0;
 
@@ -80,7 +91,7 @@ down_interruptible (struct semaphore * s
 }
 
 static inline int
-down_trylock (struct semaphore *sem)
+compat_down_trylock (struct compat_semaphore *sem)
 {
 	int ret = 0;
 
@@ -90,10 +101,12 @@ down_trylock (struct semaphore *sem)
 }
 
 static inline void
-up (struct semaphore * sem)
+compat_up (struct compat_semaphore * sem)
 {
 	if (ia64_fetchadd(1, &sem->count.counter, rel) <= -1)
 		__up(sem);
 }
 
+#include <linux/semaphore.h>
+
 #endif /* _ASM_IA64_SEMAPHORE_H */
Index: linux-2.6.25.4-rt4/include/asm-ia64/spinlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/spinlock.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/spinlock.h	2008-05-29 09:46:38.000000000 -0400
@@ -17,8 +17,6 @@
 #include <asm/intrinsics.h>
 #include <asm/system.h>
 
-#define __raw_spin_lock_init(x)			((x)->lock = 0)
-
 #ifdef ASM_SUPPORTED
 /*
  * Try to get the lock.  If we fail to get the lock, make a non-standard call to
@@ -30,7 +28,7 @@
 #define IA64_SPINLOCK_CLOBBERS "ar.ccv", "ar.pfs", "p14", "p15", "r27", "r28", "r29", "r30", "b6", "memory"
 
 static inline void
-__raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags)
+__raw_spin_lock_flags (__raw_spinlock_t *lock, unsigned long flags)
 {
 	register volatile unsigned int *ptr asm ("r31") = &lock->lock;
 
@@ -89,7 +87,7 @@ __raw_spin_lock_flags (raw_spinlock_t *l
 #define __raw_spin_lock(lock) __raw_spin_lock_flags(lock, 0)
 
 /* Unlock by doing an ordered store and releasing the cacheline with nta */
-static inline void __raw_spin_unlock(raw_spinlock_t *x) {
+static inline void __raw_spin_unlock(__raw_spinlock_t *x) {
 	barrier();
 	asm volatile ("st4.rel.nta [%0] = r0\n\t" :: "r"(x));
 }
@@ -109,7 +107,7 @@ do {											\
 		} while (ia64_spinlock_val);						\
 	}										\
 } while (0)
-#define __raw_spin_unlock(x)	do { barrier(); ((raw_spinlock_t *) x)->lock = 0; } while (0)
+#define __raw_spin_unlock(x)	do { barrier(); ((__raw_spinlock_t *) x)->lock = 0; } while (0)
 #endif /* !ASM_SUPPORTED */
 
 #define __raw_spin_is_locked(x)		((x)->lock != 0)
@@ -122,7 +120,7 @@ do {											\
 
 #define __raw_read_lock(rw)								\
 do {											\
-	raw_rwlock_t *__read_lock_ptr = (rw);						\
+	__raw_rwlock_t *__read_lock_ptr = (rw);						\
 											\
 	while (unlikely(ia64_fetchadd(1, (int *) __read_lock_ptr, acq) < 0)) {		\
 		ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);			\
@@ -133,7 +131,7 @@ do {											\
 
 #define __raw_read_unlock(rw)					\
 do {								\
-	raw_rwlock_t *__read_lock_ptr = (rw);			\
+	__raw_rwlock_t *__read_lock_ptr = (rw);			\
 	ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);	\
 } while (0)
 
@@ -165,7 +163,7 @@ do {										\
 	(result == 0);								\
 })
 
-static inline void __raw_write_unlock(raw_rwlock_t *x)
+static inline void __raw_write_unlock(__raw_rwlock_t *x)
 {
 	u8 *y = (u8 *)x;
 	barrier();
@@ -193,7 +191,7 @@ static inline void __raw_write_unlock(ra
 	(ia64_val == 0);						\
 })
 
-static inline void __raw_write_unlock(raw_rwlock_t *x)
+static inline void __raw_write_unlock(__raw_rwlock_t *x)
 {
 	barrier();
 	x->write_lock = 0;
@@ -201,10 +199,10 @@ static inline void __raw_write_unlock(ra
 
 #endif /* !ASM_SUPPORTED */
 
-static inline int __raw_read_trylock(raw_rwlock_t *x)
+static inline int __raw_read_trylock(__raw_rwlock_t *x)
 {
 	union {
-		raw_rwlock_t lock;
+		__raw_rwlock_t lock;
 		__u32 word;
 	} old, new;
 	old.lock = new.lock = *x;
@@ -213,8 +211,8 @@ static inline int __raw_read_trylock(raw
 	return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word;
 }
 
-#define _raw_spin_relax(lock)	cpu_relax()
-#define _raw_read_relax(lock)	cpu_relax()
-#define _raw_write_relax(lock)	cpu_relax()
+#define __raw_spin_relax(lock)	cpu_relax()
+#define __raw_read_relax(lock)	cpu_relax()
+#define __raw_write_relax(lock)	cpu_relax()
 
 #endif /*  _ASM_IA64_SPINLOCK_H */
Index: linux-2.6.25.4-rt4/include/asm-ia64/spinlock_types.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/spinlock_types.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/spinlock_types.h	2008-05-29 09:46:38.000000000 -0400
@@ -7,14 +7,14 @@
 
 typedef struct {
 	volatile unsigned int lock;
-} raw_spinlock_t;
+} __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
 
 typedef struct {
 	volatile unsigned int read_counter	: 31;
 	volatile unsigned int write_lock	:  1;
-} raw_rwlock_t;
+} __raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED		{ 0, 0 }
 
Index: linux-2.6.25.4-rt4/include/asm-ia64/system.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/system.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/system.h	2008-05-29 09:46:38.000000000 -0400
@@ -106,81 +106,16 @@ extern struct ia64_boot_param {
  */
 #define set_mb(var, value)	do { (var) = (value); mb(); } while (0)
 
-#define safe_halt()         ia64_pal_halt_light()    /* PAL_HALT_LIGHT */
 
 /*
  * The group barrier in front of the rsm & ssm are necessary to ensure
  * that none of the previous instructions in the same group are
  * affected by the rsm/ssm.
  */
-/* For spinlocks etc */
 
-/*
- * - clearing psr.i is implicitly serialized (visible by next insn)
- * - setting psr.i requires data serialization
- * - we need a stop-bit before reading PSR because we sometimes
- *   write a floating-point register right before reading the PSR
- *   and that writes to PSR.mfl
- */
-#define __local_irq_save(x)			\
-do {						\
-	ia64_stop();				\
-	(x) = ia64_getreg(_IA64_REG_PSR);	\
-	ia64_stop();				\
-	ia64_rsm(IA64_PSR_I);			\
-} while (0)
-
-#define __local_irq_disable()			\
-do {						\
-	ia64_stop();				\
-	ia64_rsm(IA64_PSR_I);			\
-} while (0)
-
-#define __local_irq_restore(x)	ia64_intrin_local_irq_restore((x) & IA64_PSR_I)
-
-#ifdef CONFIG_IA64_DEBUG_IRQ
 
-  extern unsigned long last_cli_ip;
-
-# define __save_ip()		last_cli_ip = ia64_getreg(_IA64_REG_IP)
-
-# define local_irq_save(x)					\
-do {								\
-	unsigned long psr;					\
-								\
-	__local_irq_save(psr);					\
-	if (psr & IA64_PSR_I)					\
-		__save_ip();					\
-	(x) = psr;						\
-} while (0)
-
-# define local_irq_disable()	do { unsigned long x; local_irq_save(x); } while (0)
-
-# define local_irq_restore(x)					\
-do {								\
-	unsigned long old_psr, psr = (x);			\
-								\
-	local_save_flags(old_psr);				\
-	__local_irq_restore(psr);				\
-	if ((old_psr & IA64_PSR_I) && !(psr & IA64_PSR_I))	\
-		__save_ip();					\
-} while (0)
+#include <linux/trace_irqflags.h>
 
-#else /* !CONFIG_IA64_DEBUG_IRQ */
-# define local_irq_save(x)	__local_irq_save(x)
-# define local_irq_disable()	__local_irq_disable()
-# define local_irq_restore(x)	__local_irq_restore(x)
-#endif /* !CONFIG_IA64_DEBUG_IRQ */
-
-#define local_irq_enable()	({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); })
-#define local_save_flags(flags)	({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); })
-
-#define irqs_disabled()				\
-({						\
-	unsigned long __ia64_id_flags;		\
-	local_save_flags(__ia64_id_flags);	\
-	(__ia64_id_flags & IA64_PSR_I) == 0;	\
-})
 
 #ifdef __KERNEL__
 
Index: linux-2.6.25.4-rt4/include/asm-ia64/thread_info.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/thread_info.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/thread_info.h	2008-05-29 09:46:38.000000000 -0400
@@ -95,6 +95,7 @@ extern void tsk_clear_notify_resume(stru
 #define TIF_DB_DISABLED		19	/* debug trap disabled for fsyscall */
 #define TIF_FREEZE		20	/* is freezing for suspend */
 #define TIF_RESTORE_RSE		21	/* user RBS is newer than kernel RBS */
+#define TIF_NEED_RESCHED_DELAYED 22	/* reschedule on return to userspace */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
@@ -109,6 +110,7 @@ extern void tsk_clear_notify_resume(stru
 #define _TIF_DB_DISABLED	(1 << TIF_DB_DISABLED)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_RESTORE_RSE	(1 << TIF_RESTORE_RSE)
+#define _TIF_NEED_RESCHED_DELAYED (1 << TIF_NEED_RESCHED_DELAYED)
 
 /* "work to do on user-return" bits */
 #define TIF_ALLWORK_MASK	(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
Index: linux-2.6.25.4-rt4/include/asm-ia64/tlb.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ia64/tlb.h	2008-05-29 09:45:47.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ia64/tlb.h	2008-05-29 09:46:38.000000000 -0400
@@ -40,6 +40,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
+#include <linux/percpu.h>
 
 #include <asm/pgalloc.h>
 #include <asm/processor.h>
@@ -61,11 +62,12 @@ struct mmu_gather {
 	unsigned char		need_flush;	/* really unmapped some PTEs? */
 	unsigned long		start_addr;
 	unsigned long		end_addr;
+	int cpu;
 	struct page 		*pages[FREE_PTE_NR];
 };
 
 /* Users of the generic TLB shootdown code must declare this storage space. */
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 /*
  * Flush the TLB for address range START to END and, if not in fast mode, release the
@@ -127,8 +129,10 @@ ia64_tlb_flush_mmu (struct mmu_gather *t
 static inline struct mmu_gather *
 tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+	int cpu;
+	struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu);
 
+	tlb->cpu = cpu;
 	tlb->mm = mm;
 	/*
 	 * Use fast mode if only 1 CPU is online.
@@ -165,7 +169,7 @@ tlb_finish_mmu (struct mmu_gather *tlb, 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
 
-	put_cpu_var(mmu_gathers);
+	put_cpu_var_locked(mmu_gathers, tlb->cpu);
 }
 
 /*
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/idle.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/idle.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/idle.c	2008-05-29 09:46:39.000000000 -0400
@@ -61,7 +61,8 @@ void cpu_idle(void)
 	set_thread_flag(TIF_POLLING_NRFLAG);
 	while (1) {
 		tick_nohz_stop_sched_tick();
-		while (!need_resched() && !cpu_should_die()) {
+		while (!need_resched() && !need_resched_delayed() &&
+				!cpu_should_die()) {
 			ppc64_runlatch_off();
 
 			if (ppc_md.power_save) {
@@ -74,7 +75,9 @@ void cpu_idle(void)
 				local_irq_disable();
 
 				/* check again after disabling irqs */
-				if (!need_resched() && !cpu_should_die())
+				if (!need_resched() &&
+				    !need_resched_delayed() &&
+				    !cpu_should_die())
 					ppc_md.power_save();
 
 				local_irq_enable();
@@ -95,7 +98,7 @@ void cpu_idle(void)
 		tick_nohz_restart_sched_tick();
 		if (cpu_should_die())
 			cpu_die();
-		preempt_enable_no_resched();
+		__preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
 	}
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/smp.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/smp.c	2008-05-29 09:46:39.000000000 -0400
@@ -128,6 +128,16 @@ void smp_send_reschedule(int cpu)
 		smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE);
 }
 
+/*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE);
+}
+
 #ifdef CONFIG_DEBUGGER
 void smp_send_debugger_break(int cpu)
 {
@@ -159,7 +169,7 @@ static void stop_this_cpu(void *dummy)
  * static memory requirements. It also looks cleaner.
  * Stolen from the i386 version.
  */
-static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock);
+static  __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock);
 
 static struct call_data_struct {
 	void (*func) (void *info);
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/traps.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/traps.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/traps.c	2008-05-29 09:46:39.000000000 -0400
@@ -97,11 +97,11 @@ static inline void pmac_backlight_unblan
 int die(const char *str, struct pt_regs *regs, long err)
 {
 	static struct {
-		spinlock_t lock;
+		raw_spinlock_t lock;
 		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
-		.lock =			__SPIN_LOCK_UNLOCKED(die.lock),
+		.lock =			_RAW_SPIN_LOCK_UNLOCKED(die.lock),
 		.lock_owner =		-1,
 		.lock_owner_depth =	0
 	};
@@ -188,6 +188,11 @@ void _exception(int signr, struct pt_reg
 				addr, regs->nip, regs->link, code);
 		}
 
+#ifdef CONFIG_PREEMPT_RT
+	local_irq_enable();
+	preempt_check_resched();
+#endif
+
 	memset(&info, 0, sizeof(info));
 	info.si_signo = signr;
 	info.si_code = code;
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/cell/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/cell/smp.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/cell/smp.c	2008-05-29 09:46:39.000000000 -0400
@@ -135,7 +135,7 @@ static void __devinit smp_iic_setup_cpu(
 		iic_setup_cpu();
 }
 
-static DEFINE_SPINLOCK(timebase_lock);
+static DEFINE_RAW_SPINLOCK(timebase_lock);
 static unsigned long timebase = 0;
 
 static void __devinit cell_give_timebase(void)
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/chrp/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/chrp/smp.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/chrp/smp.c	2008-05-29 09:46:39.000000000 -0400
@@ -42,7 +42,7 @@ static void __devinit smp_chrp_setup_cpu
 	mpic_setup_this_cpu();
 }
 
-static DEFINE_SPINLOCK(timebase_lock);
+static DEFINE_RAW_SPINLOCK(timebase_lock);
 static unsigned int timebase_upper = 0, timebase_lower = 0;
 
 void __devinit smp_chrp_give_timebase(void)
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/chrp/time.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/chrp/time.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/chrp/time.c	2008-05-29 09:47:16.000000000 -0400
@@ -83,7 +83,12 @@ int chrp_set_rtc_time(struct rtc_time *t
 	unsigned char save_control, save_freq_select;
 	struct rtc_time tm = *tmarg;
 
+#if CONFIG_PREEMPT_RT
+	if (!spin_trylock(&rtc_lock))
+		return -1;
+#else
 	spin_lock(&rtc_lock);
+#endif
 
 	save_control = chrp_cmos_clock_read(RTC_CONTROL); /* tell the clock it's being set */
 
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/powermac/feature.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/powermac/feature.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/powermac/feature.c	2008-05-29 09:46:39.000000000 -0400
@@ -59,7 +59,7 @@ extern struct device_node *k2_skiplist[2
  * We use a single global lock to protect accesses. Each driver has
  * to take care of its own locking
  */
-DEFINE_SPINLOCK(feature_lock);
+DEFINE_RAW_SPINLOCK(feature_lock);
 
 #define LOCK(flags)	spin_lock_irqsave(&feature_lock, flags);
 #define UNLOCK(flags)	spin_unlock_irqrestore(&feature_lock, flags);
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/powermac/nvram.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/powermac/nvram.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/powermac/nvram.c	2008-05-29 09:46:39.000000000 -0400
@@ -80,7 +80,7 @@ static int is_core_99;
 static int core99_bank = 0;
 static int nvram_partitions[3];
 // XXX Turn that into a sem
-static DEFINE_SPINLOCK(nv_lock);
+static DEFINE_RAW_SPINLOCK(nv_lock);
 
 static int (*core99_write_bank)(int bank, u8* datas);
 static int (*core99_erase_bank)(int bank);
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/powermac/pic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/powermac/pic.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/powermac/pic.c	2008-05-29 09:46:39.000000000 -0400
@@ -63,7 +63,7 @@ static int max_irqs;
 static int max_real_irqs;
 static u32 level_mask[4];
 
-static DEFINE_SPINLOCK(pmac_pic_lock);
+static DEFINE_RAW_SPINLOCK(pmac_pic_lock);
 
 #define NR_MASK_WORDS	((NR_IRQS + 31) / 32)
 static unsigned long ppc_lost_interrupts[NR_MASK_WORDS];
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/pseries/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/pseries/smp.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/pseries/smp.c	2008-05-29 09:46:39.000000000 -0400
@@ -155,7 +155,7 @@ static void __devinit smp_xics_setup_cpu
 }
 #endif /* CONFIG_XICS */
 
-static DEFINE_SPINLOCK(timebase_lock);
+static DEFINE_RAW_SPINLOCK(timebase_lock);
 static unsigned long timebase = 0;
 
 static void __devinit pSeries_give_timebase(void)
Index: linux-2.6.25.4-rt4/arch/ppc/8260_io/enet.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/8260_io/enet.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/8260_io/enet.c	2008-05-29 09:46:39.000000000 -0400
@@ -115,7 +115,7 @@ struct scc_enet_private {
 	scc_t	*sccp;
 	struct	net_device_stats stats;
 	uint	tx_full;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 };
 
 static int scc_enet_open(struct net_device *dev);
Index: linux-2.6.25.4-rt4/arch/ppc/8260_io/fcc_enet.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/8260_io/fcc_enet.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/8260_io/fcc_enet.c	2008-05-29 09:46:39.000000000 -0400
@@ -375,7 +375,7 @@ struct fcc_enet_private {
 	volatile fcc_enet_t	*ep;
 	struct	net_device_stats stats;
 	uint	tx_free;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 
 #ifdef	CONFIG_USE_MDIO
 	uint	phy_id;
Index: linux-2.6.25.4-rt4/arch/ppc/8xx_io/commproc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/8xx_io/commproc.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/8xx_io/commproc.c	2008-05-29 09:46:39.000000000 -0400
@@ -332,7 +332,7 @@ cpm_setbrg(uint brg, uint rate)
 /*
  * dpalloc / dpfree bits.
  */
-static spinlock_t cpm_dpmem_lock;
+static raw_spinlock_t cpm_dpmem_lock;
 /*
  * 16 blocks should be enough to satisfy all requests
  * until the memory subsystem goes up...
Index: linux-2.6.25.4-rt4/arch/ppc/8xx_io/enet.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/8xx_io/enet.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/8xx_io/enet.c	2008-05-29 09:46:39.000000000 -0400
@@ -143,7 +143,7 @@ struct scc_enet_private {
 	unsigned char *rx_vaddr[RX_RING_SIZE];
 	struct	net_device_stats stats;
 	uint	tx_full;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 };
 
 static int scc_enet_open(struct net_device *dev);
Index: linux-2.6.25.4-rt4/arch/ppc/8xx_io/fec.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/8xx_io/fec.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/8xx_io/fec.c	2008-05-29 09:46:39.000000000 -0400
@@ -164,7 +164,7 @@ struct fec_enet_private {
 
 	struct	net_device_stats stats;
 	uint	tx_full;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 
 #ifdef	CONFIG_USE_MDIO
 	uint	phy_id;
Index: linux-2.6.25.4-rt4/arch/ppc/kernel/smp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/kernel/smp.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/kernel/smp.c	2008-05-29 09:46:39.000000000 -0400
@@ -136,6 +136,16 @@ void smp_send_reschedule(int cpu)
 	smp_message_pass(cpu, PPC_MSG_RESCHEDULE);
 }
 
+/*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0);
+}
+
 #ifdef CONFIG_XMON
 void smp_send_xmon_break(int cpu)
 {
@@ -160,7 +170,7 @@ void smp_send_stop(void)
  * static memory requirements. It also looks cleaner.
  * Stolen from the i386 version.
  */
-static DEFINE_SPINLOCK(call_lock);
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 static struct call_data_struct {
 	void (*func) (void *info);
Index: linux-2.6.25.4-rt4/arch/ppc/kernel/traps.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/kernel/traps.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/kernel/traps.c	2008-05-29 09:46:39.000000000 -0400
@@ -72,7 +72,7 @@ void (*debugger_fault_handler)(struct pt
  * Trap & Exception support
  */
 
-DEFINE_SPINLOCK(die_lock);
+DEFINE_RAW_SPINLOCK(die_lock);
 
 int die(const char * str, struct pt_regs * fp, long err)
 {
@@ -108,6 +108,10 @@ void _exception(int signr, struct pt_reg
 		debugger(regs);
 		die("Exception in kernel mode", regs, signr);
 	}
+#ifdef CONFIG_PREEMPT_RT
+	local_irq_enable();
+	preempt_check_resched();
+#endif
 	info.si_signo = signr;
 	info.si_errno = 0;
 	info.si_code = code;
Index: linux-2.6.25.4-rt4/arch/ppc/platforms/hdpu.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/platforms/hdpu.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/platforms/hdpu.c	2008-05-29 09:46:39.000000000 -0400
@@ -55,7 +55,7 @@ static void parse_bootinfo(unsigned long
 static void hdpu_set_l1pe(void);
 static void hdpu_cpustate_set(unsigned char new_state);
 #ifdef CONFIG_SMP
-static DEFINE_SPINLOCK(timebase_lock);
+static DEFINE_RAW_SPINLOCK(timebase_lock);
 static unsigned int timebase_upper = 0, timebase_lower = 0;
 extern int smp_tb_synchronized;
 
Index: linux-2.6.25.4-rt4/arch/ppc/platforms/sbc82xx.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/platforms/sbc82xx.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/platforms/sbc82xx.c	2008-05-29 09:46:39.000000000 -0400
@@ -65,7 +65,7 @@ static void sbc82xx_time_init(void)
 
 static volatile char *sbc82xx_i8259_map;
 static char sbc82xx_i8259_mask = 0xff;
-static DEFINE_SPINLOCK(sbc82xx_i8259_lock);
+static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock);
 
 static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr)
 {
Index: linux-2.6.25.4-rt4/arch/ppc/syslib/cpm2_common.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/syslib/cpm2_common.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/syslib/cpm2_common.c	2008-05-29 09:46:39.000000000 -0400
@@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di
 /*
  * dpalloc / dpfree bits.
  */
-static spinlock_t cpm_dpmem_lock;
+static raw_spinlock_t cpm_dpmem_lock;
 /* 16 blocks should be enough to satisfy all requests
  * until the memory subsystem goes up... */
 static rh_block_t cpm_boot_dpmem_rh_block[16];
Index: linux-2.6.25.4-rt4/arch/ppc/syslib/open_pic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/syslib/open_pic.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/syslib/open_pic.c	2008-05-29 09:46:39.000000000 -0400
@@ -526,7 +526,7 @@ void openpic_reset_processor_phys(u_int 
 }
 
 #if defined(CONFIG_SMP) || defined(CONFIG_PM)
-static DEFINE_SPINLOCK(openpic_setup_lock);
+static DEFINE_RAW_SPINLOCK(openpic_setup_lock);
 #endif
 
 #ifdef CONFIG_SMP
Index: linux-2.6.25.4-rt4/arch/ppc/syslib/open_pic2.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/ppc/syslib/open_pic2.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/ppc/syslib/open_pic2.c	2008-05-29 09:46:39.000000000 -0400
@@ -380,7 +380,7 @@ static void openpic2_set_spurious(u_int 
 			   vec);
 }
 
-static DEFINE_SPINLOCK(openpic2_setup_lock);
+static DEFINE_RAW_SPINLOCK(openpic2_setup_lock);
 
 /*
  *  Initialize a timer interrupt (and disable it)
Index: linux-2.6.25.4-rt4/include/asm-powerpc/hw_irq.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/hw_irq.h	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/hw_irq.h	2008-05-29 09:46:39.000000000 -0400
@@ -16,7 +16,7 @@ extern void timer_interrupt(struct pt_re
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
 
-static inline unsigned long local_get_flags(void)
+static inline unsigned long raw_local_get_flags(void)
 {
 	unsigned long flags;
 
@@ -27,7 +27,7 @@ static inline unsigned long local_get_fl
 	return flags;
 }
 
-static inline unsigned long local_irq_disable(void)
+static inline unsigned long raw_local_irq_disable(void)
 {
 	unsigned long flags, zero;
 
@@ -39,17 +39,22 @@ static inline unsigned long local_irq_di
 	return flags;
 }
 
-extern void local_irq_restore(unsigned long);
+
 extern void iseries_handle_interrupts(void);
+extern unsigned long raw_local_get_flags(void);
+extern unsigned long raw_local_irq_disable(void);
+extern void raw_local_irq_restore(unsigned long);
+
+#define raw_local_irq_enable()		raw_local_irq_restore(1)
+#define raw_local_save_flags(flags)	((flags) = raw_local_get_flags())
+#define raw_local_irq_save(flags)	((flags) = raw_local_irq_disable())
 
-#define local_irq_enable()	local_irq_restore(1)
-#define local_save_flags(flags)	((flags) = local_get_flags())
-#define local_irq_save(flags)	((flags) = local_irq_disable())
+#define raw_irqs_disabled()		(raw_local_get_flags() == 0)
+#define raw_irqs_disabled_flags(flags)	((flags) == 0)
 
-#define irqs_disabled()		(local_get_flags() == 0)
 
-#define __hard_irq_enable()	__mtmsrd(mfmsr() | MSR_EE, 1)
-#define __hard_irq_disable()	__mtmsrd(mfmsr() & ~MSR_EE, 1)
+#define __hard_irq_enable()		__mtmsrd(mfmsr() | MSR_EE, 1)
+#define __hard_irq_disable()		__mtmsrd(mfmsr() & ~MSR_EE, 1)
 
 #define  hard_irq_disable()			\
 	do {					\
@@ -58,17 +63,17 @@ extern void iseries_handle_interrupts(vo
 		get_paca()->hard_enabled = 0;	\
 	} while(0)
 
-#else
+#else  /* CONFIG_PPC64 */
 
 #if defined(CONFIG_BOOKE)
 #define SET_MSR_EE(x)	mtmsr(x)
-#define local_irq_restore(flags)	__asm__ __volatile__("wrtee %0" : : "r" (flags) : "memory")
+#define raw_local_irq_restore(flags)	__asm__ __volatile__("wrtee %0" : : "r" (flags) : "memory")
 #else
 #define SET_MSR_EE(x)	mtmsr(x)
-#define local_irq_restore(flags)	mtmsr(flags)
-#endif
+#define raw_local_irq_restore(flags)	mtmsr(flags)
+#endif /* CONFIG_BOOKE */
 
-static inline void local_irq_disable(void)
+static inline void raw_local_irq_disable(void)
 {
 #ifdef CONFIG_BOOKE
 	__asm__ __volatile__("wrteei 0": : :"memory");
@@ -80,7 +85,7 @@ static inline void local_irq_disable(voi
 #endif
 }
 
-static inline void local_irq_enable(void)
+static inline void raw_local_irq_enable(void)
 {
 #ifdef CONFIG_BOOKE
 	__asm__ __volatile__("wrteei 1": : :"memory");
@@ -92,7 +97,7 @@ static inline void local_irq_enable(void
 #endif
 }
 
-static inline void local_irq_save_ptr(unsigned long *flags)
+static inline void raw_local_irq_save_ptr(unsigned long *flags)
 {
 	unsigned long msr;
 	msr = mfmsr();
@@ -105,13 +110,16 @@ static inline void local_irq_save_ptr(un
 	__asm__ __volatile__("": : :"memory");
 }
 
-#define local_save_flags(flags)	((flags) = mfmsr())
-#define local_irq_save(flags)	local_irq_save_ptr(&flags)
-#define irqs_disabled()		((mfmsr() & MSR_EE) == 0)
+#define raw_local_save_flags(flags)		((flags) = mfmsr())
+#define raw_local_irq_save(flags)		raw_local_irq_save_ptr(&flags)
+#define raw_irqs_disabled()			((mfmsr() & MSR_EE) == 0)
+#define raw_irqs_disabled_flags(flags)	((flags & MSR_EE) == 0)
 
 #define hard_irq_enable()	local_irq_enable()
 #define hard_irq_disable()	local_irq_disable()
 
+#include <linux/irqflags.h>
+
 #endif /* CONFIG_PPC64 */
 
 /*
Index: linux-2.6.25.4-rt4/arch/powerpc/Kconfig.debug
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/Kconfig.debug	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/Kconfig.debug	2008-05-29 09:46:39.000000000 -0400
@@ -2,6 +2,10 @@ menu "Kernel hacking"
 
 source "lib/Kconfig.debug"
 
+config TRACE_IRQFLAGS_SUPPORT
+	bool
+	default y
+
 config DEBUG_STACKOVERFLOW
 	bool "Check for stack overflows"
 	depends on DEBUG_KERNEL
Index: linux-2.6.25.4-rt4/include/asm-powerpc/pmac_feature.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/pmac_feature.h	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/pmac_feature.h	2008-05-29 09:46:39.000000000 -0400
@@ -378,7 +378,7 @@ extern struct macio_chip* macio_find(str
  * Those are exported by pmac feature for internal use by arch code
  * only like the platform function callbacks, do not use directly in drivers
  */
-extern spinlock_t feature_lock;
+extern raw_spinlock_t feature_lock;
 extern struct device_node *uninorth_node;
 extern u32 __iomem *uninorth_base;
 
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/head_64.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/head_64.S	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/head_64.S	2008-05-29 09:46:39.000000000 -0400
@@ -878,7 +878,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISER
 	 * handles any interrupts pending at this point.
 	 */
 	ld	r3,SOFTE(r1)
-	bl	.local_irq_restore
+	bl	.raw_local_irq_restore
 	b	11f
 
 /* Here we have a page fault that hash_page can't handle. */
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/rtas.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/rtas.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/rtas.c	2008-05-29 09:46:40.000000000 -0400
@@ -41,7 +41,7 @@
 #include <asm/atomic.h>
 
 struct rtas_t rtas = {
-	.lock = SPIN_LOCK_UNLOCKED
+	.lock = RAW_SPIN_LOCK_UNLOCKED(lock)
 };
 EXPORT_SYMBOL(rtas);
 
Index: linux-2.6.25.4-rt4/arch/powerpc/mm/hash_native_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/mm/hash_native_64.c	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/mm/hash_native_64.c	2008-05-29 09:46:40.000000000 -0400
@@ -36,7 +36,7 @@
 
 #define HPTE_LOCK_BIT 3
 
-static DEFINE_SPINLOCK(native_tlbie_lock);
+static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
 static inline void __tlbie(unsigned long va, int psize, int ssize)
 {
Index: linux-2.6.25.4-rt4/include/asm-powerpc/rtas.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/rtas.h	2008-05-29 09:45:46.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/rtas.h	2008-05-29 09:46:40.000000000 -0400
@@ -58,7 +58,7 @@ struct rtas_t {
 	unsigned long entry;		/* physical address pointer */
 	unsigned long base;		/* physical address pointer */
 	unsigned long size;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	struct rtas_args args;
 	struct device_node *dev;	/* virtual address pointer */
 };
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/celleb/htab.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/celleb/htab.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/celleb/htab.c	2008-05-29 09:46:41.000000000 -0400
@@ -40,7 +40,7 @@
 #define DBG_LOW(fmt...) do { } while(0)
 #endif
 
-static DEFINE_SPINLOCK(beat_htab_lock);
+static DEFINE_RAW_SPINLOCK(beat_htab_lock);
 
 static inline unsigned int beat_read_mask(unsigned hpte_group)
 {
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/pmc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/pmc.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/pmc.c	2008-05-29 09:46:41.000000000 -0400
@@ -37,7 +37,7 @@ static void dummy_perf(struct pt_regs *r
 }
 
 
-static DEFINE_SPINLOCK(pmc_owner_lock);
+static DEFINE_RAW_SPINLOCK(pmc_owner_lock);
 static void *pmc_owner_caller; /* mostly for debugging */
 perf_irq_t perf_irq = dummy_perf;
 
Index: linux-2.6.25.4-rt4/arch/powerpc/sysdev/i8259.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/sysdev/i8259.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/sysdev/i8259.c	2008-05-29 09:46:41.000000000 -0400
@@ -23,7 +23,7 @@ static unsigned char cached_8259[2] = { 
 #define cached_A1 (cached_8259[0])
 #define cached_21 (cached_8259[1])
 
-static DEFINE_SPINLOCK(i8259_lock);
+static DEFINE_RAW_SPINLOCK(i8259_lock);
 
 static struct irq_host *i8259_host;
 
Index: linux-2.6.25.4-rt4/arch/powerpc/sysdev/ipic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/sysdev/ipic.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/sysdev/ipic.c	2008-05-29 09:46:41.000000000 -0400
@@ -31,7 +31,7 @@
 
 static struct ipic * primary_ipic;
 static struct irq_chip ipic_level_irq_chip, ipic_edge_irq_chip;
-static DEFINE_SPINLOCK(ipic_lock);
+static DEFINE_RAW_SPINLOCK(ipic_lock);
 
 static struct ipic_info ipic_info[] = {
 	[1] = {
Index: linux-2.6.25.4-rt4/include/asm-powerpc/mpic.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/mpic.h	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/mpic.h	2008-05-29 09:46:41.000000000 -0400
@@ -280,7 +280,7 @@ struct mpic
 #ifdef CONFIG_MPIC_U3_HT_IRQS
 	/* The fixup table */
 	struct mpic_irq_fixup	*fixups;
-	spinlock_t		fixup_lock;
+	raw_spinlock_t		fixup_lock;
 #endif
 
 	/* Register access method */
Index: linux-2.6.25.4-rt4/arch/x86/Kconfig.debug
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/Kconfig.debug	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/Kconfig.debug	2008-05-29 09:46:41.000000000 -0400
@@ -84,6 +84,7 @@ config 4KSTACKS
 	bool "Use 4Kb for kernel stacks instead of 8Kb"
 	depends on DEBUG_KERNEL
 	depends on X86_32
+	default y
 	help
 	  If you say Y here the kernel will use a 4Kb stacksize for the
 	  kernel stack attached to each process/thread. This facilitates
Index: linux-2.6.25.4-rt4/arch/x86/kernel/cpu/mtrr/generic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/cpu/mtrr/generic.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/cpu/mtrr/generic.c	2008-05-29 09:46:41.000000000 -0400
@@ -329,7 +329,7 @@ static unsigned long set_mtrr_state(void
 
 
 static unsigned long cr4 = 0;
-static DEFINE_SPINLOCK(set_atomicity_lock);
+static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
 
 /*
  * Since we are disabling the cache don't allow any interrupts - they
Index: linux-2.6.25.4-rt4/arch/x86/kernel/head_32.S
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/head_32.S	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/head_32.S	2008-05-29 09:46:41.000000000 -0400
@@ -585,6 +585,7 @@ ignore_int:
 	call printk
 #endif
 	addl $(5*4),%esp
+	call dump_stack
 	popl %ds
 	popl %es
 	popl %edx
Index: linux-2.6.25.4-rt4/arch/x86/kernel/i8253.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/i8253.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/i8253.c	2008-05-29 09:46:41.000000000 -0400
@@ -15,7 +15,7 @@
 #include <asm/io.h>
 #include <asm/hpet.h>
 
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 #ifdef CONFIG_X86_32
Index: linux-2.6.25.4-rt4/arch/x86/kernel/microcode.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/microcode.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/microcode.c	2008-05-29 09:46:41.000000000 -0400
@@ -117,7 +117,7 @@ MODULE_LICENSE("GPL");
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
 /* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
+static DEFINE_RAW_SPINLOCK(microcode_update_lock);
 
 /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
 static DEFINE_MUTEX(microcode_mutex);
Index: linux-2.6.25.4-rt4/arch/x86/kernel/signal_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/signal_32.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/signal_32.c	2008-05-29 09:46:41.000000000 -0400
@@ -547,6 +547,13 @@ handle_signal(unsigned long sig, siginfo
 		}
 	}
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
 	 * flag so that register information in the sigcontext is correct.
@@ -585,6 +592,13 @@ static void do_signal(struct pt_regs *re
 	struct k_sigaction ka;
 	sigset_t *oldset;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * We want the common case to go fast, which
 	 * is why we may in certain cases get here from
Index: linux-2.6.25.4-rt4/arch/x86/kernel/smp_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/smp_32.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/smp_32.c	2008-05-29 09:47:05.000000000 -0400
@@ -18,6 +18,7 @@
 #include <linux/cache.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 
 #include <asm/mtrr.h>
@@ -247,7 +248,7 @@ void send_IPI_mask_sequence(cpumask_t ma
 static cpumask_t flush_cpumask;
 static struct mm_struct * flush_mm;
 static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
+static DEFINE_RAW_SPINLOCK(tlbstate_lock);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
@@ -477,10 +478,28 @@ static void native_smp_send_reschedule(i
 }
 
 /*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	send_IPI_allbutself(RESCHEDULE_VECTOR);
+}
+
+void smp_send_reschedule_allbutself_cpumask(cpumask_t mask)
+{
+	cpu_clear(smp_processor_id(), mask);
+	cpus_and(mask, mask, cpu_online_map);
+	if (!cpus_empty(mask))
+		send_IPI_mask(mask, RESCHEDULE_VECTOR);
+}
+
+/*
  * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
-static DEFINE_SPINLOCK(call_lock);
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 struct call_data_struct {
 	void (*func) (void *info);
@@ -635,9 +654,8 @@ static void native_smp_send_stop(void)
 }
 
 /*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back. Trigger a reschedule pass so that
+ * RT-overload balancing can pass tasks around.
  */
 void smp_reschedule_interrupt(struct pt_regs *regs)
 {
Index: linux-2.6.25.4-rt4/arch/x86/kernel/vm86_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/vm86_32.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/vm86_32.c	2008-05-29 09:46:41.000000000 -0400
@@ -134,6 +134,7 @@ struct pt_regs * save_v86_state(struct k
 	local_irq_enable();
 
 	if (!current->thread.vm86_info) {
+		local_irq_disable();
 		printk("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
Index: linux-2.6.25.4-rt4/arch/x86/pci/common.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/pci/common.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/pci/common.c	2008-05-29 09:46:41.000000000 -0400
@@ -75,7 +75,7 @@ int pcibios_scanned;
  * This interrupt-safe spinlock protects all accesses to PCI
  * configuration space.
  */
-DEFINE_SPINLOCK(pci_config_lock);
+DEFINE_RAW_SPINLOCK(pci_config_lock);
 
 /*
  * Several buggy motherboards address only 16 devices and mirror
Index: linux-2.6.25.4-rt4/arch/x86/pci/direct.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/pci/direct.c	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/pci/direct.c	2008-05-29 09:46:41.000000000 -0400
@@ -220,16 +220,23 @@ static int __init pci_check_type1(void)
 	unsigned int tmp;
 	int works = 0;
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb(0x01, 0xCFB);
 	tmp = inl(0xCF8);
 	outl(0x80000000, 0xCF8);
-	if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) {
-		works = 1;
+
+	if (inl(0xCF8) == 0x80000000) {
+		spin_unlock_irqrestore(&pci_config_lock, flags);
+
+		if (pci_sanity_check(&pci_direct_conf1))
+			works = 1;
+
+		spin_lock_irqsave(&pci_config_lock, flags);
 	}
 	outl(tmp, 0xCF8);
-	local_irq_restore(flags);
+
+	spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return works;
 }
@@ -239,17 +246,19 @@ static int __init pci_check_type2(void)
 	unsigned long flags;
 	int works = 0;
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb(0x00, 0xCFB);
 	outb(0x00, 0xCF8);
 	outb(0x00, 0xCFA);
-	if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 &&
-	    pci_sanity_check(&pci_direct_conf2)) {
-		works = 1;
-	}
 
-	local_irq_restore(flags);
+	if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) {
+		spin_unlock_irqrestore(&pci_config_lock, flags);
+
+		if (pci_sanity_check(&pci_direct_conf2))
+			works = 1;
+	} else
+		spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return works;
 }
Index: linux-2.6.25.4-rt4/arch/x86/pci/pci.h
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/pci/pci.h	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/pci/pci.h	2008-05-29 09:46:41.000000000 -0400
@@ -82,7 +82,7 @@ struct irq_routing_table {
 extern unsigned int pcibios_irq_mask;
 
 extern int pcibios_scanned;
-extern spinlock_t pci_config_lock;
+extern raw_spinlock_t pci_config_lock;
 
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
 extern void (*pcibios_disable_irq)(struct pci_dev *dev);
Index: linux-2.6.25.4-rt4/include/asm-x86/highmem.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/highmem.h	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/highmem.h	2008-05-29 09:46:58.000000000 -0400
@@ -61,6 +61,16 @@ extern void *kmap_high(struct page *page
 extern void kunmap_high(struct page *page);
 
 void *kmap(struct page *page);
+extern void kunmap_virt(void *ptr);
+extern struct page *kmap_to_page(void *ptr);
+void kunmap(struct page *page);
+
+void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
+void *__kmap_atomic(struct page *page, enum km_type type);
+void __kunmap_atomic(void *kvaddr, enum km_type type);
+void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+struct page *__kmap_atomic_to_page(void *ptr);
+
 void kunmap(struct page *page);
 void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
 void *kmap_atomic(struct page *page, enum km_type type);
@@ -74,6 +84,23 @@ struct page *kmap_atomic_to_page(void *p
 
 #define flush_cache_kmaps()	do { } while (0)
 
+/*
+ * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap():
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define kmap_atomic_prot(page, type, prot)	({ pagefault_disable(); kmap(page); })
+# define kmap_atomic(page, type)	({ pagefault_disable(); kmap(page); })
+# define kmap_atomic_pfn(pfn, type)	kmap(pfn_to_page(pfn))
+# define kunmap_atomic(kvaddr, type)	do { pagefault_enable(); kunmap_virt(kvaddr); } while(0)
+# define kmap_atomic_to_page(kvaddr)	kmap_to_page(kvaddr)
+#else
+# define kmap_atomic_prot(page, type, prot)	__kmap_atomic_prot(page, type, prot)
+# define kmap_atomic(page, type)	__kmap_atomic(page, type)
+# define kmap_atomic_pfn(pfn, type)	__kmap_atomic_pfn(pfn, type)
+# define kunmap_atomic(kvaddr, type)	__kunmap_atomic(kvaddr, type)
+# define kmap_atomic_to_page(kvaddr)	__kmap_atomic_to_page(kvaddr)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_HIGHMEM_H */
Index: linux-2.6.25.4-rt4/include/asm-x86/i8253.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/i8253.h	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/i8253.h	2008-05-29 09:46:41.000000000 -0400
@@ -6,7 +6,7 @@
 #define PIT_CH0			0x40
 #define PIT_CH2			0x42
 
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
 
 extern struct clock_event_device *global_clock_event;
 
Index: linux-2.6.25.4-rt4/include/asm-x86/mach-default/irq_vectors.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/mach-default/irq_vectors.h	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/mach-default/irq_vectors.h	2008-05-29 09:46:41.000000000 -0400
@@ -63,7 +63,7 @@
  * levels. (0x80 is the syscall vector)
  */
 #define FIRST_DEVICE_VECTOR	0x31
-#define FIRST_SYSTEM_VECTOR	0xef
+#define FIRST_SYSTEM_VECTOR	0xee
 
 #define TIMER_IRQ 0
 
Index: linux-2.6.25.4-rt4/include/asm-x86/mc146818rtc.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/mc146818rtc.h	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/mc146818rtc.h	2008-05-29 09:46:41.000000000 -0400
@@ -72,7 +72,7 @@ static inline unsigned char current_lock
 		lock_cmos(reg)
 #define lock_cmos_suffix(reg) \
 		unlock_cmos();			\
-		local_irq_restore(cmos_flags);	\
+		local_irq_restore(cmos_flags); \
 	} while (0)
 #else
 #define lock_cmos_prefix(reg) do {} while (0)
Index: linux-2.6.25.4-rt4/include/asm-x86/xor_32.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/xor_32.h	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/xor_32.h	2008-05-29 09:46:41.000000000 -0400
@@ -860,7 +860,21 @@ static struct xor_block_template xor_blo
 #include <asm-generic/xor.h>
 
 #undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
+/*
+ * MMX/SSE ops disable preemption for long periods of time,
+ * so on PREEMPT_RT use the register-based ops only:
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define XOR_TRY_TEMPLATES				\
+	do {						\
+		xor_speed(&xor_block_8regs);		\
+		xor_speed(&xor_block_8regs_p);		\
+		xor_speed(&xor_block_32regs);		\
+		xor_speed(&xor_block_32regs_p);		\
+	} while (0)
+# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)
+#else
+# define XOR_TRY_TEMPLATES				\
 	do {						\
 		xor_speed(&xor_block_8regs);		\
 		xor_speed(&xor_block_8regs_p);		\
@@ -873,9 +887,10 @@ static struct xor_block_template xor_blo
 	                xor_speed(&xor_block_p5_mmx);	\
 	        }					\
 	} while (0)
-
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
+# define XOR_SELECT_TEMPLATE(FASTEST) \
 	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+#endif
+
Index: linux-2.6.25.4-rt4/arch/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/Kconfig	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/Kconfig	2008-05-29 09:46:41.000000000 -0400
@@ -16,6 +16,11 @@ config OPROFILE
 config HAVE_OPROFILE
 	def_bool n
 
+config PROFILE_NMI
+	bool
+	depends on OPROFILE
+	default y
+
 config KPROBES
 	bool "Kprobes"
 	depends on KALLSYMS && MODULES
Index: linux-2.6.25.4-rt4/include/linux/mm_types.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/mm_types.h	2008-05-29 09:45:45.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/mm_types.h	2008-05-29 09:46:42.000000000 -0400
@@ -202,6 +202,9 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
+	/* realtime bits */
+	struct list_head	delayed_drop;
+
 	/* Swap token stuff */
 	/*
 	 * Last value of global fault stamp as seen by this process.
Index: linux-2.6.25.4-rt4/include/linux/completion.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/completion.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/completion.h	2008-05-29 09:46:43.000000000 -0400
@@ -49,6 +49,7 @@ extern unsigned long wait_for_completion
 						   unsigned long timeout);
 extern unsigned long wait_for_completion_interruptible_timeout(
 			struct completion *x, unsigned long timeout);
+extern unsigned int completion_done(struct completion *x);
 
 extern void complete(struct completion *);
 extern void complete_all(struct completion *);
Index: linux-2.6.25.4-rt4/include/linux/hardirq.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/hardirq.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/hardirq.h	2008-05-29 09:47:21.000000000 -0400
@@ -22,10 +22,13 @@
  * PREEMPT_MASK: 0x000000ff
  * SOFTIRQ_MASK: 0x0000ff00
  * HARDIRQ_MASK: 0x0fff0000
+ * HARDNMI_MASK: 0x40000000
  */
 #define PREEMPT_BITS	8
 #define SOFTIRQ_BITS	8
 
+#define HARDNMI_BITS	1
+
 #ifndef HARDIRQ_BITS
 #define HARDIRQ_BITS	12
 
@@ -41,36 +44,43 @@
 # error HARDIRQ_BITS is too low!
 #endif
 #endif
+#define PREEMPT_ACTIVE_BITS	1
 
-#define PREEMPT_SHIFT	0
-#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
-#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
-
-#define __IRQ_MASK(x)	((1UL << (x))-1)
-
-#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
-#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
-#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
-
-#define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
-#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
-#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
+#define PREEMPT_SHIFT		0
+#define SOFTIRQ_SHIFT		(PREEMPT_SHIFT + PREEMPT_BITS)
+#define HARDIRQ_SHIFT		(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define HARDNMI_SHIFT  		(30)
+
+#define __IRQ_MASK(x)		((1UL << (x))-1)
+
+#define PREEMPT_MASK		(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
+#define SOFTIRQ_MASK		(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+#define HARDIRQ_MASK		(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+#define HARDNMI_MASK   		(__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)
+
+#define PREEMPT_OFFSET		(1UL << PREEMPT_SHIFT)
+#define SOFTIRQ_OFFSET		(1UL << SOFTIRQ_SHIFT)
+#define HARDIRQ_OFFSET		(1UL << HARDIRQ_SHIFT)
+#define HARDNMI_OFFSET 		(1UL << HARDNMI_SHIFT)
 
 #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
-#error PREEMPT_ACTIVE is too low!
+# error PREEMPT_ACTIVE is too low!
 #endif
 
+#define hardnmi_count()	(preempt_count() & HARDNMI_MASK)
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
-#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
+#define irq_count()	\
+	(preempt_count() & (HARDNMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
 
 /*
  * Are we doing bottom half or hardware interrupt processing?
  * Are we in a softirq context? Interrupt context?
  */
-#define in_irq()		(hardirq_count())
-#define in_softirq()		(softirq_count())
-#define in_interrupt()		(irq_count())
+#define in_nmi()	(hardnmi_count())
+#define in_irq()	(hardirq_count() || (current->flags & PF_HARDIRQ))
+#define in_softirq()	(softirq_count() || (current->flags & PF_SOFTIRQ))
+#define in_interrupt()	(irq_count())
 
 /*
  * Are we running in atomic context?  WARNING: this macro cannot
@@ -159,7 +169,19 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()					\
+	do {                                            \
+		lockdep_off();                          \
+		BUG_ON(hardnmi_count());                \
+		add_preempt_count(HARDNMI_OFFSET);      \
+		__irq_enter();                          \
+	} while (0)
+
+#define nmi_exit()					\
+	do {                                            \
+		__irq_exit();                           \
+		sub_preempt_count(HARDNMI_OFFSET);      \
+		lockdep_on();                           \
+	} while (0)
 
 #endif /* LINUX_HARDIRQ_H */
Index: linux-2.6.25.4-rt4/include/linux/kernel.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/kernel.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/kernel.h	2008-05-29 09:47:00.000000000 -0400
@@ -111,7 +111,7 @@ extern int _cond_resched(void);
 # define might_resched() do { } while (0)
 #endif
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
   void __might_sleep(char *file, int line);
 # define might_sleep() \
 	do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
@@ -212,6 +212,18 @@ static inline bool printk_timed_ratelimi
 extern void __attribute__((format(printf, 1, 2)))
 	early_printk(const char *fmt, ...);
 
+#ifdef CONFIG_PREEMPT_RT
+extern void zap_rt_locks(void);
+#else
+# define zap_rt_locks() do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+extern void zap_rt_locks(void);
+#else
+# define zap_rt_locks() do { } while (0)
+#endif
+
 unsigned long int_sqrt(unsigned long);
 
 static inline void console_silent(void)
@@ -230,6 +242,10 @@ extern void wake_up_klogd(void);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_timeout;
 extern int panic_on_oops;
+
+extern void pause_on_oops_head(void);
+extern void pause_on_oops_tail(void);
+
 extern int panic_on_unrecovered_nmi;
 extern int tainted;
 extern const char *print_tainted(void);
@@ -239,6 +255,7 @@ extern int root_mountflags;
 /* Values used for system_state */
 extern enum system_states {
 	SYSTEM_BOOTING,
+	SYSTEM_BOOTING_SCHEDULER_OK,
 	SYSTEM_RUNNING,
 	SYSTEM_HALT,
 	SYSTEM_POWER_OFF,
Index: linux-2.6.25.4-rt4/include/linux/radix-tree.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/radix-tree.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/radix-tree.h	2008-05-29 09:46:56.000000000 -0400
@@ -62,23 +62,65 @@ struct radix_tree_root {
 	unsigned int		height;
 	gfp_t			gfp_mask;
 	struct radix_tree_node	*rnode;
+	spinlock_t		lock;
 };
 
 #define RADIX_TREE_INIT(mask)	{					\
 	.height = 0,							\
 	.gfp_mask = (mask),						\
 	.rnode = NULL,							\
+	.lock = __SPIN_LOCK_UNLOCKED(radix_tree_root.lock),		\
 }
 
 #define RADIX_TREE(name, mask) \
 	struct radix_tree_root name = RADIX_TREE_INIT(mask)
 
-#define INIT_RADIX_TREE(root, mask)					\
-do {									\
-	(root)->height = 0;						\
-	(root)->gfp_mask = (mask);					\
-	(root)->rnode = NULL;						\
-} while (0)
+static inline void INIT_RADIX_TREE(struct radix_tree_root *root, gfp_t gfp_mask)
+{
+	root->height = 0;
+	root->gfp_mask = gfp_mask;
+	root->rnode = NULL;
+	spin_lock_init(&root->lock);
+}
+
+struct radix_tree_context {
+	struct radix_tree_root	*tree;
+	struct radix_tree_root	*root;
+#ifdef CONFIG_RADIX_TREE_CONCURRENT
+	spinlock_t		*locked;
+#endif
+};
+
+#ifdef CONFIG_RADIX_TREE_CONCURRENT
+#define RADIX_CONTEXT_ROOT(context)					\
+	((struct radix_tree_root *)(((unsigned long)context) + 1))
+
+#define __RADIX_TREE_CONTEXT_INIT(context, _tree)			\
+		.tree = RADIX_CONTEXT_ROOT(&context),			\
+		.locked = NULL,
+#else
+#define __RADIX_TREE_CONTEXT_INIT(context, _tree)			\
+		.tree = (_tree),
+#endif
+
+#define DEFINE_RADIX_TREE_CONTEXT(context, _tree) 			\
+	struct radix_tree_context context = { 				\
+		.root = (_tree), 					\
+		__RADIX_TREE_CONTEXT_INIT(context, _tree)		\
+       	}
+
+static inline void
+init_radix_tree_context(struct radix_tree_context *ctx,
+		struct radix_tree_root *root)
+{
+	ctx->root = root;
+#ifdef CONFIG_RADIX_TREE_CONCURRENT
+	ctx->tree = RADIX_CONTEXT_ROOT(ctx);
+	ctx->locked = NULL;
+#else
+	ctx->tree = root;
+#endif
+}
 
 /**
  * Radix-tree synchronization
@@ -99,12 +141,15 @@ do {									\
  *
  * The notable exceptions to this rule are the following functions:
  * radix_tree_lookup
+ * radix_tree_lookup_slot
  * radix_tree_tag_get
  * radix_tree_gang_lookup
+ * radix_tree_gang_lookup_slot
  * radix_tree_gang_lookup_tag
+ * radix_tree_gang_lookup_tag_slot
  * radix_tree_tagged
  *
- * The first 4 functions are able to be called locklessly, using RCU. The
+ * The first 7 functions are able to be called locklessly, using RCU. The
  * caller must ensure calls to these functions are made within rcu_read_lock()
  * regions. Other readers (lock-free or otherwise) and modifications may be
  * running concurrently.
@@ -152,6 +197,48 @@ static inline void radix_tree_replace_sl
 	rcu_assign_pointer(*pslot, item);
 }
 
+#if defined(CONFIG_RADIX_TREE_OPTIMISTIC)
+static inline void radix_tree_lock(struct radix_tree_context *context)
+{
+	rcu_read_lock();
+	BUG_ON(context->locked);
+}
+#elif defined(CONFIG_RADIX_TREE_CONCURRENT)
+static inline void radix_tree_lock(struct radix_tree_context *context)
+{
+	struct radix_tree_root *root = context->root;
+
+	rcu_read_lock();
+	spin_lock(&root->lock);
+	BUG_ON(context->locked);
+	context->locked = &root->lock;
+}
+#else
+static inline void radix_tree_lock(struct radix_tree_context *context)
+{
+	struct radix_tree_root *root = context->root;
+
+	rcu_read_lock();
+	spin_lock(&root->lock);
+}
+#endif
+
+#if defined(CONFIG_RADIX_TREE_CONCURRENT)
+static inline void radix_tree_unlock(struct radix_tree_context *context)
+{
+	BUG_ON(!context->locked);
+	spin_unlock(context->locked);
+	context->locked = NULL;
+	rcu_read_unlock();
+}
+#else
+static inline void radix_tree_unlock(struct radix_tree_context *context)
+{
+	spin_unlock(&context->root->lock);
+	rcu_read_unlock();
+}
+#endif
+
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
@@ -159,9 +246,23 @@ void *radix_tree_delete(struct radix_tre
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
+unsigned int
+radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+			unsigned long first_index, unsigned int max_items);
 unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan);
+/*
+ * On a mutex based kernel we can freely schedule within the radix code:
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline int radix_tree_preload(gfp_t gfp_mask)
+{
+	return 0;
+}
+#else
 int radix_tree_preload(gfp_t gfp_mask);
+#endif
+
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
@@ -173,11 +274,17 @@ unsigned int
 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		unsigned long first_index, unsigned int max_items,
 		unsigned int tag);
+unsigned int
+radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag);
 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	preempt_enable();
+#endif
 }
 
 #endif /* _LINUX_RADIX_TREE_H */
Index: linux-2.6.25.4-rt4/include/linux/smp_lock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/smp_lock.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/smp_lock.h	2008-05-29 09:46:43.000000000 -0400
@@ -32,7 +32,7 @@ extern void __lockfunc unlock_kernel(voi
 #define lock_kernel()				do { } while(0)
 #define unlock_kernel()				do { } while(0)
 #define release_kernel_lock(task)		do { } while(0)
-#define reacquire_kernel_lock(task)		0
+#define reacquire_kernel_lock(task)		do { } while(0)
 #define kernel_locked()				1
 
 #endif /* CONFIG_LOCK_KERNEL */
Index: linux-2.6.25.4-rt4/include/linux/workqueue.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/workqueue.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/workqueue.h	2008-05-29 09:47:11.000000000 -0400
@@ -9,6 +9,8 @@
 #include <linux/linkage.h>
 #include <linux/bitops.h>
 #include <linux/lockdep.h>
+#include <linux/plist.h>
+#include <linux/sched_prio.h>
 #include <asm/atomic.h>
 
 struct workqueue_struct;
@@ -27,7 +29,7 @@ struct work_struct {
 #define WORK_STRUCT_PENDING 0		/* T if work item pending execution */
 #define WORK_STRUCT_FLAG_MASK (3UL)
 #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
-	struct list_head entry;
+	struct plist_node entry;
 	work_func_t func;
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map lockdep_map;
@@ -39,6 +41,7 @@ struct work_struct {
 struct delayed_work {
 	struct work_struct work;
 	struct timer_list timer;
+	int prio;
 };
 
 struct execute_work {
@@ -59,7 +62,7 @@ struct execute_work {
 
 #define __WORK_INITIALIZER(n, f) {				\
 	.data = WORK_DATA_INIT(),				\
-	.entry	= { &(n).entry, &(n).entry },			\
+	.entry	= PLIST_NODE_INIT(n.entry, MAX_PRIO),		\
 	.func = (f),						\
 	__WORK_INIT_LOCKDEP_MAP(#n, &(n))			\
 	}
@@ -100,14 +103,14 @@ struct execute_work {
 									\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
 		lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0);\
-		INIT_LIST_HEAD(&(_work)->entry);			\
+		plist_node_init(&(_work)->entry, -1);			\
 		PREPARE_WORK((_work), (_func));				\
 	} while (0)
 #else
 #define INIT_WORK(_work, _func)						\
 	do {								\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
-		INIT_LIST_HEAD(&(_work)->entry);			\
+		plist_node_init(&(_work)->entry, -1);			\
 		PREPARE_WORK((_work), (_func));				\
 	} while (0)
 #endif
@@ -176,6 +179,9 @@ __create_workqueue_key(const char *name,
 #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1)
 #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)
 
+extern void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+			       int rt_priority, int nice);
+
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
 extern int queue_work(struct workqueue_struct *wq, struct work_struct *work);
@@ -191,7 +197,8 @@ extern int schedule_work(struct work_str
 extern int schedule_delayed_work(struct delayed_work *work, unsigned long delay);
 extern int schedule_delayed_work_on(int cpu, struct delayed_work *work,
 					unsigned long delay);
-extern int schedule_on_each_cpu(work_func_t func);
+extern int schedule_on_each_cpu_wq(struct workqueue_struct *wq, work_func_t func);
+extern int schedule_on_each_cpu(void (*func)(void *info), void *info, int retry, int wait);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
 
Index: linux-2.6.25.4-rt4/kernel/exit.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/exit.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/exit.c	2008-05-29 09:46:43.000000000 -0400
@@ -61,7 +61,9 @@ static void __unhash_process(struct task
 		detach_pid(p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
+		preempt_disable();
 		__get_cpu_var(process_counts)--;
+		preempt_enable();
 	}
 	list_del_rcu(&p->thread_group);
 	remove_parent(p);
@@ -609,9 +611,11 @@ static void exit_mm(struct task_struct *
 	task_lock(tsk);
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
+	preempt_disable(); // FIXME
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
+	preempt_enable();
 	task_unlock(tsk);
 	mmput(mm);
 }
@@ -1014,15 +1018,18 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	preempt_disable();
+again:
+	local_irq_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
 
-	schedule();
-	BUG();
-	/* Avoid "noreturn function does return".  */
-	for (;;)
-		cpu_relax();	/* For when BUG is null */
+	__schedule();
+	printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n",
+		current->comm, current->pid);
+	printk(KERN_ERR ".... flags: %08x, count: %d, state: %08lx\n",
+		current->flags, atomic_read(&current->usage), current->state);
+	printk(KERN_ERR ".... trying again ...\n");
+	goto again;
 }
 
 EXPORT_SYMBOL_GPL(do_exit);
@@ -1470,6 +1477,7 @@ repeat:
 
 		list_for_each_entry(p, &tsk->children, sibling) {
 			int ret = eligible_child(type, pid, options, p);
+			BUG_ON(!atomic_read(&p->usage));
 			if (!ret)
 				continue;
 
Index: linux-2.6.25.4-rt4/kernel/signal.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/signal.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/signal.c	2008-05-29 09:46:43.000000000 -0400
@@ -762,8 +762,10 @@ specific_send_sig_info(int sig, struct s
 {
 	int ret = 0;
 
-	BUG_ON(!irqs_disabled());
+	BUG_ON_NONRT(!irqs_disabled());
+#ifdef CONFIG_SMP
 	assert_spin_locked(&t->sighand->siglock);
+#endif
 
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(t, sig))
@@ -1627,6 +1629,7 @@ static void ptrace_stop(int exit_code, i
 	if (!unlikely(killed) && may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
+		current->flags &= ~PF_NOSCHED;
 		schedule();
 	} else {
 		/*
@@ -1695,6 +1698,7 @@ finish_stop(int stop_count)
 	}
 
 	do {
+		current->flags &= ~PF_NOSCHED;
 		schedule();
 	} while (try_to_freeze());
 	/*
@@ -1772,6 +1776,9 @@ relock:
 	 */
 	try_to_freeze();
 
+#ifdef CONFIG_PREEMPT_RT
+	might_sleep();
+#endif
 	spin_lock_irq(&current->sighand->siglock);
 	for (;;) {
 		struct k_sigaction *ka;
Index: linux-2.6.25.4-rt4/kernel/sys.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/sys.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/sys.c	2008-05-29 09:46:58.000000000 -0400
@@ -32,10 +32,12 @@
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/seccomp.h>
+#include <linux/hardirq.h>
 #include <linux/cpu.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
+#include <linux/rt_lock.h>
 #include <linux/kprobes.h>
 #include <linux/user_namespace.h>
 
@@ -264,6 +266,15 @@ out_unlock:
  */
 void emergency_restart(void)
 {
+	/*
+	 * Call the notifier chain if we are not in an
+	 * atomic context:
+	 */
+#ifdef CONFIG_PREEMPT
+	if (!in_atomic() && !irqs_disabled())
+		blocking_notifier_call_chain(&reboot_notifier_list,
+					     SYS_RESTART, NULL);
+#endif
 	machine_emergency_restart();
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
Index: linux-2.6.25.4-rt4/kernel/user.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/user.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/user.c	2008-05-29 09:47:13.000000000 -0400
@@ -265,14 +265,14 @@ static void remove_user_sysfs_dir(struct
 	 */
 	uids_mutex_lock();
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 
 	if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
 		uid_hash_remove(up);
 		remove_user = 1;
 		spin_unlock_irqrestore(&uidhash_lock, flags);
 	} else {
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 
 	if (!remove_user)
@@ -353,11 +353,11 @@ void free_uid(struct user_struct *up)
 	if (!up)
 		return;
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
 		free_user(up, flags);
 	else
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 }
 
 struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
Index: linux-2.6.25.4-rt4/kernel/workqueue.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/workqueue.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/workqueue.c	2008-05-29 09:47:11.000000000 -0400
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/syscalls.h>
 #include <linux/kthread.h>
 #include <linux/hardirq.h>
 #include <linux/mempolicy.h>
@@ -33,6 +34,11 @@
 #include <linux/kallsyms.h>
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
+#include <linux/wait.h>
+
+#include <asm/uaccess.h>
+
+struct wq_barrier;
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -42,7 +48,7 @@ struct cpu_workqueue_struct {
 
 	spinlock_t lock;
 
-	struct list_head worklist;
+	struct plist_head worklist;
 	wait_queue_head_t more_work;
 	struct work_struct *current_work;
 
@@ -50,6 +56,9 @@ struct cpu_workqueue_struct {
 	struct task_struct *thread;
 
 	int run_depth;		/* Detect run_workqueue() recursion depth */
+
+	wait_queue_head_t work_done;
+	struct wq_barrier *barrier;
 } ____cacheline_aligned;
 
 /*
@@ -125,7 +134,7 @@ struct cpu_workqueue_struct *get_wq_data
 }
 
 static void insert_work(struct cpu_workqueue_struct *cwq,
-				struct work_struct *work, int tail)
+		struct work_struct *work, int prio, int boost_prio)
 {
 	set_wq_data(work, cwq);
 	/*
@@ -133,21 +142,22 @@ static void insert_work(struct cpu_workq
 	 * result of list_add() below, see try_to_grab_pending().
 	 */
 	smp_wmb();
-	if (tail)
-		list_add_tail(&work->entry, &cwq->worklist);
-	else
-		list_add(&work->entry, &cwq->worklist);
+	plist_node_init(&work->entry, prio);
+	plist_add(&work->entry, &cwq->worklist);
+
+	if (boost_prio < cwq->thread->prio)
+		task_setprio(cwq->thread, boost_prio);
 	wake_up(&cwq->more_work);
 }
 
 /* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
-			 struct work_struct *work)
+			 struct work_struct *work, int prio)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&cwq->lock, flags);
-	insert_work(cwq, work, 1);
+	insert_work(cwq, work, prio, prio);
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
@@ -160,15 +170,16 @@ static void __queue_work(struct cpu_work
  *
  * We queue the work to the CPU it was submitted, but there is no
  * guarantee that it will be processed by that CPU.
+ *
+ * Especially no such guarantee on PREEMPT_RT.
  */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
-	int ret = 0;
+	int ret = 0, cpu = raw_smp_processor_id();
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
-		BUG_ON(!list_empty(&work->entry));
-		__queue_work(wq_per_cpu(wq, get_cpu()), work);
-		put_cpu();
+		BUG_ON(!plist_node_empty(&work->entry));
+		__queue_work(wq_per_cpu(wq, cpu), work, current->normal_prio);
 		ret = 1;
 	}
 	return ret;
@@ -181,7 +192,8 @@ static void delayed_work_timer_fn(unsign
 	struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
 	struct workqueue_struct *wq = cwq->wq;
 
-	__queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
+	__queue_work(wq_per_cpu(wq, raw_smp_processor_id()),
+			&dwork->work, dwork->prio);
 }
 
 /**
@@ -221,9 +233,10 @@ int queue_delayed_work_on(int cpu, struc
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
 		BUG_ON(timer_pending(timer));
-		BUG_ON(!list_empty(&work->entry));
+		BUG_ON(!plist_node_empty(&work->entry));
 
 		/* This stores cwq for the moment, for the timer_fn */
+		dwork->prio = current->normal_prio;
 		set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
 		timer->expires = jiffies + delay;
 		timer->data = (unsigned long)dwork;
@@ -239,8 +252,34 @@ int queue_delayed_work_on(int cpu, struc
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
+static void leak_check(void *func)
+{
+	if (!in_atomic() && lockdep_depth(current) <= 0)
+		return;
+	printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+				"%s/0x%08x/%d\n",
+				current->comm, preempt_count(),
+				current->pid);
+	printk(KERN_ERR "    last function: ");
+	print_symbol("%s\n", (unsigned long)func);
+	debug_show_held_locks(current);
+	dump_stack();
+}
+
+struct wq_barrier {
+	struct work_struct		work;
+	struct plist_head		worklist;
+	struct wq_barrier 		*prev_barrier;
+	int				prev_prio;
+	int				waiter_prio;
+	struct cpu_workqueue_struct 	*cwq;
+	struct completion		done;
+};
+
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
+	struct plist_head *worklist = &cwq->worklist;
+
 	spin_lock_irq(&cwq->lock);
 	cwq->run_depth++;
 	if (cwq->run_depth > 3) {
@@ -249,8 +288,11 @@ static void run_workqueue(struct cpu_wor
 			__FUNCTION__, cwq->run_depth);
 		dump_stack();
 	}
-	while (!list_empty(&cwq->worklist)) {
-		struct work_struct *work = list_entry(cwq->worklist.next,
+
+again:
+	while (!plist_head_empty(worklist)) {
+		int prio;
+		struct work_struct *work = plist_first_entry(worklist,
 						struct work_struct, entry);
 		work_func_t f = work->func;
 #ifdef CONFIG_LOCKDEP
@@ -265,32 +307,56 @@ static void run_workqueue(struct cpu_wor
 		struct lockdep_map lockdep_map = work->lockdep_map;
 #endif
 
+		prio = work->entry.prio;
+		if (unlikely(worklist != &cwq->worklist)) {
+			prio = min(prio, cwq->barrier->prev_prio);
+			prio = min(prio, cwq->barrier->waiter_prio);
+			prio = min(prio, plist_first(&cwq->worklist)->prio);
+		}
+		prio = max(prio, 0);
+
+		if (likely(cwq->thread->prio != prio))
+			task_setprio(cwq->thread, prio);
+
 		cwq->current_work = work;
-		list_del_init(cwq->worklist.next);
+		plist_del(&work->entry, worklist);
+		plist_node_init(&work->entry, MAX_PRIO);
 		spin_unlock_irq(&cwq->lock);
 
 		BUG_ON(get_wq_data(work) != cwq);
 		work_clear_pending(work);
+		leak_check(NULL);
 		lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
 		lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_);
 		f(work);
 		lock_release(&lockdep_map, 1, _THIS_IP_);
 		lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
-
-		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
-			printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
-					"%s/0x%08x/%d\n",
-					current->comm, preempt_count(),
-				       	task_pid_nr(current));
-			printk(KERN_ERR "    last function: ");
-			print_symbol("%s\n", (unsigned long)f);
-			debug_show_held_locks(current);
-			dump_stack();
-		}
+		leak_check(f);
 
 		spin_lock_irq(&cwq->lock);
 		cwq->current_work = NULL;
+		wake_up_all(&cwq->work_done);
+		if (unlikely(cwq->barrier))
+			worklist = &cwq->barrier->worklist;
+	}
+
+	if (unlikely(worklist != &cwq->worklist)) {
+		struct wq_barrier *barrier = cwq->barrier;
+
+		BUG_ON(!barrier);
+		cwq->barrier = barrier->prev_barrier;
+		complete(&barrier->done);
+
+		if (unlikely(cwq->barrier))
+			worklist = &cwq->barrier->worklist;
+		else
+			worklist = &cwq->worklist;
+
+		if (!plist_head_empty(worklist))
+			goto again;
 	}
+
+	task_setprio(cwq->thread, current->normal_prio);
 	cwq->run_depth--;
 	spin_unlock_irq(&cwq->lock);
 }
@@ -309,7 +375,7 @@ static int worker_thread(void *__cwq)
 		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
 		if (!freezing(current) &&
 		    !kthread_should_stop() &&
-		    list_empty(&cwq->worklist))
+		    plist_head_empty(&cwq->worklist))
 			schedule();
 		finish_wait(&cwq->more_work, &wait);
 
@@ -324,26 +390,37 @@ static int worker_thread(void *__cwq)
 	return 0;
 }
 
-struct wq_barrier {
-	struct work_struct	work;
-	struct completion	done;
-};
-
 static void wq_barrier_func(struct work_struct *work)
 {
-	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
-	complete(&barr->done);
+	struct wq_barrier *barrier =
+		container_of(work, struct wq_barrier, work);
+	struct cpu_workqueue_struct *cwq = barrier->cwq;
+	int prio = MAX_PRIO;
+
+	spin_lock_irq(&cwq->lock);
+	barrier->prev_barrier = cwq->barrier;
+	if (cwq->barrier) {
+		prio = min(prio, cwq->barrier->waiter_prio);
+		prio = min(prio, plist_first(&cwq->barrier->worklist)->prio);
+	}
+	barrier->prev_prio = prio;
+	cwq->barrier = barrier;
+	spin_unlock_irq(&cwq->lock);
 }
 
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
-					struct wq_barrier *barr, int tail)
+		struct wq_barrier *barr)
 {
 	INIT_WORK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
 
+	plist_head_init(&barr->worklist, NULL);
+	plist_head_splice(&cwq->worklist, &barr->worklist);
+	barr->cwq = cwq;
 	init_completion(&barr->done);
+	barr->waiter_prio = current->prio;
 
-	insert_work(cwq, &barr->work, tail);
+	insert_work(cwq, &barr->work, 0, current->prio);
 }
 
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -362,8 +439,9 @@ static int flush_cpu_workqueue(struct cp
 
 		active = 0;
 		spin_lock_irq(&cwq->lock);
-		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-			insert_wq_barrier(cwq, &barr, 1);
+		if (!plist_head_empty(&cwq->worklist) ||
+			cwq->current_work != NULL) {
+			insert_wq_barrier(cwq, &barr);
 			active = 1;
 		}
 		spin_unlock_irq(&cwq->lock);
@@ -423,7 +501,7 @@ static int try_to_grab_pending(struct wo
 		return ret;
 
 	spin_lock_irq(&cwq->lock);
-	if (!list_empty(&work->entry)) {
+	if (!plist_node_empty(&work->entry)) {
 		/*
 		 * This work is queued, but perhaps we locked the wrong cwq.
 		 * In that case we must see the new value after rmb(), see
@@ -431,7 +509,8 @@ static int try_to_grab_pending(struct wo
 		 */
 		smp_rmb();
 		if (cwq == get_wq_data(work)) {
-			list_del_init(&work->entry);
+			plist_del(&work->entry, &cwq->worklist);
+			plist_node_init(&work->entry, MAX_PRIO);
 			ret = 1;
 		}
 	}
@@ -440,21 +519,24 @@ static int try_to_grab_pending(struct wo
 	return ret;
 }
 
-static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
-				struct work_struct *work)
+static inline
+int is_current_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
 {
-	struct wq_barrier barr;
-	int running = 0;
+	int ret;
 
 	spin_lock_irq(&cwq->lock);
-	if (unlikely(cwq->current_work == work)) {
-		insert_wq_barrier(cwq, &barr, 0);
-		running = 1;
-	}
+	ret = (cwq->current_work == work);
 	spin_unlock_irq(&cwq->lock);
 
-	if (unlikely(running))
-		wait_for_completion(&barr.done);
+	return ret;
+}
+
+static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
+				struct work_struct *work)
+{
+	DEFINE_WAIT(wait);
+
+	wait_event(cwq->work_done, !is_current_work(cwq, work));
 }
 
 static void wait_on_work(struct work_struct *work)
@@ -584,38 +666,121 @@ int schedule_delayed_work_on(int cpu,
 }
 EXPORT_SYMBOL(schedule_delayed_work_on);
 
+struct schedule_on_each_cpu_work {
+	struct work_struct work;
+	void (*func)(void *info);
+	void *info;
+};
+
+static void schedule_on_each_cpu_func(struct work_struct *work)
+{
+	struct schedule_on_each_cpu_work *w;
+
+	w = container_of(work, typeof(*w), work);
+	w->func(w->info);
+
+	kfree(w);
+}
+
 /**
  * schedule_on_each_cpu - call a function on each online CPU from keventd
  * @func: the function to call
+ * @info: data to pass to function
+ * @retry: ignored
+ * @wait: wait for completion
+ *
+ * Returns zero on success.
+ * Returns -ve errno on failure.
+ *
+ * schedule_on_each_cpu() is very slow.
+ */
+int schedule_on_each_cpu(void (*func)(void *info), void *info, int retry, int wait)
+{
+	int cpu;
+	struct schedule_on_each_cpu_work **works;
+	int err = 0;
+
+	works = kzalloc(sizeof(void *)*nr_cpu_ids, GFP_KERNEL);
+	if (!works)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		works[cpu] = kmalloc_node(sizeof(struct schedule_on_each_cpu_work),
+				GFP_KERNEL, cpu_to_node(cpu));
+		if (!works[cpu]) {
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		struct schedule_on_each_cpu_work *work;
+
+		work = works[cpu];
+		works[cpu] = NULL;
+
+		work->func = func;
+		work->info = info;
+		INIT_WORK(&work->work, schedule_on_each_cpu_func);
+		set_bit(WORK_STRUCT_PENDING, work_data_bits(&work->work));
+		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
+				&work->work, current->normal_prio);
+	}
+	put_online_cpus();
+out:
+	for_each_possible_cpu(cpu) {
+		if (works[cpu])
+			kfree(works[cpu]);
+	}
+	kfree(works);
+
+	if (!err && wait)
+		flush_workqueue(keventd_wq);
+
+	return err;
+}
+EXPORT_SYMBOL(schedule_on_each_cpu);
+
+/**
+ * schedule_on_each_cpu_wq - call a function on each online CPU on a per-CPU wq
+ * @func: the function to call
  *
  * Returns zero on success.
  * Returns -ve errno on failure.
  *
+ * Appears to be racy against CPU hotplug.
+ *
  * schedule_on_each_cpu() is very slow.
  */
-int schedule_on_each_cpu(work_func_t func)
+int schedule_on_each_cpu_wq(struct workqueue_struct *wq, work_func_t func)
 {
 	int cpu;
 	struct work_struct *works;
 
+	if (is_single_threaded(wq)) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
 	works = alloc_percpu(struct work_struct);
 	if (!works)
 		return -ENOMEM;
 
-	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
 		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
-		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
+		__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work,
+				current->normal_prio);
 	}
-	flush_workqueue(keventd_wq);
-	put_online_cpus();
+	flush_workqueue(wq);
 	free_percpu(works);
+
 	return 0;
 }
 
+
 void flush_scheduled_work(void)
 {
 	flush_workqueue(keventd_wq);
@@ -676,8 +841,10 @@ init_cpu_workqueue(struct workqueue_stru
 
 	cwq->wq = wq;
 	spin_lock_init(&cwq->lock);
-	INIT_LIST_HEAD(&cwq->worklist);
+	plist_head_init(&cwq->worklist, NULL);
 	init_waitqueue_head(&cwq->more_work);
+	cwq->barrier = NULL;
+	init_waitqueue_head(&cwq->work_done);
 
 	return cwq;
 }
@@ -797,6 +964,49 @@ static void cleanup_workqueue_thread(str
 	cwq->thread = NULL;
 }
 
+void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu,
+			       int policy, int rt_priority, int nice)
+{
+	struct sched_param param = { .sched_priority = rt_priority };
+	struct cpu_workqueue_struct *cwq;
+	mm_segment_t oldfs = get_fs();
+	struct task_struct *p;
+	unsigned long flags;
+	int ret;
+
+	cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+	spin_lock_irqsave(&cwq->lock, flags);
+	p = cwq->thread;
+	spin_unlock_irqrestore(&cwq->lock, flags);
+
+	set_user_nice(p, nice);
+
+	set_fs(KERNEL_DS);
+	ret = sys_sched_setscheduler(p->pid, policy, &param);
+	set_fs(oldfs);
+
+	WARN_ON(ret);
+}
+
+void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+			int rt_priority, int nice)
+{
+	int cpu;
+
+	/* We don't need the distraction of CPUs appearing and vanishing. */
+	get_online_cpus();
+	spin_lock(&workqueue_lock);
+	if (is_single_threaded(wq))
+		set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice);
+	else {
+		for_each_online_cpu(cpu)
+			set_workqueue_thread_prio(wq, cpu, policy,
+						  rt_priority, nice);
+	}
+	spin_unlock(&workqueue_lock);
+	put_online_cpus();
+}
+
 /**
  * destroy_workqueue - safely terminate a workqueue
  * @wq: target workqueue
@@ -875,4 +1085,5 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
+	set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20);
 }
Index: linux-2.6.25.4-rt4/lib/radix-tree.c
===================================================================
--- linux-2.6.25.4-rt4.orig/lib/radix-tree.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/lib/radix-tree.c	2008-05-29 09:47:20.000000000 -0400
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
+#include <linux/spinlock.h>
 
 
 #ifdef __KERNEL__
@@ -52,11 +53,17 @@ struct radix_tree_node {
 	struct rcu_head	rcu_head;
 	void		*slots[RADIX_TREE_MAP_SIZE];
 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+#ifdef CONFIG_RADIX_TREE_CONCURRENT
+	spinlock_t	lock;
+#endif
 };
 
 struct radix_tree_path {
 	struct radix_tree_node *node;
 	int offset;
+#ifdef CONFIG_RADIX_TREE_CONCURRENT
+	spinlock_t *locked;
+#endif
 };
 
 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
@@ -69,6 +76,139 @@ struct radix_tree_path {
  */
 static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
 
+#ifdef CONFIG_RADIX_TREE_CONCURRENT
+static struct lock_class_key radix_node_class[RADIX_TREE_MAX_PATH];
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static const char *radix_node_key_string[RADIX_TREE_MAX_PATH] = {
+	"radix-node-00",
+	"radix-node-01",
+	"radix-node-02",
+	"radix-node-03",
+	"radix-node-04",
+	"radix-node-05",
+	"radix-node-06",
+	"radix-node-07",
+	"radix-node-08",
+	"radix-node-09",
+	"radix-node-10",
+	"radix-node-11",
+	"radix-node-12",
+	"radix-node-13",
+	"radix-node-14",
+	"radix-node-15",
+};
+#endif
+
+#ifdef CONFIG_RADIX_TREE_OPTIMISTIC
+static DEFINE_PER_CPU(unsigned long[RADIX_TREE_MAX_PATH+1], optimistic_histogram);
+
+static void optimistic_hit(unsigned long height)
+{
+	unsigned long flags;
+
+	if (height > RADIX_TREE_MAX_PATH)
+		height = RADIX_TREE_MAX_PATH;
+
+	/*
+	 * HACK:
+	 *  On PREEMPT_RT this protects the percpu var.
+	 *  We really need to fix this to pass in cpu var protected
+	 *  with a RT sleeping spinlock.
+	 */
+	local_irq_save(flags);
+	__get_cpu_var(optimistic_histogram)[height]++;
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos < 0 || *pos > RADIX_TREE_MAX_PATH)
+		return NULL;
+
+	m->private = (void *)(unsigned long)*pos;
+	return pos;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	if (*pos < RADIX_TREE_MAX_PATH) {
+		(*pos)++;
+		(*((unsigned long *)&m->private))++;
+		return pos;
+	}
+	return NULL;
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+unsigned long get_optimistic_stat(unsigned long index)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		total += per_cpu(optimistic_histogram, cpu)[index];
+	}
+	return total;
+}
+
+static int frag_show(struct seq_file *m, void *arg)
+{
+	unsigned long index = (unsigned long)m->private;
+	unsigned long hits = get_optimistic_stat(index);
+
+	if (index == 0)
+		seq_printf(m, "levels skipped\thits\n");
+
+	if (index < RADIX_TREE_MAX_PATH)
+		seq_printf(m, "%9lu\t%9lu\n", index, hits);
+	else
+		seq_printf(m, "failed\t%9lu\n", hits);
+
+	return 0;
+}
+
+struct seq_operations optimistic_op = {
+	.start = frag_start,
+	.next = frag_next,
+	.stop = frag_stop,
+	.show = frag_show,
+};
+
+static void optimistic_reset(void)
+{
+	int cpu;
+	int height;
+	for_each_possible_cpu(cpu) {
+		for (height = 0; height <= RADIX_TREE_MAX_PATH; height++)
+			per_cpu(optimistic_histogram, cpu)[height] = 0;
+	}
+}
+
+ssize_t optimistic_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	if (count) {
+		char c;
+		if (get_user(c, buf))
+			return -EFAULT;
+		if (c == '0')
+			optimistic_reset();
+	}
+	return count;
+}
+
+#endif // CONFIG_PROC_FS
+#endif // CONFIG_RADIX_TREE_OPTIMISTIC
+
 /*
  * Radix tree node cache.
  */
@@ -93,7 +233,7 @@ static inline gfp_t root_gfp_mask(struct
  * that the caller has pinned this thread of control to the current CPU.
  */
 static struct radix_tree_node *
-radix_tree_node_alloc(struct radix_tree_root *root)
+radix_tree_node_alloc(struct radix_tree_root *root, int height)
 {
 	struct radix_tree_node *ret = NULL;
 	gfp_t gfp_mask = root_gfp_mask(root);
@@ -106,18 +246,26 @@ radix_tree_node_alloc(struct radix_tree_
 		 * succeed in getting a node here (and never reach
 		 * kmem_cache_alloc)
 		 */
-		rtp = &__get_cpu_var(radix_tree_preloads);
+		rtp = &get_cpu_var(radix_tree_preloads);
 		if (rtp->nr) {
 			ret = rtp->nodes[rtp->nr - 1];
 			rtp->nodes[rtp->nr - 1] = NULL;
 			rtp->nr--;
 		}
+		put_cpu_var(radix_tree_preloads);
 	}
 	if (ret == NULL)
 		ret = kmem_cache_alloc(radix_tree_node_cachep,
 				set_migrateflags(gfp_mask, __GFP_RECLAIMABLE));
 
 	BUG_ON(radix_tree_is_indirect_ptr(ret));
+#ifdef CONFIG_RADIX_TREE_CONCURRENT
+	spin_lock_init(&ret->lock);
+	lockdep_set_class_and_name(&ret->lock,
+			&radix_node_class[height],
+			radix_node_key_string[height]);
+#endif
+	ret->height = height;
 	return ret;
 }
 
@@ -134,6 +282,8 @@ radix_tree_node_free(struct radix_tree_n
 	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
 }
 
+#ifndef CONFIG_PREEMPT_RT
+
 /*
  * Load up this CPU's radix_tree_node buffer with sufficient objects to
  * ensure that the addition of a single element in the tree cannot fail.  On
@@ -167,6 +317,8 @@ out:
 }
 EXPORT_SYMBOL(radix_tree_preload);
 
+#endif
+
 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
 		int offset)
 {
@@ -220,6 +372,22 @@ static inline int any_tag_set(struct rad
 	return 0;
 }
 
+static inline int any_tag_set_but(struct radix_tree_node *node,
+		unsigned int tag, int offset)
+{
+	int idx;
+	int offset_idx = offset / BITS_PER_LONG;
+	unsigned long offset_mask = ~(1UL << (offset % BITS_PER_LONG));
+	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+		unsigned long mask = ~0UL;
+		if (idx == offset_idx)
+			mask = offset_mask;
+		if (node->tags[tag][idx] & mask)
+			return 1;
+	}
+	return 0;
+}
+
 /*
  *	Return the maximum key which can be store into a
  *	radix tree with height HEIGHT.
@@ -249,8 +417,8 @@ static int radix_tree_extend(struct radi
 	}
 
 	do {
-		unsigned int newheight;
-		if (!(node = radix_tree_node_alloc(root)))
+		unsigned int newheight = root->height + 1;
+		if (!(node = radix_tree_node_alloc(root, newheight)))
 			return -ENOMEM;
 
 		/* Increase the height.  */
@@ -262,8 +430,6 @@ static int radix_tree_extend(struct radi
 				tag_set(node, tag, 0);
 		}
 
-		newheight = root->height+1;
-		node->height = newheight;
 		node->count = 1;
 		node = radix_tree_ptr_to_indirect(node);
 		rcu_assign_pointer(root->rnode, node);
@@ -273,6 +439,193 @@ out:
 	return 0;
 }
 
+#ifdef CONFIG_RADIX_TREE_CONCURRENT
+static inline struct radix_tree_context *
+radix_tree_get_context(struct radix_tree_root **rootp)
+{
+	struct radix_tree_context *context = NULL;
+	unsigned long addr = (unsigned long)*rootp;
+
+	if (addr & 1) {
+		context = (struct radix_tree_context *)(addr - 1);
+		*rootp = context->root;
+	}
+
+	return context;
+}
+
+#define RADIX_TREE_CONTEXT(context, root) \
+	struct radix_tree_context *context =	\
+		radix_tree_get_context(&root)
+
+static inline spinlock_t *radix_node_lock(struct radix_tree_root *root,
+		struct radix_tree_node *node)
+{
+	spinlock_t *locked = &node->lock;
+	spin_lock(locked);
+	return locked;
+}
+
+static inline void radix_ladder_lock(struct radix_tree_context *context,
+		struct radix_tree_node *node)
+{
+	if (context) {
+		struct radix_tree_root *root = context->root;
+		spinlock_t *locked = radix_node_lock(root, node);
+		if (locked) {
+			spin_unlock(context->locked);
+			context->locked = locked;
+		}
+	}
+}
+
+static inline void radix_path_init(struct radix_tree_context *context,
+		struct radix_tree_path *pathp)
+{
+	pathp->locked = context ? context->locked : NULL;
+}
+
+static inline void radix_path_lock(struct radix_tree_context *context,
+		struct radix_tree_path *pathp, struct radix_tree_node *node)
+{
+	if (context) {
+		struct radix_tree_root *root = context->root;
+		spinlock_t *locked = radix_node_lock(root, node);
+		if (locked)
+			context->locked = locked;
+		pathp->locked = locked;
+	} else
+		pathp->locked = NULL;
+}
+
+static inline void radix_path_unlock(struct radix_tree_context *context,
+		struct radix_tree_path *punlock)
+{
+	if (context && punlock->locked &&
+			context->locked != punlock->locked)
+		spin_unlock(punlock->locked);
+}
+#else
+#define RADIX_TREE_CONTEXT(context, root) do { } while (0)
+#define radix_ladder_lock(context, node) do { } while (0)
+#define radix_path_init(context, pathp) do { } while (0)
+#define radix_path_lock(context, pathp, node) do { } while (0)
+#define radix_path_unlock(context, punlock) do { } while (0)
+#endif
+
+#ifdef CONFIG_RADIX_TREE_OPTIMISTIC
+typedef int (*radix_valid_fn)(struct radix_tree_node *, int, int);
+
+static struct radix_tree_node *
+radix_optimistic_lookup(struct radix_tree_context *context, unsigned long index,
+		int tag, radix_valid_fn valid)
+{
+	unsigned int height, shift;
+	struct radix_tree_node *node, *ret = NULL, **slot;
+	struct radix_tree_root *root = context->root;
+
+	node = rcu_dereference(root->rnode);
+	if (node == NULL)
+		return NULL;
+
+	if (!radix_tree_is_indirect_ptr(node))
+			return NULL;
+
+	node = radix_tree_indirect_to_ptr(node);
+
+	height = node->height;
+	if (index > radix_tree_maxindex(height))
+		return NULL;
+
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	do {
+		int offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		if ((*valid)(node, offset, tag))
+			ret = node;
+		slot = (struct radix_tree_node **)(node->slots + offset);
+		node = rcu_dereference(*slot);
+		if (!node)
+			break;
+
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	} while (height > 0);
+
+	return ret;
+}
+
+static struct radix_tree_node *
+__radix_optimistic_lock(struct radix_tree_context *context, unsigned long index,
+	       	int tag, radix_valid_fn valid)
+{
+	struct radix_tree_node *node;
+	spinlock_t *locked;
+	unsigned int shift, offset;
+
+	node = radix_optimistic_lookup(context, index, tag, valid);
+	if (!node)
+		goto out;
+
+	locked = radix_node_lock(context->root, node);
+	if (!locked)
+		goto out;
+
+#if 0
+	if (node != radix_optimistic_lookup(context, index, tag, valid))
+		goto out_unlock;
+#else
+	/* check if the node got freed */
+	if (!node->count)
+		goto out_unlock;
+
+	/* check if the node is still a valid termination point */
+	shift = (node->height - 1) * RADIX_TREE_MAP_SHIFT;
+	offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+	if (!(*valid)(node, offset, tag))
+		goto out_unlock;
+#endif
+
+	context->locked = locked;
+	return node;
+
+out_unlock:
+	spin_unlock(locked);
+out:
+	return NULL;
+}
+
+static struct radix_tree_node *
+radix_optimistic_lock(struct radix_tree_context *context, unsigned long index,
+		int tag, radix_valid_fn valid)
+{
+	struct radix_tree_node *node = NULL;
+
+	if (context) {
+		node = __radix_optimistic_lock(context, index, tag, valid);
+		if (!node) {
+			BUG_ON(context->locked);
+			spin_lock(&context->root->lock);
+			context->locked = &context->root->lock;
+			optimistic_hit(RADIX_TREE_MAX_PATH);
+		} else
+			optimistic_hit(context->root->height - node->height);
+	}
+	return node;
+}
+
+static int radix_valid_always(struct radix_tree_node *node, int offset, int tag)
+{
+	return 1;
+}
+
+static int radix_valid_tag(struct radix_tree_node *node, int offset, int tag)
+{
+	return tag_get(node, tag, offset);
+}
+#else
+#define radix_optimistic_lock(context, index, tag, valid) NULL
+#endif
+
 /**
  *	radix_tree_insert    -    insert into a radix tree
  *	@root:		radix tree root
@@ -288,9 +641,18 @@ int radix_tree_insert(struct radix_tree_
 	unsigned int height, shift;
 	int offset;
 	int error;
+	int tag;
+	RADIX_TREE_CONTEXT(context, root);
 
 	BUG_ON(radix_tree_is_indirect_ptr(item));
 
+	node = radix_optimistic_lock(context, index, 0, radix_valid_always);
+	if (node) {
+		height = node->height;
+		shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+		goto optimistic;
+	}
+
 	/* Make sure the tree is high enough.  */
 	if (index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
@@ -299,7 +661,6 @@ int radix_tree_insert(struct radix_tree_
 	}
 
 	slot = radix_tree_indirect_to_ptr(root->rnode);
-
 	height = root->height;
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
@@ -307,9 +668,8 @@ int radix_tree_insert(struct radix_tree_
 	while (height > 0) {
 		if (slot == NULL) {
 			/* Have to add a child node.  */
-			if (!(slot = radix_tree_node_alloc(root)))
+			if (!(slot = radix_tree_node_alloc(root, height)))
 				return -ENOMEM;
-			slot->height = height;
 			if (node) {
 				rcu_assign_pointer(node->slots[offset], slot);
 				node->count++;
@@ -319,8 +679,11 @@ int radix_tree_insert(struct radix_tree_
 		}
 
 		/* Go a level down */
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		node = slot;
+		radix_ladder_lock(context, node);
+
+optimistic:
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		slot = node->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
@@ -332,12 +695,12 @@ int radix_tree_insert(struct radix_tree_
 	if (node) {
 		node->count++;
 		rcu_assign_pointer(node->slots[offset], item);
-		BUG_ON(tag_get(node, 0, offset));
-		BUG_ON(tag_get(node, 1, offset));
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			BUG_ON(tag_get(node, tag, offset));
 	} else {
 		rcu_assign_pointer(root->rnode, item);
-		BUG_ON(root_tag_get(root, 0));
-		BUG_ON(root_tag_get(root, 1));
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			BUG_ON(root_tag_get(root, tag));
 	}
 
 	return 0;
@@ -352,18 +715,22 @@ EXPORT_SYMBOL(radix_tree_insert);
  *	Returns:  the slot corresponding to the position @index in the
  *	radix tree @root. This is useful for update-if-exists operations.
  *
- *	This function cannot be called under rcu_read_lock, it must be
- *	excluded from writers, as must the returned slot for subsequent
- *	use by radix_tree_deref_slot() and radix_tree_replace slot.
- *	Caller must hold tree write locked across slot lookup and
- *	replace.
+ *	This function can be called under rcu_read_lock iff the slot is not
+ *	modified by radix_tree_replace_slot, otherwise it must be called
+ *	exclusive from other writers. Any dereference of the slot must be done
+ *	using radix_tree_deref_slot.
  */
 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
 {
 	unsigned int height, shift;
 	struct radix_tree_node *node, **slot;
+	RADIX_TREE_CONTEXT(context, root);
+
+	node = radix_optimistic_lock(context, index, 0, radix_valid_always);
+	if (node)
+		goto optimistic;
 
-	node = root->rnode;
+	node = rcu_dereference(root->rnode);
 	if (node == NULL)
 		return NULL;
 
@@ -374,6 +741,7 @@ void **radix_tree_lookup_slot(struct rad
 	}
 	node = radix_tree_indirect_to_ptr(node);
 
+optimistic:
 	height = node->height;
 	if (index > radix_tree_maxindex(height))
 		return NULL;
@@ -383,10 +751,12 @@ void **radix_tree_lookup_slot(struct rad
 	do {
 		slot = (struct radix_tree_node **)
 			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
-		node = *slot;
+		node = rcu_dereference(*slot);
 		if (node == NULL)
 			return NULL;
 
+		radix_ladder_lock(context, node);
+
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	} while (height > 0);
@@ -462,6 +832,14 @@ void *radix_tree_tag_set(struct radix_tr
 {
 	unsigned int height, shift;
 	struct radix_tree_node *slot;
+	RADIX_TREE_CONTEXT(context, root);
+
+	slot = radix_optimistic_lock(context, index, tag, radix_valid_tag);
+	if (slot) {
+		height = slot->height;
+		shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+		goto optimistic;
+	}
 
 	height = root->height;
 	BUG_ON(index > radix_tree_maxindex(height));
@@ -469,9 +847,16 @@ void *radix_tree_tag_set(struct radix_tr
 	slot = radix_tree_indirect_to_ptr(root->rnode);
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 
+	/* set the root's tag bit */
+	if (slot && !root_tag_get(root, tag))
+		root_tag_set(root, tag);
+
 	while (height > 0) {
 		int offset;
 
+		radix_ladder_lock(context, slot);
+
+optimistic:
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		if (!tag_get(slot, tag, offset))
 			tag_set(slot, tag, offset);
@@ -481,14 +866,24 @@ void *radix_tree_tag_set(struct radix_tr
 		height--;
 	}
 
-	/* set the root's tag bit */
-	if (slot && !root_tag_get(root, tag))
-		root_tag_set(root, tag);
-
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
 
+/*
+ * the change can never propagate upwards from here.
+ */
+static
+int radix_valid_tag_clear(struct radix_tree_node *node, int offset, int tag)
+{
+	int this, other;
+
+	this = tag_get(node, tag, offset);
+	other = any_tag_set_but(node, tag, offset);
+
+	return !this || other;
+}
+
 /**
  *	radix_tree_tag_clear - clear a tag on a radix tree node
  *	@root:		radix tree root
@@ -511,28 +906,51 @@ void *radix_tree_tag_clear(struct radix_
 	 * since the "list" is null terminated.
 	 */
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
+	struct radix_tree_path *punlock = path, *piter;
 	struct radix_tree_node *slot = NULL;
-	unsigned int height, shift;
+	unsigned int height, shift, offset;
+
+ 	RADIX_TREE_CONTEXT(context, root);
+
+	slot = radix_optimistic_lock(context, index, tag,
+			radix_valid_tag_clear);
+	if (slot) {
+		height = slot->height;
+		shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp->offset = offset;
+		pathp->node = slot;
+		radix_path_init(context, pathp);
+		goto optimistic;
+	}
+
+ 	pathp->node = NULL;
+ 	radix_path_init(context, pathp);
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
 		goto out;
 
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
 	slot = radix_tree_indirect_to_ptr(root->rnode);
 
 	while (height > 0) {
-		int offset;
-
 		if (slot == NULL)
 			goto out;
 
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp[1].offset = offset;
-		pathp[1].node = slot;
-		slot = slot->slots[offset];
 		pathp++;
+		pathp->offset = offset;
+		pathp->node = slot;
+		radix_path_lock(context, pathp, slot);
+
+		if (radix_valid_tag_clear(slot, offset, tag)) {
+			for (; punlock < pathp; punlock++)
+				radix_path_unlock(context, punlock);
+		}
+
+optimistic:
+		slot = slot->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	}
@@ -540,20 +958,22 @@ void *radix_tree_tag_clear(struct radix_
 	if (slot == NULL)
 		goto out;
 
-	while (pathp->node) {
-		if (!tag_get(pathp->node, tag, pathp->offset))
-			goto out;
-		tag_clear(pathp->node, tag, pathp->offset);
-		if (any_tag_set(pathp->node, tag))
-			goto out;
-		pathp--;
+	for (piter = pathp; piter >= punlock; piter--) {
+		if (piter->node) {
+			if (!tag_get(piter->node, tag, piter->offset))
+				break;
+			tag_clear(piter->node, tag, piter->offset);
+			if (any_tag_set(piter->node, tag))
+				break;
+		} else {
+			if (root_tag_get(root, tag))
+				root_tag_clear(root, tag);
+		}
 	}
 
-	/* clear the root's tag bit */
-	if (root_tag_get(root, tag))
-		root_tag_clear(root, tag);
-
 out:
+	for (; punlock < pathp; punlock++)
+		radix_path_unlock(context, punlock);
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
@@ -660,7 +1080,7 @@ unsigned long radix_tree_next_hole(struc
 EXPORT_SYMBOL(radix_tree_next_hole);
 
 static unsigned int
-__lookup(struct radix_tree_node *slot, void **results, unsigned long index,
+__lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)
 {
 	unsigned int nr_found = 0;
@@ -694,11 +1114,9 @@ __lookup(struct radix_tree_node *slot, v
 
 	/* Bottom level: grab some items */
 	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
-		struct radix_tree_node *node;
 		index++;
-		node = slot->slots[i];
-		if (node) {
-			results[nr_found++] = rcu_dereference(node);
+		if (slot->slots[i]) {
+			results[nr_found++] = &(slot->slots[i]);
 			if (nr_found == max_items)
 				goto out;
 		}
@@ -752,13 +1170,22 @@ radix_tree_gang_lookup(struct radix_tree
 
 	ret = 0;
 	while (ret < max_items) {
-		unsigned int nr_found;
+		unsigned int nr_found, slots_found, i;
 		unsigned long next_index;	/* Index of next search */
 
 		if (cur_index > max_index)
 			break;
-		nr_found = __lookup(node, results + ret, cur_index,
+		slots_found = __lookup(node, (void ***)results + ret, cur_index,
 					max_items - ret, &next_index);
+		nr_found = 0;
+		for (i = 0; i < slots_found; i++) {
+			struct radix_tree_node *slot;
+			slot = *(((void ***)results)[ret + i]);
+			if (!slot)
+				continue;
+			results[ret + nr_found] = rcu_dereference(slot);
+			nr_found++;
+		}
 		ret += nr_found;
 		if (next_index == 0)
 			break;
@@ -769,12 +1196,71 @@ radix_tree_gang_lookup(struct radix_tree
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup);
 
+/**
+ *	radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *
+ *	Performs an index-ascending scan of the tree for present items.  Places
+ *	their slots at *@results and returns the number of items which were
+ *	placed at *@results.
+ *
+ *	The implementation is naive.
+ *
+ *	Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
+ *	be dereferenced with radix_tree_deref_slot, and if using only RCU
+ *	protection, radix_tree_deref_slot may fail requiring a retry.
+ */
+unsigned int
+radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+			unsigned long first_index, unsigned int max_items)
+{
+	unsigned long max_index;
+	struct radix_tree_node *node;
+	unsigned long cur_index = first_index;
+	unsigned int ret;
+
+	node = rcu_dereference(root->rnode);
+	if (!node)
+		return 0;
+
+	if (!radix_tree_is_indirect_ptr(node)) {
+		if (first_index > 0)
+			return 0;
+		results[0] = (void **)&root->rnode;
+		return 1;
+	}
+	node = radix_tree_indirect_to_ptr(node);
+
+	max_index = radix_tree_maxindex(node->height);
+
+	ret = 0;
+	while (ret < max_items) {
+		unsigned int slots_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		slots_found = __lookup(node, results + ret, cur_index,
+					max_items - ret, &next_index);
+		ret += slots_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
+
 /*
  * FIXME: the two tag_get()s here should use find_next_bit() instead of
  * open-coding the search.
  */
 static unsigned int
-__lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
+__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index, unsigned int tag)
 {
 	unsigned int nr_found = 0;
@@ -819,9 +1305,8 @@ __lookup_tag(struct radix_tree_node *slo
 				 * lookup ->slots[x] without a lock (ie. can't
 				 * rely on its value remaining the same).
 				 */
-				if (node) {
-					node = rcu_dereference(node);
-					results[nr_found++] = node;
+				if (slot->slots[j]) {
+					results[nr_found++] = &slot->slots[j];
 					if (nr_found == max_items)
 						goto out;
 				}
@@ -880,13 +1365,22 @@ radix_tree_gang_lookup_tag(struct radix_
 
 	ret = 0;
 	while (ret < max_items) {
-		unsigned int nr_found;
+		unsigned int slots_found, nr_found, i;
 		unsigned long next_index;	/* Index of next search */
 
 		if (cur_index > max_index)
 			break;
-		nr_found = __lookup_tag(node, results + ret, cur_index,
-					max_items - ret, &next_index, tag);
+		slots_found = __lookup_tag(node, (void ***)results + ret,
+				cur_index, max_items - ret, &next_index, tag);
+		nr_found = 0;
+		for (i = 0; i < slots_found; i++) {
+			struct radix_tree_node *slot;
+			slot = *((void ***)results)[ret + i];
+			if (!slot)
+				continue;
+			results[ret + nr_found] = rcu_dereference(slot);
+			nr_found++;
+		}
 		ret += nr_found;
 		if (next_index == 0)
 			break;
@@ -898,6 +1392,67 @@ radix_tree_gang_lookup_tag(struct radix_
 EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
 
 /**
+ *	radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
+ *					  radix tree based on a tag
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ *	Performs an index-ascending scan of the tree for present items which
+ *	have the tag indexed by @tag set.  Places the slots at *@results and
+ *	returns the number of slots which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag)
+{
+	struct radix_tree_node *node;
+	unsigned long max_index;
+	unsigned long cur_index = first_index;
+	unsigned int ret;
+
+	/* check the root's tag bit */
+	if (!root_tag_get(root, tag))
+		return 0;
+
+	node = rcu_dereference(root->rnode);
+	if (!node)
+		return 0;
+
+	if (!radix_tree_is_indirect_ptr(node)) {
+		if (first_index > 0)
+			return 0;
+		results[0] = (void **)&root->rnode;
+		return 1;
+	}
+	node = radix_tree_indirect_to_ptr(node);
+
+	max_index = radix_tree_maxindex(node->height);
+
+	ret = 0;
+	while (ret < max_items) {
+		unsigned int slots_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		slots_found = __lookup_tag(node, results + ret,
+				cur_index, max_items - ret, &next_index, tag);
+		ret += slots_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
+
+
+/**
  *	radix_tree_shrink    -    shrink height of a radix tree to minimal
  *	@root		radix tree root
  */
@@ -907,6 +1462,7 @@ static inline void radix_tree_shrink(str
 	while (root->height > 0) {
 		struct radix_tree_node *to_free = root->rnode;
 		void *newptr;
+		int tag;
 
 		BUG_ON(!radix_tree_is_indirect_ptr(to_free));
 		to_free = radix_tree_indirect_to_ptr(to_free);
@@ -933,14 +1489,35 @@ static inline void radix_tree_shrink(str
 		root->rnode = newptr;
 		root->height--;
 		/* must only free zeroed nodes into the slab */
-		tag_clear(to_free, 0, 0);
-		tag_clear(to_free, 1, 0);
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			tag_clear(to_free, tag, 0);
 		to_free->slots[0] = NULL;
 		to_free->count = 0;
-		radix_tree_node_free(to_free);
 	}
 }
 
+static
+int radix_valid_delete(struct radix_tree_node *node, int offset, int tag)
+{
+	/*
+	 * we need to check for > 2, because nodes with a single child
+	 * can still be deleted, see radix_tree_shrink().
+	 */
+	int unlock = (node->count > 2);
+
+	if (!unlock)
+		return unlock;
+
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		if (!radix_valid_tag_clear(node, offset, tag)) {
+			unlock = 0;
+			break;
+		}
+	}
+
+	return unlock;
+}
+
 /**
  *	radix_tree_delete    -    delete an item from a radix tree
  *	@root:		radix tree root
@@ -957,11 +1534,26 @@ void *radix_tree_delete(struct radix_tre
 	 * since the "list" is null terminated.
 	 */
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
+	struct radix_tree_path *punlock = path, *piter;
 	struct radix_tree_node *slot = NULL;
-	struct radix_tree_node *to_free;
 	unsigned int height, shift;
 	int tag;
 	int offset;
+	RADIX_TREE_CONTEXT(context, root);
+
+	slot = radix_optimistic_lock(context, index, 0, radix_valid_delete);
+	if (slot) {
+		height = slot->height;
+		shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp->offset = offset;
+		pathp->node = slot;
+		radix_path_init(context, pathp);
+		goto optimistic;
+	}
+
+	pathp->node = NULL;
+	radix_path_init(context, pathp);
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
@@ -976,7 +1568,6 @@ void *radix_tree_delete(struct radix_tre
 	slot = radix_tree_indirect_to_ptr(slot);
 
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
 
 	do {
 		if (slot == NULL)
@@ -986,6 +1577,14 @@ void *radix_tree_delete(struct radix_tre
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		pathp->offset = offset;
 		pathp->node = slot;
+		radix_path_lock(context, pathp, slot);
+
+		if (radix_valid_delete(slot, offset, 0)) {
+			for (; punlock < pathp; punlock++)
+				radix_path_unlock(context, punlock);
+		}
+
+optimistic:
 		slot = slot->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
@@ -998,41 +1597,45 @@ void *radix_tree_delete(struct radix_tre
 	 * Clear all tags associated with the just-deleted item
 	 */
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (tag_get(pathp->node, tag, pathp->offset))
-			radix_tree_tag_clear(root, index, tag);
+		for (piter = pathp; piter >= punlock; piter--) {
+			if (piter->node) {
+				if (!tag_get(piter->node, tag, piter->offset))
+					break;
+				tag_clear(piter->node, tag, piter->offset);
+				if (any_tag_set(piter->node, tag))
+					break;
+			} else {
+				if (root_tag_get(root, tag))
+					root_tag_clear(root, tag);
+			}
+		}
 	}
 
-	to_free = NULL;
-	/* Now free the nodes we do not need anymore */
-	while (pathp->node) {
-		pathp->node->slots[pathp->offset] = NULL;
-		pathp->node->count--;
-		/*
-		 * Queue the node for deferred freeing after the
-		 * last reference to it disappears (set NULL, above).
-		 */
-		if (to_free)
-			radix_tree_node_free(to_free);
+	/* Now unhook the nodes we do not need anymore */
+	for (piter = pathp; piter >= punlock && piter->node; piter--) {
+		piter->node->slots[piter->offset] = NULL;
+		piter->node->count--;
 
-		if (pathp->node->count) {
-			if (pathp->node ==
+		if (piter->node->count) {
+			if (piter->node ==
 					radix_tree_indirect_to_ptr(root->rnode))
 				radix_tree_shrink(root);
 			goto out;
 		}
+	}
 
-		/* Node with zero slots in use so free it */
-		to_free = pathp->node;
-		pathp--;
+	BUG_ON(piter->node);
 
-	}
 	root_tag_clear_all(root);
 	root->height = 0;
 	root->rnode = NULL;
-	if (to_free)
-		radix_tree_node_free(to_free);
 
 out:
+	for (; punlock <= pathp; punlock++) {
+		radix_path_unlock(context, punlock);
+		if (punlock->node && punlock->node->count == 0)
+			radix_tree_node_free(punlock->node);
+	}
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_delete);
Index: linux-2.6.25.4-rt4/kernel/notifier.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/notifier.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/notifier.c	2008-05-29 09:46:43.000000000 -0400
@@ -56,7 +56,7 @@ static int notifier_chain_unregister(str
  *	@returns:	notifier_call_chain returns the value returned by the
  *			last notifier function called.
  */
-static int __kprobes notifier_call_chain(struct notifier_block **nl,
+static int __kprobes notrace notifier_call_chain(struct notifier_block **nl,
 					unsigned long val, void *v,
 					int nr_to_call,	int *nr_calls)
 {
@@ -194,7 +194,7 @@ int blocking_notifier_chain_register(str
 	 * not yet working and interrupts must remain disabled.  At
 	 * such times we must not call down_write().
 	 */
-	if (unlikely(system_state == SYSTEM_BOOTING))
+	if (unlikely(system_state < SYSTEM_RUNNING))
 		return notifier_chain_register(&nh->head, n);
 
 	down_write(&nh->rwsem);
Index: linux-2.6.25.4-rt4/block/blk-core.c
===================================================================
--- linux-2.6.25.4-rt4.orig/block/blk-core.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/block/blk-core.c	2008-05-29 09:46:44.000000000 -0400
@@ -211,7 +211,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
  */
 void blk_plug_device(struct request_queue *q)
 {
-	WARN_ON(!irqs_disabled());
+	WARN_ON_NONRT(!irqs_disabled());
 
 	/*
 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
@@ -233,7 +233,7 @@ EXPORT_SYMBOL(blk_plug_device);
  */
 int blk_remove_plug(struct request_queue *q)
 {
-	WARN_ON(!irqs_disabled());
+	WARN_ON_NONRT(!irqs_disabled());
 
 	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
 		return 0;
@@ -331,7 +331,7 @@ EXPORT_SYMBOL(blk_unplug);
  **/
 void blk_start_queue(struct request_queue *q)
 {
-	WARN_ON(!irqs_disabled());
+	WARN_ON_NONRT(!irqs_disabled());
 
 	clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
 
Index: linux-2.6.25.4-rt4/fs/aio.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/aio.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/aio.c	2008-05-29 09:46:44.000000000 -0400
@@ -581,13 +581,15 @@ static void use_mm(struct mm_struct *mm)
 	tsk->flags |= PF_BORROWED_MM;
 	active_mm = tsk->active_mm;
 	atomic_inc(&mm->mm_count);
-	tsk->mm = mm;
-	tsk->active_mm = mm;
+	local_irq_disable(); // FIXME
 	/*
 	 * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise
 	 * it won't work. Update it accordingly if you change it here
 	 */
 	switch_mm(active_mm, mm, tsk);
+	tsk->mm = mm;
+	tsk->active_mm = mm;
+	local_irq_enable();
 	task_unlock(tsk);
 
 	mmdrop(active_mm);
Index: linux-2.6.25.4-rt4/fs/block_dev.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/block_dev.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/block_dev.c	2008-05-29 09:46:55.000000000 -0400
@@ -61,7 +61,7 @@ static sector_t max_block(struct block_d
 /* Kill _all_ buffers and pagecache , dirty or not.. */
 static void kill_bdev(struct block_device *bdev)
 {
-	if (bdev->bd_inode->i_mapping->nrpages == 0)
+	if (mapping_nrpages(bdev->bd_inode->i_mapping) == 0)
 		return;
 	invalidate_bh_lrus();
 	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
@@ -408,7 +408,7 @@ long nr_blockdev_pages(void)
 	long ret = 0;
 	spin_lock(&bdev_lock);
 	list_for_each_entry(bdev, &all_bdevs, bd_list) {
-		ret += bdev->bd_inode->i_mapping->nrpages;
+		ret += mapping_nrpages(bdev->bd_inode->i_mapping);
 	}
 	spin_unlock(&bdev_lock);
 	return ret;
@@ -1029,14 +1029,32 @@ static int __blkdev_get(struct block_dev
 	 * For now, block device ->open() routine must _not_
 	 * examine anything in 'inode' argument except ->i_rdev.
 	 */
-	struct file fake_file = {};
-	struct dentry fake_dentry = {};
-	fake_file.f_mode = mode;
-	fake_file.f_flags = flags;
-	fake_file.f_path.dentry = &fake_dentry;
-	fake_dentry.d_inode = bdev->bd_inode;
+	struct file *fake_file;
+	struct dentry *fake_dentry;
+	int err = -ENOMEM;
 
-	return do_open(bdev, &fake_file, for_part);
+	fake_file = kmalloc(sizeof(*fake_file), GFP_KERNEL);
+	if (!fake_file)
+		goto out;
+	memset(fake_file, 0, sizeof(*fake_file));
+
+	fake_dentry = kmalloc(sizeof(*fake_dentry), GFP_KERNEL);
+	if (!fake_dentry)
+		goto out_free_file;
+	memset(fake_dentry, 0, sizeof(*fake_dentry));
+
+	fake_file->f_mode = mode;
+	fake_file->f_flags = flags;
+	fake_file->f_path.dentry = fake_dentry;
+	fake_dentry->d_inode = bdev->bd_inode;
+
+	err = do_open(bdev, fake_file, for_part);
+
+	kfree(fake_dentry);
+out_free_file:
+	kfree(fake_file);
+out:
+	return err;
 }
 
 int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
Index: linux-2.6.25.4-rt4/fs/dcache.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/dcache.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/dcache.c	2008-05-29 09:46:44.000000000 -0400
@@ -693,8 +693,9 @@ void shrink_dcache_for_umount(struct sup
 {
 	struct dentry *dentry;
 
-	if (down_read_trylock(&sb->s_umount))
-		BUG();
+// -rt: this might succeed there ...
+//	if (down_read_trylock(&sb->s_umount))
+//		BUG();
 
 	dentry = sb->s_root;
 	sb->s_root = NULL;
Index: linux-2.6.25.4-rt4/fs/dnotify.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/dnotify.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/dnotify.c	2008-05-29 09:46:44.000000000 -0400
@@ -173,7 +173,7 @@ void dnotify_parent(struct dentry *dentr
 
 	spin_lock(&dentry->d_lock);
 	parent = dentry->d_parent;
-	if (parent->d_inode->i_dnotify_mask & event) {
+	if (unlikely(parent->d_inode->i_dnotify_mask & event)) {
 		dget(parent);
 		spin_unlock(&dentry->d_lock);
 		__inode_dir_notify(parent->d_inode, event);
Index: linux-2.6.25.4-rt4/fs/exec.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/exec.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/exec.c	2008-05-29 09:46:44.000000000 -0400
@@ -48,6 +48,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/rmap.h>
+#include <linux/delay.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
@@ -730,11 +731,16 @@ static int exec_mmap(struct mm_struct *m
 		}
 	}
 	task_lock(tsk);
+
+	local_irq_disable();
 	active_mm = tsk->active_mm;
+	activate_mm(active_mm, mm);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
-	activate_mm(active_mm, mm);
+	local_irq_enable();
+
 	task_unlock(tsk);
+
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
Index: linux-2.6.25.4-rt4/fs/file.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/file.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/file.c	2008-05-29 09:46:44.000000000 -0400
@@ -98,14 +98,15 @@ void free_fdtable_rcu(struct rcu_head *r
 		kfree(fdt->open_fds);
 		kfree(fdt);
 	} else {
-		fddef = &get_cpu_var(fdtable_defer_list);
+
+		fddef = &per_cpu(fdtable_defer_list, raw_smp_processor_id());
+
 		spin_lock(&fddef->lock);
 		fdt->next = fddef->next;
 		fddef->next = fdt;
 		/* vmallocs are handled from the workqueue context */
 		schedule_work(&fddef->wq);
 		spin_unlock(&fddef->lock);
-		put_cpu_var(fdtable_defer_list);
 	}
 }
 
Index: linux-2.6.25.4-rt4/fs/lockd/svc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/lockd/svc.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/lockd/svc.c	2008-05-29 09:46:44.000000000 -0400
@@ -344,16 +344,12 @@ lockd_down(void)
 	 * Wait for the lockd process to exit, but since we're holding
 	 * the lockd semaphore, we can't wait around forever ...
 	 */
-	clear_thread_flag(TIF_SIGPENDING);
-	interruptible_sleep_on_timeout(&lockd_exit, HZ);
-	if (nlmsvc_pid) {
+	if (wait_event_interruptible_timeout(lockd_exit,
+					     nlmsvc_pid == 0, HZ) <= 0) {
 		printk(KERN_WARNING 
 			"lockd_down: lockd failed to exit, clearing pid\n");
 		nlmsvc_pid = 0;
 	}
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
 out:
 	mutex_unlock(&nlmsvc_mutex);
 }
Index: linux-2.6.25.4-rt4/fs/pipe.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/pipe.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/pipe.c	2008-05-29 09:46:52.000000000 -0400
@@ -385,8 +385,14 @@ redo:
 		wake_up_interruptible_sync(&pipe->wait);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
+	/*
+	 * Hack: we turn off atime updates for -RT kernels.
+	 * Who uses them on pipes anyway?
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	if (ret > 0)
 		file_accessed(filp);
+#endif
 	return ret;
 }
 
@@ -558,8 +564,14 @@ out:
 		wake_up_interruptible_sync(&pipe->wait);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
+	/*
+	 * Hack: we turn off atime updates for -RT kernels.
+	 * Who uses them on pipes anyway?
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	if (ret > 0)
 		file_update_time(filp);
+#endif
 	return ret;
 }
 
@@ -996,12 +1008,17 @@ struct file *create_write_pipe(void)
 	return ERR_PTR(err);
 }
 
-void free_write_pipe(struct file *f)
+void free_write_pipe(struct file *file)
 {
-	free_pipe_info(f->f_dentry->d_inode);
-	dput(f->f_path.dentry);
-	mntput(f->f_path.mnt);
-	put_filp(f);
+	struct dentry *dentry = file->f_path.dentry;
+	struct vfsmount *mnt = file->f_path.mnt;
+
+	free_pipe_info(file->f_dentry->d_inode);
+	file->f_path.dentry = NULL;
+	file->f_path.mnt = NULL;
+	put_filp(file);
+	dput(dentry);
+	mntput(mnt);
 }
 
 struct file *create_read_pipe(struct file *wrf)
Index: linux-2.6.25.4-rt4/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/proc/task_mmu.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/proc/task_mmu.c	2008-05-29 09:46:44.000000000 -0400
@@ -173,8 +173,10 @@ static void *m_start(struct seq_file *m,
 	vma = NULL;
 	if ((unsigned long)l < mm->map_count) {
 		vma = mm->mmap;
-		while (l-- && vma)
+		while (l-- && vma) {
 			vma = vma->vm_next;
+			cond_resched();
+		}
 		goto out;
 	}
 
Index: linux-2.6.25.4-rt4/fs/xfs/linux-2.6/mrlock.h
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/xfs/linux-2.6/mrlock.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/xfs/linux-2.6/mrlock.h	2008-05-29 09:46:44.000000000 -0400
@@ -23,8 +23,8 @@
 enum { MR_NONE, MR_ACCESS, MR_UPDATE };
 
 typedef struct {
-	struct rw_semaphore	mr_lock;
-	int			mr_writer;
+	struct compat_rw_semaphore	mr_lock;
+	int				mr_writer;
 } mrlock_t;
 
 #define mrinit(mrp, name)	\
Index: linux-2.6.25.4-rt4/fs/xfs/xfs_mount.h
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/xfs/xfs_mount.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/xfs/xfs_mount.h	2008-05-29 09:46:44.000000000 -0400
@@ -282,7 +282,7 @@ typedef struct xfs_mount {
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
 	uint			m_in_maxlevels;	/* XFS_IN_MAXLEVELS */
 	struct xfs_perag	*m_perag;	/* per-ag accounting info */
-	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
+	struct compat_rw_semaphore m_peraglock;	/* lock for m_perag (pointer) */
 	struct mutex		m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
Index: linux-2.6.25.4-rt4/include/linux/genhd.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/genhd.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/genhd.h	2008-05-29 09:47:21.000000000 -0400
@@ -164,8 +164,13 @@ static inline struct hd_struct *get_part
 }
 
 #ifdef	CONFIG_SMP
-#define __disk_stat_add(gendiskp, field, addnd) 	\
-	(per_cpu_ptr(gendiskp->dkstats, smp_processor_id())->field += addnd)
+#define __disk_stat_add(gendiskp, field, addnd)			\
+do {								\
+	preempt_disable();					\
+	(per_cpu_ptr(gendiskp->dkstats,				\
+			smp_processor_id())->field += addnd);	\
+	preempt_enable();					\
+} while (0)
 
 #define disk_stat_read(gendiskp, field)					\
 ({									\
@@ -184,7 +189,11 @@ static inline void disk_stat_set_all(str
 }		
 
 #define __part_stat_add(part, field, addnd)				\
-	(per_cpu_ptr(part->dkstats, smp_processor_id())->field += addnd)
+do {									\
+	preempt_disable();						\
+	(per_cpu_ptr(part->dkstats, smp_processor_id())->field += addnd); \
+	preempt_enable();						\
+} while (0)
 
 #define __all_stat_add(gendiskp, field, addnd, sector)		\
 ({								\
Index: linux-2.6.25.4-rt4/drivers/acpi/ec.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/acpi/ec.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/acpi/ec.c	2008-05-29 09:46:44.000000000 -0400
@@ -531,7 +531,19 @@ static u32 acpi_ec_gpe_handler(void *dat
 	pr_debug(PREFIX "~~~> interrupt\n");
 	clear_bit(EC_FLAGS_WAIT_GPE, &ec->flags);
 	if (test_bit(EC_FLAGS_GPE_MODE, &ec->flags))
+#if 0
 		wake_up(&ec->wait);
+#else
+		// hack ...
+		if (waitqueue_active(&ec->wait)) {
+			struct task_struct *task;
+
+			task = list_entry(ec->wait.task_list.next,
+					  wait_queue_t, task_list)->private;
+			if (task)
+				wake_up_process(task);
+		}
+#endif
 
 	if (acpi_ec_read_status(ec) & ACPI_EC_FLAG_SCI) {
 		if (!test_and_set_bit(EC_FLAGS_QUERY_PENDING, &ec->flags))
Index: linux-2.6.25.4-rt4/drivers/acpi/hardware/hwregs.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/acpi/hardware/hwregs.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/acpi/hardware/hwregs.c	2008-05-29 09:46:44.000000000 -0400
@@ -73,7 +73,7 @@ acpi_status acpi_hw_clear_acpi_status(vo
 			  ACPI_BITMASK_ALL_FIXED_STATUS,
 			  (u16) acpi_gbl_FADT.xpm1a_event_block.address));
 
-	lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
+	spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
 
 	status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
 					ACPI_BITMASK_ALL_FIXED_STATUS);
@@ -97,7 +97,7 @@ acpi_status acpi_hw_clear_acpi_status(vo
 	status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block);
 
       unlock_and_exit:
-	acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
+	spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
 	return_ACPI_STATUS(status);
 }
 
@@ -300,9 +300,9 @@ acpi_status acpi_get_register(u32 regist
 {
 	acpi_status status;
 	acpi_cpu_flags flags;
-	flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
+	spin_lock_irqsave(acpi_gbl_hardware_lock, flags);
 	status = acpi_get_register_unlocked(register_id, return_value);
-	acpi_os_release_lock(acpi_gbl_hardware_lock, flags);
+	spin_unlock_irqrestore(acpi_gbl_hardware_lock, flags);
 	return status;
 }
 
@@ -339,7 +339,7 @@ acpi_status acpi_set_register(u32 regist
 		return_ACPI_STATUS(AE_BAD_PARAMETER);
 	}
 
-	lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
+	spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
 
 	/* Always do a register read first so we can insert the new bits  */
 
@@ -443,7 +443,7 @@ acpi_status acpi_set_register(u32 regist
 
       unlock_and_exit:
 
-	acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
+	spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
 
 	/* Normalize the value that was read */
 
Index: linux-2.6.25.4-rt4/drivers/acpi/utilities/utmutex.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/acpi/utilities/utmutex.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/acpi/utilities/utmutex.c	2008-05-29 09:46:44.000000000 -0400
@@ -116,7 +116,7 @@ void acpi_ut_mutex_terminate(void)
 	/* Delete the spinlocks */
 
 	acpi_os_delete_lock(acpi_gbl_gpe_lock);
-	acpi_os_delete_lock(acpi_gbl_hardware_lock);
+//	acpi_os_delete_lock(acpi_gbl_hardware_lock);
 	return_VOID;
 }
 
Index: linux-2.6.25.4-rt4/include/acpi/acglobal.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/acpi/acglobal.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/acpi/acglobal.h	2008-05-29 09:46:44.000000000 -0400
@@ -180,7 +180,12 @@ ACPI_EXTERN acpi_semaphore acpi_gbl_glob
  * interrupt level
  */
 ACPI_EXTERN spinlock_t _acpi_gbl_gpe_lock;	/* For GPE data structs and registers */
-ACPI_EXTERN spinlock_t _acpi_gbl_hardware_lock;	/* For ACPI H/W except GPE registers */
+
+/*
+ * Need to be raw because it might be used in acpi_processor_idle():
+ */
+ACPI_EXTERN raw_spinlock_t _acpi_gbl_hardware_lock;	/* For ACPI H/W except GPE registers */
+
 #define acpi_gbl_gpe_lock	&_acpi_gbl_gpe_lock
 #define acpi_gbl_hardware_lock	&_acpi_gbl_hardware_lock
 
Index: linux-2.6.25.4-rt4/include/acpi/acpiosxf.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/acpi/acpiosxf.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/acpi/acpiosxf.h	2008-05-29 09:46:44.000000000 -0400
@@ -61,7 +61,7 @@ typedef enum {
 	OSL_EC_BURST_HANDLER
 } acpi_execute_type;
 
-#define ACPI_NO_UNIT_LIMIT          ((u32) -1)
+#define ACPI_NO_UNIT_LIMIT          (INT_MAX/2)
 #define ACPI_MUTEX_SEM              1
 
 /* Functions for acpi_os_signal */
Index: linux-2.6.25.4-rt4/ipc/mqueue.c
===================================================================
--- linux-2.6.25.4-rt4.orig/ipc/mqueue.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/ipc/mqueue.c	2008-05-29 09:46:45.000000000 -0400
@@ -778,12 +778,17 @@ static inline void pipelined_send(struct
 				  struct msg_msg *message,
 				  struct ext_wait_queue *receiver)
 {
+	/*
+	 * Keep them in one critical section for PREEMPT_RT:
+	 */
+	preempt_disable();
 	receiver->msg = message;
 	list_del(&receiver->list);
 	receiver->state = STATE_PENDING;
 	wake_up_process(receiver->task);
 	smp_wmb();
 	receiver->state = STATE_READY;
+	preempt_enable();
 }
 
 /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
Index: linux-2.6.25.4-rt4/ipc/msg.c
===================================================================
--- linux-2.6.25.4-rt4.orig/ipc/msg.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/ipc/msg.c	2008-05-29 09:46:45.000000000 -0400
@@ -238,12 +238,19 @@ static void expunge_all(struct msg_queue
 	while (tmp != &msq->q_receivers) {
 		struct msg_receiver *msr;
 
+		/*
+		 * Make sure that the wakeup doesnt preempt
+		 * this CPU prematurely. (on PREEMPT_RT)
+		 */
+		preempt_disable();
+
 		msr = list_entry(tmp, struct msg_receiver, r_list);
 		tmp = tmp->next;
 		msr->r_msg = NULL;
-		wake_up_process(msr->r_tsk);
-		smp_mb();
+		wake_up_process(msr->r_tsk); /* serializes */
 		msr->r_msg = ERR_PTR(res);
+
+		preempt_enable();
 	}
 }
 
@@ -615,22 +622,28 @@ static inline int pipelined_send(struct 
 		    !security_msg_queue_msgrcv(msq, msg, msr->r_tsk,
 					       msr->r_msgtype, msr->r_mode)) {
 
+			/*
+			 * Make sure that the wakeup doesnt preempt
+			 * this CPU prematurely. (on PREEMPT_RT)
+			 */
+			preempt_disable();
+
 			list_del(&msr->r_list);
 			if (msr->r_maxsize < msg->m_ts) {
 				msr->r_msg = NULL;
-				wake_up_process(msr->r_tsk);
-				smp_mb();
+				wake_up_process(msr->r_tsk); /* serializes */
 				msr->r_msg = ERR_PTR(-E2BIG);
 			} else {
 				msr->r_msg = NULL;
 				msq->q_lrpid = task_pid_vnr(msr->r_tsk);
 				msq->q_rtime = get_seconds();
-				wake_up_process(msr->r_tsk);
-				smp_mb();
+				wake_up_process(msr->r_tsk); /* serializes */
 				msr->r_msg = msg;
+				preempt_enable();
 
 				return 1;
 			}
+			preempt_enable();
 		}
 	}
 	return 0;
Index: linux-2.6.25.4-rt4/ipc/sem.c
===================================================================
--- linux-2.6.25.4-rt4.orig/ipc/sem.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/ipc/sem.c	2008-05-29 09:46:45.000000000 -0400
@@ -445,6 +445,11 @@ static void update_queue (struct sem_arr
 		if (error <= 0) {
 			struct sem_queue *n;
 			remove_from_queue(sma,q);
+			/*
+			 * make sure that the wakeup doesnt preempt
+			 * _this_ cpu prematurely. (on preempt_rt)
+			 */
+			preempt_disable();
 			q->status = IN_WAKEUP;
 			/*
 			 * Continue scanning. The next operation
@@ -467,6 +472,7 @@ static void update_queue (struct sem_arr
 			 */
 			smp_wmb();
 			q->status = error;
+			preempt_enable();
 			q = n;
 		} else {
 			q = q->next;
Index: linux-2.6.25.4-rt4/sound/core/pcm_lib.c
===================================================================
--- linux-2.6.25.4-rt4.orig/sound/core/pcm_lib.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/sound/core/pcm_lib.c	2008-05-29 09:47:31.000000000 -0400
@@ -29,6 +29,7 @@
 #include <sound/pcm_params.h>
 #include <sound/timer.h>
 
+#include <linux/ftrace.h>
 /*
  * fill ring buffer with silence
  * runtime->silence_start: starting pointer to silence area
Index: linux-2.6.25.4-rt4/include/linux/pagevec.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/pagevec.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/pagevec.h	2008-05-29 09:46:45.000000000 -0400
@@ -9,7 +9,7 @@
 #define _LINUX_PAGEVEC_H
 
 /* 14 pointers + two long's align the pagevec structure to a power of two */
-#define PAGEVEC_SIZE	14
+#define PAGEVEC_SIZE	8
 
 struct page;
 struct address_space;
Index: linux-2.6.25.4-rt4/include/linux/vmstat.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/vmstat.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/vmstat.h	2008-05-29 09:46:45.000000000 -0400
@@ -59,7 +59,12 @@ DECLARE_PER_CPU(struct vm_event_state, v
 
 static inline void __count_vm_event(enum vm_event_item item)
 {
+#ifdef CONFIG_PREEMPT_RT
+	get_cpu_var(vm_event_states).event[item]++;
+	put_cpu();
+#else
 	__get_cpu_var(vm_event_states).event[item]++;
+#endif
 }
 
 static inline void count_vm_event(enum vm_event_item item)
@@ -70,7 +75,12 @@ static inline void count_vm_event(enum v
 
 static inline void __count_vm_events(enum vm_event_item item, long delta)
 {
+#ifdef CONFIG_PREEMPT_RT
+	get_cpu_var(vm_event_states).event[item] += delta;
+	put_cpu();
+#else
 	__get_cpu_var(vm_event_states).event[item] += delta;
+#endif
 }
 
 static inline void count_vm_events(enum vm_event_item item, long delta)
Index: linux-2.6.25.4-rt4/mm/bounce.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/bounce.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/bounce.c	2008-05-29 09:46:45.000000000 -0400
@@ -48,11 +48,11 @@ static void bounce_copy_vec(struct bio_v
 	unsigned long flags;
 	unsigned char *vto;
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
 	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
 	kunmap_atomic(vto, KM_BOUNCE_READ);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 
 #else /* CONFIG_HIGHMEM */
Index: linux-2.6.25.4-rt4/mm/mmap.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/mmap.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/mmap.c	2008-05-29 09:46:45.000000000 -0400
@@ -1915,10 +1915,16 @@ asmlinkage long sys_munmap(unsigned long
 static inline void verify_mm_writelocked(struct mm_struct *mm)
 {
 #ifdef CONFIG_DEBUG_VM
-	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
+# ifdef CONFIG_PREEMPT_RT
+	if (unlikely(!rt_rwsem_is_locked(&mm->mmap_sem))) {
 		WARN_ON(1);
-		up_read(&mm->mmap_sem);
 	}
+# else
+        if (unlikely(down_read_trylock(&mm->mmap_sem))) {
+		WARN_ON(1);
+		up_read(&mm->mmap_sem);
+        }
+# endif
 #endif
 }
 
Index: linux-2.6.25.4-rt4/mm/vmscan.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/vmscan.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/vmscan.c	2008-05-29 09:46:56.000000000 -0400
@@ -23,6 +23,7 @@
 #include <linux/file.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/interrupt.h>
 #include <linux/buffer_head.h>	/* for try_to_release_page(),
 					buffer_heads_over_limit */
 #include <linux/mm_inline.h>
@@ -400,7 +401,7 @@ int remove_mapping(struct address_space 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	lock_page_ref_irq(page);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -435,19 +436,19 @@ int remove_mapping(struct address_space 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
 		__delete_from_swap_cache(page);
-		write_unlock_irq(&mapping->tree_lock);
 		swap_free(swap);
-		__put_page(page);	/* The pagecache ref */
-		return 1;
+		goto free_it;
 	}
 
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
-	__put_page(page);
+
+free_it:
+	unlock_page_ref_irq(page);
+	__put_page(page); /* The pagecache ref */
 	return 1;
 
 cannot_free:
-	write_unlock_irq(&mapping->tree_lock);
+	unlock_page_ref_irq(page);
 	return 0;
 }
 
@@ -872,7 +873,7 @@ static unsigned long shrink_inactive_lis
 		}
 
 		nr_reclaimed += nr_freed;
-		local_irq_disable();
+		local_irq_disable_nort();
 		if (current_is_kswapd()) {
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
 			__count_vm_events(KSWAPD_STEAL, nr_freed);
@@ -904,9 +905,14 @@ static unsigned long shrink_inactive_lis
 			}
 		}
   	} while (nr_scanned < max_scan);
+	/*
+	 * Non-PREEMPT_RT relies on IRQs-off protecting the page_states
+	 * per-CPU data. PREEMPT_RT has that data protected even in
+	 * __mod_page_state(), so no need to keep IRQs disabled.
+	 */
 	spin_unlock(&zone->lru_lock);
 done:
-	local_irq_enable();
+	local_irq_enable_nort();
 	pagevec_release(&pvec);
 	return nr_reclaimed;
 }
Index: linux-2.6.25.4-rt4/mm/vmstat.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/vmstat.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/vmstat.c	2008-05-29 09:46:45.000000000 -0400
@@ -150,10 +150,14 @@ static void refresh_zone_stat_thresholds
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 				int delta)
 {
-	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
-	s8 *p = pcp->vm_stat_diff + item;
+	struct per_cpu_pageset *pcp;
+	int cpu;
 	long x;
+	s8 *p;
 
+	cpu = get_cpu();
+	pcp = zone_pcp(zone, cpu);
+	p = pcp->vm_stat_diff + item;
 	x = delta + *p;
 
 	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
@@ -161,6 +165,7 @@ void __mod_zone_page_state(struct zone *
 		x = 0;
 	}
 	*p = x;
+	put_cpu();
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
@@ -203,9 +208,13 @@ EXPORT_SYMBOL(mod_zone_page_state);
  */
 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
-	s8 *p = pcp->vm_stat_diff + item;
+	struct per_cpu_pageset *pcp;
+	int cpu;
+	s8 *p;
 
+	cpu = get_cpu();
+	pcp = zone_pcp(zone, cpu);
+	p = pcp->vm_stat_diff + item;
 	(*p)++;
 
 	if (unlikely(*p > pcp->stat_threshold)) {
@@ -214,18 +223,34 @@ void __inc_zone_state(struct zone *zone,
 		zone_page_state_add(*p + overstep, zone, item);
 		*p = -overstep;
 	}
+	put_cpu();
 }
 
 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
+#ifdef CONFIG_PREEMPT_RT
+	unsigned long flags;
+	struct zone *zone;
+
+	zone = page_zone(page);
+	local_irq_save(flags);
+	__inc_zone_state(zone, item);
+	local_irq_restore(flags);
+#else
 	__inc_zone_state(page_zone(page), item);
+#endif
 }
 EXPORT_SYMBOL(__inc_zone_page_state);
 
 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
-	s8 *p = pcp->vm_stat_diff + item;
+	struct per_cpu_pageset *pcp;
+	int cpu;
+	s8 *p;
+
+	cpu = get_cpu();
+	pcp = zone_pcp(zone, cpu);
+	p = pcp->vm_stat_diff + item;
 
 	(*p)--;
 
@@ -235,6 +260,7 @@ void __dec_zone_state(struct zone *zone,
 		zone_page_state_add(*p - overstep, zone, item);
 		*p = overstep;
 	}
+	put_cpu();
 }
 
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
Index: linux-2.6.25.4-rt4/drivers/block/paride/pseudo.h
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/block/paride/pseudo.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/block/paride/pseudo.h	2008-05-29 09:46:45.000000000 -0400
@@ -43,7 +43,7 @@ static unsigned long ps_timeout;
 static int ps_tq_active = 0;
 static int ps_nice = 0;
 
-static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused)));
+static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock);
 
 static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int);
 
Index: linux-2.6.25.4-rt4/drivers/video/console/fbcon.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/video/console/fbcon.c	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/video/console/fbcon.c	2008-05-29 09:46:46.000000000 -0400
@@ -1303,7 +1303,6 @@ static void fbcon_clear(struct vc_data *
 {
 	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
 	struct fbcon_ops *ops = info->fbcon_par;
-
 	struct display *p = &fb_display[vc->vc_num];
 	u_int y_break;
 
@@ -1332,10 +1331,11 @@ static void fbcon_putcs(struct vc_data *
 	struct display *p = &fb_display[vc->vc_num];
 	struct fbcon_ops *ops = info->fbcon_par;
 
-	if (!fbcon_is_inactive(vc, info))
+	if (!fbcon_is_inactive(vc, info)) {
 		ops->putcs(vc, info, s, count, real_y(p, ypos), xpos,
 			   get_color(vc, info, scr_readw(s), 1),
 			   get_color(vc, info, scr_readw(s), 0));
+	}
 }
 
 static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos)
@@ -3319,6 +3319,7 @@ static const struct consw fb_con = {
 	.con_screen_pos 	= fbcon_screen_pos,
 	.con_getxy 		= fbcon_getxy,
 	.con_resize             = fbcon_resize,
+	.con_preemptible 	= 1,
 };
 
 static struct notifier_block fbcon_event_notifier = {
Index: linux-2.6.25.4-rt4/include/linux/console.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/console.h	2008-05-29 09:45:44.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/console.h	2008-05-29 09:47:18.000000000 -0400
@@ -55,6 +55,7 @@ struct consw {
 	void	(*con_invert_region)(struct vc_data *, u16 *, int);
 	u16    *(*con_screen_pos)(struct vc_data *, int);
 	unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *);
+	int	con_preemptible; // can it reschedule from within printk?
 };
 
 extern const struct consw *conswitchp;
@@ -91,6 +92,17 @@ void give_up_console(const struct consw 
 #define CON_ENABLED	(4)
 #define CON_BOOT	(8)
 #define CON_ANYTIME	(16) /* Safe to call when cpu is offline */
+#define CON_ATOMIC	(32) /* Safe to call in PREEMPT_RT atomic */
+
+#ifdef CONFIG_PREEMPT_RT
+# define console_atomic_safe(con)		\
+	(((con)->flags & CON_ATOMIC) ||		\
+	 (!in_atomic() && !irqs_disabled()) ||	\
+	 (system_state != SYSTEM_RUNNING) ||	\
+	 oops_in_progress)
+#else
+# define console_atomic_safe(con) (1)
+#endif
 
 struct console {
 	char	name[16];
Index: linux-2.6.25.4-rt4/drivers/char/sysrq.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/char/sysrq.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/char/sysrq.c	2008-05-29 09:46:51.000000000 -0400
@@ -209,6 +209,22 @@ static struct sysrq_key_op sysrq_showreg
 	.enable_mask	= SYSRQ_ENABLE_DUMP,
 };
 
+#if defined(__i386__) || defined(__x86_64__)
+
+static void sysrq_handle_showallregs(int key, struct tty_struct *tty)
+{
+	nmi_show_all_regs();
+}
+
+static struct sysrq_key_op sysrq_showallregs_op = {
+	.handler	= sysrq_handle_showallregs,
+	.help_msg	= "showalLcpupc",
+	.action_msg	= "Show Regs On All CPUs",
+};
+#else
+#define sysrq_showallregs_op (*(struct sysrq_key_op *)0)
+#endif
+
 static void sysrq_handle_showstate(int key, struct tty_struct *tty)
 {
 	show_state();
@@ -341,7 +357,7 @@ static struct sysrq_key_op *sysrq_key_ta
 	&sysrq_kill_op,			/* i */
 	NULL,				/* j */
 	&sysrq_SAK_op,			/* k */
-	NULL,				/* l */
+	&sysrq_showallregs_op,		/* l */
 	&sysrq_showmem_op,		/* m */
 	&sysrq_unrt_op,			/* n */
 	/* o: This will often be registered as 'Off' at init time */
Index: linux-2.6.25.4-rt4/kernel/panic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/panic.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/panic.c	2008-05-29 09:47:00.000000000 -0400
@@ -28,7 +28,38 @@ static int pause_on_oops;
 static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
 
-int panic_timeout;
+/*
+ * Debugging helper: freeze all console output after printing the
+ * first oops's head (or tail):
+ */
+static int pause_on_oops_head_flag = 0;
+static int pause_on_oops_tail_flag = 0;
+
+static void pause_on_oops_loop(int flag)
+{
+	switch (flag) {
+	default:
+		break;
+	case 1:
+		for (;;)
+			local_irq_disable();
+	case 2:
+		for (;;)
+			local_irq_enable();
+	}
+}
+
+void pause_on_oops_head(void)
+{
+	pause_on_oops_loop(pause_on_oops_head_flag);
+}
+
+void pause_on_oops_tail(void)
+{
+	pause_on_oops_loop(pause_on_oops_tail_flag);
+}
+
+int panic_timeout __read_mostly;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
@@ -80,6 +111,7 @@ NORET_TYPE void panic(const char * fmt, 
 	vsnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+	dump_stack();
 	bust_spinlocks(0);
 
 	/*
@@ -95,7 +127,7 @@ NORET_TYPE void panic(const char * fmt, 
 	 * unfortunately means it may not be hardened to work in a panic
 	 * situation.
 	 */
-	smp_send_stop();
+//	smp_send_stop();
 #endif
 
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
@@ -191,6 +223,22 @@ static int __init pause_on_oops_setup(ch
 }
 __setup("pause_on_oops=", pause_on_oops_setup);
 
+static int __init pause_on_oops_head_setup(char *str)
+{
+	pause_on_oops_head_flag = simple_strtoul(str, NULL, 0);
+	printk(KERN_INFO "pause_on_oops_head: %d\n", pause_on_oops_head_flag);
+	return 1;
+}
+__setup("pause_on_oops_head=", pause_on_oops_head_setup);
+
+static int __init pause_on_oops_tail_setup(char *str)
+{
+	pause_on_oops_tail_flag = simple_strtoul(str, NULL, 0);
+	printk(KERN_INFO "pause_on_oops_tail: %d\n", pause_on_oops_tail_flag);
+	return 1;
+}
+__setup("pause_on_oops_tail=", pause_on_oops_tail_setup);
+
 static void spin_msec(int msecs)
 {
 	int i;
Index: linux-2.6.25.4-rt4/drivers/ide/ide-floppy.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/ide/ide-floppy.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/ide/ide-floppy.c	2008-05-29 09:46:46.000000000 -0400
@@ -1245,9 +1245,9 @@ static int idefloppy_get_format_progress
 		unsigned long flags;
 		u8 stat;
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		stat = ide_read_status(drive);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 
 		progress_indication = ((stat & SEEK_STAT) == 0) ? 0 : 0x10000;
 	}
Index: linux-2.6.25.4-rt4/drivers/ide/ide-io.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/ide/ide-io.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/ide/ide-io.c	2008-05-29 09:46:46.000000000 -0400
@@ -1079,7 +1079,7 @@ static void ide_do_request (ide_hwgroup_
 	ide_get_lock(ide_intr, hwgroup);
 
 	/* caller must own ide_lock */
-	BUG_ON(!irqs_disabled());
+	BUG_ON_NONRT(!irqs_disabled());
 
 	while (!hwgroup->busy) {
 		hwgroup->busy = 1;
@@ -1345,7 +1345,7 @@ void ide_timer_expiry (unsigned long dat
 			disable_irq(hwif->irq);
 			/* local CPU only,
 			 * as if we were handling an interrupt */
-			local_irq_disable();
+			local_irq_disable_nort();
 			if (hwgroup->polling) {
 				startstop = handler(drive);
 			} else if (drive_is_ready(drive)) {
Index: linux-2.6.25.4-rt4/drivers/ide/ide-iops.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/ide/ide-iops.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/ide/ide-iops.c	2008-05-29 09:46:46.000000000 -0400
@@ -194,10 +194,10 @@ static void ata_input_data(ide_drive_t *
 	if (io_32bit) {
 		if (io_32bit & 2) {
 			unsigned long flags;
-			local_irq_save(flags);
+			local_irq_save_nort(flags);
 			ata_vlb_sync(drive, IDE_NSECTOR_REG);
 			hwif->INSL(IDE_DATA_REG, buffer, wcount);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 		} else
 			hwif->INSL(IDE_DATA_REG, buffer, wcount);
 	} else {
@@ -216,10 +216,10 @@ static void ata_output_data(ide_drive_t 
 	if (io_32bit) {
 		if (io_32bit & 2) {
 			unsigned long flags;
-			local_irq_save(flags);
+			local_irq_save_nort(flags);
 			ata_vlb_sync(drive, IDE_NSECTOR_REG);
 			hwif->OUTSL(IDE_DATA_REG, buffer, wcount);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 		} else
 			hwif->OUTSL(IDE_DATA_REG, buffer, wcount);
 	} else {
@@ -479,12 +479,12 @@ static int __ide_wait_stat(ide_drive_t *
 				if (!(stat & BUSY_STAT))
 					break;
 
-				local_irq_restore(flags);
+				local_irq_restore_nort(flags);
 				*rstat = stat;
 				return -EBUSY;
 			}
 		}
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 	/*
 	 * Allow status to settle, then read it again.
@@ -651,17 +651,17 @@ int ide_driveid_update(ide_drive_t *driv
 		printk("%s: CHECK for good STATUS\n", drive->name);
 		return 0;
 	}
-	local_irq_save(flags);
-	SELECT_MASK(drive, 0);
 	id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC);
+	local_irq_save_nort(flags);
+	SELECT_MASK(drive, 0);
 	if (!id) {
 		local_irq_restore(flags);
 		return 0;
 	}
 	hwif->ata_input_data(drive, id, SECTOR_WORDS);
 	(void)ide_read_status(drive);	/* clear drive IRQ */
-	local_irq_enable();
-	local_irq_restore(flags);
+	local_irq_enable_nort();
+	local_irq_restore_nort(flags);
 	ide_fix_driveid(id);
 	if (id) {
 		drive->id->dma_ultra = id->dma_ultra;
Index: linux-2.6.25.4-rt4/drivers/ide/ide-lib.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/ide/ide-lib.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/ide/ide-lib.c	2008-05-29 09:46:46.000000000 -0400
@@ -436,14 +436,15 @@ int ide_set_xfer_rate(ide_drive_t *drive
 
 static void ide_dump_opcode(ide_drive_t *drive)
 {
+	unsigned long flags;
 	struct request *rq;
 	ide_task_t *task = NULL;
 
-	spin_lock(&ide_lock);
+	spin_lock_irqsave(&ide_lock, flags);
 	rq = NULL;
 	if (HWGROUP(drive))
 		rq = HWGROUP(drive)->rq;
-	spin_unlock(&ide_lock);
+	spin_unlock_irqrestore(&ide_lock, flags);
 	if (!rq)
 		return;
 
@@ -541,10 +542,8 @@ static void ide_dump_atapi_error(ide_dri
 
 u8 ide_dump_status(ide_drive_t *drive, const char *msg, u8 stat)
 {
-	unsigned long flags;
 	u8 err = 0;
 
-	local_irq_save(flags);
 	printk("%s: %s: status=0x%02x { ", drive->name, msg, stat);
 	if (stat & BUSY_STAT)
 		printk("Busy ");
@@ -567,7 +566,6 @@ u8 ide_dump_status(ide_drive_t *drive, c
 			ide_dump_atapi_error(drive, err);
 	}
 	ide_dump_opcode(drive);
-	local_irq_restore(flags);
 	return err;
 }
 
Index: linux-2.6.25.4-rt4/drivers/ide/ide-probe.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/ide/ide-probe.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/ide/ide-probe.c	2008-05-29 09:46:46.000000000 -0400
@@ -127,7 +127,7 @@ static inline void do_identify (ide_driv
 	hwif->ata_input_data(drive, id, SECTOR_WORDS);
 
 	drive->id_read = 1;
-	local_irq_enable();
+	local_irq_enable_nort();
 #ifdef DEBUG
 	printk(KERN_INFO "%s: dumping identify data\n", drive->name);
 	ide_dump_identify((u8 *)id);
@@ -315,14 +315,14 @@ static int actual_try_to_identify (ide_d
 		unsigned long flags;
 
 		/* local CPU only; some systems need this */
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		/* drive returned ID */
 		do_identify(drive, cmd);
 		/* drive responded with ID */
 		rc = 0;
 		/* clear drive IRQ */
 		(void)ide_read_status(drive);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	} else {
 		/* drive refused ID */
 		rc = 2;
@@ -791,7 +791,7 @@ static int ide_probe_port(ide_hwif_t *hw
 		hwif->OUTB(8, hwif->io_ports[IDE_CONTROL_OFFSET]);
 		(void)ide_busy_sleep(hwif);
 	}
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 	/*
 	 * Use cached IRQ number. It might be (and is...) changed by probe
 	 * code above
Index: linux-2.6.25.4-rt4/drivers/ide/ide-taskfile.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/ide/ide-taskfile.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/ide/ide-taskfile.c	2008-05-29 09:46:46.000000000 -0400
@@ -306,7 +306,7 @@ static void ide_pio_sector(ide_drive_t *
 	offset %= PAGE_SIZE;
 
 #ifdef CONFIG_HIGHMEM
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 #endif
 	buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset;
 
@@ -326,7 +326,7 @@ static void ide_pio_sector(ide_drive_t *
 
 	kunmap_atomic(buf, KM_BIO_SRC_IRQ);
 #ifdef CONFIG_HIGHMEM
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 #endif
 }
 
@@ -517,7 +517,7 @@ static ide_startstop_t pre_task_out_intr
 	}
 
 	if (!drive->unmask)
-		local_irq_disable();
+		local_irq_disable_nort();
 
 	ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL);
 	ide_pio_datablock(drive, rq, 1);
Index: linux-2.6.25.4-rt4/drivers/ide/pci/alim15x3.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/ide/pci/alim15x3.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/ide/pci/alim15x3.c	2008-05-29 09:46:46.000000000 -0400
@@ -319,7 +319,7 @@ static void ali_set_pio_mode(ide_drive_t
 		if (r_clc >= 16)
 			r_clc = 0;
 	}
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	
 	/* 
 	 * PIO mode => ATA FIFO on, ATAPI FIFO off
@@ -341,7 +341,7 @@ static void ali_set_pio_mode(ide_drive_t
 	
 	pci_write_config_byte(dev, port, s_clc);
 	pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 
 	/*
 	 * setup   active  rec
@@ -473,7 +473,7 @@ static unsigned int __devinit init_chips
 	}
 #endif  /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_IDE_PROC_FS) */
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 
 	if (m5229_revision < 0xC2) {
 		/*
@@ -564,7 +564,7 @@ out:
 	}
 	pci_dev_put(north);
 	pci_dev_put(isa_dev);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 	return 0;
 }
 
@@ -626,7 +626,7 @@ static u8 __devinit ata66_ali15x3(ide_hw
 	unsigned long flags;
 	u8 cbl = ATA_CBL_PATA40, tmpbyte;
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 
 	if (m5229_revision >= 0xC2) {
 		/*
@@ -647,7 +647,7 @@ static u8 __devinit ata66_ali15x3(ide_hw
 		}
 	}
 
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 
 	return cbl;
 }
Index: linux-2.6.25.4-rt4/drivers/ide/pci/hpt366.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/ide/pci/hpt366.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/ide/pci/hpt366.c	2008-05-29 09:46:46.000000000 -0400
@@ -1406,7 +1406,7 @@ static void __devinit init_dma_hpt366(id
 
 	dma_old = inb(dmabase + 2);
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 
 	dma_new = dma_old;
 	pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
@@ -1417,7 +1417,7 @@ static void __devinit init_dma_hpt366(id
 	if (dma_new != dma_old)
 		outb(dma_new, dmabase + 2);
 
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 
 	ide_setup_dma(hwif, dmabase);
 }
Index: linux-2.6.25.4-rt4/drivers/input/gameport/gameport.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/input/gameport/gameport.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/input/gameport/gameport.c	2008-05-29 09:46:46.000000000 -0400
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/interrupt.h>
 #include <linux/sched.h>	/* HZ */
 #include <linux/mutex.h>
 #include <linux/freezer.h>
@@ -99,12 +100,12 @@ static int gameport_measure_speed(struct
 	tx = 1 << 30;
 
 	for(i = 0; i < 50; i++) {
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		GET_TIME(t1);
 		for (t = 0; t < 50; t++) gameport_read(gameport);
 		GET_TIME(t2);
 		GET_TIME(t3);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		udelay(i * 10);
 		if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
 	}
@@ -123,11 +124,11 @@ static int gameport_measure_speed(struct
 	tx = 1 << 30;
 
 	for(i = 0; i < 50; i++) {
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		rdtscl(t1);
 		for (t = 0; t < 50; t++) gameport_read(gameport);
 		rdtscl(t2);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		udelay(i * 10);
 		if (t2 - t1 < tx) tx = t2 - t1;
 	}
Index: linux-2.6.25.4-rt4/drivers/net/tulip/tulip_core.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/tulip/tulip_core.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/tulip/tulip_core.c	2008-05-29 09:46:47.000000000 -0400
@@ -1802,6 +1802,7 @@ static void __devexit tulip_remove_one (
 	pci_iounmap(pdev, tp->base_addr);
 	free_netdev (dev);
 	pci_release_regions (pdev);
+	pci_disable_device (pdev);
 	pci_set_drvdata (pdev, NULL);
 
 	/* pci_power_off (pdev, -1); */
Index: linux-2.6.25.4-rt4/kernel/printk.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/printk.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/printk.c	2008-05-29 09:47:18.000000000 -0400
@@ -90,7 +90,7 @@ static int console_locked, console_suspe
  * It is also used in interesting ways to provide interlocking in
  * release_console_sem().
  */
-static DEFINE_SPINLOCK(logbuf_lock);
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #define LOG_BUF_MASK (log_buf_len-1)
 #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -441,9 +441,13 @@ static void __call_console_drivers(unsig
 
 	for (con = console_drivers; con; con = con->next) {
 		if ((con->flags & CON_ENABLED) && con->write &&
-				(cpu_online(smp_processor_id()) ||
-				(con->flags & CON_ANYTIME)))
+				console_atomic_safe(con) &&
+				(cpu_online(raw_smp_processor_id()) ||
+				 (con->flags & CON_ANYTIME))) {
+			set_printk_might_sleep(1);
 			con->write(con, &LOG_BUF(start), end - start);
+			set_printk_might_sleep(0);
+		}
 	}
 }
 
@@ -557,6 +561,7 @@ static void zap_locks(void)
 	spin_lock_init(&logbuf_lock);
 	/* And make sure that we print immediately */
 	init_MUTEX(&console_sem);
+	zap_rt_locks();
 }
 
 #if defined(CONFIG_PRINTK_TIME)
@@ -639,7 +644,8 @@ static inline int can_use_console(unsign
  * interrupts disabled. It should return with 'lockbuf_lock'
  * released but interrupts still disabled.
  */
-static int acquire_console_semaphore_for_printk(unsigned int cpu)
+static int acquire_console_semaphore_for_printk(unsigned int cpu,
+						unsigned long flags)
 {
 	int retval = 0;
 
@@ -660,6 +666,8 @@ static int acquire_console_semaphore_for
 	}
 	printk_cpu = UINT_MAX;
 	spin_unlock(&logbuf_lock);
+	lockdep_on();
+	local_irq_restore(flags);
 	return retval;
 }
 
@@ -682,7 +690,7 @@ asmlinkage int vprintk(const char *fmt, 
 	preempt_disable();
 	/* This stops the holder of console_sem just where we want him */
 	raw_local_irq_save(flags);
-	this_cpu = smp_processor_id();
+	this_cpu = raw_smp_processor_id();
 
 	/*
 	 * Ouch, printk recursed into itself!
@@ -697,7 +705,8 @@ asmlinkage int vprintk(const char *fmt, 
 		 */
 		if (!oops_in_progress) {
 			printk_recursion_bug = 1;
-			goto out_restore_irqs;
+			raw_local_irq_restore(flags);
+			goto out;
 		}
 		zap_locks();
 	}
@@ -705,6 +714,7 @@ asmlinkage int vprintk(const char *fmt, 
 	lockdep_off();
 	spin_lock(&logbuf_lock);
 	printk_cpu = this_cpu;
+	preempt_enable();
 
 	if (printk_recursion_bug) {
 		printk_recursion_bug = 0;
@@ -782,14 +792,10 @@ asmlinkage int vprintk(const char *fmt, 
 	 * will release 'logbuf_lock' regardless of whether it
 	 * actually gets the semaphore or not.
 	 */
-	if (acquire_console_semaphore_for_printk(this_cpu))
+	if (acquire_console_semaphore_for_printk(this_cpu, flags))
 		release_console_sem();
 
-	lockdep_on();
-out_restore_irqs:
-	raw_local_irq_restore(flags);
-
-	preempt_enable();
+out:
 	return printed_len;
 }
 EXPORT_SYMBOL(printk);
@@ -1013,13 +1019,33 @@ void release_console_sem(void)
 		_con_start = con_start;
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
+		/*
+		 * on PREEMPT_RT, call console drivers with
+		 * interrupts enabled (if printk was called
+		 * with interrupts disabled):
+		 */
+#ifdef CONFIG_PREEMPT_RT
+		spin_unlock_irqrestore(&logbuf_lock, flags);
+#else
 		spin_unlock(&logbuf_lock);
+#endif
 		call_console_drivers(_con_start, _log_end);
+#ifndef CONFIG_PREEMPT_RT
 		local_irq_restore(flags);
+#endif
 	}
 	console_locked = 0;
-	up(&console_sem);
 	spin_unlock_irqrestore(&logbuf_lock, flags);
+	up(&console_sem);
+	/*
+	 * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd
+	 * up only if we are in a preemptible section. We normally dont
+	 * printk from non-preemptible sections so this is for the emergency
+	 * case only.
+	 */
+#ifdef CONFIG_PREEMPT_RT
+	if (!in_atomic() && !irqs_disabled())
+#endif
 	if (wake_klogd)
 		wake_up_klogd();
 }
@@ -1287,7 +1313,7 @@ void tty_write_message(struct tty_struct
  */
 int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 {
-	static DEFINE_SPINLOCK(ratelimit_lock);
+	static DEFINE_RAW_SPINLOCK(ratelimit_lock);
 	static unsigned toks = 10 * 5 * HZ;
 	static unsigned long last_msg;
 	static int missed;
@@ -1328,6 +1354,23 @@ int printk_ratelimit(void)
 }
 EXPORT_SYMBOL(printk_ratelimit);
 
+static DEFINE_RAW_SPINLOCK(warn_lock);
+
+void __WARN_ON(const char *func, const char *file, const int line)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&warn_lock, flags);
+	printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n",
+		current->comm, current->pid, raw_smp_processor_id(),
+		func, file, line);
+	dump_stack();
+	spin_unlock_irqrestore(&warn_lock, flags);
+}
+
+EXPORT_SYMBOL(__WARN_ON);
+
+
 /**
  * printk_timed_ratelimit - caller-controlled printk ratelimiting
  * @caller_jiffies: pointer to caller's state
Index: linux-2.6.25.4-rt4/drivers/oprofile/oprofilefs.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/oprofile/oprofilefs.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/oprofile/oprofilefs.c	2008-05-29 09:46:47.000000000 -0400
@@ -21,7 +21,7 @@
 
 #define OPROFILEFS_MAGIC 0x6f70726f
 
-DEFINE_SPINLOCK(oprofilefs_lock);
+DEFINE_RAW_SPINLOCK(oprofilefs_lock);
 
 static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode)
 {
Index: linux-2.6.25.4-rt4/drivers/pci/access.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/pci/access.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/pci/access.c	2008-05-29 09:46:47.000000000 -0400
@@ -11,7 +11,7 @@
  * configuration space.
  */
 
-static DEFINE_SPINLOCK(pci_lock);
+static DEFINE_RAW_SPINLOCK(pci_lock);
 
 /*
  *  Wrappers for all PCI configuration access functions.  They just check
Index: linux-2.6.25.4-rt4/drivers/video/console/vgacon.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/video/console/vgacon.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/video/console/vgacon.c	2008-05-29 09:46:47.000000000 -0400
@@ -51,7 +51,7 @@
 #include <video/vga.h>
 #include <asm/io.h>
 
-static DEFINE_SPINLOCK(vga_lock);
+static DEFINE_RAW_SPINLOCK(vga_lock);
 static int cursor_size_lastfrom;
 static int cursor_size_lastto;
 static u32 vgacon_xres;
Index: linux-2.6.25.4-rt4/include/linux/kprobes.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/kprobes.h	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/kprobes.h	2008-05-29 09:46:47.000000000 -0400
@@ -194,7 +194,7 @@ static inline int init_test_probes(void)
 }
 #endif /* CONFIG_KPROBES_SANITY_TEST */
 
-extern spinlock_t kretprobe_lock;
+extern raw_spinlock_t kretprobe_lock;
 extern struct mutex kprobe_mutex;
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
Index: linux-2.6.25.4-rt4/include/linux/oprofile.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/oprofile.h	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/oprofile.h	2008-05-29 09:46:47.000000000 -0400
@@ -159,6 +159,6 @@ ssize_t oprofilefs_ulong_to_user(unsigne
 int oprofilefs_ulong_from_user(unsigned long * val, char const __user * buf, size_t count);
 
 /** lock for read/write safety */
-extern spinlock_t oprofilefs_lock;
+extern raw_spinlock_t oprofilefs_lock;
  
 #endif /* OPROFILE_H */
Index: linux-2.6.25.4-rt4/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/percpu_counter.h	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/percpu_counter.h	2008-05-29 09:46:47.000000000 -0400
@@ -16,7 +16,7 @@
 #ifdef CONFIG_SMP
 
 struct percpu_counter {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	s64 count;
 #ifdef CONFIG_HOTPLUG_CPU
 	struct list_head list;	/* All percpu_counters are on a list */
Index: linux-2.6.25.4-rt4/kernel/kprobes.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/kprobes.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/kprobes.c	2008-05-29 09:46:47.000000000 -0400
@@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_
 static bool kprobe_enabled;
 
 DEFINE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
-DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
+DEFINE_RAW_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
 #ifdef __ARCH_WANT_KPROBES_INSN_SLOT
Index: linux-2.6.25.4-rt4/kernel/softlockup.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/softlockup.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/softlockup.c	2008-05-29 09:46:47.000000000 -0400
@@ -18,7 +18,7 @@
 
 #include <asm/irq_regs.h>
 
-static DEFINE_SPINLOCK(print_lock);
+static DEFINE_RAW_SPINLOCK(print_lock);
 
 static DEFINE_PER_CPU(unsigned long, touch_timestamp);
 static DEFINE_PER_CPU(unsigned long, print_timestamp);
Index: linux-2.6.25.4-rt4/include/linux/time.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/time.h	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/time.h	2008-05-29 09:47:24.000000000 -0400
@@ -92,7 +92,7 @@ static inline struct timespec timespec_s
 
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
-extern seqlock_t xtime_lock;
+extern raw_seqlock_t xtime_lock;
 
 extern unsigned long read_persistent_clock(void);
 extern int update_persistent_clock(struct timespec now);
@@ -170,7 +170,7 @@ extern struct timeval ns_to_timeval(cons
  * @a:		pointer to timespec to be incremented
  * @ns:		unsigned nanoseconds value to be added
  */
-static inline void timespec_add_ns(struct timespec *a, u64 ns)
+static inline void timespec_add_ns(struct timespec *a, volatile u64 ns)
 {
 	ns += a->tv_nsec;
 	while(unlikely(ns >= NSEC_PER_SEC)) {
Index: linux-2.6.25.4-rt4/kernel/time/clockevents.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/time/clockevents.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/time/clockevents.c	2008-05-29 09:46:48.000000000 -0400
@@ -27,7 +27,7 @@ static LIST_HEAD(clockevents_released);
 static RAW_NOTIFIER_HEAD(clockevents_chain);
 
 /* Protection for the above */
-static DEFINE_SPINLOCK(clockevents_lock);
+static DEFINE_RAW_SPINLOCK(clockevents_lock);
 
 /**
  * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
Index: linux-2.6.25.4-rt4/kernel/time/clocksource.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/time/clocksource.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/time/clocksource.c	2008-05-29 09:46:48.000000000 -0400
@@ -51,7 +51,7 @@ static struct clocksource *curr_clocksou
 static struct clocksource *next_clocksource;
 static struct clocksource *clocksource_override;
 static LIST_HEAD(clocksource_list);
-static DEFINE_SPINLOCK(clocksource_lock);
+static DEFINE_RAW_SPINLOCK(clocksource_lock);
 static char override_name[32];
 static int finished_booting;
 
Index: linux-2.6.25.4-rt4/kernel/time/tick-broadcast.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/time/tick-broadcast.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/time/tick-broadcast.c	2008-05-29 09:46:48.000000000 -0400
@@ -29,7 +29,7 @@
 
 struct tick_device tick_broadcast_device;
 static cpumask_t tick_broadcast_mask;
-static DEFINE_SPINLOCK(tick_broadcast_lock);
+static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
 
 #ifdef CONFIG_TICK_ONESHOT
 static void tick_broadcast_clear_oneshot(int cpu);
Index: linux-2.6.25.4-rt4/kernel/time/tick-internal.h
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/time/tick-internal.h	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/time/tick-internal.h	2008-05-29 09:46:48.000000000 -0400
@@ -2,7 +2,7 @@
  * tick internal variable and functions used by low/high res code
  */
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
-extern spinlock_t tick_device_lock;
+extern raw_spinlock_t tick_device_lock;
 extern ktime_t tick_next_period;
 extern ktime_t tick_period;
 extern int tick_do_timer_cpu __read_mostly;
Index: linux-2.6.25.4-rt4/kernel/time/timer_stats.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/time/timer_stats.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/time/timer_stats.c	2008-05-29 09:46:48.000000000 -0400
@@ -81,12 +81,12 @@ struct entry {
 /*
  * Spinlock protecting the tables - not taken during lookup:
  */
-static DEFINE_SPINLOCK(table_lock);
+static DEFINE_RAW_SPINLOCK(table_lock);
 
 /*
  * Per-CPU lookup locks for fast hash lookup:
  */
-static DEFINE_PER_CPU(spinlock_t, lookup_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, lookup_lock);
 
 /*
  * Mutex to serialize state changes with show-stats activities:
@@ -238,7 +238,7 @@ void timer_stats_update_stats(void *time
 	/*
 	 * It doesnt matter which lock we take:
 	 */
-	spinlock_t *lock;
+	raw_spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
 	struct entry *entry, input;
 	unsigned long flags;
 
Index: linux-2.6.25.4-rt4/drivers/net/usb/usbnet.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/net/usb/usbnet.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/net/usb/usbnet.c	2008-05-29 09:46:49.000000000 -0400
@@ -904,6 +904,8 @@ static void tx_complete (struct urb *urb
 
 	urb->dev = NULL;
 	entry->state = tx_done;
+	spin_lock_rt(&dev->txq.lock);
+	spin_unlock_rt(&dev->txq.lock);
 	defer_bh(dev, skb, &dev->txq);
 }
 
Index: linux-2.6.25.4-rt4/drivers/usb/core/devio.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/usb/core/devio.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/usb/core/devio.c	2008-05-29 09:46:49.000000000 -0400
@@ -311,10 +311,11 @@ static void async_completed(struct urb *
 	struct async *as = urb->context;
 	struct dev_state *ps = as->ps;
 	struct siginfo sinfo;
+	unsigned long flags;
 
-	spin_lock(&ps->lock);
+	spin_lock_irqsave(&ps->lock, flags);
 	list_move_tail(&as->asynclist, &ps->async_completed);
-	spin_unlock(&ps->lock);
+	spin_unlock_irqrestore(&ps->lock, flags);
 	as->status = urb->status;
 	if (as->signr) {
 		sinfo.si_signo = as->signr;
Index: linux-2.6.25.4-rt4/drivers/usb/core/message.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/usb/core/message.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/usb/core/message.c	2008-05-29 09:46:49.000000000 -0400
@@ -266,8 +266,9 @@ static void sg_complete(struct urb *urb)
 {
 	struct usb_sg_request *io = urb->context;
 	int status = urb->status;
+	unsigned long flags;
 
-	spin_lock(&io->lock);
+	spin_lock_irqsave (&io->lock, flags);
 
 	/* In 2.5 we require hcds' endpoint queues not to progress after fault
 	 * reports, until the completion callback (this!) returns.  That lets
@@ -301,7 +302,7 @@ static void sg_complete(struct urb *urb)
 		 * unlink pending urbs so they won't rx/tx bad data.
 		 * careful: unlink can sometimes be synchronous...
 		 */
-		spin_unlock(&io->lock);
+		spin_unlock_irqrestore (&io->lock, flags);
 		for (i = 0, found = 0; i < io->entries; i++) {
 			if (!io->urbs [i] || !io->urbs [i]->dev)
 				continue;
@@ -316,7 +317,7 @@ static void sg_complete(struct urb *urb)
 			} else if (urb == io->urbs [i])
 				found = 1;
 		}
-		spin_lock(&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 	urb->dev = NULL;
 
@@ -326,7 +327,7 @@ static void sg_complete(struct urb *urb)
 	if (!io->count)
 		complete(&io->complete);
 
-	spin_unlock(&io->lock);
+	spin_unlock_irqrestore (&io->lock, flags);
 }
 
 
@@ -591,7 +592,7 @@ void usb_sg_cancel(struct usb_sg_request
 		int i;
 
 		io->status = -ECONNRESET;
-		spin_unlock(&io->lock);
+		spin_unlock_irqrestore(&io->lock, flags);
 		for (i = 0; i < io->entries; i++) {
 			int retval;
 
@@ -602,7 +603,7 @@ void usb_sg_cancel(struct usb_sg_request
 				dev_warn(&io->dev->dev, "%s, unlink --> %d\n",
 					__FUNCTION__, retval);
 		}
-		spin_lock(&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 	spin_unlock_irqrestore(&io->lock, flags);
 }
Index: linux-2.6.25.4-rt4/include/linux/netdevice.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/netdevice.h	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/netdevice.h	2008-05-29 09:47:15.000000000 -0400
@@ -631,7 +631,7 @@ struct net_device
 	/* cpu id of processor entered to hard_start_xmit or -1,
 	   if nobody entered there.
 	 */
-	int			xmit_lock_owner;
+	void			*xmit_lock_owner;
 	void			*priv;	/* pointer to private data	*/
 	int			(*hard_start_xmit) (struct sk_buff *skb,
 						    struct net_device *dev);
@@ -1345,46 +1345,46 @@ static inline void netif_rx_complete(str
  *
  * Get network device transmit lock
  */
-static inline void __netif_tx_lock(struct net_device *dev, int cpu)
+static inline void __netif_tx_lock(struct net_device *dev)
 {
 	spin_lock(&dev->_xmit_lock);
-	dev->xmit_lock_owner = cpu;
+	dev->xmit_lock_owner = (void *)current;
 }
 
 static inline void netif_tx_lock(struct net_device *dev)
 {
-	__netif_tx_lock(dev, smp_processor_id());
+	__netif_tx_lock(dev);
 }
 
 static inline void netif_tx_lock_bh(struct net_device *dev)
 {
 	spin_lock_bh(&dev->_xmit_lock);
-	dev->xmit_lock_owner = smp_processor_id();
+	dev->xmit_lock_owner = (void *)current;
 }
 
 static inline int netif_tx_trylock(struct net_device *dev)
 {
 	int ok = spin_trylock(&dev->_xmit_lock);
 	if (likely(ok))
-		dev->xmit_lock_owner = smp_processor_id();
+		dev->xmit_lock_owner = (void *)current;
 	return ok;
 }
 
 static inline void netif_tx_unlock(struct net_device *dev)
 {
-	dev->xmit_lock_owner = -1;
+	dev->xmit_lock_owner = (void *)-1;
 	spin_unlock(&dev->_xmit_lock);
 }
 
 static inline void netif_tx_unlock_bh(struct net_device *dev)
 {
-	dev->xmit_lock_owner = -1;
+	dev->xmit_lock_owner = (void *)-1;
 	spin_unlock_bh(&dev->_xmit_lock);
 }
 
-#define HARD_TX_LOCK(dev, cpu) {			\
+#define HARD_TX_LOCK(dev) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
-		__netif_tx_lock(dev, cpu);			\
+		__netif_tx_lock(dev);			\
 	}						\
 }
 
Index: linux-2.6.25.4-rt4/include/net/dn_dev.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/net/dn_dev.h	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/include/net/dn_dev.h	2008-05-29 09:46:49.000000000 -0400
@@ -76,9 +76,9 @@ struct dn_dev_parms {
 	int priority;             /* Priority to be a router            */
 	char *name;               /* Name for sysctl                    */
 	int ctl_name;             /* Index for sysctl                   */
-	int  (*up)(struct net_device *);
-	void (*down)(struct net_device *);
-	void (*timer3)(struct net_device *, struct dn_ifaddr *ifa);
+	int  (*dn_up)(struct net_device *);
+	void (*dn_down)(struct net_device *);
+	void (*dn_timer3)(struct net_device *, struct dn_ifaddr *ifa);
 	void *sysctl;
 };
 
Index: linux-2.6.25.4-rt4/net/core/netpoll.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/core/netpoll.c	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/net/core/netpoll.c	2008-05-29 09:46:49.000000000 -0400
@@ -64,20 +64,20 @@ static void queue_process(struct work_st
 			continue;
 		}
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		netif_tx_lock(dev);
 		if ((netif_queue_stopped(dev) ||
 		     netif_subqueue_stopped(dev, skb)) ||
 		     dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
 			skb_queue_head(&npinfo->txq, skb);
 			netif_tx_unlock(dev);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 
 			schedule_delayed_work(&npinfo->tx_work, HZ/10);
 			return;
 		}
 		netif_tx_unlock(dev);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 }
 
@@ -145,7 +145,7 @@ static void poll_napi(struct net_device 
 	int budget = 16;
 
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
-		if (napi->poll_owner != smp_processor_id() &&
+		if (napi->poll_owner != raw_smp_processor_id() &&
 		    spin_trylock(&napi->poll_lock)) {
 			budget = poll_one_napi(dev->npinfo, napi, budget);
 			spin_unlock(&napi->poll_lock);
@@ -201,30 +201,35 @@ static void refill_skbs(void)
 
 static void zap_completion_queue(void)
 {
-	unsigned long flags;
 	struct softnet_data *sd = &get_cpu_var(softnet_data);
+	struct sk_buff *clist = NULL;
+	unsigned long flags;
 
 	if (sd->completion_queue) {
-		struct sk_buff *clist;
 
 		local_irq_save(flags);
 		clist = sd->completion_queue;
 		sd->completion_queue = NULL;
 		local_irq_restore(flags);
-
-		while (clist != NULL) {
-			struct sk_buff *skb = clist;
-			clist = clist->next;
-			if (skb->destructor) {
-				atomic_inc(&skb->users);
-				dev_kfree_skb_any(skb); /* put this one back */
-			} else {
-				__kfree_skb(skb);
-			}
-		}
 	}
 
+
+	/*
+	 * Took the list private, can drop our softnet
+	 * reference:
+	 */
 	put_cpu_var(softnet_data);
+
+	while (clist != NULL) {
+		struct sk_buff *skb = clist;
+		clist = clist->next;
+		if (skb->destructor) {
+			atomic_inc(&skb->users);
+			dev_kfree_skb_any(skb); /* put this one back */
+		} else {
+			__kfree_skb(skb);
+		}
+	}
 }
 
 static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
@@ -232,13 +237,26 @@ static struct sk_buff *find_skb(struct n
 	int count = 0;
 	struct sk_buff *skb;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * On -rt skb_pool.lock is schedulable, so if we are
+	 * in an atomic context we just try to dequeue from the
+	 * pool and fail if we cannot get one.
+	 */
+	if (in_atomic() || irqs_disabled())
+		goto pick_atomic;
+#endif
 	zap_completion_queue();
 	refill_skbs();
 repeat:
 
 	skb = alloc_skb(len, GFP_ATOMIC);
-	if (!skb)
+	if (!skb) {
+#ifdef CONFIG_PREEMPT_RT
+pick_atomic:
+#endif
 		skb = skb_dequeue(&skb_pool);
+	}
 
 	if (!skb) {
 		if (++count < 10) {
@@ -258,7 +276,7 @@ static int netpoll_owner_active(struct n
 	struct napi_struct *napi;
 
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
-		if (napi->poll_owner == smp_processor_id())
+		if (napi->poll_owner == raw_smp_processor_id())
 			return 1;
 	}
 	return 0;
@@ -280,7 +298,7 @@ static void netpoll_send_skb(struct netp
 	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
 		unsigned long flags;
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		/* try until next clock tick */
 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
 		     tries > 0; --tries) {
@@ -300,7 +318,7 @@ static void netpoll_send_skb(struct netp
 
 			udelay(USEC_PER_POLL);
 		}
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 
 	if (status != NETDEV_TX_OK) {
@@ -724,7 +742,7 @@ int netpoll_setup(struct netpoll *np)
 				       np->name);
 				break;
 			}
-			cond_resched();
+			schedule_timeout_uninterruptible(1);
 		}
 
 		/* If carrier appears to come up instantly, we don't
Index: linux-2.6.25.4-rt4/net/decnet/dn_dev.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/decnet/dn_dev.c	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/net/decnet/dn_dev.c	2008-05-29 09:46:49.000000000 -0400
@@ -90,9 +90,9 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		10,
 	.name =		"ethernet",
 	.ctl_name =	NET_DECNET_CONF_ETHER,
-	.up =		dn_eth_up,
-	.down = 	dn_eth_down,
-	.timer3 =	dn_send_brd_hello,
+	.dn_up =		dn_eth_up,
+	.dn_down = 	dn_eth_down,
+	.dn_timer3 =	dn_send_brd_hello,
 },
 {
 	.type =		ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */
@@ -102,7 +102,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		10,
 	.name =		"ipgre",
 	.ctl_name =	NET_DECNET_CONF_GRE,
-	.timer3 =	dn_send_brd_hello,
+	.dn_timer3 =	dn_send_brd_hello,
 },
 #if 0
 {
@@ -113,7 +113,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		120,
 	.name =		"x25",
 	.ctl_name =	NET_DECNET_CONF_X25,
-	.timer3 =	dn_send_ptp_hello,
+	.dn_timer3 =	dn_send_ptp_hello,
 },
 #endif
 #if 0
@@ -125,7 +125,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		10,
 	.name =		"ppp",
 	.ctl_name =	NET_DECNET_CONF_PPP,
-	.timer3 =	dn_send_brd_hello,
+	.dn_timer3 =	dn_send_brd_hello,
 },
 #endif
 {
@@ -136,7 +136,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		120,
 	.name =		"ddcmp",
 	.ctl_name =	NET_DECNET_CONF_DDCMP,
-	.timer3 =	dn_send_ptp_hello,
+	.dn_timer3 =	dn_send_ptp_hello,
 },
 {
 	.type =		ARPHRD_LOOPBACK, /* Loopback interface - always last */
@@ -146,7 +146,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		10,
 	.name =		"loopback",
 	.ctl_name =	NET_DECNET_CONF_LOOPBACK,
-	.timer3 =	dn_send_brd_hello,
+	.dn_timer3 =	dn_send_brd_hello,
 }
 };
 
@@ -305,11 +305,11 @@ static int dn_forwarding_proc(ctl_table 
 		 */
 		tmp = dn_db->parms.forwarding;
 		dn_db->parms.forwarding = old;
-		if (dn_db->parms.down)
-			dn_db->parms.down(dev);
+		if (dn_db->parms.dn_down)
+			dn_db->parms.dn_down(dev);
 		dn_db->parms.forwarding = tmp;
-		if (dn_db->parms.up)
-			dn_db->parms.up(dev);
+		if (dn_db->parms.dn_up)
+			dn_db->parms.dn_up(dev);
 	}
 
 	return err;
@@ -343,11 +343,11 @@ static int dn_forwarding_sysctl(ctl_tabl
 		if (value > 2)
 			return -EINVAL;
 
-		if (dn_db->parms.down)
-			dn_db->parms.down(dev);
+		if (dn_db->parms.dn_down)
+			dn_db->parms.dn_down(dev);
 		dn_db->parms.forwarding = value;
-		if (dn_db->parms.up)
-			dn_db->parms.up(dev);
+		if (dn_db->parms.dn_up)
+			dn_db->parms.dn_up(dev);
 	}
 
 	return 0;
@@ -1080,10 +1080,10 @@ static void dn_dev_timer_func(unsigned l
 	struct dn_ifaddr *ifa;
 
 	if (dn_db->t3 <= dn_db->parms.t2) {
-		if (dn_db->parms.timer3) {
+		if (dn_db->parms.dn_timer3) {
 			for(ifa = dn_db->ifa_list; ifa; ifa = ifa->ifa_next) {
 				if (!(ifa->ifa_flags & IFA_F_SECONDARY))
-					dn_db->parms.timer3(dev, ifa);
+					dn_db->parms.dn_timer3(dev, ifa);
 			}
 		}
 		dn_db->t3 = dn_db->parms.t3;
@@ -1142,8 +1142,8 @@ struct dn_dev *dn_dev_create(struct net_
 		return NULL;
 	}
 
-	if (dn_db->parms.up) {
-		if (dn_db->parms.up(dev) < 0) {
+	if (dn_db->parms.dn_up) {
+		if (dn_db->parms.dn_up(dev) < 0) {
 			neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
 			dev->dn_ptr = NULL;
 			kfree(dn_db);
@@ -1237,8 +1237,8 @@ static void dn_dev_delete(struct net_dev
 	dn_dev_check_default(dev);
 	neigh_ifdown(&dn_neigh_table, dev);
 
-	if (dn_db->parms.down)
-		dn_db->parms.down(dev);
+	if (dn_db->parms.dn_down)
+		dn_db->parms.dn_down(dev);
 
 	dev->dn_ptr = NULL;
 
Index: linux-2.6.25.4-rt4/net/ipv4/icmp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/ipv4/icmp.c	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/net/ipv4/icmp.c	2008-05-29 09:46:49.000000000 -0400
@@ -230,7 +230,10 @@ static const struct icmp_control icmp_po
  *	On SMP we have one ICMP socket per-cpu.
  */
 static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
-#define icmp_socket	__get_cpu_var(__icmp_socket)
+/*
+ * Should be safe on PREEMPT_SOFTIRQS/HARDIRQS to use raw-smp-processor-id:
+ */
+#define icmp_socket	per_cpu(__icmp_socket, raw_smp_processor_id())
 
 static inline int icmp_xmit_lock(void)
 {
Index: linux-2.6.25.4-rt4/net/ipv6/netfilter/ip6_tables.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/ipv6/netfilter/ip6_tables.c	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/net/ipv6/netfilter/ip6_tables.c	2008-05-29 09:46:49.000000000 -0400
@@ -378,7 +378,7 @@ ip6t_do_table(struct sk_buff *skb,
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+	table_base = (void *)private->entries[raw_smp_processor_id()];
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
Index: linux-2.6.25.4-rt4/net/sched/sch_generic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/sched/sch_generic.c	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/net/sched/sch_generic.c	2008-05-29 09:47:15.000000000 -0400
@@ -12,6 +12,7 @@
  */
 
 #include <linux/bitops.h>
+#include <linux/kallsyms.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -24,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
+#include <linux/delay.h>
 #include <net/pkt_sched.h>
 
 /* Main transmission queue. */
@@ -93,7 +95,7 @@ static inline int handle_dev_cpu_collisi
 {
 	int ret;
 
-	if (unlikely(dev->xmit_lock_owner == smp_processor_id())) {
+	if (unlikely(dev->xmit_lock_owner == (void *)current)) {
 		/*
 		 * Same CPU holding the lock. It may be a transient
 		 * configuration error, when hard_start_xmit() recurses. We
@@ -150,7 +152,7 @@ static inline int qdisc_restart(struct n
 	/* And release queue */
 	spin_unlock(&dev->queue_lock);
 
-	HARD_TX_LOCK(dev, smp_processor_id());
+	HARD_TX_LOCK(dev);
 	if (!netif_subqueue_stopped(dev, skb))
 		ret = dev_hard_start_xmit(skb, dev);
 	HARD_TX_UNLOCK(dev);
@@ -595,8 +597,12 @@ void dev_deactivate(struct net_device *d
 
 	/* Wait for outstanding qdisc_run calls. */
 	do {
+		/*
+		 * Wait for outstanding qdisc_run calls.
+		 * TODO: shouldnt this be wakeup-based, instead of polling it?
+		 */
 		while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
-			yield();
+			msleep(1);
 
 		/*
 		 * Double-check inside queue lock to ensure that all effects
Index: linux-2.6.25.4-rt4/net/unix/af_unix.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/unix/af_unix.c	2008-05-29 09:45:43.000000000 -0400
+++ linux-2.6.25.4-rt4/net/unix/af_unix.c	2008-05-29 09:46:49.000000000 -0400
@@ -318,6 +318,7 @@ static void unix_write_space(struct sock
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	read_unlock(&sk->sk_callback_lock);
+	preempt_check_resched_delayed();
 }
 
 /* When dgram socket disconnects (or changes its peer), we clear its receive
Index: linux-2.6.25.4-rt4/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2008-05-29 09:46:50.000000000 -0400
@@ -768,7 +768,7 @@ void ipoib_mcast_restart_task(struct wor
 
 	ipoib_mcast_stop_thread(dev, 0);
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	netif_tx_lock(dev);
 	spin_lock(&priv->lock);
 
@@ -847,7 +847,7 @@ void ipoib_mcast_restart_task(struct wor
 
 	spin_unlock(&priv->lock);
 	netif_tx_unlock(dev);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 
 	/* We have to cancel outside of the spinlock */
 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
Index: linux-2.6.25.4-rt4/arch/x86/kernel/apic_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/apic_64.c	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/apic_64.c	2008-05-29 09:46:51.000000000 -0400
@@ -838,7 +838,6 @@ void __cpuinit lapic_setup_esr(void)
 void __cpuinit end_local_APIC_setup(void)
 {
 	lapic_setup_esr();
-	nmi_watchdog_default();
 	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 }
Index: linux-2.6.25.4-rt4/arch/x86/kernel/smpboot_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/smpboot_64.c	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/smpboot_64.c	2008-05-29 09:46:51.000000000 -0400
@@ -869,7 +869,6 @@ static void __init smp_cpu_index_default
  */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
-	nmi_watchdog_default();
 	smp_cpu_index_default();
 	current_cpu_data = boot_cpu_data;
 	current_thread_info()->cpu = 0;  /* needed? */
Index: linux-2.6.25.4-rt4/include/asm-x86/nmi_64.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/nmi_64.h	2008-05-29 09:45:42.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/nmi_64.h	2008-05-29 09:46:51.000000000 -0400
@@ -55,8 +55,6 @@ extern void disable_timer_nmi_watchdog(v
 extern void enable_timer_nmi_watchdog(void);
 extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
 
-extern void nmi_watchdog_default(void);
-
 extern atomic_t nmi_active;
 extern unsigned int nmi_watchdog;
 #define NMI_DISABLED    -1
Index: linux-2.6.25.4-rt4/include/asm-arm26/irq_regs.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/asm-arm26/irq_regs.h	2008-05-29 09:46:51.000000000 -0400
@@ -0,0 +1 @@
+#include <asm-generic/irq_regs.h>
Index: linux-2.6.25.4-rt4/include/linux/lock_list.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/linux/lock_list.h	2008-05-29 09:46:53.000000000 -0400
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2006, Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Licenced under the GPLv2.
+ *
+ * Simple fine grain locked double linked list.
+ */
+#ifndef _LINUX_LOCK_LIST_H
+#define _LINUX_LOCK_LIST_H
+
+#ifdef __KERNEL__
+
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+
+struct lock_list_head {
+	union {
+		struct list_head head;
+		struct {
+			struct lock_list_head *next, *prev;
+		};
+	};
+	spinlock_t lock;
+};
+
+enum {
+	LOCK_LIST_NESTING_PREV = 1,
+	LOCK_LIST_NESTING_CUR,
+	LOCK_LIST_NESTING_NEXT,
+};
+
+static inline void INIT_LOCK_LIST_HEAD(struct lock_list_head *list)
+{
+	INIT_LIST_HEAD(&list->head);
+	spin_lock_init(&list->lock);
+}
+
+/*
+ * Passed pointers are assumed stable by external means (refcount, rcu)
+ */
+extern void lock_list_add(struct lock_list_head *new,
+			  struct lock_list_head *list);
+extern void lock_list_del_init(struct lock_list_head *entry);
+extern void lock_list_splice_init(struct lock_list_head *list,
+				  struct lock_list_head *head);
+
+struct lock_list_head *lock_list_next_entry(struct lock_list_head *list,
+					    struct lock_list_head *entry);
+struct lock_list_head *lock_list_first_entry(struct lock_list_head *list);
+
+#define lock_list_for_each_entry(pos, list, member)			\
+	for (pos = list_entry(lock_list_first_entry(list), 		\
+			      typeof(*pos), member); 			\
+	     pos;							\
+	     pos = list_entry(lock_list_next_entry(list, &pos->member),	\
+			      typeof(*pos), member))
+
+/*
+ * to be used when iteration is terminated by breaking out of the
+ * lock_list_for_each_entry() loop.
+ *
+ * 	lock_list_for_each_entry(i, list, member) {
+ * 		if (cond) {
+ * 			lock_list_for_each_entry_stop(i, member);
+ * 			goto foo;
+ * 		}
+ * 	}
+ *
+ */
+#define lock_list_for_each_entry_stop(pos, member)			\
+	spin_unlock(&(pos->member.lock))
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_LOCK_LIST_H */
Index: linux-2.6.25.4-rt4/lib/lock_list.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/lib/lock_list.c	2008-05-29 09:47:10.000000000 -0400
@@ -0,0 +1,161 @@
+/*
+ * Copyright (C) 2006, Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Licenced under the GPLv2.
+ *
+ * Simple fine grain locked double linked list.
+ *
+ * Locking order is from prev -> next.
+ * Edges are locked not nodes; that is, cur->lock protects:
+ *  - cur->next,
+ *  - cur->next->prev.
+ *
+ * Passed pointers are assumed to be stable by external means such as
+ * refcounts or RCU. The individual list entries are assumed to be RCU
+ * freed (requirement of __lock_list).
+ */
+
+#include <linux/lock_list.h>
+
+void lock_list_add(struct lock_list_head *new,
+		   struct lock_list_head *list)
+{
+	spin_lock(&new->lock);
+	spin_lock_nested(&list->lock, LOCK_LIST_NESTING_PREV);
+	__list_add(&new->head, &list->head, &list->next->head);
+	spin_unlock(&list->lock);
+	spin_unlock(&new->lock);
+}
+
+static spinlock_t *__lock_list(struct lock_list_head *entry)
+{
+	struct lock_list_head *prev;
+	spinlock_t *lock = NULL;
+
+again:
+	/*
+	 * all modifications are done under spinlocks
+	 * but this read is not, the unlock acks as a wmb
+	 * for modifications.
+	 */
+	smp_rmb();
+
+	prev = entry->prev;
+	if (prev == entry)
+		goto one;
+	spin_lock_nested(&prev->lock, LOCK_LIST_NESTING_PREV);
+	if (unlikely(entry->prev != prev)) {
+		/*
+		 * we lost
+		 */
+		spin_unlock(&prev->lock);
+		goto again;
+	}
+	lock = &prev->lock;
+one:
+	spin_lock_nested(&entry->lock, LOCK_LIST_NESTING_CUR);
+	return lock;
+}
+
+/*
+ * deadlock galore...
+ *
+ * when using __lock_list to lock the list head we get this:
+ *
+ *     lock H      2               1
+ *     lock 1      a       b
+ *     lock 2              A       B
+ *
+ *     list: ..-> [H] <-> [1] <-> [2] <-..
+ *
+ * obvious dead-lock, to solve this we must use a reverse order
+ * when trying to acquire a double lock on the head:
+ *
+ *     lock H r    1               2
+ *     lock 1      a       b
+ *     lock 2              A       B
+ *
+ *     list: ..-> [H] <-> [1] <-> [2] <-..
+ */
+static spinlock_t *__lock_list_reverse(struct lock_list_head *entry)
+{
+	struct lock_list_head *prev;
+	spinlock_t *lock = NULL;
+
+	spin_lock(&entry->lock);
+again:
+	/*
+	 * all modifications are done under spinlocks
+	 * but this read is not, the unlock acks as a wmb
+	 * for modifications.
+	 */
+	smp_rmb();
+	prev = entry->prev;
+	if (prev == entry)
+		goto done;
+
+	spin_lock_nested(&prev->lock, LOCK_LIST_NESTING_PREV);
+	if (unlikely(entry->prev != prev)) {
+		/*
+		 * we lost
+		 */
+		spin_unlock(&prev->lock);
+		goto again;
+	}
+	lock = &prev->lock;
+done:
+	return lock;
+}
+
+void lock_list_del_init(struct lock_list_head *entry)
+{
+	spinlock_t *lock;
+
+	rcu_read_lock();
+	lock = __lock_list(entry);
+	list_del_init(&entry->head);
+	spin_unlock(&entry->lock);
+	if (lock)
+		spin_unlock(lock);
+	rcu_read_unlock();
+}
+
+void lock_list_splice_init(struct lock_list_head *list,
+			struct lock_list_head *head)
+{
+	spinlock_t *lock;
+
+	rcu_read_lock();
+	lock = __lock_list_reverse(list);
+	if (!list_empty(&list->head)) {
+		spin_lock_nested(&head->lock, LOCK_LIST_NESTING_NEXT);
+		__list_splice(&list->head, &head->head, head->head.next);
+		INIT_LIST_HEAD(&list->head);
+		spin_unlock(&head->lock);
+	}
+	spin_unlock(&list->lock);
+	if (lock)
+		spin_unlock(lock);
+	rcu_read_unlock();
+}
+
+struct lock_list_head *lock_list_next_entry(struct lock_list_head *list,
+					    struct lock_list_head *entry)
+{
+	struct lock_list_head *next = entry->next;
+	if (likely(next != list)) {
+		lock_set_subclass(&entry->lock.dep_map,
+				  LOCK_LIST_NESTING_CUR, _THIS_IP_);
+		spin_lock_nested(&next->lock, LOCK_LIST_NESTING_NEXT);
+		BUG_ON(entry->next != next);
+	} else
+		next = NULL;
+	spin_unlock(&entry->lock);
+	return next;
+}
+
+struct lock_list_head *lock_list_first_entry(struct lock_list_head *list)
+{
+	spin_lock(&list->lock);
+	return lock_list_next_entry(list, list);
+}
+
Index: linux-2.6.25.4-rt4/include/linux/percpu_list.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/linux/percpu_list.h	2008-05-29 09:46:53.000000000 -0400
@@ -0,0 +1,119 @@
+#ifndef _LINUX_PERCPU_LIST_H
+#define _LINUX_PERCPU_LIST_H
+
+#include <linux/lock_list.h>
+#include <linux/percpu.h>
+
+#ifdef CONFIG_SMP
+
+struct percpu_list_element {
+	spinlock_t lock;
+	unsigned long nr;
+	struct lock_list_head list;
+};
+
+struct percpu_list {
+	struct lock_list_head list;
+	struct percpu_list_element *percpu_list;
+};
+
+static inline
+void percpu_list_init(struct percpu_list *pcl)
+{
+	int cpu;
+
+	INIT_LOCK_LIST_HEAD(&pcl->list);
+	pcl->percpu_list = alloc_percpu(struct percpu_list_element);
+
+	for_each_possible_cpu(cpu) {
+		struct percpu_list_element *pcle;
+
+		pcle = per_cpu_ptr(pcl->percpu_list, cpu);
+		spin_lock_init(&pcle->lock);
+		pcle->nr = 0;
+		INIT_LOCK_LIST_HEAD(&pcle->list);
+	}
+}
+
+static inline
+void percpu_list_destroy(struct percpu_list *pcl)
+{
+	free_percpu(pcl->percpu_list);
+}
+
+static inline
+void percpu_list_fold_cpu(struct percpu_list *pcl, int cpu)
+{
+	struct percpu_list_element *pcle = per_cpu_ptr(pcl->percpu_list, cpu);
+
+	spin_lock(&pcle->lock);
+	if (pcle->nr) {
+		pcle->nr = 0;
+		lock_list_splice_init(&pcle->list, &pcl->list);
+	}
+	spin_unlock(&pcle->lock);
+}
+
+static inline
+void percpu_list_add(struct percpu_list *pcl, struct lock_list_head *elm)
+{
+	struct percpu_list_element *pcle;
+	int cpu = raw_smp_processor_id();
+	unsigned long nr;
+
+	pcle = per_cpu_ptr(pcl->percpu_list, cpu);
+	spin_lock(&pcle->lock);
+	nr = ++pcle->nr;
+	lock_list_add(elm, &pcle->list);
+	spin_unlock(&pcle->lock);
+
+	if (nr >= 16)
+		percpu_list_fold_cpu(pcl, cpu);
+}
+
+static inline
+void percpu_list_fold(struct percpu_list *pcl)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		percpu_list_fold_cpu(pcl, cpu);
+}
+
+#else /* CONFIG_SMP */
+
+struct percpu_list {
+	struct lock_list_head list;
+};
+
+static inline
+void percpu_list_init(struct percpu_list *pcl)
+{
+	INIT_LOCK_LIST_HEAD(&pcl->list);
+}
+
+static inline
+void percpu_list_destroy(struct percpu_list *pcl)
+{
+}
+
+static inline
+void percpu_list_add(struct percpu_list *pcl, struct lock_list_head *elm)
+{
+	lock_list_add(elm, &pcl->list);
+}
+
+static inline
+void percpu_list_fold(struct percpu_list *pcl)
+{
+}
+
+#endif
+
+static inline
+struct lock_list_head *percpu_list_head(struct percpu_list *pcl)
+{
+	return &pcl->list;
+}
+
+#endif /* _LINUX_PERCPU_LIST_H */
Index: linux-2.6.25.4-rt4/fs/file_table.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/file_table.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/file_table.c	2008-05-29 09:46:53.000000000 -0400
@@ -28,9 +28,6 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-/* public. Not pretty! */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
-
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
 static inline void file_free_rcu(struct rcu_head *head)
@@ -117,7 +114,7 @@ struct file *get_empty_filp(void)
 		goto fail_sec;
 
 	tsk = current;
-	INIT_LIST_HEAD(&f->f_u.fu_list);
+	INIT_LOCK_LIST_HEAD(&f->f_u.fu_llist);
 	atomic_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_uid = tsk->fsuid;
@@ -309,31 +306,35 @@ void put_filp(struct file *file)
 	}
 }
 
-void file_move(struct file *file, struct list_head *list)
+void file_move(struct file *file, struct percpu_list *list)
 {
 	if (!list)
 		return;
-	file_list_lock();
-	list_move(&file->f_u.fu_list, list);
-	file_list_unlock();
+
+	file_kill(file);
+	percpu_list_add(list, &file->f_u.fu_llist);
 }
 
 void file_kill(struct file *file)
 {
-	if (!list_empty(&file->f_u.fu_list)) {
-		file_list_lock();
-		list_del_init(&file->f_u.fu_list);
-		file_list_unlock();
+	if (file && file->f_mapping && file->f_mapping->host) {
+		struct super_block *sb = file->f_mapping->host->i_sb;
+		if (sb)
+			synchronize_qrcu(&sb->s_qrcu);
 	}
+
+	lock_list_del_init(&file->f_u.fu_llist);
 }
 
 int fs_may_remount_ro(struct super_block *sb)
 {
 	struct file *file;
+	int idx;
 
 	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+	idx = qrcu_read_lock(&sb->s_qrcu);
+	percpu_list_fold(&sb->s_files);
+	lock_list_for_each_entry(file, percpu_list_head(&sb->s_files), f_u.fu_llist) {
 		struct inode *inode = file->f_path.dentry->d_inode;
 
 		/* File with pending delete? */
@@ -344,10 +345,11 @@ int fs_may_remount_ro(struct super_block
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
 	}
-	file_list_unlock();
+	qrcu_read_unlock(&sb->s_qrcu, idx);
 	return 1; /* Tis' cool bro. */
 too_bad:
-	file_list_unlock();
+	lock_list_for_each_entry_stop(file, f_u.fu_llist);
+	qrcu_read_unlock(&sb->s_qrcu, idx);
 	return 0;
 }
 
Index: linux-2.6.25.4-rt4/fs/proc/generic.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/proc/generic.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/proc/generic.c	2008-05-29 09:46:53.000000000 -0400
@@ -742,6 +742,8 @@ void remove_proc_entry(const char *name,
 		goto out;
 	len = strlen(fn);
 
+	percpu_list_fold(&proc_mnt->mnt_sb->s_files);
+
 	spin_lock(&proc_subdir_lock);
 	for (p = &parent->subdir; *p; p=&(*p)->next ) {
 		if (!proc_match(len, fn, *p))
Index: linux-2.6.25.4-rt4/fs/super.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/super.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/super.c	2008-05-29 09:46:53.000000000 -0400
@@ -64,7 +64,8 @@ static struct super_block *alloc_super(s
 		INIT_LIST_HEAD(&s->s_dirty);
 		INIT_LIST_HEAD(&s->s_io);
 		INIT_LIST_HEAD(&s->s_more_io);
-		INIT_LIST_HEAD(&s->s_files);
+		percpu_list_init(&s->s_files);
+		init_qrcu_struct(&s->s_qrcu);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -103,6 +104,7 @@ out:
  */
 static inline void destroy_super(struct super_block *s)
 {
+	percpu_list_destroy(&s->s_files);
 	security_sb_free(s);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
@@ -566,13 +568,15 @@ out:
 static void mark_files_ro(struct super_block *sb)
 {
 	struct file *f;
+	int idx;
 
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+	idx = qrcu_read_lock(&sb->s_qrcu);
+	percpu_list_fold(&sb->s_files);
+	lock_list_for_each_entry(f, percpu_list_head(&sb->s_files), f_u.fu_llist) {
 		if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
 			f->f_mode &= ~FMODE_WRITE;
 	}
-	file_list_unlock();
+	qrcu_read_unlock(&sb->s_qrcu, idx);
 }
 
 /**
Index: linux-2.6.25.4-rt4/include/linux/fs.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/fs.h	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/fs.h	2008-05-29 09:47:08.000000000 -0400
@@ -281,12 +281,14 @@ extern int dir_notify_enable;
 #include <linux/cache.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
+#include <linux/percpu_list.h>
 #include <linux/radix-tree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
 #include <linux/pid.h>
 #include <linux/mutex.h>
 #include <linux/capability.h>
+#include <linux/srcu.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -499,13 +501,13 @@ struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
-	rwlock_t		tree_lock;	/* and rwlock protecting it */
+	spinlock_t		priv_lock;	/* spinlock protecting various stuffs */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	spinlock_t		i_mmap_lock;	/* protect tree, count, list */
 	unsigned int		truncate_count;	/* Cover race condition with truncate */
-	unsigned long		nrpages;	/* number of total pages */
+	atomic_long_t		__nrpages;	/* number of total pages */
 	pgoff_t			writeback_index;/* writeback starts here */
 	const struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */
@@ -520,6 +522,26 @@ struct address_space {
 	 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
 	 */
 
+static inline void mapping_nrpages_init(struct address_space *mapping)
+{
+	mapping->__nrpages = (atomic_long_t)ATOMIC_LONG_INIT(0);
+}
+
+static inline unsigned long mapping_nrpages(struct address_space *mapping)
+{
+	return (unsigned long)atomic_long_read(&mapping->__nrpages);
+}
+
+static inline void mapping_nrpages_inc(struct address_space *mapping)
+{
+	atomic_long_inc(&mapping->__nrpages);
+}
+
+static inline void mapping_nrpages_dec(struct address_space *mapping)
+{
+	atomic_long_dec(&mapping->__nrpages);
+}
+
 struct block_device {
 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
 	struct inode *		bd_inode;	/* will die */
@@ -615,7 +637,7 @@ struct inode {
 	umode_t			i_mode;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct mutex		i_mutex;
-	struct rw_semaphore	i_alloc_sem;
+	struct compat_rw_semaphore	i_alloc_sem;
 	const struct inode_operations	*i_op;
 	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct super_block	*i_sb;
@@ -777,12 +799,8 @@ static inline int ra_has_index(struct fi
 }
 
 struct file {
-	/*
-	 * fu_list becomes invalid after file_free is called and queued via
-	 * fu_rcuhead for RCU freeing
-	 */
-	union {
-		struct list_head	fu_list;
+	struct {
+		struct lock_list_head	fu_llist;
 		struct rcu_head 	fu_rcuhead;
 	} f_u;
 	struct path		f_path;
@@ -811,9 +829,6 @@ struct file {
 #endif /* #ifdef CONFIG_EPOLL */
 	struct address_space	*f_mapping;
 };
-extern spinlock_t files_lock;
-#define file_list_lock() spin_lock(&files_lock);
-#define file_list_unlock() spin_unlock(&files_lock);
 
 #define get_file(x)	atomic_inc(&(x)->f_count)
 #define file_count(x)	atomic_read(&(x)->f_count)
@@ -1009,7 +1024,8 @@ struct super_block {
 	struct list_head	s_io;		/* parked for writeback */
 	struct list_head	s_more_io;	/* parked for more writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
-	struct list_head	s_files;
+	struct percpu_list	s_files;
+	struct qrcu_struct	s_qrcu;
 
 	struct block_device	*s_bdev;
 	struct mtd_info		*s_mtd;
@@ -1787,7 +1803,7 @@ static inline void insert_inode_hash(str
 }
 
 extern struct file * get_empty_filp(void);
-extern void file_move(struct file *f, struct list_head *list);
+extern void file_move(struct file *f, struct percpu_list *list);
 extern void file_kill(struct file *f);
 #ifdef CONFIG_BLOCK
 struct bio;
Index: linux-2.6.25.4-rt4/include/linux/tty.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/tty.h	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/tty.h	2008-05-29 09:46:53.000000000 -0400
@@ -204,7 +204,7 @@ struct tty_struct {
 	struct work_struct hangup_work;
 	void *disc_data;
 	void *driver_data;
-	struct list_head tty_files;
+	struct percpu_list tty_files;
 
 #define N_TTY_BUF_SIZE 4096
 	
Index: linux-2.6.25.4-rt4/security/selinux/hooks.c
===================================================================
--- linux-2.6.25.4-rt4.orig/security/selinux/hooks.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/security/selinux/hooks.c	2008-05-29 09:46:53.000000000 -0400
@@ -2027,8 +2027,11 @@ static inline void flush_unauthorized_fi
 	mutex_lock(&tty_mutex);
 	tty = get_current_tty();
 	if (tty) {
-		file_list_lock();
-		file = list_entry(tty->tty_files.next, typeof(*file), f_u.fu_list);
+		lock_list_for_each_entry(file,
+				percpu_list_head(&tty->tty_files),
+				f_u.fu_llist)
+			break;
+
 		if (file) {
 			/* Revalidate access to controlling tty.
 			   Use inode_has_perm on the tty inode directly rather
@@ -2040,8 +2043,8 @@ static inline void flush_unauthorized_fi
 					   FILE__READ | FILE__WRITE, NULL)) {
 				drop_tty = 1;
 			}
+			lock_list_for_each_entry_stop(file, f_u.fu_llist);
 		}
-		file_list_unlock();
 	}
 	mutex_unlock(&tty_mutex);
 	/* Reset controlling tty. */
Index: linux-2.6.25.4-rt4/mm/readahead.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/readahead.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/readahead.c	2008-05-29 09:46:54.000000000 -0400
@@ -376,9 +376,9 @@ ondemand_readahead(struct address_space 
 	if (hit_readahead_marker) {
 		pgoff_t start;
 
-		read_lock_irq(&mapping->tree_lock);
+		rcu_read_lock();
 		start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
-		read_unlock_irq(&mapping->tree_lock);
+		rcu_read_unlock();
 
 		if (!start || start - offset > max)
 			return 0;
Index: linux-2.6.25.4-rt4/include/linux/page-flags.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/page-flags.h	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/page-flags.h	2008-05-29 09:46:56.000000000 -0400
@@ -83,6 +83,8 @@
 #define PG_private		11	/* If pagecache, has fs-private data */
 
 #define PG_writeback		12	/* Page is under writeback */
+#define PG_nonewrefs		13	/* Block concurrent pagecache lookups
+					 * while testing refcount */
 #define PG_compound		14	/* Part of a compound page */
 #define PG_swapcache		15	/* Swap page: swp_entry_t in private */
 
@@ -296,6 +298,11 @@ static inline void __ClearPageTail(struc
 #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
 #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
 
+#define PageNoNewRefs(page)	test_bit(PG_nonewrefs, &(page)->flags)
+#define SetPageNoNewRefs(page)	set_bit(PG_nonewrefs, &(page)->flags)
+#define ClearPageNoNewRefs(page) clear_bit(PG_nonewrefs, &(page)->flags)
+#define __ClearPageNoNewRefs(page) __clear_bit(PG_nonewrefs, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 extern void cancel_dirty_page(struct page *page, unsigned int account_size);
Index: linux-2.6.25.4-rt4/include/linux/pagemap.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/pagemap.h	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/pagemap.h	2008-05-29 09:46:57.000000000 -0400
@@ -12,6 +12,12 @@
 #include <asm/uaccess.h>
 #include <linux/gfp.h>
 #include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/hardirq.h> /* for in_interrupt() */
+#include <linux/bit_spinlock.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/interrupt.h>
 
 /*
  * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
@@ -62,6 +68,198 @@ static inline void mapping_set_gfp_mask(
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+/*
+ * In order to wait for pages to become available there must be
+ * waitqueues associated with pages. By using a hash table of
+ * waitqueues where the bucket discipline is to maintain all
+ * waiters on the same queue and wake all when any of the pages
+ * become available, and for the woken contexts to check to be
+ * sure the appropriate page became available, this saves space
+ * at a cost of "thundering herd" phenomena during rare hash
+ * collisions.
+ */
+static inline wait_queue_head_t *page_waitqueue(struct page *page)
+{
+	const struct zone *zone = page_zone(page);
+
+	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+}
+
+extern int __sleep_on_page(void *);
+
+#ifndef CONFIG_PREEMPT_RT
+static inline void lock_page_ref(struct page *page)
+{
+	bit_spin_lock(PG_nonewrefs, &page->flags);
+	smp_wmb();
+}
+
+static inline void unlock_page_ref(struct page *page)
+{
+	bit_spin_unlock(PG_nonewrefs, &page->flags);
+}
+
+static inline void wait_on_page_ref(struct page *page)
+{
+	while (unlikely(test_bit(PG_nonewrefs, &page->flags)))
+		cpu_relax();
+}
+#else // CONFIG_PREEMPT_RT
+static inline void wait_on_page_ref(struct page *page)
+{
+	might_sleep();
+	if (unlikely(PageNoNewRefs(page))) {
+		DEFINE_WAIT_BIT(wait, &page->flags, PG_nonewrefs);
+		__wait_on_bit(page_waitqueue(page), &wait, __sleep_on_page,
+				TASK_UNINTERRUPTIBLE);
+	}
+}
+
+static inline void lock_page_ref(struct page *page)
+{
+	while (test_and_set_bit(PG_nonewrefs, &page->flags))
+		wait_on_page_ref(page);
+	__acquire(bitlock);
+	smp_wmb();
+}
+
+static inline void unlock_page_ref(struct page *page)
+{
+	VM_BUG_ON(!PageNoNewRefs(page));
+	smp_mb__before_clear_bit();
+	ClearPageNoNewRefs(page);
+	smp_mb__after_clear_bit();
+	__wake_up_bit(page_waitqueue(page), &page->flags, PG_nonewrefs);
+	__release(bitlock);
+}
+#endif // CONFIG_PREEMPT_RT
+
+#define lock_page_ref_irq(page)					\
+	do {							\
+		local_irq_disable_nort();			\
+		lock_page_ref(page);				\
+	} while (0)
+
+#define unlock_page_ref_irq(page)				\
+	do {							\
+		unlock_page_ref(page);				\
+		local_irq_enable_nort();			\
+	} while (0)
+
+#define lock_page_ref_irqsave(page, flags)			\
+	do {							\
+		local_irq_save_nort(flags);			\
+		lock_page_ref(page);				\
+	} while (0)
+
+#define unlock_page_ref_irqrestore(page, flags)			\
+	do {							\
+		unlock_page_ref(page);				\
+		local_irq_restore_nort(flags);			\
+	} while (0)
+
+/*
+ * speculatively take a reference to a page.
+ * If the page is free (_count == 0), then _count is untouched, and 0
+ * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ *
+ * This function must be run in the same rcu_read_lock() section as has
+ * been used to lookup the page in the pagecache radix-tree: this allows
+ * allocators to use a synchronize_rcu() to stabilize _count.
+ *
+ * Unless an RCU grace period has passed, the count of all pages coming out
+ * of the allocator must be considered unstable. page_count may return higher
+ * than expected, and put_page must be able to do the right thing when the
+ * page has been finished with (because put_page is what is used to drop an
+ * invalid speculative reference).
+ *
+ * After incrementing the refcount, this function spins until PageNoNewRefs
+ * is clear, then a read memory barrier is issued.
+ *
+ * This forms the core of the lockless pagecache locking protocol, where
+ * the lookup-side (eg. find_get_page) has the following pattern:
+ * 1. find page in radix tree
+ * 2. conditionally increment refcount
+ * 3. wait for PageNoNewRefs
+ * 4. check the page is still in pagecache
+ *
+ * Remove-side (that cares about _count, eg. reclaim) has the following:
+ * A. SetPageNoNewRefs
+ * B. check refcount is correct
+ * C. remove page
+ * D. ClearPageNoNewRefs
+ *
+ * There are 2 critical interleavings that matter:
+ * - 2 runs before B: in this case, B sees elevated refcount and bails out
+ * - B runs before 2: in this case, 3 ensures 4 will not run until *after* C
+ *   (after D, even). In which case, 4 will notice C and lookup side can retry
+ *
+ * It is possible that between 1 and 2, the page is removed then the exact same
+ * page is inserted into the same position in pagecache. That's OK: the
+ * old find_get_page using tree_lock could equally have run before or after
+ * the write-side, depending on timing.
+ *
+ * Pagecache insertion isn't a big problem: either 1 will find the page or
+ * it will not. Likewise, the old find_get_page could run either before the
+ * insertion or afterwards, depending on timing.
+ */
+static inline int page_cache_get_speculative(struct page *page)
+{
+	VM_BUG_ON(in_interrupt());
+
+#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)
+# ifdef CONFIG_PREEMPT
+	VM_BUG_ON(!in_atomic());
+# endif
+	/*
+	 * Preempt must be disabled here - we rely on rcu_read_lock doing
+	 * this for us.
+	 *
+	 * Pagecache won't be truncated from interrupt context, so if we have
+	 * found a page in the radix tree here, we have pinned its refcount by
+	 * disabling preempt, and hence no need for the "speculative get" that
+	 * SMP requires.
+	 */
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_inc(&page->_count);
+
+#else
+	if (unlikely(!get_page_unless_zero(page)))
+		return 0; /* page has been freed */
+
+	/*
+	 * Note that get_page_unless_zero provides a memory barrier.
+	 * This is needed to ensure PageNoNewRefs is evaluated after the
+	 * page refcount has been raised. See below comment.
+	 */
+
+	wait_on_page_ref(page);
+
+	/*
+	 * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs())
+	 * is performed before a future load used to ensure the page is
+	 * the correct on (usually: page->mapping and page->index).
+	 *
+	 * Those places that set PageNoNewRefs have the following pattern:
+	 * 	SetPageNoNewRefs(page)
+	 * 	wmb();
+	 * 	if (page_count(page) == X)
+	 * 		remove page from pagecache
+	 * 	wmb();
+	 * 	ClearPageNoNewRefs(page)
+	 *
+	 * If the load was out of order, page->mapping might be loaded before
+	 * the page is removed from pagecache but PageNoNewRefs evaluated
+	 * after the ClearPageNoNewRefs().
+	 */
+	smp_rmb();
+
+#endif
+	VM_BUG_ON(PageCompound(page) && (struct page *)page_private(page) != page);
+
+	return 1;
+}
+
 #ifdef CONFIG_NUMA
 extern struct page *__page_cache_alloc(gfp_t gfp);
 #else
Index: linux-2.6.25.4-rt4/mm/filemap.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/filemap.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/filemap.c	2008-05-29 09:47:12.000000000 -0400
@@ -112,16 +112,21 @@ generic_file_direct_IO(int rw, struct ki
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+	DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);
 
 	mem_cgroup_uncharge_page(page);
-	radix_tree_delete(&mapping->page_tree, page->index);
+
+	radix_tree_lock(&ctx);
+	radix_tree_delete(ctx.tree, page->index);
+	radix_tree_unlock(&ctx);
+
 	page->mapping = NULL;
-	mapping->nrpages--;
+	mapping_nrpages_dec(mapping);
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
 
@@ -140,13 +145,11 @@ void __remove_from_page_cache(struct pag
 
 void remove_from_page_cache(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-
 	BUG_ON(!PageLocked(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	lock_page_ref_irq(page);
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	unlock_page_ref_irq(page);
 }
 
 static int sync_page(void *word)
@@ -212,7 +215,7 @@ int __filemap_fdatawrite_range(struct ad
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
+		.nr_to_write = mapping_nrpages(mapping) * 2,
 		.range_start = start,
 		.range_end = end,
 	};
@@ -394,7 +397,7 @@ int filemap_write_and_wait(struct addres
 {
 	int err = 0;
 
-	if (mapping->nrpages) {
+	if (mapping_nrpages(mapping)) {
 		err = filemap_fdatawrite(mapping);
 		/*
 		 * Even if the above returned error, the pages may be
@@ -428,7 +431,7 @@ int filemap_write_and_wait_range(struct 
 {
 	int err = 0;
 
-	if (mapping->nrpages) {
+	if (mapping_nrpages(mapping)) {
 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
 						 WB_SYNC_ALL);
 		/* See comment of filemap_write_and_wait() */
@@ -466,19 +469,23 @@ int add_to_page_cache(struct page *page,
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
-		write_lock_irq(&mapping->tree_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
+		DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);
+
+		lock_page_ref_irq(page);
+		radix_tree_lock(&ctx);
+		error = radix_tree_insert(ctx.tree, offset, page);
+		radix_tree_unlock(&ctx);
 		if (!error) {
 			page_cache_get(page);
 			SetPageLocked(page);
 			page->mapping = mapping;
 			page->index = offset;
-			mapping->nrpages++;
+			mapping_nrpages_inc(mapping);
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 		} else
 			mem_cgroup_uncharge_page(page);
 
-		write_unlock_irq(&mapping->tree_lock);
+		unlock_page_ref_irq(page);
 		radix_tree_preload_end();
 	} else
 		mem_cgroup_uncharge_page(page);
@@ -514,21 +521,10 @@ static int __sleep_on_page_lock(void *wo
 	return 0;
 }
 
-/*
- * In order to wait for pages to become available there must be
- * waitqueues associated with pages. By using a hash table of
- * waitqueues where the bucket discipline is to maintain all
- * waiters on the same queue and wake all when any of the pages
- * become available, and for the woken contexts to check to be
- * sure the appropriate page became available, this saves space
- * at a cost of "thundering herd" phenomena during rare hash
- * collisions.
- */
-static wait_queue_head_t *page_waitqueue(struct page *page)
+int __sleep_on_page(void *word)
 {
-	const struct zone *zone = page_zone(page);
-
-	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+	schedule();
+	return 0;
 }
 
 static inline void wake_up_page(struct page *page, int bit)
@@ -635,13 +631,33 @@ void __lock_page_nosync(struct page *pag
  */
 struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
 {
+	void **pagep;
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
-	read_unlock_irq(&mapping->tree_lock);
+	rcu_read_lock();
+repeat:
+	page = NULL;
+	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+	if (pagep) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page || page == RADIX_TREE_RETRY))
+			goto repeat;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_get_page);
@@ -662,26 +678,16 @@ struct page *find_lock_page(struct addre
 	struct page *page;
 
 repeat:
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = find_get_page(mapping, offset);
 	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			__lock_page(page);
-
-			/* Has the page been truncated while we slept? */
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
-			VM_BUG_ON(page->index != offset);
-			goto out;
+		lock_page(page);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
 		}
 	}
-	read_unlock_irq(&mapping->tree_lock);
-out:
 	return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -747,13 +753,39 @@ unsigned find_get_pages(struct address_s
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, start, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	read_unlock_irq(&mapping->tree_lock);
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -774,19 +806,44 @@ unsigned find_get_pages_contig(struct ad
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, index, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, index, nr_pages);
-	for (i = 0; i < ret; i++) {
-		if (pages[i]->mapping == NULL || pages[i]->index != index)
+		if (page->mapping == NULL || page->index != index)
 			break;
 
-		page_cache_get(pages[i]);
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
 		index++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
-	return i;
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_contig);
 
@@ -806,15 +863,43 @@ unsigned find_get_pages_tag(struct addre
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+				(void ***)pages, *index, nr_pages, tag);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
 	if (ret)
 		*index = pages[ret - 1]->index + 1;
-	read_unlock_irq(&mapping->tree_lock);
+
 	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_tag);
@@ -1711,7 +1796,9 @@ size_t iov_iter_copy_from_user_atomic(st
 	char *kaddr;
 	size_t copied;
 
-	BUG_ON(!in_atomic());
+#ifndef CONFIG_PREEMPT_RT
+	BUG_ON(!current->pagefault_disabled);
+#endif
 	kaddr = kmap_atomic(page, KM_USER0);
 	if (likely(i->nr_segs == 1)) {
 		int left;
@@ -2539,7 +2626,7 @@ generic_file_direct_IO(int rw, struct ki
 	 * about to write.  We do this *before* the write so that we can return
 	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
 	 */
-	if (rw == WRITE && mapping->nrpages) {
+	if (rw == WRITE && mapping_nrpages(mapping)) {
 		retval = invalidate_inode_pages2_range(mapping,
 					offset >> PAGE_CACHE_SHIFT, end);
 		if (retval)
@@ -2556,7 +2643,7 @@ generic_file_direct_IO(int rw, struct ki
 	 * so we don't support it 100%.  If this invalidation
 	 * fails, tough, the write still worked...
 	 */
-	if (rw == WRITE && mapping->nrpages) {
+	if (rw == WRITE && mapping_nrpages(mapping)) {
 		invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
 	}
 out:
Index: linux-2.6.25.4-rt4/mm/migrate.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/migrate.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/migrate.c	2008-05-29 09:46:56.000000000 -0400
@@ -305,6 +305,7 @@ static int migrate_page_move_mapping(str
 		struct page *newpage, struct page *page)
 {
 	void **pslot;
+	struct radix_tree_context ctx;
 
 	if (!mapping) {
 		/* Anonymous page without mapping */
@@ -313,14 +314,15 @@ static int migrate_page_move_mapping(str
 		return 0;
 	}
 
-	write_lock_irq(&mapping->tree_lock);
-
-	pslot = radix_tree_lookup_slot(&mapping->page_tree,
- 					page_index(page));
+	init_radix_tree_context(&ctx, &mapping->page_tree);
+	lock_page_ref_irq(page);
+	radix_tree_lock(&ctx);
+	pslot = radix_tree_lookup_slot(ctx.tree, page_index(page));
 
 	if (page_count(page) != 2 + !!PagePrivate(page) ||
 			(struct page *)radix_tree_deref_slot(pslot) != page) {
-		write_unlock_irq(&mapping->tree_lock);
+		radix_tree_unlock(&ctx);
+		unlock_page_ref_irq(page);
 		return -EAGAIN;
 	}
 
@@ -336,12 +338,8 @@ static int migrate_page_move_mapping(str
 #endif
 
 	radix_tree_replace_slot(pslot, newpage);
-
-	/*
-	 * Drop cache reference from old page.
-	 * We know this isn't the last reference.
-	 */
-	__put_page(page);
+	page->mapping = NULL;
+	radix_tree_unlock(&ctx);
 
 	/*
 	 * If moved to a different zone then also account
@@ -356,7 +354,13 @@ static int migrate_page_move_mapping(str
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
 
-	write_unlock_irq(&mapping->tree_lock);
+	unlock_page_ref_irq(page);
+
+	/*
+	 * Drop cache reference from old page.
+	 * We know this isn't the last reference.
+	 */
+	__put_page(page);
 
 	return 0;
 }
Index: linux-2.6.25.4-rt4/mm/swap_state.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/swap_state.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/swap_state.c	2008-05-29 09:46:56.000000000 -0400
@@ -39,7 +39,6 @@ static struct backing_dev_info swap_back
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-	.tree_lock	= __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,
@@ -76,18 +75,21 @@ int add_to_swap_cache(struct page *page,
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
-		write_lock_irq(&swapper_space.tree_lock);
-		error = radix_tree_insert(&swapper_space.page_tree,
-						entry.val, page);
+		DEFINE_RADIX_TREE_CONTEXT(ctx, &swapper_space.page_tree);
+
+		lock_page_ref_irq(page);
+		radix_tree_lock(&ctx);
+		error = radix_tree_insert(ctx.tree, entry.val, page);
+		radix_tree_unlock(&ctx);
 		if (!error) {
 			page_cache_get(page);
 			SetPageSwapCache(page);
 			set_page_private(page, entry.val);
-			total_swapcache_pages++;
+			mapping_nrpages_inc(&swapper_space);
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 			INC_CACHE_INFO(add_total);
 		}
-		write_unlock_irq(&swapper_space.tree_lock);
+		unlock_page_ref_irq(page);
 		radix_tree_preload_end();
 	}
 	return error;
@@ -99,15 +101,19 @@ int add_to_swap_cache(struct page *page,
  */
 void __delete_from_swap_cache(struct page *page)
 {
+	DEFINE_RADIX_TREE_CONTEXT(ctx, &swapper_space.page_tree);
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!PageSwapCache(page));
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	radix_tree_delete(&swapper_space.page_tree, page_private(page));
+	radix_tree_lock(&ctx);
+	radix_tree_delete(ctx.tree, page_private(page));
+	radix_tree_unlock(&ctx);
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
-	total_swapcache_pages--;
+	mapping_nrpages_dec(&swapper_space);
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	INC_CACHE_INFO(del_total);
 }
@@ -175,9 +181,9 @@ void delete_from_swap_cache(struct page 
 
 	entry.val = page_private(page);
 
-	write_lock_irq(&swapper_space.tree_lock);
+	lock_page_ref_irq(page);
 	__delete_from_swap_cache(page);
-	write_unlock_irq(&swapper_space.tree_lock);
+	unlock_page_ref_irq(page);
 
 	swap_free(entry);
 	page_cache_release(page);
Index: linux-2.6.25.4-rt4/fs/inode.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/inode.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/inode.c	2008-05-29 09:46:56.000000000 -0400
@@ -209,7 +209,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	rwlock_init(&inode->i_data.tree_lock);
+	spin_lock_init(&inode->i_data.priv_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
@@ -259,7 +259,7 @@ void clear_inode(struct inode *inode)
 	might_sleep();
 	invalidate_inode_buffers(inode);
        
-	BUG_ON(inode->i_data.nrpages);
+	BUG_ON(mapping_nrpages(&inode->i_data));
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
@@ -292,7 +292,7 @@ static void dispose_list(struct list_hea
 		inode = list_first_entry(head, struct inode, i_list);
 		list_del(&inode->i_list);
 
-		if (inode->i_data.nrpages)
+		if (mapping_nrpages(&inode->i_data))
 			truncate_inode_pages(&inode->i_data, 0);
 		clear_inode(inode);
 
@@ -384,7 +384,7 @@ static int can_unuse(struct inode *inode
 		return 0;
 	if (atomic_read(&inode->i_count))
 		return 0;
-	if (inode->i_data.nrpages)
+	if (mapping_nrpages(&inode->i_data))
 		return 0;
 	return 1;
 }
@@ -423,7 +423,7 @@ static void prune_icache(int nr_to_scan)
 			list_move(&inode->i_list, &inode_unused);
 			continue;
 		}
-		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+		if (inode_has_buffers(inode) || mapping_nrpages(&inode->i_data)) {
 			__iget(inode);
 			spin_unlock(&inode_lock);
 			if (remove_inode_buffers(inode))
@@ -1096,7 +1096,7 @@ static void generic_forget_inode(struct 
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
-	if (inode->i_data.nrpages)
+	if (mapping_nrpages(&inode->i_data))
 		truncate_inode_pages(&inode->i_data, 0);
 	clear_inode(inode);
 	wake_up_inode(inode);
Index: linux-2.6.25.4-rt4/include/asm-arm/cacheflush.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-arm/cacheflush.h	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-arm/cacheflush.h	2008-05-29 09:46:56.000000000 -0400
@@ -421,9 +421,9 @@ static inline void flush_anon_page(struc
 }
 
 #define flush_dcache_mmap_lock(mapping) \
-	write_lock_irq(&(mapping)->tree_lock)
+	spin_lock_irq(&(mapping)->priv_lock)
 #define flush_dcache_mmap_unlock(mapping) \
-	write_unlock_irq(&(mapping)->tree_lock)
+	spin_unlock_irq(&(mapping)->priv_lock)
 
 #define flush_icache_user_range(vma,page,addr,len) \
 	flush_dcache_page(page)
Index: linux-2.6.25.4-rt4/include/asm-parisc/cacheflush.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-parisc/cacheflush.h	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-parisc/cacheflush.h	2008-05-29 09:46:56.000000000 -0400
@@ -45,9 +45,9 @@ void flush_cache_mm(struct mm_struct *mm
 extern void flush_dcache_page(struct page *page);
 
 #define flush_dcache_mmap_lock(mapping) \
-	write_lock_irq(&(mapping)->tree_lock)
+	spin_lock_irq(&(mapping)->priv_lock)
 #define flush_dcache_mmap_unlock(mapping) \
-	write_unlock_irq(&(mapping)->tree_lock)
+	spin_unlock_irq(&(mapping)->priv_lock)
 
 #define flush_icache_page(vma,page)	do { 		\
 	flush_kernel_dcache_page(page);			\
Index: linux-2.6.25.4-rt4/mm/swapfile.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/swapfile.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/swapfile.c	2008-05-29 09:47:23.000000000 -0400
@@ -367,16 +367,17 @@ int remove_exclusive_swap_page(struct pa
 	/* Is the only swap cache user the cache itself? */
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
+		spin_unlock(&swap_lock);
 		/* Recheck the page count with the swapcache lock held.. */
-		write_lock_irq(&swapper_space.tree_lock);
+		lock_page_ref_irq(page);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		write_unlock_irq(&swapper_space.tree_lock);
-	}
-	spin_unlock(&swap_lock);
+		unlock_page_ref_irq(page);
+	} else
+		spin_unlock(&swap_lock);
 
 	if (retval) {
 		swap_free(entry);
@@ -401,13 +402,14 @@ void free_swap_and_cache(swp_entry_t ent
 	p = swap_info_get(entry);
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+			spin_unlock(&swap_lock);
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && unlikely(TestSetPageLocked(page))) {
 				page_cache_release(page);
 				page = NULL;
 			}
-		}
-		spin_unlock(&swap_lock);
+		} else
+			spin_unlock(&swap_lock);
 	}
 	if (page) {
 		int one_user;
Index: linux-2.6.25.4-rt4/mm/truncate.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/truncate.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/truncate.c	2008-05-29 09:46:56.000000000 -0400
@@ -166,7 +166,7 @@ void truncate_inode_pages_range(struct a
 	pgoff_t next;
 	int i;
 
-	if (mapping->nrpages == 0)
+	if (mapping_nrpages(mapping) == 0)
 		return;
 
 	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
@@ -349,18 +349,18 @@ invalidate_complete_page2(struct address
 	if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
-	write_lock_irq(&mapping->tree_lock);
+	lock_page_ref_irq(page);
 	if (PageDirty(page))
 		goto failed;
 
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	unlock_page_ref_irq(page);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
 failed:
-	write_unlock_irq(&mapping->tree_lock);
+	unlock_page_ref_irq(page);
 	return 0;
 }
 
Index: linux-2.6.25.4-rt4/init/Kconfig
===================================================================
--- linux-2.6.25.4-rt4.orig/init/Kconfig	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/init/Kconfig	2008-05-29 09:47:21.000000000 -0400
@@ -311,6 +311,12 @@ config CPUSETS
 
 	  Say N if unsure.
 
+#
+# Architectures with an unreliable sched_clock() should select this:
+#
+config HAVE_UNSTABLE_SCHED_CLOCK
+        bool
+
 config GROUP_SCHED
 	bool "Group CPU scheduler"
 	default y
@@ -504,6 +510,16 @@ config CC_OPTIMIZE_FOR_SIZE
 config SYSCTL
 	bool
 
+config RADIX_TREE_CONCURRENT
+	bool "Enable concurrent radix tree operations (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	default y if SMP
+
+config RADIX_TREE_OPTIMISTIC
+	bool "Enabled optimistic locking (EXPERIMENTAL)"
+	depends on RADIX_TREE_CONCURRENT
+	default y
+
 menuconfig EMBEDDED
 	bool "Configure standard kernel features (for small systems)"
 	help
@@ -717,6 +733,7 @@ config SLAB
 
 config SLUB
 	bool "SLUB (Unqueued Allocator)"
+	depends on !PREEMPT_RT
 	help
 	   SLUB is a slab allocator that minimizes cache line usage
 	   instead of managing queues of cached objects (SLAB approach).
Index: linux-2.6.25.4-rt4/fs/gfs2/glock.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/gfs2/glock.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/gfs2/glock.c	2008-05-29 09:46:55.000000000 -0400
@@ -1904,7 +1904,7 @@ static int dump_glock(struct glock_iter 
 		   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
 	if (gl->gl_aspace)
 		print_dbg(gi, "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
-			   gl->gl_aspace->i_mapping->nrpages);
+		       mapping_nrpages(gl->gl_aspace->i_mapping));
 	else
 		print_dbg(gi, "  aspace = no\n");
 	print_dbg(gi, "  ail = %d\n", atomic_read(&gl->gl_ail_count));
Index: linux-2.6.25.4-rt4/fs/gfs2/glops.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/gfs2/glops.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/gfs2/glops.c	2008-05-29 09:46:55.000000000 -0400
@@ -219,7 +219,7 @@ static int inode_go_demote_ok(struct gfs
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	int demote = 0;
 
-	if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
+	if (!gl->gl_object && !mapping_nrpages(gl->gl_aspace->i_mapping))
 		demote = 1;
 	else if (!sdp->sd_args.ar_localcaching &&
 		 time_after_eq(jiffies, gl->gl_stamp +
@@ -269,7 +269,7 @@ static int inode_go_lock(struct gfs2_hol
 
 static int rgrp_go_demote_ok(struct gfs2_glock *gl)
 {
-	return !gl->gl_aspace->i_mapping->nrpages;
+	return !mapping_nrpages(gl->gl_aspace->i_mapping);
 }
 
 /**
Index: linux-2.6.25.4-rt4/fs/gfs2/meta_io.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/gfs2/meta_io.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/gfs2/meta_io.c	2008-05-29 09:46:55.000000000 -0400
@@ -105,7 +105,7 @@ void gfs2_meta_inval(struct gfs2_glock *
 	truncate_inode_pages(mapping, 0);
 	atomic_dec(&aspace->i_writecount);
 
-	gfs2_assert_withdraw(sdp, !mapping->nrpages);
+	gfs2_assert_withdraw(sdp, !mapping_nrpages(mapping));
 }
 
 /**
Index: linux-2.6.25.4-rt4/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/hugetlbfs/inode.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/hugetlbfs/inode.c	2008-05-29 09:46:55.000000000 -0400
@@ -368,7 +368,7 @@ static void truncate_hugepages(struct in
 		}
 		huge_pagevec_release(&pvec);
 	}
-	BUG_ON(!lstart && mapping->nrpages);
+	BUG_ON(!lstart && mapping_nrpages(mapping));
 	hugetlb_unreserve_pages(inode, start, freed);
 }
 
Index: linux-2.6.25.4-rt4/fs/jffs2/dir.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/jffs2/dir.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/jffs2/dir.c	2008-05-29 09:46:55.000000000 -0400
@@ -203,7 +203,7 @@ static int jffs2_create(struct inode *di
 	inode->i_op = &jffs2_file_inode_operations;
 	inode->i_fop = &jffs2_file_operations;
 	inode->i_mapping->a_ops = &jffs2_file_address_operations;
-	inode->i_mapping->nrpages = 0;
+	mapping_nrpages_init(inode->i_mapping);
 
 	f = JFFS2_INODE_INFO(inode);
 	dir_f = JFFS2_INODE_INFO(dir_i);
@@ -219,7 +219,7 @@ static int jffs2_create(struct inode *di
 	d_instantiate(dentry, inode);
 
 	D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
-		  inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->nlink, inode->i_mapping->nrpages));
+		  inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->nlink, mapping_nrpages(inode->i_mapping)));
 	return 0;
 
  fail:
Index: linux-2.6.25.4-rt4/fs/jffs2/fs.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/jffs2/fs.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/jffs2/fs.c	2008-05-29 09:46:55.000000000 -0400
@@ -297,7 +297,7 @@ struct inode *jffs2_iget(struct super_bl
 		inode->i_op = &jffs2_file_inode_operations;
 		inode->i_fop = &jffs2_file_operations;
 		inode->i_mapping->a_ops = &jffs2_file_address_operations;
-		inode->i_mapping->nrpages = 0;
+		mapping_nrpages_init(inode->i_mapping);
 		break;
 
 	case S_IFBLK:
Index: linux-2.6.25.4-rt4/fs/libfs.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/libfs.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/libfs.c	2008-05-29 09:46:55.000000000 -0400
@@ -17,7 +17,7 @@ int simple_getattr(struct vfsmount *mnt,
 {
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
-	stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+	stat->blocks = mapping_nrpages(inode->i_mapping) << (PAGE_CACHE_SHIFT - 9);
 	return 0;
 }
 
Index: linux-2.6.25.4-rt4/fs/nfs/inode.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/nfs/inode.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/nfs/inode.c	2008-05-29 09:46:55.000000000 -0400
@@ -120,7 +120,7 @@ int nfs_sync_mapping(struct address_spac
 {
 	int ret;
 
-	if (mapping->nrpages == 0)
+	if (mapping_nrpages(mapping) == 0)
 		return 0;
 	unmap_mapping_range(mapping, 0, 0, 0);
 	ret = filemap_write_and_wait(mapping);
@@ -160,7 +160,7 @@ void nfs_zap_caches(struct inode *inode)
 
 void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
 {
-	if (mapping->nrpages != 0) {
+	if (mapping_nrpages(mapping) != 0) {
 		spin_lock(&inode->i_lock);
 		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
 		spin_unlock(&inode->i_lock);
@@ -725,7 +725,7 @@ static int nfs_invalidate_mapping_nolock
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	
-	if (mapping->nrpages != 0) {
+	if (mapping_nrpages(mapping) != 0) {
 		int ret = invalidate_inode_pages2(mapping);
 		if (ret < 0)
 			return ret;
Index: linux-2.6.25.4-rt4/fs/xfs/linux-2.6/xfs_vnode.h
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/xfs/linux-2.6/xfs_vnode.h	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/xfs/linux-2.6/xfs_vnode.h	2008-05-29 09:46:55.000000000 -0400
@@ -263,7 +263,7 @@ static inline void vn_atime_to_time_t(bh
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	mapping_mapped(vn_to_inode(vp)->i_mapping)
-#define VN_CACHED(vp)	(vn_to_inode(vp)->i_mapping->nrpages)
+#define VN_CACHED(vp)	mapping_nrpages(vn_to_inode(vp)->i_mapping)
 #define VN_DIRTY(vp)	mapping_tagged(vn_to_inode(vp)->i_mapping, \
 					PAGECACHE_TAG_DIRTY)
 
Index: linux-2.6.25.4-rt4/include/linux/swap.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/swap.h	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/swap.h	2008-05-29 09:46:55.000000000 -0400
@@ -221,7 +221,7 @@ extern void end_swap_bio_read(struct bio
 
 /* linux/mm/swap_state.c */
 extern struct address_space swapper_space;
-#define total_swapcache_pages  swapper_space.nrpages
+#define total_swapcache_pages  mapping_nrpages(&swapper_space)
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *, gfp_t);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
Index: linux-2.6.25.4-rt4/ipc/shm.c
===================================================================
--- linux-2.6.25.4-rt4.orig/ipc/shm.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/ipc/shm.c	2008-05-29 09:46:55.000000000 -0400
@@ -613,11 +613,11 @@ static void shm_get_stat(struct ipc_name
 
 		if (is_file_hugepages(shp->shm_file)) {
 			struct address_space *mapping = inode->i_mapping;
-			*rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages;
+			*rss += (HPAGE_SIZE/PAGE_SIZE)*mapping_nrpages(mapping);
 		} else {
 			struct shmem_inode_info *info = SHMEM_I(inode);
 			spin_lock(&info->lock);
-			*rss += inode->i_mapping->nrpages;
+			*rss += mapping_nrpages(inode->i_mapping);
 			*swp += info->swapped;
 			spin_unlock(&info->lock);
 		}
Index: linux-2.6.25.4-rt4/mm/shmem.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/shmem.c	2008-05-29 09:45:41.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/shmem.c	2008-05-29 09:46:55.000000000 -0400
@@ -251,8 +251,8 @@ static void shmem_free_inode(struct supe
  * We have to calculate the free blocks since the mm can drop
  * undirtied hole pages behind our back.
  *
- * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
- * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ * But normally   info->alloced == mapping_nrpages(inode->i_mapping) + info->swapped
+ * So mm freed is info->alloced - (mapping_nrpages(inode->i_mapping) + info->swapped)
  *
  * It has to be called with the spinlock held.
  */
@@ -261,7 +261,7 @@ static void shmem_recalc_inode(struct in
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	long freed;
 
-	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
+	freed = info->alloced - info->swapped - mapping_nrpages(inode->i_mapping);
 	if (freed > 0) {
 		info->alloced -= freed;
 		shmem_unacct_blocks(info->flags, freed);
@@ -705,7 +705,7 @@ static void shmem_truncate_range(struct 
 done1:
 	shmem_dir_unmap(dir);
 done2:
-	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
+	if (mapping_nrpages(inode->i_mapping) && (info->flags & SHMEM_PAGEIN)) {
 		/*
 		 * Call truncate_inode_pages again: racing shmem_unuse_inode
 		 * may have swizzled a page in from swap since vmtruncate or
Index: linux-2.6.25.4-rt4/arch/mips/mm/highmem.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/mm/highmem.c	2008-05-29 09:45:40.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/mm/highmem.c	2008-05-29 09:46:57.000000000 -0400
@@ -38,7 +38,7 @@ void *__kmap_atomic(struct page *page, e
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr, enum 
 
 	if (vaddr < FIXADDR_START) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -78,6 +79,7 @@ void __kunmap_atomic(void *kvaddr, enum 
 #endif
 
 	pagefault_enable();
+	preempt_enable();
 }
 
 /*
@@ -89,6 +91,7 @@ void *kmap_atomic_pfn(unsigned long pfn,
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
+	preempt_disable();
 	pagefault_disable();
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
Index: linux-2.6.25.4-rt4/arch/sparc/mm/highmem.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/sparc/mm/highmem.c	2008-05-29 09:45:40.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/sparc/mm/highmem.c	2008-05-29 09:46:57.000000000 -0400
@@ -34,7 +34,7 @@ void *kmap_atomic(struct page *page, enu
 	unsigned long idx;
 	unsigned long vaddr;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -71,6 +71,7 @@ void kunmap_atomic(void *kvaddr, enum km
 
 	if (vaddr < FIXADDR_START) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -97,6 +98,7 @@ void kunmap_atomic(void *kvaddr, enum km
 #endif
 
 	pagefault_enable();
+	preempt_enable();
 }
 
 /* We may be fed a pagetable here by ptep_to_xxx and others. */
Index: linux-2.6.25.4-rt4/include/asm-frv/highmem.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-frv/highmem.h	2008-05-29 09:45:40.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-frv/highmem.h	2008-05-29 09:46:57.000000000 -0400
@@ -115,6 +115,7 @@ static inline void *kmap_atomic(struct p
 {
 	unsigned long paddr;
 
+	preempt_disable();
 	pagefault_disable();
 	paddr = page_to_phys(page);
 
@@ -171,6 +172,7 @@ static inline void kunmap_atomic(void *k
 		BUG();
 	}
 	pagefault_enable();
+	preempt_enable();
 }
 
 #endif /* !__ASSEMBLY__ */
Index: linux-2.6.25.4-rt4/include/asm-ppc/highmem.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-ppc/highmem.h	2008-05-29 09:45:40.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-ppc/highmem.h	2008-05-29 09:46:57.000000000 -0400
@@ -78,7 +78,7 @@ static inline void *kmap_atomic(struct p
 	unsigned int idx;
 	unsigned long vaddr;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -102,6 +102,7 @@ static inline void kunmap_atomic(void *k
 
 	if (vaddr < KMAP_FIX_BEGIN) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -115,6 +116,7 @@ static inline void kunmap_atomic(void *k
 	flush_tlb_page(NULL, vaddr);
 #endif
 	pagefault_enable();
+	preempt_enable();
 }
 
 static inline struct page *kmap_atomic_to_page(void *ptr)
Index: linux-2.6.25.4-rt4/arch/mips/mm/fault.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/mm/fault.c	2008-05-29 09:45:40.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/mm/fault.c	2008-05-29 09:46:58.000000000 -0400
@@ -69,7 +69,7 @@ asmlinkage void do_page_fault(struct pt_
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (in_atomic() || !mm || current->pagefault_disabled)
 		goto bad_area_nosemaphore;
 
 	down_read(&mm->mmap_sem);
Index: linux-2.6.25.4-rt4/arch/powerpc/mm/fault.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/mm/fault.c	2008-05-29 09:45:40.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/mm/fault.c	2008-05-29 09:46:58.000000000 -0400
@@ -182,7 +182,7 @@ int __kprobes do_page_fault(struct pt_re
 	}
 #endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
 
-	if (in_atomic() || mm == NULL) {
+	if (in_atomic() || mm == NULL || current->pagefault_disabled) {
 		if (!user_mode(regs))
 			return SIGSEGV;
 		/* in_atomic() in user mode is really bad,
Index: linux-2.6.25.4-rt4/include/linux/uaccess.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/uaccess.h	2008-05-29 09:45:40.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/uaccess.h	2008-05-29 09:46:58.000000000 -0400
@@ -6,37 +6,10 @@
 
 /*
  * These routines enable/disable the pagefault handler in that
- * it will not take any locks and go straight to the fixup table.
- *
- * They have great resemblance to the preempt_disable/enable calls
- * and in fact they are identical; this is because currently there is
- * no other way to make the pagefault handlers do this. So we do
- * disable preemption but we don't necessarily care about that.
+ * it will not take any MM locks and go straight to the fixup table.
  */
-static inline void pagefault_disable(void)
-{
-	inc_preempt_count();
-	/*
-	 * make sure to have issued the store before a pagefault
-	 * can hit.
-	 */
-	barrier();
-}
-
-static inline void pagefault_enable(void)
-{
-	/*
-	 * make sure to issue those last loads/stores before enabling
-	 * the pagefault handler again.
-	 */
-	barrier();
-	dec_preempt_count();
-	/*
-	 * make sure we do..
-	 */
-	barrier();
-	preempt_check_resched();
-}
+extern void pagefault_disable(void);
+extern void pagefault_enable(void);
 
 #ifndef ARCH_HAS_NOCACHE_UACCESS
 
Index: linux-2.6.25.4-rt4/fs/select.c
===================================================================
--- linux-2.6.25.4-rt4.orig/fs/select.c	2008-05-29 09:45:39.000000000 -0400
+++ linux-2.6.25.4-rt4/fs/select.c	2008-05-29 09:46:58.000000000 -0400
@@ -407,20 +407,12 @@ asmlinkage long sys_select(int n, fd_set
 		rtv.tv_sec = timeout;
 		if (timeval_compare(&rtv, &tv) >= 0)
 			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
+		if (copy_to_user(tvp, &rtv, sizeof(rtv)))
+			return -EFAULT;
 	}
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
 
 	return ret;
 }
Index: linux-2.6.25.4-rt4/mm/highmem.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/highmem.c	2008-05-29 09:45:39.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/highmem.c	2008-05-29 09:46:59.000000000 -0400
@@ -14,6 +14,11 @@
  * based on Linus' idea.
  *
  * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ *
+ * Largely rewritten to get rid of all global locks
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
  */
 
 #include <linux/mm.h>
@@ -27,18 +32,15 @@
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <linux/blktrace_api.h>
+#include <linux/hardirq.h>
+
 #include <asm/tlbflush.h>
+#include <asm/pgtable.h>
 
-/*
- * Virtual_count is not a pure "count".
- *  0 means that it is not mapped, and has not been mapped
- *    since a TLB flush - it is usable.
- *  1 means that there are no users, but it has been mapped
- *    since the last TLB flush - so we can't use it.
- *  n means that there are (n-1) current users of it.
- */
 #ifdef CONFIG_HIGHMEM
 
+static int __set_page_address(struct page *page, void *virtual, int pos);
+
 unsigned long totalhigh_pages __read_mostly;
 
 unsigned int nr_free_highpages (void)
@@ -58,181 +60,284 @@ unsigned int nr_free_highpages (void)
 	return pages;
 }
 
-static int pkmap_count[LAST_PKMAP];
-static unsigned int last_pkmap_nr;
-static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+/*
+ * count is not a pure "count".
+ *  0 means its owned exclusively by someone
+ *  1 means its free for use - either mapped or not.
+ *  n means that there are (n-1) current users of it.
+ */
+static atomic_t pkmap_count[LAST_PKMAP];
+static atomic_t pkmap_hand;
+static atomic_t pkmap_free;
+static atomic_t pkmap_users;
 
 pte_t * pkmap_page_table;
 
-static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_wait);
 
-static void flush_all_zero_pkmaps(void)
+/*
+ * Try to free a given kmap slot.
+ *
+ * Returns:
+ *  -1 - in use
+ *   0 - free, no TLB flush needed
+ *   1 - free, needs TLB flush
+ */
+static int pkmap_try_free(int pos)
 {
-	int i;
+	if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
+		return -1;
+
+	atomic_dec(&pkmap_free);
+	/*
+	 * TODO: add a young bit to make it CLOCK
+	 */
+	if (!pte_none(pkmap_page_table[pos])) {
+		struct page *page = pte_page(pkmap_page_table[pos]);
+		unsigned long addr = PKMAP_ADDR(pos);
+		pte_t *ptep = &pkmap_page_table[pos];
+
+		VM_BUG_ON(addr != (unsigned long)page_address(page));
+
+		if (!__set_page_address(page, NULL, pos))
+			BUG();
+		flush_kernel_dcache_page(page);
+		pte_clear(&init_mm, addr, ptep);
+
+		return 1;
+	}
+
+	return 0;
+}
+
+static inline void pkmap_put(atomic_t *counter)
+{
+	switch (atomic_dec_return(counter)) {
+	case 0:
+		BUG();
 
-	flush_cache_kmaps();
+	case 1:
+		atomic_inc(&pkmap_free);
+		wake_up(&pkmap_wait);
+	}
+}
 
+#define TLB_BATCH	32
+
+static int pkmap_get_free(void)
+{
+	int i, pos, flush;
+
+restart:
 	for (i = 0; i < LAST_PKMAP; i++) {
-		struct page *page;
+		pos = atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK;
+		flush = pkmap_try_free(pos);
+		if (flush >= 0)
+			goto got_one;
+	}
+
+	/*
+	 * wait for somebody else to unmap their entries
+	 */
+	if (likely(!in_interrupt()))
+		wait_event(pkmap_wait, atomic_read(&pkmap_free) != 0);
+
+	goto restart;
+
+got_one:
+	if (flush) {
+#if 0
+		flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
+#else
+		int pos2 = (pos + 1) & LAST_PKMAP_MASK;
+		int nr;
+		int entries[TLB_BATCH];
 
 		/*
-		 * zero means we don't have anything to do,
-		 * >1 means that it is still in use. Only
-		 * a count of 1 means that it is free but
-		 * needs to be unmapped
+		 * For those architectures that cannot help but flush the
+		 * whole TLB, flush some more entries to make it worthwhile.
+		 * Scan ahead of the hand to minimise search distances.
 		 */
-		if (pkmap_count[i] != 1)
-			continue;
-		pkmap_count[i] = 0;
+		for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
+				i++, pos2 = (pos2 + 1) & LAST_PKMAP_MASK) {
 
-		/* sanity check */
-		BUG_ON(pte_none(pkmap_page_table[i]));
+			flush = pkmap_try_free(pos2);
+			if (flush < 0)
+				continue;
+
+			if (!flush) {
+				atomic_t *counter = &pkmap_count[pos2];
+				VM_BUG_ON(atomic_read(counter) != 0);
+				atomic_set(counter, 2);
+				pkmap_put(counter);
+			} else
+				entries[nr++] = pos2;
+		}
+		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+
+		for (i = 0; i < nr; i++) {
+			atomic_t *counter = &pkmap_count[entries[i]];
+			VM_BUG_ON(atomic_read(counter) != 0);
+			atomic_set(counter, 2);
+			pkmap_put(counter);
+		}
+#endif
+	}
+	return pos;
+}
+
+static unsigned long pkmap_insert(struct page *page)
+{
+	int pos = pkmap_get_free();
+	unsigned long vaddr = PKMAP_ADDR(pos);
+	pte_t *ptep = &pkmap_page_table[pos];
+	pte_t entry = mk_pte(page, kmap_prot);
+	atomic_t *counter = &pkmap_count[pos];
 
+	VM_BUG_ON(atomic_read(counter) != 0);
+
+	set_pte_at(&init_mm, vaddr, ptep, entry);
+	if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
 		/*
-		 * Don't need an atomic fetch-and-clear op here;
-		 * no-one has the page mapped, and cannot get at
-		 * its virtual address (and hence PTE) without first
-		 * getting the kmap_lock (which is held here).
-		 * So no dangers, even with speculative execution.
+		 * concurrent pkmap_inserts for this page -
+		 * the other won the race, release this entry.
+		 *
+		 * we can still clear the pte without a tlb flush since
+		 * it couldn't have been used yet.
 		 */
-		page = pte_page(pkmap_page_table[i]);
-		pte_clear(&init_mm, (unsigned long)page_address(page),
-			  &pkmap_page_table[i]);
+		pte_clear(&init_mm, vaddr, ptep);
+		VM_BUG_ON(atomic_read(counter) != 0);
+		atomic_set(counter, 2);
+		pkmap_put(counter);
+		vaddr = 0;
+	} else
+		atomic_set(counter, 2);
 
-		set_page_address(page, NULL);
-	}
-	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+	return vaddr;
 }
 
-/**
- * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
+/*
+ * Flush all unused kmap mappings in order to remove stray mappings.
  */
 void kmap_flush_unused(void)
 {
-	spin_lock(&kmap_lock);
-	flush_all_zero_pkmaps();
-	spin_unlock(&kmap_lock);
+	WARN_ON_ONCE(1);
 }
 
-static inline unsigned long map_new_virtual(struct page *page)
+/*
+ * Avoid starvation deadlock by limiting the number of tasks that can obtain a
+ * kmap to (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2.
+ */
+static void kmap_account(void)
 {
-	unsigned long vaddr;
-	int count;
+	int weight;
 
-start:
-	count = LAST_PKMAP;
-	/* Find an empty entry */
-	for (;;) {
-		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
-		if (!last_pkmap_nr) {
-			flush_all_zero_pkmaps();
-			count = LAST_PKMAP;
-		}
-		if (!pkmap_count[last_pkmap_nr])
-			break;	/* Found a usable entry */
-		if (--count)
-			continue;
+#ifndef CONFIG_PREEMPT_RT
+	if (in_interrupt()) {
+		/* irqs can always get them */
+		weight = -1;
+	} else
+#endif
+	if (current->flags & PF_KMAP) {
+		current->flags &= ~PF_KMAP;
+		/* we already accounted the second */
+		weight = 0;
+	} else {
+		/* mark 1, account 2 */
+		current->flags |= PF_KMAP;
+		weight = 2;
+	}
 
+	if (weight > 0) {
 		/*
-		 * Sleep for somebody else to unmap their entries
+		 * reserve KM_TYPE_NR maps per CPU for interrupt context
 		 */
-		{
-			DECLARE_WAITQUEUE(wait, current);
-
-			__set_current_state(TASK_UNINTERRUPTIBLE);
-			add_wait_queue(&pkmap_map_wait, &wait);
-			spin_unlock(&kmap_lock);
-			schedule();
-			remove_wait_queue(&pkmap_map_wait, &wait);
-			spin_lock(&kmap_lock);
-
-			/* Somebody else might have mapped it while we slept */
-			if (page_address(page))
-				return (unsigned long)page_address(page);
+		const int target = LAST_PKMAP
+#ifndef CONFIG_PREEMPT_RT
+				- KM_TYPE_NR*NR_CPUS
+#endif
+			;
 
-			/* Re-start */
-			goto start;
+again:
+		wait_event(pkmap_wait,
+			atomic_read(&pkmap_users) + weight <= target);
+
+		if (atomic_add_return(weight, &pkmap_users) > target) {
+			atomic_sub(weight, &pkmap_users);
+			goto again;
 		}
 	}
-	vaddr = PKMAP_ADDR(last_pkmap_nr);
-	set_pte_at(&init_mm, vaddr,
-		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+}
 
-	pkmap_count[last_pkmap_nr] = 1;
-	set_page_address(page, (void *)vaddr);
+static void kunmap_account(void)
+{
+	int weight;
 
-	return vaddr;
+#ifndef CONFIG_PREEMPT_RT
+	if (in_irq()) {
+		weight = -1;
+	} else
+#endif
+	if (current->flags & PF_KMAP) {
+		/* there was only 1 kmap, un-account both */
+		current->flags &= ~PF_KMAP;
+		weight = 2;
+	} else {
+		/* there were two kmaps, un-account per kunmap */
+		weight = 1;
+	}
+
+	if (weight > 0)
+		atomic_sub(weight, &pkmap_users);
+	wake_up(&pkmap_wait);
 }
 
-/**
- * kmap_high - map a highmem page into memory
- * @page: &struct page to map
- *
- * Returns the page's virtual memory address.
- *
- * We cannot call this from interrupts, as it may block.
- */
-void *kmap_high(struct page *page)
+ void *kmap_high(struct page *page)
 {
 	unsigned long vaddr;
 
-	/*
-	 * For highmem pages, we can't trust "virtual" until
-	 * after we have the lock.
-	 */
-	spin_lock(&kmap_lock);
+	kmap_account();
+again:
 	vaddr = (unsigned long)page_address(page);
+	if (vaddr) {
+		atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)];
+		if (atomic_inc_not_zero(counter)) {
+			/*
+			 * atomic_inc_not_zero implies a (memory) barrier on success
+			 * so page address will be reloaded.
+			 */
+			unsigned long vaddr2 = (unsigned long)page_address(page);
+			if (likely(vaddr == vaddr2))
+				return (void *)vaddr;
+
+			/*
+			 * Oops, we got someone else.
+			 *
+			 * This can happen if we get preempted after
+			 * page_address() and before atomic_inc_not_zero()
+			 * and during that preemption this slot is freed and
+			 * reused.
+			 */
+			pkmap_put(counter);
+			goto again;
+		}
+	}
+
+	vaddr = pkmap_insert(page);
 	if (!vaddr)
-		vaddr = map_new_virtual(page);
-	pkmap_count[PKMAP_NR(vaddr)]++;
-	BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
-	spin_unlock(&kmap_lock);
-	return (void*) vaddr;
+		goto again;
+
+	return (void *)vaddr;
 }
 
 EXPORT_SYMBOL(kmap_high);
 
-/**
- * kunmap_high - map a highmem page into memory
- * @page: &struct page to unmap
- */
-void kunmap_high(struct page *page)
+ void kunmap_high(struct page *page)
 {
-	unsigned long vaddr;
-	unsigned long nr;
-	int need_wakeup;
-
-	spin_lock(&kmap_lock);
-	vaddr = (unsigned long)page_address(page);
+	unsigned long vaddr = (unsigned long)page_address(page);
 	BUG_ON(!vaddr);
-	nr = PKMAP_NR(vaddr);
-
-	/*
-	 * A count must never go down to zero
-	 * without a TLB flush!
-	 */
-	need_wakeup = 0;
-	switch (--pkmap_count[nr]) {
-	case 0:
-		BUG();
-	case 1:
-		/*
-		 * Avoid an unnecessary wake_up() function call.
-		 * The common case is pkmap_count[] == 1, but
-		 * no waiters.
-		 * The tasks queued in the wait-queue are guarded
-		 * by both the lock in the wait-queue-head and by
-		 * the kmap_lock.  As the kmap_lock is held here,
-		 * no need for the wait-queue-head's lock.  Simply
-		 * test if the queue is empty.
-		 */
-		need_wakeup = waitqueue_active(&pkmap_map_wait);
-	}
-	spin_unlock(&kmap_lock);
-
-	/* do wake-up, if needed, race-free outside of the spin lock */
-	if (need_wakeup)
-		wake_up(&pkmap_map_wait);
+	pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
+	kunmap_account();
 }
 
 EXPORT_SYMBOL(kunmap_high);
@@ -243,19 +348,13 @@ EXPORT_SYMBOL(kunmap_high);
 #define PA_HASH_ORDER	7
 
 /*
- * Describes one page->virtual association
+ * Describes one page->virtual address association.
  */
-struct page_address_map {
+static struct page_address_map {
 	struct page *page;
 	void *virtual;
 	struct list_head list;
-};
-
-/*
- * page_address_map freelist, allocated from page_address_maps.
- */
-static struct list_head page_address_pool;	/* freelist */
-static spinlock_t pool_lock;			/* protects page_address_pool */
+} page_address_maps[LAST_PKMAP];
 
 /*
  * Hash table bucket
@@ -276,29 +375,37 @@ static struct page_address_slot *page_sl
  *
  * Returns the page's virtual address.
  */
-void *page_address(struct page *page)
-{
-	unsigned long flags;
-	void *ret;
-	struct page_address_slot *pas;
 
-	if (!PageHighMem(page))
-		return lowmem_page_address(page);
+static void *__page_address(struct page_address_slot *pas, struct page *page)
+{
+	void *ret = NULL;
 
-	pas = page_slot(page);
-	ret = NULL;
-	spin_lock_irqsave(&pas->lock, flags);
 	if (!list_empty(&pas->lh)) {
 		struct page_address_map *pam;
 
 		list_for_each_entry(pam, &pas->lh, list) {
 			if (pam->page == page) {
 				ret = pam->virtual;
-				goto done;
+				break;
 			}
 		}
 	}
-done:
+
+	return ret;
+}
+
+void *page_address(struct page *page)
+{
+	unsigned long flags;
+	void *ret;
+	struct page_address_slot *pas;
+
+	if (!PageHighMem(page))
+		return lowmem_page_address(page);
+
+	pas = page_slot(page);
+	spin_lock_irqsave(&pas->lock, flags);
+	ret = __page_address(pas, page);
 	spin_unlock_irqrestore(&pas->lock, flags);
 	return ret;
 }
@@ -310,62 +417,90 @@ EXPORT_SYMBOL(page_address);
  * @page: &struct page to set
  * @virtual: virtual address to use
  */
-void set_page_address(struct page *page, void *virtual)
+static int __set_page_address(struct page *page, void *virtual, int pos)
 {
+	int ret = 0;
 	unsigned long flags;
 	struct page_address_slot *pas;
 	struct page_address_map *pam;
 
-	BUG_ON(!PageHighMem(page));
+	VM_BUG_ON(!PageHighMem(page));
+	VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0);
+	VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);
 
 	pas = page_slot(page);
-	if (virtual) {		/* Add */
-		BUG_ON(list_empty(&page_address_pool));
+	pam = &page_address_maps[pos];
 
-		spin_lock_irqsave(&pool_lock, flags);
-		pam = list_entry(page_address_pool.next,
-				struct page_address_map, list);
-		list_del(&pam->list);
-		spin_unlock_irqrestore(&pool_lock, flags);
-
-		pam->page = page;
-		pam->virtual = virtual;
-
-		spin_lock_irqsave(&pas->lock, flags);
-		list_add_tail(&pam->list, &pas->lh);
-		spin_unlock_irqrestore(&pas->lock, flags);
-	} else {		/* Remove */
-		spin_lock_irqsave(&pas->lock, flags);
-		list_for_each_entry(pam, &pas->lh, list) {
-			if (pam->page == page) {
-				list_del(&pam->list);
-				spin_unlock_irqrestore(&pas->lock, flags);
-				spin_lock_irqsave(&pool_lock, flags);
-				list_add_tail(&pam->list, &page_address_pool);
-				spin_unlock_irqrestore(&pool_lock, flags);
-				goto done;
-			}
+	spin_lock_irqsave(&pas->lock, flags);
+	if (virtual) { /* add */
+		VM_BUG_ON(!list_empty(&pam->list));
+
+		if (!__page_address(pas, page)) {
+			pam->page = page;
+			pam->virtual = virtual;
+			list_add_tail(&pam->list, &pas->lh);
+			ret = 1;
+		}
+	} else { /* remove */
+		if (!list_empty(&pam->list)) {
+			list_del_init(&pam->list);
+			ret = 1;
 		}
-		spin_unlock_irqrestore(&pas->lock, flags);
 	}
-done:
-	return;
+	spin_unlock_irqrestore(&pas->lock, flags);
+
+	return ret;
 }
 
-static struct page_address_map page_address_maps[LAST_PKMAP];
+int set_page_address(struct page *page, void *virtual)
+{
+	/*
+	 * set_page_address is not supposed to be called when using
+	 * hashed virtual addresses.
+	 */
+	BUG();
+	return 0;
+}
 
-void __init page_address_init(void)
+void __init __page_address_init(void)
 {
 	int i;
 
-	INIT_LIST_HEAD(&page_address_pool);
 	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
-		list_add(&page_address_maps[i].list, &page_address_pool);
+		INIT_LIST_HEAD(&page_address_maps[i].list);
+
 	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
 		INIT_LIST_HEAD(&page_address_htable[i].lh);
 		spin_lock_init(&page_address_htable[i].lock);
 	}
-	spin_lock_init(&pool_lock);
+}
+
+#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */
+
+static int __set_page_address(struct page *page, void *virtual, int pos)
+{
+	return set_page_address(page, virtual);
+}
+
+#endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+
+#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)
+
+void __init page_address_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
+		atomic_set(&pkmap_count[i], 1);
+	atomic_set(&pkmap_hand, 0);
+	atomic_set(&pkmap_free, LAST_PKMAP);
+	atomic_set(&pkmap_users, 0);
+#endif
+
+#ifdef HASHED_PAGE_VIRTUAL
+	__page_address_init();
+#endif
 }
 
 #endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
Index: linux-2.6.25.4-rt4/include/linux/mm.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/mm.h	2008-05-29 09:45:39.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/mm.h	2008-05-29 09:46:59.000000000 -0400
@@ -565,23 +565,39 @@ static __always_inline void *lowmem_page
 #endif
 
 #if defined(WANT_PAGE_VIRTUAL)
-#define page_address(page) ((page)->virtual)
-#define set_page_address(page, address)			\
-	do {						\
-		(page)->virtual = (address);		\
-	} while(0)
-#define page_address_init()  do { } while(0)
+/*
+ * wrap page->virtual so it is safe to set/read locklessly
+ */
+#define page_address(page) \
+	({ typeof((page)->virtual) v = (page)->virtual; \
+	 smp_read_barrier_depends(); \
+	 v; })
+
+static inline int set_page_address(struct page *page, void *address)
+{
+	if (address)
+		return cmpxchg(&page->virtual, NULL, address) == NULL;
+	else {
+		/*
+		 * cmpxchg is a bit abused because it is not guaranteed
+		 * safe wrt direct assignment on all platforms.
+		 */
+		void *virt = page->virtual;
+		return cmpxchg(&page->vitrual, virt, NULL) == virt;
+	}
+}
+void page_address_init(void);
 #endif
 
 #if defined(HASHED_PAGE_VIRTUAL)
 void *page_address(struct page *page);
-void set_page_address(struct page *page, void *virtual);
+int set_page_address(struct page *page, void *virtual);
 void page_address_init(void);
 #endif
 
 #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
 #define page_address(page) lowmem_page_address(page)
-#define set_page_address(page, address)  do { } while(0)
+#define set_page_address(page, address)  (0)
 #define page_address_init()  do { } while(0)
 #endif
 
Index: linux-2.6.25.4-rt4/drivers/char/rmem.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/drivers/char/rmem.c	2008-05-29 09:47:00.000000000 -0400
@@ -0,0 +1,134 @@
+/*
+ * Rmem - REALLY simple memory mapping demonstration.
+ *
+ *  Copyright (C) 2005 by Theodore Ts'o
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/kdev_t.h>
+#include <asm/page.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+
+static int rmem_major = 0;
+module_param(rmem_major, int, 0444);
+
+static struct class *rmem_class;
+
+MODULE_AUTHOR("Theodore Ts'o");
+MODULE_LICENSE("GPL");
+
+struct page *rmem_vma_nopage(struct vm_area_struct *vma,
+                unsigned long address, int *type)
+{
+	struct page *pageptr;
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long physaddr = address - vma->vm_start + offset;
+	unsigned long pageframe = physaddr >> PAGE_SHIFT;
+
+	if (!pfn_valid(pageframe))
+		return NOPAGE_SIGBUS;
+	pageptr = pfn_to_page(pageframe);
+	get_page(pageptr);
+	if (type)
+		*type = VM_FAULT_MINOR;
+	return pageptr;
+}
+
+static struct vm_operations_struct rmem_nopage_vm_ops = {
+	.nopage = rmem_vma_nopage,
+};
+
+static int rmem_nopage_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+
+	if (offset >= __pa(high_memory) || (filp->f_flags & O_SYNC))
+		vma->vm_flags |= VM_IO;
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &rmem_nopage_vm_ops;
+#ifdef TAINT_USER
+	add_taint(TAINT_USER);
+#endif
+	return 0;
+}
+
+static struct file_operations rmem_nopage_ops = {
+	.owner   = THIS_MODULE,
+	.mmap    = rmem_nopage_mmap,
+};
+
+static struct cdev rmem_cdev = {
+	.kobj	=	{.name = "rmem", },
+	.owner	=	THIS_MODULE,
+};
+
+static int __init rmem_init(void)
+{
+	int result;
+	dev_t dev = MKDEV(rmem_major, 0);
+
+	/* Figure out our device number. */
+	if (rmem_major)
+		result = register_chrdev_region(dev, 1, "rmem");
+	else {
+		result = alloc_chrdev_region(&dev, 0, 1, "rmem");
+		rmem_major = MAJOR(dev);
+	}
+	if (result < 0) {
+		printk(KERN_WARNING "rmem: unable to get major %d\n", rmem_major);
+		return result;
+	}
+	if (rmem_major == 0)
+		rmem_major = result;
+
+	cdev_init(&rmem_cdev, &rmem_nopage_ops);
+	result = cdev_add(&rmem_cdev, dev, 1);
+	if (result) {
+		printk (KERN_NOTICE "Error %d adding /dev/rmem", result);
+		kobject_put(&rmem_cdev.kobj);
+		unregister_chrdev_region(dev, 1);
+		return 1;
+	}
+
+	rmem_class = class_create(THIS_MODULE, "rmem");
+	class_device_create(rmem_class, NULL, dev, NULL, "rmem");
+
+	return 0;
+}
+
+
+static void __exit rmem_cleanup(void)
+{
+	cdev_del(&rmem_cdev);
+	unregister_chrdev_region(MKDEV(rmem_major, 0), 1);
+	class_destroy(rmem_class);
+}
+
+
+module_init(rmem_init);
+module_exit(rmem_cleanup);
Index: linux-2.6.25.4-rt4/drivers/char/alloc_rtsj_mem.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/drivers/char/alloc_rtsj_mem.c	2008-05-29 09:47:00.000000000 -0400
@@ -0,0 +1,88 @@
+/*
+ *  alloc_rtsj_mem.c -- Hack to allocate some memory
+ *
+ *  Copyright (C) 2005 by Theodore Ts'o
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/sysctl.h>
+#include <linux/bootmem.h>
+
+#include <asm/io.h>
+
+MODULE_AUTHOR("Theodore Tso");
+MODULE_DESCRIPTION("RTSJ alloc memory");
+MODULE_LICENSE("GPL");
+
+static void *mem = 0;
+int size = 0, addr = 0;
+
+module_param(size, int, 0444);
+module_param(addr, int, 0444);
+
+static void __exit shutdown_module(void)
+{
+	kfree(mem);
+}
+
+#ifndef MODULE
+void __init alloc_rtsj_mem_early_setup(void)
+{
+	if (size > PAGE_SIZE*2) {
+		mem = alloc_bootmem(size);
+		if (mem) {
+			printk(KERN_INFO "alloc_rtsj_mem: got %d bytes "
+			       "using alloc_bootmem\n", size);
+		} else {
+			printk(KERN_INFO "alloc_rtsj_mem: failed to "
+			       "get %d bytes from alloc_bootmem\n", size);
+		}
+	}
+}
+#endif
+
+static int __init startup_module(void)
+{
+	static char test_string[] = "The BOFH: Servicing users the way the "
+		"military\n\tservices targets for 15 years.\n";
+
+	if (!size)
+		return 0;
+
+	if (!mem) {
+		mem = kmalloc(size, GFP_KERNEL);
+		if (mem) {
+			printk(KERN_INFO "alloc_rtsj_mem: got %d bytes "
+			       "using kmalloc\n", size);
+		} else {
+			printk(KERN_ERR "alloc_rtsj_mem: failed to get "
+			       "%d bytes using kmalloc\n", size);
+			return -ENOMEM;
+		}
+	}
+	memcpy(mem, test_string, min(sizeof(test_string), (size_t) size));
+	addr = virt_to_phys(mem);
+	return 0;
+}
+
+module_init(startup_module);
+module_exit(shutdown_module);
+
Index: linux-2.6.25.4-rt4/include/linux/netpoll.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/netpoll.h	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/netpoll.h	2008-05-29 09:47:02.000000000 -0400
@@ -77,7 +77,7 @@ static inline void *netpoll_poll_lock(st
 	rcu_read_lock(); /* deal with race on ->npinfo */
 	if (dev && dev->npinfo) {
 		spin_lock(&napi->poll_lock);
-		napi->poll_owner = smp_processor_id();
+		napi->poll_owner = raw_smp_processor_id();
 		return napi;
 	}
 	return NULL;
Index: linux-2.6.25.4-rt4/net/irda/irttp.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/irda/irttp.c	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/net/irda/irttp.c	2008-05-29 09:47:02.000000000 -0400
@@ -1453,6 +1453,7 @@ struct tsap_cb *irttp_dup(struct tsap_cb
 	}
 	/* Dup */
 	memcpy(new, orig, sizeof(struct tsap_cb));
+	spin_lock_init(&new->lock);
 
 	/* We don't need the old instance any more */
 	spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags);
Index: linux-2.6.25.4-rt4/net/netfilter/nf_conntrack_core.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/netfilter/nf_conntrack_core.c	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/net/netfilter/nf_conntrack_core.c	2008-05-29 09:47:02.000000000 -0400
@@ -1159,6 +1159,24 @@ int __init nf_conntrack_init(void)
 	/*  - and look it like as a confirmed connection */
 	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
 
+	/*
+	 * There's something really weird (read: crash) going on in
+	 * this module when lockdep and rt is enabled - the locks are
+	 * not initialized in the per-CPU area properly - or they might
+	 * be initialized by getting a copy of the first CPU's per-cpu
+	 * area? Only seems to happen when things are modular. Maybe
+	 * per-cpu-alloc does not zero buffers properly? Needs
+	 * investigating. Reported and fixed by Mike.
+	 */
+#if defined(CONFIG_NF_CONNTRACK_EVENTS) && defined(CONFIG_SMP)
+	{
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			spin_lock_init(&per_cpu_lock(nf_conntrack_ecache, cpu));
+	}
+#endif
+
 	return ret;
 
 out_fini_expect:
Index: linux-2.6.25.4-rt4/init/Makefile
===================================================================
--- linux-2.6.25.4-rt4.orig/init/Makefile	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/init/Makefile	2008-05-29 09:47:04.000000000 -0400
@@ -33,4 +33,5 @@ silent_chk_compile.h = :
 include/linux/compile.h: FORCE
 	@$($(quiet)chk_compile.h)
 	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
-	"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
+	"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT)" \
+	"$(CC) $(KBUILD_CFLAGS)"
Index: linux-2.6.25.4-rt4/scripts/mkcompile_h
===================================================================
--- linux-2.6.25.4-rt4.orig/scripts/mkcompile_h	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/scripts/mkcompile_h	2008-05-29 09:47:04.000000000 -0400
@@ -2,7 +2,8 @@ TARGET=$1
 ARCH=$2
 SMP=$3
 PREEMPT=$4
-CC=$5
+PREEMPT_RT=$5
+CC=$6
 
 # If compile.h exists already and we don't own autoconf.h
 # (i.e. we're not the same user who did make *config), don't
@@ -43,6 +44,7 @@ UTS_VERSION="#$VERSION"
 CONFIG_FLAGS=""
 if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
+if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
 
 # Truncate to maximum length
Index: linux-2.6.25.4-rt4/arch/x86/kernel/paravirt.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/paravirt.c	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/paravirt.c	2008-05-29 09:47:04.000000000 -0400
@@ -366,6 +366,16 @@ struct pv_apic_ops pv_apic_ops = {
 #endif
 };
 
+#ifdef CONFIG_HIGHPTE
+/*
+ * kmap_atomic() might be an inline or a macro:
+ */
+static void *kmap_atomic_func(struct page *page, enum km_type idx)
+{
+	return kmap_atomic(page, idx);
+}
+#endif
+
 struct pv_mmu_ops pv_mmu_ops = {
 #ifndef CONFIG_X86_64
 	.pagetable_setup_start = native_pagetable_setup_start,
@@ -395,7 +405,7 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.pte_update_defer = paravirt_nop,
 
 #ifdef CONFIG_HIGHPTE
-	.kmap_atomic_pte = kmap_atomic,
+	.kmap_atomic_pte = kmap_atomic_func,
 #endif
 
 #if PAGETABLE_LEVELS >= 3
Index: linux-2.6.25.4-rt4/include/linux/quicklist.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/quicklist.h	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/quicklist.h	2008-05-29 09:47:04.000000000 -0400
@@ -18,7 +18,7 @@ struct quicklist {
 	int nr_pages;
 };
 
-DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
+DECLARE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK];
 
 /*
  * The two key functions quicklist_alloc and quicklist_free are inline so
@@ -30,19 +30,27 @@ DECLARE_PER_CPU(struct quicklist, quickl
  * The fast patch in quicklist_alloc touched only a per cpu cacheline and
  * the first cacheline of the page itself. There is minmal overhead involved.
  */
-static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *))
+static inline void *__quicklist_alloc(struct quicklist *q)
 {
-	struct quicklist *q;
-	void **p = NULL;
+	void **p = q->page;
 
-	q =&get_cpu_var(quicklist)[nr];
-	p = q->page;
 	if (likely(p)) {
 		q->page = p[0];
 		p[0] = NULL;
 		q->nr_pages--;
 	}
-	put_cpu_var(quicklist);
+	return p;
+}
+
+static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *))
+{
+	struct quicklist *q;
+	void **p;
+	int cpu;
+
+	q = &get_cpu_var_locked(quicklist, &cpu)[nr];
+	p = __quicklist_alloc(q);
+	put_cpu_var_locked(quicklist, cpu);
 	if (likely(p))
 		return p;
 
@@ -56,12 +64,13 @@ static inline void __quicklist_free(int 
 	struct page *page)
 {
 	struct quicklist *q;
+	int cpu;
 
-	q = &get_cpu_var(quicklist)[nr];
+	q = &get_cpu_var_locked(quicklist, &cpu)[nr];
 	*(void **)p = q->page;
 	q->page = p;
 	q->nr_pages++;
-	put_cpu_var(quicklist);
+	put_cpu_var_locked(quicklist, cpu);
 }
 
 static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp)
Index: linux-2.6.25.4-rt4/mm/quicklist.c
===================================================================
--- linux-2.6.25.4-rt4.orig/mm/quicklist.c	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/mm/quicklist.c	2008-05-29 09:47:04.000000000 -0400
@@ -19,7 +19,7 @@
 #include <linux/module.h>
 #include <linux/quicklist.h>
 
-DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
+DEFINE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK];
 
 #define FRACTION_OF_NODE_MEM	16
 
@@ -59,17 +59,14 @@ void quicklist_trim(int nr, void (*dtor)
 {
 	long pages_to_free;
 	struct quicklist *q;
+	int cpu;
 
-	q = &get_cpu_var(quicklist)[nr];
+	q = &get_cpu_var_locked(quicklist, &cpu)[nr];
 	if (q->nr_pages > min_pages) {
 		pages_to_free = min_pages_to_free(q, min_pages, max_free);
 
 		while (pages_to_free > 0) {
-			/*
-			 * We pass a gfp_t of 0 to quicklist_alloc here
-			 * because we will never call into the page allocator.
-			 */
-			void *p = quicklist_alloc(nr, 0, NULL);
+			void *p = __quicklist_alloc(q);
 
 			if (dtor)
 				dtor(p);
@@ -77,7 +74,7 @@ void quicklist_trim(int nr, void (*dtor)
 			pages_to_free--;
 		}
 	}
-	put_cpu_var(quicklist);
+	put_cpu_var_locked(quicklist, cpu);
 }
 
 unsigned long quicklist_total_size(void)
@@ -87,7 +84,7 @@ unsigned long quicklist_total_size(void)
 	struct quicklist *ql, *q;
 
 	for_each_online_cpu(cpu) {
-		ql = per_cpu(quicklist, cpu);
+		ql = per_cpu_var_locked(quicklist, cpu);
 		for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
 			count += q->nr_pages;
 	}
Index: linux-2.6.25.4-rt4/include/asm-x86/smp_32.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/smp_32.h	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/smp_32.h	2008-05-29 09:47:05.000000000 -0400
@@ -161,5 +161,7 @@ static inline int hard_smp_processor_id(
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+#define HAVE_RESCHEDULE_ALLBUTSELF_CPUMASK 1
+
 #endif /* !ASSEMBLY */
 #endif
Index: linux-2.6.25.4-rt4/include/asm-x86/smp_64.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/smp_64.h	2008-05-29 09:45:38.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/smp_64.h	2008-05-29 09:47:05.000000000 -0400
@@ -97,5 +97,7 @@ static inline int hard_smp_processor_id(
 	return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
 }
 
+#define HAVE_RESCHEDULE_ALLBUTSELF_CPUMASK 1
+
 #endif
 
Index: linux-2.6.25.4-rt4/kernel/relay.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/relay.c	2008-05-29 09:45:37.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/relay.c	2008-05-29 09:47:06.000000000 -0400
@@ -314,6 +314,10 @@ static void wakeup_readers(unsigned long
 {
 	struct rchan_buf *buf = (struct rchan_buf *)data;
 	wake_up_interruptible(&buf->read_wait);
+	/*
+	 * Stupid polling for now:
+	 */
+	mod_timer(&buf->timer, jiffies + 1);
 }
 
 /**
@@ -331,6 +335,7 @@ static void __relay_reset(struct rchan_b
 		init_waitqueue_head(&buf->read_wait);
 		kref_init(&buf->kref);
 		setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+		mod_timer(&buf->timer, jiffies + 1);
 	} else
 		del_timer_sync(&buf->timer);
 
@@ -600,15 +605,6 @@ size_t relay_switch_subbuf(struct rchan_
 		buf->subbufs_produced++;
 		buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
 			buf->padding[old_subbuf];
-		smp_mb();
-		if (waitqueue_active(&buf->read_wait))
-			/*
-			 * Calling wake_up_interruptible() from here
-			 * will deadlock if we happen to be logging
-			 * from the scheduler (trying to re-grab
-			 * rq->lock), so defer it.
-			 */
-			__mod_timer(&buf->timer, jiffies + 1);
 	}
 
 	old = buf->data;
Index: linux-2.6.25.4-rt4/arch/x86/kernel/setup64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/kernel/setup64.c	2008-05-29 09:45:37.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/kernel/setup64.c	2008-05-29 09:47:09.000000000 -0400
@@ -297,7 +297,9 @@ void __cpuinit cpu_init (void)
 	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
 		static const unsigned int order[N_EXCEPTION_STACKS] = {
 			[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+#if DEBUG_STACK > 0
 			[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+#endif
 		};
 		if (cpu) {
 			estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
Index: linux-2.6.25.4-rt4/include/asm-x86/page_64.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-x86/page_64.h	2008-05-29 09:45:37.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-x86/page_64.h	2008-05-29 09:47:09.000000000 -0400
@@ -16,12 +16,21 @@
 #define IRQSTACK_ORDER 2
 #define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
 
+#ifdef CONFIG_PREEMPT_RT
+#define STACKFAULT_STACK 0
+#define DOUBLEFAULT_STACK 1
+#define NMI_STACK 2
+#define DEBUG_STACK 0
+#define MCE_STACK 3
+#define N_EXCEPTION_STACKS 3  /* hw limit: 7 */
+#else
 #define STACKFAULT_STACK 1
 #define DOUBLEFAULT_STACK 2
 #define NMI_STACK 3
 #define DEBUG_STACK 4
 #define MCE_STACK 5
 #define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
+#endif
 
 #define PUD_PAGE_SIZE		(_AC(1, UL) << PUD_SHIFT)
 #define PUD_PAGE_MASK		(~(PUD_PAGE_SIZE-1))
Index: linux-2.6.25.4-rt4/include/linux/list.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/list.h	2008-05-29 09:45:36.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/list.h	2008-05-29 09:47:10.000000000 -0400
@@ -320,17 +320,17 @@ static inline int list_empty_careful(con
 }
 
 static inline void __list_splice(struct list_head *list,
-				 struct list_head *head)
+				 struct list_head *prev,
+				 struct list_head *next)
 {
 	struct list_head *first = list->next;
 	struct list_head *last = list->prev;
-	struct list_head *at = head->next;
 
-	first->prev = head;
-	head->next = first;
+	first->prev = prev;
+	prev->next = first;
 
-	last->next = at;
-	at->prev = last;
+	last->next = next;
+	next->prev = last;
 }
 
 /**
@@ -341,7 +341,13 @@ static inline void __list_splice(struct 
 static inline void list_splice(struct list_head *list, struct list_head *head)
 {
 	if (!list_empty(list))
-		__list_splice(list, head);
+		__list_splice(list, head, head->next);
+}
+
+static inline void list_splice_tail(struct list_head *list, struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head->prev, head);
 }
 
 /**
@@ -355,11 +361,55 @@ static inline void list_splice_init(stru
 				    struct list_head *head)
 {
 	if (!list_empty(list)) {
-		__list_splice(list, head);
+		__list_splice(list, head, head->next);
+		INIT_LIST_HEAD(list);
+	}
+}
+
+static inline void list_splice_tail_init(struct list_head *list,
+					 struct list_head *head)
+{
+	if (!list_empty(list)) {
+		__list_splice(list, head->prev, head);
 		INIT_LIST_HEAD(list);
 	}
 }
 
+static inline void __list_splice2(struct list_head *first,
+	       			struct list_head *last,
+				struct list_head *prev,
+				struct list_head *next)
+{
+	first->prev->next = last->next;
+	last->next->prev = first->prev;
+
+	first->prev = prev;
+	prev->next = first;
+
+	last->next = next;
+	next->prev = last;
+}
+
+/**
+ * list_splice2 - join [first, last] to head
+ * @first: list item
+ * @last: list item further on the same list
+ * @head: the place to add it on another list
+ */
+static inline void list_splice2(struct list_head *first,
+				struct list_head *last,
+				struct list_head *head)
+{
+	__list_splice2(first, last, head, head->next);
+}
+
+static inline void list_splice2_tail(struct list_head *first,
+				     struct list_head *last,
+				     struct list_head *head)
+{
+	__list_splice2(first, last, head->prev, head);
+}
+
 /**
  * list_splice_init_rcu - splice an RCU-protected list into an existing list.
  * @list:	the RCU-protected list to splice
Index: linux-2.6.25.4-rt4/drivers/usb/host/ehci-q.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/usb/host/ehci-q.c	2008-05-29 09:45:36.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/usb/host/ehci-q.c	2008-05-29 09:47:10.000000000 -0400
@@ -890,7 +890,7 @@ static struct ehci_qh *qh_append_tds (
 
 			list_del (&qtd->qtd_list);
 			list_add (&dummy->qtd_list, qtd_list);
-			__list_splice (qtd_list, qh->qtd_list.prev);
+			list_splice_tail (qtd_list, &qh->qtd_list);
 
 			ehci_qtd_init(ehci, qtd, qtd->qtd_dma);
 			qh->dummy = qtd;
Index: linux-2.6.25.4-rt4/drivers/dma/ioat_dma.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/dma/ioat_dma.c	2008-05-29 09:45:36.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/dma/ioat_dma.c	2008-05-29 09:47:10.000000000 -0400
@@ -283,7 +283,7 @@ static dma_cookie_t ioat1_tx_submit(stru
 	/* write address into NextDescriptor field of last desc in chain */
 	to_ioat_desc(ioat_chan->used_desc.prev)->hw->next =
 							first->async_tx.phys;
-	__list_splice(&new_chain, ioat_chan->used_desc.prev);
+	list_splice_tail(&new_chain, &ioat_chan->used_desc);
 
 	ioat_chan->dmacount += desc_count;
 	ioat_chan->pending += desc_count;
Index: linux-2.6.25.4-rt4/kernel/power/poweroff.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/power/poweroff.c	2008-05-29 09:45:36.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/power/poweroff.c	2008-05-29 09:47:10.000000000 -0400
@@ -8,6 +8,7 @@
 #include <linux/sysrq.h>
 #include <linux/init.h>
 #include <linux/pm.h>
+#include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>
 
Index: linux-2.6.25.4-rt4/include/linux/sched_prio.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25.4-rt4/include/linux/sched_prio.h	2008-05-29 09:47:11.000000000 -0400
@@ -0,0 +1,23 @@
+#ifndef __SCHED_PRIO_H
+#define __SCHED_PRIO_H
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space.  This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+
+#endif
Index: linux-2.6.25.4-rt4/drivers/media/video/zoran_driver.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/media/video/zoran_driver.c	2008-05-29 09:45:36.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/media/video/zoran_driver.c	2008-05-29 09:47:12.000000000 -0400
@@ -1174,7 +1174,7 @@ zoran_close_end_session (struct file *fi
 
 	/* v4l capture */
 	if (fh->v4l_buffers.active != ZORAN_FREE) {
-		long flags;
+		unsigned long flags;
 
 		spin_lock_irqsave(&zr->spinlock, flags);
 		zr36057_set_memgrab(zr, 0);
@@ -3447,7 +3447,7 @@ zoran_do_ioctl (struct inode *inode,
 
 			/* unload capture */
 			if (zr->v4l_memgrab_active) {
-				long flags;
+				unsigned long flags;
 
 				spin_lock_irqsave(&zr->spinlock, flags);
 				zr36057_set_memgrab(zr, 0);
@@ -4387,7 +4387,7 @@ zoran_vm_close (struct vm_area_struct *v
 				mutex_lock(&zr->resource_lock);
 
 				if (fh->v4l_buffers.active != ZORAN_FREE) {
-					long flags;
+					unsigned long flags;
 
 					spin_lock_irqsave(&zr->spinlock, flags);
 					zr36057_set_memgrab(zr, 0);
Index: linux-2.6.25.4-rt4/kernel/cgroup.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/cgroup.c	2008-05-29 09:45:36.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/cgroup.c	2008-05-29 09:47:13.000000000 -0400
@@ -168,7 +168,7 @@ list_for_each_entry(_root, &roots, root_
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
-static DEFINE_SPINLOCK(release_list_lock);
+static DEFINE_RAW_SPINLOCK(release_list_lock);
 static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
Index: linux-2.6.25.4-rt4/include/linux/proportions.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/proportions.h	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/proportions.h	2008-05-29 09:47:14.000000000 -0400
@@ -58,7 +58,7 @@ struct prop_local_percpu {
 	 */
 	int shift;
 	unsigned long period;
-	spinlock_t lock;		/* protect the snapshot state */
+	raw_spinlock_t lock;		/* protect the snapshot state */
 };
 
 int prop_local_init_percpu(struct prop_local_percpu *pl);
@@ -93,11 +93,11 @@ struct prop_local_single {
 	 */
 	int shift;
 	unsigned long period;
-	spinlock_t lock;		/* protect the snapshot state */
+	raw_spinlock_t lock;		/* protect the snapshot state */
 };
 
 #define INIT_PROP_LOCAL_SINGLE(name)			\
-{	.lock = __SPIN_LOCK_UNLOCKED(name.lock),	\
+{	.lock = RAW_SPIN_LOCK_UNLOCKED(name.lock),	\
 }
 
 int prop_local_init_single(struct prop_local_single *pl);
Index: linux-2.6.25.4-rt4/arch/arm/mach-at91/gpio.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/mach-at91/gpio.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/mach-at91/gpio.c	2008-05-29 09:47:15.000000000 -0400
@@ -368,12 +368,18 @@ static int gpio_irq_type(unsigned pin, u
 	}
 }
 
+static void gpio_irq_ack_noop(unsigned int irq)
+{
+	/* Dummy function.  */
+}
+
 static struct irq_chip gpio_irqchip = {
 	.name		= "GPIO",
 	.mask		= gpio_irq_mask,
 	.unmask		= gpio_irq_unmask,
 	.set_type	= gpio_irq_type,
 	.set_wake	= gpio_irq_set_wake,
+	.ack            = gpio_irq_ack_noop,
 };
 
 static void gpio_irq_handler(unsigned irq, struct irq_desc *desc)
@@ -522,7 +528,7 @@ void __init at91_gpio_irq_setup(void)
 			 * shorter, and the AIC handles interrupts sanely.
 			 */
 			set_irq_chip(pin, &gpio_irqchip);
-			set_irq_handler(pin, handle_simple_irq);
+			set_irq_handler(pin, handle_edge_irq);
 			set_irq_flags(pin, IRQF_VALID);
 		}
 
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/process.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/process.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/process.c	2008-05-29 09:47:16.000000000 -0400
@@ -269,6 +269,10 @@ struct task_struct *__switch_to(struct t
 	struct thread_struct *new_thread, *old_thread;
 	unsigned long flags;
 	struct task_struct *last;
+#if defined(CONFIG_PPC64) && defined (CONFIG_PREEMPT_RT)
+	struct ppc64_tlb_batch *batch;
+	int hadbatch;
+#endif
 
 #ifdef CONFIG_SMP
 	/* avoid complexity of lazy save/restore of fpu
@@ -345,6 +349,17 @@ struct task_struct *__switch_to(struct t
 		old_thread->accum_tb += (current_tb - start_tb);
 		new_thread->start_tb = current_tb;
 	}
+
+#ifdef CONFIG_PREEMPT_RT
+	batch = &__get_cpu_var(ppc64_tlb_batch);
+	if (batch->active) {
+		hadbatch = 1;
+		if (batch->index) {
+			__flush_tlb_pending(batch);
+		}
+		batch->active = 0;
+	}
+#endif /* #ifdef CONFIG_PREEMPT_RT */
 #endif
 
 	local_irq_save(flags);
@@ -363,6 +378,13 @@ struct task_struct *__switch_to(struct t
 
 	local_irq_restore(flags);
 
+#if defined(CONFIG_PPC64) && defined(CONFIG_PREEMPT_RT)
+	if (hadbatch) {
+		batch = &__get_cpu_var(ppc64_tlb_batch);
+		batch->active = 1;
+	}
+#endif
+
 	return last;
 }
 
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/prom.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/prom.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/prom.c	2008-05-29 09:47:15.000000000 -0400
@@ -79,7 +79,7 @@ struct boot_param_header *initial_boot_p
 
 extern struct device_node *allnodes;	/* temporary while merging */
 
-extern rwlock_t devtree_lock;	/* temporary while merging */
+extern raw_rwlock_t devtree_lock;	/* temporary while merging */
 
 /* export that to outside world */
 struct device_node *of_chosen;
Index: linux-2.6.25.4-rt4/arch/powerpc/platforms/pseries/eeh.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/platforms/pseries/eeh.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/platforms/pseries/eeh.c	2008-05-29 09:47:15.000000000 -0400
@@ -99,7 +99,7 @@ int eeh_subsystem_enabled;
 EXPORT_SYMBOL(eeh_subsystem_enabled);
 
 /* Lock to avoid races due to multiple reports of an error */
-static DEFINE_SPINLOCK(confirm_error_lock);
+static DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
 /* Buffer for reporting slot-error-detail rtas calls. Its here
  * in BSS, and not dynamically alloced, so that it ends up in
Index: linux-2.6.25.4-rt4/drivers/of/base.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/of/base.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/of/base.c	2008-05-29 09:47:15.000000000 -0400
@@ -25,7 +25,7 @@ struct device_node *allnodes;
 /* use when traversing tree through the allnext, child, sibling,
  * or parent members of struct device_node.
  */
-DEFINE_RWLOCK(devtree_lock);
+DEFINE_RAW_RWLOCK(devtree_lock);
 
 int of_n_addr_cells(struct device_node *np)
 {
Index: linux-2.6.25.4-rt4/include/asm-powerpc/tlbflush.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/tlbflush.h	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/tlbflush.h	2008-05-29 09:47:19.000000000 -0400
@@ -109,18 +109,39 @@ extern void hpte_need_flush(struct mm_st
 
 static inline void arch_enter_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch;
+#ifdef CONFIG_PREEMPT_RT
+	preempt_disable();
+#endif
+	batch = &get_cpu_var(ppc64_tlb_batch);
+
+#ifdef CONFIG_PREEMPT_RT
+	preempt_enable();
+#endif
 
 	batch->active = 1;
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 static inline void arch_leave_lazy_mmu_mode(void)
 {
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	struct ppc64_tlb_batch *batch;
 
-	if (batch->index)
-		__flush_tlb_pending(batch);
-	batch->active = 0;
+#ifdef CONFIG_PREEMPT_RT
+	preempt_disable();
+#endif
+	batch = &get_cpu_var(ppc64_tlb_batch);
+
+	if (batch->active) {
+		if (batch->index) {
+			__flush_tlb_pending(batch);
+		}
+		batch->active = 0;
+	}
+#ifdef CONFIG_PREEMPT_RT
+	preempt_enable();
+#endif
+	put_cpu_var(ppc64_tlb_batch);
 }
 
 #define arch_flush_lazy_mmu_mode()      do {} while (0)
Index: linux-2.6.25.4-rt4/include/asm-powerpc/tlb.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/tlb.h	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/tlb.h	2008-05-29 09:47:15.000000000 -0400
@@ -46,8 +46,11 @@ static inline void tlb_flush(struct mmu_
 	 * pages are going to be freed and we really don't want to have a CPU
 	 * access a freed page because it has a stale TLB
 	 */
-	if (tlbbatch->index)
+	if (tlbbatch->index) {
+		preempt_disable();
 		__flush_tlb_pending(tlbbatch);
+		preempt_enable();
+	}
 
 	pte_free_finish();
 }
Index: linux-2.6.25.4-rt4/drivers/serial/mpc52xx_uart.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/serial/mpc52xx_uart.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/serial/mpc52xx_uart.c	2008-05-29 09:47:16.000000000 -0400
@@ -783,7 +783,9 @@ mpc52xx_uart_int_rx_chars(struct uart_po
 		}
 	}
 
+	spin_unlock(&port->lock);
 	tty_flip_buffer_push(tty);
+	spin_lock(&port->lock);
 
 	return psc_ops->raw_rx_rdy(port);
 }
Index: linux-2.6.25.4-rt4/include/asm-mips/mach-tx49xx/cpu-feature-overrides.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/mach-tx49xx/cpu-feature-overrides.h	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/mach-tx49xx/cpu-feature-overrides.h	2008-05-29 09:47:17.000000000 -0400
@@ -1,6 +1,13 @@
 #ifndef __ASM_MACH_TX49XX_CPU_FEATURE_OVERRIDES_H
 #define __ASM_MACH_TX49XX_CPU_FEATURE_OVERRIDES_H
 
+/* finish_arch_switch_empty is defined if we know finish_arch_switch() will
+ * be empty, based on the lack of features defined in this file.  This is
+ * needed because config preempt will barf in kernel/sched.c ifdef
+ * finish_arch_switch
+ */
+#define finish_arch_switch_empty
+
 #define cpu_has_llsc	1
 #define cpu_has_64bits	1
 #define cpu_has_inclusive_pcaches	0
Index: linux-2.6.25.4-rt4/include/asm-mips/system.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-mips/system.h	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-mips/system.h	2008-05-29 09:47:17.000000000 -0400
@@ -70,6 +70,8 @@ do {									\
 	(last) = resume(prev, next, task_thread_info(next));		\
 } while (0)
 
+/* preempt kernel barfs in kernel/sched.c ifdef finish_arch_switch */
+#ifndef finish_arch_switch_empty
 #define finish_arch_switch(prev)					\
 do {									\
 	if (cpu_has_dsp)						\
@@ -77,6 +79,7 @@ do {									\
 	if (cpu_has_userlocal)						\
 		write_c0_userlocal(current_thread_info()->tp_value);	\
 } while (0)
+#endif
 
 static inline unsigned long __xchg_u32(volatile int * m, unsigned int val)
 {
Index: linux-2.6.25.4-rt4/arch/mips/kernel/gdb-stub.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/mips/kernel/gdb-stub.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/mips/kernel/gdb-stub.c	2008-05-29 09:47:17.000000000 -0400
@@ -176,7 +176,7 @@ int kgdb_enabled;
  * spin locks for smp case
  */
 static DEFINE_SPINLOCK(kgdb_lock);
-static raw_spinlock_t kgdb_cpulock[NR_CPUS] = {
+static __raw_spinlock_t kgdb_cpulock[NR_CPUS] = {
 	[0 ... NR_CPUS-1] = __RAW_SPIN_LOCK_UNLOCKED,
 };
 
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/setup_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/setup_32.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/setup_32.c	2008-05-29 09:47:18.000000000 -0400
@@ -88,7 +88,7 @@ int ucache_bsize;
  * from the address that it was linked at, so we must use RELOC/PTRRELOC
  * to access static data (including strings).  -- paulus
  */
-unsigned long __init early_init(unsigned long dt_ptr)
+notrace unsigned long __init early_init(unsigned long dt_ptr)
 {
 	unsigned long offset = reloc_offset();
 	struct cpu_spec *spec;
@@ -296,3 +296,22 @@ void __init setup_arch(char **cmdline_p)
 
 	paging_init();
 }
+
+#ifdef CONFIG_STACKTRACE
+#include <linux/stacktrace.h>
+void notrace save_stack_trace(struct stack_trace *trace)
+{
+}
+#endif /* CONFIG_STACKTRACE */
+
+#ifdef CONFIG_EARLY_PRINTK
+void notrace early_printk(const char *fmt, ...)
+{
+	BUG();
+}
+#endif /* CONFIG_EARLY_PRINTK */
+
+#ifdef CONFIG_MCOUNT
+extern void _mcount(void);
+EXPORT_SYMBOL(_mcount);
+#endif /* CONFIG_MCOUNT */
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/cputable.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/cputable.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/cputable.c	2008-05-29 09:47:18.000000000 -0400
@@ -1482,7 +1482,7 @@ static struct cpu_spec __initdata cpu_sp
 
 static struct cpu_spec the_cpu_spec;
 
-struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
+notrace struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
 {
 	struct cpu_spec *s = cpu_specs;
 	struct cpu_spec *t = &the_cpu_spec;
@@ -1529,7 +1529,7 @@ struct cpu_spec * __init identify_cpu(un
 	return NULL;
 }
 
-void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
+notrace void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 {
 	struct fixup_entry {
 		unsigned long	mask;
Index: linux-2.6.25.4-rt4/arch/powerpc/kernel/io.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/powerpc/kernel/io.c	2008-05-29 09:45:35.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/powerpc/kernel/io.c	2008-05-29 09:47:18.000000000 -0400
@@ -120,7 +120,7 @@ EXPORT_SYMBOL(_outsl_ns);
 
 #define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
 
-void _memset_io(volatile void __iomem *addr, int c, unsigned long n)
+notrace void _memset_io(volatile void __iomem *addr, int c, unsigned long n)
 {
 	void *p = (void __force *)addr;
 	u32 lc = c;
Index: linux-2.6.25.4-rt4/drivers/char/vt.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/char/vt.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/char/vt.c	2008-05-29 09:47:18.000000000 -0400
@@ -2498,7 +2498,7 @@ static struct console vt_console_driver 
 	.write		= vt_console_print,
 	.device		= vt_console_device,
 	.unblank	= unblank_screen,
-	.flags		= CON_PRINTBUFFER,
+	.flags		= CON_PRINTBUFFER | CON_ATOMIC,
 	.index		= -1,
 };
 #endif
Index: linux-2.6.25.4-rt4/kernel/kthread.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/kthread.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/kthread.c	2008-05-29 09:47:19.000000000 -0400
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k,
 	wait_task_inactive(k);
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
+	k->rt.nr_cpus_allowed = 1;
 }
 EXPORT_SYMBOL(kthread_bind);
 
Index: linux-2.6.25.4-rt4/include/asm-powerpc/pgtable-ppc64.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/asm-powerpc/pgtable-ppc64.h	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/include/asm-powerpc/pgtable-ppc64.h	2008-05-29 09:47:19.000000000 -0400
@@ -277,8 +277,15 @@ static inline unsigned long pte_update(s
 	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY)
 	: "cc" );
 
-	if (old & _PAGE_HASHPTE)
+	if (old & _PAGE_HASHPTE) {
+#ifdef CONFIG_PREEMPT_RT
+		preempt_disable();
+#endif
 		hpte_need_flush(mm, addr, ptep, old, huge);
+#ifdef CONFIG_PREEMPT_RT
+		preempt_enable();
+#endif
+	}
 	return old;
 }
 
Index: linux-2.6.25.4-rt4/net/9p/util.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/9p/util.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/net/9p/util.c	2008-05-29 09:47:20.000000000 -0400
@@ -71,7 +71,7 @@ int p9_idpool_get(struct p9_idpool *p)
 {
 	int i = 0;
 	int error;
-	unsigned int flags;
+	unsigned long flags;
 
 retry:
 	if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
@@ -102,7 +102,7 @@ EXPORT_SYMBOL(p9_idpool_get);
 
 void p9_idpool_put(int id, struct p9_idpool *p)
 {
-	unsigned int flags;
+	unsigned long flags;
 	spin_lock_irqsave(&p->lock, flags);
 	idr_remove(&p->pool, id);
 	spin_unlock_irqrestore(&p->lock, flags);
Index: linux-2.6.25.4-rt4/net/9p/trans_virtio.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/9p/trans_virtio.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/net/9p/trans_virtio.c	2008-05-29 09:47:20.000000000 -0400
@@ -134,7 +134,7 @@ static void p9_virtio_close(struct p9_tr
 {
 	struct virtio_chan *chan = trans->priv;
 	int count;
-	unsigned int flags;
+	unsigned long flags;
 
 	spin_lock_irqsave(&chan->lock, flags);
 	p9_idpool_destroy(chan->tagpool);
Index: linux-2.6.25.4-rt4/net/mac80211/rc80211_pid_debugfs.c
===================================================================
--- linux-2.6.25.4-rt4.orig/net/mac80211/rc80211_pid_debugfs.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/net/mac80211/rc80211_pid_debugfs.c	2008-05-29 09:47:20.000000000 -0400
@@ -85,7 +85,7 @@ static int rate_control_pid_events_open(
 	struct rc_pid_sta_info *sinfo = inode->i_private;
 	struct rc_pid_event_buffer *events = &sinfo->events;
 	struct rc_pid_events_file_info *file_info;
-	unsigned int status;
+	unsigned long status;
 
 	/* Allocate a state struct */
 	file_info = kmalloc(sizeof(*file_info), GFP_KERNEL);
@@ -135,7 +135,7 @@ static ssize_t rate_control_pid_events_r
 	char pb[RC_PID_PRINT_BUF_SIZE];
 	int ret;
 	int p;
-	unsigned int status;
+	unsigned long status;
 
 	/* Check if there is something to read. */
 	if (events->next_entry == file_info->next_entry) {
Index: linux-2.6.25.4-rt4/drivers/media/video/saa7134/saa7134-video.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/media/video/saa7134/saa7134-video.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/media/video/saa7134/saa7134-video.c	2008-05-29 09:47:20.000000000 -0400
@@ -1639,7 +1639,7 @@ static int saa7134_s_fmt_overlay(struct 
 	struct saa7134_fh *fh = priv;
 	struct saa7134_dev *dev = fh->dev;
 	int err;
-	unsigned int flags;
+	unsigned long flags;
 
 	if (saa7134_no_overlay > 0) {
 		printk(KERN_ERR "V4L2_BUF_TYPE_VIDEO_OVERLAY: no_overlay\n");
Index: linux-2.6.25.4-rt4/drivers/rtc/rtc-ds1511.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/rtc/rtc-ds1511.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/rtc/rtc-ds1511.c	2008-05-29 09:47:20.000000000 -0400
@@ -185,7 +185,7 @@ ds1511_wdog_disable(void)
 ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm)
 {
 	u8 mon, day, dow, hrs, min, sec, yrs, cen;
-	unsigned int flags;
+	unsigned long flags;
 
 	/*
 	 * won't have to change this for a while
@@ -249,7 +249,7 @@ ds1511_rtc_set_time(struct device *dev, 
 ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm)
 {
 	unsigned int century;
-	unsigned int flags;
+	unsigned long flags;
 
 	spin_lock_irqsave(&ds1511_lock, flags);
 	rtc_disable_update();
Index: linux-2.6.25.4-rt4/drivers/usb/serial/iuu_phoenix.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/usb/serial/iuu_phoenix.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/usb/serial/iuu_phoenix.c	2008-05-29 09:47:20.000000000 -0400
@@ -635,7 +635,7 @@ static void read_buf_callback(struct urb
 static int iuu_bulk_write(struct usb_serial_port *port)
 {
 	struct iuu_private *priv = usb_get_serial_port_data(port);
-	unsigned int flags;
+	unsigned long flags;
 	int result;
 	int i;
 	char *buf_ptr = port->write_urb->transfer_buffer;
@@ -686,7 +686,7 @@ static void iuu_uart_read_callback(struc
 {
 	struct usb_serial_port *port = (struct usb_serial_port *)urb->context;
 	struct iuu_private *priv = usb_get_serial_port_data(port);
-	unsigned int flags;
+	unsigned long flags;
 	int status;
 	int error = 0;
 	int len = 0;
@@ -751,7 +751,7 @@ static int iuu_uart_write(struct usb_ser
 			  int count)
 {
 	struct iuu_private *priv = usb_get_serial_port_data(port);
-	unsigned int flags;
+	unsigned long flags;
 	dbg("%s - enter", __FUNCTION__);
 
 	if (count > 256)
Index: linux-2.6.25.4-rt4/include/linux/pcounter.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/pcounter.h	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/pcounter.h	2008-05-29 09:47:21.000000000 -0400
@@ -28,7 +28,9 @@ struct pcounter {
 static DEFINE_PER_CPU(int, NAME##_pcounter_values);			\
 static void NAME##_pcounter_add(struct pcounter *self, int val)		\
 {									\
-       __get_cpu_var(NAME##_pcounter_values) += val;			\
+	preempt_disable();						\
+	__get_cpu_var(NAME##_pcounter_values) += val;			\
+	preempt_enable();						\
 }									\
 static int NAME##_pcounter_getval(const struct pcounter *self, int cpu)	\
 {									\
Index: linux-2.6.25.4-rt4/drivers/pci/pci.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/pci/pci.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/pci/pci.c	2008-05-29 09:47:22.000000000 -0400
@@ -170,6 +170,42 @@ int pci_find_capability(struct pci_dev *
 }
 
 /**
+ * pci_find_capability_cached - query for devices' capabilities, cached version
+ * @dev: PCI device to query
+ * @cap: capability code
+ *
+ * Tell if a device supports a given PCI capability.
+ * Returns the address of the requested capability structure within the
+ * device's PCI configuration space or 0 in case the device does not
+ * support it.  Possible values for @cap:
+ *
+ *  %PCI_CAP_ID_PM           Power Management
+ *  %PCI_CAP_ID_AGP          Accelerated Graphics Port
+ *  %PCI_CAP_ID_VPD          Vital Product Data
+ *  %PCI_CAP_ID_SLOTID       Slot Identification
+ *  %PCI_CAP_ID_MSI          Message Signalled Interrupts
+ *  %PCI_CAP_ID_CHSWP        CompactPCI HotSwap
+ *  %PCI_CAP_ID_PCIX         PCI-X
+ *  %PCI_CAP_ID_EXP          PCI Express
+ */
+int pci_find_capability_cached(struct pci_dev *dev, int cap)
+{
+       int pos = 0;
+
+       WARN_ON_ONCE(cap <= 0 || cap > PCI_CAP_LIST_NR_ENTRIES);
+
+       if (cap <= PCI_CAP_LIST_NR_ENTRIES) {
+               const int i = cap - 1;
+               if (dev->cached_capabilities[i] == -1)
+                       dev->cached_capabilities[i] = pci_find_capability(dev, cap);
+
+               pos = dev->cached_capabilities[i];
+       }
+
+       return pos;
+}
+
+/**
  * pci_bus_find_capability - query for devices' capabilities 
  * @bus:   the PCI bus to query
  * @devfn: PCI device to query
Index: linux-2.6.25.4-rt4/drivers/pci/probe.c
===================================================================
--- linux-2.6.25.4-rt4.orig/drivers/pci/probe.c	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/drivers/pci/probe.c	2008-05-29 09:47:22.000000000 -0400
@@ -843,6 +843,7 @@ static void pci_release_bus_bridge_dev(s
 
 struct pci_dev *alloc_pci_dev(void)
 {
+	int i;
 	struct pci_dev *dev;
 
 	dev = kzalloc(sizeof(struct pci_dev), GFP_KERNEL);
@@ -852,6 +853,9 @@ struct pci_dev *alloc_pci_dev(void)
 	INIT_LIST_HEAD(&dev->global_list);
 	INIT_LIST_HEAD(&dev->bus_list);
 
+	for (i = 0; i < ARRAY_SIZE(dev->cached_capabilities); ++i)
+		dev->cached_capabilities[i] = -1;
+
 	pci_msi_init_pci_dev(dev);
 
 	return dev;
Index: linux-2.6.25.4-rt4/include/linux/pci.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/pci.h	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/pci.h	2008-05-29 09:47:22.000000000 -0400
@@ -190,6 +190,7 @@ struct pci_dev {
 	unsigned int	msix_enabled:1;
 	unsigned int	is_managed:1;
 	unsigned int	is_pcie:1;
+	int             cached_capabilities[PCI_CAP_LIST_NR_ENTRIES]; /* See pci_find_capability_cached */
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
@@ -509,6 +510,7 @@ struct pci_dev __deprecated *pci_find_sl
 #endif /* CONFIG_PCI_LEGACY */
 
 int pci_find_capability(struct pci_dev *dev, int cap);
+int pci_find_capability_cached(struct pci_dev *dev, int cap);
 int pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap);
 int pci_find_ext_capability(struct pci_dev *dev, int cap);
 int pci_find_ht_capability(struct pci_dev *dev, int ht_cap);
@@ -872,6 +874,10 @@ static inline int pci_find_capability(st
 	return 0;
 }
 
+static inline int pci_find_capability_cached(struct pci_dev *dev, int cap) {
+	return 0;
+}
+
 static inline int pci_find_next_capability(struct pci_dev *dev, u8 post,
 					   int cap)
 {
Index: linux-2.6.25.4-rt4/include/linux/pci_regs.h
===================================================================
--- linux-2.6.25.4-rt4.orig/include/linux/pci_regs.h	2008-05-29 09:45:34.000000000 -0400
+++ linux-2.6.25.4-rt4/include/linux/pci_regs.h	2008-05-29 09:47:22.000000000 -0400
@@ -210,6 +210,7 @@
 #define  PCI_CAP_ID_AGP3	0x0E	/* AGP Target PCI-PCI bridge */
 #define  PCI_CAP_ID_EXP 	0x10	/* PCI Express */
 #define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
+#define PCI_CAP_LIST_NR_ENTRIES PCI_CAP_ID_MSIX
 #define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
 #define PCI_CAP_FLAGS		2	/* Capability defined flags (16 bits) */
 #define PCI_CAP_SIZEOF		4
Index: linux-2.6.25.4-rt4/kernel/rtmutex_common.h
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/rtmutex_common.h	2008-05-29 09:45:33.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/rtmutex_common.h	2008-05-29 09:47:27.000000000 -0400
@@ -13,6 +13,7 @@
 #define __KERNEL_RTMUTEX_COMMON_H
 
 #include <linux/rtmutex.h>
+#include <linux/rt_lock.h>
 
 /*
  * The rtmutex in kernel tester is independent of rtmutex debugging. We
@@ -43,12 +44,14 @@ extern void schedule_rt_mutex_test(struc
  * @list_entry:		pi node to enqueue into the mutex waiters list
  * @pi_list_entry:	pi node to enqueue into the mutex owner waiters list
  * @task:		task reference to the blocked task
+ * @write_lock:		true if blocked as writer
  */
 struct rt_mutex_waiter {
 	struct plist_node	list_entry;
 	struct plist_node	pi_list_entry;
 	struct task_struct	*task;
 	struct rt_mutex		*lock;
+	int			write_lock;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 	unsigned long		ip;
 	struct pid		*deadlock_task_pid;
@@ -112,6 +115,65 @@ static inline unsigned long rt_mutex_own
 	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * rw_mutex->owner state tracking
+ */
+#define RT_RWLOCK_CHECK		1UL
+#define RT_RWLOCK_WRITER	2UL
+#define RT_RWLOCK_MASKALL	3UL
+
+/* used as reader owner of the mutex */
+#define RT_RW_READER		(struct task_struct *)0x100
+
+/* used when a writer releases the lock with waiters */
+/*   pending owner is a reader */
+#define RT_RW_PENDING_READ	(struct task_struct *)0x200
+/*   pending owner is a writer */
+#define RT_RW_PENDING_WRITE	(struct task_struct *)0x400
+/* Either of the above is true */
+#define RT_RW_PENDING_MASK	(0x600 | RT_RWLOCK_MASKALL)
+
+/* Return true if lock is not owned but has pending owners */
+static inline int rt_rwlock_pending(struct rw_mutex *rwm)
+{
+	unsigned long owner = (unsigned long)rwm->owner;
+	return (owner & RT_RW_PENDING_MASK) == owner;
+}
+
+static inline int rt_rwlock_pending_writer(struct rw_mutex *rwm)
+{
+	unsigned long owner = (unsigned long)rwm->owner;
+	return rt_rwlock_pending(rwm) &&
+		(owner & (unsigned long)RT_RW_PENDING_WRITE);
+}
+
+static inline struct task_struct *rt_rwlock_owner(struct rw_mutex *rwm)
+{
+	return (struct task_struct *)
+		((unsigned long)rwm->owner & ~RT_RWLOCK_MASKALL);
+}
+
+static inline unsigned long rt_rwlock_writer(struct rw_mutex *rwm)
+{
+	return (unsigned long)rwm->owner & RT_RWLOCK_WRITER;
+}
+
+extern void rt_mutex_up_write(struct rw_mutex *rwm);
+extern void rt_mutex_up_read(struct rw_mutex *rwm);
+extern int rt_mutex_down_write_trylock(struct rw_mutex *rwm);
+extern void rt_mutex_down_write(struct rw_mutex *rwm);
+extern int rt_mutex_down_read_trylock(struct rw_mutex *rwm);
+extern void rt_mutex_down_read(struct rw_mutex *rwm);
+extern void rt_mutex_rwsem_init(struct rw_mutex *rwm, const char *name);
+extern void rt_rwlock_write_lock(struct rw_mutex *rwm);
+extern void rt_rwlock_read_lock(struct rw_mutex *rwm);
+extern void rt_rwlock_write_unlock(struct rw_mutex *rwm);
+extern void rt_rwlock_read_unlock(struct rw_mutex *rwm);
+extern void rt_mutex_downgrade_write(struct rw_mutex *rwm);
+
+#endif /* CONFIG_PREEMPT_RT */
+
 /*
  * PI-futex support (proxy locking functions, etc.):
  */
@@ -121,6 +183,25 @@ extern void rt_mutex_init_proxy_locked(s
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 				  struct task_struct *proxy_owner);
 
+
+#define STEAL_LATERAL 1
+#define STEAL_NORMAL  0
+
+/*
+ * Note that RT tasks are excluded from lateral-steals to prevent the
+ * introduction of an unbounded latency
+ */
+static inline int lock_is_stealable(struct task_struct *pendowner, int mode)
+{
+    if (mode == STEAL_NORMAL || rt_task(current)) {
+	    if (current->prio >= pendowner->prio)
+		    return 0;
+    } else if (current->prio > pendowner->prio)
+	    return 0;
+
+    return 1;
+}
+
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
 #else
Index: linux-2.6.25.4-rt4/kernel/lockdep_proc.c
===================================================================
--- linux-2.6.25.4-rt4.orig/kernel/lockdep_proc.c	2008-05-29 09:45:32.000000000 -0400
+++ linux-2.6.25.4-rt4/kernel/lockdep_proc.c	2008-05-29 09:47:29.000000000 -0400
@@ -516,6 +516,10 @@ static void seq_stats(struct seq_file *m
 static void seq_header(struct seq_file *m)
 {
 	seq_printf(m, "lock_stat version 0.2\n");
+
+	if (unlikely(!debug_locks))
+		seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
+
 	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
 	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
 			"%14s %14s\n",
Index: linux-2.6.25.4-rt4/.gitignore
===================================================================
--- linux-2.6.25.4-rt4.orig/.gitignore	2008-05-29 09:45:32.000000000 -0400
+++ linux-2.6.25.4-rt4/.gitignore	2008-05-29 09:47:29.000000000 -0400
@@ -28,6 +28,7 @@ vmlinux*
 !vmlinux.lds.S
 System.map
 Module.symvers
+Module.markers
 !.gitignore
 
 #
Index: linux-2.6.25.4-rt4/scripts/.gitignore
===================================================================
--- linux-2.6.25.4-rt4.orig/scripts/.gitignore	2008-05-29 09:45:32.000000000 -0400
+++ linux-2.6.25.4-rt4/scripts/.gitignore	2008-05-29 09:47:30.000000000 -0400
@@ -7,3 +7,4 @@ pnmtologo
 bin2c
 unifdef
 binoffset
+testlpp
Index: linux-2.6.25.4-rt4/arch/x86/lib/delay_32.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/lib/delay_32.c	2008-05-29 09:45:31.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/lib/delay_32.c	2008-05-29 09:47:31.000000000 -0400
@@ -44,13 +44,36 @@ static void delay_loop(unsigned long loo
 static void delay_tsc(unsigned long loops)
 {
 	unsigned long bclock, now;
+	int cpu;
 
-	preempt_disable();		/* TSC's are per-cpu */
+	preempt_disable();
+	cpu = smp_processor_id();
 	rdtscl(bclock);
-	do {
-		rep_nop();
+	for (;;) {
 		rdtscl(now);
-	} while ((now-bclock) < loops);
+		if ((now - bclock) >= loops)
+			break;
+
+		/* Allow RT tasks to run */
+		preempt_enable();
+		rep_nop();
+		preempt_disable();
+
+		/*
+		 * It is possible that we moved to another CPU, and
+		 * since TSC's are per-cpu we need to calculate
+		 * that. The delay must guarantee that we wait "at
+		 * least" the amount of time. Being moved to another
+		 * CPU could make the wait longer but we just need to
+		 * make sure we waited long enough. Rebalance the
+		 * counter for this CPU.
+		 */
+		if (unlikely(cpu != smp_processor_id())) {
+			loops -= (now - bclock);
+			cpu = smp_processor_id();
+			rdtscl(bclock);
+		}
+	}
 	preempt_enable();
 }
 
Index: linux-2.6.25.4-rt4/arch/x86/lib/delay_64.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/x86/lib/delay_64.c	2008-05-29 09:45:31.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/x86/lib/delay_64.c	2008-05-29 09:47:31.000000000 -0400
@@ -31,14 +31,36 @@ int __devinit read_current_timer(unsigne
 void __delay(unsigned long loops)
 {
 	unsigned bclock, now;
+	int cpu;
 
-	preempt_disable();		/* TSC's are pre-cpu */
+	preempt_disable();
+	cpu = smp_processor_id();
 	rdtscl(bclock);
-	do {
-		rep_nop(); 
+	for (;;) {
 		rdtscl(now);
+		if ((now - bclock) >= loops)
+			break;
+
+		/* Allow RT tasks to run */
+		preempt_enable();
+		rep_nop();
+		preempt_disable();
+
+		/*
+		 * It is possible that we moved to another CPU, and
+		 * since TSC's are per-cpu we need to calculate
+		 * that. The delay must guarantee that we wait "at
+		 * least" the amount of time. Being moved to another
+		 * CPU could make the wait longer but we just need to
+		 * make sure we waited long enough. Rebalance the
+		 * counter for this CPU.
+		 */
+		if (unlikely(cpu != smp_processor_id())) {
+			loops -= (now - bclock);
+			cpu = smp_processor_id();
+			rdtscl(bclock);
+		}
 	}
-	while ((now-bclock) < loops);
 	preempt_enable();
 }
 EXPORT_SYMBOL(__delay);
Index: linux-2.6.25.4-rt4/arch/arm/plat-omap/clock.c
===================================================================
--- linux-2.6.25.4-rt4.orig/arch/arm/plat-omap/clock.c	2008-05-29 09:45:31.000000000 -0400
+++ linux-2.6.25.4-rt4/arch/arm/plat-omap/clock.c	2008-05-29 09:47:33.000000000 -0400
@@ -160,15 +160,12 @@ EXPORT_SYMBOL(clk_get_usecount);
 
 unsigned long clk_get_rate(struct clk *clk)
 {
-	unsigned long flags;
 	unsigned long ret = 0;
 
 	if (clk == NULL || IS_ERR(clk))
 		return 0;
 
-	spin_lock_irqsave(&clockfw_lock, flags);
 	ret = clk->rate;
-	spin_unlock_irqrestore(&clockfw_lock, flags);
 
 	return ret;
 }