From ad26d40d82b537dd00626e036f02bec1f0a25d47 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 3 Jul 2009 08:30:08 -0500
Subject: [PATCH] net: preempt-rt support

commit 0f2c3c2b4cbac06fa3080bc350b0defb9d0f525e in tip.

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
Signed-off-by: Paul Gortmaker
---
 include/linux/netdevice.h       |    8 ++--
 net/core/dev.c                  |   41 ++++++++++++++++++++++----
 net/core/netpoll.c              |   60 +++++++++++++++++++++++++-------------
 net/ipv4/icmp.c                 |    5 ++-
 net/ipv4/route.c                |    4 +-
 net/ipv6/netfilter/ip6_tables.c |    2 +-
 net/sched/sch_generic.c         |   13 ++++++--
 7 files changed, 94 insertions(+), 39 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa8b476..6be4dde 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1790,14 +1790,14 @@ static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
 {
 	spin_lock_bh(&txq->_xmit_lock);
-	txq->xmit_lock_owner = smp_processor_id();
+	txq->xmit_lock_owner = raw_smp_processor_id();
 }
 
 static inline int __netif_tx_trylock(struct netdev_queue *txq)
 {
 	int ok = spin_trylock(&txq->_xmit_lock);
 	if (likely(ok))
-		txq->xmit_lock_owner = smp_processor_id();
+		txq->xmit_lock_owner = raw_smp_processor_id();
 	return ok;
 }
 
@@ -1831,7 +1831,7 @@ static inline void netif_tx_lock(struct net_device *dev)
 	int cpu;
 
 	spin_lock(&dev->tx_global_lock);
-	cpu = smp_processor_id();
+	cpu = raw_smp_processor_id();
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
@@ -1894,7 +1894,7 @@ static inline void netif_tx_disable(struct net_device *dev)
 	int cpu;
 
 	local_bh_disable();
-	cpu = smp_processor_id();
+	cpu = raw_smp_processor_id();
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 6b1cc19..07b9c01 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2130,9 +2130,14 @@ gso:
 	   Either shot noqueue qdisc, it is even simpler 8)
 	 */
 	if (dev->flags & IFF_UP) {
-		int cpu = smp_processor_id(); /* ok because BHs are off */
-
+		/*
+		 * No need to check for recursion with threaded interrupts:
+		 */
+#ifdef CONFIG_PREEMPT_RT
+		if (1) {
+#else
 		if (txq->xmit_lock_owner != cpu) {
+#endif
 
 			HARD_TX_LOCK(dev, txq, cpu);
 
@@ -2251,7 +2256,8 @@ EXPORT_SYMBOL(netif_rx_ni);
 
 static void net_tx_action(struct softirq_action *h)
 {
-	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	struct softnet_data *sd = &per_cpu(softnet_data,
+					   raw_smp_processor_id());
 
 	if (sd->completion_queue) {
 		struct sk_buff *clist;
@@ -2267,6 +2273,11 @@ static void net_tx_action(struct softirq_action *h)
 
 			WARN_ON(atomic_read(&skb->users));
 			__kfree_skb(skb);
+			/*
+			 * Safe to reschedule - the list is private
+			 * at this point.
+			 */
+			cond_resched_softirq_context();
 		}
 	}
 
@@ -2285,6 +2296,22 @@ static void net_tx_action(struct softirq_action *h)
 			head = head->next_sched;
 
 			root_lock = qdisc_lock(q);
+			/*
+			 * We are executing in softirq context here, and
+			 * if softirqs are preemptible, we must avoid
+			 * infinite reactivation of the softirq by
+			 * either the tx handler, or by netif_schedule().
+			 * (it would result in an infinitely looping
+			 *  softirq context)
+			 * So we take the spinlock unconditionally.
+			 */
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+			spin_lock(root_lock);
+			smp_mb__before_clear_bit();
+			clear_bit(__QDISC_STATE_SCHED, &q->state);
+			qdisc_run(q);
+			spin_unlock(root_lock);
+#else
 			if (spin_trylock(root_lock)) {
 				smp_mb__before_clear_bit();
 				clear_bit(__QDISC_STATE_SCHED,
@@ -2301,6 +2328,7 @@ static void net_tx_action(struct softirq_action *h)
 						      &q->state);
 				}
 			}
+#endif
 		}
 	}
 }
@@ -2516,7 +2544,7 @@ int netif_receive_skb(struct sk_buff *skb)
 			skb->dev = master;
 	}
 
-	__get_cpu_var(netdev_rx_stat).total++;
+	per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++;
 
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
@@ -2913,9 +2941,10 @@ EXPORT_SYMBOL(napi_gro_frags);
 static int process_backlog(struct napi_struct *napi, int quota)
 {
 	int work = 0;
-	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct softnet_data *queue;
 	unsigned long start_time = jiffies;
 
+	queue = &per_cpu(softnet_data, raw_smp_processor_id());
 	napi->weight = weight_p;
 	do {
 		struct sk_buff *skb;
@@ -2947,7 +2976,7 @@ void __napi_schedule(struct napi_struct *n)
 
 	local_irq_save(flags);
 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	raise_softirq_irqoff(NET_RX_SOFTIRQ);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(__napi_schedule);
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a58f59b..2fbd53b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -74,20 +74,20 @@ static void queue_process(struct work_struct *work)
 		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		__netif_tx_lock(txq, smp_processor_id());
 		if (netif_tx_queue_stopped(txq) ||
 		    netif_tx_queue_frozen(txq) ||
 		    ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
 			skb_queue_head(&npinfo->txq, skb);
 			__netif_tx_unlock(txq);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 
 			schedule_delayed_work(&npinfo->tx_work, HZ/10);
 			return;
 		}
 		__netif_tx_unlock(txq);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 }
 
@@ -158,7 +158,7 @@ static void poll_napi(struct net_device *dev)
 	int budget = 16;
 
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
-		if (napi->poll_owner != smp_processor_id() &&
+		if (napi->poll_owner != raw_smp_processor_id() &&
 		    spin_trylock(&napi->poll_lock)) {
 			budget = poll_one_napi(dev->npinfo, napi, budget);
 			spin_unlock(&napi->poll_lock);
@@ -219,30 +219,35 @@ static void refill_skbs(void)
 
 static void zap_completion_queue(void)
 {
-	unsigned long flags;
 	struct softnet_data *sd = &get_cpu_var(softnet_data);
+	struct sk_buff *clist = NULL;
+	unsigned long flags;
 
 	if (sd->completion_queue) {
-		struct sk_buff *clist;
-
 		local_irq_save(flags);
 		clist = sd->completion_queue;
 		sd->completion_queue = NULL;
 		local_irq_restore(flags);
-
-		while (clist != NULL) {
-			struct sk_buff *skb = clist;
-			clist = clist->next;
-			if (skb->destructor) {
-				atomic_inc(&skb->users);
-				dev_kfree_skb_any(skb); /* put this one back */
-			} else {
-				__kfree_skb(skb);
-			}
-		}
 	}
+
+	/*
+	 * Took the list private, can drop our softnet
+	 * reference:
+	 */
 	put_cpu_var(softnet_data);
+
+	while (clist != NULL) {
+		struct sk_buff *skb = clist;
+		clist = clist->next;
+		if (skb->destructor) {
+			atomic_inc(&skb->users);
+			dev_kfree_skb_any(skb); /* put this one back */
+		} else {
+			__kfree_skb(skb);
+		}
+	}
 }
 
 static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
@@ -250,13 +255,26 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
 	int count = 0;
 	struct sk_buff *skb;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * On -rt skb_pool.lock is schedulable, so if we are
+	 * in an atomic context we just try to dequeue from the
+	 * pool and fail if we cannot get one.
+	 */
+	if (in_atomic() || irqs_disabled())
+		goto pick_atomic;
+#endif
 	zap_completion_queue();
 	refill_skbs();
 repeat:
 
 	skb = alloc_skb(len, GFP_ATOMIC);
-	if (!skb)
+	if (!skb) {
+#ifdef CONFIG_PREEMPT_RT
+pick_atomic:
+#endif
 		skb = skb_dequeue(&skb_pool);
+	}
 
 	if (!skb) {
 		if (++count < 10) {
@@ -276,7 +294,7 @@ static int netpoll_owner_active(struct net_device *dev)
 	struct napi_struct *napi;
 
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
-		if (napi->poll_owner == smp_processor_id())
+		if (napi->poll_owner == raw_smp_processor_id())
 			return 1;
 	}
 	return 0;
@@ -302,7 +320,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 
 		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		/* try until next clock tick */
 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
 		     tries > 0; --tries) {
@@ -329,7 +347,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 				    "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n",
 				    dev->name, ops->ndo_start_xmit);
 
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 
 	if (status != NETDEV_TX_OK) {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index ac4dec1..e4d0cc4 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -202,7 +202,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
  */
 static struct sock *icmp_sk(struct net *net)
 {
-	return net->ipv4.icmp_sk[smp_processor_id()];
+	/*
+	 * Should be safe on PREEMPT_SOFTIRQS/HARDIRQS to use raw-smp-processor-id:
+	 */
+	return net->ipv4.icmp_sk[raw_smp_processor_id()];
 }
 
 static inline struct sock *icmp_xmit_lock(struct net *net)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 71d4e5b..f322a91 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -204,13 +204,13 @@ struct rt_hash_bucket {
 };
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
+	defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT)
 /*
  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
  * The size of this table is a power of two and depends on the number of CPUS.
  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
  */
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
 # define RT_HASH_LOCK_SZ	256
 #else
 # if NR_CPUS >= 32
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 9210e31..053133a 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -384,7 +384,7 @@ ip6t_do_table(struct sk_buff *skb,
 
 	xt_info_rdlock_bh();
 	private = table->private;
-	table_base = private->entries[smp_processor_id()];
+	table_base = private->entries[raw_smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ff4dd53..6d6a2ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -12,6 +12,7 @@
  */
 
 #include
+#include
 #include
 #include
 #include
@@ -25,6 +26,7 @@
 #include
 #include
 #include
+#include
 #include
 
 /* Main transmission queue. */
@@ -77,7 +79,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
 {
 	int ret;
 
-	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
+	if (unlikely(dev_queue->xmit_lock_owner == raw_smp_processor_id())) {
 		/*
 		 * Same CPU holding the lock. It may be a transient
 		 * configuration error, when hard_start_xmit() recurses. We
@@ -119,7 +121,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	/* And release qdisc */
 	spin_unlock(root_lock);
 
-	HARD_TX_LOCK(dev, txq, smp_processor_id());
+	HARD_TX_LOCK(dev, txq, raw_smp_processor_id());
 	if (!netif_tx_queue_stopped(txq) &&
 	    !netif_tx_queue_frozen(txq))
 		ret = dev_hard_start_xmit(skb, dev, txq);
@@ -787,9 +789,12 @@ void dev_deactivate(struct net_device *dev)
 	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
 	synchronize_rcu();
 
-	/* Wait for outstanding qdisc_run calls. */
+	/*
+	 * Wait for outstanding qdisc_run calls.
+	 * TODO: shouldn't this be wakeup-based, instead of polling it?
+	 */
 	while (some_qdisc_is_busy(dev))
-		yield();
+		msleep(1);
 }
 
 static void dev_init_scheduler_queue(struct net_device *dev,
-- 
1.7.0.4
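
Note on the net_tx_action() change: the mainline trylock-or-reschedule pattern
assumes the qdisc lock holder runs on another CPU and cannot be preempted, so
retrying later is cheap and finite. Once softirqs run in preemptible thread
context, the holder can stay preempted indefinitely while the softirq keeps
reactivating itself. The sketch below is a userspace analogy of the two
patterns, not kernel code; all names (qdisc_lock, qdisc_scheduled, the two
handler functions) are hypothetical, and it builds with "gcc -pthread".

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t qdisc_lock = PTHREAD_MUTEX_INITIALIZER;
static bool qdisc_scheduled = true;

/* Mainline pattern: never block; if the lock is busy, re-arm and retry. */
static void tx_action_trylock(void)
{
	if (pthread_mutex_trylock(&qdisc_lock) == 0) {
		qdisc_scheduled = false;	/* clear_bit(__QDISC_STATE_SCHED) */
		/* ... run the queue ... */
		pthread_mutex_unlock(&qdisc_lock);
	} else {
		/*
		 * Re-arm and return. Fine when the holder is another CPU
		 * that cannot be preempted; a livelock once the holder is
		 * a preempted thread, because the handler keeps
		 * reactivating itself without the holder ever running.
		 */
		qdisc_scheduled = true;
	}
}

/* -rt pattern: take the (now sleeping) lock unconditionally. */
static void tx_action_lock(void)
{
	pthread_mutex_lock(&qdisc_lock);	/* sleeps until the holder releases it */
	qdisc_scheduled = false;
	/* ... run the queue ... */
	pthread_mutex_unlock(&qdisc_lock);
}

int main(void)
{
	tx_action_trylock();
	tx_action_lock();
	printf("scheduled=%d\n", (int)qdisc_scheduled);
	return 0;
}

The unconditional lock is viable on -rt because spinlocks become
priority-inheriting mutexes there: blocking on the lock boosts the preempted
holder instead of spinning against it.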
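
A second recurring idiom in this patch shows up in zap_completion_queue() and
in the cond_resched_softirq_context() comment in net_tx_action(): detach the
shared completion list while protected, then process it in private, where it
is safe to sleep or reschedule. A minimal plain-C sketch of that
detach-then-process pattern, with made-up names (node, shared_head,
process_all):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int payload;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_head;

static void process_all(void)
{
	struct node *clist;

	/* Critical section is O(1): steal the whole list at once. */
	pthread_mutex_lock(&list_lock);
	clist = shared_head;
	shared_head = NULL;
	pthread_mutex_unlock(&list_lock);

	/* The list is private now; walk and free it with no lock held. */
	while (clist != NULL) {
		struct node *n = clist;
		clist = clist->next;
		printf("freeing payload %d\n", n->payload);
		free(n);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->payload = i;
		n->next = shared_head;
		shared_head = n;
	}
	process_all();
	return 0;
}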