From fecfaf03f489b00f3d6aeb9aff44774746e549ac Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 3 Jul 2009 08:29:37 -0500
Subject: [PATCH] mm: page_alloc: rt-friendly per-cpu pages

commit ff3fd6afd788760c846a2f4449487debb6c4b0ac in tip.

rt-friendly per-cpu pages: convert the irqs-off per-cpu locking method
into a preemptible, explicit-per-cpu-locks method.

Contains fixes from:

	Peter Zijlstra
	Thomas Gleixner

[PG: upstream 99dcc3e5a94e muddies the waters of applying the original,
 for example free_zone_pagesets() is gone.]

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
Signed-off-by: Paul Gortmaker
---
 mm/page_alloc.c |  137 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 117 insertions(+), 20 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946..d870c91 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -188,6 +188,54 @@ static unsigned long __meminitdata dma_reserve;
 EXPORT_SYMBOL(movable_zone);
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
+#endif
+
+static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	spin_lock(&__get_cpu_lock(pcp_locks, cpu));
+	flags = 0;
+#else
+	local_irq_save(*flags);
+#endif
+}
+
+static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	(void)get_cpu_var_locked(pcp_locks, this_cpu);
+	flags = 0;
+#else
+	local_irq_save(*flags);
+	*this_cpu = smp_processor_id();
+#endif
+}
+
+static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	put_cpu_var_locked(pcp_locks, this_cpu);
+#else
+	local_irq_restore(flags);
+#endif
+}
+
+// PG: FIXME - zone_pcp is dead (99dcc3e5a9) so kill these get/put variants?
+static struct per_cpu_pageset *
+get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
+{
+	lock_cpu_pcp(flags, this_cpu);
+	return per_cpu_ptr(zone->pageset, *this_cpu);
+}
+
+static void
+put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
+{
+	unlock_cpu_pcp(flags, this_cpu);
+}
+
 #if MAX_NUMNODES > 1
 int nr_node_ids __read_mostly = MAX_NUMNODES;
 int nr_online_nodes __read_mostly = 1;
@@ -602,8 +650,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
-	int i;
-	int bad = 0;
+	int i, this_cpu, bad = 0;
 	int wasMlocked = __TestClearPageMlocked(page);
 
 	trace_mm_page_free_direct(page, order);
@@ -622,13 +669,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
-	local_irq_save(flags);
+	lock_cpu_pcp(&flags, &this_cpu);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
-	__count_vm_events(PGFREE, 1 << order);
+	count_vm_events(PGFREE, 1 << order);
+	unlock_cpu_pcp(flags, this_cpu);
 	free_one_page(page_zone(page), page, order,
 					get_pageblock_migratetype(page));
-	local_irq_restore(flags);
 }
 
 /*
@@ -1005,15 +1052,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
 	int to_drain;
+	int this_cpu;
 
-	local_irq_save(flags);
+	lock_cpu_pcp(&flags, &this_cpu);
 	if (pcp->count >= pcp->batch)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
 	free_pcppages_bulk(zone, to_drain, pcp);
 	pcp->count -= to_drain;
-	local_irq_restore(flags);
+	unlock_cpu_pcp(flags, this_cpu);
 }
 #endif
 
@@ -1033,13 +1081,18 @@ static void drain_pages(unsigned int cpu)
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		local_irq_save(flags);
+		__lock_cpu_pcp(&flags, cpu);
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
+		if (!pset) {
+			unlock_cpu_pcp(flags, cpu);
+			WARN_ON(1);
+			continue;
+		}
 		pcp = &pset->pcp;
 		free_pcppages_bulk(zone, pcp->count, pcp);
 		pcp->count = 0;
-		local_irq_restore(flags);
+		unlock_cpu_pcp(flags, cpu);
 	}
 }
 
@@ -1051,12 +1104,52 @@ void drain_local_pages(void *arg)
 	drain_pages(smp_processor_id());
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static void drain_local_pages_work(struct work_struct *wrk)
+{
+	drain_pages(smp_processor_id());
+}
+#endif
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator
  */
 void drain_all_pages(void)
 {
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * HACK!!!!!
+	 * For RT we can't use IPIs to run drain_local_pages, since
+	 * that code will call spin_locks that will now sleep.
+	 * But, schedule_on_each_cpu will call kzalloc, which will
+	 * call page_alloc which was what calls this.
+	 *
+	 * Luckily, there's a condition to get here, and that is if
+	 * the order passed in to alloc_pages is greater than 0
+	 * (alloced more than a page size). The slabs only allocate
+	 * what is needed, and the allocation made by schedule_on_each_cpu
+	 * does an alloc of "sizeof(void *)*nr_cpu_ids".
+	 *
+	 * So we can safely call schedule_on_each_cpu if that number
+	 * is less than a page. Otherwise don't bother. At least warn of
+	 * this issue.
+	 *
+	 * And yes, this is one big hack. Please fix ;-)
+	 */
+	if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
+		schedule_on_each_cpu(drain_local_pages_work);
+	else {
+		static int once;
+		if (!once) {
+			printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
+			once = 1;
+		}
+		drain_local_pages(NULL);
+	}
+
+#else
 	on_each_cpu(drain_local_pages, NULL, 1);
+#endif
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -1102,10 +1195,11 @@ void mark_free_pages(struct zone *zone)
 void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
+	struct per_cpu_pageset *pset;
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
 	int migratetype;
-	int wasMlocked = __TestClearPageMlocked(page);
+	int this_cpu, wasMlocked = __TestClearPageMlocked(page);
 
 	trace_mm_page_free_direct(page, 0);
 	kmemcheck_free_shadow(page, 0);
@@ -1122,12 +1216,13 @@ void free_hot_cold_page(struct page *page, int cold)
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
+	pset = get_zone_pcp(zone, &flags, &this_cpu);
+	pcp = &pset->pcp;
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
-	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
-	__count_vm_event(PGFREE);
+	count_vm_event(PGFREE);
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -1144,7 +1239,6 @@ void free_hot_cold_page(struct page *page, int cold)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
-	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (cold)
 		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	else
@@ -1156,7 +1250,7 @@ void free_hot_cold_page(struct page *page, int cold)
 	}
 
 out:
-	local_irq_restore(flags);
+	put_zone_pcp(zone, flags, this_cpu);
 }
 
 /*
@@ -1200,15 +1294,18 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
+	struct per_cpu_pageset *pset;
+	int this_cpu;
 
 again:
+	pset = get_zone_pcp(zone, &flags, &this_cpu);
+
 	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
 		struct list_head *list;
+		struct per_cpu_pages *pcp = &pset->pcp;
 
-		local_irq_save(flags);
-		pcp = &this_cpu_ptr(zone->pageset)->pcp;
 		list = &pcp->lists[migratetype];
+
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, list,
@@ -1238,7 +1335,7 @@ again:
 		 */
 		WARN_ON_ONCE(order > 1);
 	}
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	page = __rmqueue(zone, order, migratetype);
 	spin_unlock(&zone->lock);
 	if (!page)
@@ -1248,7 +1345,7 @@ again:
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone);
-	local_irq_restore(flags);
+	put_zone_pcp(zone, flags, this_cpu);
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1256,7 +1353,7 @@ again:
 	return page;
 
 failed:
-	local_irq_restore(flags);
+	put_zone_pcp(zone, flags, this_cpu);
 	return NULL;
 }
-- 
1.7.0.4
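
As a rough userspace sketch of the locking conversion the patch describes (not kernel
code, and not part of the patch itself): the idea of replacing "disable interrupts to
protect per-CPU page lists" with "take an explicit per-CPU lock" can be modeled with one
mutex per CPU slot. The names pcp_slot, lock_cpu_pcp() and unlock_cpu_pcp() only mirror
the helpers added above; the pthread mutexes and the fixed NR_CPUS are assumptions made
to keep the example self-contained.

/*
 * Illustrative userspace analogue of the PREEMPT_RT per-cpu-lock pattern.
 * Build with: gcc -O2 -pthread pcp_lock_sketch.c
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

struct pcp_slot {
	pthread_mutex_t lock;	/* plays the role of pcp_locks on RT */
	int count;		/* plays the role of pcp->count */
};

static struct pcp_slot pcp[NR_CPUS];

/* RT flavour: take the per-CPU lock; nothing is disabled, the section stays preemptible. */
static void lock_cpu_pcp(int cpu)
{
	pthread_mutex_lock(&pcp[cpu].lock);
}

static void unlock_cpu_pcp(int cpu)
{
	pthread_mutex_unlock(&pcp[cpu].lock);
}

/* Per-CPU state is only touched between lock and unlock. */
static void free_one(int cpu)
{
	lock_cpu_pcp(cpu);
	pcp[cpu].count++;
	unlock_cpu_pcp(cpu);
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_mutex_init(&pcp[cpu].lock, NULL);

	free_one(0);
	free_one(1);
	printf("cpu0=%d cpu1=%d\n", pcp[0].count, pcp[1].count);
	return 0;
}

The helpers in the patch additionally report which CPU's lock was taken (this_cpu), so the
matching put_zone_pcp()/unlock_cpu_pcp() can release the same lock even if the task has
since been preempted and migrated; in the non-RT build they degenerate back to
local_irq_save()/local_irq_restore().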
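
The size check in the RT branch of drain_all_pages() can also be looked at in isolation:
schedule_on_each_cpu() is only risked when its own allocation ("sizeof(void *)*nr_cpu_ids",
per the comment in the patch) fits in a single page, because only order > 0 allocations can
reach drain_all_pages() in the first place. A standalone sketch of that arithmetic follows;
the sysconf() calls are userspace stand-ins for PAGE_SIZE and nr_cpu_ids, not what the
kernel uses.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);          /* stand-in for PAGE_SIZE */
	long nr_cpu_ids = sysconf(_SC_NPROCESSORS_CONF); /* stand-in for nr_cpu_ids */
	long alloc = (long)sizeof(void *) * nr_cpu_ids;

	if (alloc < page_size)
		printf("%ld bytes < %ld: safe to schedule_on_each_cpu()\n",
		       alloc, page_size);
	else
		printf("%ld bytes >= %ld: fall back to draining only the local CPU\n",
		       alloc, page_size);
	return 0;
}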