From: Christoph Lameter <christoph@graphe.net>

Reduce size of the huge per_cpu_pageset structure in __initdata introduced
into mm1 with the pageset localization patchset.  Use one specially
configured pageset per cpu for all zones and nodes during bootup.

- Avoid duplication of pageset initialization code.
- Add the page to the pageset list before the potential free_pages_bulk()
  call in free_hot_cold_page() (otherwise we would have to hold a page
  in a pageset during the period that the boot pagesets are in use).
- Remove the mistaken __cpuinitdata attribute and revert to __initdata
  for the boot pageset. A boot pageset is not necessary for cpu hotplug.

Tested on x86_64 (2.6.12-rc6-mm1): UP, SMP and NUMA.
Tested on IA64 (2.6.12-rc5-mm2): NUMA (2.6.12-rc6-mm1 is broken for IA64
because of the sparsemem patches).

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 mm/page_alloc.c |  108 +++++++++++++++++++++-----------------------------------
 1 files changed, 42 insertions(+), 66 deletions(-)

diff -puN mm/page_alloc.c~reduce-size-of-huge-boot-per_cpu_pageset mm/page_alloc.c
--- 25/mm/page_alloc.c~reduce-size-of-huge-boot-per_cpu_pageset	2005-06-07 20:53:39.000000000 -0700
+++ 25-akpm/mm/page_alloc.c	2005-06-07 20:54:09.000000000 -0700
@@ -71,11 +71,6 @@ EXPORT_SYMBOL(nr_swap_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT];
 EXPORT_SYMBOL(zone_table);
 
-#ifdef CONFIG_NUMA
-static struct per_cpu_pageset
-	pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
-#endif
-
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
@@ -652,10 +647,10 @@ static void fastcall free_hot_cold_page(
 	free_pages_check(__FUNCTION__, page);
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
-	if (pcp->count >= pcp->high)
-		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
+	if (pcp->count >= pcp->high)
+		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 	local_irq_restore(flags);
 	put_cpu();
 }
@@ -1734,57 +1729,55 @@ static int __devinit zone_batchsize(stru
 	return batch;
 }
 
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->low = 2 * batch;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);	/* at least 1, even when batch is 0 */
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold */
+	pcp->count = 0;
+	pcp->low = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, 1 * batch);	/* at least 1, even when batch is 0 */
+	INIT_LIST_HEAD(&pcp->list);
+}
+
 #ifdef CONFIG_NUMA
 /*
- * Dynamicaly allocate memory for the
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ * These will be discarded when bootup is complete.
+ */
+static struct per_cpu_pageset
+	boot_pageset[NR_CPUS] __initdata;
+
+/*
+ * Dynamically allocate memory for the
  * per cpu pageset array in struct zone.
  */
 static int __devinit process_zones(int cpu)
 {
 	struct zone *zone, *dzone;
-	int i;
 
 	for_each_zone(zone) {
-		struct per_cpu_pageset *npageset = NULL;
 
-		npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
+		zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
 					 GFP_KERNEL, cpu_to_node(cpu));
-		if (!npageset) {
-			zone->pageset[cpu] = NULL;
+		if (!zone->pageset[cpu])
 			goto bad;
-		}
-
-		if (zone->pageset[cpu]) {
-			memcpy(npageset, zone->pageset[cpu],
-					sizeof(struct per_cpu_pageset));
-
-			/* Relocate lists */
-			for (i = 0; i < 2; i++) {
-				INIT_LIST_HEAD(&npageset->pcp[i].list);
-				list_splice(&zone->pageset[cpu]->pcp[i].list,
-					&npageset->pcp[i].list);
-			}
- 		} else {
-			struct per_cpu_pages *pcp;
-			unsigned long batch;
 
-			batch = zone_batchsize(zone);
-
-			pcp = &npageset->pcp[0];		/* hot */
-			pcp->count = 0;
-			pcp->low = 2 * batch;
-			pcp->high = 6 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-
-			pcp = &npageset->pcp[1];		/* cold*/
-			pcp->count = 0;
-			pcp->low = 0;
-			pcp->high = 2 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-		}
-		zone->pageset[cpu] = npageset;
+		setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
 	}
 
 	return 0;
@@ -1897,30 +1890,13 @@ static void __init free_area_init_core(s
 		batch = zone_batchsize(zone);
 
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			struct per_cpu_pages *pcp;
 #ifdef CONFIG_NUMA
-			struct per_cpu_pageset *pgset;
-			pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
-					(j * NR_CPUS) + cpu];
-
-			zone->pageset[cpu] = pgset;
+			/* Early boot. Slab allocator not functional yet */
+			zone->pageset[cpu] = &boot_pageset[cpu];
+			setup_pageset(&boot_pageset[cpu],0);
 #else
-			struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
+			setup_pageset(zone_pcp(zone,cpu), batch);
 #endif
-
-			pcp = &pgset->pcp[0];			/* hot */
-			pcp->count = 0;
-			pcp->low = 2 * batch;
-			pcp->high = 6 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-
-			pcp = &pgset->pcp[1];			/* cold */
-			pcp->count = 0;
-			pcp->low = 0;
-			pcp->high = 2 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
 		}
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 				zone_names[j], realsize, batch);
_