Signed-off-by: Andrea Arcangeli <andrea@novell.com>

diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/Kconfig.debug x/arch/i386/Kconfig.debug
--- x-ref/arch/i386/Kconfig.debug	2004-10-27 03:14:17.000000000 +0200
+++ x/arch/i386/Kconfig.debug	2004-10-30 15:56:48.746593856 +0200
@@ -46,6 +46,13 @@ config DEBUG_PAGEALLOC
 	  This results in a large slowdown, but helps to find certain types
 	  of memory corruptions.
 
+config DEBUG_PAGE_ZERO
+	bool "Page zero debugging"
+	depends on DEBUG_KERNEL
+	help
+	  Verify that the PG_zero pages returned by __GFP_ZERO allocations
+	  are truly zero pages.
+
 config 4KSTACKS
 	bool "Use 4Kb for kernel stacks instead of 8Kb"
 	help
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/kernel/process.c x/arch/i386/kernel/process.c
--- x-ref/arch/i386/kernel/process.c	2004-10-27 03:14:17.000000000 +0200
+++ x/arch/i386/kernel/process.c	2004-10-30 15:56:48.752592944 +0200
@@ -143,6 +143,9 @@ void cpu_idle (void)
 	while (1) {
 		while (!need_resched()) {
 			void (*idle)(void);
+
+			idle_page_zero();
+
 			/*
 			 * Mark this as an RCU critical section so that
 			 * synchronize_kernel() in the unload path waits
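
Only the i386 idle loop is wired up to the background zeroing here; any other
architecture wanting the same behaviour would presumably need the equivalent
one-line call in its own cpu_idle() loop. A minimal sketch of that hook, with
default_idle() standing in for whatever the architecture normally does while
idle:

	while (!need_resched()) {
		idle_page_zero();	/* opportunistically pre-zero per-cpu pages */
		default_idle();		/* the arch's usual halt/poll routine */
	}
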
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/mm/pgtable.c x/arch/i386/mm/pgtable.c
--- x-ref/arch/i386/mm/pgtable.c	2004-08-25 02:47:49.000000000 +0200
+++ x/arch/i386/mm/pgtable.c	2004-10-30 15:57:43.539264104 +0200
@@ -132,10 +132,7 @@ void __set_fixmap (enum fixed_addresses 
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
-	if (pte)
-		clear_page(pte);
-	return pte;
+	return (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
 }
 
 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -143,12 +140,14 @@ struct page *pte_alloc_one(struct mm_str
 	struct page *pte;
 
 #ifdef CONFIG_HIGHPTE
-	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0);
+	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
 #else
-	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
 #endif
-	if (pte)
+	if (pte && !PageZero(pte)) {
 		clear_highpage(pte);
+		SetPageZero(pte);
+	}
 	return pte;
 }
 
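The unconditional SetPageZero() on a pte page that is about to be filled with
ptes looks odd at first sight; presumably it relies on the fact that on i386 a
page-table page is all zeroes again by the time it is freed (every pte that was
established in it has been cleared by the unmap paths, and pte_none() is the
all-zero encoding), so the freed page can drop straight back onto the per-cpu
zero list, with CONFIG_DEBUG_PAGE_ZERO able to catch any violation. A
hypothetical check (not part of the patch) of the invariant being assumed:

	static void check_pte_page_still_zero(struct page *page)
	{
		pte_t *p = kmap_atomic(page, KM_USER0);
		int i;

		for (i = 0; i < PTRS_PER_PTE; i++)
			BUG_ON(!pte_none(p[i]));	/* pte_none() == all zeroes on i386 */
		kunmap_atomic(p, KM_USER0);
	}
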
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/gfp.h x/include/linux/gfp.h
--- x-ref/include/linux/gfp.h	2004-10-27 03:14:22.000000000 +0200
+++ x/include/linux/gfp.h	2004-10-30 15:56:48.766590816 +0200
@@ -37,6 +37,8 @@ struct vm_area_struct;
 #define __GFP_NORETRY	0x1000	/* Do not retry.  Might fail */
 #define __GFP_NO_GROW	0x2000	/* Slab internal usage */
 #define __GFP_COMP	0x4000	/* Add compound page metadata */
+#define __GFP_ZERO	0x8000	/* Alloc a zero page */
+#define __GFP_ONLY_ZERO	0x10000	/* Only try to find an already zero page */
 
-#define __GFP_BITS_SHIFT 16	/* Room for 16 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 17	/* Room for 17 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
@@ -44,7 +46,7 @@ struct vm_area_struct;
 /* if you forget to add the bitmask here kernel will crash, period */
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
-			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP|__GFP_ZERO)
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
@@ -52,6 +54,8 @@ struct vm_area_struct;
 #define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
 #define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS)
 #define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_HIGHZERO	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_ZERO)
+#define GFP_ONLYZERO	(__GFP_HIGHMEM | __GFP_ZERO | __GFP_ONLY_ZERO) /* atomic */
 
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
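
For reference, the caller-side pattern the new flags are meant for looks
roughly like the sketch below (mirroring the do_anonymous_page() and shmem
conversions further down; vma/addr stand for the caller's fault context and
TestClearPageZero() is added by the page-flags.h hunk): allocate with
__GFP_ZERO and clear only if the allocator could not hand back a page from the
per-cpu zero list.

	struct page *page;

	page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
	if (page && !TestClearPageZero(page))
		clear_user_highpage(page, addr);	/* no pre-zeroed page was cached */

GFP_ONLYZERO is the stricter, atomic variant: it only probes the per-cpu zero
lists and returns NULL instead of falling back (see the do_wp_page() hunk).
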
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/mmzone.h x/include/linux/mmzone.h
--- x-ref/include/linux/mmzone.h	2004-10-30 15:55:57.365404984 +0200
+++ x/include/linux/mmzone.h	2004-10-30 15:56:48.767590664 +0200
@@ -42,16 +42,29 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
+enum per_cpu_pages_type {
+	PER_CPU_PAGES_HOT_COLD, /* hot at head.next, cold at head.prev */
+	PER_CPU_PAGES_ZERO, /* zero pages, no clear_page needed for these */
+	NR_PER_CPU_PAGES,
+};
+
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
-	int low;		/* low watermark, refill needed */
-	int high;		/* high watermark, emptying needed */
-	int batch;		/* chunk size for buddy add/remove */
+	int max_size;		/* max number of pages to keep in the list */
+	int batch;		/* chunk size for buddy add/remove */
 	struct list_head list;	/* the list of pages */
 };
 
+struct sysctl_per_cpu_pages {
+	int size_ratio;		/* list size = zone->present_pages / size_ratio */
+	int max_size;		/* cap on the per-cpu list size, in pages */
+	int batch_ratio;	/* batch = max_size / batch_ratio */
+	int max_batch;		/* cap on batch - latency control */
+	int disable_idle_page_zero; /* 1 disables idle-time background zeroing */
+};
+
 struct per_cpu_pageset {
-	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+	struct per_cpu_pages pcp[NR_PER_CPU_PAGES];
 #ifdef CONFIG_NUMA
 	unsigned long numa_hit;		/* allocated in intended node */
 	unsigned long numa_miss;	/* allocated in non intended node */
@@ -364,6 +377,10 @@ int min_free_kbytes_sysctl_handler(struc
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
+extern struct sysctl_per_cpu_pages sysctl_per_cpu_pages;
+extern int per_cpu_pages_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
+extern void idle_page_zero(void);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
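
To make the tunables concrete, this is how the list sizing works out with the
default sysctl_per_cpu_pages values defined in the mm/page_alloc.c hunk below,
assuming 4KB pages and a hypothetical 1GB zone:

	/*
	 * present_pages = 262144
	 * max_size = min(262144 / 256, 4MB >> PAGE_SHIFT) = min(1024, 1024) = 1024 pages (4MB)
	 * batch    = min(1024 / 8, 32)                     = min(128, 32)    = 32 pages
	 */

so each of the two per-cpu lists may hold up to 4MB of pages per CPU, and
refills/drains against the buddy allocator happen 32 pages (or the current
excess, if larger) at a time. The five fields are exported in declaration order
through the /proc/sys/vm/per_cpu_pages entry registered in the kernel/sysctl.c
hunk; on such a box, writing "256 1024 8 32 1" to it keeps the defaults but
disables the idle-time background zeroing.
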
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/page-flags.h x/include/linux/page-flags.h
--- x-ref/include/linux/page-flags.h	2004-10-27 03:14:22.000000000 +0200
+++ x/include/linux/page-flags.h	2004-10-30 15:56:48.767590664 +0200
@@ -74,6 +74,7 @@
 #define PG_swapcache		16	/* Swap page: swp_entry_t in private */
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
+#define PG_zero			19	/* Contents are all zeroes; not used on compound pages */
 
 
 /*
@@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u
 #define PageSwapCache(page)	0
 #endif
 
+#define PageZero(page)		test_bit(PG_zero, &(page)->flags)
+#define SetPageZero(page)	set_bit(PG_zero, &(page)->flags)
+#define ClearPageZero(page)	clear_bit(PG_zero, &(page)->flags)
+#define TestClearPageZero(page)	test_and_clear_bit(PG_zero, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/sysctl.h x/include/linux/sysctl.h
--- x-ref/include/linux/sysctl.h	2004-10-30 15:55:57.368404528 +0200
+++ x/include/linux/sysctl.h	2004-10-30 15:56:48.768590512 +0200
@@ -167,6 +167,7 @@ enum
 	VM_HUGETLB_GROUP=25,	/* permitted hugetlb group */
 	VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
+	VM_PER_CPU_PAGES=28,	/* per cpu pages tuning */
 };
 
 
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/kernel/sysctl.c x/kernel/sysctl.c
--- x-ref/kernel/sysctl.c	2004-10-30 15:55:57.378403008 +0200
+++ x/kernel/sysctl.c	2004-10-30 15:56:48.776589296 +0200
@@ -798,6 +798,15 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
+	{
+		.ctl_name	= VM_PER_CPU_PAGES,
+		.procname	= "per_cpu_pages",
+		.data		= &sysctl_per_cpu_pages,
+		.maxlen		= sizeof(sysctl_per_cpu_pages),
+		.mode		= 0644,
+		.proc_handler	= &per_cpu_pages_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+	},
 	{ .ctl_name = 0 }
 };
 
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/memory.c x/mm/memory.c
--- x-ref/mm/memory.c	2004-10-27 03:14:22.000000000 +0200
+++ x/mm/memory.c	2004-10-30 15:56:48.779588840 +0200
@@ -1059,7 +1059,7 @@ static int do_wp_page(struct mm_struct *
 	}
 	old_page = pfn_to_page(pfn);
 
-	if (!TestSetPageLocked(old_page)) {
+	if (!PageReserved(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
 		if (reuse) {
@@ -1072,7 +1072,34 @@ static int do_wp_page(struct mm_struct *
 			spin_unlock(&mm->page_table_lock);
 			return VM_FAULT_MINOR;
 		}
+	} else if (old_page == ZERO_PAGE(address)) { /* zero page is PageReserved */
+		/* no need to clear_page or copy or unlock */
+		new_page = alloc_page_vma(GFP_ONLYZERO, vma, address);
+		if (new_page) {
+			if (unlikely(!vma->anon_vma)) {
+				/* oh well, let's do it then */
+				pte_unmap(page_table);
+				spin_unlock(&mm->page_table_lock);
+
+				if (unlikely(anon_vma_prepare(vma))) {
+					old_page = new_page; /* free it */
+					goto no_new_page;
+				}
+
+				spin_lock(&mm->page_table_lock);
+				page_table = pte_offset_map(pmd, address);
+				if (unlikely(!pte_same(*page_table, pte))) {
+					pte_unmap(page_table);
+					spin_unlock(&mm->page_table_lock);
+					old_page = new_page; /* free it */
+					goto no_new_page;
+				}
+			}
+			ClearPageZero(new_page);
+			goto map_rw;
+		}
 	}
+
 	pte_unmap(page_table);
 
 	/*
@@ -1095,9 +1120,10 @@ static int do_wp_page(struct mm_struct *
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, pte))) {
-		if (PageReserved(old_page))
+		if (PageReserved(old_page)) {
+		map_rw:
 			++mm->rss;
-		else
+		} else
 			page_remove_rmap(old_page);
 		break_cow(vma, new_page, address, page_table);
 		lru_cache_add_active(new_page);
@@ -1416,10 +1442,7 @@ do_anonymous_page(struct mm_struct *mm, 
 		unsigned long addr)
 {
 	pte_t entry;
-	struct page * page = ZERO_PAGE(addr);
-
-	/* Read-only mapping of ZERO_PAGE. */
-	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+	struct page * page;
 
-	/* ..except if it's a write access */
+	/* Map the ZERO_PAGE read-only.. except if it's a write access */
 	if (write_access) {
@@ -1429,10 +1452,11 @@ do_anonymous_page(struct mm_struct *mm, 
 
 		if (unlikely(anon_vma_prepare(vma)))
 			goto no_mem;
-		page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+		page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
 		if (!page)
 			goto no_mem;
-		clear_user_highpage(page, addr);
+		if (!TestClearPageZero(page))
+			clear_user_highpage(page, addr);
 
 		spin_lock(&mm->page_table_lock);
 		page_table = pte_offset_map(pmd, addr);
@@ -1450,6 +1474,9 @@ do_anonymous_page(struct mm_struct *mm, 
 		lru_cache_add_active(page);
 		mark_page_accessed(page);
 		page_add_anon_rmap(page, vma, addr);
+	} else {
+		/* Read-only mapping of ZERO_PAGE. */
+		entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 	}
 
 	set_pte(page_table, entry);
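
The wrprotect-fault path above is the only GFP_ONLYZERO user: it breaks COW of
the ZERO_PAGE with an already-zeroed page when one is cached, and simply falls
through to the normal copy path otherwise. A hypothetical helper (not part of
the patch) condensing that allocation pattern:

	static struct page *try_grab_prezeroed_page(struct vm_area_struct *vma,
						    unsigned long address)
	{
		/*
		 * GFP_ONLYZERO only probes the per-cpu zero lists: no page
		 * clearing, no buddy allocator fallback, may return NULL.
		 */
		struct page *page = alloc_page_vma(GFP_ONLYZERO, vma, address);

		if (page)
			ClearPageZero(page);	/* it is about to be dirtied */
		return page;
	}
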
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/page_alloc.c x/mm/page_alloc.c
--- x-ref/mm/page_alloc.c	2004-10-30 15:55:57.391401032 +0200
+++ x/mm/page_alloc.c	2004-10-30 15:56:48.786587776 +0200
@@ -12,6 +12,7 @@
  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ *  Per cpu zero lists, Andrea Arcangeli, SUSE, Oct 2004
  */
 
 #include <linux/config.h>
@@ -50,6 +51,14 @@ int numnodes = 1;
  */
 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
 
+struct sysctl_per_cpu_pages sysctl_per_cpu_pages = {
+	256, /* per-cpu pagelist size is "zone->present_pages/size_ratio" */
+	4*1024*1024 >> PAGE_SHIFT, /* maximum size */
+	8, /* batch is calculated as "max_size/batch_ratio" */
+	32, /* max number of pages to free/alloc at a time - latency control */
+	0, /* if 1 it disables the idle page zero background generation */
+};
+
 EXPORT_SYMBOL(totalram_pages);
 EXPORT_SYMBOL(nr_swap_pages);
 
@@ -189,7 +198,7 @@ static inline void __free_pages_bulk (st
 {
 	unsigned long page_idx, index, mask;
 
-	if (order)
+	if (unlikely(order))
 		destroy_compound_page(page, order);
 	mask = (~0UL) << order;
 	page_idx = page - base;
@@ -362,7 +371,7 @@ static void prep_new_page(struct page *p
 
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_checked | 1 << PG_mappedtodisk);
+			1 << PG_checked | 1 << PG_mappedtodisk | 1 << PG_zero);
 	page->private = 0;
 	set_page_refs(page, order);
 }
@@ -399,24 +408,24 @@ static struct page *__rmqueue(struct zon
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency.  Add them to the supplied list.
  * Returns the number of new pages which were placed at *list.
+ * This assumes irqs are disabled by the caller.
  */
-static int rmqueue_bulk(struct zone *zone, unsigned int order, 
-			unsigned long count, struct list_head *list)
+static int __rmqueue_bulk(struct zone *zone, unsigned long count,
+			struct list_head *list)
 {
-	unsigned long flags;
 	int i;
 	int allocated = 0;
 	struct page *page;
 	
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		page = __rmqueue(zone, order);
+		page = __rmqueue(zone, 0);
 		if (page == NULL)
 			break;
 		allocated++;
 		list_add_tail(&page->lru, list);
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 	return allocated;
 }
 
@@ -503,6 +512,26 @@ static void zone_statistics(struct zonel
 #endif
 }
 
+#ifdef CONFIG_DEBUG_PAGE_ZERO
+static void fastcall debug_page_zero(struct page *page)
+{
+	char * p;
+	unsigned long flags;
+
+	if (system_state == SYSTEM_RUNNING) {
+		local_irq_save(flags);
+
+		p = kmap_atomic(page, KM_IRQ0);
+		BUG_ON(memcmp(p, page_address(ZERO_PAGE(p /* ?? */)), PAGE_SIZE));
+		kunmap_atomic(p, KM_IRQ0);
+
+		local_irq_restore(flags);
+	}
+}
+#else
+#define debug_page_zero(page) do { } while (0)
+#endif
+
 /*
  * Free a 0-order page
  */
@@ -512,6 +541,12 @@ static void fastcall free_hot_cold_page(
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
+	struct list_head * place;
+	int excess_pages;
+	int zero = PageZero(page);
+
+	if (zero)
+		debug_page_zero(page);
 
 	arch_free_page(page, 0);
 
@@ -520,12 +555,29 @@ static void fastcall free_hot_cold_page(
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
-	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	pcp = &zone->pageset[get_cpu()].pcp[zero];
 	local_irq_save(flags);
-	if (pcp->count >= pcp->high)
-		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
-	list_add(&page->lru, &pcp->list);
-	pcp->count++;
+	if (zero) {
+		excess_pages = pcp->count - pcp->max_size;
+		if (excess_pages >= 0) {
+			if (excess_pages > 0)
+				pcp->count -= free_pages_bulk(zone,
+							      max(excess_pages, pcp->batch),
+							      &pcp->list, 0);
+			pcp = &zone->pageset[smp_processor_id()].pcp[0];
+		}
+	}
+	place = pcp->list.prev;
+	if (!cold)
+		place = &pcp->list;
+	list_add(&page->lru, place);
+	++pcp->count;
+	/* when pcp->max_size == 0 the list must drain back to pcp->count == 0 */
+	excess_pages = pcp->count - pcp->max_size;
+	if (excess_pages > 0)
+		pcp->count -= free_pages_bulk(zone,
+					      max(excess_pages, pcp->batch),
+					      &pcp->list, 0);
 	local_irq_restore(flags);
 	put_cpu();
 }
@@ -540,43 +592,65 @@ void fastcall free_cold_page(struct page
 	free_hot_cold_page(page, 1);
 }
 
+static struct page *
+alloc_per_cpu_page(struct zone *zone, int gfp_flags, int local)
+{
+	unsigned long flags;
+	struct per_cpu_pages *pcp;
+	struct page * page = NULL;
+	/*
+	 * __GFP_ZERO and __GFP_COLD are orthogonal and they could even
+	 * be used at the same time
+	 */
+	int zero = !!(gfp_flags & __GFP_ZERO);
+	int cold = gfp_flags & __GFP_COLD;
+
+	pcp = &zone->pageset[get_cpu()].pcp[zero];
+	if (!pcp->count && local)
+		goto end_put_cpu;
+	local_irq_save(flags);
+	if (!pcp->count) {
+		if (local)
+			goto end_irq_restore;
+		pcp->count += __rmqueue_bulk(zone, pcp->batch, &pcp->list);
+	}
+	if (pcp->count) {
+		struct list_head * pick = pcp->list.prev;
+		if (!cold)
+			pick = pcp->list.next;
+		page = list_entry(pick, struct page, lru);
+		list_del(&page->lru);
+		pcp->count--;
+	}
+ end_irq_restore:
+	local_irq_restore(flags);
+ end_put_cpu:
+	put_cpu();
+
+	return page;
+}
+
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-
+#define buffered_rmqueue(zone, order, gfp_flags) __buffered_rmqueue(zone, order, gfp_flags, 0)
 static struct page *
-buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
+__buffered_rmqueue(struct zone *zone, int order, int gfp_flags, int local)
 {
 	unsigned long flags;
 	struct page *page = NULL;
-	int cold = !!(gfp_flags & __GFP_COLD);
-
-	if (order == 0) {
-		struct per_cpu_pages *pcp;
-
-		pcp = &zone->pageset[get_cpu()].pcp[cold];
-		local_irq_save(flags);
-		if (pcp->count <= pcp->low)
-			pcp->count += rmqueue_bulk(zone, 0,
-						pcp->batch, &pcp->list);
-		if (pcp->count) {
-			page = list_entry(pcp->list.next, struct page, lru);
-			list_del(&page->lru);
-			pcp->count--;
-		}
-		local_irq_restore(flags);
-		put_cpu();
-	}
 
-	if (page == NULL) {
+	if (likely(!order))
+		page = alloc_per_cpu_page(zone, gfp_flags, local);
+	else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 
-	if (page != NULL) {
+	if (likely(page)) {
 		BUG_ON(bad_range(zone, page));
 		mod_page_state_zone(zone, pgalloc, 1 << order);
 		prep_new_page(page, order);
@@ -591,7 +665,7 @@ buffered_rmqueue(struct zone *zone, int 
  */
 struct page * fastcall
 __alloc_pages(unsigned int gfp_mask, unsigned int order,
-		struct zonelist *zonelist)
+	      struct zonelist *zonelist)
 {
 	const int wait = gfp_mask & __GFP_WAIT;
 	unsigned long min;
@@ -599,7 +673,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
-	int i;
+	int i, pass, zero = gfp_mask & __GFP_ZERO;
 	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
@@ -628,16 +702,52 @@ __alloc_pages(unsigned int gfp_mask, uns
 	 */
 	classzone_idx = zone_idx(zones[0]);
 
-	/* Go through the zonelist once, looking for a zone with enough free */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		min = z->pages_low + (1<<order) + z->lowmem_reserve[classzone_idx];
+	/* zero pages can only be provided at order 0 */
+	BUG_ON(zero && order);
+	BUG_ON((gfp_mask & __GFP_ONLY_ZERO) && !(gfp_mask & __GFP_ZERO));
 
-		if (z->free_pages < min)
-			continue;
+	/*
+	 * We must not fall back to the buddy allocator before first trying
+	 * the per-cpu pages of the lower zones, if lowmem_reserve permits.
+	 * This matters only here in front of the allocator: this is the
+	 * fast path. Once we reach pass 2 of this loop we're in the slow
+	 * path and it no longer matters whether the per-cpu-page resources
+	 * are used efficiently for this allocation (at worst we'll leave
+	 * something available for the next one ;).
+	 *
+	 * pass 0 is for the per-cpu-zero list (the only "zero" allocator).
+	 * pass 1 is for the per-cpu hot-cold list.
+	 * pass 2 is the buddy allocator.
+	 */
+	pass = 0;
+	if (!zero)
+		pass = 1;
+	if (unlikely(order))
+		pass = 2;
+	for (; pass < 3; pass++) {
+		/* Go through the zonelist once, looking for a zone with enough free */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			min = z->pages_low + (1<<order) + z->lowmem_reserve[classzone_idx];
 
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
+			if (z->free_pages < min)
+				continue;
+
+			switch (pass) {
+			case 0 ... 1:
+				page = __buffered_rmqueue(z, order, gfp_mask, 1);
+				if (page)
+					goto got_pg;
+				break;
+			case 2:
+				page = buffered_rmqueue(z, order, gfp_mask);
+				if (page)
+					goto got_pg;
+			}
+		}
+		if (gfp_mask & __GFP_ONLY_ZERO)
+			return NULL;
+		/* downgrade to hot-cold per-cpu-page list */
+		gfp_mask &= ~__GFP_ZERO;
 	}
 
 	for (i = 0; (z = zones[i]) != NULL; i++)
@@ -734,6 +844,10 @@ nopage:
 	}
 	return NULL;
 got_pg:
+	if (pass == 0) {
+		debug_page_zero(page);
+		SetPageZero(page);
+	}
 	zone_statistics(zonelist, z);
 	kernel_map_pages(page, 1 << order, 1);
 	return page;
@@ -765,10 +879,11 @@ fastcall unsigned long get_zeroed_page(u
 	 */
 	BUG_ON(gfp_mask & __GFP_HIGHMEM);
 
-	page = alloc_pages(gfp_mask, 0);
+	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
 	if (page) {
 		void *address = page_address(page);
-		clear_page(address);
+		if (!TestClearPageZero(page))
+			clear_page(address);
 		return (unsigned long) address;
 	}
 	return 0;
@@ -1060,20 +1175,20 @@ void show_free_areas(void)
 
 			pageset = zone->pageset + cpu;
 
-			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: low %d, high %d, batch %d\n",
+			for (temperature = 0; temperature < ARRAY_SIZE(pageset->pcp); temperature++)
+				printk("cpu %d %s: count %d, max_size %d, batch %d\n",
 					cpu,
-					temperature ? "cold" : "hot",
-					pageset->pcp[temperature].low,
-					pageset->pcp[temperature].high,
-					pageset->pcp[temperature].batch);
+					temperature ? "zero" : "hot_cold",
+					pageset->pcp[temperature].count,
+					pageset->pcp[temperature].max_size,
+				        pageset->pcp[temperature].batch);
 		}
 	}
 
 	get_page_state(&ps);
 	get_zone_counts(&active, &inactive, &free);
 
-	printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+	printk("Free pages: %11ukB (%ukB HighMem)\n",
 		K(nr_free_pages()),
 		K(nr_free_highpages()));
 
@@ -1143,6 +1258,124 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
+#define setup_per_cpu_pages_zone_boot(zone) __setup_per_cpu_pages_zone(zone, 1)
+#define setup_per_cpu_pages_zone_sysctl(zone) __setup_per_cpu_pages_zone(zone, 0)
+static unsigned long __setup_per_cpu_pages_zone(struct zone * zone, int boot)
+{
+	int i, cpu;
+	struct per_cpu_pages *pcp;
+	unsigned long per_cpu_pages = 0;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		for (i = 0; i < ARRAY_SIZE(zone->pageset[cpu].pcp); i++) {
+			pcp = &zone->pageset[cpu].pcp[i];
+
+			if (boot) {
+				pcp->count = 0;
+				INIT_LIST_HEAD(&pcp->list);
+			}
+
+			/* set pcp->max_size with sysctl; pcp->max_size == 0 is allowed for tiny boxes */
+			pcp->max_size = zone->present_pages / sysctl_per_cpu_pages.size_ratio;
+			if (pcp->max_size > sysctl_per_cpu_pages.max_size)
+				pcp->max_size = sysctl_per_cpu_pages.max_size;
+			per_cpu_pages += pcp->max_size;
+
+			/* set pcp->batch with sysctl */
+			pcp->batch = pcp->max_size / sysctl_per_cpu_pages.batch_ratio;
+			if (pcp->batch > sysctl_per_cpu_pages.max_batch)
+				pcp->batch = sysctl_per_cpu_pages.max_batch;
+			if (pcp->batch < 1)
+				pcp->batch = 1;
+		}
+	}
+
+	return per_cpu_pages;
+}
+
+static void setup_per_cpu_pages(void)
+{
+	struct pglist_data *pgdat;
+	int j;
+
+	for_each_pgdat(pgdat) {
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone * zone = pgdat->node_zones + j;
+			setup_per_cpu_pages_zone_sysctl(zone);
+		}
+	}
+}
+
+static int idle_page_zero_zone(struct zone * zone)
+{
+	struct per_cpu_pageset *pageset;
+	struct per_cpu_pages *pcp;
+	struct page * page;
+	unsigned long flags;
+	unsigned long * p;
+	int i;
+
+	cond_resched();
+
+	/* the idle task cannot change cpu */
+	pageset = zone->pageset + smp_processor_id();
+	if (pageset->pcp[1].count >= pageset->pcp[1].max_size ||
+	    !pageset->pcp[0].count)
+		return 0;
+
+	cond_resched();
+
+	page = alloc_per_cpu_page(zone, 0, 1);
+
+	cond_resched();
+
+	if (!page)
+		return 0;
+
+	if (!PageZero(page)) {
+		p = kmap_atomic(page, KM_USER0);
+		for (i = 0; i < PAGE_SIZE / sizeof(long); ++i) {
+			p[i] = 0;
+			if (need_resched()) {
+				kunmap_atomic(p, KM_USER0);
+				__cond_resched();
+				p = kmap_atomic(page, KM_USER0);
+			}
+		}
+		kunmap_atomic(p, KM_USER0);
+
+		SetPageZero(page);
+	}
+
+	cond_resched();
+
+	local_irq_save(flags);
+	pcp = &pageset->pcp[1];
+	list_add(&page->lru, &pcp->list);
+	++pcp->count;
+	local_irq_restore(flags);
+
+	cond_resched();
+
+	return 1;
+}
+
+void idle_page_zero(void)
+{
+	struct pglist_data *pgdat;
+	int j;
+
+	if (sysctl_per_cpu_pages.disable_idle_page_zero)
+		return;
+
+	pgdat = NODE_DATA(numa_node_id());
+	for (j = MAX_NR_ZONES-1; j >= 0; j--) {
+		struct zone * zone = pgdat->node_zones + j;
+		if (idle_page_zero_zone(zone))
+			return;
+	}
+}
+
 /*
  * Builds allocation fallback zone lists.
  */
@@ -1477,7 +1710,7 @@ static void __init free_area_init_core(s
 {
 	unsigned long i, j;
 	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
-	int cpu, nid = pgdat->node_id;
+	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
 	pgdat->nr_zones = 0;
@@ -1486,7 +1719,7 @@ static void __init free_area_init_core(s
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
-		unsigned long batch;
+		unsigned long per_cpu_pages;
 
 		zone_table[NODEZONE(nid, j)] = zone;
 		realsize = size = zones_size[j];
@@ -1507,39 +1740,9 @@ static void __init free_area_init_core(s
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		/*
-		 * The per-cpu-pages pools are set to around 1000th of the
-		 * size of the zone.  But no more than 1/4 of a meg - there's
-		 * no point in going beyond the size of L2 cache.
-		 *
-		 * OK, so we don't know how big the cache is.  So guess.
-		 */
-		batch = zone->present_pages / 1024;
-		if (batch * PAGE_SIZE > 256 * 1024)
-			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
-		if (batch < 1)
-			batch = 1;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
-			pcp->count = 0;
-			pcp->low = 2 * batch;
-			pcp->high = 6 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-
-			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
-			pcp->count = 0;
-			pcp->low = 0;
-			pcp->high = 2 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-		}
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-				zone_names[j], realsize, batch);
+		per_cpu_pages = setup_per_cpu_pages_zone_boot(zone);
+		printk(KERN_DEBUG "  %s zone: %lu pages, per_cpu_pages:%lu\n",
+				zone_names[j], realsize, per_cpu_pages);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
 		zone->nr_scan_active = 0;
@@ -1952,6 +2155,23 @@ int lowmem_reserve_ratio_sysctl_handler(
 }
 
 /*
+ * per_cpu_pages_sysctl_handler - just a wrapper around
+ *	proc_dointvec_minmax() so that we can call setup_per_cpu_pages()
+ *	whenever sysctl_per_cpu_pages changes.
+ *
+ * This recomputes every zone's per-cpu pagelist max_size and batch from
+ *	the new size_ratio/max_size/batch_ratio/max_batch values, the same
+ *	way they are computed at boot.
+ */
+int per_cpu_pages_sysctl_handler(ctl_table *table, int write,
+		 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	setup_per_cpu_pages();
+	return 0;
+}
+
+/*
  * allocate a large system hash table from bootmem
  * - it is assumed that the hash table must contain an exact power-of-2
  *   quantity of entries
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/shmem.c x/mm/shmem.c
--- x-ref/mm/shmem.c	2004-10-27 03:14:22.000000000 +0200
+++ x/mm/shmem.c	2004-10-30 15:56:48.789587320 +0200
@@ -85,7 +85,7 @@ static inline struct page *shmem_dir_all
 	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
 	 * might be reconsidered if it ever diverges from PAGE_SIZE.
 	 */
-	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+	return alloc_pages(gfp_mask | __GFP_ZERO, PAGE_CACHE_SHIFT-PAGE_SHIFT);
 }
 
 static inline void shmem_dir_free(struct page *page)
@@ -368,7 +368,8 @@ static swp_entry_t *shmem_swp_alloc(stru
 		spin_unlock(&info->lock);
 		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
 		if (page) {
-			clear_highpage(page);
+			if (!TestClearPageZero(page))
+				clear_highpage(page);
 			page->nr_swapped = 0;
 		}
 		spin_lock(&info->lock);
@@ -848,7 +849,7 @@ shmem_alloc_page(unsigned long gfp, stru
 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
 	pvma.vm_pgoff = idx;
 	pvma.vm_end = PAGE_SIZE;
-	page = alloc_page_vma(gfp, &pvma, 0);
+	page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
 	mpol_free(pvma.vm_policy);
 	return page;
 }
@@ -864,7 +865,7 @@ static inline struct page *
 shmem_alloc_page(unsigned long gfp,struct shmem_inode_info *info,
 				 unsigned long idx)
 {
-	return alloc_page(gfp);
+	return alloc_page(gfp | __GFP_ZERO);
 }
 #endif
 
@@ -1073,7 +1074,8 @@ repeat:
 
 		info->alloced++;
 		spin_unlock(&info->lock);
-		clear_highpage(filepage);
+		if (!TestClearPageZero(filepage))
+			clear_highpage(filepage);
 		flush_dcache_page(filepage);
 		SetPageUptodate(filepage);
 	}
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/swapfile.c x/mm/swapfile.c
--- x-ref/mm/swapfile.c	2004-10-27 03:14:22.000000000 +0200
+++ x/mm/swapfile.c	2004-10-30 15:56:48.791587016 +0200
@@ -312,8 +312,8 @@ int can_share_swap_page(struct page *pag
 {
 	int retval = 0;
 
-	if (!PageLocked(page))
-		BUG();
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageReserved(page));
 	switch (page_count(page)) {
 	case 3:
 		if (!PagePrivate(page))
@@ -325,8 +325,6 @@ int can_share_swap_page(struct page *pag
 		retval = exclusive_swap_page(page);
 		break;
 	case 1:
-		if (PageReserved(page))
-			break;
 		retval = 1;
 	}
 	return retval;