Index: linux-2.6.10/include/asm-i386/mmx.h =================================================================== --- linux-2.6.10.orig/include/asm-i386/mmx.h 2004-12-24 13:34:57.000000000 -0800 +++ linux-2.6.10/include/asm-i386/mmx.h 2005-02-16 16:05:26.000000000 -0800 @@ -8,7 +8,7 @@ #include extern void *_mmx_memcpy(void *to, const void *from, size_t size); -extern void mmx_clear_page(void *page); +extern void mmx_clear_page(void *page, int order); extern void mmx_copy_page(void *to, void *from); #endif Index: linux-2.6.10/drivers/base/node.c =================================================================== --- linux-2.6.10.orig/drivers/base/node.c 2005-02-14 10:38:13.000000000 -0800 +++ linux-2.6.10/drivers/base/node.c 2005-02-16 16:05:30.000000000 -0800 @@ -42,13 +42,15 @@ static ssize_t node_read_meminfo(struct unsigned long inactive; unsigned long active; unsigned long free; + unsigned long zero; si_meminfo_node(&i, nid); - __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid)); + __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(nid)); n = sprintf(buf, "\n" "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" + "Node %d MemZero: %8lu kB\n" "Node %d MemUsed: %8lu kB\n" "Node %d Active: %8lu kB\n" "Node %d Inactive: %8lu kB\n" @@ -58,6 +60,7 @@ static ssize_t node_read_meminfo(struct "Node %d LowFree: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), + nid, K(zero), nid, K(i.totalram - i.freeram), nid, K(active), nid, K(inactive), Index: linux-2.6.10/arch/i386/lib/mmx.c =================================================================== --- linux-2.6.10.orig/arch/i386/lib/mmx.c 2004-12-24 13:34:48.000000000 -0800 +++ linux-2.6.10/arch/i386/lib/mmx.c 2005-02-16 16:05:26.000000000 -0800 @@ -128,7 +128,7 @@ void *_mmx_memcpy(void *to, const void * * other MMX using processors do not. */ -static void fast_clear_page(void *page) +static void fast_clear_page(void *page, int order) { int i; @@ -138,7 +138,7 @@ static void fast_clear_page(void *page) " pxor %%mm0, %%mm0\n" : : ); - for(i=0;i<4096/64;i++) + for(i=0;i<((4096/64) << order);i++) { __asm__ __volatile__ ( " movntq %%mm0, (%0)\n" @@ -257,7 +257,7 @@ static void fast_copy_page(void *to, voi * Generic MMX implementation without K7 specific streaming */ -static void fast_clear_page(void *page) +static void fast_clear_page(void *page, int order) { int i; @@ -267,7 +267,7 @@ static void fast_clear_page(void *page) " pxor %%mm0, %%mm0\n" : : ); - for(i=0;i<4096/128;i++) + for(i=0;i<((4096/128) << order);i++) { __asm__ __volatile__ ( " movq %%mm0, (%0)\n" @@ -359,23 +359,23 @@ static void fast_copy_page(void *to, voi * Favour MMX for page clear and copy. 
*/ -static void slow_zero_page(void * page) +static void slow_clear_page(void * page, int order) { int d0, d1; __asm__ __volatile__( \ "cld\n\t" \ "rep ; stosl" \ : "=&c" (d0), "=&D" (d1) - :"a" (0),"1" (page),"0" (1024) + :"a" (0),"1" (page),"0" (1024 << order) :"memory"); } - -void mmx_clear_page(void * page) + +void mmx_clear_page(void * page, int order) { if(unlikely(in_interrupt())) - slow_zero_page(page); + slow_clear_page(page, order); else - fast_clear_page(page); + fast_clear_page(page, order); } static void slow_copy_page(void *to, void *from) Index: linux-2.6.10/include/linux/sched.h =================================================================== --- linux-2.6.10.orig/include/linux/sched.h 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/include/linux/sched.h 2005-02-16 16:05:30.000000000 -0800 @@ -735,6 +735,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ +#define PF_KSCRUBD 0x00800000 /* I am kscrubd */ /* * Only the _current_ task can read/write to tsk->flags, but other Index: linux-2.6.10/include/linux/gfp.h =================================================================== --- linux-2.6.10.orig/include/linux/gfp.h 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/include/linux/gfp.h 2005-02-16 16:05:26.000000000 -0800 @@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru void page_alloc_init(void); +void prep_zero_page(struct page *, unsigned int order, unsigned int gfp_flags); #endif /* __LINUX_GFP_H */ Index: linux-2.6.10/fs/proc/proc_misc.c =================================================================== --- linux-2.6.10.orig/fs/proc/proc_misc.c 2005-02-14 10:38:15.000000000 -0800 +++ linux-2.6.10/fs/proc/proc_misc.c 2005-02-16 16:05:30.000000000 -0800 @@ -123,12 +123,13 @@ static int meminfo_read_proc(char *page, unsigned long inactive; unsigned long active; unsigned long free; + unsigned long zero; unsigned long committed; unsigned long allowed; struct vmalloc_info vmi; get_page_state(&ps); - get_zone_counts(&active, &inactive, &free); + get_zone_counts(&active, &inactive, &free, &zero); /* * display in kilobytes. 
@@ -148,6 +149,7 @@ static int meminfo_read_proc(char *page, len = sprintf(page, "MemTotal: %8lu kB\n" "MemFree: %8lu kB\n" + "MemZero: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" @@ -171,6 +173,7 @@ static int meminfo_read_proc(char *page, "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), + K(zero), K(i.bufferram), K(get_page_cache_size()-total_swapcache_pages-i.bufferram), K(total_swapcache_pages), Index: linux-2.6.10/include/asm-ia64/page.h =================================================================== --- linux-2.6.10.orig/include/asm-ia64/page.h 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/include/asm-ia64/page.h 2005-02-16 16:05:26.000000000 -0800 @@ -56,8 +56,10 @@ # ifdef __KERNEL__ # define STRICT_MM_TYPECHECKS -extern void clear_page (void *page); +extern void clear_pages (void *page, int order); extern void copy_page (void *to, void *from); +#define clear_page(__page) clear_pages(__page, 0) +#define __HAVE_ARCH_CLEAR_PAGES /* * clear_user_page() and copy_user_page() can't be inline functions because Index: linux-2.6.10/mm/hugetlb.c =================================================================== --- linux-2.6.10.orig/mm/hugetlb.c 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/mm/hugetlb.c 2005-02-16 16:05:26.000000000 -0800 @@ -78,7 +78,6 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(void) { struct page *page; - int i; spin_lock(&hugetlb_lock); page = dequeue_huge_page(); @@ -89,8 +88,7 @@ struct page *alloc_huge_page(void) spin_unlock(&hugetlb_lock); set_page_count(page, 1); page[1].mapping = (void *)free_huge_page; - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_highpage(&page[i]); + prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER); return page; } Index: linux-2.6.10/arch/ia64/lib/clear_page.S =================================================================== --- linux-2.6.10.orig/arch/ia64/lib/clear_page.S 2004-12-24 13:33:50.000000000 -0800 +++ linux-2.6.10/arch/ia64/lib/clear_page.S 2005-02-16 16:05:26.000000000 -0800 @@ -7,6 +7,7 @@ * 1/06/01 davidm Tuned for Itanium. 
* 2/12/02 kchen Tuned for both Itanium and McKinley * 3/08/02 davidm Some more tweaking + * 12/10/04 clameter Make it work on pages of order size */ #include @@ -29,27 +30,33 @@ #define dst4 r11 #define dst_last r31 +#define totsize r14 -GLOBAL_ENTRY(clear_page) +GLOBAL_ENTRY(clear_pages) .prologue - .regstk 1,0,0,0 - mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until + .regstk 2,0,0,0 + mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count + mov totsize = PAGE_SIZE .save ar.lc, saved_lc mov saved_lc = ar.lc - + ;; .body + adds dst1 = 16, in0 mov ar.lc = (PREFETCH_LINES - 1) mov dst_fetch = in0 - adds dst1 = 16, in0 adds dst2 = 32, in0 + shl r16 = r16, in1 + shl totsize = totsize, in1 ;; .fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE adds dst3 = 48, in0 // executing this multiple times is harmless br.cloop.sptk.few .fetch + add r16 = -1,r16 + add dst_last = totsize, dst_fetch + adds dst4 = 64, in0 ;; - addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch mov ar.lc = r16 // one L3 line per iteration - adds dst4 = 64, in0 + adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last ;; #ifdef CONFIG_ITANIUM // Optimized for Itanium @@ -74,4 +81,4 @@ GLOBAL_ENTRY(clear_page) ;; mov ar.lc = saved_lc // restore lc br.ret.sptk.many rp -END(clear_page) +END(clear_pages) Index: linux-2.6.10/mm/page_alloc.c =================================================================== --- linux-2.6.10.orig/mm/page_alloc.c 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/mm/page_alloc.c 2005-02-16 16:05:30.000000000 -0800 @@ -12,6 +12,8 @@ * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + * Page zeroing by Christoph Lameter, SGI, Dec 2004 using + * initial code for __GFP_ZERO support by Andrea Arcangeli, Oct 2004. */ #include @@ -33,6 +35,7 @@ #include #include #include +#include #include #include "internal.h" @@ -175,16 +178,16 @@ static void destroy_compound_page(struct * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_zorder(struct page *page) { return page->private; } -static inline void set_page_order(struct page *page, int order) { - page->private = order; +static inline void set_page_zorder(struct page *page, int order, int zero) { + page->private = order + (zero << 10); __SetPagePrivate(page); } -static inline void rmv_page_order(struct page *page) +static inline void rmv_page_zorder(struct page *page) { __ClearPagePrivate(page); page->private = 0; @@ -195,14 +198,15 @@ static inline void rmv_page_order(struct * we can do coalesce a page and its buddy if * (a) the buddy is free && * (b) the buddy is on the buddy system && - * (c) a page and its buddy have the same order. + * (c) a page and its buddy have the same order and the same + * zeroing status. * for recording page's order, we use page->private and PG_private. 
* */ -static inline int page_is_buddy(struct page *page, int order) +static inline int page_is_buddy(struct page *page, int order, int zero) { if (PagePrivate(page) && - (page_order(page) == order) && + (page_zorder(page) == order + (zero << 10)) && !PageReserved(page) && page_count(page) == 0) return 1; @@ -233,22 +237,20 @@ static inline int page_is_buddy(struct p * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, - struct zone *zone, unsigned int order) +static inline int __free_pages_bulk (struct page *page, struct page *base, + struct zone *zone, unsigned int order, int zero) { unsigned long page_idx; struct page *coalesced; - int order_size = 1 << order; if (unlikely(order)) destroy_compound_page(page, order); page_idx = page - base; - BUG_ON(page_idx & (order_size - 1)); + BUG_ON(page_idx & (( 1 << order) - 1)); BUG_ON(bad_range(zone, page)); - zone->free_pages += order_size; while (order < MAX_ORDER-1) { struct free_area *area; struct page *buddy; @@ -258,20 +260,21 @@ static inline void __free_pages_bulk (st buddy = base + buddy_idx; if (bad_range(zone, buddy)) break; - if (!page_is_buddy(buddy, order)) + if (!page_is_buddy(buddy, order, zero)) break; /* Move the buddy up one level. */ list_del(&buddy->lru); - area = zone->free_area + order; + area = zone->free_area[zero] + order; area->nr_free--; - rmv_page_order(buddy); + rmv_page_zorder(buddy); page_idx &= buddy_idx; order++; } coalesced = base + page_idx; - set_page_order(coalesced, order); - list_add(&coalesced->lru, &zone->free_area[order].free_list); - zone->free_area[order].nr_free++; + set_page_zorder(coalesced, order, zero); + list_add(&coalesced->lru, &zone->free_area[zero][order].free_list); + zone->free_area[zero][order].nr_free++; + return order; } static inline void free_pages_check(const char *function, struct page *page) @@ -320,8 +323,11 @@ free_pages_bulk(struct zone *zone, int c page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, base, zone, order); + if (__free_pages_bulk(page, base, zone, order, NOT_ZEROED) + >= sysctl_scrub_start) + wakeup_kscrubd(zone); ret++; + zone->free_pages += 1UL << order; } spin_unlock_irqrestore(&zone->lock, flags); return ret; @@ -349,6 +355,20 @@ void __free_pages_ok(struct page *page, free_pages_bulk(page_zone(page), 1, &list, order); } +#ifdef CONFIG_SCRUBD +void end_zero_page(struct page *page, unsigned int order) +{ + unsigned long flags; + struct zone * zone = page_zone(page); + + spin_lock_irqsave(&zone->lock, flags); + + __free_pages_bulk(page, zone->zone_mem_map, zone, order, ZEROED); + zone->zero_pages += 1UL << order; + + spin_unlock_irqrestore(&zone->lock, flags); +} +#endif /* * The order of subdivision here is critical for the IO subsystem. @@ -366,7 +386,7 @@ void __free_pages_ok(struct page *page, */ static inline struct page * expand(struct zone *zone, struct page *page, - int low, int high, struct free_area *area) + int low, int high, struct free_area *area, int zero) { unsigned long size = 1 << high; @@ -377,7 +397,7 @@ expand(struct zone *zone, struct page *p BUG_ON(bad_range(zone, &page[size])); list_add(&page[size].lru, &area->free_list); area->nr_free++; - set_page_order(&page[size], high); + set_page_zorder(&page[size], high, zero); } return page; } @@ -428,23 +448,46 @@ static void prep_new_page(struct page *p * Do the hard work of removing an element from the buddy allocator. 
* Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static void inline rmpage(struct page *page, struct free_area *area) +{ + list_del(&page->lru); + rmv_page_zorder(page); + area->nr_free--; +} + +#ifdef CONFIG_SCRUBD +struct page *scrubd_rmpage(struct zone *zone, struct free_area *area) +{ + unsigned long flags; + struct page *page = NULL; + + spin_lock_irqsave(&zone->lock, flags); + if (!list_empty(&area->free_list)) { + page = list_entry(area->free_list.next, struct page, lru); + rmpage(page, area); + } + spin_unlock_irqrestore(&zone->lock, flags); + return page; +} +#endif + +static struct page *__rmqueue(struct zone *zone, unsigned int order, int zero) { - struct free_area * area; + struct free_area *area; unsigned int current_order; struct page *page; for (current_order = order; current_order < MAX_ORDER; ++current_order) { - area = zone->free_area + current_order; + area = zone->free_area[zero] + current_order; if (list_empty(&area->free_list)) continue; page = list_entry(area->free_list.next, struct page, lru); - list_del(&page->lru); - rmv_page_order(page); - area->nr_free--; + rmpage(page, zone->free_area[zero] + current_order); zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + if (zero) + zone->zero_pages -= 1UL << order; + return expand(zone, page, order, current_order, area, zero); } return NULL; @@ -456,7 +499,7 @@ static struct page *__rmqueue(struct zon * Returns the number of new pages which were placed at *list. */ static int rmqueue_bulk(struct zone *zone, unsigned int order, - unsigned long count, struct list_head *list) + unsigned long count, struct list_head *list, int zero) { unsigned long flags; int i; @@ -465,7 +508,7 @@ static int rmqueue_bulk(struct zone *zon spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); + page = __rmqueue(zone, order, zero); if (page == NULL) break; allocated++; @@ -512,7 +555,7 @@ void mark_free_pages(struct zone *zone) ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); for (order = MAX_ORDER - 1; order >= 0; --order) - list_for_each(curr, &zone->free_area[order].free_list) { + list_for_each(curr, &zone->free_area[NOT_ZEROED][order].free_list) { unsigned long start_pfn, i; start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); @@ -599,11 +642,19 @@ void fastcall free_cold_page(struct page free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) +void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags) { int i; BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + +#ifdef __HAVE_ARCH_CLEAR_PAGES + if (!PageHighMem(page)) { + clear_pages(page_address(page), order); + return; + } +#endif + for(i = 0; i < (1 << order); i++) clear_highpage(page + i); } @@ -618,7 +669,9 @@ buffered_rmqueue(struct zone *zone, int { unsigned long flags; struct page *page = NULL; - int cold = !!(gfp_flags & __GFP_COLD); + int nr_pages = 1 << order; + int zero = !!((gfp_flags & __GFP_ZERO) && zone->zero_pages >= nr_pages); + int cold = !!(gfp_flags & __GFP_COLD) + 2*zero; if (order == 0) { struct per_cpu_pages *pcp; @@ -627,7 +680,7 @@ buffered_rmqueue(struct zone *zone, int local_irq_save(flags); if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + pcp->batch, &pcp->list, zero); if (pcp->count) { page = list_entry(pcp->list.next, struct page, lru); 
list_del(&page->lru); @@ -639,16 +692,25 @@ buffered_rmqueue(struct zone *zone, int if (page == NULL) { spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); + page = __rmqueue(zone, order, zero); + /* + * If we failed to obtain a zero and/or unzeroed page + * then we may still be able to obtain the other + * type of page. + */ + if (!page) { + page = __rmqueue(zone, order, !zero); + zero = 0; + } spin_unlock_irqrestore(&zone->lock, flags); } if (page != NULL) { BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); + mod_page_state_zone(zone, pgalloc, nr_pages); prep_new_page(page, order); - if (gfp_flags & __GFP_ZERO) + if ((gfp_flags & __GFP_ZERO) && !zero) prep_zero_page(page, order, gfp_flags); if (order && (gfp_flags & __GFP_COMP)) @@ -677,7 +739,7 @@ int zone_watermark_ok(struct zone *z, in return 0; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ - free_pages -= z->free_area[o].nr_free << o; + free_pages -= (z->free_area[NOT_ZEROED][o].nr_free + z->free_area[ZEROED][o].nr_free) << o; /* Require fewer higher order pages to be free */ min >>= 1; @@ -1085,7 +1147,7 @@ void __mod_page_state(unsigned offset, u EXPORT_SYMBOL(__mod_page_state); void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat) + unsigned long *free, unsigned long *zero, struct pglist_data *pgdat) { struct zone *zones = pgdat->node_zones; int i; @@ -1093,27 +1155,31 @@ void __get_zone_counts(unsigned long *ac *active = 0; *inactive = 0; *free = 0; + *zero = 0; for (i = 0; i < MAX_NR_ZONES; i++) { *active += zones[i].nr_active; *inactive += zones[i].nr_inactive; *free += zones[i].free_pages; + *zero += zones[i].zero_pages; } } void get_zone_counts(unsigned long *active, - unsigned long *inactive, unsigned long *free) + unsigned long *inactive, unsigned long *free, unsigned long *zero) { struct pglist_data *pgdat; *active = 0; *inactive = 0; *free = 0; + *zero = 0; for_each_pgdat(pgdat) { - unsigned long l, m, n; - __get_zone_counts(&l, &m, &n, pgdat); + unsigned long l, m, n,o; + __get_zone_counts(&l, &m, &n, &o, pgdat); *active += l; *inactive += m; *free += n; + *zero += o; } } @@ -1150,6 +1216,7 @@ void si_meminfo_node(struct sysinfo *val #define K(x) ((x) << (PAGE_SHIFT-10)) +const char *temp[3] = { "hot", "cold", "zero" }; /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -1162,6 +1229,7 @@ void show_free_areas(void) unsigned long active; unsigned long inactive; unsigned long free; + unsigned long zero; struct zone *zone; for_each_zone(zone) { @@ -1182,10 +1250,10 @@ void show_free_areas(void) pageset = zone->pageset + cpu; - for (temperature = 0; temperature < 2; temperature++) + for (temperature = 0; temperature < 3; temperature++) printk("cpu %d %s: low %d, high %d, batch %d\n", cpu, - temperature ? 
"cold" : "hot", + temp[temperature], pageset->pcp[temperature].low, pageset->pcp[temperature].high, pageset->pcp[temperature].batch); @@ -1193,20 +1261,21 @@ void show_free_areas(void) } get_page_state(&ps); - get_zone_counts(&active, &inactive, &free); + get_zone_counts(&active, &inactive, &free, &zero); printk("\nFree pages: %11ukB (%ukB HighMem)\n", K(nr_free_pages()), K(nr_free_highpages())); printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " - "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + "unstable:%lu free:%u zero:%lu slab:%lu mapped:%lu pagetables:%lu\n", active, inactive, ps.nr_dirty, ps.nr_writeback, ps.nr_unstable, nr_free_pages(), + zero, ps.nr_slab, ps.nr_mapped, ps.nr_page_table_pages); @@ -1255,7 +1324,7 @@ void show_free_areas(void) spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - nr = zone->free_area[order].nr_free; + nr = zone->free_area[NOT_ZEROED][order].nr_free + zone->free_area[ZEROED][order].nr_free; total += nr << order; printk("%lu*%lukB ", nr, K(1UL) << order); } @@ -1555,8 +1624,10 @@ void zone_init_free_lists(struct pglist_ { int order; for (order = 0; order < MAX_ORDER ; order++) { - INIT_LIST_HEAD(&zone->free_area[order].free_list); - zone->free_area[order].nr_free = 0; + INIT_LIST_HEAD(&zone->free_area[NOT_ZEROED][order].free_list); + INIT_LIST_HEAD(&zone->free_area[ZEROED][order].free_list); + zone->free_area[NOT_ZEROED][order].nr_free = 0; + zone->free_area[ZEROED][order].nr_free = 0; } } @@ -1581,6 +1652,7 @@ static void __init free_area_init_core(s pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); + init_waitqueue_head(&pgdat->kscrubd_wait); pgdat->kswapd_max_order = 0; for (j = 0; j < MAX_NR_ZONES; j++) { @@ -1604,6 +1676,7 @@ static void __init free_area_init_core(s spin_lock_init(&zone->lru_lock); zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->zero_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; @@ -1637,6 +1710,13 @@ static void __init free_area_init_core(s pcp->high = 2 * batch; pcp->batch = 1 * batch; INIT_LIST_HEAD(&pcp->list); + + pcp = &zone->pageset[cpu].pcp[2]; /* zero pages */ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); @@ -1762,7 +1842,10 @@ static int frag_show(struct seq_file *m, spin_lock_irqsave(&zone->lock, flags); seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + seq_printf(m, "%6lu ", zone->free_area[NOT_ZEROED][order].nr_free); + seq_printf(m, "\n Zeroed Pages "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[ZEROED][order].nr_free); spin_unlock_irqrestore(&zone->lock, flags); seq_putc(m, '\n'); } Index: linux-2.6.10/init/Kconfig =================================================================== --- linux-2.6.10.orig/init/Kconfig 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/init/Kconfig 2005-02-16 16:05:30.000000000 -0800 @@ -265,6 +265,19 @@ config KALLSYMS_ALL Say N. +config SCRUBD + bool "Scrub Daemon (prezeroing of pages)" + depends on EXPERIMENTAL + help + The scrub daemon manages a pool of zeroed pages. Pages of higher + order are zeroed when the system is idle (configurable via + /proc/sys/vm/scrubd_load). 
+ If the kernel later needs a zeroed page then a page may be + obtained from these pools instead of hot-zeroing a page. + Prezeroing will in particular speed up applications allocating + large amounts of memory and will be effective for sparse + matrices (this includes multi-level page tables). + config KALLSYMS_EXTRA_PASS bool "Do an extra kallsyms pass" depends on KALLSYMS Index: linux-2.6.10/mm/Makefile =================================================================== --- linux-2.6.10.orig/mm/Makefile 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/mm/Makefile 2005-02-16 16:05:30.000000000 -0800 @@ -17,4 +17,5 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o +obj-$(CONFIG_SCRUBD) += scrubd.o Index: linux-2.6.10/mm/scrubd.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.10/mm/scrubd.c 2005-02-16 16:05:30.000000000 -0800 @@ -0,0 +1,136 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int sysctl_scrub_start = 8; /* if a page of this order is coalesced then run kscrubd */ +unsigned int sysctl_scrub_stop = 5; /* Minimum order of page to zero */ +unsigned int sysctl_scrub_load = 1; /* Do not run scrubd if load > this value */ + +/* + * sysctl handler for /proc/sys/vm/scrub_start + */ +int scrub_start_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + if (sysctl_scrub_start < MAX_ORDER) { + struct zone *zone; + + for_each_zone(zone) + wakeup_kscrubd(zone); + } + return 0; +} + +LIST_HEAD(zero_drivers); + +/* + * zero_highest_order_page takes a page off the freelist + * and then hands it off to block zeroing agents. + * The cleared pages are added to the back of + * the freelist where the page allocator may pick them up. + */ +int zero_highest_order_page(struct zone *z) +{ + int order; + + for(order = MAX_ORDER-1; order >= sysctl_scrub_stop; order--) { + struct free_area *area = z->free_area[NOT_ZEROED] + order; + if (!list_empty(&area->free_list)) { + struct page *page = scrubd_rmpage(z, area); + struct list_head *l; + int size = PAGE_SIZE << order; + + if (!page) + continue; + + list_for_each(l, &zero_drivers) { + struct zero_driver *driver = list_entry(l, struct zero_driver, list); + + if (driver->start(page_address(page), size) == 0) + goto done; + } + + /* Unable to find a zeroing device that would + * deal with this page so just do it on our own. + * This will likely thrash the cpu caches. + */ + cond_resched(); + prep_zero_page(page, order, 0); +done: + end_zero_page(page, order); + cond_resched(); + return 1 << order; + } + } + return 0; +} + +/* + * scrub_pgdat() will work across all this node's zones. + */ +static void scrub_pgdat(pg_data_t *pgdat) +{ + int i; + unsigned long pages_zeroed; + + if (system_state != SYSTEM_RUNNING) + return; + + while (avenrun[0] >= ((unsigned long)sysctl_scrub_load << FSHIFT)) + schedule_timeout(30*HZ); + do { + pages_zeroed = 0; + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + pages_zeroed += zero_highest_order_page(zone); + } + } while (pages_zeroed); +} + +/* + * The background scrub daemon, started as a kernel thread + * from the init process.
*/ +static int kscrubd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + cpumask_t cpumask; + + daemonize("kscrubd%d", pgdat->node_id); + cpumask = node_to_cpumask(pgdat->node_id); + if (!cpus_empty(cpumask)) + set_cpus_allowed(tsk, cpumask); + + tsk->flags |= PF_MEMALLOC | PF_KSCRUBD; + + for ( ; ; ) { + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); + prepare_to_wait(&pgdat->kscrubd_wait, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&pgdat->kscrubd_wait, &wait); + + scrub_pgdat(pgdat); + } + return 0; +} + +static int __init kscrubd_init(void) +{ + pg_data_t *pgdat; + for_each_pgdat(pgdat) + pgdat->kscrubd + = find_task_by_pid(kernel_thread(kscrubd, pgdat, CLONE_KERNEL)); + return 0; +} + +module_init(kscrubd_init) Index: linux-2.6.10/Documentation/vm/scrubd.txt =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.10/Documentation/vm/scrubd.txt 2005-02-16 16:05:30.000000000 -0800 @@ -0,0 +1,76 @@ +The SCRUB Daemon +---------------- + +The scrub daemon zeroes memory so that later requests for zeroed memory can +be satisfied without having to zero memory in a hot code path. The operation +of scrubd may be controlled through three variables in /proc/sys/vm: + +/proc/sys/vm/scrub_load default value 1 + + Scrubd is woken up if the system load is lower than this setting and + a large block of memory is coalesced by the buddy allocator. The + default setting of 1 ensures that there will be no performance + degradation in single user mode. In an SMP system it is likely that a + cpu is idle despite the load being high. A setting of 9 or 99 may + be advisable for SMP systems. + +/proc/sys/vm/scrub_start default value 8 + + This is the page order (a number of contiguous pages) that needs to be + coalesced by the buddy allocator in order to wake scrubd up. A high + value makes scrubd more efficient because it is less frequently + invoked and zeroes larger blocks of pages. Setting this + value triggers a scrubd run. This may be used to manually trigger + the zeroing of pages. + +/proc/sys/vm/scrub_stop default value 5 + + Once scrubd starts it will zero all pages down to this order. + Typically it takes too much time to zero all pages in the system. + If all pages in the system should be zeroed then this needs to be + set to 0. + +The amount of memory that is zeroed may be seen in /proc/meminfo or in +/proc/buddyinfo. + +Zeroing Drivers: +---------------- + +If hardware is available that can zero memory without the use of the cpu then a +driver may be written that can register itself with +register_zero_driver(). See include/linux/scrub.h for details and +arch/ia64/sn/kernel/bte.c for an example of a zeroing driver. + +Performance considerations: +--------------------------- + +If there is no zeroing hardware available then zeroing may invalidate the +cpu cache and may therefore cause a slight performance loss, +especially since scrubd may zero more pages than necessary. In these situations +(single processor machines?) it is advisable to run scrubd only when the +system is truly idle. The default configuration has scrub_load set to 1. This +means that scrubd only runs when the system load is minimal. Scrubd typically +completes in a fraction of a second since the zeroing speed of processors is +quite high.
+ +The frequency of scrubd activation may be controlled by increasing the order +specified through scrub_start. However, as the system uses memory the +available order of free pages is typically reduced. If a system has been in +use for a long time then no pages of an order greater than scrub_start may be available. +At that point scrubd may no longer provide zeroed pages if scrub_start is set +too high. + +Scrubd may cause additional memory fragmentation since zeroed pages are not +combinable with unzeroed pages. Also, only pages of higher order are zeroed. +An allocation of a single zeroed page may require the breakup of larger pages. +This type of fragmentation may be avoided by setting /proc/sys/vm/scrub_stop +to zero. + +Scrubd is most effective for memory that is only sparsely accessed. Getting a +prezeroed page for an application that then immediately overwrites all bytes +in the page does not lead to any performance improvement. However, if the +application only uses certain cachelines of the page immediately after a page +fault then scrubd can be of tremendous benefit. + +Christoph Lameter, SGI, February 2005. + Index: linux-2.6.10/arch/ia64/sn/kernel/setup.c =================================================================== --- linux-2.6.10.orig/arch/ia64/sn/kernel/setup.c 2005-02-14 10:38:12.000000000 -0800 +++ linux-2.6.10/arch/ia64/sn/kernel/setup.c 2005-02-16 16:05:34.000000000 -0800 @@ -251,6 +251,7 @@ void __init sn_setup(char **cmdline_p) int pxm; int major = sn_sal_rev_major(), minor = sn_sal_rev_minor(); extern void sn_cpu_init(void); + extern void sn_bte_bzero_init(void); /* * If the generic code has enabled vga console support - lets @@ -341,6 +342,7 @@ void __init sn_setup(char **cmdline_p) screen_info = sn_screen_info; sn_timer_init(); + sn_bte_bzero_init(); } /** Index: linux-2.6.10/include/linux/scrub.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.10/include/linux/scrub.h 2005-02-16 16:05:30.000000000 -0800 @@ -0,0 +1,58 @@ +#ifndef _LINUX_SCRUB_H +#define _LINUX_SCRUB_H + +/* + * Definitions for scrubbing of memory, including an interface + * for drivers that allow the zeroing of memory + * without invalidating the caches. + * + * Christoph Lameter, December 2004.
+ */ +#ifdef CONFIG_SCRUBD +struct zero_driver { + int (*start)(void *, unsigned long); /* Start bzero transfer */ + struct list_head list; +}; + +extern struct list_head zero_drivers; + +extern unsigned int sysctl_scrub_start; +extern unsigned int sysctl_scrub_stop; +extern unsigned int sysctl_scrub_load; + +/* Registering and unregistering zero drivers */ +static inline void register_zero_driver(struct zero_driver *z) +{ + list_add(&z->list, &zero_drivers); +} + +static inline void unregister_zero_driver(struct zero_driver *z) +{ + list_del(&z->list); +} + +extern struct page *scrubd_rmpage(struct zone *zone, struct free_area *area); + +static void inline wakeup_kscrubd(struct zone *zone) +{ + if (!waitqueue_active(&zone->zone_pgdat->kscrubd_wait)) + return; + wake_up_interruptible(&zone->zone_pgdat->kscrubd_wait); +} + +int scrub_start_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); + +extern void end_zero_page(struct page *page, unsigned int order); + +#else + +static void inline wakeup_kscrubd(struct zone *zone) +{ +} + +#define sysctl_scrub_start 0 + +#endif + +#endif Index: linux-2.6.10/kernel/sysctl.c =================================================================== --- linux-2.6.10.orig/kernel/sysctl.c 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/kernel/sysctl.c 2005-02-16 16:05:30.000000000 -0800 @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -825,6 +826,35 @@ static ctl_table vm_table[] = { .strategy = &sysctl_jiffies, }, #endif +#ifdef CONFIG_SCRUBD + { + .ctl_name = VM_SCRUB_START, + .procname = "scrub_start", + .data = &sysctl_scrub_start, + .maxlen = sizeof(sysctl_scrub_start), + .mode = 0644, + .proc_handler = &scrub_start_handler, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = VM_SCRUB_STOP, + .procname = "scrub_stop", + .data = &sysctl_scrub_stop, + .maxlen = sizeof(sysctl_scrub_stop), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = VM_SCRUB_LOAD, + .procname = "scrub_load", + .data = &sysctl_scrub_load, + .maxlen = sizeof(sysctl_scrub_load), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, +#endif { .ctl_name = 0 } }; Index: linux-2.6.10/arch/sparc64/lib/clear_page.S =================================================================== --- linux-2.6.10.orig/arch/sparc64/lib/clear_page.S 2004-12-24 13:35:23.000000000 -0800 +++ linux-2.6.10/arch/sparc64/lib/clear_page.S 2005-02-16 16:05:26.000000000 -0800 @@ -28,9 +28,12 @@ .text .globl _clear_page -_clear_page: /* %o0=dest */ +_clear_page: /* %o0=dest, %o1=order */ + sethi %hi(PAGE_SIZE/64), %o2 + clr %o4 + or %o2, %lo(PAGE_SIZE/64), %o2 ba,pt %xcc, clear_page_common - clr %o4 + sllx %o2, %o1, %o1 /* This thing is pretty important, it shows up * on the profiles via do_anonymous_page(). @@ -69,16 +72,16 @@ clear_user_page: /* %o0=dest, %o1=vaddr flush %g6 wrpr %o4, 0x0, %pstate + sethi %hi(PAGE_SIZE/64), %o1 mov 1, %o4 + or %o1, %lo(PAGE_SIZE/64), %o1 clear_page_common: VISEntryHalf membar #StoreLoad | #StoreStore | #LoadStore fzero %f0 - sethi %hi(PAGE_SIZE/64), %o1 mov %o0, %g1 ! 
remember vaddr for tlbflush fzero %f2 - or %o1, %lo(PAGE_SIZE/64), %o1 faddd %f0, %f2, %f4 fmuld %f0, %f2, %f6 faddd %f0, %f2, %f8 Index: linux-2.6.10/include/asm-sparc64/page.h =================================================================== --- linux-2.6.10.orig/include/asm-sparc64/page.h 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/include/asm-sparc64/page.h 2005-02-16 16:05:26.000000000 -0800 @@ -14,8 +14,10 @@ #ifndef __ASSEMBLY__ -extern void _clear_page(void *page); -#define clear_page(X) _clear_page((void *)(X)) +extern void _clear_page(void *page, int order); +#define clear_page(X) _clear_page((void *)(X), 0) +#define clear_pages _clear_page + struct page; extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page); #define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE) Index: linux-2.6.10/include/asm-i386/page.h =================================================================== --- linux-2.6.10.orig/include/asm-i386/page.h 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/include/asm-i386/page.h 2005-02-16 16:05:26.000000000 -0800 @@ -18,7 +18,7 @@ #include -#define clear_page(page) mmx_clear_page((void *)(page)) +#define clear_pages(page, order) mmx_clear_page((void *)(page),order) #define copy_page(to,from) mmx_copy_page(to,from) #else @@ -28,11 +28,13 @@ * Maybe the K6-III ? */ -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) +#define clear_pages(page, order) memset((void *)(page), 0, PAGE_SIZE << (order)) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) #endif +#define __HAVE_ARCH_CLEAR_PAGES +#define clear_page(page) clear_pages(page, 0) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) Index: linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c =================================================================== --- linux-2.6.10.orig/arch/ia64/kernel/ia64_ksyms.c 2005-02-14 10:38:11.000000000 -0800 +++ linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c 2005-02-16 16:05:26.000000000 -0800 @@ -38,7 +38,7 @@ EXPORT_SYMBOL(__down_trylock); EXPORT_SYMBOL(__up); #include -EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_pages); #ifdef CONFIG_VIRTUAL_MEM_MAP #include Index: linux-2.6.10/include/asm-x86_64/page.h =================================================================== --- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/include/asm-x86_64/page.h 2005-02-16 16:05:26.000000000 -0800 @@ -32,8 +32,10 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -void clear_page(void *); +void clear_pages(void *, int); void copy_page(void *, void *); +#define __HAVE_ARCH_CLEAR_PAGES +#define clear_page(__page) clear_pages(__page, 0) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) Index: linux-2.6.10/arch/x86_64/lib/clear_page.S =================================================================== --- linux-2.6.10.orig/arch/x86_64/lib/clear_page.S 2004-12-24 13:34:33.000000000 -0800 +++ linux-2.6.10/arch/x86_64/lib/clear_page.S 2005-02-16 16:05:26.000000000 -0800 @@ -1,12 +1,16 @@ /* * Zero a page. 
* rdi page + * rsi order */ - .globl clear_page + .globl clear_pages .p2align 4 -clear_page: +clear_pages: + movl $4096/64,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx xorl %eax,%eax - movl $4096/64,%ecx .p2align 4 .Lloop: decl %ecx @@ -23,7 +27,7 @@ clear_page: jnz .Lloop nop ret -clear_page_end: +clear_pages_end: /* C stepping K8 run faster using the string instructions. It is also a lot simpler. Use this when possible */ @@ -32,19 +36,22 @@ clear_page_end: .section .altinstructions,"a" .align 8 - .quad clear_page - .quad clear_page_c + .quad clear_pages + .quad clear_pages_c .byte X86_FEATURE_K8_C - .byte clear_page_end-clear_page - .byte clear_page_c_end-clear_page_c + .byte clear_pages_end-clear_pages + .byte clear_pages_c_end-clear_pages_c .previous .section .altinstr_replacement,"ax" -clear_page_c: - movl $4096/8,%ecx +clear_pages_c: + movl $4096/8,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx xorl %eax,%eax rep stosq ret -clear_page_c_end: +clear_pages_c_end: .previous Index: linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux-2.6.10.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-02-14 10:38:12.000000000 -0800 +++ linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c 2005-02-16 16:05:26.000000000 -0800 @@ -108,7 +108,7 @@ EXPORT_SYMBOL(pci_mem_start); #endif EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_pages); EXPORT_SYMBOL(cpu_pda); #ifdef CONFIG_SMP Index: linux-2.6.10/include/linux/sysctl.h =================================================================== --- linux-2.6.10.orig/include/linux/sysctl.h 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/include/linux/sysctl.h 2005-02-16 16:05:30.000000000 -0800 @@ -169,6 +169,9 @@ enum VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ + VM_SCRUB_START=30, /* percentage * 10 at which to start scrubd */ + VM_SCRUB_STOP=31, /* percentage * 10 at which to stop scrubd */ + VM_SCRUB_LOAD=32, /* Load factor at which not to scrub anymore */ }; Index: linux-2.6.10/arch/ia64/sn/kernel/bte.c =================================================================== --- linux-2.6.10.orig/arch/ia64/sn/kernel/bte.c 2005-02-14 10:38:12.000000000 -0800 +++ linux-2.6.10/arch/ia64/sn/kernel/bte.c 2005-02-16 16:05:34.000000000 -0800 @@ -3,7 +3,9 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * Support for zeroing pages, Christoph Lameter, SGI, December 2004. 
*/ #include @@ -20,6 +22,8 @@ #include #include #include +#include +#include #include @@ -30,7 +34,7 @@ /* two interfaces on two btes */ #define MAX_INTERFACES_TO_TRY 4 -static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface) +static inline struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface) { nodepda_t *tmp_nodepda; @@ -199,6 +203,7 @@ retry_bteop: } while ((transfer_stat = *bte->most_rcnt_na) == BTE_WORD_BUSY) { + cpu_relax(); if (ia64_get_itc() > itc_end) { BTE_PRINTK(("BTE timeout nasid 0x%x bte%d IBLS = 0x%lx na 0x%lx\n", NASID_GET(bte->bte_base_addr), bte->bte_num, @@ -449,5 +454,25 @@ void bte_init_node(nodepda_t * mynodepda mynodepda->bte_if[i].cleanup_active = 0; mynodepda->bte_if[i].bh_error = 0; } +} + +static int bte_start_bzero(void *p, unsigned long len) +{ + + /* Check limitations. + 1. System must be running (weird things happen during bootup) + 2. Size >64KB. Smaller requests cause too much bte traffic + */ + if (len >= BTE_MAX_XFER || len < 60000 || system_state != SYSTEM_RUNNING) + return EINVAL; + + return bte_zero(ia64_tpa(p), len, 0, NULL); +} + +static struct zero_driver bte_bzero = { + .start = bte_start_bzero, +}; +void sn_bte_bzero_init(void) { + register_zero_driver(&bte_bzero); } Index: linux-2.6.10/include/linux/mmzone.h =================================================================== --- linux-2.6.10.orig/include/linux/mmzone.h 2005-02-14 10:38:16.000000000 -0800 +++ linux-2.6.10/include/linux/mmzone.h 2005-02-16 16:05:30.000000000 -0800 @@ -51,7 +51,7 @@ struct per_cpu_pages { }; struct per_cpu_pageset { - struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ + struct per_cpu_pages pcp[3]; /* 0: hot. 1: cold 2: cold zeroed pages */ #ifdef CONFIG_NUMA unsigned long numa_hit; /* allocated in intended node */ unsigned long numa_miss; /* allocated in non intended node */ @@ -107,10 +107,14 @@ struct per_cpu_pageset { * ZONE_HIGHMEM > 896 MB only page cache and user processes */ +#define NOT_ZEROED 0 +#define ZEROED 1 + struct zone { /* Fields commonly accessed by the page allocator */ unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; + unsigned long zero_pages; /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several @@ -127,7 +131,7 @@ struct zone { * free areas of different sizes */ spinlock_t lock; - struct free_area free_area[MAX_ORDER]; + struct free_area free_area[2][MAX_ORDER]; ZONE_PADDING(_pad1_) @@ -262,6 +266,9 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + + wait_queue_head_t kscrubd_wait; + struct task_struct *kscrubd; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -270,9 +277,9 @@ typedef struct pglist_data { extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat); + unsigned long *free, unsigned long *zero, struct pglist_data *pgdat); void get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free); + unsigned long *free, unsigned long *zero); void build_all_zonelists(void); void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, Index: linux-2.6.10/mm/readahead.c =================================================================== --- linux-2.6.10.orig/mm/readahead.c 2005-02-14 10:38:16.000000000 -0800 +++ 
linux-2.6.10/mm/readahead.c 2005-02-16 16:05:30.000000000 -0800 @@ -573,7 +573,8 @@ unsigned long max_sane_readahead(unsigne unsigned long active; unsigned long inactive; unsigned long free; + unsigned long zero; - __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id())); + __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(numa_node_id())); return min(nr, (inactive + free) / 2); }
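
For reference, here is a minimal sketch of a zeroing driver built on the register_zero_driver() interface added by this patch. It is illustrative only and not part of the patch: my_dma_zero() stands in for a hypothetical hardware engine, while struct zero_driver, register_zero_driver() and the fallback in mm/scrubd.c are as introduced above. The start callback returns 0 once the block has been zeroed; any other return value makes kscrubd fall back to prep_zero_page() on the cpu.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/scrub.h>
#include <asm/page.h>

/* Hypothetical hardware helper: zero len bytes at physical address pa,
 * returning 0 on success. Not a real kernel interface. */
extern int my_dma_zero(unsigned long pa, unsigned long len);

static int my_start_bzero(void *p, unsigned long len)
{
	/* Small blocks are not worth offloading; returning nonzero makes
	 * kscrubd zero the block with the cpu instead. */
	if (len < 64 * 1024)
		return -EINVAL;

	return my_dma_zero(__pa(p), len);
}

static struct zero_driver my_bzero = {
	.start = my_start_bzero,
};

static int __init my_bzero_init(void)
{
	register_zero_driver(&my_bzero);
	return 0;
}
module_init(my_bzero_init);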
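
Callers need no changes to benefit: a __GFP_ZERO allocation is served from the ZEROED free area whenever zone->zero_pages covers the request, and otherwise buffered_rmqueue() zeroes the page in the hot path as before. A caller-side sketch (illustrative only; alloc_zeroed_table() is a made-up example function):

#include <linux/gfp.h>
#include <linux/mm.h>

static unsigned long *alloc_zeroed_table(void)
{
	struct page *page;

	/* May be satisfied from the pool of prezeroed pages maintained
	 * by kscrubd; falls back to hot zeroing otherwise. */
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return NULL;

	return (unsigned long *)page_address(page);
}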