Binary files 2.4.0-test7-pre5aa1/ID and 2.4.0-test7-pre5aa1-cz/ID differ
diff -urN 2.4.0-test7-pre5aa1/arch/i386/mm/init.c 2.4.0-test7-pre5aa1-cz/arch/i386/mm/init.c
--- 2.4.0-test7-pre5aa1/arch/i386/mm/init.c	Thu Aug 17 19:57:23 2000
+++ 2.4.0-test7-pre5aa1-cz/arch/i386/mm/init.c	Tue Aug 22 13:09:05 2000
@@ -606,7 +606,7 @@
 	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
 	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
-		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+		nr_free_pages() << (PAGE_SHIFT-10),
 		max_mapnr << (PAGE_SHIFT-10),
 		codesize >> 10,
 		reservedpages << (PAGE_SHIFT-10),
diff -urN 2.4.0-test7-pre5aa1/fs/buffer.c 2.4.0-test7-pre5aa1-cz/fs/buffer.c
--- 2.4.0-test7-pre5aa1/fs/buffer.c	Tue Aug 22 01:23:50 2000
+++ 2.4.0-test7-pre5aa1-cz/fs/buffer.c	Tue Aug 22 13:14:18 2000
@@ -119,12 +119,12 @@
 			   when trying to refill buffers. */
 		int interval;	/* jiffies delay between kupdate flushes */
 		int age_buffer;	/* Time for normal buffer to age before we flush it */
-		int dummy1;	/* unused, was age_super */
+		int age_super;	/* Time for superblock to age before we flush it */
 		int dummy2;	/* unused */
 		int dummy3;	/* unused */
 	} b_un;
 	unsigned int data[N_PARAM];
-} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
+} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 30*HZ, 1884, 2}};
 
 /* These are the min and max parameter values that we will allow to be assigned */
 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
@@ -894,7 +894,7 @@
 
 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
 {
-	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
+	bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
 	refile_buffer(bh);
 }
 
@@ -1078,6 +1078,21 @@
 }
 
 /*
+ * After reaping some pages from the page-cache, vmscan may call
+ * this function to flush buffer-heads out of their slab cache.
+ */
+int shrink_buffer_headers(zone_t * zone)
+{
+	int nr_pages = 0;
+	/*
+	 * Must not be called before the buffer-head cache is set-up.
+	 */
+	kmem_cache_shrink(bh_cachep, zone, &nr_pages);
+
+	return nr_pages;
+}
+
+/*
  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
  * no-buffer-head deadlock. Return NULL on failure; waiting for
  * buffer heads is now handled in create_buffers().
@@ -1322,6 +1337,7 @@
 	 */
 	if (!offset) {
 		if (!try_to_free_buffers(page, 0)) {
+			BUG();
 			atomic_inc(&buffermem_pages);
 			return 0;
 		}
@@ -1330,7 +1346,14 @@
 	return 1;
 }
 
-static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
+#define create_empty_buffers(page, inode, blocksize) \
+do { \
+	if (!(page)->buffers) \
+		__create_empty_buffers(page, inode, blocksize); \
+	SetPageBufferAge(page); \
+} while(0)
+
+static void __create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
 {
 	struct buffer_head *bh, *head, *tail;
 
@@ -1351,27 +1374,13 @@
 	page_cache_get(page);
 }
 
-/*
- * We are taking a block for data and we don't want any output from any
- * buffer-cache aliases starting from return from that function and
- * until the moment when something will explicitly mark the buffer
- * dirty (hopefully that will not happen until we will free that block ;-)
- * We don't even need to mark it not-uptodate - nobody can expect
- * anything from a newly allocated buffer anyway. We used to used
- * unmap_buffer() for such invalidation, but that was wrong.
We definitely - * don't want to mark the alias unmapped, for example - it would confuse - * anyone who might pick it with bread() afterwards... - */ - static void unmap_underlying_metadata(struct buffer_head * bh) { struct buffer_head *old_bh; old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); if (old_bh) { - mark_buffer_clean(old_bh); - wait_on_buffer(old_bh); - clear_bit(BH_Req, &old_bh->b_state); + unmap_buffer(old_bh); /* Here we could run brelse or bforget. We use bforget because it will try to put the buffer in the freelist. */ @@ -1392,8 +1401,7 @@ if (!PageLocked(page)) BUG(); - if (!page->buffers) - create_empty_buffers(page, inode, inode->i_sb->s_blocksize); + create_empty_buffers(page, inode, inode->i_sb->s_blocksize); head = page->buffers; block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -1448,8 +1456,7 @@ char *kaddr = (char *)kmap(page); blocksize = inode->i_sb->s_blocksize; - if (!page->buffers) - create_empty_buffers(page, inode, blocksize); + create_empty_buffers(page, inode, blocksize); head = page->buffers; bbits = inode->i_sb->s_blocksize_bits; @@ -1558,8 +1565,7 @@ if (!PageLocked(page)) PAGE_BUG(page); blocksize = inode->i_sb->s_blocksize; - if (!page->buffers) - create_empty_buffers(page, inode, blocksize); + create_empty_buffers(page, inode, blocksize); head = page->buffers; blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; @@ -2136,25 +2142,53 @@ * * This all is required so that we can free up memory * later. - * - * Wait: - * 0 - no wait (this does not get called - see try_to_free_buffers below) - * 1 - start IO for dirty buffers - * 2 - wait for completion of locked buffers */ -static void sync_page_buffers(struct buffer_head *bh, int wait) +static int sync_page_buffers(struct buffer_head *bh) { struct buffer_head * tmp = bh; + int ret, i; +#if BITS_PER_LONG < (MAX_BUF_PER_PAGE+1) +#error wait_IO is too short, convert to it to array for your architecture in this define +#else + unsigned long wait_IO = 0, clean = 0; +#endif + i = 0; do { struct buffer_head *p = tmp; tmp = tmp->b_this_page; - if (buffer_locked(p)) { - if (wait > 1) - __wait_on_buffer(p); - } else if (buffer_dirty(p)) + + if (buffer_dirty(p)) ll_rw_block(WRITE, 1, &p); + + if (buffer_locked(p)) { + if (test_and_set_bit(BH_Wait_IO, &p->b_state)) { + if (buffer_locked(p)) + wait_IO |= 1UL << i; + else { + clear_bit(BH_Wait_IO, &p->b_state); + clean |= 1UL << i; + } + } + } else + clean |= 1UL << i; + + i++; } while (tmp != bh); + + ret = (clean | wait_IO) == ((1UL << i) - 1); + + while (wait_IO) { + struct buffer_head *p = tmp; + tmp = tmp->b_this_page; + if (wait_IO & 1) + wait_on_buffer(p); + if (tmp == bh) + break; + wait_IO >>= 1; + } + + return ret; } /* @@ -2174,11 +2208,13 @@ * obtain a reference to a buffer head within a page. So we must * lock out all of these paths to cleanly toss the page. 
*/ -int try_to_free_buffers(struct page * page, int wait) +int try_to_free_buffers(struct page * page, int gfp_mask) { struct buffer_head * tmp, * bh = page->buffers; int index = BUFSIZE_INDEX(bh->b_size); + int pass = 0; + again: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); spin_lock(&free_list[index].lock); @@ -2223,9 +2259,11 @@ /* Uhhuh, start writeback so that we don't end up with all dirty pages */ spin_unlock(&free_list[index].lock); write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); - if (wait) - sync_page_buffers(bh, wait); + spin_unlock(&lru_list_lock); + if ((gfp_mask & __GFP_IO) && !pass && sync_page_buffers(bh)) { + pass = 1; + goto again; + } return 0; } diff -urN 2.4.0-test7-pre5aa1/fs/dcache.c 2.4.0-test7-pre5aa1-cz/fs/dcache.c --- 2.4.0-test7-pre5aa1/fs/dcache.c Tue Aug 22 01:23:50 2000 +++ 2.4.0-test7-pre5aa1-cz/fs/dcache.c Tue Aug 22 13:16:28 2000 @@ -339,9 +339,17 @@ if (tmp == &dentry_unused) break; - dentry_stat.nr_unused--; - list_del_init(tmp); dentry = list_entry(tmp, struct dentry, d_lru); + list_del(tmp); + + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_add(tmp, &dentry_unused); + continue; + } + + dentry_stat.nr_unused--; + INIT_LIST_HEAD(tmp); /* Unused dentry with a count? */ if (atomic_read(&dentry->d_count)) @@ -495,6 +503,7 @@ if (!atomic_read(&dentry->d_count)) { list_del(&dentry->d_lru); list_add(&dentry->d_lru, dentry_unused.prev); + dentry->d_flags &= ~DCACHE_REFERENCED; found++; } /* @@ -551,20 +560,17 @@ * ... * 6 - base-level: try to shrink a bit. */ -int shrink_dcache_memory(int priority, unsigned int gfp_mask) +int shrink_dcache_memory(int priority, zone_t * zone) { - int count = 0; + int count = 0, nr_pages = 0; + if (priority) count = dentry_stat.nr_unused / priority; prune_dcache(count); - /* FIXME: kmem_cache_shrink here should tell us - the number of pages freed, and it should - work in a __GFP_DMA/__GFP_HIGHMEM behaviour - to free only the interesting pages in - function of the needs of the current allocation. 
*/ - kmem_cache_shrink(dentry_cache); - return 0; + kmem_cache_shrink(dentry_cache, zone, &nr_pages); + + return nr_pages; } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) @@ -723,6 +729,7 @@ continue; } __dget_locked(dentry); + dentry->d_flags |= DCACHE_REFERENCED; spin_unlock(&dcache_lock); return dentry; } diff -urN 2.4.0-test7-pre5aa1/fs/inode.c 2.4.0-test7-pre5aa1-cz/fs/inode.c --- 2.4.0-test7-pre5aa1/fs/inode.c Tue Aug 22 01:23:51 2000 +++ 2.4.0-test7-pre5aa1-cz/fs/inode.c Tue Aug 22 13:09:05 2000 @@ -417,8 +417,9 @@ void prune_icache(int goal) { - LIST_HEAD(list); - struct list_head *entry, *freeable = &list; + LIST_HEAD(freeable); + LIST_HEAD(unfreeable); + struct list_head *entry; int count = 0; struct inode * inode; @@ -426,49 +427,53 @@ /* go simple and safe syncing everything before starting */ sync_all_inodes(); - entry = inode_unused.prev; - while (entry != &inode_unused) + while ((entry = inode_unused.prev) != &inode_unused) { - struct list_head *tmp = entry; + list_del(entry); + inode = INODE(entry); - entry = entry->prev; - inode = INODE(tmp); if (inode->i_state & (I_FREEING|I_CLEAR)) BUG(); - if (!CAN_UNUSE(inode)) - continue; if (atomic_read(&inode->i_count)) BUG(); - list_del(tmp); + + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + list_add(entry, &inode_unused); + continue; + } + + if (!CAN_UNUSE(inode)) { + list_add(entry, &unfreeable); + continue; + } + list_del(&inode->i_hash); INIT_LIST_HEAD(&inode->i_hash); - list_add(tmp, freeable); + list_add(entry, &freeable); inode->i_state |= I_FREEING; count++; if (!--goal) break; } inodes_stat.nr_unused -= count; + list_splice(&unfreeable, &inode_unused); spin_unlock(&inode_lock); - dispose_list(freeable); + dispose_list(&freeable); } -int shrink_icache_memory(int priority, int gfp_mask) +int shrink_icache_memory(int priority, zone_t * zone) { - int count = 0; + int count = 0, nr_pages = 0; if (priority) count = inodes_stat.nr_unused / priority; prune_icache(count); - /* FIXME: kmem_cache_shrink here should tell us - the number of pages freed, and it should - work in a __GFP_DMA/__GFP_HIGHMEM behaviour - to free only the interesting pages in - function of the needs of the current allocation. 
*/ - kmem_cache_shrink(inode_cachep); - return 0; + kmem_cache_shrink(inode_cachep, zone, &nr_pages); + + return nr_pages; } /* @@ -495,6 +500,7 @@ continue; if (find_actor && !find_actor(inode, ino, opaque)) continue; + inode->i_state |= I_REFERENCED; break; } return inode; diff -urN 2.4.0-test7-pre5aa1/include/linux/cache.h 2.4.0-test7-pre5aa1-cz/include/linux/cache.h --- 2.4.0-test7-pre5aa1/include/linux/cache.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/cache.h Tue Aug 22 14:56:35 2000 @@ -1,6 +1,7 @@ #ifndef __LINUX_CACHE_H #define __LINUX_CACHE_H +#include #include #ifndef L1_CACHE_ALIGN @@ -13,6 +14,14 @@ #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) +#endif + +#ifndef ____cacheline_aligned_in_smp +#ifdef CONFIG_SMP +#define ____cacheline_aligned_in_smp ____cacheline_aligned +#else +#define ____cacheline_aligned_in_smp +#endif /* CONFIG_SMP */ #endif #ifndef __cacheline_aligned diff -urN 2.4.0-test7-pre5aa1/include/linux/dcache.h 2.4.0-test7-pre5aa1-cz/include/linux/dcache.h --- 2.4.0-test7-pre5aa1/include/linux/dcache.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/dcache.h Tue Aug 22 14:56:35 2000 @@ -115,6 +115,7 @@ * If this dentry points to a directory, then * s_nfsd_free_path semaphore will be down */ +#define DCACHE_REFERENCED 0x0008 extern spinlock_t dcache_lock; @@ -163,11 +164,11 @@ #define shrink_dcache() prune_dcache(0) struct zone_struct; /* dcache memory management */ -extern int shrink_dcache_memory(int, unsigned int); +extern int shrink_dcache_memory(int, struct zone_struct *); extern void prune_dcache(int); /* icache memory management (defined in linux/fs/inode.c) */ -extern int shrink_icache_memory(int, int); +extern int shrink_icache_memory(int, struct zone_struct *); extern void prune_icache(int); /* only used at mount-time */ diff -urN 2.4.0-test7-pre5aa1/include/linux/fs.h 2.4.0-test7-pre5aa1-cz/include/linux/fs.h --- 2.4.0-test7-pre5aa1/include/linux/fs.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/fs.h Tue Aug 22 14:56:35 2000 @@ -203,6 +203,7 @@ #define BH_Mapped 4 /* 1 if the buffer has a disk mapping */ #define BH_New 5 /* 1 if the buffer is new and not yet written out */ #define BH_Protected 6 /* 1 if the buffer is protected */ +#define BH_Wait_IO 7 /* 1 if the buffer is under I/O for too long */ /* * Try to keep the most commonly used fields in single cache lines (16 @@ -245,6 +246,7 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); void init_buffer(struct buffer_head *, bh_end_io_t *, void *); +unsigned int shrink_buffer_heads(unsigned int); #define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0) @@ -450,6 +452,7 @@ #define I_LOCK 2 #define I_FREEING 4 #define I_CLEAR 8 +#define I_REFERENCED 16 extern void __mark_inode_dirty(struct inode *); static inline void mark_inode_dirty(struct inode *inode) @@ -938,7 +941,9 @@ extern int fs_may_remount_ro(struct super_block *); -extern int try_to_free_buffers(struct page *, int); +struct zone_struct; +extern int shrink_buffer_headers(struct zone_struct *); +extern int FASTCALL(try_to_free_buffers(struct page *, int)); extern void refile_buffer(struct buffer_head * buf); #define BUF_CLEAN 0 diff -urN 2.4.0-test7-pre5aa1/include/linux/highmem.h 2.4.0-test7-pre5aa1-cz/include/linux/highmem.h --- 2.4.0-test7-pre5aa1/include/linux/highmem.h Tue Aug 22 01:30:04 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/highmem.h Tue Aug 22 14:56:39 2000 @@ -11,7 +11,7 
@@ #include /* declarations for linux/mm/highmem.c */ -FASTCALL(unsigned int nr_free_highpages(void)); +extern unsigned int nr_free_highpages(void); extern struct page * prepare_highmem_swapout(struct page *); extern struct page * replace_with_highmem(struct page *); @@ -19,7 +19,7 @@ #else /* CONFIG_HIGHMEM */ -extern inline unsigned int nr_free_highpages(void) { return 0; } +#define nr_free_highpages() 0UL #define prepare_highmem_swapout(page) page #define replace_with_highmem(page) page diff -urN 2.4.0-test7-pre5aa1/include/linux/locks.h 2.4.0-test7-pre5aa1-cz/include/linux/locks.h --- 2.4.0-test7-pre5aa1/include/linux/locks.h Tue Aug 22 01:30:04 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/locks.h Tue Aug 22 14:56:39 2000 @@ -29,6 +29,7 @@ extern inline void unlock_buffer(struct buffer_head *bh) { clear_bit(BH_Lock, &bh->b_state); + clear_bit(BH_Wait_IO, &bh->b_state); wake_up(&bh->b_wait); } diff -urN 2.4.0-test7-pre5aa1/include/linux/mm.h 2.4.0-test7-pre5aa1-cz/include/linux/mm.h --- 2.4.0-test7-pre5aa1/include/linux/mm.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/mm.h Tue Aug 22 14:56:35 2000 @@ -168,7 +168,7 @@ #define PG_uptodate 3 #define PG_dirty 4 #define PG_decr_after 5 -#define PG_unused_01 6 +#define PG_buffer_age 6 #define PG__unused_02 7 #define PG_slab 8 #define PG_swap_cache 9 @@ -225,6 +225,10 @@ #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) +#define ClearPageBufferAge(page) clear_bit(PG_buffer_age, &(page)->flags) +#define SetPageBufferAge(page) set_bit(PG_buffer_age, &(page)->flags) +#define TestandClearPageBufferAge(page) test_and_clear_bit(PG_buffer_age, &(page)->flags) + /* * Error return values for the *_nopage functions */ @@ -310,7 +314,7 @@ * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ -extern struct page * FASTCALL(__alloc_pages(zonelist_t *zonelist, unsigned long order)); +extern struct page * FASTCALL(__alloc_pages(gfpmask_zone_t *, unsigned long order)); extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order); #ifndef CONFIG_DISCONTIGMEM @@ -321,7 +325,7 @@ */ if (order >= MAX_ORDER) return NULL; - return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order); + return __alloc_pages(contig_page_data.node_gfpmask_zone+gfp_mask, order); } #else /* !CONFIG_DISCONTIGMEM */ extern struct page * alloc_pages(int gfp_mask, unsigned long order); @@ -415,7 +419,7 @@ /* filemap.c */ extern void remove_inode_page(struct page *); extern unsigned long page_unuse(struct page *); -extern int shrink_mmap(int, int); +extern int FASTCALL(shrink_mmap(int, int, zone_t *)); extern void truncate_inode_pages(struct address_space *, loff_t); /* generic vm_area_ops exported for stackable file systems */ diff -urN 2.4.0-test7-pre5aa1/include/linux/mmzone.h 2.4.0-test7-pre5aa1-cz/include/linux/mmzone.h --- 2.4.0-test7-pre5aa1/include/linux/mmzone.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/mmzone.h Tue Aug 22 14:56:35 2000 @@ -21,16 +21,26 @@ struct pglist_data; +/* + * Memory balancing internally to the node can work correctly only on + * classzone basis while handling overlapped classzones. 
+ */ typedef struct zone_struct { /* * Commonly accessed fields: */ - spinlock_t lock; unsigned long offset; unsigned long free_pages; - char low_on_memory; - char zone_wake_kswapd; + + /* + * Memory balancing is all classzone based, all the below + * fields refer to the classzone. The classzone includes + * the current zone plus all the lower zones in the MM. + */ + unsigned long classzone_free_pages; unsigned long pages_min, pages_low, pages_high; + char zone_wake_kswapd; + int nr_zone; /* * free areas of different sizes @@ -57,27 +67,34 @@ #define MAX_NR_ZONES 3 /* - * One allocation request operates on a zonelist. A zonelist - * is a list of zones, the first one is the 'goal' of the - * allocation, the other zones are fallback zones, in decreasing - * priority. - * - * Right now a zonelist takes up less than a cacheline. We never - * modify it apart from boot-up, and only a few indices are used, - * so despite the zonelist table being relatively big, the cache - * footprint of this construct is very small. + * The pgdat->node_gfpmask_zone[] array tell us which classzone + * we should allocate from given a certain gfpmask. It translates + * the gfpmask to a classzone. */ -typedef struct zonelist_struct { - zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited +typedef struct gfpmask_zone_s { + zone_t * classzone; int gfp_mask; -} zonelist_t; +} gfpmask_zone_t; #define NR_GFPINDEX 0x100 +#define NR_VM_LRU 1 +#define LRU_CACHE 0 + +typedef struct vm_lru_s { + /* keep lock in a separate cacheline to avoid ping pong in SMP */ + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head head; + unsigned long nr_pages; +} vm_lru_t; + struct bootmem_data; typedef struct pglist_data { + spinlock_t freelist_lock ____cacheline_aligned_in_smp; zone_t node_zones[MAX_NR_ZONES]; - zonelist_t node_zonelists[NR_GFPINDEX]; + gfpmask_zone_t node_gfpmask_zone[NR_GFPINDEX]; + int nr_zones; + vm_lru_t vm_lru[NR_VM_LRU]; struct page *node_mem_map; unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; @@ -92,8 +109,7 @@ extern pg_data_t *pgdat_list; #define memclass(pgzone, tzone) (((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \ - && (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \ - ((tzone) - (pgzone)->zone_pgdat->node_zones))) + && ((pgzone) <= (tzone))) /* * The following two are not meant for general usage. 
They are here as diff -urN 2.4.0-test7-pre5aa1/include/linux/sched.h 2.4.0-test7-pre5aa1-cz/include/linux/sched.h --- 2.4.0-test7-pre5aa1/include/linux/sched.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/sched.h Tue Aug 22 14:56:35 2000 @@ -373,6 +373,8 @@ u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; +/* Local freelist */ + struct list_head local_pages; int allocation_order, nr_local_pages; }; /* @@ -388,6 +390,7 @@ #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_VFORK 0x00001000 /* Wake up parent in mm_release */ +#define PF_FREE_PAGES 0x00002000 /* In the middle of memory balancing */ #define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ @@ -452,7 +455,11 @@ blocked: {{0}}, \ sigqueue: NULL, \ sigqueue_tail: &tsk.sigqueue, \ - alloc_lock: SPIN_LOCK_UNLOCKED \ + alloc_lock: SPIN_LOCK_UNLOCKED, \ + local_pages: { \ + next: &init_task.local_pages, \ + prev: &init_task.local_pages, \ + }, \ } diff -urN 2.4.0-test7-pre5aa1/include/linux/slab.h 2.4.0-test7-pre5aa1-cz/include/linux/slab.h --- 2.4.0-test7-pre5aa1/include/linux/slab.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/slab.h Tue Aug 22 14:56:35 2000 @@ -52,14 +52,14 @@ void (*)(void *, kmem_cache_t *, unsigned long), void (*)(void *, kmem_cache_t *, unsigned long)); extern int kmem_cache_destroy(kmem_cache_t *); -extern int kmem_cache_shrink(kmem_cache_t *); +extern int kmem_cache_shrink(kmem_cache_t *, zone_t *, int *); extern void *kmem_cache_alloc(kmem_cache_t *, int); extern void kmem_cache_free(kmem_cache_t *, void *); extern void *kmalloc(size_t, int); extern void kfree(const void *); -extern void kmem_cache_reap(int); +extern int kmem_cache_reap(int, zone_t *); extern int slabinfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data); extern int slabinfo_write_proc(struct file *file, const char *buffer, diff -urN 2.4.0-test7-pre5aa1/include/linux/swap.h 2.4.0-test7-pre5aa1-cz/include/linux/swap.h --- 2.4.0-test7-pre5aa1/include/linux/swap.h Tue Aug 22 01:30:04 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/swap.h Tue Aug 22 14:56:39 2000 @@ -64,10 +64,8 @@ }; extern int nr_swap_pages; -FASTCALL(unsigned int nr_free_pages(void)); -FASTCALL(unsigned int nr_free_buffer_pages(void)); -FASTCALL(unsigned int nr_free_highpages(void)); -extern int nr_lru_pages; +extern unsigned long nr_free_pages(void); +extern unsigned long nr_free_buffer_pages(void); extern atomic_t nr_async_pages; extern struct address_space swapper_space; extern atomic_t page_cache_size; @@ -80,13 +78,13 @@ struct zone_t; /* linux/ipc/shm.c */ -extern int shm_swap(int, int); +extern int shm_swap(int, zone_t *); /* linux/mm/swap.c */ extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(unsigned int gfp_mask); +extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *, int); @@ -163,27 +161,38 @@ /* * Helper macros for lru_pages handling. 
*/ -#define lru_cache_add(page) \ -do { \ - spin_lock(&pagemap_lru_lock); \ - list_add(&(page)->lru, &lru_cache); \ - nr_lru_pages++; \ - spin_unlock(&pagemap_lru_lock); \ -} while (0) +#define __vm_lru_add(page, vm_lru) \ +do { \ + list_add(&(page)->lru, &(vm_lru)->head); \ + (vm_lru)->nr_pages++; \ +} while(0) -#define __lru_cache_del(page) \ +#define __vm_lru_del(page, vm_lru) \ do { \ list_del(&(page)->lru); \ - nr_lru_pages--; \ + (vm_lru)->nr_pages--; \ +} while(0) + +#define lru_cache_add(page) \ +do { \ + vm_lru_t * vm_lru = (page)->zone->zone_pgdat->vm_lru; \ + vm_lru_t * lru_cache = &vm_lru[LRU_CACHE]; \ + \ + spin_lock(&lru_cache->lock); \ + __vm_lru_add(page, lru_cache); \ + spin_unlock(&lru_cache->lock); \ } while (0) -#define lru_cache_del(page) \ -do { \ - if (!PageLocked(page)) \ - BUG(); \ - spin_lock(&pagemap_lru_lock); \ - __lru_cache_del(page); \ - spin_unlock(&pagemap_lru_lock); \ +#define lru_cache_del(page) \ +do { \ + vm_lru_t * vm_lru = (page)->zone->zone_pgdat->vm_lru; \ + vm_lru_t * lru_cache = &vm_lru[LRU_CACHE]; \ + \ + if (!PageLocked(page)) \ + BUG(); \ + spin_lock(&lru_cache->lock); \ + __vm_lru_del(page, lru_cache); \ + spin_unlock(&lru_cache->lock); \ } while (0) extern spinlock_t swaplock; diff -urN 2.4.0-test7-pre5aa1/ipc/shm.c 2.4.0-test7-pre5aa1-cz/ipc/shm.c --- 2.4.0-test7-pre5aa1/ipc/shm.c Tue Aug 22 01:23:54 2000 +++ 2.4.0-test7-pre5aa1-cz/ipc/shm.c Tue Aug 22 13:09:05 2000 @@ -129,7 +129,7 @@ static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data); #endif -static void zshm_swap (int prio, int gfp_mask); +static void zshm_swap (int prio, zone_t *zone); static void zmap_unuse(swp_entry_t entry, struct page *page); static void shmzero_open(struct vm_area_struct *shmd); static void shmzero_close(struct vm_area_struct *shmd); @@ -1465,7 +1465,7 @@ #define RETRY 1 #define FAILED 2 -static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, int *counter, struct page **outpage) +static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage) { pte_t page; struct page *page_map; @@ -1474,7 +1474,7 @@ if (!pte_present(page)) return RETRY; page_map = pte_page(page); - if (page_map->zone->free_pages > page_map->zone->pages_high) + if (!memclass(page_map->zone, zone)) return RETRY; if (shp->id != zero_id) swap_attempts++; @@ -1527,22 +1527,23 @@ static unsigned long swap_id; /* currently being swapped */ static unsigned long swap_idx; /* next to swap */ -int shm_swap (int prio, int gfp_mask) +int shm_swap (int prio, zone_t *zone) { struct shmid_kernel *shp; swp_entry_t swap_entry; unsigned long id, idx; - int loop = 0; + int loop; int counter; struct page * page_map; - zshm_swap(prio, gfp_mask); - counter = shm_rss / (prio + 1); + zshm_swap(prio, zone); + counter = shm_rss / prio; if (!counter) return 0; if (shm_swap_preop(&swap_entry)) return 0; + loop = 0; shm_lockall(); check_id: shp = shm_get(swap_id); @@ -1568,7 +1569,7 @@ if (idx >= shp->shm_npages) goto next_id; - switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) { + switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { case RETRY: goto check_table; case FAILED: goto failed; } @@ -1854,7 +1855,7 @@ spin_unlock(&zmap_list_lock); } -static void zshm_swap (int prio, int gfp_mask) +static void zshm_swap (int prio, zone_t *zone) { struct shmid_kernel *shp; swp_entry_t swap_entry; @@ -1863,7 +1864,7 @@ int 
counter; struct page * page_map; - counter = zshm_rss / (prio + 1); + counter = zshm_rss / prio; if (!counter) return; next: @@ -1899,7 +1900,7 @@ goto next_id; } - switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) { + switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { case RETRY: goto check_table; case FAILED: goto failed; } diff -urN 2.4.0-test7-pre5aa1/ipc/util.c 2.4.0-test7-pre5aa1-cz/ipc/util.c --- 2.4.0-test7-pre5aa1/ipc/util.c Sat Jun 24 16:03:03 2000 +++ 2.4.0-test7-pre5aa1-cz/ipc/util.c Tue Aug 22 13:09:05 2000 @@ -345,7 +345,7 @@ return; } -int shm_swap (int prio, int gfp_mask) +int shm_swap (int prio, int gfp_mask, zone_t *zone) { return 0; } diff -urN 2.4.0-test7-pre5aa1/kernel/fork.c 2.4.0-test7-pre5aa1-cz/kernel/fork.c --- 2.4.0-test7-pre5aa1/kernel/fork.c Tue Aug 22 01:23:54 2000 +++ 2.4.0-test7-pre5aa1-cz/kernel/fork.c Tue Aug 22 13:09:05 2000 @@ -620,6 +620,8 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; + INIT_LIST_HEAD(&p->local_pages); + retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) diff -urN 2.4.0-test7-pre5aa1/mm/filemap.c 2.4.0-test7-pre5aa1-cz/mm/filemap.c --- 2.4.0-test7-pre5aa1/mm/filemap.c Tue Aug 22 13:08:39 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/filemap.c Tue Aug 22 13:21:09 2000 @@ -44,7 +44,6 @@ atomic_t page_cache_size = ATOMIC_INIT(0); unsigned int page_hash_bits; struct page **page_hash_table; -struct list_head lru_cache; static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; /* @@ -125,7 +124,6 @@ head = &inode->i_mapping->pages; spin_lock(&pagecache_lock); - spin_lock(&pagemap_lru_lock); curr = head->next; while (curr != head) { @@ -136,13 +134,12 @@ if (TryLockPage(page)) continue; - __lru_cache_del(page); + lru_cache_del(page); __remove_inode_page(page); UnlockPage(page); page_cache_release(page); } - spin_unlock(&pagemap_lru_lock); spin_unlock(&pagecache_lock); } @@ -245,30 +242,110 @@ spin_unlock(&pagecache_lock); } +#define VM_PAGE_OLD 0 +#define VM_PAGE_BH_OLD 1 +#define VM_PAGE_YOUNG 2 + +static inline int vm_page_age(struct page * page) +{ + int ret = VM_PAGE_OLD, referenced, buffer_age; + + /* + * We hold he lru list lock here but the page isn't locked yet. + * + * The overlapped buffer headers can go away from under us + * (and that's not a problem since there's no + * stability issue), but the mapping can't go away because + * we always run lru_cache_del() _before_ __remove_inode_pages(), + * and the real buffer cache can't go away because + * if somebody would be attempting to free the buffer cache + * from under us then the page wouldn't be in the LRU in first place + * (only shrink_mmap frees the buffer cache and unlink the page + * from the lru before starting for real). + */ + if (!page->buffers && !page->mapping) + PAGE_BUG(page); + + /* + * If the page isn't referenced it means it's + * not interesting data and we so must throw away also + * the buffer headers without caring about their age. + * + * If the page is referenced very often we could + * still want to drop the overlapped buffers, + * think at read(2)/write(2) case. BufferAge tell us + * if we should drop the buffers even if the + * page is referenced very often. + */ + + referenced = PageTestandClearReferenced(page); + if (!!page->buffers ^ !!page->mapping) { + if (referenced) + ret = VM_PAGE_YOUNG; + } else { + /* + * This is page cache with overlapped buffers. + * Always clear the buffer-age bit. 
+ * + * Note: the buffer could grow from under us, or + * the buffers could go away from under us, + * that's not a stability problem. + */ + buffer_age = TestandClearPageBufferAge(page); + + /* If the page wasn't referenced then it's old and that's all. */ + if (referenced) { + if (buffer_age) + /* The page and the overlapped bhs are both young */ + ret = VM_PAGE_YOUNG; + else + /* The page is young but the bh are old */ + ret = VM_PAGE_BH_OLD; + } + } + + return ret; +} + /* * nr_dirty represents the number of dirty pages that we will write async * before doing sync writes. We can only do sync writes if we can * wait for IO (__GFP_IO set). */ -int shrink_mmap(int priority, int gfp_mask) +int shrink_mmap(int priority, int gfp_mask, zone_t * zone) { - int ret = 0, count, nr_dirty; - struct list_head * page_lru; - struct page * page = NULL; - - count = nr_lru_pages / (priority + 1); - nr_dirty = priority; - - /* we need pagemap_lru_lock for list_del() ... subtle code below */ - spin_lock(&pagemap_lru_lock); - while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + int ret = 0, count, age, zone_state = 0; + struct list_head * page_lru, * lru_head, * dispose; + vm_lru_t * vm_lru_cache; + spinlock_t * lru_lock; + struct page * page; + + vm_lru_cache = &zone->zone_pgdat->vm_lru[LRU_CACHE]; + lru_lock = &vm_lru_cache->lock; + lru_head = &vm_lru_cache->head; + + /* we need lru_lock for list_del() ... subtle code below */ + spin_lock(lru_lock); + + count = vm_lru_cache->nr_pages / priority; + + while (count > 0 && (page_lru = lru_head->prev) != lru_head && !ret) { page = list_entry(page_lru, struct page, lru); list_del(page_lru); - if (PageTestandClearReferenced(page)) + age = vm_page_age(page); + + if (age == VM_PAGE_YOUNG) + goto dispose_continue; + + if (!memclass(page->zone, zone)) { + if (!(++zone_state % priority)) + count--; goto dispose_continue; + } count--; + /* * Avoid unscalable SMP locking for pages we can * immediate tell are untouchable.. @@ -284,7 +361,7 @@ the page so nobody else may SMP race with us running a lru_cache_del() (lru_cache_del() always run with the page locked down ;). */ - spin_unlock(&pagemap_lru_lock); + spin_unlock(lru_lock); /* avoid freeing the page while it's locked */ page_cache_get(page); @@ -294,15 +371,9 @@ * of zone - it's old. */ if (page->buffers) { - int wait; - /* - * 0 - free it if can do so without IO - * 1 - start write-out of dirty buffers - * 2 - wait for locked buffers - */ - wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 2 : 1 : 0; - if (!try_to_free_buffers(page, wait)) + if (!try_to_free_buffers(page, gfp_mask)) goto unlock_continue; + ret += shrink_buffer_headers(zone); /* page was locked, inode can't go away under us */ if (!page->mapping) { atomic_dec(&buffermem_pages); @@ -310,6 +381,9 @@ } } + if (age == VM_PAGE_BH_OLD) + goto unlock_continue; + /* Take the pagecache_lock spinlock held to avoid other tasks to notice the page while we are looking at its page count. If it's a pagecache-page we'll free it @@ -334,13 +408,6 @@ goto made_inode_progress; } - /* - * Page is from a zone we don't care about. - * Don't drop page cache entries in vain. - */ - if (page->zone->free_pages > page->zone->pages_high) - goto cache_unlock_continue; - /* is it a page-cache page? 
*/ if (page->mapping) { if (!PageDirty(page) && !pgcache_under_min()) { @@ -356,11 +423,18 @@ cache_unlock_continue: spin_unlock(&pagecache_lock); unlock_continue: - spin_lock(&pagemap_lru_lock); + spin_lock(lru_lock); UnlockPage(page); page_cache_release(page); dispose_continue: - list_add(page_lru, &lru_cache); + list_add(page_lru, lru_head); + + if (current->need_resched) { + spin_unlock(lru_lock); + current->state = TASK_RUNNING; + schedule(); + spin_lock(lru_lock); + } } goto out; @@ -369,13 +443,13 @@ made_buffer_progress: UnlockPage(page); page_cache_release(page); - ret = 1; - spin_lock(&pagemap_lru_lock); - /* nr_lru_pages needs the spinlock */ - nr_lru_pages--; + ret += 1; + spin_lock(lru_lock); + /* nr_pages needs the spinlock */ + vm_lru_cache->nr_pages--; out: - spin_unlock(&pagemap_lru_lock); + spin_unlock(lru_lock); return ret; } @@ -414,7 +488,6 @@ if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) continue; - bh->b_flushtime = jiffies; ll_rw_block(WRITE, 1, &bh); } while ((bh = bh->b_this_page) != head); return 0; @@ -519,7 +592,7 @@ if (PageLocked(page)) BUG(); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced)); + flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_buffer_age)); page->flags = flags | (1 << PG_locked); page_cache_get(page); page->index = offset; diff -urN 2.4.0-test7-pre5aa1/mm/numa.c 2.4.0-test7-pre5aa1-cz/mm/numa.c --- 2.4.0-test7-pre5aa1/mm/numa.c Thu Aug 17 19:57:44 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/numa.c Tue Aug 22 13:09:05 2000 @@ -33,7 +33,7 @@ struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order) { - return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order); + return __alloc_pages(NODE_DATA(nid)->node_gfpmask_zone + gfp_mask, order); } #ifdef CONFIG_DISCONTIGMEM diff -urN 2.4.0-test7-pre5aa1/mm/page_alloc.c 2.4.0-test7-pre5aa1-cz/mm/page_alloc.c --- 2.4.0-test7-pre5aa1/mm/page_alloc.c Thu Aug 17 19:57:44 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/page_alloc.c Tue Aug 22 15:04:58 2000 @@ -58,19 +58,64 @@ */ #define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size)) +#define __free_pages_ok_critical_section(zone, pgdat, mask, area, index, base, page_idx) \ + (zone)->free_pages -= (mask); \ + \ + /* update the classzone */ \ + { \ + int nr_zone = (zone)->nr_zone; \ + register zone_t * z = (zone); \ + do { \ + z->classzone_free_pages -= (mask); \ + if (z->zone_wake_kswapd && \ + z->classzone_free_pages > z->pages_high) \ + z->zone_wake_kswapd = 0; \ + z++; \ + } while (++nr_zone < (pgdat)->nr_zones); \ + } \ + \ + while ((mask) + (1 << (MAX_ORDER-1))) { \ + struct page *buddy1, *buddy2; \ + \ + if ((area) >= (zone)->free_area + MAX_ORDER) \ + BUG(); \ + if (!test_and_change_bit(index, (area)->map)) \ + /* \ + * the buddy page is still allocated. \ + */ \ + break; \ + /* \ + * Move the buddy up one level. \ + */ \ + buddy1 = (base) + ((page_idx) ^ -(mask)); \ + buddy2 = (base) + (page_idx); \ + if (BAD_RANGE(zone,buddy1)) \ + BUG(); \ + if (BAD_RANGE(zone,buddy2)) \ + BUG(); \ + \ + memlist_del(&buddy1->list); \ + (mask) <<= 1; \ + (area)++; \ + (index) >>= 1; \ + (page_idx) &= (mask); \ + } \ + memlist_add_head(&((base) + (page_idx))->list, &(area)->free_list); + /* * Buddy system. Hairy. 
You really aren't expected to understand this * * Hint: -mask = 1+~mask */ -static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order)); -static void __free_pages_ok (struct page *page, unsigned long order) +void __free_pages_ok (struct page *page, unsigned long order) { unsigned long index, page_idx, mask, flags; free_area_t *area; struct page *base; zone_t *zone; + spinlock_t * freelist_lock; + pg_data_t * pgdat; /* * Subtle. We do not want to test this in the inlined part of @@ -97,6 +142,10 @@ if (PageDirty(page)) BUG(); + if (current->flags & PF_FREE_PAGES) + goto local_freelist; + back_local_freelist: + zone = page->zone; mask = (~0UL) << order; @@ -108,44 +157,48 @@ area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); + pgdat = zone->zone_pgdat; + freelist_lock = &pgdat->freelist_lock; + spin_lock_irqsave(freelist_lock, flags); + __free_pages_ok_critical_section(zone, pgdat, mask, area, index, base, page_idx); + spin_unlock_irqrestore(freelist_lock, flags); + return; - zone->free_pages -= mask; + local_freelist: + /* + * This is a little subtle: if the allocation order + * wanted is major than zero we'd better take all the pages + * local since we must deal with fragmentation too and we + * can't rely on the nr_local_pages information. + */ + if (current->nr_local_pages && !current->allocation_order) + goto back_local_freelist; - while (mask + (1 << (MAX_ORDER-1))) { - struct page *buddy1, *buddy2; + list_add(&page->list, ¤t->local_pages); + page->index = order; + current->nr_local_pages++; +} - if (area >= zone->free_area + MAX_ORDER) - BUG(); - if (!test_and_change_bit(index, area->map)) - /* - * the buddy page is still allocated. - */ - break; - /* - * Move the buddy up one level. - */ - buddy1 = base + (page_idx ^ -mask); - buddy2 = base + page_idx; - if (BAD_RANGE(zone,buddy1)) - BUG(); - if (BAD_RANGE(zone,buddy2)) - BUG(); +static void free_local_pages(struct page * page) { + unsigned long index, page_idx, mask, order = page->index; + free_area_t *area; + struct page *base; + zone_t *zone; + pg_data_t * pgdat; - memlist_del(&buddy1->list); - mask <<= 1; - area++; - index >>= 1; - page_idx &= mask; - } - memlist_add_head(&(base + page_idx)->list, &area->free_list); + zone = page->zone; + mask = (~0UL) << order; + base = mem_map + zone->offset; + page_idx = page - base; + if (page_idx & ~mask) + BUG(); + index = page_idx >> (1 + order); - spin_unlock_irqrestore(&zone->lock, flags); + area = zone->free_area + order; - if (zone->free_pages > zone->pages_high) { - zone->zone_wake_kswapd = 0; - zone->low_on_memory = 0; - } + pgdat = zone->zone_pgdat; + + __free_pages_ok_critical_section(zone, pgdat, mask, area, index, base, page_idx); } #define MARK_USED(index, order, area) \ @@ -172,16 +225,14 @@ return page; } -static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order)); -static struct page * rmqueue(zone_t *zone, unsigned long order) +static inline struct page * rmqueue(zone_t *zone, unsigned long order, unsigned long flags) { free_area_t * area = zone->free_area + order; unsigned long curr_order = order; struct list_head *head, *curr; - unsigned long flags; struct page *page; + pg_data_t * pgdat; - spin_lock_irqsave(&zone->lock, flags); do { head = &area->free_list; curr = memlist_next(head); @@ -197,8 +248,20 @@ MARK_USED(index, curr_order, area); zone->free_pages -= 1 << order; + pgdat = zone->zone_pgdat; + /* update the classzone */ + { + int nr_zone = zone->nr_zone; + register zone_t * z = zone; + unsigned int chunk_size 
= 1<classzone_free_pages -= chunk_size; + z++; + } while (++nr_zone < pgdat->nr_zones); + } + page = expand(zone, page, index, order, curr_order, area); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(&pgdat->freelist_lock, flags); set_page_count(page, 1); if (BAD_RANGE(zone,page)) @@ -208,111 +271,110 @@ curr_order++; area++; } while (curr_order < MAX_ORDER); - spin_unlock_irqrestore(&zone->lock, flags); return NULL; } +static void refile_local_pages(void) +{ + if (current->nr_local_pages) { + struct page * page; + struct list_head * entry; + int nr_pages = current->nr_local_pages; + + while ((entry = current->local_pages.next) != ¤t->local_pages) { + list_del(entry); + page = list_entry(entry, struct page, list); + free_local_pages(page); + if (!nr_pages--) + panic("__get_free_pages local_pages list corrupted I"); + } + if (nr_pages) + panic("__get_free_pages local_pages list corrupted II"); + current->nr_local_pages = 0; + } +} + /* * This is the 'heart' of the zoned buddy allocator: */ -struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) +struct page * __alloc_pages(gfpmask_zone_t * gfpmask_zone, unsigned long order) { - zone_t **zone; - extern wait_queue_head_t kswapd_wait; + zone_t * classzone = gfpmask_zone->classzone; + pg_data_t * pgdat = classzone->zone_pgdat; + spinlock_t * freelist_lock = &pgdat->freelist_lock; + long flags; + unsigned long size = 1UL << order; + + spin_lock_irqsave(freelist_lock, flags); /* - * (If anyone calls gfp from interrupts nonatomically then it - * will sooner or later tripped up by a schedule().) - * - * We are falling back to lower-level zones if allocation - * in a higher zone fails. + * If this is a recursive call, we'd better + * do our best to just allocate things without + * further thought. */ + if (current->flags & PF_MEMALLOC) + goto allocate_ok; - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->size) - BUG(); - - /* Are we supposed to free memory? Don't make it worse.. */ - if (!z->zone_wake_kswapd) { - struct page *page = rmqueue(z, order); - if (z->free_pages < z->pages_low) { - z->zone_wake_kswapd = 1; - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* classzone based memory balancing */ + if (classzone->classzone_free_pages > classzone->pages_low) { + int nr_zone; + zone_t * z; + + allocate_ok: + z = classzone; + for (nr_zone = classzone->nr_zone; + nr_zone >= 0; + nr_zone--, z--) { + if (z->free_pages >= size) { + struct page *page = rmqueue(z, order, flags); + if (page) + return page; } - if (page) - return page; } - } + } else { + extern wait_queue_head_t kswapd_wait; - /* Three possibilities to get here - * - Previous alloc_pages resulted in last zone set to have - * zone_wake_kswapd and start it. kswapd has not been able - * to release enough pages so that one zone does not have - * zone_wake_kswapd set. - * - Different sets of zones (zonelist) - * previous did not have all zones with zone_wake_kswapd but - * this one has... should kswapd be woken up? it will run once. - * - SMP race, kswapd went to sleep slightly after it as running - * in 'if (waitqueue_active(...))' above. - * + anyway the test is very cheap to do... - */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); - - /* - * Ok, we don't have any zones that don't need some - * balancing.. See if we have any that aren't critical.. 
- */ - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->low_on_memory) { - struct page *page = rmqueue(z, order); - if (z->free_pages < z->pages_min) - z->low_on_memory = 1; - if (page) - return page; + if (!classzone->zone_wake_kswapd) { + classzone->zone_wake_kswapd = 1; + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); } - } - /* - * Uhhuh. All the zones have been critical, which means that - * we'd better do some synchronous swap-out. kswapd has not - * been able to cope.. - */ - if (!(current->flags & PF_MEMALLOC)) { - int gfp_mask = zonelist->gfp_mask; - if (!try_to_free_pages(gfp_mask)) { - if (!(gfp_mask & __GFP_HIGH)) - goto fail; - } - } + /* Are we reaching the critical stage? */ + if (classzone->classzone_free_pages > classzone->pages_min) + /* Not yet critical, so let kswapd handle it.. */ + goto allocate_ok; + + if (gfpmask_zone->gfp_mask & __GFP_WAIT) { + int freed; + + spin_unlock_irqrestore(freelist_lock, flags); + + current->flags |= PF_MEMALLOC|PF_FREE_PAGES; + current->allocation_order = order; + freed = try_to_free_pages(gfpmask_zone->gfp_mask, classzone); + current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES); - /* - * Final phase: allocate anything we can! - */ - zone = zonelist->zones; - for (;;) { - struct page *page; - - zone_t *z = *(zone++); - if (!z) - break; - page = rmqueue(z, order); - if (page) - return page; - } + spin_lock_irq(freelist_lock); + refile_local_pages(); + + if (freed || gfpmask_zone->gfp_mask & __GFP_HIGH) + goto allocate_ok; -fail: - /* No luck.. */ + /* + * Re-check we're low on memory keeping the spinlock held + * before failing. Somebody may have released + * lots of memory from under us while we was trying + * to free the pages. We check against pages_high + * to be sure to succeed only if lots of memory is been + * released. + */ + if (classzone->classzone_free_pages > classzone->pages_high) + goto allocate_ok; + } + } + spin_unlock_irqrestore(freelist_lock, flags); return NULL; } @@ -363,40 +425,44 @@ /* * Total amount of free (allocatable) RAM: */ -unsigned int nr_free_pages (void) +unsigned long nr_free_pages (void) { - unsigned int sum; - zone_t *zone; + unsigned long sum; int i; sum = 0; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++) - sum += zone->free_pages; + for (i = 0; i < NUMNODES; i++) { + pg_data_t * pgdat = NODE_DATA(i); + zone_t * node_zones = pgdat->node_zones; + sum += node_zones[pgdat->nr_zones-1].classzone_free_pages; + } return sum; } /* * Amount of free RAM allocatable as buffer memory: */ -unsigned int nr_free_buffer_pages (void) +unsigned long nr_free_buffer_pages (void) { - unsigned int sum; - zone_t *zone; + unsigned long sum = 0; int i; - sum = nr_lru_pages / 3; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++) - sum += zone->free_pages; + for (i = 0; i < NUMNODES; i++) { + pg_data_t * pgdat = NODE_DATA(i); + zone_t * node_zones = pgdat->node_zones; + int higher_zone = pgdat->nr_zones-1; + vm_lru_t * vm_lru_cache = &pgdat->vm_lru[LRU_CACHE]; + sum += vm_lru_cache->nr_pages / 3; + sum += node_zones[higher_zone <= ZONE_NORMAL ? 
higher_zone : ZONE_NORMAL].classzone_free_pages; + } return sum; } #if CONFIG_HIGHMEM -unsigned int nr_free_highpages (void) +unsigned long nr_free_highpages (void) { int i; - unsigned int pages = 0; + unsigned long pages = 0; for (i = 0; i < NUMNODES; i++) pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages; @@ -411,39 +477,48 @@ */ void show_free_areas_core(int nid) { - unsigned long order; + unsigned long order, flags; unsigned type; + pg_data_t * pgdat = NODE_DATA(nid); + spinlock_t * freelist_lock = &pgdat->freelist_lock; - printk("Free pages: %6dkB (%6dkB HighMem)\n", + printk("Free pages: %6lukB (%6lukB HighMem)\n", nr_free_pages() << (PAGE_SHIFT-10), nr_free_highpages() << (PAGE_SHIFT-10)); - printk("( Free: %d, lru_cache: %d (%d %d %d) )\n", + printk("( Free: %lu, cache: %lu (%d %d %d) )\n", nr_free_pages(), - nr_lru_pages, + NODE_DATA(nid)->vm_lru[LRU_CACHE].nr_pages, freepages.min, freepages.low, freepages.high); + spin_lock_irqsave(freelist_lock, flags); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; - zone_t *zone = NODE_DATA(nid)->node_zones + type; - unsigned long nr, total, flags; + zone_t *zone = pgdat->node_zones + type; + unsigned long nr, total; - printk(" %c%d%d %s: ", + printk(" %c%c%d %s: ", (zone->free_pages > zone->pages_low ? (zone->free_pages > zone->pages_high ? ' ' - : 'H') + : 'h') : (zone->free_pages > zone->pages_min + ? 'm' + : 'l')), + (zone->classzone_free_pages > zone->pages_low + ? (zone->classzone_free_pages > zone->pages_high + ? ' ' + : 'H') + : (zone->classzone_free_pages > zone->pages_min ? 'M' : 'L')), - zone->zone_wake_kswapd, zone->low_on_memory, + zone->zone_wake_kswapd, zone->name); total = 0; if (zone->size) { - spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { head = &(zone->free_area + order)->free_list; curr = head; @@ -458,10 +533,15 @@ printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order); } - spin_unlock_irqrestore(&zone->lock, flags); + if (total != zone->free_pages) + printk("error %lu ", + zone->free_pages * (PAGE_SIZE>>10)); } - printk("= %lukB)\n", total * (PAGE_SIZE>>10)); + printk("= %lukB", total * (PAGE_SIZE>>10)); + printk(" class %ldkB\n", + zone->classzone_free_pages * (PAGE_SIZE>>10)); } + spin_unlock_irqrestore(freelist_lock, flags); #ifdef SWAP_CACHE_INFO show_swap_cache_info(); @@ -476,18 +556,17 @@ /* * Builds allocation fallback zone lists. 
*/ -static inline void build_zonelists(pg_data_t *pgdat) +static void __init build_gfpmask_zone(pg_data_t *pgdat) { int i, j, k; for (i = 0; i < NR_GFPINDEX; i++) { - zonelist_t *zonelist; + gfpmask_zone_t * gfpmask_zone; zone_t *zone; - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); + gfpmask_zone = pgdat->node_gfpmask_zone + i; - zonelist->gfp_mask = i; + gfpmask_zone->gfp_mask = i; j = 0; k = ZONE_NORMAL; if (i & __GFP_HIGHMEM) @@ -507,21 +586,37 @@ #ifndef CONFIG_HIGHMEM BUG(); #endif - zonelist->zones[j++] = zone; + gfpmask_zone->classzone = zone; + break; } case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->size) - zonelist->zones[j++] = zone; + if (zone->size) { + gfpmask_zone->classzone = zone; + break; + } case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; - if (zone->size) - zonelist->zones[j++] = zone; + if (zone->size) { + gfpmask_zone->classzone = zone; + break; + } } - zonelist->zones[j++] = NULL; } } +static void __init vm_lru_init(pg_data_t * pgdat) +{ + int i; + vm_lru_t * vm_lru = pgdat->vm_lru; + + for (i = 0; i < NR_VM_LRU; i++) { + spin_lock_init(&vm_lru[i].lock); + INIT_LIST_HEAD(&vm_lru[i].head); + vm_lru[i].nr_pages = 0; + } +} + #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) /* @@ -538,7 +633,7 @@ unsigned long i, j; unsigned long map_size; unsigned long totalpages, offset, realtotalpages; - unsigned int cumulative = 0; + unsigned long classzonepages; pgdat->node_next = pgdat_list; pgdat_list = pgdat; @@ -570,7 +665,6 @@ freepages.min += i; freepages.low += i * 2; freepages.high += i * 3; - memlist_init(&lru_cache); /* * Some architectures (with lots of mem and discontinous memory @@ -587,6 +681,8 @@ pgdat->node_size = totalpages; pgdat->node_start_paddr = zone_start_paddr; pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->nr_zones = 0; + spin_lock_init(&pgdat->freelist_lock); /* * Initially all pages are reserved - free ones are freed @@ -601,6 +697,7 @@ } offset = lmem_map - mem_map; + classzonepages = 0; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; unsigned long mask; @@ -609,19 +706,22 @@ realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; + classzonepages += realsize; printk("zone(%lu): %lu pages.\n", j, size); zone->size = size; zone->name = zone_names[j]; - zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; + zone->nr_zone = j; zone->free_pages = 0; + zone->zone_wake_kswapd = 0; + zone->classzone_free_pages = 0; if (!size) continue; + pgdat->nr_zones = j+1; zone->offset = offset; - cumulative += size; - mask = (realsize / zone_balance_ratio[j]); + mask = (classzonepages / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; else if (mask > zone_balance_max[j]) @@ -629,8 +729,6 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - zone->low_on_memory = 0; - zone->zone_wake_kswapd = 0; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -659,7 +757,8 @@ (unsigned int *) alloc_bootmem_node(nid, bitmap_size); } } - build_zonelists(pgdat); + build_gfpmask_zone(pgdat); + vm_lru_init(pgdat); } void __init free_area_init(unsigned long *zones_size) diff -urN 2.4.0-test7-pre5aa1/mm/slab.c 2.4.0-test7-pre5aa1-cz/mm/slab.c --- 2.4.0-test7-pre5aa1/mm/slab.c Tue Aug 22 01:23:54 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/slab.c Tue Aug 22 15:26:19 2000 @@ -140,8 +140,7 @@ * * Manages the objs in a slab. 
Placed either at the beginning of mem allocated * for a slab, or allocated from an general cache. - * Slabs are chained into one ordered list: fully used, partial, then fully - * free slabs. + * Slabs are chained into three list: fully used, partial, fully free slabs. */ typedef struct slab_s { struct list_head list; @@ -167,7 +166,7 @@ } cpucache_t; #define cc_entry(cpucache) \ - ((void **)(((cpucache_t*)cpucache)+1)) + ((void **)(((cpucache_t*)(cpucache))+1)) #define cc_data(cachep) \ ((cachep)->cpudata[smp_processor_id()]) /* @@ -181,8 +180,9 @@ struct kmem_cache_s { /* 1) each alloc & free */ /* full, partial first, then free */ - struct list_head slabs; - struct list_head *firstnotfull; + struct list_head slabs_full; + struct list_head slabs_partial; + struct list_head slabs_free; unsigned int objsize; unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ @@ -345,8 +345,9 @@ /* internal cache of cache description objs */ static kmem_cache_t cache_cache = { - slabs: LIST_HEAD_INIT(cache_cache.slabs), - firstnotfull: &cache_cache.slabs, + slabs_full: LIST_HEAD_INIT(cache_cache.slabs_full), + slabs_partial: LIST_HEAD_INIT(cache_cache.slabs_partial), + slabs_free: LIST_HEAD_INIT(cache_cache.slabs_free), objsize: sizeof(kmem_cache_t), flags: SLAB_NO_REAP, spinlock: SPIN_LOCK_UNLOCKED, @@ -778,8 +779,9 @@ cachep->gfpflags |= GFP_DMA; spin_lock_init(&cachep->spinlock); cachep->objsize = size; - INIT_LIST_HEAD(&cachep->slabs); - cachep->firstnotfull = &cachep->slabs; + INIT_LIST_HEAD(&cachep->slabs_full); + INIT_LIST_HEAD(&cachep->slabs_partial); + INIT_LIST_HEAD(&cachep->slabs_free); if (flags & CFLGS_OFF_SLAB) cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); @@ -884,10 +886,10 @@ #define drain_cpu_caches(cachep) do { } while (0) #endif -static int __kmem_cache_shrink(kmem_cache_t *cachep) +static int __kmem_cache_shrink(kmem_cache_t *cachep, zone_t * zone, int * nr_pages) { slab_t *slabp; - int ret; + int ret, progress; drain_cpu_caches(cachep); @@ -897,23 +899,33 @@ while (!cachep->growing) { struct list_head *p; - p = cachep->slabs.prev; - if (p == &cachep->slabs) + p = cachep->slabs_free.prev; + if (p == &cachep->slabs_free) break; - slabp = list_entry(cachep->slabs.prev, slab_t, list); + slabp = list_entry(cachep->slabs_free.prev, slab_t, list); if (slabp->inuse) - break; + BUG(); + + progress = 1; + if (zone) { + void * addr = slabp->s_mem - slabp->colouroff; + struct page * page = virt_to_page(addr); + + if (!memclass(page->zone, zone)) + progress = 0; + } list_del(&slabp->list); - if (cachep->firstnotfull == &slabp->list) - cachep->firstnotfull = &cachep->slabs; spin_unlock_irq(&cachep->spinlock); kmem_slab_destroy(cachep, slabp); + if (nr_pages && progress) + /* don't need the cache's spinlock to read the order */ + *nr_pages += 1UL << cachep->gfporder; spin_lock_irq(&cachep->spinlock); } - ret = !list_empty(&cachep->slabs); + ret = !list_empty(&cachep->slabs_full) || !list_empty(&cachep->slabs_partial); spin_unlock_irq(&cachep->spinlock); return ret; } @@ -925,12 +937,12 @@ * Releases as many slabs as possible for a cache. * To help debugging, a zero exit status indicates all slabs were released. 
*/ -int kmem_cache_shrink(kmem_cache_t *cachep) +int kmem_cache_shrink(kmem_cache_t *cachep, zone_t * zone, int * nr_pages) { if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep)) BUG(); - return __kmem_cache_shrink(cachep); + return __kmem_cache_shrink(cachep, zone, nr_pages); } /** @@ -962,7 +974,7 @@ list_del(&cachep->next); up(&cache_chain_sem); - if (__kmem_cache_shrink(cachep)) { + if (__kmem_cache_shrink(cachep, NULL, NULL)) { printk(KERN_ERR "kmem_cache_destroy: Can't free all objects %p\n", cachep); down(&cache_chain_sem); @@ -1139,9 +1151,7 @@ cachep->growing--; /* Make slab active. */ - list_add_tail(&slabp->list,&cachep->slabs); - if (cachep->firstnotfull == &cachep->slabs) - cachep->firstnotfull = &slabp->list; + list_add(&slabp->list, &cachep->slabs_free); STATS_INC_GROWN(cachep); cachep->failures = 0; @@ -1198,7 +1208,7 @@ } static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep, - slab_t *slabp) + slab_t *slabp, int partial) { void *objp; @@ -1211,9 +1221,15 @@ objp = slabp->s_mem + slabp->free*cachep->objsize; slabp->free=slab_bufctl(slabp)[slabp->free]; - if (slabp->free == BUFCTL_END) - /* slab now full: move to next slab for next alloc */ - cachep->firstnotfull = slabp->list.next; + if (slabp->free == BUFCTL_END) { + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_full); + } else { + if (!partial) { + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_partial); + } + } #if DEBUG if (cachep->flags & SLAB_POISON) if (kmem_check_poison_obj(cachep, objp)) @@ -1239,16 +1255,20 @@ */ #define kmem_cache_alloc_one(cachep) \ ({ \ - slab_t *slabp; \ + slab_t *slabp; \ + struct list_head * slab_freelist; \ + int partial = 1; \ \ - /* Get slab alloc is to come from. */ \ - { \ - struct list_head* p = cachep->firstnotfull; \ - if (p == &cachep->slabs) \ + slab_freelist = &(cachep)->slabs_partial; \ + if (list_empty(slab_freelist)) { \ + partial = 0; \ + slab_freelist = &(cachep)->slabs_free; \ + if (list_empty(slab_freelist)) \ goto alloc_new_slab; \ - slabp = list_entry(p,slab_t, list); \ } \ - kmem_cache_alloc_one_tail(cachep, slabp); \ + \ + slabp = list_entry(slab_freelist->next, slab_t, list); \ + kmem_cache_alloc_one_tail(cachep, slabp, partial); \ }) #ifdef CONFIG_SMP @@ -1256,18 +1276,25 @@ { int batchcount = cachep->batchcount; cpucache_t* cc = cc_data(cachep); + struct list_head * slab_freelist; + int partial; + slab_t *slabp; spin_lock(&cachep->spinlock); while (batchcount--) { /* Get slab alloc is to come from. */ - struct list_head *p = cachep->firstnotfull; - slab_t *slabp; + slab_freelist = &(cachep)->slabs_partial; + partial = 1; + if (list_empty(slab_freelist)) { + partial = 0; + slab_freelist = &(cachep)->slabs_free; + if (list_empty(slab_freelist)) + break; + } - if (p == &cachep->slabs) - break; - slabp = list_entry(p,slab_t, list); + slabp = list_entry(slab_freelist->next, slab_t, list); cc_entry(cc)[cc->avail++] = - kmem_cache_alloc_one_tail(cachep, slabp); + kmem_cache_alloc_one_tail(cachep, slabp, partial); } spin_unlock(&cachep->spinlock); @@ -1397,43 +1424,24 @@ } STATS_DEC_ACTIVE(cachep); - /* fixup slab chain */ - if (slabp->inuse-- == cachep->num) - goto moveslab_partial; - if (!slabp->inuse) + /* fixup slab chains */ + if (!--slabp->inuse) goto moveslab_free; + if (slabp->inuse + 1 == cachep->num) + goto moveslab_partial; return; moveslab_partial: - /* was full. 
- * Even if the page is now empty, we can set c_firstnotfull to - * slabp: there are no partial slabs in this case - */ - { - struct list_head *t = cachep->firstnotfull; + /* Was full. */ + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_partial); + return; - cachep->firstnotfull = &slabp->list; - if (slabp->list.next == t) - return; - list_del(&slabp->list); - list_add_tail(&slabp->list, t); - return; - } moveslab_free: - /* - * was partial, now empty. - * c_firstnotfull might point to slabp - * FIXME: optimize - */ - { - struct list_head *t = cachep->firstnotfull->prev; - - list_del(&slabp->list); - list_add_tail(&slabp->list, &cachep->slabs); - if (cachep->firstnotfull == &slabp->list) - cachep->firstnotfull = t->next; - return; - } + /* Was partial, now empty. */ + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_free); + return; } #ifdef CONFIG_SMP @@ -1744,7 +1752,7 @@ * * Called from try_to_free_page(). */ -void kmem_cache_reap (int gfp_mask) +int kmem_cache_reap (int gfp_mask, zone_t * zone) { slab_t *slabp; kmem_cache_t *searchp; @@ -1752,12 +1760,13 @@ unsigned int best_pages; unsigned int best_len; unsigned int scan; + int nr_pages = 0; if (gfp_mask & __GFP_WAIT) down(&cache_chain_sem); else if (down_trylock(&cache_chain_sem)) - return; + return 0; scan = REAP_SCANLEN; best_len = 0; @@ -1798,13 +1807,13 @@ #endif full_free = 0; - p = searchp->slabs.prev; - while (p != &searchp->slabs) { + p = searchp->slabs_free.next; + while (p != &searchp->slabs_free) { slabp = list_entry(p, slab_t, list); if (slabp->inuse) - break; + BUG(); full_free++; - p = p->prev; + p = p->next; } /* @@ -1821,7 +1830,7 @@ best_cachep = searchp; best_len = full_free; best_pages = pages; - if (full_free >= REAP_PERFECT) { + if (pages >= REAP_PERFECT) { clock_searchp = list_entry(searchp->next.next, kmem_cache_t,next); goto perfect; @@ -1841,22 +1850,29 @@ spin_lock_irq(&best_cachep->spinlock); perfect: - /* free only 80% of the free slabs */ - best_len = (best_len*4 + 1)/5; + /* free only 50% of the free slabs */ + best_len = (best_len + 1)/2; for (scan = 0; scan < best_len; scan++) { struct list_head *p; + int progress; if (best_cachep->growing) break; - p = best_cachep->slabs.prev; - if (p == &best_cachep->slabs) + p = best_cachep->slabs_free.prev; + if (p == &best_cachep->slabs_free) break; slabp = list_entry(p,slab_t,list); if (slabp->inuse) - break; + BUG(); + progress = 1; + if (zone) { + void * addr = slabp->s_mem - slabp->colouroff; + struct page * page = virt_to_page(addr); + + if (!memclass(page->zone, zone)) + progress = 0; + } list_del(&slabp->list); - if (best_cachep->firstnotfull == &slabp->list) - best_cachep->firstnotfull = &best_cachep->slabs; STATS_INC_REAPED(best_cachep); /* Safe to drop the lock. 
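The list juggling in kmem_cache_alloc_one_tail() and kmem_cache_free_one() above reduces to a single invariant: a slab's home list is a pure function of how many of its objects are in use. A tiny self-checking illustration follows; slab_home() and the enum are made-up names, not kernel symbols.

#include <assert.h>
#include <stdio.h>

/* Which of the three per-cache lists a slab belongs on. */
enum slab_list { SLAB_FULL, SLAB_PARTIAL, SLAB_FREE };

/*
 * A slab holds 'num' objects, 'inuse' of which are currently allocated.
 * The patch replaces the single ordered list plus firstnotfull pointer
 * with three lists keyed purely on this count.
 */
static enum slab_list slab_home(unsigned int inuse, unsigned int num)
{
	if (inuse == 0)
		return SLAB_FREE;
	if (inuse == num)
		return SLAB_FULL;
	return SLAB_PARTIAL;
}

int main(void)
{
	unsigned int num = 2;

	assert(slab_home(0, num) == SLAB_FREE);

	/* Allocating: free -> partial -> full. */
	assert(slab_home(1, num) == SLAB_PARTIAL);
	assert(slab_home(2, num) == SLAB_FULL);

	/* Freeing one object at a time: full -> partial -> free. */
	assert(slab_home(1, num) == SLAB_PARTIAL);
	assert(slab_home(0, num) == SLAB_FREE);

	puts("transitions match the full/partial/free invariant");
	return 0;
}

The patch itself simply performs a list_del()/list_add() whenever an allocation or free crosses one of the two boundaries (zero objects in use, or all of them).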
The slab is no longer linked to the @@ -1864,12 +1880,15 @@ */ spin_unlock_irq(&best_cachep->spinlock); kmem_slab_destroy(best_cachep, slabp); + if (progress) + /* don't need the cache's spinlock to read the order */ + nr_pages += 1UL << best_cachep->gfporder; spin_lock_irq(&best_cachep->spinlock); } spin_unlock_irq(&best_cachep->spinlock); out: up(&cache_chain_sem); - return; + return nr_pages; } #ifdef CONFIG_PROC_FS @@ -1922,14 +1941,25 @@ spin_lock_irq(&cachep->spinlock); active_objs = 0; num_slabs = 0; - list_for_each(q,&cachep->slabs) { + list_for_each(q,&cachep->slabs_full) { slabp = list_entry(q, slab_t, list); + if (slabp->inuse != cachep->num) + BUG(); + active_objs += cachep->num; + active_slabs++; + } + list_for_each(q,&cachep->slabs_partial) { + slabp = list_entry(q, slab_t, list); + if (slabp->inuse == cachep->num || !slabp->inuse) + BUG(); active_objs += slabp->inuse; - num_objs += cachep->num; + active_slabs++; + } + list_for_each(q,&cachep->slabs_free) { + slabp = list_entry(q, slab_t, list); if (slabp->inuse) - active_slabs++; - else - num_slabs++; + BUG(); + num_slabs++; } num_slabs+=active_slabs; num_objs = num_slabs*cachep->num; diff -urN 2.4.0-test7-pre5aa1/mm/swap_state.c 2.4.0-test7-pre5aa1-cz/mm/swap_state.c --- 2.4.0-test7-pre5aa1/mm/swap_state.c Thu Aug 17 19:57:44 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/swap_state.c Tue Aug 22 13:09:06 2000 @@ -58,7 +58,7 @@ BUG(); if (page->mapping) BUG(); - flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced)); + flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_buffer_age)); page->flags = flags | (1 << PG_uptodate); add_to_page_cache_locked(page, &swapper_space, entry.val); } diff -urN 2.4.0-test7-pre5aa1/mm/vmscan.c 2.4.0-test7-pre5aa1-cz/mm/vmscan.c --- 2.4.0-test7-pre5aa1/mm/vmscan.c Thu Aug 17 19:57:44 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/vmscan.c Tue Aug 22 15:35:01 2000 @@ -22,6 +22,7 @@ #include #include +#include /* * The swap-out functions return 1 if they successfully @@ -34,7 +35,7 @@ * using a process that no longer actually exists (it might * have died while we slept). */ -static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) +static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask, zone_t * zone) { pte_t pte; swp_entry_t entry; @@ -45,7 +46,7 @@ if (!pte_present(pte)) goto out_failed; page = pte_page(pte); - if ((!VALID_PAGE(page)) || PageReserved(page)) + if ((!VALID_PAGE(page)) || PageReserved(page) || !memclass(page->zone, zone)) goto out_failed; if (mm->swap_cnt) @@ -113,13 +114,6 @@ goto out_unlock; /* - * Don't do any of the expensive stuff if - * we're not really interested in this zone. - */ - if (page->zone->free_pages > page->zone->pages_high) - goto out_unlock; - - /* * Ok, it's really dirty. 
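try_to_swap_out() above now skips any page whose zone lies outside the class being reclaimed for. Assuming memclass(page->zone, zone) means "this page's zone can satisfy allocations of the target class" (lower zones serving higher classes on the same node; the real macro may be defined differently), the filter behaves roughly like this toy predicate:

#include <stdio.h>

/* Toy zone indices within one node: DMA < NORMAL < HIGHMEM. */
enum toy_zone { TOY_DMA, TOY_NORMAL, TOY_HIGHMEM };

/*
 * Illustrative stand-in for the memclass() test: a page is interesting for
 * a reclaim pass targeting 'classzone' only if its own zone can satisfy
 * allocations of that class.
 */
static int toy_memclass(enum toy_zone page_zone, enum toy_zone classzone)
{
	return page_zone <= classzone;
}

int main(void)
{
	/* Reclaiming for a NORMAL-zone shortage: DMA and NORMAL pages count,
	 * HIGHMEM pages are skipped, as try_to_swap_out() now does. */
	printf("DMA page helps NORMAL: %d\n",
	       toy_memclass(TOY_DMA, TOY_NORMAL));
	printf("HIGHMEM page helps NORMAL: %d\n",
	       toy_memclass(TOY_HIGHMEM, TOY_NORMAL));
	return 0;
}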
That means that * we should either create a new swap cache * entry for it, or we should write it back @@ -209,7 +203,7 @@ * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, zone_t * zone) { pte_t * pte; unsigned long pmd_end; @@ -231,9 +225,13 @@ do { int result; vma->vm_mm->swap_address = address + PAGE_SIZE; - result = try_to_swap_out(mm, vma, address, pte, gfp_mask); + result = try_to_swap_out(mm, vma, address, pte, gfp_mask, zone); if (result) return result; + if (current->need_resched) { + vmlist_access_unlock(vma->vm_mm); + return 2; + } if (!mm->swap_cnt) return 0; address += PAGE_SIZE; @@ -242,7 +240,7 @@ return 0; } -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, zone_t * zone) { pmd_t * pmd; unsigned long pgd_end; @@ -262,7 +260,7 @@ end = pgd_end; do { - int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask); + int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask, zone); if (result) return result; if (!mm->swap_cnt) @@ -273,7 +271,7 @@ return 0; } -static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask) +static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask, zone_t * zone) { pgd_t *pgdir; unsigned long end; @@ -288,7 +286,7 @@ if (address >= end) BUG(); do { - int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask); + int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask, zone); if (result) return result; if (!mm->swap_cnt) @@ -299,7 +297,7 @@ return 0; } -static int swap_out_mm(struct mm_struct * mm, int gfp_mask) +static int swap_out_mm(struct mm_struct * mm, int gfp_mask, zone_t * zone) { unsigned long address; struct vm_area_struct* vma; @@ -320,7 +318,7 @@ address = vma->vm_start; for (;;) { - int result = swap_out_vma(mm, vma, address, gfp_mask); + int result = swap_out_vma(mm, vma, address, gfp_mask, zone); if (result) return result; vma = vma->vm_next; @@ -342,11 +340,12 @@ * N.B. This function returns only 0 or 1. Return values != 1 from * the lower level routines result in continued processing. */ -static int swap_out(unsigned int priority, int gfp_mask) +static int swap_out(unsigned int priority, int gfp_mask, zone_t * zone) { struct task_struct * p; int counter; int __ret = 0; + int assign = 0; lock_kernel(); /* @@ -363,16 +362,16 @@ * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = (nr_threads << 2) >> (priority >> 2); - if (counter < 1) - counter = 1; + counter = nr_threads / priority ? 
: 1; for (; counter >= 0; counter--) { - unsigned long max_cnt = 0; - struct mm_struct *best = NULL; + unsigned long max_cnt; + struct mm_struct *best; int pid = 0; - int assign = 0; select: + max_cnt = 0; + best = NULL; + pid = 0; read_lock(&tasklist_lock); p = init_task.next_task; for (; p != &init_task; p = p->next_task) { @@ -391,6 +390,8 @@ } } read_unlock(&tasklist_lock); + if (assign == 1) + assign = 2; if (!best) { if (!assign) { assign = 1; @@ -401,9 +402,16 @@ int ret; atomic_inc(&best->mm_count); - ret = swap_out_mm(best, gfp_mask); + ret = swap_out_mm(best, gfp_mask, zone); mmdrop(best); + if (ret == 2) { + /* needs a reschedule */ + current->state = TASK_RUNNING; + schedule(); + goto select; + } + if (!ret) continue; @@ -418,50 +426,31 @@ return __ret; } -/* - * Check if there is any memory pressure (free_pages < pages_low) - */ -static inline int memory_pressure(void) +#define FALLBACK_PAGELRU_TO_DCACHE_RATIO ((400*1024*1024)>>PAGE_SHIFT) + +static inline int shrink_cache(int priority, int gfp_mask, zone_t * zone) { - pg_data_t *pgdat = pgdat_list; + static spinlock_t fallback_lock = SPIN_LOCK_UNLOCKED; + static int fallback_pagelru_to_dcache; + int ret, fallback; - do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones+ i; - if (zone->size && - zone->free_pages < zone->pages_low) - return 1; - } - pgdat = pgdat->node_next; - } while (pgdat); + if (in_interrupt()) + BUG(); - return 0; -} + spin_lock(&fallback_lock); + fallback = !(++fallback_pagelru_to_dcache % FALLBACK_PAGELRU_TO_DCACHE_RATIO); + spin_unlock(&fallback_lock); -/* - * Check if all zones have recently had memory_pressure (zone_wake_kswapd) - */ -static inline int keep_kswapd_awake(void) -{ - int all_recent = 1; - pg_data_t *pgdat = pgdat_list; + ret = shrink_mmap(priority, gfp_mask, zone); - do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones+ i; - if (zone->size) { - if (zone->free_pages < zone->pages_min) - return 1; - if (!zone->zone_wake_kswapd) - all_recent = 0; - } - } - pgdat = pgdat->node_next; - } while (pgdat); + if ((fallback || !ret) && (gfp_mask & __GFP_IO)) { + fallback_pagelru_to_dcache = 0; - return all_recent; + ret += shrink_dcache_memory(priority, zone); + ret += shrink_icache_memory(priority, zone); + } + + return ret; } /* @@ -471,97 +460,112 @@ * * We want to try to free "count" pages, and we want to * cluster them so that we get good swap-out behaviour. - * - * Don't try _too_ hard, though. We don't want to have bad - * latency. - * - * Note: only called by kswapd and try_to_free_pages - * both can WAIT at top level. */ #define FREE_COUNT 8 -#define SWAP_COUNT 16 -static int do_try_to_free_pages(unsigned int gfp_mask) +#define SWAP_COUNT SWAP_CLUSTER_MAX +int try_to_free_pages(unsigned int gfp_mask, zone_t *zone) { - int priority; - int count = FREE_COUNT; - int swap_count; + int priority, count = FREE_COUNT, nr_pages, swap_count; /* Always trim SLAB caches when memory gets low. */ - kmem_cache_reap(gfp_mask); + count -= kmem_cache_reap(gfp_mask, zone); + if (count <= 0) + return 1; - priority = 64; + priority = 5; do { - if (current->need_resched) { - schedule(); - /* time has passed - pressure too? 
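The shrink_cache() helper above normally touches only the page LRU and falls back to the dentry and inode caches either on every Nth invocation or when shrink_mmap() makes no progress. Here is a compilable sketch of that policy with made-up stubs; the real code guards the counter with a spinlock, derives the ratio from 400 MB worth of pages, and only falls back when __GFP_IO is set.

#include <stdio.h>

#define TOY_FALLBACK_RATIO 4	/* toy value, see the lead-in above */

/* Stubs standing in for shrink_mmap() and the dentry/inode cache shrinkers. */
static int toy_shrink_mmap(void)
{
	static int lru_pages = 6;	/* pretend the LRU holds six reclaimable pages */

	if (lru_pages == 0)
		return 0;
	lru_pages--;
	return 1;
}

static int toy_shrink_dcache(void) { return 1; }
static int toy_shrink_icache(void) { return 1; }

/*
 * Mirror of the shrink_cache() policy: normally only the page LRU is
 * scanned, but on every Nth call, or whenever the LRU scan comes back
 * empty, the dentry and inode caches are trimmed as well.
 */
static int toy_shrink_cache(void)
{
	static int calls;
	int fallback = (++calls % TOY_FALLBACK_RATIO) == 0;
	int ret = toy_shrink_mmap();

	if (fallback || !ret) {
		calls = 0;
		ret += toy_shrink_dcache();
		ret += toy_shrink_icache();
	}
	return ret;
}

int main(void)
{
	int pass;

	for (pass = 0; pass < 8; pass++)
		printf("pass %d freed %d page(s)\n", pass, toy_shrink_cache());
	return 0;
}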
*/ - if (!memory_pressure()) - goto done; - } - - while (shrink_mmap(priority, gfp_mask)) { - if (!--count) + while ((nr_pages = shrink_cache(priority, gfp_mask, zone))) { + count -= nr_pages; + if (count <= 0) goto done; } - /* check if mission completed */ - if (!keep_kswapd_awake()) - goto done; - /* Try to get rid of some shared memory pages.. */ if (gfp_mask & __GFP_IO) { - /* - * don't be too light against the d/i cache since - * shrink_mmap() almost never fail when there's - * really plenty of memory free. - */ - count -= shrink_dcache_memory(priority, gfp_mask); - count -= shrink_icache_memory(priority, gfp_mask); - /* - * Not currently working, see fixme in shrink_?cache_memory - * In the inner funtions there is a comment: - * "To help debugging, a zero exit status indicates - * all slabs were released." (-arca?) - * lets handle it in a primitive but working way... - * if (count <= 0) - * goto done; - */ - if (!keep_kswapd_awake()) - goto done; - - while (shm_swap(priority, gfp_mask)) { + while (shm_swap(priority, zone)) { if (!--count) goto done; } } - /* - * Then, try to page stuff out.. - * - * This will not actually free any pages (they get - * put in the swap cache), so we must not count this - * as a "count" success. - */ + /* Then, try to page stuff out.. */ swap_count = SWAP_COUNT; - while (swap_out(priority, gfp_mask)) - if (--swap_count < 0) - break; + while (swap_out(priority, gfp_mask, zone)) { + if (!--swap_count) + goto done; + } + count -= SWAP_COUNT - swap_count; + if (count <= 0) + goto done; + } while (--priority > 0); +done: - } while (--priority >= 0); + return priority > 0; +} - /* Always end on a shrink_mmap.., may sleep... */ - while (shrink_mmap(0, gfp_mask)) { - if (!--count) - goto done; +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); + +static int kswapd_work_pgdat(pg_data_t * pgdat) +{ + int worked = 0, i; + zone_t * zone; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->zone_wake_kswapd) + continue; + if (!try_to_free_pages(GFP_KSWAPD, zone)) { + zone->zone_wake_kswapd = 0; + continue; + } + worked = 1; } - /* Return 1 if any page is freed, or - * there are no more memory pressure */ - return (count < FREE_COUNT || !keep_kswapd_awake()); - -done: + + return worked; +} + +static void kswapd_work(void) +{ + int worked; + pg_data_t * pgdat; + + do { + worked = 0; + pgdat = pgdat_list; + do + worked |= kswapd_work_pgdat(pgdat); + while ((pgdat = pgdat->node_next)); + } while (worked); +} + +static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +{ + zone_t * zone; + int i; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->zone_wake_kswapd) + continue; + return 0; + } + return 1; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +static int kswapd_can_sleep(void) +{ + pg_data_t * pgdat; + + pgdat = pgdat_list; + do { + if (kswapd_can_sleep_pgdat(pgdat)) + continue; + return 0; + } while ((pgdat = pgdat->node_next)); + + return 1; +} /* * The background pageout daemon, started as a kernel thread @@ -579,11 +583,13 @@ int kswapd(void *unused) { struct task_struct *tsk = current; + wait_queue_t wait; tsk->session = 1; tsk->pgrp = 1; strcpy(tsk->comm, "kswapd"); sigfillset(&tsk->blocked); + init_waitqueue_entry(&wait, tsk); /* * Tell the memory management that we're a "memory allocator", @@ -599,52 +605,29 @@ */ tsk->flags |= PF_MEMALLOC; - for (;;) { - if (!keep_kswapd_awake()) { - interruptible_sleep_on(&kswapd_wait); - } + while (1) { + /* + * If we actually get into a low-memory situation, + * 
the processes needing more memory will wake us + * up on a more timely basis. + */ + kswapd_work(); + run_task_queue(&tq_disk); - do_try_to_free_pages(GFP_KSWAPD); - } -} + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kswapd_wait, &wait); -/* - * Called by non-kswapd processes when they want more - * memory. - * - * In a perfect world, this should just wake up kswapd - * and return. We don't actually want to swap stuff out - * from user processes, because the locking issues are - * nasty to the extreme (file write locks, and MM locking) - * - * One option might be to let kswapd do all the page-out - * and VM page table scanning that needs locking, and this - * process thread could do just the mmap shrink stage that - * can be done by just dropping cached pages without having - * any deadlock issues. - */ -int try_to_free_pages(unsigned int gfp_mask) -{ - int retval = 1; + if (kswapd_can_sleep()) + schedule(); - if (gfp_mask & __GFP_WAIT) { - current->state = TASK_RUNNING; - current->flags |= PF_MEMALLOC; - retval = do_try_to_free_pages(gfp_mask); - current->flags &= ~PF_MEMALLOC; + __set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_wait, &wait); } - - /* someone needed memory that kswapd had not provided - * make sure kswapd runs, should not happen often */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); - - return retval; } static int __init kswapd_init(void) { - printk("Starting kswapd v1.7\n"); + printk("Starting kswapd v1.8\n"); swap_setup(); kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); return 0;
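Finally, the reworked kswapd main loop amounts to: keep calling try_to_free_pages() against every zone whose zone_wake_kswapd flag is raised, and go back to sleep only once no zone has the flag set. A single-node toy model of that control flow follows; all toy_* names are stand-ins, and the flag is normally raised by the page allocator, which the toy omits.

#include <stdio.h>

#define TOY_NR_ZONES 3

struct toy_zone {
	int wake_kswapd;	/* set when the zone runs low on free pages */
	int reclaimable;	/* pretend-pages still reclaimable in this zone */
};

static struct toy_zone zones[TOY_NR_ZONES] = {
	{ 1, 2 }, { 0, 0 }, { 1, 1 },
};

/* Stand-in for try_to_free_pages(gfp, zone): 1 = made progress, 0 = gave up. */
static int toy_try_to_free_pages(struct toy_zone *z)
{
	if (z->reclaimable == 0)
		return 0;
	z->reclaimable--;
	return 1;
}

/* One pass over the zones, as in kswapd_work_pgdat() above. */
static int toy_kswapd_work(void)
{
	int i, worked = 0;

	for (i = 0; i < TOY_NR_ZONES; i++) {
		struct toy_zone *z = &zones[i];

		if (!z->wake_kswapd)
			continue;
		if (!toy_try_to_free_pages(z)) {
			z->wake_kswapd = 0;	/* no progress: stop spinning on it */
			continue;
		}
		worked = 1;
	}
	return worked;
}

/* Mirrors kswapd_can_sleep(): sleep only when no zone wants attention. */
static int toy_kswapd_can_sleep(void)
{
	int i;

	for (i = 0; i < TOY_NR_ZONES; i++)
		if (zones[i].wake_kswapd)
			return 0;
	return 1;
}

int main(void)
{
	while (toy_kswapd_work())
		;	/* keep reclaiming while any zone makes progress */
	printf("kswapd may sleep: %d\n", toy_kswapd_can_sleep());
	return 0;
}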