diff -urN vm-ref/arch/alpha/mm/fault.c vm/arch/alpha/mm/fault.c --- vm-ref/arch/alpha/mm/fault.c Mon Sep 17 01:26:12 2001 +++ vm/arch/alpha/mm/fault.c Mon Sep 17 01:26:25 2001 @@ -140,6 +140,7 @@ goto bad_area; } + survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -194,6 +195,12 @@ * us unable to handle the page fault gracefully. */ out_of_memory: + if (current->pid == 1) { + current->policy |= SCHED_YIELD; + schedule(); + down_read(&mm->mmap_sem); + goto survive; + } printk(KERN_ALERT "VM: killing process %s(%d)\n", current->comm, current->pid); if (!user_mode(regs)) diff -urN vm-ref/arch/i386/mm/fault.c vm/arch/i386/mm/fault.c --- vm-ref/arch/i386/mm/fault.c Mon Sep 17 01:26:12 2001 +++ vm/arch/i386/mm/fault.c Mon Sep 17 01:26:25 2001 @@ -51,8 +51,14 @@ start &= PAGE_MASK; for (;;) { - if (handle_mm_fault(current->mm, vma, start, 1) <= 0) - goto bad_area; + survive: + { + int fault = handle_mm_fault(current->mm, vma, start, 1); + if (!fault) + goto bad_area; + if (fault < 0) + goto out_of_memory; + } if (!size) break; size--; @@ -76,6 +82,14 @@ bad_area: return 0; + +out_of_memory: + if (current->pid == 1) { + current->policy |= SCHED_YIELD; + schedule(); + goto survive; + } + goto bad_area; } extern spinlock_t console_lock, timerlist_lock; @@ -198,6 +212,7 @@ goto bad_area; } + survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -300,6 +315,12 @@ */ out_of_memory: up_read(&mm->mmap_sem); + if (tsk->pid == 1) { + tsk->policy |= SCHED_YIELD; + schedule(); + down_read(&mm->mmap_sem); + goto survive; + } printk("VM: killing process %s\n", tsk->comm); if (error_code & 4) do_exit(SIGKILL); diff -urN vm-ref/fs/buffer.c vm/fs/buffer.c --- vm-ref/fs/buffer.c Mon Sep 17 01:26:14 2001 +++ vm/fs/buffer.c Mon Sep 17 01:26:25 2001 @@ -135,6 +135,7 @@ inline void unlock_buffer(struct buffer_head *bh) { + clear_bit(BH_Wait_IO, &bh->b_state); clear_bit(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); if (waitqueue_active(&bh->b_wait)) @@ -838,9 +839,7 @@ static void free_more_memory(void) { balance_dirty(); - page_launder(GFP_NOFS, 0); wakeup_bdflush(); - wakeup_kswapd(); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); schedule(); @@ -1185,6 +1184,7 @@ out: write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); + touch_buffer(bh); return bh; } @@ -1349,7 +1349,6 @@ struct buffer_head * bh; bh = getblk(dev, block, size); - touch_buffer(bh); if (buffer_uptodate(bh)) return bh; ll_rw_block(READ, 1, &bh); @@ -2516,34 +2515,31 @@ return 0; } -/* - * Sync all the buffers on one page.. - * - * If we have old buffers that are locked, we'll - * wait on them, but we won't wait on the new ones - * we're writing out now. - * - * This all is required so that we can free up memory - * later. 
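/*
 * The three fault-handler hunks above share one pattern: the out_of_memory
 * path must never kill init (pid 1), because losing init brings the whole
 * system down.  Instead init yields the CPU so kswapd/bdflush can make
 * progress, then retries the fault at the new "survive:" label.  A
 * hypothetical helper capturing that idiom (the name is not in the patch,
 * each architecture open-codes it as shown above):
 */
static inline int oom_yield_and_retry(struct task_struct *tsk)
{
        if (tsk->pid != 1)
                return 0;               /* ordinary task: caller may kill it */
        tsk->policy |= SCHED_YIELD;     /* let the reclaim daemons run */
        schedule();
        return 1;                       /* caller should goto survive */
}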
- * - * Wait: - * 0 - no wait (this does not get called - see try_to_free_buffers below) - * 1 - start IO for dirty buffers - * 2 - wait for completion of locked buffers - */ -static void sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask) +static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask) { - struct buffer_head * tmp = bh; + struct buffer_head * p = bh; + int tryagain = 1; do { - struct buffer_head *p = tmp; - tmp = tmp->b_this_page; - if (buffer_locked(p)) { - if (gfp_mask & __GFP_WAIT) - __wait_on_buffer(p); - } else if (buffer_dirty(p)) - ll_rw_block(WRITE, 1, &p); - } while (tmp != bh); + if (buffer_dirty(p) || buffer_locked(p)) { + if (test_and_set_bit(BH_Wait_IO, &p->b_state)) { + if (buffer_dirty(p)) { + ll_rw_block(WRITE, 1, &p); + tryagain = 0; + } else if (buffer_locked(p)) { + if (gfp_mask & __GFP_WAIT) { + wait_on_buffer(p); + tryagain = 1; + } else + tryagain = 0; + } + } else + tryagain = 0; + } + p = p->b_this_page; + } while (p != bh); + + return tryagain; } /* @@ -2614,16 +2610,16 @@ write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); if (gfp_mask & __GFP_IO && !(current->flags & PF_ATOMICALLOC)) { - if (!(gfp_mask & __GFP_HIGHIO) && PageHighMem(page)) - return 0; - sync_page_buffers(bh, gfp_mask); - /* We waited synchronously, so we can free the buffers. */ - if (gfp_mask & __GFP_WAIT) { - gfp_mask = 0; /* no IO or waiting this time around */ - goto cleaned_buffers_try_again; + if (gfp_mask & __GFP_HIGHIO || !PageHighMem(page)) { + if (sync_page_buffers(bh, gfp_mask)) { + /* no IO or waiting next time */ + gfp_mask = 0; + goto cleaned_buffers_try_again; + } } - wakeup_bdflush(); } + if (balance_dirty_state() >= 0) + wakeup_bdflush(); return 0; } diff -urN vm-ref/fs/dcache.c vm/fs/dcache.c --- vm-ref/fs/dcache.c Mon Sep 17 01:26:14 2001 +++ vm/fs/dcache.c Mon Sep 17 01:26:25 2001 @@ -569,7 +569,7 @@ if (!(gfp_mask & __GFP_FS)) return 0; - count = dentry_stat.nr_unused >> priority; + count = dentry_stat.nr_unused / priority; prune_dcache(count); kmem_cache_shrink(dentry_cache); diff -urN vm-ref/fs/inode.c vm/fs/inode.c --- vm-ref/fs/inode.c Mon Sep 17 01:26:13 2001 +++ vm/fs/inode.c Mon Sep 17 01:26:25 2001 @@ -278,27 +278,18 @@ } } -static inline int try_to_sync_unused_list(struct list_head *head) +static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes) { struct list_head *tmp = head; struct inode *inode; - while ((tmp = tmp->prev) != head) { + while (nr_inodes && (tmp = tmp->prev) != head) { inode = list_entry(tmp, struct inode, i_list); if (!atomic_read(&inode->i_count)) { - /* - * We're under PF_MEMALLOC here, and syncing the - * inode may have to allocate memory. To avoid - * running into a OOM deadlock, we write one - * inode synchronously and stop syncing in case - * we're under freepages.low - */ + __sync_one(inode, 0); + nr_inodes--; - int sync = nr_free_pages() < freepages.low; - __sync_one(inode, sync); - if (sync) - return 0; /* * __sync_one moved the inode to another list, * so we have to start looking from the list head. @@ -306,7 +297,8 @@ tmp = head; } } - return 1; + + return nr_inodes; } void sync_inodes_sb(struct super_block *sb) @@ -402,24 +394,25 @@ } } -/* - * Called with the spinlock already held.. 
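/*
 * Sketch of the two-pass throttle the rewritten sync_page_buffers() above
 * implements: the first time a dirty or locked buffer is seen it is only
 * tagged with BH_Wait_IO; write-out (or waiting, when __GFP_WAIT allows
 * blocking) happens only if the tag was already set on an earlier pass,
 * and unlock_buffer() clears the tag once the I/O completes.  Condensed
 * single-buffer form with a hypothetical name, not part of the patch:
 */
static int throttle_one_buffer(struct buffer_head *bh, unsigned int gfp_mask)
{
        if (!buffer_dirty(bh) && !buffer_locked(bh))
                return 1;                       /* clean and idle: freeable */
        if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
                return 0;                       /* first sighting: only tag it */
        if (buffer_dirty(bh)) {
                ll_rw_block(WRITE, 1, &bh);     /* second sighting: start the write */
                return 0;
        }
        if (gfp_mask & __GFP_WAIT) {
                wait_on_buffer(bh);             /* locked: wait if we may block */
                return 1;
        }
        return 0;                               /* cannot block: try again later */
}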
- */ -static void try_to_sync_unused_inodes(void) +static void try_to_sync_unused_inodes(void * arg) { struct super_block * sb; + int nr_inodes = inodes_stat.nr_unused; + spin_lock(&inode_lock); spin_lock(&sb_lock); sb = sb_entry(super_blocks.next); - for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { + for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { spin_unlock(&sb_lock); - if (!try_to_sync_unused_list(&sb->s_dirty)) - return; + nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes); spin_lock(&sb_lock); } spin_unlock(&sb_lock); + spin_unlock(&inode_lock); } +static struct tq_struct unused_inodes_flush_task; + /** * write_inode_now - write an inode to disk * @inode: inode to write to disk @@ -672,12 +665,11 @@ { LIST_HEAD(list); struct list_head *entry, *freeable = &list; - int count, synced = 0; + int count; struct inode * inode; spin_lock(&inode_lock); -free_unused: count = 0; entry = inode_unused.prev; while (entry != &inode_unused) @@ -707,18 +699,13 @@ dispose_list(freeable); /* - * If we freed enough clean inodes, avoid writing - * dirty ones. Also giveup if we already tried to - * sync dirty inodes. + * If we didn't freed enough clean inodes schedule + * a sync of the dirty inodes, we cannot do it + * from here or we're either synchronously dogslow + * or we deadlock with oom. */ - if (!goal || synced) - return; - - synced = 1; - - spin_lock(&inode_lock); - try_to_sync_unused_inodes(); - goto free_unused; + if (goal) + schedule_task(&unused_inodes_flush_task); } int shrink_icache_memory(int priority, int gfp_mask) @@ -735,7 +722,7 @@ if (!(gfp_mask & __GFP_FS)) return 0; - count = inodes_stat.nr_unused >> priority; + count = inodes_stat.nr_unused / priority; prune_icache(count); kmem_cache_shrink(inode_cachep); @@ -1182,6 +1169,8 @@ NULL); if (!inode_cachep) panic("cannot create inode slab cache"); + + unused_inodes_flush_task.routine = try_to_sync_unused_inodes; } /** diff -urN vm-ref/fs/proc/proc_misc.c vm/fs/proc/proc_misc.c --- vm-ref/fs/proc/proc_misc.c Mon Sep 17 01:26:14 2001 +++ vm/fs/proc/proc_misc.c Mon Sep 17 01:26:25 2001 @@ -168,9 +168,7 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8u kB\n" - "Inact_dirty: %8u kB\n" - "Inact_clean: %8u kB\n" - "Inact_target: %8lu kB\n" + "Inactive: %8u kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -184,9 +182,7 @@ K(atomic_read(&page_cache_size) - swapper_space.nrpages), K(swapper_space.nrpages), K(nr_active_pages), - K(nr_inactive_dirty_pages), - K(nr_inactive_clean_pages()), - K(inactive_target), + K(nr_inactive_pages), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -urN vm-ref/include/linux/fs.h vm/include/linux/fs.h --- vm-ref/include/linux/fs.h Mon Sep 17 01:26:14 2001 +++ vm/include/linux/fs.h Mon Sep 17 01:26:25 2001 @@ -216,6 +216,7 @@ BH_Mapped, /* 1 if the buffer has a disk mapping */ BH_New, /* 1 if the buffer is new and not yet written out */ BH_Async, /* 1 if the buffer is under end_buffer_io_async I/O */ + BH_Wait_IO, /* 1 if we should throttle on this buffer */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities diff -urN vm-ref/include/linux/highmem.h vm/include/linux/highmem.h --- vm-ref/include/linux/highmem.h Mon Sep 17 00:14:59 2001 +++ vm/include/linux/highmem.h Mon Sep 17 01:26:25 2001 @@ -11,7 +11,7 @@ #include /* declarations for linux/mm/highmem.c */ -FASTCALL(unsigned int nr_free_highpages(void)); +unsigned int nr_free_highpages(void); 
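/*
 * The fs/inode.c changes above stop prune_icache() from writing dirty
 * unused inodes itself: doing that from the reclaim path can block for a
 * long time and can recurse into memory allocation (an OOM deadlock).
 * Instead the flush is handed to keventd through the stock 2.4 task-queue
 * interface and runs later in process context.  The whole mechanism is
 * just (fragments, mirroring the hunks above):
 */
#include <linux/tqueue.h>

static struct tq_struct unused_inodes_flush_task;

        /* set up once, where the inode slab cache is created: */
        unused_inodes_flush_task.routine = try_to_sync_unused_inodes;

        /* and fired whenever prune_icache() falls short of its goal: */
        if (goal)
                schedule_task(&unused_inodes_flush_task);       /* runs in keventd */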
extern struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig); diff -urN vm-ref/include/linux/list.h vm/include/linux/list.h --- vm-ref/include/linux/list.h Mon Sep 17 01:26:13 2001 +++ vm/include/linux/list.h Mon Sep 17 01:26:25 2001 @@ -92,6 +92,7 @@ static __inline__ void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); + entry->next = entry->prev = 0; } /** diff -urN vm-ref/include/linux/mm.h vm/include/linux/mm.h --- vm-ref/include/linux/mm.h Mon Sep 17 01:26:15 2001 +++ vm/include/linux/mm.h Mon Sep 17 01:26:25 2001 @@ -19,7 +19,7 @@ extern int page_cluster; /* The inactive_clean lists are per zone. */ extern struct list_head active_list; -extern struct list_head inactive_dirty_list; +extern struct list_head inactive_list; #include #include @@ -154,7 +154,6 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ - unsigned long age; /* Page aging counter. */ wait_queue_head_t wait; /* Page locked? Stand in line... */ struct page **pprev_hash; /* Complement to *next_hash. */ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ @@ -275,16 +274,14 @@ #define PG_dirty 4 #define PG_decr_after 5 #define PG_active 6 -#define PG_inactive_dirty 7 +#define PG_inactive 7 #define PG_slab 8 #define PG_swap_cache 9 #define PG_skip 10 -#define PG_inactive_clean 11 -#define PG_highmem 12 -#define PG_checked 13 /* kill me in 2.5.. */ - /* bits 21-29 unused */ -#define PG_arch_1 30 -#define PG_reserved 31 +#define PG_highmem 11 +#define PG_checked 12 /* kill me in 2.5.. */ +#define PG_arch_1 13 +#define PG_reserved 14 /* Make it prettier to test the above... */ #define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) @@ -347,14 +344,14 @@ #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) -#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) -#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) - -#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) -#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) -#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) +#define PageInactive(page) test_bit(PG_inactive, &(page)->flags) +#define SetPageInactive(page) set_bit(PG_inactive, &(page)->flags) +#define ClearPageInactive(page) clear_bit(PG_inactive, &(page)->flags) +#define TestandSetPageInactive(page) test_and_set_bit(PG_inactive, &(page)->flags) +#define TestandClearPageInactive(page) test_and_clear_bit(PG_inactive, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) @@ -380,11 +377,11 @@ * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). 
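/*
 * The list.h hunk above poisons entries on removal (next = prev = 0).
 * The point is to turn list misuse into an immediate, easy-to-spot oops;
 * this matters for the LRU rework below, where pages move between exactly
 * one active and one inactive list and a double removal is always a bug.
 * Illustration (not in the patch):
 */
        list_del(&page->lru);
        /* ... page is later freed, or re-deleted by mistake ... */
        list_del(&page->lru);   /* without poisoning: silently corrupts the
                                 * neighbours it was once linked to;
                                 * with poisoning: NULL dereference, instant oops */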
*/ -extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned long order)); -extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist)); -extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order); +extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order)); +extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)); +extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order); -static inline struct page * alloc_pages(int gfp_mask, unsigned long order) +static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order) { /* * Gets optimized away by the compiler. @@ -396,8 +393,8 @@ #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -extern unsigned long FASTCALL(__get_free_pages(int gfp_mask, unsigned long order)); -extern unsigned long FASTCALL(get_zeroed_page(int gfp_mask)); +extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); +extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) @@ -413,8 +410,8 @@ /* * There is only one 'core' page-freeing function. */ -extern void FASTCALL(__free_pages(struct page *page, unsigned long order)); -extern void FASTCALL(free_pages(unsigned long addr, unsigned long order)); +extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); +extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); extern void * FASTCALL(alloc_exact(unsigned int size)); extern void FASTCALL(free_exact(void * addr, unsigned int size)); @@ -469,6 +466,11 @@ extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void swapin_readahead(swp_entry_t); + +static inline int is_page_cache_freeable(struct page * page) +{ + return page_count(page) - !!page->buffers == 1; +} /* * Work out if there are any other processes sharing this diff -urN vm-ref/include/linux/mmzone.h vm/include/linux/mmzone.h --- vm-ref/include/linux/mmzone.h Mon Sep 17 00:14:59 2001 +++ vm/include/linux/mmzone.h Mon Sep 17 01:26:25 2001 @@ -39,14 +39,12 @@ */ spinlock_t lock; unsigned long free_pages; - unsigned long inactive_clean_pages; - unsigned long inactive_dirty_pages; unsigned long pages_min, pages_low, pages_high; + int need_balance; /* * free areas of different sizes */ - struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; /* @@ -101,6 +99,7 @@ typedef struct pglist_data { zone_t node_zones[MAX_NR_ZONES]; zonelist_t node_zonelists[GFP_ZONEMASK+1]; + int nr_zones; struct page *node_mem_map; unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; @@ -114,8 +113,8 @@ extern int numnodes; extern pg_data_t *pgdat_list; -#define memclass(pgzone, tzone) (((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \ - && ((pgzone) <= (tzone))) +#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ + && ((pgzone) <= (classzone))) /* * The following two are not meant for general usage. 
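/*
 * "classzone" (the renamed memclass() argument above) is the zone an
 * allocation was aimed at; every zone at or below it in the same node's
 * node_zones[] array (DMA < Normal < HighMem) is an acceptable fallback,
 * and reclaim progress is credited against that class.  The raw pointer
 * comparison in memclass() works because all zones of one pg_data_t live
 * in one array; an index-based equivalent (hypothetical name, sketch only)
 * would be:
 */
static inline int zone_in_class(zone_t *pgzone, zone_t *classzone)
{
        return pgzone->zone_pgdat == classzone->zone_pgdat &&
               (pgzone - pgzone->zone_pgdat->node_zones) <=
               (classzone - classzone->zone_pgdat->node_zones);
}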
They are here as diff -urN vm-ref/include/linux/pagemap.h vm/include/linux/pagemap.h --- vm-ref/include/linux/pagemap.h Mon Sep 17 00:14:59 2001 +++ vm/include/linux/pagemap.h Mon Sep 17 01:26:25 2001 @@ -29,7 +29,6 @@ #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) #define page_cache_get(x) get_page(x) -#define page_cache_free(x) __free_page(x) #define page_cache_release(x) __free_page(x) static inline struct page *page_cache_alloc(struct address_space *x) diff -urN vm-ref/include/linux/sched.h vm/include/linux/sched.h --- vm-ref/include/linux/sched.h Mon Sep 17 01:26:15 2001 +++ vm/include/linux/sched.h Mon Sep 17 01:26:25 2001 @@ -318,6 +318,8 @@ int get_child_timeslice; struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; + struct list_head local_pages; + unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; @@ -417,6 +419,7 @@ #define PF_MEMALLOC (1UL<<5) /* Allocating memory */ #define PF_USEDFPU (1UL<<6) /* task used FPU this quantum (SMP) */ #define PF_ATOMICALLOC (1UL<<7) /* do not block during memalloc */ +#define PF_FREE_PAGES (1UL<<8) /* per process page freeing */ /* * Ptrace flags diff -urN vm-ref/include/linux/slab.h vm/include/linux/slab.h --- vm-ref/include/linux/slab.h Mon Sep 17 00:14:59 2001 +++ vm/include/linux/slab.h Mon Sep 17 01:26:25 2001 @@ -60,7 +60,7 @@ extern void *kmalloc(size_t, int); extern void kfree(const void *); -extern void kmem_cache_reap(int); +extern int FASTCALL(kmem_cache_reap(int)); extern int slabinfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data); extern int slabinfo_write_proc(struct file *file, const char *buffer, diff -urN vm-ref/include/linux/swap.h vm/include/linux/swap.h --- vm-ref/include/linux/swap.h Mon Sep 17 01:26:13 2001 +++ vm/include/linux/swap.h Mon Sep 17 01:26:25 2001 @@ -80,10 +80,9 @@ extern int nr_swap_pages; extern unsigned int nr_free_pages(void); -extern unsigned int nr_inactive_clean_pages(void); extern unsigned int nr_free_buffer_pages(void); extern int nr_active_pages; -extern int nr_inactive_dirty_pages; +extern int nr_inactive_pages; extern atomic_t nr_async_pages; extern struct address_space swapper_space; extern atomic_t page_cache_size; @@ -99,26 +98,20 @@ struct zone_t; /* linux/mm/swap.c */ -extern int memory_pressure; -extern void deactivate_page(struct page *); -extern void deactivate_page_nolock(struct page *); -extern void activate_page(struct page *); -extern void activate_page_nolock(struct page *); -extern void lru_cache_add(struct page *); -extern void __lru_cache_del(struct page *); -extern void lru_cache_del(struct page *); -extern void recalculate_vm_stats(void); +extern void FASTCALL(lru_cache_add(struct page *)); +extern void FASTCALL(__lru_cache_del(struct page *)); +extern void FASTCALL(lru_cache_del(struct page *)); + +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); +extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); + extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern struct page * reclaim_page(zone_t *); extern wait_queue_head_t kswapd_wait; -extern wait_queue_head_t kreclaimd_wait; -extern int page_launder(int, int); -extern int free_shortage(void); -extern int inactive_shortage(void); -extern void wakeup_kswapd(void); -extern int try_to_free_pages(unsigned int gfp_mask); +extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, 
unsigned int)); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -134,7 +127,6 @@ extern struct page * read_swap_cache_async(swp_entry_t); /* linux/mm/oom_kill.c */ -extern int out_of_memory(void); extern void oom_kill(void); /* @@ -146,7 +138,6 @@ extern void free_page_and_swap_cache(struct page *page); /* linux/mm/swapfile.c */ -extern int vm_swap_full(void); extern unsigned int nr_swapfiles; extern struct swap_info_struct swap_info[]; extern int is_swap_partition(kdev_t); @@ -179,90 +170,51 @@ extern spinlock_t pagemap_lru_lock; -extern void FASTCALL(mark_page_accessed(struct page *)); - -/* - * Page aging defines. - * Since we do exponential decay of the page age, we - * can chose a fairly large maximum. - */ -#define PAGE_AGE_START 2 -#define PAGE_AGE_ADV 3 -#define PAGE_AGE_MAX 64 - /* * List add/del helper macros. These must be called * with the pagemap_lru_lock held! */ -#define DEBUG_ADD_PAGE \ - if (PageActive(page) || PageInactiveDirty(page) || \ - PageInactiveClean(page)) BUG(); - -#define ZERO_PAGE_BUG \ - if (page_count(page) == 0) BUG(); - -#define add_page_to_active_list(page) { \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ - page->age = 0; \ - ClearPageReferenced(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ -} - -#define add_page_to_inactive_dirty_list(page) { \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ - SetPageInactiveDirty(page); \ - list_add(&(page)->lru, &inactive_dirty_list); \ - nr_inactive_dirty_pages++; \ - page->zone->inactive_dirty_pages++; \ -} - -#define add_page_to_inactive_clean_list(page) { \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ - SetPageInactiveClean(page); \ - list_add(&(page)->lru, &page->zone->inactive_clean_list); \ - page->zone->inactive_clean_pages++; \ -} - -#define del_page_from_active_list(page) { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_active_pages--; \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ -} - -#define del_page_from_inactive_dirty_list(page) { \ - list_del(&(page)->lru); \ - ClearPageInactiveDirty(page); \ - nr_inactive_dirty_pages--; \ - page->zone->inactive_dirty_pages--; \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ -} - -#define del_page_from_inactive_clean_list(page) { \ - list_del(&(page)->lru); \ - ClearPageInactiveClean(page); \ - page->zone->inactive_clean_pages--; \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ -} - -/* - * In mm/swap.c::recalculate_vm_stats(), we substract - * inactive_target from memory_pressure every second. - * This means that memory_pressure is smoothed over - * 64 (1 << INACTIVE_SHIFT) seconds. 
- */ -#define INACTIVE_SHIFT 6 -#define inactive_target min_t(unsigned long, \ - (memory_pressure >> INACTIVE_SHIFT), \ - (num_physpages / 4)) +#define DEBUG_LRU_PAGE(page) \ +do { \ + if (PageActive(page)) \ + BUG(); \ + if (PageInactive(page)) \ + BUG(); \ + if (page_count(page) == 0) \ + BUG(); \ +} while (0) + +#define add_page_to_active_list(page) \ +do { \ + DEBUG_LRU_PAGE(page); \ + SetPageActive(page); \ + list_add(&(page)->lru, &active_list); \ + nr_active_pages++; \ +} while (0) + +#define add_page_to_inactive_list(page) \ +do { \ + DEBUG_LRU_PAGE(page); \ + SetPageInactive(page); \ + list_add(&(page)->lru, &inactive_list); \ + nr_inactive_pages++; \ +} while (0) + +#define del_page_from_active_list(page) \ +do { \ + list_del(&(page)->lru); \ + ClearPageActive(page); \ + nr_active_pages--; \ + DEBUG_LRU_PAGE(page); \ +} while (0) + +#define del_page_from_inactive_list(page) \ +do { \ + list_del(&(page)->lru); \ + ClearPageInactive(page); \ + nr_inactive_pages--; \ + DEBUG_LRU_PAGE(page); \ +} while (0) /* * Ugly ugly ugly HACK to make sure the inactive lists diff -urN vm-ref/include/linux/swapctl.h vm/include/linux/swapctl.h --- vm-ref/include/linux/swapctl.h Sun Sep 16 14:15:30 2001 +++ vm/include/linux/swapctl.h Mon Sep 17 01:26:25 2001 @@ -1,28 +1,6 @@ #ifndef _LINUX_SWAPCTL_H #define _LINUX_SWAPCTL_H -#include -#include - -typedef struct buffer_mem_v1 -{ - unsigned int min_percent; - unsigned int borrow_percent; - unsigned int max_percent; -} buffer_mem_v1; -typedef buffer_mem_v1 buffer_mem_t; -extern buffer_mem_t buffer_mem; -extern buffer_mem_t page_cache; - -typedef struct freepages_v1 -{ - unsigned int min; - unsigned int low; - unsigned int high; -} freepages_v1; -typedef freepages_v1 freepages_t; -extern freepages_t freepages; - typedef struct pager_daemon_v1 { unsigned int tries_base; diff -urN vm-ref/kernel/fork.c vm/kernel/fork.c --- vm-ref/kernel/fork.c Mon Sep 17 01:26:15 2001 +++ vm/kernel/fork.c Mon Sep 17 01:26:25 2001 @@ -649,6 +649,8 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; + INIT_LIST_HEAD(&p->local_pages); + retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) diff -urN vm-ref/kernel/signal.c vm/kernel/signal.c --- vm-ref/kernel/signal.c Mon Sep 17 01:26:12 2001 +++ vm/kernel/signal.c Mon Sep 17 01:26:25 2001 @@ -382,7 +382,7 @@ switch (sig) { case SIGKILL: case SIGCONT: /* Wake up the process if stopped. 
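/*
 * The macros above replace the old three-list scheme (active,
 * inactive_dirty, per-zone inactive_clean) and the page->age counter with
 * a single global active/inactive pair plus the per-page referenced bit.
 * All movement happens under pagemap_lru_lock and keeps the two global
 * counters exact; condensed from the mm/swap.c hunks further down into one
 * hypothetical helper (not part of the patch):
 */
static void lru_move(struct page *page, int make_active)
{
        spin_lock(&pagemap_lru_lock);
        if (make_active && PageInactive(page)) {
                del_page_from_inactive_list(page);
                add_page_to_active_list(page);
        } else if (!make_active && PageActive(page)) {
                del_page_from_active_list(page);
                add_page_to_inactive_list(page);
        }
        spin_unlock(&pagemap_lru_lock);
}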
*/ - if (t->state == TASK_STOPPED) + if (t->state == TASK_STOPPED && !(t->ptrace & PT_PTRACED)) wake_up_process(t); t->exit_code = 0; rm_sig_from_queue(SIGSTOP, t); diff -urN vm-ref/kernel/sysctl.c vm/kernel/sysctl.c --- vm-ref/kernel/sysctl.c Mon Sep 17 01:26:12 2001 +++ vm/kernel/sysctl.c Mon Sep 17 01:26:25 2001 @@ -254,17 +254,11 @@ }; static ctl_table vm_table[] = { - {VM_FREEPG, "freepages", - &freepages, sizeof(freepages_t), 0444, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory, sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec}, - {VM_BUFFERMEM, "buffermem", - &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, - {VM_PAGECACHE, "pagecache", - &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, {VM_PAGERDAEMON, "kswapd", &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec}, {VM_PGT_CACHE, "pagetable_cache", diff -urN vm-ref/mm/filemap.c vm/mm/filemap.c --- vm-ref/mm/filemap.c Mon Sep 17 01:26:15 2001 +++ vm/mm/filemap.c Mon Sep 17 01:26:25 2001 @@ -419,6 +419,9 @@ if (page->index == offset) break; } + + SetPageReferenced(page); + not_found: return page; } @@ -596,9 +599,9 @@ if (!PageLocked(page)) BUG(); + page->index = index; page_cache_get(page); spin_lock(&pagecache_lock); - page->index = index; add_page_to_inode_queue(mapping, page); add_page_to_hash_queue(page, page_hash(mapping, index)); lru_cache_add(page); @@ -618,7 +621,7 @@ if (PageLocked(page)) BUG(); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1) | (1 << PG_checked)); + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked); page->flags = flags | (1 << PG_locked); page_cache_get(page); page->index = offset; @@ -658,7 +661,8 @@ * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static inline int page_cache_read(struct file * file, unsigned long offset) +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int page_cache_read(struct file * file, unsigned long offset) { struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; @@ -666,7 +670,7 @@ struct page *page; spin_lock(&pagecache_lock); - page = __find_page_nolock(mapping, offset, *hash); + page = __find_page_nolock(mapping, offset, *hash); spin_unlock(&pagecache_lock); if (page) return 0; @@ -684,7 +688,7 @@ * We arrive here in the unlikely event that someone * raced with us and added our page to the cache first. */ - page_cache_free(page); + page_cache_release(page); return 0; } @@ -692,6 +696,8 @@ * Read in an entire cluster at once. A cluster is usually a 64k- * aligned block that includes the page requested in "offset." */ +static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize)); static int read_cluster_nonblocking(struct file * file, unsigned long offset, unsigned long filesize) { @@ -1081,26 +1087,6 @@ } /* - * Mark a page as having seen activity. - * - * If it was already so marked, move it - * to the active queue and drop the referenced - * bit. Otherwise, just mark it for future - * action.. 
- */ -void mark_page_accessed(struct page *page) -{ - if (!PageActive(page) && PageReferenced(page)) { - activate_page(page); - ClearPageReferenced(page); - return; - } - - /* Mark the page referenced, AFTER checking for previous usage.. */ - SetPageReferenced(page); -} - -/* * This is a generic file read routine, and uses the * inode->i_op->readpage() function for the actual low-level * stuff. @@ -1224,7 +1210,6 @@ index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; - mark_page_accessed(page); page_cache_release(page); if (ret == nr && desc->count) continue; @@ -1320,7 +1305,7 @@ *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; filp->f_reada = 1; if (cached_page) - page_cache_free(cached_page); + page_cache_release(cached_page); UPDATE_ATIME(inode); } @@ -2569,8 +2554,7 @@ } } if (cached_page) - page_cache_free(cached_page); - mark_page_accessed(page); + page_cache_release(cached_page); return page; } @@ -2639,7 +2623,7 @@ struct page *cached_page = NULL; struct page *page = __grab_cache_page(mapping,index,&cached_page); if (cached_page) - page_cache_free(cached_page); + page_cache_release(cached_page); return page; } @@ -2861,7 +2845,7 @@ *ppos = pos; if (cached_page) - page_cache_free(cached_page); + page_cache_release(cached_page); /* For now, when the user asks for O_SYNC, we'll actually * provide O_DSYNC. */ diff -urN vm-ref/mm/memory.c vm/mm/memory.c --- vm-ref/mm/memory.c Mon Sep 17 01:26:12 2001 +++ vm/mm/memory.c Mon Sep 17 01:26:27 2001 @@ -274,12 +274,8 @@ * free_page() used to be able to clear swap cache * entries. We may now have to do it manually. */ - if (page->mapping) { - if (pte_dirty(pte)) - set_page_dirty(page); - if (pte_young(pte)) - mark_page_accessed(page); - } + if (pte_dirty(pte) && page->mapping) + set_page_dirty(page); free_page_and_swap_cache(page); return 1; } @@ -928,6 +924,10 @@ break; /* Recheck swapcachedness once the page is locked */ can_reuse = exclusive_swap_page(old_page); +#if 1 + if (can_reuse) + delete_from_swap_cache_nolock(old_page); +#endif UnlockPage(old_page); if (!can_reuse) break; @@ -1154,12 +1154,13 @@ swap_free(entry); if (exclusive_swap_page(page)) { +#if 0 if (write_access) pte = pte_mkwrite(pte_mkdirty(pte)); - if (vm_swap_full()) { - delete_from_swap_cache_nolock(page); - pte = pte_mkdirty(pte); - } +#else + delete_from_swap_cache_nolock(page); + pte = pte_mkwrite(pte_mkdirty(pte)); +#endif } UnlockPage(page); diff -urN vm-ref/mm/numa.c vm/mm/numa.c --- vm-ref/mm/numa.c Wed Jul 4 04:03:47 2001 +++ vm/mm/numa.c Mon Sep 17 01:26:25 2001 @@ -31,7 +31,7 @@ #endif /* !CONFIG_DISCONTIGMEM */ -struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order) +struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order) { #ifdef CONFIG_NUMA return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); @@ -82,8 +82,8 @@ memset(pgdat->valid_addr_bitmap, 0, size); } -static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask, - unsigned long order) +static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask, + unsigned int order) { return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK)); } @@ -92,7 +92,7 @@ * This can be refined. Currently, tries to do round robin, instead * should do concentratic circle search, starting from current node. 
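/*
 * The mm/memory.c hunks above change swap-cache policy on the fault path:
 * when the faulting task turns out to be the only user of a swapped-in
 * page, the swap-cache copy (and with it the swap slot) is dropped at once
 * and the pte mapped writable and dirty, rather than keeping the slot
 * until the old vm_swap_full() heuristic (removed from swapfile.c below)
 * declared swap nearly full.  Net effect of the #if 0/#else blocks above,
 * sketched:
 */
        if (exclusive_swap_page(page)) {
                delete_from_swap_cache_nolock(page);    /* free the swap slot now */
                pte = pte_mkwrite(pte_mkdirty(pte));    /* must be rewritten if evicted */
        }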
*/ -struct page * _alloc_pages(unsigned int gfp_mask, unsigned long order) +struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order) { struct page *ret = 0; pg_data_t *start, *temp; diff -urN vm-ref/mm/oom_kill.c vm/mm/oom_kill.c --- vm-ref/mm/oom_kill.c Thu Aug 16 22:03:41 2001 +++ vm/mm/oom_kill.c Mon Sep 17 01:26:25 2001 @@ -192,43 +192,3 @@ schedule(); return; } - -/** - * out_of_memory - is the system out of memory? - * - * Returns 0 if there is still enough memory left, - * 1 when we are out of memory (otherwise). - */ -int out_of_memory(void) -{ - long cache_mem, limit; - - /* Enough free memory? Not OOM. */ - if (nr_free_pages() > freepages.min) - return 0; - - if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low) - return 0; - - /* - * If the buffer and page cache (excluding swap cache) are over - * their (/proc tunable) minimum, we're still not OOM. We test - * this to make sure we don't return OOM when the system simply - * has a hard time with the cache. - */ - cache_mem = atomic_read(&page_cache_size); - cache_mem += atomic_read(&buffermem_pages); - cache_mem -= swapper_space.nrpages; - limit = (page_cache.min_percent + buffer_mem.min_percent); - limit *= num_physpages / 100; - - if (cache_mem > limit) - return 0; - - /* Enough swap space left? Not OOM. */ - if (nr_swap_pages > 0) - return 0; - - /* Else... */ - return 1; -} diff -urN vm-ref/mm/page_alloc.c vm/mm/page_alloc.c --- vm-ref/mm/page_alloc.c Mon Sep 17 01:26:14 2001 +++ vm/mm/page_alloc.c Mon Sep 17 01:26:25 2001 @@ -21,16 +21,16 @@ int nr_swap_pages; int nr_active_pages; -int nr_inactive_dirty_pages; +int nr_inactive_pages; +struct list_head inactive_list; +struct list_head active_list; pg_data_t *pgdat_list; static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, }; -static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, }; -static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, }; +static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 32, 128, 128, }; +static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; +static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; -struct list_head active_list; -struct list_head inactive_dirty_list; /* * Free_page() adds the page to the free lists. This is optimized for * fast normal cases (no error jumps taken normally). @@ -62,8 +62,8 @@ * Hint: -mask = 1+~mask */ -static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order)); -static void __free_pages_ok (struct page *page, unsigned long order) +static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); +static void __free_pages_ok (struct page *page, unsigned int order) { unsigned long index, page_idx, mask, flags; free_area_t *area; @@ -84,14 +84,15 @@ BUG(); if (PageActive(page)) BUG(); - if (PageInactiveDirty(page)) + if (PageInactive(page)) BUG(); - if (PageInactiveClean(page)) + if (PageDirty(page)) BUG(); - page->flags &= ~((1<age = PAGE_AGE_START; - + if (current->flags & PF_FREE_PAGES) + goto local_freelist; + back_local_freelist: + zone = page->zone; mask = (~0UL) << order; @@ -136,14 +137,21 @@ memlist_add_head(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); + return; + local_freelist: /* - * We don't want to protect this variable from race conditions - * since it's nothing important, but we do want to make sure - * it never gets negative. 
+ * This is a little subtle: if the allocation order + * wanted is major than zero we'd better take all the pages + * local since we must deal with fragmentation too and we + * can't rely on the nr_local_pages information. */ - if (memory_pressure > NR_CPUS) - memory_pressure--; + if (current->nr_local_pages && !current->allocation_order) + goto back_local_freelist; + + list_add(&page->list, ¤t->local_pages); + page->index = order; + current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -170,11 +178,11 @@ return page; } -static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order)); -static struct page * rmqueue(zone_t *zone, unsigned long order) +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); +static struct page * rmqueue(zone_t *zone, unsigned int order) { free_area_t * area = zone->free_area + order; - unsigned long curr_order = order; + unsigned int curr_order = order; struct list_head *head, *curr; unsigned long flags; struct page *page; @@ -194,7 +202,7 @@ index = page - zone->zone_mem_map; if (curr_order != MAX_ORDER-1) MARK_USED(index, curr_order, area); - zone->free_pages -= 1 << order; + zone->free_pages -= 1UL << order; page = expand(zone, page, index, order, curr_order, area); spin_unlock_irqrestore(&zone->lock, flags); @@ -202,7 +210,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - DEBUG_ADD_PAGE + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -213,305 +221,193 @@ return NULL; } -#define PAGES_MIN 0 -#define PAGES_LOW 1 -#define PAGES_HIGH 2 - -/* - * This function does the dirty work for __alloc_pages - * and is separated out to keep the code size smaller. - * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) - */ -static struct page * __alloc_pages_limit(zonelist_t *zonelist, - unsigned long order, int limit, int direct_reclaim) +#ifndef CONFIG_DISCONTIGMEM +struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) { - zone_t **zone = zonelist->zones; + return __alloc_pages(gfp_mask, order, + contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); +} +#endif - for (;;) { - zone_t *z = *(zone++); - unsigned long water_mark; +static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); +static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +{ + struct page * page = NULL; + int __freed = 0; - if (!z) - break; - if (!z->size) - BUG(); + if (!(gfp_mask & __GFP_WAIT)) + goto out; + if (in_interrupt()) + BUG(); + + current->allocation_order = order; + current->flags |= PF_MEMALLOC | PF_FREE_PAGES; + + __freed = try_to_free_pages(classzone, gfp_mask, order); + + current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); + + if (current->nr_local_pages) { + struct list_head * entry, * local_pages; + struct page * tmp; + int nr_pages; + + local_pages = ¤t->local_pages; + + if (__freed) { + /* pick from the last inserted so we're lifo */ + entry = local_pages->next; + do { + tmp = list_entry(entry, struct page, list); + if (tmp->index == order && memclass(tmp->zone, classzone)) { + list_del(entry); + current->nr_local_pages--; + set_page_count(tmp, 1); + page = tmp; + + if (page->buffers) + BUG(); + if (page->mapping) + BUG(); + if (!VALID_PAGE(page)) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (PageLocked(page)) + BUG(); + if (PageDecrAfter(page)) + BUG(); + if (PageActive(page)) + BUG(); + if (PageInactive(page)) + BUG(); + if (PageDirty(page)) + BUG(); - /* - * We allocate if the number of free 
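/*
 * Above is the free-side half of the new per-task "local pages" scheme:
 * while a task runs reclaim from inside the allocator (balance_classzone()
 * below sets PF_FREE_PAGES), pages it frees are parked on
 * current->local_pages, tagged with their order in page->index, instead of
 * going back to the buddy lists where another CPU could steal them.  An
 * order-0 caller keeps only the first such page; higher-order callers keep
 * everything because of fragmentation.  Condensed from __free_pages_ok()
 * above (sketch, not the literal patch code):
 */
        if (current->flags & PF_FREE_PAGES &&
            !(current->nr_local_pages && !current->allocation_order)) {
                list_add(&page->list, &current->local_pages);
                page->index = order;    /* remembered so the right size is handed back */
                current->nr_local_pages++;
                return;
        }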
+ inactive_clean - * pages is above the watermark. - */ - switch (limit) { - default: - case PAGES_MIN: - water_mark = z->pages_min; - break; - case PAGES_LOW: - water_mark = z->pages_low; - break; - case PAGES_HIGH: - water_mark = z->pages_high; + break; + } + } while ((entry = entry->next) != local_pages); } - if (z->free_pages + z->inactive_clean_pages >= water_mark) { - struct page *page = NULL; - /* If possible, reclaim a page directly. */ - if (direct_reclaim) - page = reclaim_page(z); - /* If that fails, fall back to rmqueue. */ - if (!page) - page = rmqueue(z, order); - if (page) - return page; + nr_pages = current->nr_local_pages; + /* free in reverse order so that the global order will be lifo */ + while ((entry = local_pages->prev) != local_pages) { + list_del(entry); + tmp = list_entry(entry, struct page, list); + __free_pages_ok(tmp, tmp->index); + if (!nr_pages--) + BUG(); } + current->nr_local_pages = 0; } - - /* Found nothing. */ - return NULL; + out: + *freed = __freed; + return page; } -#ifndef CONFIG_DISCONTIGMEM -struct page *_alloc_pages(unsigned int gfp_mask, unsigned long order) +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) { - return __alloc_pages(gfp_mask, order, - contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); + long free = zone->free_pages - (1UL << order); + return free >= 0 ? free : 0; } -#endif /* * This is the 'heart' of the zoned buddy allocator: */ -struct page * __alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist) +struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - zone_t **zone; - int direct_reclaim = 0; + zone_t **zone, * classzone; struct page * page; + int freed; - /* - * Allocations put pressure on the VM subsystem. - */ - memory_pressure++; + zone = zonelist->zones; + classzone = *zone; + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* - * (If anyone calls gfp from interrupts nonatomically then it - * will sooner or later tripped up by a schedule().) - * - * We are falling back to lower-level zones if allocation - * in a higher zone fails. - */ + if (zone_free_pages(z, order) > z->pages_low) { + page = rmqueue(z, order); + if (page) + return page; + } + } - /* - * Can we take pages directly from the inactive_clean - * list? - */ - if (order == 0 && (gfp_mask & __GFP_WAIT)) - direct_reclaim = 1; + classzone->need_balance = 1; + mb(); + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); -try_again: - /* - * First, see if we have any zones with lots of free memory. - * - * We allocate free memory first because it doesn't contain - * any data ... DUH! - */ zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); if (!z) break; - if (!z->size) - BUG(); - if (z->free_pages >= z->pages_low) { + if (zone_free_pages(z, order) > (gfp_mask & __GFP_HIGH ? z->pages_min / 2 : z->pages_min)) { page = rmqueue(z, order); if (page) return page; - } else if (z->free_pages < z->pages_min && - waitqueue_active(&kreclaimd_wait)) { - wake_up_interruptible(&kreclaimd_wait); } } - /* - * Try to allocate a page from a zone with a HIGH - * amount of free + inactive_clean pages. - * - * If there is a lot of activity, inactive_target - * will be high and we'll have a good chance of - * finding a page using the HIGH limit. 
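/*
 * The rewritten __alloc_pages() above walks the zonelist with a
 * progressively lower bar instead of the old PAGES_HIGH/LOW/MIN helper:
 * first pages_low; then, after marking the classzone for balancing and
 * waking kswapd, pages_min (half of that for __GFP_HIGH/atomic callers);
 * PF_MEMALLOC callers skip the check entirely.  The per-pass test is
 * essentially (hypothetical helper, sketch only):
 */
static inline int zone_can_allocate(zone_t *z, unsigned int order,
                                    unsigned int gfp_mask, int low_on_memory)
{
        unsigned long mark = z->pages_low;
        if (low_on_memory)
                mark = (gfp_mask & __GFP_HIGH) ? z->pages_min / 2 : z->pages_min;
        return zone_free_pages(z, order) > mark;
}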
- */ - page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); - if (page) - return page; + /* here we're in the low on memory slow path */ - /* - * Then try to allocate a page from a zone with more - * than zone->pages_low free + inactive_clean pages. - * - * When the working set is very large and VM activity - * is low, we're most likely to have our allocation - * succeed here. - */ - page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); - if (page) - return page; + if (current->flags & PF_MEMALLOC) { + zone = zonelist->zones; + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* - * OK, none of the zones on our zonelist has lots - * of pages free. - * - * We wake up kswapd, in the hope that kswapd will - * resolve this situation before memory gets tight. - * - * We also yield the CPU, because that: - * - gives kswapd a chance to do something - * - slows down allocations, in particular the - * allocations from the fast allocator that's - * causing the problems ... - * - ... which minimises the impact the "bad guys" - * have on the rest of the system - * - if we don't have __GFP_IO set, kswapd may be - * able to free some memory we can't free ourselves - */ - wakeup_kswapd(); - if (gfp_mask & __GFP_WAIT && !(current->flags & PF_ATOMICALLOC)) { - __set_current_state(TASK_RUNNING); - current->policy |= SCHED_YIELD; - schedule(); + page = rmqueue(z, order); + if (page) + return page; + } + return NULL; } - /* - * After waking up kswapd, we try to allocate a page - * from any zone which isn't critical yet. - * - * Kswapd should, in most situations, bring the situation - * back to normal in no time. - */ - page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; - /* - * Damn, we didn't succeed. - * - * This can be due to 2 reasons: - * - we're doing a higher-order allocation - * --> move pages to the free list until we succeed - * - we're /really/ tight on memory - * --> try to free pages ourselves with page_launder - */ - if (!(current->flags & PF_MEMALLOC)) { - /* - * Are we dealing with a higher order allocation? - * - * Move pages from the inactive_clean to the free list - * in the hope of creating a large, physically contiguous - * piece of free memory. - */ - if (order > 0 && (gfp_mask & __GFP_WAIT)) { - zone = zonelist->zones; - /* First, clean some dirty pages. */ - current->flags |= PF_MEMALLOC; - page_launder(gfp_mask, 1); - current->flags &= ~PF_MEMALLOC; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->size) - continue; - while (z->inactive_clean_pages) { - struct page * page; - /* Move one page to the free list. */ - page = reclaim_page(z); - if (!page) - break; - __free_page(page); - /* Try if the allocation succeeds. */ - page = rmqueue(z, order); - if (page) - return page; - } - } - } - /* - * When we arrive here, we are really tight on memory. - * Since kswapd didn't succeed in freeing pages for us, - * we try to help it. - * - * Single page allocs loop until the allocation succeeds. - * Multi-page allocs can fail due to memory fragmentation; - * in that case we bail out to prevent infinite loops and - * hanging device drivers ... - * - * Another issue are GFP_NOFS allocations; because they - * do not have __GFP_FS set it's possible we cannot make - * any progress freeing pages, in that case it's better - * to give up than to deadlock the kernel looping here. 
- */ - if (gfp_mask & __GFP_WAIT) { - if (!order || free_shortage()) { - int progress = try_to_free_pages(gfp_mask); - if (progress || (gfp_mask & __GFP_FS)) - goto try_again; - /* - * Fail in case no progress was made and the - * allocation may not be able to block on IO. - */ - return NULL; - } - } - } - - /* - * Final phase: allocate anything we can! - * - * Higher order allocations, GFP_ATOMIC allocations and - * recursive allocations (PF_MEMALLOC) end up here. - * - * Only recursive allocations can use the very last pages - * in the system, otherwise it would be just too easy to - * deadlock the system... - */ zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - struct page * page = NULL; - if (!z) - break; - if (!z->size) - BUG(); + if (__builtin_expect(freed, 1)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* - * SUBTLE: direct_reclaim is only possible if the task - * becomes PF_MEMALLOC while looping above. This will - * happen when the OOM killer selects this task for - * instant execution... - */ - if (direct_reclaim) { - page = reclaim_page(z); - if (page) - return page; + if (zone_free_pages(z, order) > (gfp_mask & __GFP_HIGH ? z->pages_min / 2 : z->pages_min)) { + page = rmqueue(z, order); + if (page) + return page; + } } + } else { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* XXX: is pages_min/4 a good amount to reserve for this? */ - if (z->free_pages < z->pages_min / 4 && - !(current->flags & PF_MEMALLOC)) - continue; - page = rmqueue(z, order); - if (page) - return page; + if (zone_free_pages(z, order) > z->pages_high) { + page = rmqueue(z, order); + if (page) + return page; + } + } } - /* No luck.. */ - printk(KERN_ERR "__alloc_pages: %lu-order allocation failed (gfp=0x%x/%i).\n", - order, gfp_mask, !!(current->flags & PF_MEMALLOC)); + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i) from %p\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC), __builtin_return_address(0)); return NULL; } /* * Common helper functions. 
*/ -unsigned long __get_free_pages(int gfp_mask, unsigned long order) +unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) { struct page * page; @@ -521,7 +417,7 @@ return (unsigned long) page_address(page); } -unsigned long get_zeroed_page(int gfp_mask) +unsigned long get_zeroed_page(unsigned int gfp_mask) { struct page * page; @@ -534,13 +430,13 @@ return 0; } -void __free_pages(struct page *page, unsigned long order) +void __free_pages(struct page *page, unsigned int order) { if (!PageReserved(page) && put_page_testzero(page)) __free_pages_ok(page, order); } -void free_pages(unsigned long addr, unsigned long order) +void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) __free_pages(virt_to_page(addr), order); @@ -613,24 +509,6 @@ } /* - * Total amount of inactive_clean (allocatable) RAM: - */ -unsigned int nr_inactive_clean_pages (void) -{ - unsigned int sum; - zone_t *zone; - pg_data_t *pgdat = pgdat_list; - - sum = 0; - while (pgdat) { - for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) - sum += zone->inactive_clean_pages; - pgdat = pgdat->node_next; - } - return sum; -} - -/* * Amount of free RAM allocatable as buffer memory: */ unsigned int nr_free_buffer_pages (void) @@ -645,12 +523,12 @@ zonep = zonelist->zones; for (zone = *zonep++; zone; zone = *zonep++) - sum += zone->free_pages + zone->inactive_clean_pages + zone->inactive_dirty_pages; + sum += zone->free_pages; pgdat = pgdat->node_next; } while (pgdat); - return sum; + return sum + nr_active_pages + nr_inactive_pages; } #if CONFIG_HIGHMEM @@ -674,21 +552,17 @@ */ void show_free_areas_core(pg_data_t *pgdat) { - unsigned long order; + unsigned int order; unsigned type; printk("Free pages: %6dkB (%6dkB HighMem)\n", nr_free_pages() << (PAGE_SHIFT-10), nr_free_highpages() << (PAGE_SHIFT-10)); - printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n", - nr_active_pages, - nr_inactive_dirty_pages, - nr_inactive_clean_pages(), - nr_free_pages(), - freepages.min, - freepages.low, - freepages.high); + printk("( Active: %d, inactive: %d, free: %d )\n", + nr_active_pages, + nr_inactive_pages, + nr_free_pages()); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -808,8 +682,8 @@ printk("On node %d totalpages: %lu\n", nid, realtotalpages); - memlist_init(&active_list); - memlist_init(&inactive_dirty_list); + INIT_LIST_HEAD(&active_list); + INIT_LIST_HEAD(&inactive_list); /* * Some architectures (with lots of mem and discontinous memory @@ -828,6 +702,7 @@ pgdat->node_size = totalpages; pgdat->node_start_paddr = zone_start_paddr; pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->nr_zones = 0; /* * Initially all pages are reserved - free ones are freed @@ -857,12 +732,11 @@ zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->inactive_clean_pages = 0; - zone->inactive_dirty_pages = 0; - memlist_init(&zone->inactive_clean_list); if (!size) continue; + pgdat->nr_zones = j+1; + mask = (realsize / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; @@ -871,20 +745,7 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - /* - * Add these free targets to the global free target; - * we have to be SURE that freepages.high is higher - * than SUM [zone->pages_min] for all zones, otherwise - * we may have bad bad problems. 
- * - * This means we cannot make the freepages array writable - * in /proc, but have to add a separate extra_free_target - * for people who require it to catch load spikes in eg. - * gigabit ethernet routing... - */ - freepages.min += mask; - freepages.low += mask*2; - freepages.high += mask*3; + zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; diff -urN vm-ref/mm/shmem.c vm/mm/shmem.c --- vm-ref/mm/shmem.c Mon Sep 17 00:15:00 2001 +++ vm/mm/shmem.c Mon Sep 17 01:26:25 2001 @@ -353,7 +353,7 @@ swap_free(*entry); *entry = (swp_entry_t) {0}; delete_from_swap_cache_nolock(page); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); info->swapped--; diff -urN vm-ref/mm/slab.c vm/mm/slab.c --- vm-ref/mm/slab.c Mon Sep 17 01:26:14 2001 +++ vm/mm/slab.c Mon Sep 17 01:26:25 2001 @@ -1704,7 +1704,7 @@ * * Called from do_try_to_free_pages() and __alloc_pages() */ -void kmem_cache_reap (int gfp_mask) +int kmem_cache_reap (int gfp_mask) { slab_t *slabp; kmem_cache_t *searchp; @@ -1712,12 +1712,13 @@ unsigned int best_pages; unsigned int best_len; unsigned int scan; + int ret = 0; if (gfp_mask & __GFP_WAIT && !(current->flags & PF_ATOMICALLOC)) down(&cache_chain_sem); else if (down_trylock(&cache_chain_sem)) - return; + return 0; scan = REAP_SCANLEN; best_len = 0; @@ -1821,9 +1822,10 @@ spin_lock_irq(&best_cachep->spinlock); } spin_unlock_irq(&best_cachep->spinlock); + ret = scan * (1 << best_cachep->gfporder); out: up(&cache_chain_sem); - return; + return ret; } #ifdef CONFIG_PROC_FS diff -urN vm-ref/mm/swap.c vm/mm/swap.c --- vm-ref/mm/swap.c Mon Sep 17 01:26:13 2001 +++ vm/mm/swap.c Mon Sep 17 01:26:25 2001 @@ -24,50 +24,13 @@ #include /* for copy_to/from_user */ #include -/* - * We identify three levels of free memory. We never let free mem - * fall below the freepages.min except for atomic allocations. We - * start background swapping if we fall below freepages.high free - * pages, and we begin intensive swapping below freepages.low. - * - * Actual initialization is done in mm/page_alloc.c - */ -freepages_t freepages = { - 0, /* freepages.min */ - 0, /* freepages.low */ - 0 /* freepages.high */ -}; - /* How many pages do we try to swap or page in/out together? */ int page_cluster; -/* - * This variable contains the amount of page steals the system - * is doing, averaged over a minute. We use this to determine how - * many inactive pages we should have. 
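/*
 * With the global freepages tunable gone (above), the per-zone watermarks
 * are the only free-memory targets left, and they come straight from the
 * zone size: mask = realsize / zone_balance_ratio, clamped to [20, 255]
 * pages, then pages_min/low/high = mask, 2*mask, 3*mask.  With 4 kB pages,
 * a 48 MB Normal zone has 12288 pages, 12288 / 128 = 96, giving 96/192/288
 * pages (384/768/1152 kB); any Normal zone past roughly 128 MB caps out at
 * 255/510/765.  As a standalone helper (hypothetical name, sketch only):
 */
static void set_zone_watermarks(zone_t *zone, unsigned long realsize,
                                unsigned long ratio,
                                unsigned long min_pages, unsigned long max_pages)
{
        unsigned long mask = realsize / ratio;

        if (mask < min_pages)
                mask = min_pages;
        else if (mask > max_pages)
                mask = max_pages;
        zone->pages_min = mask;
        zone->pages_low = mask * 2;
        zone->pages_high = mask * 3;
}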
- * - * In reclaim_page and __alloc_pages: memory_pressure++ - * In __free_pages_ok: memory_pressure-- - * In recalculate_vm_stats the value is decayed (once a second) - */ -int memory_pressure; - /* We track the number of pages currently being asynchronously swapped out, so that we don't try to swap TOO many pages out at once */ atomic_t nr_async_pages = ATOMIC_INIT(0); -buffer_mem_t buffer_mem = { - 2, /* minimum percent buffer */ - 10, /* borrow percent buffer */ - 60 /* maximum percent buffer */ -}; - -buffer_mem_t page_cache = { - 2, /* minimum percent page cache */ - 15, /* borrow percent page cache */ - 75 /* maximum */ -}; - pager_daemon_t pager_daemon = { 512, /* base number for calculating the number of tries */ SWAP_CLUSTER_MAX, /* minimum number of tries */ @@ -87,25 +50,9 @@ */ void deactivate_page_nolock(struct page * page) { - /* - * One for the cache, one for the extra reference the - * caller has and (maybe) one for the buffers. - * - * This isn't perfect, but works for just about everything. - * Besides, as long as we don't move unfreeable pages to the - * inactive_clean list it doesn't need to be perfect... - */ - int maxcount = (page->buffers ? 3 : 2); - page->age = 0; - ClearPageReferenced(page); - - /* - * Don't touch it if it's not on the active list. - * (some pages aren't on any list at all) - */ - if (PageActive(page) && page_count(page) <= maxcount) { + if (PageActive(page)) { del_page_from_active_list(page); - add_page_to_inactive_dirty_list(page); + add_page_to_inactive_list(page); } } @@ -121,22 +68,10 @@ */ void activate_page_nolock(struct page * page) { - if (PageInactiveDirty(page)) { - del_page_from_inactive_dirty_list(page); - add_page_to_active_list(page); - } else if (PageInactiveClean(page)) { - del_page_from_inactive_clean_list(page); + if (PageInactive(page)) { + del_page_from_inactive_list(page); add_page_to_active_list(page); - } else { - /* - * The page was not on any list, so we take care - * not to do anything. - */ } - - /* Make sure the page gets a fair chance at staying active. */ - if (page->age < PAGE_AGE_START) - page->age = PAGE_AGE_START; } void activate_page(struct page * page) @@ -152,11 +87,10 @@ */ void lru_cache_add(struct page * page) { - spin_lock(&pagemap_lru_lock); if (!PageLocked(page)) BUG(); - add_page_to_inactive_dirty_list(page); - page->age = 0; + spin_lock(&pagemap_lru_lock); + add_page_to_inactive_list(page); spin_unlock(&pagemap_lru_lock); } @@ -171,14 +105,11 @@ { if (PageActive(page)) { del_page_from_active_list(page); - } else if (PageInactiveDirty(page)) { - del_page_from_inactive_dirty_list(page); - } else if (PageInactiveClean(page)) { - del_page_from_inactive_clean_list(page); - } else { + } else if (PageInactive(page)) { + del_page_from_inactive_list(page); + } else printk("VM: __lru_cache_del, found unknown page ?!\n"); - } - DEBUG_ADD_PAGE + DEBUG_LRU_PAGE(page); } /** @@ -192,22 +123,6 @@ spin_lock(&pagemap_lru_lock); __lru_cache_del(page); spin_unlock(&pagemap_lru_lock); -} - -/** - * recalculate_vm_stats - recalculate VM statistics - * - * This function should be called once a second to recalculate - * some useful statistics the VM subsystem uses to determine - * its behaviour. - */ -void recalculate_vm_stats(void) -{ - /* - * Substract one second worth of memory_pressure from - * memory_pressure. 
- */ - memory_pressure -= (memory_pressure >> INACTIVE_SHIFT); } /* diff -urN vm-ref/mm/swap_state.c vm/mm/swap_state.c --- vm-ref/mm/swap_state.c Mon Sep 17 00:15:00 2001 +++ vm/mm/swap_state.c Mon Sep 17 01:26:25 2001 @@ -23,17 +23,6 @@ */ static int swap_writepage(struct page *page) { - /* One for the page cache, one for this user, one for page->buffers */ - if (page_count(page) > 2 + !!page->buffers) - goto in_use; - if (swap_count(page) > 1) - goto in_use; - - delete_from_swap_cache_nolock(page); - UnlockPage(page); - return 0; - -in_use: rw_swap_page(WRITE, page); return 0; } @@ -81,9 +70,8 @@ BUG(); /* clear PG_dirty so a subsequent set_page_dirty takes effect */ - flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_arch_1)); + flags = page->flags & ~(1 << PG_error | 1 << PG_dirty | 1 << PG_arch_1 | 1 << PG_referenced); page->flags = flags | (1 << PG_uptodate); - page->age = PAGE_AGE_START; add_to_page_cache_locked(page, &swapper_space, entry.val); } diff -urN vm-ref/mm/swapfile.c vm/mm/swapfile.c --- vm-ref/mm/swapfile.c Mon Sep 17 00:15:00 2001 +++ vm/mm/swapfile.c Mon Sep 17 01:26:25 2001 @@ -31,25 +31,6 @@ struct swap_info_struct swap_info[MAX_SWAPFILES]; -/* - * When swap space gets filled up, we will set this flag. - * This will make do_swap_page(), in the page fault path, - * free swap entries on swapin so we'll reclaim swap space - * in order to be able to swap something out. - * - * At the moment we start reclaiming when swap usage goes - * over 80% of swap space. - * - * XXX: Random numbers, fixme. - */ -#define SWAP_FULL_PCT 80 -int vm_swap_full (void) -{ - int swap_used = total_swap_pages - nr_swap_pages; - - return swap_used * 100 > total_swap_pages * SWAP_FULL_PCT; -} - #define SWAPFILE_CLUSTER 256 static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count) @@ -471,7 +452,6 @@ lock_page(page); if (PageSwapCache(page)) delete_from_swap_cache_nolock(page); - SetPageDirty(page); UnlockPage(page); flush_page_to_ram(page); @@ -512,6 +492,7 @@ mmput(start_mm); start_mm = new_start_mm; } + ClearPageDirty(page); page_cache_release(page); /* diff -urN vm-ref/mm/vmscan.c vm/mm/vmscan.c --- vm-ref/mm/vmscan.c Mon Sep 17 01:26:13 2001 +++ vm/mm/vmscan.c Mon Sep 17 01:26:25 2001 @@ -32,19 +32,6 @@ */ #define DEF_PRIORITY (6) -static inline void age_page_up(struct page *page) -{ - unsigned age = page->age + PAGE_AGE_ADV; - if (age > PAGE_AGE_MAX) - age = PAGE_AGE_MAX; - page->age = age; -} - -static inline void age_page_down(struct page * page) -{ - page->age /= 2; -} - /* * The swap-out function returns 1 if it successfully * scanned all the pages it was asked to (`count'). @@ -54,55 +41,24 @@ * doesn't count as having freed a page. */ -/* - * Estimate whether a zone has enough inactive or free pages.. - */ -static unsigned int zone_inactive_plenty(zone_t *zone) -{ - unsigned int inactive; - - if (!zone->size) - return 0; - - inactive = zone->inactive_dirty_pages; - inactive += zone->inactive_clean_pages; - inactive += zone->free_pages; - - return (inactive > (zone->size / 3)); -} - -static unsigned int zone_free_plenty(zone_t *zone) -{ - unsigned int free; - - free = zone->free_pages; - free += zone->inactive_clean_pages; - - return free > zone->pages_high*2; -} - /* mm->page_table_lock is held. 
mmap_sem is not held */ -static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page) +static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) { pte_t pte; swp_entry_t entry; - /* - * If we are doing a zone-specific scan, do not - * touch pages from zones which don't have a - * shortage. - */ - if (zone_inactive_plenty(page->zone)) - return; - /* Don't look at this pte if it's been accessed recently. */ if (ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return; + flush_tlb_page(vma, address); + SetPageReferenced(page); + return 0; } + if (!memclass(page->zone, classzone)) + return 0; + if (TryLockPage(page)) - return; + return 0; /* From this point on, the odds are that we're going to * nuke this pte, so read and clear the pte. This hook @@ -127,11 +83,14 @@ set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: mm->rss--; - if (!PageReferenced(page)) - deactivate_page(page); UnlockPage(page); - page_cache_release(page); - return; + { + int freeable = page_count(page) - !!page->buffers <= 2; + if (freeable) + deactivate_page(page); + page_cache_release(page); + return freeable; + } } /* @@ -178,11 +137,11 @@ out_unlock_restore: set_pte(page_table, pte); UnlockPage(page); - return; + return 0; } /* mm->page_table_lock is held. mmap_sem is not held */ -static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count) +static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) { pte_t * pte; unsigned long pmd_end; @@ -206,20 +165,22 @@ struct page *page = pte_page(*pte); if (VALID_PAGE(page) && !PageReserved(page)) { - try_to_swap_out(mm, vma, address, pte, page); - if (!--count) + count -= try_to_swap_out(mm, vma, address, pte, page, classzone); + if (!count) { + address += PAGE_SIZE; break; + } } } address += PAGE_SIZE; pte++; } while (address && (address < end)); - mm->swap_address = address + PAGE_SIZE; + mm->swap_address = address; return count; } /* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count) +static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) { pmd_t * pmd; unsigned long pgd_end; @@ -239,7 +200,7 @@ end = pgd_end; do { - count = swap_out_pmd(mm, vma, pmd, address, end, count); + count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); if (!count) break; address = (address + PMD_SIZE) & PMD_MASK; @@ -249,7 +210,7 @@ } /* mm->page_table_lock is held. 
mmap_sem is not held */ -static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count) +static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) { pgd_t *pgdir; unsigned long end; @@ -264,7 +225,7 @@ if (address >= end) BUG(); do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count); + count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); if (!count) break; address = (address + PGDIR_SIZE) & PGDIR_MASK; @@ -273,25 +234,26 @@ return count; } +/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ +struct mm_struct *swap_mm = &init_mm; + /* * Returns non-zero if we scanned all `count' pages */ -static int swap_out_mm(struct mm_struct * mm, int count) +static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone_t * classzone) { unsigned long address; struct vm_area_struct* vma; - if (!count) - return 1; - /* - * Go through process' page directory. - */ - /* * Find the proper vm-area after freezing the vma chain * and ptes. */ spin_lock(&mm->page_table_lock); + *race = 1; + if (swap_mm != mm) + goto out_unlock; + *race = 0; address = mm->swap_address; vma = find_vma(mm, address); if (vma) { @@ -299,7 +261,7 @@ address = vma->vm_start; for (;;) { - count = swap_out_vma(mm, vma, address, count); + count = swap_out_vma(mm, vma, address, count, classzone); if (!count) goto out_unlock; vma = vma->vm_next; @@ -311,224 +273,106 @@ /* Reset to 0 when we reach the end of address space */ mm->swap_address = 0; + spin_lock(&mmlist_lock); + swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); + spin_unlock(&mmlist_lock); + out_unlock: spin_unlock(&mm->page_table_lock); - return !count; -} - -#define SWAP_MM_SHIFT 4 -#define SWAP_SHIFT 5 -#define SWAP_MIN 8 -static inline int swap_amount(struct mm_struct *mm) -{ - int nr = mm->rss >> SWAP_SHIFT; - if (nr < SWAP_MIN) { - nr = SWAP_MIN; - if (nr > mm->rss) - nr = mm->rss; - } - return nr; + return count; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; - -static void swap_out(unsigned int priority, int gfp_mask) +static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)); +static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages) { - int counter; - int retval = 0; - struct mm_struct *mm = current->mm; - - /* Always start by trying to penalize the process that is allocating memory */ - if (mm) - retval = swap_out_mm(mm, swap_amount(mm)); + int counter, race; + struct mm_struct *mm; /* Then, look at the other mm's */ - counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority; + counter = mmlist_nr / priority; do { + if (current->need_resched) + schedule(); + spin_lock(&mmlist_lock); mm = swap_mm; if (mm == &init_mm) { mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); if (mm == &init_mm) goto empty; + swap_mm = mm; } - /* Set pointer for next call to next in the list */ - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); /* Make sure the mm doesn't disappear when we drop the lock.. 
*/ atomic_inc(&mm->mm_users); spin_unlock(&mmlist_lock); - /* Walk about 6% of the address space each time */ - retval |= swap_out_mm(mm, swap_amount(mm)); + nr_pages = swap_out_mm(mm, nr_pages, &race, classzone); + mmput(mm); - } while (--counter >= 0); - return; + + if (!nr_pages) + return 1; + } while (race || --counter >= 0); + + return 0; empty: spin_unlock(&mmlist_lock); + return 0; } - -/** - * reclaim_page - reclaims one page from the inactive_clean list - * @zone: reclaim a page from this zone - * - * The pages on the inactive_clean can be instantly reclaimed. - * The tests look impressive, but most of the time we'll grab - * the first page of the list and exit successfully. - */ -struct page * reclaim_page(zone_t * zone) +static int FASTCALL(shrink_cache(struct list_head * lru, int * max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask)); +static int shrink_cache(struct list_head * lru, int * max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask) { - struct page * page = NULL; - struct list_head * page_lru; - int maxscan; + LIST_HEAD(active_local_lru); + LIST_HEAD(inactive_local_lru); + struct list_head * entry; + int __max_scan = *max_scan; - /* - * We only need the pagemap_lru_lock if we don't reclaim the page, - * but we have to grab the pagecache_lock before the pagemap_lru_lock - * to avoid deadlocks and most of the time we'll succeed anyway. - */ - spin_lock(&pagecache_lock); spin_lock(&pagemap_lru_lock); - maxscan = zone->inactive_clean_pages; - while ((page_lru = zone->inactive_clean_list.prev) != - &zone->inactive_clean_list && maxscan--) { - page = list_entry(page_lru, struct page, lru); + while (__max_scan && (entry = lru->prev) != lru) { + struct page * page; - /* Wrong page on list?! (list corruption, should not happen) */ - if (!PageInactiveClean(page)) { - printk("VM: reclaim_page, wrong page on list.\n"); - list_del(page_lru); - page->zone->inactive_clean_pages--; + if (__builtin_expect(current->need_resched, 0)) { + spin_unlock(&pagemap_lru_lock); + schedule(); + spin_lock(&pagemap_lru_lock); continue; } - /* Page is referenced? Clear and move to the head of the list.. */ - if (PageTestandClearReferenced(page)) { - list_del(page_lru); - list_add(page_lru, &zone->inactive_clean_list); - } + page = list_entry(entry, struct page, lru); - /* The page is dirty, or locked, move to inactive_dirty list. */ - if (page->buffers || PageDirty(page) || TryLockPage(page)) { - del_page_from_inactive_clean_list(page); - add_page_to_inactive_dirty_list(page); - continue; - } + if (__builtin_expect(!PageInactive(page) && !PageActive(page), 0)) + BUG(); - /* Page is in use? Move it to the active list. */ - if (page_count(page) > 1) { - UnlockPage(page); - del_page_from_inactive_clean_list(page); - add_page_to_active_list(page); + if (PageTestandClearReferenced(page)) { + if (PageInactive(page)) { + del_page_from_inactive_list(page); + add_page_to_active_list(page); + } else if (PageActive(page)) { + list_del(entry); + list_add(entry, &active_list); + } else + BUG(); continue; } - /* OK, remove the page from the caches. */ - if (PageSwapCache(page)) { - __delete_from_swap_cache(page); - goto found_page; - } - - if (page->mapping) { - __remove_inode_page(page); - goto found_page; - } - - /* We should never ever get here. */ - printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); - list_del(page_lru); - zone->inactive_clean_pages--; - UnlockPage(page); - } - /* Reset page pointer, maybe we encountered an unfreeable page. 
*/ - page = NULL; - goto out; - -found_page: - memory_pressure++; - del_page_from_inactive_clean_list(page); - UnlockPage(page); - page->age = PAGE_AGE_START; - if (page_count(page) != 1) - printk("VM: reclaim_page, found page with count %d!\n", - page_count(page)); -out: - spin_unlock(&pagemap_lru_lock); - spin_unlock(&pagecache_lock); - return page; -} - -/** - * page_launder - clean dirty inactive pages, move to inactive_clean list - * @gfp_mask: what operations we are allowed to do - * @sync: are we allowed to do synchronous IO in emergencies ? - * - * When this function is called, we are most likely low on free + - * inactive_clean pages. Since we want to refill those pages as - * soon as possible, we'll make two loops over the inactive list, - * one to move the already cleaned pages to the inactive_clean lists - * and one to (often asynchronously) clean the dirty inactive pages. - * - * In situations where kswapd cannot keep up, user processes will - * end up calling this function. Since the user process needs to - * have a page before it can continue with its allocation, we'll - * do synchronous page flushing in that case. - * - * This code used to be heavily inspired by the FreeBSD source code. - * Thanks go out to Matthew Dillon. - */ -#define CAN_DO_FS (gfp_mask & __GFP_FS) -int page_launder(int gfp_mask, int sync) -{ - int maxscan, cleaned_pages; - struct list_head * page_lru; - struct page * page; - - cleaned_pages = 0; - - /* Will we wait on IO? */ - if (!sync) - gfp_mask &= ~__GFP_WAIT; - - spin_lock(&pagemap_lru_lock); - maxscan = nr_inactive_dirty_pages >> DEF_PRIORITY; - while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list && - maxscan-- > 0) { - page = list_entry(page_lru, struct page, lru); + deactivate_page_nolock(page); + list_del(entry); + list_add_tail(entry, &inactive_local_lru); - /* Wrong page on list?! (list corruption, should not happen) */ - if (!PageInactiveDirty(page)) { - printk("VM: page_launder, wrong page on list.\n"); - list_del(page_lru); - nr_inactive_dirty_pages--; - page->zone->inactive_dirty_pages--; + if (__builtin_expect(!memclass(page->zone, classzone), 0)) continue; - } - /* Page is referenced? Clear and move to the head of the list.. */ - if (PageTestandClearReferenced(page)) { - list_del(page_lru); - list_add(page_lru, &inactive_dirty_list); - } - - /* Page is in use? Move it to the active list. */ - if ((!page->buffers && page_count(page) > 1)) { - del_page_from_inactive_dirty_list(page); - add_page_to_active_list(page); - continue; - } + __max_scan--; - /* - * If this zone has plenty of pages free, - * don't spend time on cleaning it. - */ - if (zone_free_plenty(page->zone)) { - list_del(page_lru); - list_add(page_lru, &inactive_dirty_list); + /* Racy check to avoid trylocking when not worthwhile */ + if (!page->buffers && page_count(page) != 1) { + activate_page_nolock(page); + list_del(entry); + list_add_tail(entry, &active_local_lru); continue; } @@ -536,362 +380,252 @@ * The page is locked. IO in progress? * Move it to the back of the list. */ - if (TryLockPage(page)) { - list_del(page_lru); - list_add(page_lru, &inactive_dirty_list); + if (__builtin_expect(TryLockPage(page), 0)) continue; - } - /* - * Dirty swap-cache page? Write it out if - * last copy.. 
- */ - if (PageDirty(page)) { + if (PageDirty(page) && is_page_cache_freeable(page)) { + /* + * It is not critical here to write it only if + * the page is unmapped beause any direct writer + * like O_DIRECT would set the PG_dirty bitflag + * on the phisical page after having successfully + * pinned it and after the I/O to the page is finished, + * so the direct writes to the page cannot get lost. + */ int (*writepage)(struct page *); - /* Can a page get here without page->mapping? */ - if (!page->mapping) - goto page_active; writepage = page->mapping->a_ops->writepage; - if (!writepage) - goto page_active; + if (gfp_mask & __GFP_FS && writepage) { + spin_unlock(&pagemap_lru_lock); - /* Can't do it? Move it to the back of the list */ - if (!CAN_DO_FS) { - list_del(page_lru); - list_add(page_lru, &inactive_dirty_list); - UnlockPage(page); + ClearPageDirty(page); + writepage(page); + + spin_lock(&pagemap_lru_lock); continue; } - - /* OK, do a physical asynchronous write to swap. */ - ClearPageDirty(page); - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - - writepage(page); - page_cache_release(page); - - /* And re-start the thing.. */ - spin_lock(&pagemap_lru_lock); - continue; } /* * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we either free - * the page (in case it was a buffercache only page) or we - * move the page to the inactive_clean list. - * - * On the first round, we should free all previously cleaned - * buffer pages + * associated with this page. If we succeed we try to free + * the page as well. */ if (page->buffers) { - int clearedbuf; - int freed_page = 0; - - /* - * Since we might be doing disk IO, we have to - * drop the spinlock and take an extra reference - * on the page so it doesn't go away from under us. - */ - del_page_from_inactive_dirty_list(page); - page_cache_get(page); spin_unlock(&pagemap_lru_lock); - /* Try to free the page buffers. */ - clearedbuf = try_to_free_buffers(page, gfp_mask); + /* avoid to free a locked page */ + page_cache_get(page); - /* - * Re-take the spinlock. Note that we cannot - * unlock the page yet since we're still - * accessing the page_struct here... - */ - spin_lock(&pagemap_lru_lock); + if (try_to_free_buffers(page, gfp_mask)) { + if (!page->mapping) { + UnlockPage(page); + + /* + * Account we successfully freed a page + * of buffer cache. + */ + atomic_dec(&buffermem_pages); - /* The buffers were not freed. */ - if (!clearedbuf) { - add_page_to_inactive_dirty_list(page); - - /* The page was only in the buffer cache. */ - } else if (!page->mapping) { - atomic_dec(&buffermem_pages); - freed_page = 1; - cleaned_pages++; + spin_lock(&pagemap_lru_lock); + __lru_cache_del(page); - /* The page has more users besides the cache and us. */ - } else if (page_count(page) > 2) { - add_page_to_active_list(page); + /* effectively free the page here */ + page_cache_release(page); - /* OK, we "created" a freeable page. */ - } else /* page->mapping && page_count(page) == 2 */ { - add_page_to_inactive_clean_list(page); - cleaned_pages++; - } + if (--nr_pages) + continue; + break; + } else { + /* + * The page is still in pagecache so undo the stuff + * before the try_to_free_buffers since we've not + * finished and we can now try the next step. + */ + page_cache_release(page); - /* - * Unlock the page and drop the extra reference. - * We can only do it here because we are accessing - * the page struct above. 
- */ - UnlockPage(page); - page_cache_release(page); + spin_lock(&pagemap_lru_lock); + } + } else { + /* failed to drop the buffers so stop here */ + UnlockPage(page); + page_cache_release(page); - continue; - } else if (page->mapping && !PageDirty(page)) { - /* - * If a page had an extra reference in - * deactivate_page(), we will find it here. - * Now the page is really freeable, so we - * move it to the inactive_clean list. - */ - del_page_from_inactive_dirty_list(page); - add_page_to_inactive_clean_list(page); - UnlockPage(page); - cleaned_pages++; - } else { -page_active: - /* - * OK, we don't know what to do with the page. - * It's no use keeping it here, so we move it to - * the active list. - */ - del_page_from_inactive_dirty_list(page); - add_page_to_active_list(page); - UnlockPage(page); + spin_lock(&pagemap_lru_lock); + continue; + } } - } - spin_unlock(&pagemap_lru_lock); - - /* Return the number of pages moved to the inactive_clean list. */ - return cleaned_pages; -} -/** - * refill_inactive_scan - scan the active list and find pages to deactivate - * @priority: the priority at which to scan - * - * This function will scan a portion of the active list to find - * unused pages, those pages will then be moved to the inactive list. - */ -static int refill_inactive_scan(unsigned int priority) -{ - struct list_head * page_lru; - struct page * page; - int maxscan = nr_active_pages >> priority; - int page_active = 0; - int nr_deactivated = 0; + if (__builtin_expect(!page->mapping, 0)) + BUG(); - /* Take the lock while messing with the list... */ - spin_lock(&pagemap_lru_lock); - while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) { - page = list_entry(page_lru, struct page, lru); + if (__builtin_expect(!spin_trylock(&pagecache_lock), 0)) { + /* we hold the page lock so the page cannot go away from under us */ + spin_unlock(&pagemap_lru_lock); - /* Wrong page on list?! (list corruption, should not happen) */ - if (!PageActive(page)) { - printk("VM: refill_inactive, wrong page on list.\n"); - list_del(page_lru); - nr_active_pages--; - continue; + spin_lock(&pagecache_lock); + spin_lock(&pagemap_lru_lock); } /* - * Do not deactivate pages from zones which - * have plenty inactive pages. + * this is the non-racy check, it is critical to check + * PageDirty _after_ we made sure the page is freeable + * so not in use by anybody. */ - - if (zone_inactive_plenty(page->zone)) { - page_active = 1; - goto skip_page; + if (!is_page_cache_freeable(page) || PageDirty(page)) { + spin_unlock(&pagecache_lock); + UnlockPage(page); + continue; } - /* Do aging on the pages. */ - if (PageTestandClearReferenced(page)) { - age_page_up(page); - page_active = 1; - } else { - age_page_down(page); - /* - * Since we don't hold a reference on the page - * ourselves, we have to do our test a bit more - * strict then deactivate_page(). This is needed - * since otherwise the system could hang shuffling - * unfreeable pages from the active list to the - * inactive_dirty list and back again... - * - * SUBTLE: we can have buffer pages with count 1. - */ - if (page_count(page) <= (page->buffers ? 2 : 1)) { - deactivate_page_nolock(page); - page_active = 0; - } else { - page_active = 1; - } - } - /* - * If the page is still on the active list, move it - * to the other end of the list. Otherwise we exit if - * we have done enough work. 
- */ - if (page_active || PageActive(page)) { -skip_page: - list_del(page_lru); - list_add(page_lru, &active_list); - } else { - nr_deactivated++; - } + /* point of no return */ + if (__builtin_expect(!PageSwapCache(page), 1)) + __remove_inode_page(page); + else + __delete_from_swap_cache(page); + spin_unlock(&pagecache_lock); + + __lru_cache_del(page); + + UnlockPage(page); + + /* effectively free the page here */ + page_cache_release(page); + + if (--nr_pages) + continue; + break; } + + list_splice(&inactive_local_lru, &inactive_list); + list_splice(&active_local_lru, &active_list); spin_unlock(&pagemap_lru_lock); - return nr_deactivated; + *max_scan = __max_scan; + return nr_pages; } -/* - * Check if there are zones with a severe shortage of free pages, - * or if all zones have a minor shortage. - */ -int free_shortage(void) +static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)); +static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages) { - pg_data_t *pgdat; - unsigned int global_free = 0; - unsigned int global_target = freepages.high; - - /* Are we low on free pages anywhere? */ - pgdat = pgdat_list; - do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones+ i; - unsigned int free; + int max_scan = (nr_inactive_pages + nr_active_pages / priority) / priority; - if (!zone->size) - continue; + nr_pages -= kmem_cache_reap(gfp_mask); + if (nr_pages <= 0) + return 0; - free = zone->free_pages; - free += zone->inactive_clean_pages; + nr_pages = shrink_cache(&inactive_list, &max_scan, nr_pages, classzone, gfp_mask); + if (nr_pages <= 0) + return 0; - /* Local shortage? */ - if (free < zone->pages_low) - return 1; + shrink_dcache_memory(priority, gfp_mask); + shrink_icache_memory(priority, gfp_mask); - global_free += free; - } - pgdat = pgdat->node_next; - } while (pgdat); + nr_pages = shrink_cache(&active_list, &max_scan, nr_pages, classzone, gfp_mask); + if (nr_pages <= 0) + return 0; - /* Global shortage? */ - return global_free < global_target; + return nr_pages; } -/* - * Are we low on inactive pages globally or in any zone? - */ -int inactive_shortage(void) +int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order) { - pg_data_t *pgdat; - unsigned int global_target = freepages.high + inactive_target; - unsigned int global_inactive = 0; + int priority = DEF_PRIORITY; - pgdat = pgdat_list; do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones + i; - unsigned int inactive; + int nr_pages = SWAP_CLUSTER_MAX; + nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages); + if (nr_pages <= 0) + return 1; - if (!zone->size) - continue; + swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX); + } while (--priority); - inactive = zone->inactive_dirty_pages; - inactive += zone->inactive_clean_pages; - inactive += zone->free_pages; + return 0; +} - /* Local shortage? */ - if (inactive < zone->pages_high) - return 1; +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - global_inactive += inactive; - } - pgdat = pgdat->node_next; - } while (pgdat); +static int check_classzone_need_balance(zone_t * classzone) +{ + zone_t * first_classzone; - /* Global shortage? 
*/ - return global_inactive < global_target; + first_classzone = classzone->zone_pgdat->node_zones; + while (classzone >= first_classzone) { + if (classzone->free_pages > classzone->pages_high) + return 0; + classzone--; + } + return 1; } -/* - * Loop until we are no longer under an inactive or free - * shortage. Return 1 on success, 0 if we failed to get - * there even after "maxtry" loops. - */ -#define INACTIVE_SHORTAGE 1 -#define FREE_SHORTAGE 2 -#define GENERAL_SHORTAGE 4 -static int do_try_to_free_pages(unsigned int gfp_mask, int user) +static int kswapd_balance_pgdat(pg_data_t * pgdat) { - int shortage = 0; - int maxtry; + int need_more_balance = 0, i; + zone_t * zone; - /* Always walk at least the active queue when called */ - refill_inactive_scan(DEF_PRIORITY); + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (current->need_resched) + schedule(); + if (!zone->need_balance) + continue; + if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { + zone->need_balance = 0; + continue; + } + if (check_classzone_need_balance(zone)) + need_more_balance = 1; + else + zone->need_balance = 0; + } - maxtry = 1 << DEF_PRIORITY; - do { - /* - * If needed, we move pages from the active list - * to the inactive list. - */ - if (shortage & INACTIVE_SHORTAGE) { - /* Walk the VM space for a bit.. */ - swap_out(DEF_PRIORITY, gfp_mask); + return need_more_balance; +} - /* ..and refill the inactive list */ - refill_inactive_scan(DEF_PRIORITY); - } +static void kswapd_balance(void) +{ + int need_more_balance; + pg_data_t * pgdat; - /* - * If we're low on free pages, move pages from the - * inactive_dirty list to the inactive_clean list. - * - * Usually bdflush will have pre-cleaned the pages - * before we get around to moving them to the other - * list, so this is a relatively cheap operation. - */ - if (shortage & FREE_SHORTAGE) - page_launder(gfp_mask, user); + do { + need_more_balance = 0; + pgdat = pgdat_list; + do + need_more_balance |= kswapd_balance_pgdat(pgdat); + while ((pgdat = pgdat->node_next)); + } while (need_more_balance); +} - /* - * Reclaim unused slab cache if we were short on memory. - */ - if (shortage & GENERAL_SHORTAGE) { - shrink_dcache_memory(DEF_PRIORITY, gfp_mask); - shrink_icache_memory(DEF_PRIORITY, gfp_mask); +static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +{ + zone_t * zone; + int i; - kmem_cache_reap(gfp_mask); - } + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->need_balance) + continue; + return 0; + } - if (current->need_resched) { - __set_current_state(TASK_RUNNING); - schedule(); - } + return 1; +} - shortage = 0; - if (inactive_shortage()) - shortage |= INACTIVE_SHORTAGE | GENERAL_SHORTAGE; - if (free_shortage()) - shortage |= FREE_SHORTAGE | GENERAL_SHORTAGE; +static int kswapd_can_sleep(void) +{ + pg_data_t * pgdat; - if (--maxtry <= 0) - break; - } while (shortage); + pgdat = pgdat_list; + do { + if (kswapd_can_sleep_pgdat(pgdat)) + continue; + return 0; + } while ((pgdat = pgdat->node_next)); - /* Return success if we're not "totally short" */ - return shortage != (FREE_SHORTAGE | INACTIVE_SHORTAGE | GENERAL_SHORTAGE); + return 1; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); -DECLARE_WAIT_QUEUE_HEAD(kswapd_done); - /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -908,6 +642,7 @@ int kswapd(void *unused) { struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -931,107 +666,31 @@ * Kswapd main loop. 
 	 */
 	for (;;) {
-		static long recalc = 0;
-
-		/* Once a second ... */
-		if (time_after(jiffies, recalc + HZ)) {
-			recalc = jiffies;
-
-			/* Recalculate VM statistics. */
-			recalculate_vm_stats();
-		}
-
-		if (!do_try_to_free_pages(GFP_KSWAPD, 1)) {
-			if (out_of_memory())
-				oom_kill();
-			continue;
-		}
-
-		run_task_queue(&tq_disk);
-		interruptible_sleep_on_timeout(&kswapd_wait, HZ);
-	}
-}
-
-void wakeup_kswapd(void)
-{
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
-}
-
-/*
- * Called by non-kswapd processes when they want more
- * memory but are unable to sleep on kswapd because
- * they might be holding some IO locks ...
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
-	int ret = 1;
-
-	if (gfp_mask & __GFP_WAIT) {
-		current->flags |= PF_MEMALLOC;
-		ret = do_try_to_free_pages(gfp_mask, 1);
-		current->flags &= ~PF_MEMALLOC;
-	}
-
-	return ret;
-}
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kswapd_wait, &wait);
 
-DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
-/*
- * Kreclaimd will move pages from the inactive_clean list to the
- * free list, in order to keep atomic allocations possible under
- * all circumstances.
- */
-int kreclaimd(void *unused)
-{
-	struct task_struct *tsk = current;
-	pg_data_t *pgdat;
-
-	daemonize();
-	strcpy(tsk->comm, "kreclaimd");
-	sigfillset(&tsk->blocked);
-	current->flags |= PF_MEMALLOC;
-
-	while (1) {
+		mb();
+		if (kswapd_can_sleep())
+			schedule();
 
-		/*
-		 * We sleep until someone wakes us up from
-		 * page_alloc.c::__alloc_pages().
-		 */
-		interruptible_sleep_on(&kreclaimd_wait);
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kswapd_wait, &wait);
 
 		/*
-		 * Move some pages from the inactive_clean lists to
-		 * the free lists, if it is needed.
+		 * If we actually get into a low-memory situation,
+		 * the processes needing more memory will wake us
+		 * up on a more timely basis.
 		 */
-		pgdat = pgdat_list;
-		do {
-			int i;
-			for(i = 0; i < MAX_NR_ZONES; i++) {
-				zone_t *zone = pgdat->node_zones + i;
-				if (!zone->size)
-					continue;
-
-				while (zone->free_pages < zone->pages_low) {
-					struct page * page;
-					page = reclaim_page(zone);
-					if (!page)
-						break;
-					__free_page(page);
-				}
-			}
-			pgdat = pgdat->node_next;
-		} while (pgdat);
+		kswapd_balance();
+		run_task_queue(&tq_disk);
 	}
 }
-
 static int __init kswapd_init(void)
 {
-	printk("Starting kswapd v1.8\n");
+	printk("Starting kswapd\n");
 	swap_setup();
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
-	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 	return 0;
 }
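
The vmscan.c changes above replace the old free/inactive-shortage heuristics with a single escalation loop: try_to_free_pages() starts at DEF_PRIORITY and, on each pass, hands shrink_caches() a scan budget derived from (nr_inactive_pages + nr_active_pages / priority) / priority, retrying at a more aggressive (numerically smaller) priority until SWAP_CLUSTER_MAX pages have been freed or priority reaches zero. The sketch below is a minimal userspace model of that control flow only, under stated assumptions: reclaim_some() and its 1-in-8 success rate are invented placeholders for the real shrink_cache()/swap_out() work, and the page counters are arbitrary numbers, not anything from the patch.

/*
 * Standalone model of the priority-escalation loop used by the patched
 * try_to_free_pages()/shrink_caches().  This is NOT kernel code: the
 * actual reclaim work is stubbed out so the control flow can be
 * compiled and run in userspace.
 */
#include <stdio.h>
#include <stdlib.h>

#define DEF_PRIORITY      6	/* mildest scan, as in the patch */
#define SWAP_CLUSTER_MAX  32	/* pages we try to free per call */

static int nr_inactive_pages = 3000;	/* arbitrary stand-in counters */
static int nr_active_pages   = 9000;

/* Stub: pretend each unit of scan effort frees a page 1 time in 8. */
static int reclaim_some(int max_scan, int nr_pages)
{
	int freed = 0;

	while (max_scan-- && freed < nr_pages) {
		if (rand() % 8 == 0)
			freed++;
	}
	return nr_pages - freed;	/* pages still wanted */
}

/* Same shape as the patched shrink_caches(): budget scales with 1/priority. */
static int shrink_caches(int priority, int nr_pages)
{
	int max_scan = (nr_inactive_pages + nr_active_pages / priority) / priority;

	return reclaim_some(max_scan, nr_pages);
}

/* Same shape as the patched try_to_free_pages(). */
static int try_to_free_pages(void)
{
	int priority = DEF_PRIORITY;

	do {
		int nr_pages = shrink_caches(priority, SWAP_CLUSTER_MAX);

		if (nr_pages <= 0)
			return 1;	/* freed enough at this priority */
		/* the real code would call swap_out() here before retrying */
	} while (--priority);

	return 0;			/* gave up: out-of-memory path */
}

int main(void)
{
	printf("try_to_free_pages() -> %d\n", try_to_free_pages());
	return 0;
}

Built with a stock C compiler (e.g. cc -o reclaim_model reclaim_model.c), this only demonstrates the do { ... } while (--priority) shape and the way the scan budget grows as the priority number shrinks; none of the numbers are meaningful.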